From c2ee9ca3e02810c76f82bfc32a6643f2b0af0b84 Mon Sep 17 00:00:00 2001 From: Noah <33094578+coolreader18@users.noreply.github.com> Date: Wed, 30 Sep 2020 21:20:13 -0500 Subject: [PATCH 001/960] WIP - native _sre --- constants.rs | 114 ++++++++++++++++++++++++++++++++++++++++++++++ interp.rs | 126 +++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 240 insertions(+) create mode 100644 constants.rs create mode 100644 interp.rs diff --git a/constants.rs b/constants.rs new file mode 100644 index 0000000000..f6aeb3182f --- /dev/null +++ b/constants.rs @@ -0,0 +1,114 @@ +/* + * Secret Labs' Regular Expression Engine + * + * regular expression matching engine + * + * NOTE: This file is generated by sre_constants.py. If you need + * to change anything in here, edit sre_constants.py and run it. + * + * Copyright (c) 1997-2001 by Secret Labs AB. All rights reserved. + * + * See the _sre.c file for information on usage and redistribution. + */ + +use bitflags::bitflags; + +pub const SRE_MAGIC: usize = 20140917; +#[derive(num_enum::TryFromPrimitive, Debug)] +#[repr(u32)] +#[allow(non_camel_case_types)] +pub enum SreOpcode { + FAILURE = 0, + SUCCESS = 1, + ANY = 2, + ANY_ALL = 3, + ASSERT = 4, + ASSERT_NOT = 5, + AT = 6, + BRANCH = 7, + CALL = 8, + CATEGORY = 9, + CHARSET = 10, + BIGCHARSET = 11, + GROUPREF = 12, + GROUPREF_EXISTS = 13, + GROUPREF_IGNORE = 14, + IN = 15, + IN_IGNORE = 16, + INFO = 17, + JUMP = 18, + LITERAL = 19, + LITERAL_IGNORE = 20, + MARK = 21, + MAX_UNTIL = 22, + MIN_UNTIL = 23, + NOT_LITERAL = 24, + NOT_LITERAL_IGNORE = 25, + NEGATE = 26, + RANGE = 27, + REPEAT = 28, + REPEAT_ONE = 29, + SUBPATTERN = 30, + MIN_REPEAT_ONE = 31, + RANGE_IGNORE = 32, +} +#[derive(num_enum::TryFromPrimitive, Debug)] +#[repr(u32)] +#[allow(non_camel_case_types)] +pub enum SreAtCode { + BEGINNING = 0, + BEGINNING_LINE = 1, + BEGINNING_STRING = 2, + BOUNDARY = 3, + NON_BOUNDARY = 4, + END = 5, + END_LINE = 6, + END_STRING = 7, + LOC_BOUNDARY = 8, + LOC_NON_BOUNDARY = 9, + UNI_BOUNDARY = 10, + UNI_NON_BOUNDARY = 11, +} +#[derive(num_enum::TryFromPrimitive, Debug)] +#[repr(u32)] +#[allow(non_camel_case_types)] +pub enum SreCatCode { + DIGIT = 0, + NOT_DIGIT = 1, + SPACE = 2, + NOT_SPACE = 3, + WORD = 4, + NOT_WORD = 5, + LINEBREAK = 6, + NOT_LINEBREAK = 7, + LOC_WORD = 8, + LOC_NOT_WORD = 9, + UNI_DIGIT = 10, + UNI_NOT_DIGIT = 11, + UNI_SPACE = 12, + UNI_NOT_SPACE = 13, + UNI_WORD = 14, + UNI_NOT_WORD = 15, + UNI_LINEBREAK = 16, + UNI_NOT_LINEBREAK = 17, +} +bitflags! { + pub struct SreFlag: u16 { + const TEMPLATE = 1; + const IGNORECASE = 2; + const LOCALE = 4; + const MULTILINE = 8; + const DOTALL = 16; + const UNICODE = 32; + const VERBOSE = 64; + const DEBUG = 128; + const ASCII = 256; + } +} +bitflags! { + pub struct SreInfo: u32 { + const PREFIX = 1; + const LITERAL = 2; + const CHARSET = 4; + } +} diff --git a/interp.rs b/interp.rs new file mode 100644 index 0000000000..7f93a82eb4 --- /dev/null +++ b/interp.rs @@ -0,0 +1,126 @@ +// good luck to those that follow; here be dragons + +use crate::builtins::PyStrRef; + +use super::constants::{SreFlag, SreOpcode}; + +use std::convert::TryFrom; +use std::{iter, slice}; + +pub struct State { + start: usize, + s_pos: usize, + end: usize, + pos: usize, + flags: SreFlag, + marks: Vec, + lastindex: isize, + marks_stack: Vec, + context_stack: Vec, + repeat: Option, + s: PyStrRef, +} + +// struct State1<'a> { +// state: &'a mut State, +// } + +struct MatchContext { + s_pos: usize, + code_pos: usize, +} + +// struct Context<'a> { +// context_stack: &mut Vec, +// } + +impl State { + pub fn new(s: PyStrRef, start: usize, end: usize, flags: SreFlag) -> Self { + let end = std::cmp::min(end, s.char_len()); + Self { + start, + s_pos: start, + end, + pos: start, + flags, + marks: Vec::new(), + lastindex: -1, + marks_stack: Vec::new(), + context_stack: Vec::new(), + repeat: None, + s, + } + } +} + +// struct OpcodeDispatcher { +// executing_contexts: HashMap>, +// } + +pub struct BadSreCode; + +pub fn parse_ops(code: &[u32]) -> impl Iterator> + '_ { + let mut it = code.iter().copied(); + std::iter::from_fn(move || -> Option> { + let op = it.next()?; + let op = SreOpcode::try_from(op) + .ok() + .and_then(|op| extract_code(op, &mut it)); + Some(op) + }) + .map(|x| x.ok_or(BadSreCode)) +} + +type It<'a> = iter::Copied>; +fn extract_code(op: SreOpcode, it: &mut It) -> Option { + let skip = |it: &mut It| { + let skip = it.next()? as usize; + if skip > it.len() { + None + } else { + Some(skip) + } + }; + match op { + SreOpcode::FAILURE => {} + SreOpcode::SUCCESS => {} + SreOpcode::ANY => {} + SreOpcode::ANY_ALL => {} + SreOpcode::ASSERT => {} + SreOpcode::ASSERT_NOT => {} + SreOpcode::AT => {} + SreOpcode::BRANCH => {} + SreOpcode::CALL => {} + SreOpcode::CATEGORY => {} + SreOpcode::CHARSET => {} + SreOpcode::BIGCHARSET => {} + SreOpcode::GROUPREF => {} + SreOpcode::GROUPREF_EXISTS => {} + SreOpcode::GROUPREF_IGNORE => {} + SreOpcode::IN => {} + SreOpcode::IN_IGNORE => {} + SreOpcode::INFO => { + // let skip = it.next()?; + } + SreOpcode::JUMP => {} + SreOpcode::LITERAL => {} + SreOpcode::LITERAL_IGNORE => {} + SreOpcode::MARK => {} + SreOpcode::MAX_UNTIL => {} + SreOpcode::MIN_UNTIL => {} + SreOpcode::NOT_LITERAL => {} + SreOpcode::NOT_LITERAL_IGNORE => {} + SreOpcode::NEGATE => {} + SreOpcode::RANGE => {} + SreOpcode::REPEAT => {} + SreOpcode::REPEAT_ONE => {} + SreOpcode::SUBPATTERN => {} + SreOpcode::MIN_REPEAT_ONE => {} + SreOpcode::RANGE_IGNORE => {} + } + todo!() +} + +pub enum Op { + Info {}, +} From e1362ead3c8c7462b0027a7ecb4b9bbd23e76b5a Mon Sep 17 00:00:00 2001 From: Kangzhi Shi Date: Tue, 1 Dec 2020 17:51:11 +0200 Subject: [PATCH 002/960] WIP structure --- constants.rs | 4 + interp.rs | 467 +++++++++++++++++++++++++++++++++++++++++---------- 2 files changed, 379 insertions(+), 92 deletions(-) diff --git a/constants.rs b/constants.rs index f6aeb3182f..a80534d70b 100644 --- a/constants.rs +++ b/constants.rs @@ -14,6 +14,10 @@ use bitflags::bitflags; pub const SRE_MAGIC: usize = 20140917; +pub const SRE_CODESIZE: usize = 4; +pub const SRE_MAXREPEAT: usize = usize::max_value(); +pub const SRE_MAXGROUPS: usize = usize::max_value() / std::mem::size_of::() / 2; + #[derive(num_enum::TryFromPrimitive, Debug)] #[repr(u32)] #[allow(non_camel_case_types)] diff --git a/interp.rs b/interp.rs index 7f93a82eb4..43c2a54515 100644 --- a/interp.rs +++ b/interp.rs @@ -1,126 +1,409 @@ // good luck to those that follow; here be dragons +use super::constants::{SreFlag, SreOpcode, SRE_MAXREPEAT}; use crate::builtins::PyStrRef; - -use super::constants::{SreFlag, SreOpcode}; - +use rustpython_common::borrow::BorrowValue; +use std::collections::HashMap; use std::convert::TryFrom; -use std::{iter, slice}; -pub struct State { +pub struct State<'a> { + // py_string: PyStrRef, + string: &'a str, start: usize, - s_pos: usize, end: usize, - pos: usize, flags: SreFlag, + pattern_codes: Vec, marks: Vec, lastindex: isize, marks_stack: Vec, context_stack: Vec, repeat: Option, - s: PyStrRef, + string_position: usize, } -// struct State1<'a> { -// state: &'a mut State, -// } - -struct MatchContext { - s_pos: usize, - code_pos: usize, -} - -// struct Context<'a> { -// context_stack: &mut Vec, -// } - -impl State { - pub fn new(s: PyStrRef, start: usize, end: usize, flags: SreFlag) -> Self { - let end = std::cmp::min(end, s.char_len()); +impl<'a> State<'a> { + pub(crate) fn new( + // py_string: PyStrRef, + string: &'a str, + start: usize, + end: usize, + flags: SreFlag, + pattern_codes: Vec, + ) -> Self { + // let string = py_string.borrow_value(); Self { + // py_string, + string, start, - s_pos: start, end, - pos: start, flags, - marks: Vec::new(), + pattern_codes, lastindex: -1, marks_stack: Vec::new(), context_stack: Vec::new(), repeat: None, - s, + marks: Vec::new(), + string_position: start, } } + + fn reset(&mut self) { + self.marks.clear(); + self.lastindex = -1; + self.marks_stack.clear(); + self.context_stack.clear(); + self.repeat = None; + } } -// struct OpcodeDispatcher { -// executing_contexts: HashMap>, -// } +pub(crate) fn pymatch(mut state: State) -> bool { + let ctx = MatchContext { + string_position: state.start, + code_position: 0, + has_matched: None, + }; + state.context_stack.push(ctx); -pub struct BadSreCode; + let mut has_matched = None; + loop { + if state.context_stack.is_empty() { + break; + } + let ctx_id = state.context_stack.len() - 1; + let mut drive = MatchContextDrive::drive(ctx_id, state); + let mut dispatcher = OpcodeDispatcher::new(); -pub fn parse_ops(code: &[u32]) -> impl Iterator> + '_ { - let mut it = code.iter().copied(); - std::iter::from_fn(move || -> Option> { - let op = it.next()?; - let op = SreOpcode::try_from(op) - .ok() - .and_then(|op| extract_code(op, &mut it)); - Some(op) - }) - .map(|x| x.ok_or(BadSreCode)) + has_matched = dispatcher.pymatch(&mut drive); + state = drive.take(); + if has_matched.is_some() { + state.context_stack.pop(); + } + } + has_matched.unwrap_or(false) } -type It<'a> = iter::Copied>; -fn extract_code(op: SreOpcode, it: &mut It) -> Option { - let skip = |it: &mut It| { - let skip = it.next()? as usize; - if skip > it.len() { - None - } else { - Some(skip) +#[derive(Debug, Copy, Clone)] +struct MatchContext { + string_position: usize, + code_position: usize, + has_matched: Option, +} + +struct MatchContextDrive<'a> { + state: State<'a>, + ctx_id: usize, +} + +impl<'a> MatchContextDrive<'a> { + fn id(&self) -> usize { + self.ctx_id + } + fn ctx_mut(&mut self) -> &mut MatchContext { + &mut self.state.context_stack[self.ctx_id] + } + fn ctx(&self) -> &MatchContext { + &self.state.context_stack[self.ctx_id] + } + fn push_new_context(&mut self, pattern_offset: usize) -> usize { + let ctx = self.ctx(); + let child_ctx = MatchContext { + string_position: ctx.string_position, + code_position: ctx.code_position + pattern_offset, + has_matched: None, + }; + self.state.context_stack.push(child_ctx); + self.state.context_stack.len() - 1 + } + fn drive(ctx_id: usize, state: State<'a>) -> Self { + Self { state, ctx_id } + } + fn take(self) -> State<'a> { + self.state + } + fn str(&self) -> &str { + unsafe { + std::str::from_utf8_unchecked( + &self.state.string.as_bytes()[self.ctx().string_position..], + ) + } + } + fn peek_char(&self) -> char { + self.str().chars().next().unwrap() + } + fn peek_code(&self, peek: usize) -> u32 { + self.state.pattern_codes[self.ctx().code_position + peek] + } + fn skip_char(&mut self, skip_count: usize) { + let skipped = self.str().char_indices().nth(skip_count).unwrap().0; + self.ctx_mut().string_position += skipped; + } + fn skip_code(&mut self, skip_count: usize) { + self.ctx_mut().code_position += skip_count; + } + fn remaining_chars(&self) -> usize { + let end = self.state.end; + end - self.ctx().string_position + self.str().len() + } + fn remaining_codes(&self) -> usize { + self.state.pattern_codes.len() - self.ctx().code_position + } + fn at_beginning(&self) -> bool { + self.ctx().string_position == 0 + } + fn at_end(&self) -> bool { + self.str().is_empty() + } + fn at_linebreak(&self) -> bool { + match self.str().chars().next() { + Some(c) => c == '\n', + None => false, } + } +} + +struct OpcodeDispatcher { + executing_contexts: HashMap>, +} + +macro_rules! once { + ($val:expr) => { + Box::new(OpEmpty {}) }; - match op { - SreOpcode::FAILURE => {} - SreOpcode::SUCCESS => {} - SreOpcode::ANY => {} - SreOpcode::ANY_ALL => {} - SreOpcode::ASSERT => {} - SreOpcode::ASSERT_NOT => {} - SreOpcode::AT => {} - SreOpcode::BRANCH => {} - SreOpcode::CALL => {} - SreOpcode::CATEGORY => {} - SreOpcode::CHARSET => {} - SreOpcode::BIGCHARSET => {} - SreOpcode::GROUPREF => {} - SreOpcode::GROUPREF_EXISTS => {} - SreOpcode::GROUPREF_IGNORE => {} - SreOpcode::IN => {} - SreOpcode::IN_IGNORE => {} - SreOpcode::INFO => { - // let skip = it.next()?; - } - SreOpcode::JUMP => {} - SreOpcode::LITERAL => {} - SreOpcode::LITERAL_IGNORE => {} - SreOpcode::MARK => {} - SreOpcode::MAX_UNTIL => {} - SreOpcode::MIN_UNTIL => {} - SreOpcode::NOT_LITERAL => {} - SreOpcode::NOT_LITERAL_IGNORE => {} - SreOpcode::NEGATE => {} - SreOpcode::RANGE => {} - SreOpcode::REPEAT => {} - SreOpcode::REPEAT_ONE => {} - SreOpcode::SUBPATTERN => {} - SreOpcode::MIN_REPEAT_ONE => {} - SreOpcode::RANGE_IGNORE => {} - } - todo!() -} - -pub enum Op { - Info {}, +} + +trait OpcodeExecutor { + fn next(&mut self, drive: &mut MatchContextDrive) -> Option<()>; +} + +struct OpFailure {} +impl OpcodeExecutor for OpFailure { + fn next(&mut self, drive: &mut MatchContextDrive) -> Option<()> { + drive.ctx_mut().has_matched = Some(false); + None + } +} + +struct OpEmpty {} +impl OpcodeExecutor for OpEmpty { + fn next(&mut self, drive: &mut MatchContextDrive) -> Option<()> { + None + } +} + +struct OpOnce { + f: Option, +} +impl OpcodeExecutor for OpOnce { + fn next(&mut self, drive: &mut MatchContextDrive) -> Option<()> { + let f = self.f.take()?; + f(drive); + None + } +} +fn once(f: F) -> Box> { + Box::new(OpOnce { f: Some(f) }) +} + +struct OpMinRepeatOne { + trace_id: usize, + mincount: usize, + maxcount: usize, + count: usize, + child_ctx_id: usize, +} +impl OpcodeExecutor for OpMinRepeatOne { + fn next(&mut self, drive: &mut MatchContextDrive) -> Option<()> { + match self.trace_id { + 0 => self._0(drive), + _ => unreachable!(), + } + } +} +impl Default for OpMinRepeatOne { + fn default() -> Self { + OpMinRepeatOne { + trace_id: 0, + mincount: 0, + maxcount: 0, + count: 0, + child_ctx_id: 0, + } + } +} +impl OpMinRepeatOne { + fn _0(&mut self, drive: &mut MatchContextDrive) -> Option<()> { + self.mincount = drive.peek_code(2) as usize; + self.maxcount = drive.peek_code(3) as usize; + + if drive.remaining_chars() < self.mincount { + drive.ctx_mut().has_matched = Some(false); + return None; + } + + drive.state.string_position = drive.ctx().string_position; + + self.count = if self.mincount == 0 { + 0 + } else { + let count = count_repetitions(drive, self.mincount); + if count < self.mincount { + drive.ctx_mut().has_matched = Some(false); + return None; + } + drive.skip_char(count); + count + }; + + if drive.peek_code(drive.peek_code(1) as usize + 1) == SreOpcode::SUCCESS as u32 { + drive.state.string_position = drive.ctx().string_position; + drive.ctx_mut().has_matched = Some(true); + return None; + } + + // mark push + self.trace_id = 1; + self._1(drive) + } + fn _1(&mut self, drive: &mut MatchContextDrive) -> Option<()> { + if self.maxcount == SRE_MAXREPEAT || self.count <= self.maxcount { + drive.state.string_position = drive.ctx().string_position; + self.child_ctx_id = drive.push_new_context(drive.peek_code(1) as usize + 1); + self.trace_id = 2; + return Some(()); + } + + // mark discard + drive.ctx_mut().has_matched = Some(false); + None + } + fn _2(&mut self, drive: &mut MatchContextDrive) -> Option<()> { + if let Some(true) = drive.state.context_stack[self.child_ctx_id].has_matched { + drive.ctx_mut().has_matched = Some(true); + return None; + } + drive.state.string_position = drive.ctx().string_position; + if count_repetitions(drive, 1) == 0 { + self.trace_id = 3; + return self._3(drive); + } + drive.skip_char(1); + self.count += 1; + // marks pop keep + self.trace_id = 1; + self._1(drive) + } + fn _3(&mut self, drive: &mut MatchContextDrive) -> Option<()> { + // mark discard + drive.ctx_mut().has_matched = Some(false); + None + } +} + +impl OpcodeDispatcher { + fn new() -> Self { + Self { + executing_contexts: HashMap::new(), + } + } + // Returns True if the current context matches, False if it doesn't and + // None if matching is not finished, ie must be resumed after child + // contexts have been matched. + fn pymatch(&mut self, drive: &mut MatchContextDrive) -> Option { + while drive.remaining_codes() > 0 && drive.ctx().has_matched.is_none() { + let code = drive.peek_code(0); + let opcode = SreOpcode::try_from(code).unwrap(); + self.dispatch(opcode, drive); + // self.drive = self.drive; + } + match drive.ctx().has_matched { + Some(matched) => Some(matched), + None => { + drive.ctx_mut().has_matched = Some(false); + Some(false) + } + } + } + + // Dispatches a context on a given opcode. Returns True if the context + // is done matching, False if it must be resumed when next encountered. + fn dispatch(&mut self, opcode: SreOpcode, drive: &mut MatchContextDrive) -> bool { + let mut executor = match self.executing_contexts.remove_entry(&drive.id()) { + Some((_, mut executor)) => executor, + None => self.dispatch_table(opcode, drive), + }; + if let Some(()) = executor.next(drive) { + self.executing_contexts.insert(drive.id(), executor); + false + } else { + true + } + } + + fn dispatch_table( + &mut self, + opcode: SreOpcode, + drive: &mut MatchContextDrive, + ) -> Box { + // move || { + match opcode { + SreOpcode::FAILURE => { + Box::new(OpFailure {}) + } + SreOpcode::SUCCESS => once(|drive| { + drive.state.string_position = drive.ctx().string_position; + drive.ctx_mut().has_matched = Some(true); + }), + SreOpcode::ANY => once!(true), + SreOpcode::ANY_ALL => once!(true), + SreOpcode::ASSERT => once!(true), + SreOpcode::ASSERT_NOT => once!(true), + SreOpcode::AT => once!(true), + SreOpcode::BRANCH => once!(true), + SreOpcode::CALL => once!(true), + SreOpcode::CATEGORY => once!(true), + SreOpcode::CHARSET => once!(true), + SreOpcode::BIGCHARSET => once!(true), + SreOpcode::GROUPREF => once!(true), + SreOpcode::GROUPREF_EXISTS => once!(true), + SreOpcode::GROUPREF_IGNORE => once!(true), + SreOpcode::IN => once!(true), + SreOpcode::IN_IGNORE => once!(true), + SreOpcode::INFO => once!(true), + SreOpcode::JUMP => once!(true), + SreOpcode::LITERAL => { + if drive.at_end() || drive.peek_char() as u32 != drive.peek_code(1) { + drive.ctx_mut().has_matched = Some(false); + } else { + drive.skip_char(1); + } + drive.skip_code(2); + once!(true) + } + SreOpcode::LITERAL_IGNORE => once!(true), + SreOpcode::MARK => once!(true), + SreOpcode::MAX_UNTIL => once!(true), + SreOpcode::MIN_UNTIL => once!(true), + SreOpcode::NOT_LITERAL => once!(true), + SreOpcode::NOT_LITERAL_IGNORE => once!(true), + SreOpcode::NEGATE => once!(true), + SreOpcode::RANGE => once!(true), + SreOpcode::REPEAT => once!(true), + SreOpcode::REPEAT_ONE => once!(true), + SreOpcode::SUBPATTERN => once!(true), + SreOpcode::MIN_REPEAT_ONE => Box::new(OpMinRepeatOne::default()), + SreOpcode::RANGE_IGNORE => once!(true), + } + } +} + +// Returns the number of repetitions of a single item, starting from the +// current string position. The code pointer is expected to point to a +// REPEAT_ONE operation (with the repeated 4 ahead). +fn count_repetitions(drive: &mut MatchContextDrive, maxcount: usize) -> usize { + let mut count = 0; + let mut real_maxcount = drive.state.end - drive.ctx().string_position; + if maxcount < real_maxcount && maxcount != SRE_MAXREPEAT { + real_maxcount = maxcount; + } + count } From 82922bf0d796f6c79f5799a1b14b9d1ad9c26431 Mon Sep 17 00:00:00 2001 From: Kangzhi Shi Date: Wed, 23 Dec 2020 09:01:50 +0200 Subject: [PATCH 003/960] upgrade re version; implement helper functions; --- constants.rs | 52 ++-- interp.rs | 717 ++++++++++++++++++++++++++++++++++++++++----------- 2 files changed, 588 insertions(+), 181 deletions(-) diff --git a/constants.rs b/constants.rs index a80534d70b..f5ab92c531 100644 --- a/constants.rs +++ b/constants.rs @@ -13,11 +13,7 @@ use bitflags::bitflags; -pub const SRE_MAGIC: usize = 20140917; -pub const SRE_CODESIZE: usize = 4; -pub const SRE_MAXREPEAT: usize = usize::max_value(); -pub const SRE_MAXGROUPS: usize = usize::max_value() / std::mem::size_of::() / 2; - +pub const SRE_MAGIC: usize = 20171005; #[derive(num_enum::TryFromPrimitive, Debug)] #[repr(u32)] #[allow(non_camel_case_types)] @@ -36,25 +32,33 @@ pub enum SreOpcode { BIGCHARSET = 11, GROUPREF = 12, GROUPREF_EXISTS = 13, - GROUPREF_IGNORE = 14, - IN = 15, - IN_IGNORE = 16, - INFO = 17, - JUMP = 18, - LITERAL = 19, - LITERAL_IGNORE = 20, - MARK = 21, - MAX_UNTIL = 22, - MIN_UNTIL = 23, - NOT_LITERAL = 24, - NOT_LITERAL_IGNORE = 25, - NEGATE = 26, - RANGE = 27, - REPEAT = 28, - REPEAT_ONE = 29, - SUBPATTERN = 30, - MIN_REPEAT_ONE = 31, - RANGE_IGNORE = 32, + IN = 14, + INFO = 15, + JUMP = 16, + LITERAL = 17, + MARK = 18, + MAX_UNTIL = 19, + MIN_UNTIL = 20, + NOT_LITERAL = 21, + NEGATE = 22, + RANGE = 23, + REPEAT = 24, + REPEAT_ONE = 25, + SUBPATTERN = 26, + MIN_REPEAT_ONE = 27, + GROUPREF_IGNORE = 28, + IN_IGNORE = 29, + LITERAL_IGNORE = 30, + NOT_LITERAL_IGNORE = 31, + GROUPREF_LOC_IGNORE = 32, + IN_LOC_IGNORE = 33, + LITERAL_LOC_IGNORE = 34, + NOT_LITERAL_LOC_IGNORE = 35, + GROUPREF_UNI_IGNORE = 36, + IN_UNI_IGNORE = 37, + LITERAL_UNI_IGNORE = 38, + NOT_LITERAL_UNI_IGNORE = 39, + RANGE_UNI_IGNORE = 40, } #[derive(num_enum::TryFromPrimitive, Debug)] #[repr(u32)] diff --git a/interp.rs b/interp.rs index 43c2a54515..bc6753a5fe 100644 --- a/interp.rs +++ b/interp.rs @@ -1,14 +1,14 @@ // good luck to those that follow; here be dragons -use super::constants::{SreFlag, SreOpcode, SRE_MAXREPEAT}; -use crate::builtins::PyStrRef; -use rustpython_common::borrow::BorrowValue; +use super::_sre::MAXREPEAT; +use super::constants::{SreAtCode, SreCatCode, SreFlag, SreOpcode}; use std::collections::HashMap; use std::convert::TryFrom; pub struct State<'a> { - // py_string: PyStrRef, string: &'a str, + // chars count + string_len: usize, start: usize, end: usize, flags: SreFlag, @@ -23,17 +23,18 @@ pub struct State<'a> { impl<'a> State<'a> { pub(crate) fn new( - // py_string: PyStrRef, string: &'a str, start: usize, end: usize, flags: SreFlag, pattern_codes: Vec, ) -> Self { - // let string = py_string.borrow_value(); + let string_len = string.chars().count(); + let end = std::cmp::min(end, string_len); + let start = std::cmp::min(start, end); Self { - // py_string, string, + string_len, start, end, flags, @@ -59,6 +60,7 @@ impl<'a> State<'a> { pub(crate) fn pymatch(mut state: State) -> bool { let ctx = MatchContext { string_position: state.start, + string_offset: state.string.char_indices().nth(state.start).unwrap().0, code_position: 0, has_matched: None, }; @@ -85,6 +87,7 @@ pub(crate) fn pymatch(mut state: State) -> bool { #[derive(Debug, Copy, Clone)] struct MatchContext { string_position: usize, + string_offset: usize, code_position: usize, has_matched: Option, } @@ -108,6 +111,7 @@ impl<'a> MatchContextDrive<'a> { let ctx = self.ctx(); let child_ctx = MatchContext { string_position: ctx.string_position, + string_offset: ctx.string_offset, code_position: ctx.code_position + pattern_offset, has_matched: None, }; @@ -122,9 +126,7 @@ impl<'a> MatchContextDrive<'a> { } fn str(&self) -> &str { unsafe { - std::str::from_utf8_unchecked( - &self.state.string.as_bytes()[self.ctx().string_position..], - ) + std::str::from_utf8_unchecked(&self.state.string.as_bytes()[self.ctx().string_offset..]) } } fn peek_char(&self) -> char { @@ -135,61 +137,90 @@ impl<'a> MatchContextDrive<'a> { } fn skip_char(&mut self, skip_count: usize) { let skipped = self.str().char_indices().nth(skip_count).unwrap().0; - self.ctx_mut().string_position += skipped; + self.ctx_mut().string_position += skip_count; + self.ctx_mut().string_offset += skipped; } fn skip_code(&mut self, skip_count: usize) { self.ctx_mut().code_position += skip_count; } fn remaining_chars(&self) -> usize { - let end = self.state.end; - end - self.ctx().string_position + self.str().len() + self.state.end - self.ctx().string_position } fn remaining_codes(&self) -> usize { self.state.pattern_codes.len() - self.ctx().code_position } fn at_beginning(&self) -> bool { - self.ctx().string_position == 0 + self.ctx().string_position == self.state.start } fn at_end(&self) -> bool { - self.str().is_empty() + self.ctx().string_position == self.state.end } fn at_linebreak(&self) -> bool { - match self.str().chars().next() { - Some(c) => c == '\n', - None => false, + !self.at_end() && is_linebreak(self.peek_char()) + } + fn at_boundary bool>(&self, mut word_checker: F) -> bool { + if self.at_beginning() && self.at_end() { + return false; + } + let that = !self.at_beginning() && word_checker(self.back_peek_char()); + let this = !self.at_end() && word_checker(self.peek_char()); + this != that + } + fn back_peek_offset(&self) -> usize { + let bytes = self.state.string.as_bytes(); + let mut offset = self.ctx().string_offset - 1; + if !is_utf8_first_byte(bytes[offset]) { + offset -= 1; + if !is_utf8_first_byte(bytes[offset]) { + offset -= 1; + if !is_utf8_first_byte(bytes[offset]) { + offset -= 1; + if !is_utf8_first_byte(bytes[offset]) { + panic!("not utf-8 code point"); + } + } + } + } + offset + } + fn back_peek_char(&self) -> char { + let bytes = self.state.string.as_bytes(); + let offset = self.back_peek_offset(); + let current_offset = self.ctx().string_offset; + let code = match current_offset - offset { + 1 => u32::from_ne_bytes([0, 0, 0, bytes[offset]]), + 2 => u32::from_ne_bytes([0, 0, bytes[offset], bytes[offset + 1]]), + 3 => u32::from_ne_bytes([0, bytes[offset], bytes[offset + 1], bytes[offset + 2]]), + 4 => u32::from_ne_bytes([ + bytes[offset], + bytes[offset + 1], + bytes[offset + 2], + bytes[offset + 3], + ]), + _ => unreachable!(), + }; + unsafe { std::mem::transmute(code) } + } + fn back_skip_char(&mut self, skip_count: usize) { + self.ctx_mut().string_position -= skip_count; + for _ in 0..skip_count { + self.ctx_mut().string_offset = self.back_peek_offset(); } } -} - -struct OpcodeDispatcher { - executing_contexts: HashMap>, -} - -macro_rules! once { - ($val:expr) => { - Box::new(OpEmpty {}) - }; } trait OpcodeExecutor { fn next(&mut self, drive: &mut MatchContextDrive) -> Option<()>; } -struct OpFailure {} -impl OpcodeExecutor for OpFailure { +struct OpUnimplemented {} +impl OpcodeExecutor for OpUnimplemented { fn next(&mut self, drive: &mut MatchContextDrive) -> Option<()> { drive.ctx_mut().has_matched = Some(false); None } } -struct OpEmpty {} -impl OpcodeExecutor for OpEmpty { - fn next(&mut self, drive: &mut MatchContextDrive) -> Option<()> { - None - } -} - struct OpOnce { f: Option, } @@ -204,6 +235,10 @@ fn once(f: F) -> Box> { Box::new(OpOnce { f: Some(f) }) } +fn unimplemented() -> Box { + Box::new(OpUnimplemented {}) +} + struct OpMinRepeatOne { trace_id: usize, mincount: usize, @@ -213,10 +248,11 @@ struct OpMinRepeatOne { } impl OpcodeExecutor for OpMinRepeatOne { fn next(&mut self, drive: &mut MatchContextDrive) -> Option<()> { - match self.trace_id { - 0 => self._0(drive), - _ => unreachable!(), - } + None + // match self.trace_id { + // 0 => self._0(drive), + // _ => unreachable!(), + // } } } impl Default for OpMinRepeatOne { @@ -230,75 +266,78 @@ impl Default for OpMinRepeatOne { } } } -impl OpMinRepeatOne { - fn _0(&mut self, drive: &mut MatchContextDrive) -> Option<()> { - self.mincount = drive.peek_code(2) as usize; - self.maxcount = drive.peek_code(3) as usize; +// impl OpMinRepeatOne { +// fn _0(&mut self, drive: &mut MatchContextDrive) -> Option<()> { +// self.mincount = drive.peek_code(2) as usize; +// self.maxcount = drive.peek_code(3) as usize; - if drive.remaining_chars() < self.mincount { - drive.ctx_mut().has_matched = Some(false); - return None; - } +// if drive.remaining_chars() < self.mincount { +// drive.ctx_mut().has_matched = Some(false); +// return None; +// } - drive.state.string_position = drive.ctx().string_position; +// drive.state.string_position = drive.ctx().string_position; - self.count = if self.mincount == 0 { - 0 - } else { - let count = count_repetitions(drive, self.mincount); - if count < self.mincount { - drive.ctx_mut().has_matched = Some(false); - return None; - } - drive.skip_char(count); - count - }; +// self.count = if self.mincount == 0 { +// 0 +// } else { +// let count = count_repetitions(drive, self.mincount); +// if count < self.mincount { +// drive.ctx_mut().has_matched = Some(false); +// return None; +// } +// drive.skip_char(count); +// count +// }; - if drive.peek_code(drive.peek_code(1) as usize + 1) == SreOpcode::SUCCESS as u32 { - drive.state.string_position = drive.ctx().string_position; - drive.ctx_mut().has_matched = Some(true); - return None; - } +// if drive.peek_code(drive.peek_code(1) as usize + 1) == SreOpcode::SUCCESS as u32 { +// drive.state.string_position = drive.ctx().string_position; +// drive.ctx_mut().has_matched = Some(true); +// return None; +// } - // mark push - self.trace_id = 1; - self._1(drive) - } - fn _1(&mut self, drive: &mut MatchContextDrive) -> Option<()> { - if self.maxcount == SRE_MAXREPEAT || self.count <= self.maxcount { - drive.state.string_position = drive.ctx().string_position; - self.child_ctx_id = drive.push_new_context(drive.peek_code(1) as usize + 1); - self.trace_id = 2; - return Some(()); - } +// // mark push +// self.trace_id = 1; +// self._1(drive) +// } +// fn _1(&mut self, drive: &mut MatchContextDrive) -> Option<()> { +// if self.maxcount == SRE_MAXREPEAT || self.count <= self.maxcount { +// drive.state.string_position = drive.ctx().string_position; +// self.child_ctx_id = drive.push_new_context(drive.peek_code(1) as usize + 1); +// self.trace_id = 2; +// return Some(()); +// } - // mark discard - drive.ctx_mut().has_matched = Some(false); - None - } - fn _2(&mut self, drive: &mut MatchContextDrive) -> Option<()> { - if let Some(true) = drive.state.context_stack[self.child_ctx_id].has_matched { - drive.ctx_mut().has_matched = Some(true); - return None; - } - drive.state.string_position = drive.ctx().string_position; - if count_repetitions(drive, 1) == 0 { - self.trace_id = 3; - return self._3(drive); - } - drive.skip_char(1); - self.count += 1; - // marks pop keep - self.trace_id = 1; - self._1(drive) - } - fn _3(&mut self, drive: &mut MatchContextDrive) -> Option<()> { - // mark discard - drive.ctx_mut().has_matched = Some(false); - None - } -} +// // mark discard +// drive.ctx_mut().has_matched = Some(false); +// None +// } +// fn _2(&mut self, drive: &mut MatchContextDrive) -> Option<()> { +// if let Some(true) = drive.state.context_stack[self.child_ctx_id].has_matched { +// drive.ctx_mut().has_matched = Some(true); +// return None; +// } +// drive.state.string_position = drive.ctx().string_position; +// if count_repetitions(drive, 1) == 0 { +// self.trace_id = 3; +// return self._3(drive); +// } +// drive.skip_char(1); +// self.count += 1; +// // marks pop keep +// self.trace_id = 1; +// self._1(drive) +// } +// fn _3(&mut self, drive: &mut MatchContextDrive) -> Option<()> { +// // mark discard +// drive.ctx_mut().has_matched = Some(false); +// None +// } +// } +struct OpcodeDispatcher { + executing_contexts: HashMap>, +} impl OpcodeDispatcher { fn new() -> Self { Self { @@ -313,7 +352,6 @@ impl OpcodeDispatcher { let code = drive.peek_code(0); let opcode = SreOpcode::try_from(code).unwrap(); self.dispatch(opcode, drive); - // self.drive = self.drive; } match drive.ctx().has_matched { Some(matched) => Some(matched), @@ -328,8 +366,8 @@ impl OpcodeDispatcher { // is done matching, False if it must be resumed when next encountered. fn dispatch(&mut self, opcode: SreOpcode, drive: &mut MatchContextDrive) -> bool { let mut executor = match self.executing_contexts.remove_entry(&drive.id()) { - Some((_, mut executor)) => executor, - None => self.dispatch_table(opcode, drive), + Some((_, executor)) => executor, + None => self.dispatch_table(opcode), }; if let Some(()) = executor.next(drive) { self.executing_contexts.insert(drive.id(), executor); @@ -339,71 +377,436 @@ impl OpcodeDispatcher { } } - fn dispatch_table( - &mut self, - opcode: SreOpcode, - drive: &mut MatchContextDrive, - ) -> Box { - // move || { + fn dispatch_table(&mut self, opcode: SreOpcode) -> Box { match opcode { - SreOpcode::FAILURE => { - Box::new(OpFailure {}) - } + SreOpcode::FAILURE => once(|drive| { + drive.ctx_mut().has_matched = Some(false); + }), SreOpcode::SUCCESS => once(|drive| { drive.state.string_position = drive.ctx().string_position; drive.ctx_mut().has_matched = Some(true); }), - SreOpcode::ANY => once!(true), - SreOpcode::ANY_ALL => once!(true), - SreOpcode::ASSERT => once!(true), - SreOpcode::ASSERT_NOT => once!(true), - SreOpcode::AT => once!(true), - SreOpcode::BRANCH => once!(true), - SreOpcode::CALL => once!(true), - SreOpcode::CATEGORY => once!(true), - SreOpcode::CHARSET => once!(true), - SreOpcode::BIGCHARSET => once!(true), - SreOpcode::GROUPREF => once!(true), - SreOpcode::GROUPREF_EXISTS => once!(true), - SreOpcode::GROUPREF_IGNORE => once!(true), - SreOpcode::IN => once!(true), - SreOpcode::IN_IGNORE => once!(true), - SreOpcode::INFO => once!(true), - SreOpcode::JUMP => once!(true), - SreOpcode::LITERAL => { - if drive.at_end() || drive.peek_char() as u32 != drive.peek_code(1) { + SreOpcode::ANY => once(|drive| { + if drive.at_end() || drive.at_linebreak() { drive.ctx_mut().has_matched = Some(false); } else { + drive.skip_code(1); drive.skip_char(1); } + }), + SreOpcode::ANY_ALL => once(|drive| { + if drive.at_end() { + drive.ctx_mut().has_matched = Some(false); + } else { + drive.skip_code(1); + drive.skip_char(1); + } + }), + SreOpcode::ASSERT => Box::new(OpAssert::default()), + SreOpcode::ASSERT_NOT => unimplemented(), + SreOpcode::AT => once(|drive| { + let atcode = SreAtCode::try_from(drive.peek_code(1)).unwrap(); + if !at(drive, atcode) { + drive.ctx_mut().has_matched = Some(false); + } else { + drive.skip_code(2); + } + }), + SreOpcode::BRANCH => unimplemented(), + SreOpcode::CALL => unimplemented(), + SreOpcode::CATEGORY => unimplemented(), + SreOpcode::CHARSET => unimplemented(), + SreOpcode::BIGCHARSET => unimplemented(), + SreOpcode::GROUPREF => unimplemented(), + SreOpcode::GROUPREF_EXISTS => unimplemented(), + SreOpcode::GROUPREF_IGNORE => unimplemented(), + SreOpcode::IN => unimplemented(), + SreOpcode::IN_IGNORE => unimplemented(), + SreOpcode::INFO | SreOpcode::JUMP => once(|drive| { + drive.skip_code(drive.peek_code(1) as usize + 1); + }), + SreOpcode::LITERAL => once(|drive| { + if drive.at_end() || drive.peek_char() as u32 != drive.peek_code(1) { + drive.ctx_mut().has_matched = Some(false); + } drive.skip_code(2); - once!(true) - } - SreOpcode::LITERAL_IGNORE => once!(true), - SreOpcode::MARK => once!(true), - SreOpcode::MAX_UNTIL => once!(true), - SreOpcode::MIN_UNTIL => once!(true), - SreOpcode::NOT_LITERAL => once!(true), - SreOpcode::NOT_LITERAL_IGNORE => once!(true), - SreOpcode::NEGATE => once!(true), - SreOpcode::RANGE => once!(true), - SreOpcode::REPEAT => once!(true), - SreOpcode::REPEAT_ONE => once!(true), - SreOpcode::SUBPATTERN => once!(true), + drive.skip_char(1); + }), + SreOpcode::LITERAL_IGNORE => once(|drive| { + let code = drive.peek_code(1); + let c = drive.peek_char(); + if drive.at_end() + || (c.to_ascii_lowercase() as u32 != code + && c.to_ascii_uppercase() as u32 != code) + { + drive.ctx_mut().has_matched = Some(false); + } + drive.skip_code(2); + drive.skip_char(1); + }), + SreOpcode::MARK => unimplemented(), + SreOpcode::MAX_UNTIL => unimplemented(), + SreOpcode::MIN_UNTIL => unimplemented(), + SreOpcode::NOT_LITERAL => once(|drive| { + if drive.at_end() || drive.peek_char() as u32 == drive.peek_code(1) { + drive.ctx_mut().has_matched = Some(false); + } + drive.skip_code(2); + drive.skip_char(1); + }), + SreOpcode::NOT_LITERAL_IGNORE => once(|drive| { + let code = drive.peek_code(1); + let c = drive.peek_char(); + if drive.at_end() + || (c.to_ascii_lowercase() as u32 == code + || c.to_ascii_uppercase() as u32 == code) + { + drive.ctx_mut().has_matched = Some(false); + } + drive.skip_code(2); + drive.skip_char(1); + }), + SreOpcode::NEGATE => unimplemented(), + SreOpcode::RANGE => unimplemented(), + SreOpcode::REPEAT => unimplemented(), + SreOpcode::REPEAT_ONE => unimplemented(), + SreOpcode::SUBPATTERN => unimplemented(), SreOpcode::MIN_REPEAT_ONE => Box::new(OpMinRepeatOne::default()), - SreOpcode::RANGE_IGNORE => once!(true), + SreOpcode::GROUPREF_LOC_IGNORE => unimplemented(), + SreOpcode::IN_LOC_IGNORE => unimplemented(), + SreOpcode::LITERAL_LOC_IGNORE => unimplemented(), + SreOpcode::NOT_LITERAL_LOC_IGNORE => unimplemented(), + SreOpcode::GROUPREF_UNI_IGNORE => unimplemented(), + SreOpcode::IN_UNI_IGNORE => unimplemented(), + SreOpcode::LITERAL_UNI_IGNORE => unimplemented(), + SreOpcode::NOT_LITERAL_UNI_IGNORE => unimplemented(), + SreOpcode::RANGE_UNI_IGNORE => unimplemented(), } } + + // Returns the number of repetitions of a single item, starting from the + // current string position. The code pointer is expected to point to a + // REPEAT_ONE operation (with the repeated 4 ahead). + fn count_repetitions(&mut self, drive: &mut MatchContextDrive, maxcount: usize) -> usize { + let mut count = 0; + let mut real_maxcount = drive.remaining_chars(); + if maxcount < real_maxcount && maxcount != MAXREPEAT { + real_maxcount = maxcount; + } + let code_position = drive.ctx().code_position; + let string_position = drive.ctx().string_position; + drive.skip_code(4); + let reset_position = drive.ctx().code_position; + while count < real_maxcount { + drive.ctx_mut().code_position = reset_position; + let opcode = SreOpcode::try_from(drive.peek_code(1)).unwrap(); + self.dispatch(opcode, drive); + if drive.ctx().has_matched == Some(false) { + break; + } + count += 1; + } + drive.ctx_mut().has_matched = None; + drive.ctx_mut().code_position = code_position; + drive.ctx_mut().string_position = string_position; + count + } +} + +fn at(drive: &mut MatchContextDrive, atcode: SreAtCode) -> bool { + match atcode { + SreAtCode::BEGINNING | SreAtCode::BEGINNING_STRING => drive.at_beginning(), + SreAtCode::BEGINNING_LINE => drive.at_beginning() || is_linebreak(drive.back_peek_char()), + SreAtCode::BOUNDARY => drive.at_boundary(is_word), + SreAtCode::NON_BOUNDARY => !drive.at_boundary(is_word), + SreAtCode::END => (drive.remaining_chars() == 1 && drive.at_linebreak()) || drive.at_end(), + SreAtCode::END_LINE => drive.at_linebreak() || drive.at_end(), + SreAtCode::END_STRING => drive.at_end(), + SreAtCode::LOC_BOUNDARY => drive.at_boundary(is_loc_word), + SreAtCode::LOC_NON_BOUNDARY => !drive.at_boundary(is_loc_word), + SreAtCode::UNI_BOUNDARY => drive.at_boundary(is_uni_word), + SreAtCode::UNI_NON_BOUNDARY => !drive.at_boundary(is_uni_word), + } +} + +fn category(catcode: SreCatCode, c: char) -> bool { + match catcode { + SreCatCode::DIGIT => is_digit(c), + SreCatCode::NOT_DIGIT => !is_digit(c), + SreCatCode::SPACE => is_space(c), + SreCatCode::NOT_SPACE => !is_space(c), + SreCatCode::WORD => is_word(c), + SreCatCode::NOT_WORD => !is_word(c), + SreCatCode::LINEBREAK => is_linebreak(c), + SreCatCode::NOT_LINEBREAK => !is_linebreak(c), + SreCatCode::LOC_WORD => is_loc_word(c), + SreCatCode::LOC_NOT_WORD => !is_loc_word(c), + SreCatCode::UNI_DIGIT => is_uni_digit(c), + SreCatCode::UNI_NOT_DIGIT => !is_uni_digit(c), + SreCatCode::UNI_SPACE => is_uni_space(c), + SreCatCode::UNI_NOT_SPACE => !is_uni_space(c), + SreCatCode::UNI_WORD => is_uni_word(c), + SreCatCode::UNI_NOT_WORD => !is_uni_word(c), + SreCatCode::UNI_LINEBREAK => is_uni_linebreak(c), + SreCatCode::UNI_NOT_LINEBREAK => !is_uni_linebreak(c), + } +} + +fn charset(set: &[u32], c: char) -> bool { + /* check if character is a member of the given set */ + let ch = c as u32; + let mut ok = true; + let mut i = 0; + while i < set.len() { + let opcode = match SreOpcode::try_from(set[i]) { + Ok(code) => code, + Err(_) => { + break; + } + }; + match opcode { + SreOpcode::FAILURE => { + return !ok; + } + SreOpcode::CATEGORY => { + /* */ + let catcode = match SreCatCode::try_from(set[i + 1]) { + Ok(code) => code, + Err(_) => { + break; + } + }; + if category(catcode, c) { + return ok; + } + i += 2; + } + SreOpcode::CHARSET => { + /* */ + if ch < 256 && (set[(ch / 32) as usize] & (1 << (32 - 1))) != 0 { + return ok; + } + i += 8; + } + SreOpcode::BIGCHARSET => { + /* <256 blockindices> */ + let count = set[i + 1]; + if ch < 0x10000 { + let blockindices: &[u8] = unsafe { std::mem::transmute(&set[i + 2..]) }; + let block = blockindices[(ch >> 8) as usize]; + if set[2 + 64 + ((block as u32 * 256 + (ch & 255)) / 32) as usize] + & (1 << (ch & (32 - 1))) + != 0 + { + return ok; + } + } + i += 2 + 64 + count as usize * 8; + } + SreOpcode::LITERAL => { + /* */ + if ch == set[i + 1] { + return ok; + } + i += 2; + } + SreOpcode::NEGATE => { + ok = !ok; + i += 1; + } + SreOpcode::RANGE => { + /* */ + if set[i + 1] <= ch && ch <= set[i + 2] { + return ok; + } + i += 3; + } + SreOpcode::RANGE_UNI_IGNORE => { + /* */ + if set[i + 1] <= ch && ch <= set[i + 2] { + return ok; + } + let ch = upper_unicode(c) as u32; + if set[i + 1] <= ch && ch <= set[i + 2] { + return ok; + } + i += 3; + } + _ => { + break; + } + } + } + /* internal error -- there's not much we can do about it + here, so let's just pretend it didn't match... */ + false +} + +fn count(drive: MatchContextDrive, maxcount: usize) -> usize { + let string_position = drive.state.string_position; + let maxcount = std::cmp::min(maxcount, drive.remaining_chars()); + + let opcode = SreOpcode::try_from(drive.peek_code(1)).unwrap(); + match opcode { + SreOpcode::FAILURE => {} + SreOpcode::SUCCESS => {} + SreOpcode::ANY => {} + SreOpcode::ANY_ALL => {} + SreOpcode::ASSERT => {} + SreOpcode::ASSERT_NOT => {} + SreOpcode::AT => {} + SreOpcode::BRANCH => {} + SreOpcode::CALL => {} + SreOpcode::CATEGORY => {} + SreOpcode::CHARSET => {} + SreOpcode::BIGCHARSET => {} + SreOpcode::GROUPREF => {} + SreOpcode::GROUPREF_EXISTS => {} + SreOpcode::IN => { + } + SreOpcode::INFO => {} + SreOpcode::JUMP => {} + SreOpcode::LITERAL => {} + SreOpcode::MARK => {} + SreOpcode::MAX_UNTIL => {} + SreOpcode::MIN_UNTIL => {} + SreOpcode::NOT_LITERAL => {} + SreOpcode::NEGATE => {} + SreOpcode::RANGE => {} + SreOpcode::REPEAT => {} + SreOpcode::REPEAT_ONE => {} + SreOpcode::SUBPATTERN => {} + SreOpcode::MIN_REPEAT_ONE => {} + SreOpcode::GROUPREF_IGNORE => {} + SreOpcode::IN_IGNORE => {} + SreOpcode::LITERAL_IGNORE => {} + SreOpcode::NOT_LITERAL_IGNORE => {} + SreOpcode::GROUPREF_LOC_IGNORE => {} + SreOpcode::IN_LOC_IGNORE => {} + SreOpcode::LITERAL_LOC_IGNORE => {} + SreOpcode::NOT_LITERAL_LOC_IGNORE => {} + SreOpcode::GROUPREF_UNI_IGNORE => {} + SreOpcode::IN_UNI_IGNORE => {} + SreOpcode::LITERAL_UNI_IGNORE => {} + SreOpcode::NOT_LITERAL_UNI_IGNORE => {} + SreOpcode::RANGE_UNI_IGNORE => {} + } +} + +fn eq_loc_ignore(code: u32, c: char) -> bool { + code == c as u32 || code == lower_locate(c) as u32 || code == upper_locate(c) as u32 } -// Returns the number of repetitions of a single item, starting from the -// current string position. The code pointer is expected to point to a -// REPEAT_ONE operation (with the repeated 4 ahead). -fn count_repetitions(drive: &mut MatchContextDrive, maxcount: usize) -> usize { - let mut count = 0; - let mut real_maxcount = drive.state.end - drive.ctx().string_position; - if maxcount < real_maxcount && maxcount != SRE_MAXREPEAT { - real_maxcount = maxcount; +fn is_word(c: char) -> bool { + c.is_ascii_alphanumeric() || c == '_' +} +fn is_space(c: char) -> bool { + c.is_ascii_whitespace() +} +fn is_digit(c: char) -> bool { + c.is_ascii_digit() +} +fn is_loc_alnum(c: char) -> bool { + // TODO: check with cpython + c.is_alphanumeric() +} +fn is_loc_word(c: char) -> bool { + is_loc_alnum(c) || c == '_' +} +fn is_linebreak(c: char) -> bool { + c == '\n' +} +pub(crate) fn lower_ascii(c: char) -> char { + c.to_ascii_lowercase() +} +fn lower_locate(c: char) -> char { + // TODO: check with cpython + // https://doc.rust-lang.org/std/primitive.char.html#method.to_lowercase + c.to_lowercase().next().unwrap() +} +fn upper_locate(c: char) -> char { + // TODO: check with cpython + // https://doc.rust-lang.org/std/primitive.char.html#method.to_uppercase + c.to_uppercase().next().unwrap() +} +fn is_uni_digit(c: char) -> bool { + // TODO: check with cpython + c.is_digit(10) +} +fn is_uni_space(c: char) -> bool { + // TODO: check with cpython + c.is_whitespace() +} +fn is_uni_linebreak(c: char) -> bool { + match c { + '\u{000A}' | '\u{000B}' | '\u{000C}' | '\u{000D}' | '\u{001C}' | '\u{001D}' + | '\u{001E}' | '\u{0085}' | '\u{2028}' | '\u{2029}' => true, + _ => false, + } +} +fn is_uni_alnum(c: char) -> bool { + // TODO: check with cpython + c.is_alphanumeric() +} +fn is_uni_word(c: char) -> bool { + is_uni_alnum(c) || c == '_' +} +pub(crate) fn lower_unicode(c: char) -> char { + // TODO: check with cpython + c.to_lowercase().next().unwrap() +} +pub(crate) fn upper_unicode(c: char) -> char { + // TODO: check with cpython + c.to_uppercase().next().unwrap() +} + +fn is_utf8_first_byte(b: u8) -> bool { + // In UTF-8, there are three kinds of byte... + // 0xxxxxxx : ASCII + // 10xxxxxx : 2nd, 3rd or 4th byte of code + // 11xxxxxx : 1st byte of multibyte code + (b & 0b10000000 == 0) || (b & 0b11000000 == 0b11000000) +} + +struct OpAssert { + child_ctx_id: usize, + jump_id: usize, +} +impl Default for OpAssert { + fn default() -> Self { + OpAssert { + child_ctx_id: 0, + jump_id: 0, + } + } +} +impl OpcodeExecutor for OpAssert { + fn next(&mut self, drive: &mut MatchContextDrive) -> Option<()> { + match self.jump_id { + 0 => self._0(drive), + 1 => self._1(drive), + _ => unreachable!(), + } + } +} +impl OpAssert { + fn _0(&mut self, drive: &mut MatchContextDrive) -> Option<()> { + let back = drive.peek_code(2) as usize; + if back > drive.ctx().string_position { + drive.ctx_mut().has_matched = Some(false); + return None; + } + drive.state.string_position = drive.ctx().string_position - back; + self.child_ctx_id = drive.push_new_context(3); + self.jump_id = 1; + Some(()) + } + fn _1(&mut self, drive: &mut MatchContextDrive) -> Option<()> { + if drive.state.context_stack[self.child_ctx_id].has_matched == Some(true) { + drive.skip_code(drive.peek_code(1) as usize + 1); + } else { + drive.ctx_mut().has_matched = Some(false); + } + None } - count } From 4e03fb361f731116d392cfdda0820353454fcaac Mon Sep 17 00:00:00 2001 From: Kangzhi Shi Date: Fri, 25 Dec 2020 16:31:22 +0200 Subject: [PATCH 004/960] upgrade sre_parse.py; impl marks count --- interp.rs | 633 +++++++++++++++++++++++++++++++----------------------- 1 file changed, 365 insertions(+), 268 deletions(-) diff --git a/interp.rs b/interp.rs index bc6753a5fe..8e0968cbf5 100644 --- a/interp.rs +++ b/interp.rs @@ -5,6 +5,7 @@ use super::constants::{SreAtCode, SreCatCode, SreFlag, SreOpcode}; use std::collections::HashMap; use std::convert::TryFrom; +#[derive(Debug)] pub struct State<'a> { string: &'a str, // chars count @@ -13,9 +14,9 @@ pub struct State<'a> { end: usize, flags: SreFlag, pattern_codes: Vec, - marks: Vec, + marks: Vec>, lastindex: isize, - marks_stack: Vec, + marks_stack: Vec<(Vec>, isize)>, context_stack: Vec, repeat: Option, string_position: usize, @@ -55,12 +56,47 @@ impl<'a> State<'a> { self.context_stack.clear(); self.repeat = None; } + + fn set_mark(&mut self, mark_nr: usize, position: usize) { + if mark_nr & 1 != 0 { + self.lastindex = mark_nr as isize / 2 + 1; + } + if mark_nr >= self.marks.len() { + self.marks.resize(mark_nr + 1, None); + } + self.marks[mark_nr] = Some(position); + } + fn get_marks(&self, group_index: usize) -> (Option, Option) { + let marks_index = 2 * group_index; + if marks_index + 1 < self.marks.len() { + (self.marks[marks_index], self.marks[marks_index + 1]) + } else { + (None, None) + } + } + fn marks_push(&mut self) { + self.marks_stack.push(self.marks.clone(), self.lastindex); + } + fn marks_pop(&mut self) { + (self.marks, self.lastindex) = self.marks_stack.pop().unwrap(); + } + fn marks_pop_keep(&mut self) { + (self.marks, self.lastindex) = self.marks_stack.last().unwrap(); + } + fn marks_pop_discard(&mut self) { + self.marks_stack.pop(); + } } pub(crate) fn pymatch(mut state: State) -> bool { let ctx = MatchContext { string_position: state.start, - string_offset: state.string.char_indices().nth(state.start).unwrap().0, + string_offset: state + .string + .char_indices() + .nth(state.start) + .map(|x| x.0) + .unwrap_or(0), code_position: 0, has_matched: None, }; @@ -72,7 +108,7 @@ pub(crate) fn pymatch(mut state: State) -> bool { break; } let ctx_id = state.context_stack.len() - 1; - let mut drive = MatchContextDrive::drive(ctx_id, state); + let mut drive = StackDrive::drive(ctx_id, state); let mut dispatcher = OpcodeDispatcher::new(); has_matched = dispatcher.pymatch(&mut drive); @@ -92,68 +128,52 @@ struct MatchContext { has_matched: Option, } -struct MatchContextDrive<'a> { - state: State<'a>, - ctx_id: usize, -} - -impl<'a> MatchContextDrive<'a> { - fn id(&self) -> usize { - self.ctx_id - } - fn ctx_mut(&mut self) -> &mut MatchContext { - &mut self.state.context_stack[self.ctx_id] - } - fn ctx(&self) -> &MatchContext { - &self.state.context_stack[self.ctx_id] - } - fn push_new_context(&mut self, pattern_offset: usize) -> usize { - let ctx = self.ctx(); - let child_ctx = MatchContext { - string_position: ctx.string_position, - string_offset: ctx.string_offset, - code_position: ctx.code_position + pattern_offset, - has_matched: None, - }; - self.state.context_stack.push(child_ctx); - self.state.context_stack.len() - 1 - } - fn drive(ctx_id: usize, state: State<'a>) -> Self { - Self { state, ctx_id } - } - fn take(self) -> State<'a> { - self.state - } +trait MatchContextDrive { + fn ctx_mut(&mut self) -> &mut MatchContext; + fn ctx(&self) -> &MatchContext; + fn state(&self) -> &State; fn str(&self) -> &str { unsafe { - std::str::from_utf8_unchecked(&self.state.string.as_bytes()[self.ctx().string_offset..]) + std::str::from_utf8_unchecked( + &self.state().string.as_bytes()[self.ctx().string_offset..], + ) } } + fn pattern(&self) -> &[u32] { + &self.state().pattern_codes[self.ctx().code_position..] + } fn peek_char(&self) -> char { self.str().chars().next().unwrap() } fn peek_code(&self, peek: usize) -> u32 { - self.state.pattern_codes[self.ctx().code_position + peek] + self.state().pattern_codes[self.ctx().code_position + peek] } fn skip_char(&mut self, skip_count: usize) { - let skipped = self.str().char_indices().nth(skip_count).unwrap().0; - self.ctx_mut().string_position += skip_count; - self.ctx_mut().string_offset += skipped; + match self.str().char_indices().nth(skip_count).map(|x| x.0) { + Some(skipped) => { + self.ctx_mut().string_position += skip_count; + self.ctx_mut().string_offset += skipped; + } + None => { + self.ctx_mut().string_position = self.state().end; + self.ctx_mut().string_offset = self.state().string.len(); // bytes len + } + } } fn skip_code(&mut self, skip_count: usize) { self.ctx_mut().code_position += skip_count; } fn remaining_chars(&self) -> usize { - self.state.end - self.ctx().string_position + self.state().end - self.ctx().string_position } fn remaining_codes(&self) -> usize { - self.state.pattern_codes.len() - self.ctx().code_position + self.state().pattern_codes.len() - self.ctx().code_position } fn at_beginning(&self) -> bool { - self.ctx().string_position == self.state.start + self.ctx().string_position == self.state().start } fn at_end(&self) -> bool { - self.ctx().string_position == self.state.end + self.ctx().string_position == self.state().end } fn at_linebreak(&self) -> bool { !self.at_end() && is_linebreak(self.peek_char()) @@ -167,7 +187,7 @@ impl<'a> MatchContextDrive<'a> { this != that } fn back_peek_offset(&self) -> usize { - let bytes = self.state.string.as_bytes(); + let bytes = self.state().string.as_bytes(); let mut offset = self.ctx().string_offset - 1; if !is_utf8_first_byte(bytes[offset]) { offset -= 1; @@ -184,7 +204,7 @@ impl<'a> MatchContextDrive<'a> { offset } fn back_peek_char(&self) -> char { - let bytes = self.state.string.as_bytes(); + let bytes = self.state().string.as_bytes(); let offset = self.back_peek_offset(); let current_offset = self.ctx().string_offset; let code = match current_offset - offset { @@ -209,13 +229,74 @@ impl<'a> MatchContextDrive<'a> { } } +struct StackDrive<'a> { + state: State<'a>, + ctx_id: usize, +} +impl<'a> StackDrive<'a> { + fn id(&self) -> usize { + self.ctx_id + } + fn drive(ctx_id: usize, state: State<'a>) -> Self { + Self { state, ctx_id } + } + fn take(self) -> State<'a> { + self.state + } + fn push_new_context(&mut self, pattern_offset: usize) -> usize { + let ctx = self.ctx(); + let child_ctx = MatchContext { + string_position: ctx.string_position, + string_offset: ctx.string_offset, + code_position: ctx.code_position + pattern_offset, + has_matched: None, + }; + self.state.context_stack.push(child_ctx); + self.state.context_stack.len() - 1 + } +} +impl MatchContextDrive for StackDrive<'_> { + fn ctx_mut(&mut self) -> &mut MatchContext { + &mut self.state.context_stack[self.ctx_id] + } + fn ctx(&self) -> &MatchContext { + &self.state.context_stack[self.ctx_id] + } + fn state(&self) -> &State { + &self.state + } +} + +struct WrapDrive<'a> { + stack_drive: &'a StackDrive<'a>, + ctx: MatchContext, +} +impl<'a> WrapDrive<'a> { + fn drive(ctx: MatchContext, stack_drive: &'a StackDrive<'a>) -> Self { + Self { stack_drive, ctx } + } +} +impl MatchContextDrive for WrapDrive<'_> { + fn ctx_mut(&mut self) -> &mut MatchContext { + &mut self.ctx + } + + fn ctx(&self) -> &MatchContext { + &self.ctx + } + + fn state(&self) -> &State { + self.stack_drive.state() + } +} + trait OpcodeExecutor { - fn next(&mut self, drive: &mut MatchContextDrive) -> Option<()>; + fn next(&mut self, drive: &mut StackDrive) -> Option<()>; } struct OpUnimplemented {} impl OpcodeExecutor for OpUnimplemented { - fn next(&mut self, drive: &mut MatchContextDrive) -> Option<()> { + fn next(&mut self, drive: &mut StackDrive) -> Option<()> { drive.ctx_mut().has_matched = Some(false); None } @@ -224,14 +305,14 @@ impl OpcodeExecutor for OpUnimplemented { struct OpOnce { f: Option, } -impl OpcodeExecutor for OpOnce { - fn next(&mut self, drive: &mut MatchContextDrive) -> Option<()> { +impl OpcodeExecutor for OpOnce { + fn next(&mut self, drive: &mut StackDrive) -> Option<()> { let f = self.f.take()?; f(drive); None } } -fn once(f: F) -> Box> { +fn once(f: F) -> Box> { Box::new(OpOnce { f: Some(f) }) } @@ -239,102 +320,6 @@ fn unimplemented() -> Box { Box::new(OpUnimplemented {}) } -struct OpMinRepeatOne { - trace_id: usize, - mincount: usize, - maxcount: usize, - count: usize, - child_ctx_id: usize, -} -impl OpcodeExecutor for OpMinRepeatOne { - fn next(&mut self, drive: &mut MatchContextDrive) -> Option<()> { - None - // match self.trace_id { - // 0 => self._0(drive), - // _ => unreachable!(), - // } - } -} -impl Default for OpMinRepeatOne { - fn default() -> Self { - OpMinRepeatOne { - trace_id: 0, - mincount: 0, - maxcount: 0, - count: 0, - child_ctx_id: 0, - } - } -} -// impl OpMinRepeatOne { -// fn _0(&mut self, drive: &mut MatchContextDrive) -> Option<()> { -// self.mincount = drive.peek_code(2) as usize; -// self.maxcount = drive.peek_code(3) as usize; - -// if drive.remaining_chars() < self.mincount { -// drive.ctx_mut().has_matched = Some(false); -// return None; -// } - -// drive.state.string_position = drive.ctx().string_position; - -// self.count = if self.mincount == 0 { -// 0 -// } else { -// let count = count_repetitions(drive, self.mincount); -// if count < self.mincount { -// drive.ctx_mut().has_matched = Some(false); -// return None; -// } -// drive.skip_char(count); -// count -// }; - -// if drive.peek_code(drive.peek_code(1) as usize + 1) == SreOpcode::SUCCESS as u32 { -// drive.state.string_position = drive.ctx().string_position; -// drive.ctx_mut().has_matched = Some(true); -// return None; -// } - -// // mark push -// self.trace_id = 1; -// self._1(drive) -// } -// fn _1(&mut self, drive: &mut MatchContextDrive) -> Option<()> { -// if self.maxcount == SRE_MAXREPEAT || self.count <= self.maxcount { -// drive.state.string_position = drive.ctx().string_position; -// self.child_ctx_id = drive.push_new_context(drive.peek_code(1) as usize + 1); -// self.trace_id = 2; -// return Some(()); -// } - -// // mark discard -// drive.ctx_mut().has_matched = Some(false); -// None -// } -// fn _2(&mut self, drive: &mut MatchContextDrive) -> Option<()> { -// if let Some(true) = drive.state.context_stack[self.child_ctx_id].has_matched { -// drive.ctx_mut().has_matched = Some(true); -// return None; -// } -// drive.state.string_position = drive.ctx().string_position; -// if count_repetitions(drive, 1) == 0 { -// self.trace_id = 3; -// return self._3(drive); -// } -// drive.skip_char(1); -// self.count += 1; -// // marks pop keep -// self.trace_id = 1; -// self._1(drive) -// } -// fn _3(&mut self, drive: &mut MatchContextDrive) -> Option<()> { -// // mark discard -// drive.ctx_mut().has_matched = Some(false); -// None -// } -// } - struct OpcodeDispatcher { executing_contexts: HashMap>, } @@ -347,7 +332,7 @@ impl OpcodeDispatcher { // Returns True if the current context matches, False if it doesn't and // None if matching is not finished, ie must be resumed after child // contexts have been matched. - fn pymatch(&mut self, drive: &mut MatchContextDrive) -> Option { + fn pymatch(&mut self, drive: &mut StackDrive) -> Option { while drive.remaining_codes() > 0 && drive.ctx().has_matched.is_none() { let code = drive.peek_code(0); let opcode = SreOpcode::try_from(code).unwrap(); @@ -364,7 +349,7 @@ impl OpcodeDispatcher { // Dispatches a context on a given opcode. Returns True if the context // is done matching, False if it must be resumed when next encountered. - fn dispatch(&mut self, opcode: SreOpcode, drive: &mut MatchContextDrive) -> bool { + fn dispatch(&mut self, opcode: SreOpcode, drive: &mut StackDrive) -> bool { let mut executor = match self.executing_contexts.remove_entry(&drive.id()) { Some((_, executor)) => executor, None => self.dispatch_table(opcode), @@ -414,58 +399,67 @@ impl OpcodeDispatcher { }), SreOpcode::BRANCH => unimplemented(), SreOpcode::CALL => unimplemented(), - SreOpcode::CATEGORY => unimplemented(), - SreOpcode::CHARSET => unimplemented(), - SreOpcode::BIGCHARSET => unimplemented(), + SreOpcode::CATEGORY => once(|drive| { + let catcode = SreCatCode::try_from(drive.peek_code(1)).unwrap(); + if drive.at_end() || !category(catcode, drive.peek_char()) { + drive.ctx_mut().has_matched = Some(false); + } else { + drive.skip_code(2); + drive.skip_char(1); + } + }), + SreOpcode::CHARSET | SreOpcode::BIGCHARSET => unreachable!("unexpected opcode"), SreOpcode::GROUPREF => unimplemented(), SreOpcode::GROUPREF_EXISTS => unimplemented(), SreOpcode::GROUPREF_IGNORE => unimplemented(), - SreOpcode::IN => unimplemented(), - SreOpcode::IN_IGNORE => unimplemented(), + SreOpcode::IN => once(|drive| { + general_op_in(drive, |x| x); + }), + SreOpcode::IN_IGNORE => once(|drive| { + general_op_in(drive, lower_ascii); + }), + SreOpcode::IN_UNI_IGNORE => once(|drive| { + general_op_in(drive, lower_unicode); + }), + SreOpcode::IN_LOC_IGNORE => once(|drive| { + let skip = drive.peek_code(1) as usize; + if drive.at_end() || !charset_loc_ignore(&drive.pattern()[1..], drive.peek_char()) { + drive.ctx_mut().has_matched = Some(false); + } else { + drive.skip_code(skip + 1); + drive.skip_char(1); + } + }), SreOpcode::INFO | SreOpcode::JUMP => once(|drive| { drive.skip_code(drive.peek_code(1) as usize + 1); }), SreOpcode::LITERAL => once(|drive| { - if drive.at_end() || drive.peek_char() as u32 != drive.peek_code(1) { - drive.ctx_mut().has_matched = Some(false); - } - drive.skip_code(2); - drive.skip_char(1); + general_op_literal(drive, |code, c| code == c as u32); + }), + SreOpcode::NOT_LITERAL => once(|drive| { + general_op_literal(drive, |code, c| code != c as u32); }), SreOpcode::LITERAL_IGNORE => once(|drive| { - let code = drive.peek_code(1); - let c = drive.peek_char(); - if drive.at_end() - || (c.to_ascii_lowercase() as u32 != code - && c.to_ascii_uppercase() as u32 != code) - { - drive.ctx_mut().has_matched = Some(false); - } - drive.skip_code(2); - drive.skip_char(1); + general_op_literal(drive, |code, c| code == lower_ascii(c) as u32); + }), + SreOpcode::NOT_LITERAL_IGNORE => once(|drive| { + general_op_literal(drive, |code, c| code != lower_ascii(c) as u32); + }), + SreOpcode::LITERAL_UNI_IGNORE => once(|drive| { + general_op_literal(drive, |code, c| code == lower_unicode(c) as u32); + }), + SreOpcode::NOT_LITERAL_UNI_IGNORE => once(|drive| { + general_op_literal(drive, |code, c| code != lower_unicode(c) as u32); + }), + SreOpcode::LITERAL_LOC_IGNORE => once(|drive| { + general_op_literal(drive, |code, c| char_loc_ignore(code, c)); + }), + SreOpcode::NOT_LITERAL_LOC_IGNORE => once(|drive| { + general_op_literal(drive, |code, c| !char_loc_ignore(code, c)); }), SreOpcode::MARK => unimplemented(), SreOpcode::MAX_UNTIL => unimplemented(), SreOpcode::MIN_UNTIL => unimplemented(), - SreOpcode::NOT_LITERAL => once(|drive| { - if drive.at_end() || drive.peek_char() as u32 == drive.peek_code(1) { - drive.ctx_mut().has_matched = Some(false); - } - drive.skip_code(2); - drive.skip_char(1); - }), - SreOpcode::NOT_LITERAL_IGNORE => once(|drive| { - let code = drive.peek_code(1); - let c = drive.peek_char(); - if drive.at_end() - || (c.to_ascii_lowercase() as u32 == code - || c.to_ascii_uppercase() as u32 == code) - { - drive.ctx_mut().has_matched = Some(false); - } - drive.skip_code(2); - drive.skip_char(1); - }), SreOpcode::NEGATE => unimplemented(), SreOpcode::RANGE => unimplemented(), SreOpcode::REPEAT => unimplemented(), @@ -473,47 +467,45 @@ impl OpcodeDispatcher { SreOpcode::SUBPATTERN => unimplemented(), SreOpcode::MIN_REPEAT_ONE => Box::new(OpMinRepeatOne::default()), SreOpcode::GROUPREF_LOC_IGNORE => unimplemented(), - SreOpcode::IN_LOC_IGNORE => unimplemented(), - SreOpcode::LITERAL_LOC_IGNORE => unimplemented(), - SreOpcode::NOT_LITERAL_LOC_IGNORE => unimplemented(), SreOpcode::GROUPREF_UNI_IGNORE => unimplemented(), - SreOpcode::IN_UNI_IGNORE => unimplemented(), - SreOpcode::LITERAL_UNI_IGNORE => unimplemented(), - SreOpcode::NOT_LITERAL_UNI_IGNORE => unimplemented(), SreOpcode::RANGE_UNI_IGNORE => unimplemented(), } } +} - // Returns the number of repetitions of a single item, starting from the - // current string position. The code pointer is expected to point to a - // REPEAT_ONE operation (with the repeated 4 ahead). - fn count_repetitions(&mut self, drive: &mut MatchContextDrive, maxcount: usize) -> usize { - let mut count = 0; - let mut real_maxcount = drive.remaining_chars(); - if maxcount < real_maxcount && maxcount != MAXREPEAT { - real_maxcount = maxcount; - } - let code_position = drive.ctx().code_position; - let string_position = drive.ctx().string_position; - drive.skip_code(4); - let reset_position = drive.ctx().code_position; - while count < real_maxcount { - drive.ctx_mut().code_position = reset_position; - let opcode = SreOpcode::try_from(drive.peek_code(1)).unwrap(); - self.dispatch(opcode, drive); - if drive.ctx().has_matched == Some(false) { - break; - } - count += 1; - } - drive.ctx_mut().has_matched = None; - drive.ctx_mut().code_position = code_position; - drive.ctx_mut().string_position = string_position; - count +fn char_loc_ignore(code: u32, c: char) -> bool { + code == c as u32 || code == lower_locate(c) as u32 || code == upper_locate(c) as u32 +} + +fn charset_loc_ignore(set: &[u32], c: char) -> bool { + let lo = lower_locate(c); + if charset(set, c) { + return true; } + let up = upper_locate(c); + up != lo && charset(set, up) } -fn at(drive: &mut MatchContextDrive, atcode: SreAtCode) -> bool { +fn general_op_literal bool>(drive: &mut StackDrive, f: F) { + if drive.at_end() || !f(drive.peek_code(1), drive.peek_char()) { + drive.ctx_mut().has_matched = Some(false); + } else { + drive.skip_code(2); + drive.skip_char(1); + } +} + +fn general_op_in char>(drive: &mut StackDrive, f: F) { + let skip = drive.peek_code(1) as usize; + if drive.at_end() || !charset(&drive.pattern()[1..], f(drive.peek_char())) { + drive.ctx_mut().has_matched = Some(false); + } else { + drive.skip_code(skip + 1); + drive.skip_char(1); + } +} + +fn at(drive: &StackDrive, atcode: SreAtCode) -> bool { match atcode { SreAtCode::BEGINNING | SreAtCode::BEGINNING_STRING => drive.at_beginning(), SreAtCode::BEGINNING_LINE => drive.at_beginning() || is_linebreak(drive.back_peek_char()), @@ -642,54 +634,67 @@ fn charset(set: &[u32], c: char) -> bool { false } -fn count(drive: MatchContextDrive, maxcount: usize) -> usize { - let string_position = drive.state.string_position; +fn count(stack_drive: &StackDrive, maxcount: usize) -> usize { + let drive = WrapDrive::drive(stack_drive.ctx().clone(), stack_drive); let maxcount = std::cmp::min(maxcount, drive.remaining_chars()); + let opcode = match SreOpcode::try_from(drive.peek_code(1)) { + Ok(code) => code, + Err(_) => { + panic!("FIXME:COUNT1"); + } + }; - let opcode = SreOpcode::try_from(drive.peek_code(1)).unwrap(); match opcode { - SreOpcode::FAILURE => {} - SreOpcode::SUCCESS => {} - SreOpcode::ANY => {} - SreOpcode::ANY_ALL => {} - SreOpcode::ASSERT => {} - SreOpcode::ASSERT_NOT => {} - SreOpcode::AT => {} - SreOpcode::BRANCH => {} - SreOpcode::CALL => {} - SreOpcode::CATEGORY => {} - SreOpcode::CHARSET => {} - SreOpcode::BIGCHARSET => {} - SreOpcode::GROUPREF => {} - SreOpcode::GROUPREF_EXISTS => {} + SreOpcode::ANY => { + while !drive.at_end() && !drive.at_linebreak() { + drive.skip_char(1); + } + } + SreOpcode::ANY_ALL => { + drive.skip_char(drive.remaining_chars()); + } SreOpcode::IN => { + // TODO: pattern[2 or 1..]? + while !drive.at_end() && charset(&drive.pattern()[2..], drive.peek_char()) { + drive.skip_char(1); + } + } + SreOpcode::LITERAL => { + general_count_literal(drive, |code, c| code == c as u32); + } + SreOpcode::NOT_LITERAL => { + general_count_literal(drive, |code, c| code != c as u32); + } + SreOpcode::LITERAL_IGNORE => { + general_count_literal(drive, |code, c| code == lower_ascii(c) as u32); + } + SreOpcode::NOT_LITERAL_IGNORE => { + general_count_literal(drive, |code, c| code != lower_ascii(c) as u32); + } + SreOpcode::LITERAL_LOC_IGNORE => { + general_count_literal(drive, |code, c| char_loc_ignore(code, c)); + } + SreOpcode::NOT_LITERAL_LOC_IGNORE => { + general_count_literal(drive, |code, c| !char_loc_ignore(code, c)); + } + SreOpcode::LITERAL_UNI_IGNORE => { + general_count_literal(drive, |code, c| code == lower_unicode(c) as u32); + } + SreOpcode::NOT_LITERAL_UNI_IGNORE => { + general_count_literal(drive, |code, c| code != lower_unicode(c) as u32); } - SreOpcode::INFO => {} - SreOpcode::JUMP => {} - SreOpcode::LITERAL => {} - SreOpcode::MARK => {} - SreOpcode::MAX_UNTIL => {} - SreOpcode::MIN_UNTIL => {} - SreOpcode::NOT_LITERAL => {} - SreOpcode::NEGATE => {} - SreOpcode::RANGE => {} - SreOpcode::REPEAT => {} - SreOpcode::REPEAT_ONE => {} - SreOpcode::SUBPATTERN => {} - SreOpcode::MIN_REPEAT_ONE => {} - SreOpcode::GROUPREF_IGNORE => {} - SreOpcode::IN_IGNORE => {} - SreOpcode::LITERAL_IGNORE => {} - SreOpcode::NOT_LITERAL_IGNORE => {} - SreOpcode::GROUPREF_LOC_IGNORE => {} - SreOpcode::IN_LOC_IGNORE => {} - SreOpcode::LITERAL_LOC_IGNORE => {} - SreOpcode::NOT_LITERAL_LOC_IGNORE => {} - SreOpcode::GROUPREF_UNI_IGNORE => {} - SreOpcode::IN_UNI_IGNORE => {} - SreOpcode::LITERAL_UNI_IGNORE => {} - SreOpcode::NOT_LITERAL_UNI_IGNORE => {} - SreOpcode::RANGE_UNI_IGNORE => {} + _ => { + panic!("TODO: Not Implemented."); + } + } + + drive.ctx().string_position - stack_drive.ctx().string_position +} + +fn general_count_literal bool>(drive: &mut WrapDrive, f: F) { + let ch = drive.peek_code(1); + while !drive.at_end() && f(ch, drive.peek_char()) { + drive.skip_char(1); } } @@ -781,7 +786,7 @@ impl Default for OpAssert { } } impl OpcodeExecutor for OpAssert { - fn next(&mut self, drive: &mut MatchContextDrive) -> Option<()> { + fn next(&mut self, drive: &mut StackDrive) -> Option<()> { match self.jump_id { 0 => self._0(drive), 1 => self._1(drive), @@ -790,7 +795,7 @@ impl OpcodeExecutor for OpAssert { } } impl OpAssert { - fn _0(&mut self, drive: &mut MatchContextDrive) -> Option<()> { + fn _0(&mut self, drive: &mut StackDrive) -> Option<()> { let back = drive.peek_code(2) as usize; if back > drive.ctx().string_position { drive.ctx_mut().has_matched = Some(false); @@ -801,7 +806,7 @@ impl OpAssert { self.jump_id = 1; Some(()) } - fn _1(&mut self, drive: &mut MatchContextDrive) -> Option<()> { + fn _1(&mut self, drive: &mut StackDrive) -> Option<()> { if drive.state.context_stack[self.child_ctx_id].has_matched == Some(true) { drive.skip_code(drive.peek_code(1) as usize + 1); } else { @@ -810,3 +815,95 @@ impl OpAssert { None } } + +struct OpMinRepeatOne { + jump_id: usize, + mincount: usize, + maxcount: usize, + count: usize, + child_ctx_id: usize, +} +impl OpcodeExecutor for OpMinRepeatOne { + fn next(&mut self, drive: &mut StackDrive) -> Option<()> { + match self.jump_id { + 0 => self._0(drive), + 1 => self._1(drive), + 2 => self._2(drive), + _ => unreachable!(), + } + } +} +impl Default for OpMinRepeatOne { + fn default() -> Self { + OpMinRepeatOne { + jump_id: 0, + mincount: 0, + maxcount: 0, + count: 0, + child_ctx_id: 0, + } + } +} +impl OpMinRepeatOne { + fn _0(&mut self, drive: &mut StackDrive) -> Option<()> { + self.mincount = drive.peek_code(2) as usize; + self.maxcount = drive.peek_code(3) as usize; + + if drive.remaining_chars() < self.mincount { + drive.ctx_mut().has_matched = Some(false); + return None; + } + + drive.state.string_position = drive.ctx().string_position; + + self.count = if self.mincount == 0 { + 0 + } else { + let count = count(drive, self.mincount); + if count < self.mincount { + drive.ctx_mut().has_matched = Some(false); + return None; + } + drive.skip_char(count); + count + }; + + if drive.peek_code(drive.peek_code(1) as usize + 1) == SreOpcode::SUCCESS as u32 { + drive.state.string_position = drive.ctx().string_position; + drive.ctx_mut().has_matched = Some(true); + return None; + } + + drive.state.marks_push(); + self.jump_id = 1; + self._1(drive) + } + fn _1(&mut self, drive: &mut StackDrive) -> Option<()> { + if self.maxcount == MAXREPEAT || self.count <= self.maxcount { + drive.state.string_position = drive.ctx().string_position; + self.child_ctx_id = drive.push_new_context(drive.peek_code(1) as usize + 1); + self.jump_id = 2; + return Some(()); + } + + drive.state.marks_pop_discard(); + drive.ctx_mut().has_matched = Some(false); + None + } + fn _2(&mut self, drive: &mut StackDrive) -> Option<()> { + if let Some(true) = drive.state.context_stack[self.child_ctx_id].has_matched { + drive.ctx_mut().has_matched = Some(true); + return None; + } + drive.state.string_position = drive.ctx().string_position; + if count(drive, 1) == 0 { + drive.ctx_mut().has_matched = Some(false); + return None; + } + drive.skip_char(1); + self.count += 1; + drive.state.marks_pop_keep(); + self.jump_id = 1; + self._1(drive) + } +} From 78485fd8df79444283d3cb978654500da0ce0590 Mon Sep 17 00:00:00 2001 From: Kangzhi Shi Date: Sat, 26 Dec 2020 16:59:30 +0200 Subject: [PATCH 005/960] create _sre.Match --- interp.rs | 68 +++++++++++++++++++++++++++++++++++++------------------ 1 file changed, 46 insertions(+), 22 deletions(-) diff --git a/interp.rs b/interp.rs index 8e0968cbf5..71d89cc0e8 100644 --- a/interp.rs +++ b/interp.rs @@ -1,21 +1,24 @@ // good luck to those that follow; here be dragons -use super::_sre::MAXREPEAT; +use rustpython_common::borrow::BorrowValue; + +use super::_sre::{Match, Pattern, MAXREPEAT}; use super::constants::{SreAtCode, SreCatCode, SreFlag, SreOpcode}; +use crate::builtins::PyStrRef; use std::collections::HashMap; use std::convert::TryFrom; #[derive(Debug)] -pub struct State<'a> { +pub(crate) struct State<'a> { string: &'a str, // chars count string_len: usize, - start: usize, - end: usize, + pub start: usize, + pub end: usize, flags: SreFlag, - pattern_codes: Vec, + pattern_codes: &'a [u32], marks: Vec>, - lastindex: isize, + pub lastindex: isize, marks_stack: Vec<(Vec>, isize)>, context_stack: Vec, repeat: Option, @@ -28,7 +31,7 @@ impl<'a> State<'a> { start: usize, end: usize, flags: SreFlag, - pattern_codes: Vec, + pattern_codes: &'a [u32], ) -> Self { let string_len = string.chars().count(); let end = std::cmp::min(end, string_len); @@ -75,20 +78,36 @@ impl<'a> State<'a> { } } fn marks_push(&mut self) { - self.marks_stack.push(self.marks.clone(), self.lastindex); + self.marks_stack.push((self.marks.clone(), self.lastindex)); } fn marks_pop(&mut self) { - (self.marks, self.lastindex) = self.marks_stack.pop().unwrap(); + let (marks, lastindex) = self.marks_stack.pop().unwrap(); + self.marks = marks; + self.lastindex = lastindex; } fn marks_pop_keep(&mut self) { - (self.marks, self.lastindex) = self.marks_stack.last().unwrap(); + let (marks, lastindex) = self.marks_stack.last().unwrap().clone(); + self.marks = marks; + self.lastindex = lastindex; } fn marks_pop_discard(&mut self) { self.marks_stack.pop(); } } -pub(crate) fn pymatch(mut state: State) -> bool { +pub(crate) fn pymatch( + string: PyStrRef, + start: usize, + end: usize, + pattern: &Pattern, +) -> Option { + let mut state = State::new( + string.borrow_value(), + start, + end, + pattern.flags.clone(), + &pattern.code, + ); let ctx = MatchContext { string_position: state.start, string_offset: state @@ -117,7 +136,12 @@ pub(crate) fn pymatch(mut state: State) -> bool { state.context_stack.pop(); } } - has_matched.unwrap_or(false) + + if has_matched == None || has_matched == Some(false) { + return None; + } + + Some(Match::new(&state, pattern.pattern.clone(), string.clone())) } #[derive(Debug, Copy, Clone)] @@ -635,7 +659,7 @@ fn charset(set: &[u32], c: char) -> bool { } fn count(stack_drive: &StackDrive, maxcount: usize) -> usize { - let drive = WrapDrive::drive(stack_drive.ctx().clone(), stack_drive); + let mut drive = WrapDrive::drive(stack_drive.ctx().clone(), stack_drive); let maxcount = std::cmp::min(maxcount, drive.remaining_chars()); let opcode = match SreOpcode::try_from(drive.peek_code(1)) { Ok(code) => code, @@ -660,28 +684,28 @@ fn count(stack_drive: &StackDrive, maxcount: usize) -> usize { } } SreOpcode::LITERAL => { - general_count_literal(drive, |code, c| code == c as u32); + general_count_literal(&mut drive, |code, c| code == c as u32); } SreOpcode::NOT_LITERAL => { - general_count_literal(drive, |code, c| code != c as u32); + general_count_literal(&mut drive, |code, c| code != c as u32); } SreOpcode::LITERAL_IGNORE => { - general_count_literal(drive, |code, c| code == lower_ascii(c) as u32); + general_count_literal(&mut drive, |code, c| code == lower_ascii(c) as u32); } SreOpcode::NOT_LITERAL_IGNORE => { - general_count_literal(drive, |code, c| code != lower_ascii(c) as u32); + general_count_literal(&mut drive, |code, c| code != lower_ascii(c) as u32); } SreOpcode::LITERAL_LOC_IGNORE => { - general_count_literal(drive, |code, c| char_loc_ignore(code, c)); + general_count_literal(&mut drive, |code, c| char_loc_ignore(code, c)); } SreOpcode::NOT_LITERAL_LOC_IGNORE => { - general_count_literal(drive, |code, c| !char_loc_ignore(code, c)); + general_count_literal(&mut drive, |code, c| !char_loc_ignore(code, c)); } SreOpcode::LITERAL_UNI_IGNORE => { - general_count_literal(drive, |code, c| code == lower_unicode(c) as u32); + general_count_literal(&mut drive, |code, c| code == lower_unicode(c) as u32); } SreOpcode::NOT_LITERAL_UNI_IGNORE => { - general_count_literal(drive, |code, c| code != lower_unicode(c) as u32); + general_count_literal(&mut drive, |code, c| code != lower_unicode(c) as u32); } _ => { panic!("TODO: Not Implemented."); @@ -691,7 +715,7 @@ fn count(stack_drive: &StackDrive, maxcount: usize) -> usize { drive.ctx().string_position - stack_drive.ctx().string_position } -fn general_count_literal bool>(drive: &mut WrapDrive, f: F) { +fn general_count_literal bool>(drive: &mut WrapDrive, mut f: F) { let ch = drive.peek_code(1); while !drive.at_end() && f(ch, drive.peek_char()) { drive.skip_char(1); From aa0f20b93e86e6b63eadefb1f17958b2d513ccd3 Mon Sep 17 00:00:00 2001 From: Kangzhi Shi Date: Sat, 26 Dec 2020 18:44:33 +0200 Subject: [PATCH 006/960] impl Pattern.fullmatch, Pattern.search --- interp.rs | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/interp.rs b/interp.rs index 71d89cc0e8..f87cc85abc 100644 --- a/interp.rs +++ b/interp.rs @@ -1,10 +1,9 @@ // good luck to those that follow; here be dragons -use rustpython_common::borrow::BorrowValue; - use super::_sre::{Match, Pattern, MAXREPEAT}; use super::constants::{SreAtCode, SreCatCode, SreFlag, SreOpcode}; use crate::builtins::PyStrRef; +use rustpython_common::borrow::BorrowValue; use std::collections::HashMap; use std::convert::TryFrom; From 312e5b875677bbc57b085ffae98f230e56717d21 Mon Sep 17 00:00:00 2001 From: Kangzhi Shi Date: Sun, 27 Dec 2020 21:27:06 +0200 Subject: [PATCH 007/960] impl opcode groupref and assert_not --- interp.rs | 143 ++++++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 124 insertions(+), 19 deletions(-) diff --git a/interp.rs b/interp.rs index f87cc85abc..88dbb034da 100644 --- a/interp.rs +++ b/interp.rs @@ -109,12 +109,7 @@ pub(crate) fn pymatch( ); let ctx = MatchContext { string_position: state.start, - string_offset: state - .string - .char_indices() - .nth(state.start) - .map(|x| x.0) - .unwrap_or(0), + string_offset: calc_string_offset(state.string, state.start), code_position: 0, has_matched: None, }; @@ -411,7 +406,7 @@ impl OpcodeDispatcher { } }), SreOpcode::ASSERT => Box::new(OpAssert::default()), - SreOpcode::ASSERT_NOT => unimplemented(), + SreOpcode::ASSERT_NOT => Box::new(OpAssertNot::default()), SreOpcode::AT => once(|drive| { let atcode = SreAtCode::try_from(drive.peek_code(1)).unwrap(); if !at(drive, atcode) { @@ -432,9 +427,6 @@ impl OpcodeDispatcher { } }), SreOpcode::CHARSET | SreOpcode::BIGCHARSET => unreachable!("unexpected opcode"), - SreOpcode::GROUPREF => unimplemented(), - SreOpcode::GROUPREF_EXISTS => unimplemented(), - SreOpcode::GROUPREF_IGNORE => unimplemented(), SreOpcode::IN => once(|drive| { general_op_in(drive, |x| x); }), @@ -480,7 +472,12 @@ impl OpcodeDispatcher { SreOpcode::NOT_LITERAL_LOC_IGNORE => once(|drive| { general_op_literal(drive, |code, c| !char_loc_ignore(code, c)); }), - SreOpcode::MARK => unimplemented(), + SreOpcode::MARK => once(|drive| { + drive + .state + .set_mark(drive.peek_code(1) as usize, drive.ctx().string_position); + drive.skip_code(2); + }), SreOpcode::MAX_UNTIL => unimplemented(), SreOpcode::MIN_UNTIL => unimplemented(), SreOpcode::NEGATE => unimplemented(), @@ -489,13 +486,36 @@ impl OpcodeDispatcher { SreOpcode::REPEAT_ONE => unimplemented(), SreOpcode::SUBPATTERN => unimplemented(), SreOpcode::MIN_REPEAT_ONE => Box::new(OpMinRepeatOne::default()), - SreOpcode::GROUPREF_LOC_IGNORE => unimplemented(), - SreOpcode::GROUPREF_UNI_IGNORE => unimplemented(), + SreOpcode::GROUPREF => once(|drive| general_op_groupref(drive, |x| x)), + SreOpcode::GROUPREF_IGNORE => once(|drive| general_op_groupref(drive, lower_ascii)), + SreOpcode::GROUPREF_LOC_IGNORE => { + once(|drive| general_op_groupref(drive, lower_locate)) + } + SreOpcode::GROUPREF_UNI_IGNORE => { + once(|drive| general_op_groupref(drive, lower_unicode)) + } + SreOpcode::GROUPREF_EXISTS => once(|drive| { + let (group_start, group_end) = drive.state.get_marks(drive.peek_code(1) as usize); + match (group_start, group_end) { + (Some(start), Some(end)) if start <= end => { + drive.skip_code(3); + } + _ => drive.skip_code(drive.peek_code(2) as usize + 1), + } + }), SreOpcode::RANGE_UNI_IGNORE => unimplemented(), } } } +fn calc_string_offset(string: &str, position: usize) -> usize { + string + .char_indices() + .nth(position) + .map(|(i, _)| i) + .unwrap_or(0) +} + fn char_loc_ignore(code: u32, c: char) -> bool { code == c as u32 || code == lower_locate(c) as u32 || code == upper_locate(c) as u32 } @@ -509,6 +529,40 @@ fn charset_loc_ignore(set: &[u32], c: char) -> bool { up != lo && charset(set, up) } +fn general_op_groupref char>(drive: &mut StackDrive, mut f: F) { + let (group_start, group_end) = drive.state.get_marks(drive.peek_code(1) as usize); + let (group_start, group_end) = match (group_start, group_end) { + (Some(start), Some(end)) if start <= end => (start, end), + _ => { + drive.ctx_mut().has_matched = Some(false); + return; + } + }; + let mut wdrive = WrapDrive::drive(*drive.ctx(), &drive); + let mut gdrive = WrapDrive::drive( + MatchContext { + string_position: group_start, + // TODO: cache the offset + string_offset: calc_string_offset(drive.state.string, group_start), + ..*drive.ctx() + }, + &drive, + ); + for _ in group_start..group_end { + if wdrive.at_end() || f(wdrive.peek_char()) != f(gdrive.peek_char()) { + drive.ctx_mut().has_matched = Some(false); + return; + } + wdrive.skip_char(1); + gdrive.skip_char(1); + } + let position = wdrive.ctx().string_position; + let offset = wdrive.ctx().string_offset; + drive.skip_code(2); + drive.ctx_mut().string_position = position; + drive.ctx_mut().string_offset = offset; +} + fn general_op_literal bool>(drive: &mut StackDrive, f: F) { if drive.at_end() || !f(drive.peek_code(1), drive.peek_char()) { drive.ctx_mut().has_matched = Some(false); @@ -766,11 +820,19 @@ fn is_uni_space(c: char) -> bool { c.is_whitespace() } fn is_uni_linebreak(c: char) -> bool { - match c { - '\u{000A}' | '\u{000B}' | '\u{000C}' | '\u{000D}' | '\u{001C}' | '\u{001D}' - | '\u{001E}' | '\u{0085}' | '\u{2028}' | '\u{2029}' => true, - _ => false, - } + matches!( + c, + '\u{000A}' + | '\u{000B}' + | '\u{000C}' + | '\u{000D}' + | '\u{001C}' + | '\u{001D}' + | '\u{001E}' + | '\u{0085}' + | '\u{2028}' + | '\u{2029}' + ) } fn is_uni_alnum(c: char) -> bool { // TODO: check with cpython @@ -802,7 +864,7 @@ struct OpAssert { } impl Default for OpAssert { fn default() -> Self { - OpAssert { + Self { child_ctx_id: 0, jump_id: 0, } @@ -839,6 +901,49 @@ impl OpAssert { } } +struct OpAssertNot { + child_ctx_id: usize, + jump_id: usize, +} +impl Default for OpAssertNot { + fn default() -> Self { + Self { + child_ctx_id: 0, + jump_id: 0, + } + } +} +impl OpcodeExecutor for OpAssertNot { + fn next(&mut self, drive: &mut StackDrive) -> Option<()> { + match self.jump_id { + 0 => self._0(drive), + 1 => self._1(drive), + _ => unreachable!(), + } + } +} +impl OpAssertNot { + fn _0(&mut self, drive: &mut StackDrive) -> Option<()> { + let back = drive.peek_code(2) as usize; + if back > drive.ctx().string_position { + drive.skip_code(drive.peek_code(1) as usize + 1); + return None; + } + drive.state.string_position = drive.ctx().string_position - back; + self.child_ctx_id = drive.push_new_context(3); + self.jump_id = 1; + Some(()) + } + fn _1(&mut self, drive: &mut StackDrive) -> Option<()> { + if drive.state.context_stack[self.child_ctx_id].has_matched == Some(true) { + drive.ctx_mut().has_matched = Some(false); + } else { + drive.skip_code(drive.peek_code(1) as usize + 1); + } + None + } +} + struct OpMinRepeatOne { jump_id: usize, mincount: usize, From 0b2c8d1fa256a0b40223f97b2a5de6b2747c3397 Mon Sep 17 00:00:00 2001 From: Kangzhi Shi Date: Tue, 29 Dec 2020 10:33:28 +0200 Subject: [PATCH 008/960] impl OpMaxUntil --- interp.rs | 133 +++++++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 122 insertions(+), 11 deletions(-) diff --git a/interp.rs b/interp.rs index 88dbb034da..85787ae516 100644 --- a/interp.rs +++ b/interp.rs @@ -20,7 +20,7 @@ pub(crate) struct State<'a> { pub lastindex: isize, marks_stack: Vec<(Vec>, isize)>, context_stack: Vec, - repeat: Option, + repeat_stack: Vec, string_position: usize, } @@ -45,7 +45,7 @@ impl<'a> State<'a> { lastindex: -1, marks_stack: Vec::new(), context_stack: Vec::new(), - repeat: None, + repeat_stack: Vec::new(), marks: Vec::new(), string_position: start, } @@ -56,7 +56,7 @@ impl<'a> State<'a> { self.lastindex = -1; self.marks_stack.clear(); self.context_stack.clear(); - self.repeat = None; + self.repeat_stack.clear(); } fn set_mark(&mut self, mark_nr: usize, position: usize) { @@ -104,7 +104,7 @@ pub(crate) fn pymatch( string.borrow_value(), start, end, - pattern.flags.clone(), + pattern.flags, &pattern.code, ); let ctx = MatchContext { @@ -237,6 +237,7 @@ trait MatchContextDrive { ]), _ => unreachable!(), }; + // TODO: char::from_u32_unchecked is stable from 1.5.0 unsafe { std::mem::transmute(code) } } fn back_skip_char(&mut self, skip_count: usize) { @@ -426,7 +427,6 @@ impl OpcodeDispatcher { drive.skip_char(1); } }), - SreOpcode::CHARSET | SreOpcode::BIGCHARSET => unreachable!("unexpected opcode"), SreOpcode::IN => once(|drive| { general_op_in(drive, |x| x); }), @@ -467,7 +467,7 @@ impl OpcodeDispatcher { general_op_literal(drive, |code, c| code != lower_unicode(c) as u32); }), SreOpcode::LITERAL_LOC_IGNORE => once(|drive| { - general_op_literal(drive, |code, c| char_loc_ignore(code, c)); + general_op_literal(drive, char_loc_ignore); }), SreOpcode::NOT_LITERAL_LOC_IGNORE => once(|drive| { general_op_literal(drive, |code, c| !char_loc_ignore(code, c)); @@ -478,9 +478,8 @@ impl OpcodeDispatcher { .set_mark(drive.peek_code(1) as usize, drive.ctx().string_position); drive.skip_code(2); }), - SreOpcode::MAX_UNTIL => unimplemented(), + SreOpcode::MAX_UNTIL => Box::new(OpMaxUntil::default()), SreOpcode::MIN_UNTIL => unimplemented(), - SreOpcode::NEGATE => unimplemented(), SreOpcode::RANGE => unimplemented(), SreOpcode::REPEAT => unimplemented(), SreOpcode::REPEAT_ONE => unimplemented(), @@ -504,6 +503,10 @@ impl OpcodeDispatcher { } }), SreOpcode::RANGE_UNI_IGNORE => unimplemented(), + _ => { + // TODO + unreachable!("unexpected opcode") + } } } } @@ -661,7 +664,7 @@ fn charset(set: &[u32], c: char) -> bool { /* <256 blockindices> */ let count = set[i + 1]; if ch < 0x10000 { - let blockindices: &[u8] = unsafe { std::mem::transmute(&set[i + 2..]) }; + let (_, blockindices, _) = unsafe { set[i + 2..].align_to::() }; let block = blockindices[(ch >> 8) as usize]; if set[2 + 64 + ((block as u32 * 256 + (ch & 255)) / 32) as usize] & (1 << (ch & (32 - 1))) @@ -712,7 +715,7 @@ fn charset(set: &[u32], c: char) -> bool { } fn count(stack_drive: &StackDrive, maxcount: usize) -> usize { - let mut drive = WrapDrive::drive(stack_drive.ctx().clone(), stack_drive); + let mut drive = WrapDrive::drive(*stack_drive.ctx(), stack_drive); let maxcount = std::cmp::min(maxcount, drive.remaining_chars()); let opcode = match SreOpcode::try_from(drive.peek_code(1)) { Ok(code) => code, @@ -749,7 +752,7 @@ fn count(stack_drive: &StackDrive, maxcount: usize) -> usize { general_count_literal(&mut drive, |code, c| code != lower_ascii(c) as u32); } SreOpcode::LITERAL_LOC_IGNORE => { - general_count_literal(&mut drive, |code, c| char_loc_ignore(code, c)); + general_count_literal(&mut drive, char_loc_ignore); } SreOpcode::NOT_LITERAL_LOC_IGNORE => { general_count_literal(&mut drive, |code, c| !char_loc_ignore(code, c)); @@ -1035,3 +1038,111 @@ impl OpMinRepeatOne { self._1(drive) } } + +#[derive(Debug, Copy, Clone)] +struct RepeatContext { + skip: usize, + mincount: usize, + maxcount: usize, + count: isize, + last_position: isize, +} + +struct OpMaxUntil { + jump_id: usize, + count: isize, + save_last_position: isize, + child_ctx_id: usize, +} +impl Default for OpMaxUntil { + fn default() -> Self { + Self { + jump_id: 0, + count: 0, + save_last_position: -1, + child_ctx_id: 0, + } + } +} +impl OpcodeExecutor for OpMaxUntil { + fn next(&mut self, drive: &mut StackDrive) -> Option<()> { + match self.jump_id { + 0 => { + drive.state.string_position = drive.ctx().string_position; + let repeat = match drive.state.repeat_stack.last_mut() { + Some(repeat) => repeat, + None => { + todo!("Internal re error: MAX_UNTIL without REPEAT."); + } + }; + self.count = repeat.count + 1; + + if self.count < repeat.mincount as isize { + // not enough matches + repeat.count = self.count; + self.child_ctx_id = drive.push_new_context(4); + self.jump_id = 1; + return Some(()); + } + + if (self.count < repeat.maxcount as isize || repeat.maxcount == MAXREPEAT) + && (drive.state.string_position as isize != repeat.last_position) + { + // we may have enough matches, if we can match another item, do so + repeat.count = self.count; + self.save_last_position = repeat.last_position; + repeat.last_position = drive.state.string_position as isize; + drive.state.marks_push(); + self.child_ctx_id = drive.push_new_context(4); + self.jump_id = 2; + return Some(()); + } + + self.child_ctx_id = drive.push_new_context(1); + + self.jump_id = 3; + Some(()) + } + 1 => { + let child_ctx = &drive.state.context_stack[self.child_ctx_id]; + drive.ctx_mut().has_matched = child_ctx.has_matched; + if drive.ctx().has_matched != Some(true) { + drive.state.string_position = drive.ctx().string_position; + let repeat = drive.state.repeat_stack.last_mut().unwrap(); + repeat.count = self.count - 1; + } + None + } + 2 => { + let repeat = drive.state.repeat_stack.last_mut().unwrap(); + repeat.last_position = drive.state.string_position as isize; + let child_ctx = &drive.state.context_stack[self.child_ctx_id]; + if child_ctx.has_matched == Some(true) { + drive.state.marks_pop_discard(); + drive.ctx_mut().has_matched = Some(true); + return None; + } + repeat.count = self.count - 1; + drive.state.marks_pop(); + drive.state.string_position = drive.ctx().string_position; + + self.child_ctx_id = drive.push_new_context(1); + + self.jump_id = 3; + Some(()) + } + 3 => { + // cannot match more repeated items here. make sure the tail matches + let child_ctx = &drive.state.context_stack[self.child_ctx_id]; + drive.ctx_mut().has_matched = child_ctx.has_matched; + if drive.ctx().has_matched != Some(true) { + drive.state.string_position = drive.ctx().string_position; + } else { + drive.state.repeat_stack.pop(); + } + None + } + _ => unreachable!(), + } + } +} From 93c2b8b55513989982156136e6b92a1f07e02c24 Mon Sep 17 00:00:00 2001 From: Kangzhi Shi Date: Tue, 29 Dec 2020 13:02:43 +0200 Subject: [PATCH 009/960] impl OpBranch --- interp.rs | 49 ++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 48 insertions(+), 1 deletion(-) diff --git a/interp.rs b/interp.rs index 85787ae516..03cfdfa96a 100644 --- a/interp.rs +++ b/interp.rs @@ -417,7 +417,6 @@ impl OpcodeDispatcher { } }), SreOpcode::BRANCH => unimplemented(), - SreOpcode::CALL => unimplemented(), SreOpcode::CATEGORY => once(|drive| { let catcode = SreCatCode::try_from(drive.peek_code(1)).unwrap(); if drive.at_end() || !category(catcode, drive.peek_char()) { @@ -1146,3 +1145,51 @@ impl OpcodeExecutor for OpMaxUntil { } } } + +struct OpBranch { + jump_id: usize, + child_ctx_id: usize, + current_branch_length: usize, +} +impl Default for OpBranch { + fn default() -> Self { + Self { jump_id: 0, child_ctx_id: 0, current_branch_length: 0 } + } +} +impl OpcodeExecutor for OpBranch { + fn next(&mut self, drive: &mut StackDrive) -> Option<()> { + match self.jump_id { + 0 => { + drive.state.marks_push(); + // jump out the head + self.current_branch_length = 1; + self.jump_id = 1; + self.next(drive) + } + 1 => { + drive.skip_code(self.current_branch_length); + self.current_branch_length = drive.peek_code(0) as usize; + if self.current_branch_length == 0 { + drive.state.marks_pop_discard(); + drive.ctx_mut().has_matched = Some(false); + return None; + } + drive.state.string_position = drive.ctx().string_position; + self.child_ctx_id = drive.push_new_context(1); + self.jump_id = 2; + Some(()) + } + 2 => { + let child_ctx = &drive.state.context_stack[self.child_ctx_id]; + if child_ctx.has_matched == Some(true) { + drive.ctx_mut().has_matched = Some(true); + return None; + } + drive.state.marks_pop_keep(); + self.jump_id = 1; + Some(()) + } + _ => unreachable!() + } + } +} \ No newline at end of file From 5a4459856ca57886279f0cc4f1abc33c7cf4a397 Mon Sep 17 00:00:00 2001 From: Kangzhi Shi Date: Tue, 29 Dec 2020 13:56:37 +0200 Subject: [PATCH 010/960] impl OpRepeat --- interp.rs | 84 ++++++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 80 insertions(+), 4 deletions(-) diff --git a/interp.rs b/interp.rs index 03cfdfa96a..fbcbf260e5 100644 --- a/interp.rs +++ b/interp.rs @@ -416,7 +416,7 @@ impl OpcodeDispatcher { drive.skip_code(2); } }), - SreOpcode::BRANCH => unimplemented(), + SreOpcode::BRANCH => Box::new(OpBranch::default()), SreOpcode::CATEGORY => once(|drive| { let catcode = SreCatCode::try_from(drive.peek_code(1)).unwrap(); if drive.at_end() || !category(catcode, drive.peek_char()) { @@ -1146,6 +1146,39 @@ impl OpcodeExecutor for OpMaxUntil { } } +struct OpMinUntil { + jump_id: usize, + count: isize, + child_ctx_id: usize, +} +impl Default for OpMinUntil { + fn default() -> Self { + Self { + jump_id: 0, + count: 0, + child_ctx_id: 0, + } + } +} +impl OpcodeExecutor for OpMinUntil { + fn next(&mut self, drive: &mut StackDrive) -> Option<()> { + match self.jump_id { + 0 => { + drive.state.string_position = drive.ctx().string_position; + let repeat = match drive.state.repeat_stack.last_mut() { + Some(repeat) => repeat, + None => { + todo!("Internal re error: MAX_UNTIL without REPEAT."); + } + }; + self.count = repeat.count + 1; + None + } + _ => unreachable!(), + } + } +} + struct OpBranch { jump_id: usize, child_ctx_id: usize, @@ -1153,7 +1186,11 @@ struct OpBranch { } impl Default for OpBranch { fn default() -> Self { - Self { jump_id: 0, child_ctx_id: 0, current_branch_length: 0 } + Self { + jump_id: 0, + child_ctx_id: 0, + current_branch_length: 0, + } } } impl OpcodeExecutor for OpBranch { @@ -1189,7 +1226,46 @@ impl OpcodeExecutor for OpBranch { self.jump_id = 1; Some(()) } - _ => unreachable!() + _ => unreachable!(), + } + } +} + +struct OpRepeat { + jump_id: usize, + child_ctx_id: usize, +} +impl Default for OpRepeat { + fn default() -> Self { + Self { + jump_id: 0, + child_ctx_id: 0, + } + } +} +impl OpcodeExecutor for OpRepeat { + fn next(&mut self, drive: &mut StackDrive) -> Option<()> { + match self.jump_id { + 0 => { + let repeat = RepeatContext { + skip: drive.peek_code(1), + mincount: drive.peek_code(2), + maxcount: drive.peek_code(3), + count: -1, + last_position: -1, + }; + drive.state.repeat_stack.push(repeat); + drive.state.string_position = drive.ctx().string_position; + self.child_ctx_id = drive.push_new_context(drive.peek_code(1) as usize + 1); + self.jump_id = 1; + Some(()) + } + 1 => { + let child_ctx = &drive.state.context_stack[self.child_ctx_id]; + drive.ctx_mut().has_matched = child_ctx.has_matched; + None + } + _ => unreachable!(), } } -} \ No newline at end of file +} From af7901dcb21cba20bcfa8635b3c86d84574a988e Mon Sep 17 00:00:00 2001 From: Kangzhi Shi Date: Tue, 29 Dec 2020 16:48:30 +0200 Subject: [PATCH 011/960] impl OpMinUntil --- interp.rs | 59 ++++++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 50 insertions(+), 9 deletions(-) diff --git a/interp.rs b/interp.rs index fbcbf260e5..ec6d41eade 100644 --- a/interp.rs +++ b/interp.rs @@ -478,11 +478,9 @@ impl OpcodeDispatcher { drive.skip_code(2); }), SreOpcode::MAX_UNTIL => Box::new(OpMaxUntil::default()), - SreOpcode::MIN_UNTIL => unimplemented(), - SreOpcode::RANGE => unimplemented(), - SreOpcode::REPEAT => unimplemented(), + SreOpcode::MIN_UNTIL => Box::new(OpMinUntil::default()), + SreOpcode::REPEAT => Box::new(OpRepeat::default()), SreOpcode::REPEAT_ONE => unimplemented(), - SreOpcode::SUBPATTERN => unimplemented(), SreOpcode::MIN_REPEAT_ONE => Box::new(OpMinRepeatOne::default()), SreOpcode::GROUPREF => once(|drive| general_op_groupref(drive, |x| x)), SreOpcode::GROUPREF_IGNORE => once(|drive| general_op_groupref(drive, lower_ascii)), @@ -501,9 +499,8 @@ impl OpcodeDispatcher { _ => drive.skip_code(drive.peek_code(2) as usize + 1), } }), - SreOpcode::RANGE_UNI_IGNORE => unimplemented(), _ => { - // TODO + // TODO error expcetion unreachable!("unexpected opcode") } } @@ -1172,8 +1169,52 @@ impl OpcodeExecutor for OpMinUntil { } }; self.count = repeat.count + 1; + + if self.count < repeat.mincount as isize { + // not enough matches + repeat.count = self.count; + self.child_ctx_id = drive.push_new_context(4); + self.jump_id = 1; + return Some(()); + } + + // see if the tail matches + drive.state.marks_push(); + self.child_ctx_id = drive.push_new_context(1); + self.jump_id = 2; + Some(()) + } + 1 => { + let child_ctx = &drive.state.context_stack[self.child_ctx_id]; + drive.ctx_mut().has_matched = child_ctx.has_matched; + if drive.ctx().has_matched != Some(true) { + drive.state.string_position = drive.ctx().string_position; + let repeat = drive.state.repeat_stack.last_mut().unwrap(); + repeat.count = self.count - 1; + } None } + 2 => { + let child_ctx = &drive.state.context_stack[self.child_ctx_id]; + if child_ctx.has_matched == Some(true) { + drive.state.repeat_stack.pop(); + drive.ctx_mut().has_matched = Some(true); + return None; + } + drive.state.string_position = drive.ctx().string_position; + drive.state.marks_pop(); + + // match more until tail matches + let repeat = drive.state.repeat_stack.last_mut().unwrap(); + if self.count >= repeat.maxcount as isize && repeat.maxcount != MAXREPEAT { + drive.ctx_mut().has_matched = Some(false); + return None; + } + repeat.count = self.count; + self.child_ctx_id = drive.push_new_context(4); + self.jump_id = 1; + Some(()) + } _ => unreachable!(), } } @@ -1248,9 +1289,9 @@ impl OpcodeExecutor for OpRepeat { match self.jump_id { 0 => { let repeat = RepeatContext { - skip: drive.peek_code(1), - mincount: drive.peek_code(2), - maxcount: drive.peek_code(3), + skip: drive.peek_code(1) as usize, + mincount: drive.peek_code(2) as usize, + maxcount: drive.peek_code(3) as usize, count: -1, last_position: -1, }; From fa2adaf2ff9bf8acf642ad210480e686848d4061 Mon Sep 17 00:00:00 2001 From: Kangzhi Shi Date: Tue, 29 Dec 2020 17:53:14 +0200 Subject: [PATCH 012/960] Impl OpRepeatONe --- interp.rs | 132 ++++++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 103 insertions(+), 29 deletions(-) diff --git a/interp.rs b/interp.rs index ec6d41eade..64d70216e3 100644 --- a/interp.rs +++ b/interp.rs @@ -313,14 +313,6 @@ trait OpcodeExecutor { fn next(&mut self, drive: &mut StackDrive) -> Option<()>; } -struct OpUnimplemented {} -impl OpcodeExecutor for OpUnimplemented { - fn next(&mut self, drive: &mut StackDrive) -> Option<()> { - drive.ctx_mut().has_matched = Some(false); - None - } -} - struct OpOnce { f: Option, } @@ -335,10 +327,6 @@ fn once(f: F) -> Box> { Box::new(OpOnce { f: Some(f) }) } -fn unimplemented() -> Box { - Box::new(OpUnimplemented {}) -} - struct OpcodeDispatcher { executing_contexts: HashMap>, } @@ -480,7 +468,7 @@ impl OpcodeDispatcher { SreOpcode::MAX_UNTIL => Box::new(OpMaxUntil::default()), SreOpcode::MIN_UNTIL => Box::new(OpMinUntil::default()), SreOpcode::REPEAT => Box::new(OpRepeat::default()), - SreOpcode::REPEAT_ONE => unimplemented(), + SreOpcode::REPEAT_ONE => Box::new(OpMinRepeatOne::default()), SreOpcode::MIN_REPEAT_ONE => Box::new(OpMinRepeatOne::default()), SreOpcode::GROUPREF => once(|drive| general_op_groupref(drive, |x| x)), SreOpcode::GROUPREF_IGNORE => once(|drive| general_op_groupref(drive, lower_ascii)), @@ -500,7 +488,7 @@ impl OpcodeDispatcher { } }), _ => { - // TODO error expcetion + // TODO python expcetion? unreachable!("unexpected opcode") } } @@ -713,6 +701,7 @@ fn charset(set: &[u32], c: char) -> bool { fn count(stack_drive: &StackDrive, maxcount: usize) -> usize { let mut drive = WrapDrive::drive(*stack_drive.ctx(), stack_drive); let maxcount = std::cmp::min(maxcount, drive.remaining_chars()); + let end = drive.ctx().string_position + maxcount; let opcode = match SreOpcode::try_from(drive.peek_code(1)) { Ok(code) => code, Err(_) => { @@ -722,54 +711,56 @@ fn count(stack_drive: &StackDrive, maxcount: usize) -> usize { match opcode { SreOpcode::ANY => { - while !drive.at_end() && !drive.at_linebreak() { + while !drive.ctx().string_position < end && !drive.at_linebreak() { drive.skip_char(1); } } SreOpcode::ANY_ALL => { - drive.skip_char(drive.remaining_chars()); + drive.skip_char(maxcount); } SreOpcode::IN => { // TODO: pattern[2 or 1..]? - while !drive.at_end() && charset(&drive.pattern()[2..], drive.peek_char()) { + while !drive.ctx().string_position < end + && charset(&drive.pattern()[2..], drive.peek_char()) + { drive.skip_char(1); } } SreOpcode::LITERAL => { - general_count_literal(&mut drive, |code, c| code == c as u32); + general_count_literal(&mut drive, end, |code, c| code == c as u32); } SreOpcode::NOT_LITERAL => { - general_count_literal(&mut drive, |code, c| code != c as u32); + general_count_literal(&mut drive, end, |code, c| code != c as u32); } SreOpcode::LITERAL_IGNORE => { - general_count_literal(&mut drive, |code, c| code == lower_ascii(c) as u32); + general_count_literal(&mut drive, end, |code, c| code == lower_ascii(c) as u32); } SreOpcode::NOT_LITERAL_IGNORE => { - general_count_literal(&mut drive, |code, c| code != lower_ascii(c) as u32); + general_count_literal(&mut drive, end, |code, c| code != lower_ascii(c) as u32); } SreOpcode::LITERAL_LOC_IGNORE => { - general_count_literal(&mut drive, char_loc_ignore); + general_count_literal(&mut drive, end, char_loc_ignore); } SreOpcode::NOT_LITERAL_LOC_IGNORE => { - general_count_literal(&mut drive, |code, c| !char_loc_ignore(code, c)); + general_count_literal(&mut drive, end, |code, c| !char_loc_ignore(code, c)); } SreOpcode::LITERAL_UNI_IGNORE => { - general_count_literal(&mut drive, |code, c| code == lower_unicode(c) as u32); + general_count_literal(&mut drive, end, |code, c| code == lower_unicode(c) as u32); } SreOpcode::NOT_LITERAL_UNI_IGNORE => { - general_count_literal(&mut drive, |code, c| code != lower_unicode(c) as u32); + general_count_literal(&mut drive, end, |code, c| code != lower_unicode(c) as u32); } _ => { - panic!("TODO: Not Implemented."); + todo!("repeated single character pattern?"); } } - drive.ctx().string_position - stack_drive.ctx().string_position + drive.ctx().string_position - drive.state().string_position } -fn general_count_literal bool>(drive: &mut WrapDrive, mut f: F) { +fn general_count_literal bool>(drive: &mut WrapDrive, end: usize, mut f: F) { let ch = drive.peek_code(1); - while !drive.at_end() && f(ch, drive.peek_char()) { + while !drive.ctx().string_position < end && f(ch, drive.peek_char()) { drive.skip_char(1); } } @@ -1310,3 +1301,86 @@ impl OpcodeExecutor for OpRepeat { } } } + +struct OpRepeatOne { + jump_id: usize, + child_ctx_id: usize, + mincount: usize, + maxcount: usize, + count: usize, +} +impl Default for OpRepeatOne { + fn default() -> Self { + Self { + jump_id: 0, + child_ctx_id: 0, + mincount: 0, + maxcount: 0, + count: 0, + } + } +} +impl OpcodeExecutor for OpRepeatOne { + fn next(&mut self, drive: &mut StackDrive) -> Option<()> { + match self.jump_id { + 0 => { + self.mincount = drive.peek_code(2) as usize; + self.maxcount = drive.peek_code(3) as usize; + + if drive.remaining_chars() < self.mincount { + drive.ctx_mut().has_matched = Some(false); + return None; + } + drive.state.string_position = drive.ctx().string_position; + self.count = count(drive, self.maxcount); + drive.skip_char(self.count); + if self.count < self.mincount { + drive.ctx_mut().has_matched = Some(false); + return None; + } + + let next_code = drive.peek_code(drive.peek_code(1) as usize + 1); + if next_code == SreOpcode::SUCCESS as u32 { + // tail is empty. we're finished + drive.state.string_position = drive.ctx().string_position; + drive.ctx_mut().has_matched = Some(true); + return None; + } + + drive.state.marks_push(); + // TODO: + // Special case: Tail starts with a literal. Skip positions where + // the rest of the pattern cannot possibly match. + self.jump_id = 1; + self.next(drive) + } + 1 => { + // General case: backtracking + if self.count >= self.mincount { + drive.state.string_position = drive.ctx().string_position; + self.child_ctx_id = drive.push_new_context(drive.peek_code(1) as usize + 1); + self.jump_id = 2; + return Some(()); + } + + drive.state.marks_pop_discard(); + drive.ctx_mut().has_matched = Some(false); + None + } + 2 => { + let child_ctx = &drive.state.context_stack[self.child_ctx_id]; + if child_ctx.has_matched == Some(true) { + drive.ctx_mut().has_matched = Some(true); + return None; + } + drive.back_skip_char(1); + self.count -= 1; + drive.state.marks_pop_keep(); + + self.jump_id = 1; + Some(()) + } + _ => unreachable!(), + } + } +} From ae44580371afb3e347f31647368c2a7a8b1a1578 Mon Sep 17 00:00:00 2001 From: Kangzhi Shi Date: Wed, 30 Dec 2020 14:14:13 +0200 Subject: [PATCH 013/960] general case for count --- interp.rs | 32 +++++++++++++++++++++++++++----- 1 file changed, 27 insertions(+), 5 deletions(-) diff --git a/interp.rs b/interp.rs index 64d70216e3..99574443c0 100644 --- a/interp.rs +++ b/interp.rs @@ -3,6 +3,7 @@ use super::_sre::{Match, Pattern, MAXREPEAT}; use super::constants::{SreAtCode, SreCatCode, SreFlag, SreOpcode}; use crate::builtins::PyStrRef; +use crate::pyobject::PyRef; use rustpython_common::borrow::BorrowValue; use std::collections::HashMap; use std::convert::TryFrom; @@ -98,7 +99,7 @@ pub(crate) fn pymatch( string: PyStrRef, start: usize, end: usize, - pattern: &Pattern, + pattern: PyRef, ) -> Option { let mut state = State::new( string.borrow_value(), @@ -135,7 +136,7 @@ pub(crate) fn pymatch( return None; } - Some(Match::new(&state, pattern.pattern.clone(), string.clone())) + Some(Match::new(&state, pattern.clone().into_object(), string.clone())) } #[derive(Debug, Copy, Clone)] @@ -425,7 +426,7 @@ impl OpcodeDispatcher { }), SreOpcode::IN_LOC_IGNORE => once(|drive| { let skip = drive.peek_code(1) as usize; - if drive.at_end() || !charset_loc_ignore(&drive.pattern()[1..], drive.peek_char()) { + if drive.at_end() || !charset_loc_ignore(&drive.pattern()[2..], drive.peek_char()) { drive.ctx_mut().has_matched = Some(false); } else { drive.skip_code(skip + 1); @@ -561,7 +562,7 @@ fn general_op_literal bool>(drive: &mut StackDrive, f: F fn general_op_in char>(drive: &mut StackDrive, f: F) { let skip = drive.peek_code(1) as usize; - if drive.at_end() || !charset(&drive.pattern()[1..], f(drive.peek_char())) { + if drive.at_end() || !charset(&drive.pattern()[2..], f(drive.peek_char())) { drive.ctx_mut().has_matched = Some(false); } else { drive.skip_code(skip + 1); @@ -698,7 +699,28 @@ fn charset(set: &[u32], c: char) -> bool { false } -fn count(stack_drive: &StackDrive, maxcount: usize) -> usize { +fn count(drive: &mut StackDrive, maxcount: usize) -> usize { + let mut count = 0; + let maxcount = std::cmp::min(maxcount, drive.remaining_chars()); + + let save_ctx = *drive.ctx(); + drive.skip_code(4); + let reset_position = drive.ctx().code_position; + + let mut dispatcher = OpcodeDispatcher::new(); + while count < maxcount { + drive.ctx_mut().code_position = reset_position; + dispatcher.dispatch(SreOpcode::try_from(drive.peek_code(0)).unwrap(), drive); + if drive.ctx().has_matched == Some(false) { + break; + } + count += 1; + } + *drive.ctx_mut() = save_ctx; + count +} + +fn _count(stack_drive: &StackDrive, maxcount: usize) -> usize { let mut drive = WrapDrive::drive(*stack_drive.ctx(), stack_drive); let maxcount = std::cmp::min(maxcount, drive.remaining_chars()); let end = drive.ctx().string_position + maxcount; From f7287553e9ed42df638190b56ce4474eb7783c38 Mon Sep 17 00:00:00 2001 From: Kangzhi Shi Date: Wed, 30 Dec 2020 18:04:00 +0200 Subject: [PATCH 014/960] impl re.Match object --- interp.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/interp.rs b/interp.rs index 99574443c0..ac0eadb4ae 100644 --- a/interp.rs +++ b/interp.rs @@ -17,12 +17,12 @@ pub(crate) struct State<'a> { pub end: usize, flags: SreFlag, pattern_codes: &'a [u32], - marks: Vec>, + pub marks: Vec>, pub lastindex: isize, marks_stack: Vec<(Vec>, isize)>, context_stack: Vec, repeat_stack: Vec, - string_position: usize, + pub string_position: usize, } impl<'a> State<'a> { @@ -136,7 +136,7 @@ pub(crate) fn pymatch( return None; } - Some(Match::new(&state, pattern.clone().into_object(), string.clone())) + Some(Match::new(&state, pattern.clone(), string.clone())) } #[derive(Debug, Copy, Clone)] From 04bb80f157128d59e2a5e25261311ee1be8c143e Mon Sep 17 00:00:00 2001 From: Kangzhi Shi Date: Thu, 31 Dec 2020 11:22:17 +0200 Subject: [PATCH 015/960] impl Match.group --- interp.rs | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/interp.rs b/interp.rs index ac0eadb4ae..f9b0898a31 100644 --- a/interp.rs +++ b/interp.rs @@ -115,6 +115,7 @@ pub(crate) fn pymatch( has_matched: None, }; state.context_stack.push(ctx); + let mut dispatcher = OpcodeDispatcher::new(); let mut has_matched = None; loop { @@ -123,7 +124,6 @@ pub(crate) fn pymatch( } let ctx_id = state.context_stack.len() - 1; let mut drive = StackDrive::drive(ctx_id, state); - let mut dispatcher = OpcodeDispatcher::new(); has_matched = dispatcher.pymatch(&mut drive); state = drive.take(); @@ -132,11 +132,11 @@ pub(crate) fn pymatch( } } - if has_matched == None || has_matched == Some(false) { - return None; + if has_matched != Some(true) { + None + } else { + Some(Match::new(&state, pattern.clone(), string.clone())) } - - Some(Match::new(&state, pattern.clone(), string.clone())) } #[derive(Debug, Copy, Clone)] @@ -344,7 +344,9 @@ impl OpcodeDispatcher { while drive.remaining_codes() > 0 && drive.ctx().has_matched.is_none() { let code = drive.peek_code(0); let opcode = SreOpcode::try_from(code).unwrap(); - self.dispatch(opcode, drive); + if !self.dispatch(opcode, drive) { + return None; + } } match drive.ctx().has_matched { Some(matched) => Some(matched), @@ -469,7 +471,7 @@ impl OpcodeDispatcher { SreOpcode::MAX_UNTIL => Box::new(OpMaxUntil::default()), SreOpcode::MIN_UNTIL => Box::new(OpMinUntil::default()), SreOpcode::REPEAT => Box::new(OpRepeat::default()), - SreOpcode::REPEAT_ONE => Box::new(OpMinRepeatOne::default()), + SreOpcode::REPEAT_ONE => Box::new(OpRepeatOne::default()), SreOpcode::MIN_REPEAT_ONE => Box::new(OpMinRepeatOne::default()), SreOpcode::GROUPREF => once(|drive| general_op_groupref(drive, |x| x)), SreOpcode::GROUPREF_IGNORE => once(|drive| general_op_groupref(drive, lower_ascii)), @@ -1329,7 +1331,7 @@ struct OpRepeatOne { child_ctx_id: usize, mincount: usize, maxcount: usize, - count: usize, + count: isize, } impl Default for OpRepeatOne { fn default() -> Self { @@ -1354,9 +1356,9 @@ impl OpcodeExecutor for OpRepeatOne { return None; } drive.state.string_position = drive.ctx().string_position; - self.count = count(drive, self.maxcount); - drive.skip_char(self.count); - if self.count < self.mincount { + self.count = count(drive, self.maxcount) as isize; + drive.skip_char(self.count as usize); + if self.count < self.mincount as isize { drive.ctx_mut().has_matched = Some(false); return None; } @@ -1378,7 +1380,7 @@ impl OpcodeExecutor for OpRepeatOne { } 1 => { // General case: backtracking - if self.count >= self.mincount { + if self.count >= self.mincount as isize { drive.state.string_position = drive.ctx().string_position; self.child_ctx_id = drive.push_new_context(drive.peek_code(1) as usize + 1); self.jump_id = 2; From 8c442f599bb67f4a9878cad9e4df180ab52b39b7 Mon Sep 17 00:00:00 2001 From: Kangzhi Shi Date: Fri, 1 Jan 2021 10:30:05 +0200 Subject: [PATCH 016/960] rework OpMaxUntil; restruct popping context; add tests; --- interp.rs | 660 ++++++++++++++++++++++++++++++------------------------ 1 file changed, 364 insertions(+), 296 deletions(-) diff --git a/interp.rs b/interp.rs index f9b0898a31..cb06ee0b8a 100644 --- a/interp.rs +++ b/interp.rs @@ -23,6 +23,7 @@ pub(crate) struct State<'a> { context_stack: Vec, repeat_stack: Vec, pub string_position: usize, + popped_context: Option, } impl<'a> State<'a> { @@ -49,6 +50,7 @@ impl<'a> State<'a> { repeat_stack: Vec::new(), marks: Vec::new(), string_position: start, + popped_context: None, } } @@ -58,6 +60,7 @@ impl<'a> State<'a> { self.marks_stack.clear(); self.context_stack.clear(); self.repeat_stack.clear(); + self.popped_context = None; } fn set_mark(&mut self, mark_nr: usize, position: usize) { @@ -128,7 +131,7 @@ pub(crate) fn pymatch( has_matched = dispatcher.pymatch(&mut drive); state = drive.take(); if has_matched.is_some() { - state.context_stack.pop(); + state.popped_context = state.context_stack.pop(); } } @@ -151,6 +154,9 @@ trait MatchContextDrive { fn ctx_mut(&mut self) -> &mut MatchContext; fn ctx(&self) -> &MatchContext; fn state(&self) -> &State; + fn repeat_ctx(&self) -> &RepeatContext { + self.state().repeat_stack.last().unwrap() + } fn str(&self) -> &str { unsafe { std::str::from_utf8_unchecked( @@ -181,6 +187,9 @@ trait MatchContextDrive { } fn skip_code(&mut self, skip_count: usize) { self.ctx_mut().code_position += skip_count; + if self.ctx().code_position > self.state().pattern_codes.len() { + self.ctx_mut().code_position = self.state().pattern_codes.len(); + } } fn remaining_chars(&self) -> usize { self.state().end - self.ctx().string_position @@ -263,16 +272,17 @@ impl<'a> StackDrive<'a> { fn take(self) -> State<'a> { self.state } - fn push_new_context(&mut self, pattern_offset: usize) -> usize { + fn push_new_context(&mut self, pattern_offset: usize) { let ctx = self.ctx(); - let child_ctx = MatchContext { - string_position: ctx.string_position, - string_offset: ctx.string_offset, - code_position: ctx.code_position + pattern_offset, - has_matched: None, - }; + let mut child_ctx = MatchContext { ..*ctx }; + child_ctx.code_position += pattern_offset; + if child_ctx.code_position > self.state.pattern_codes.len() { + child_ctx.code_position = self.state.pattern_codes.len(); + } self.state.context_stack.push(child_ctx); - self.state.context_stack.len() - 1 + } + fn repeat_ctx_mut(&mut self) -> &mut RepeatContext { + self.state.repeat_stack.last_mut().unwrap() } } impl MatchContextDrive for StackDrive<'_> { @@ -328,6 +338,39 @@ fn once(f: F) -> Box> { Box::new(OpOnce { f: Some(f) }) } +// F1 F2 are same identical, but workaround for closure +struct OpTwice { + f1: Option, + f2: Option, +} +impl OpcodeExecutor for OpTwice +where + F1: FnOnce(&mut StackDrive), + F2: FnOnce(&mut StackDrive), +{ + fn next(&mut self, drive: &mut StackDrive) -> Option<()> { + if let Some(f1) = self.f1.take() { + f1(drive); + Some(()) + } else if let Some(f2) = self.f2.take() { + f2(drive); + None + } else { + unreachable!() + } + } +} +fn twice(f1: F1, f2: F2) -> Box> +where + F1: FnOnce(&mut StackDrive), + F2: FnOnce(&mut StackDrive), +{ + Box::new(OpTwice { + f1: Some(f1), + f2: Some(f2), + }) +} + struct OpcodeDispatcher { executing_contexts: HashMap>, } @@ -397,8 +440,44 @@ impl OpcodeDispatcher { drive.skip_char(1); } }), - SreOpcode::ASSERT => Box::new(OpAssert::default()), - SreOpcode::ASSERT_NOT => Box::new(OpAssertNot::default()), + SreOpcode::ASSERT => twice( + |drive| { + let back = drive.peek_code(2) as usize; + if back > drive.ctx().string_position { + drive.ctx_mut().has_matched = Some(false); + return; + } + drive.state.string_position = drive.ctx().string_position - back; + drive.push_new_context(3); + }, + |drive| { + let child_ctx = drive.state.popped_context.unwrap(); + if child_ctx.has_matched == Some(true) { + drive.skip_code(drive.peek_code(1) as usize + 1); + } else { + drive.ctx_mut().has_matched = Some(false); + } + }, + ), + SreOpcode::ASSERT_NOT => twice( + |drive| { + let back = drive.peek_code(2) as usize; + if back > drive.ctx().string_position { + drive.skip_code(drive.peek_code(1) as usize + 1); + return; + } + drive.state.string_position = drive.ctx().string_position - back; + drive.push_new_context(3); + }, + |drive| { + let child_ctx = drive.state.popped_context.unwrap(); + if child_ctx.has_matched == Some(true) { + drive.ctx_mut().has_matched = Some(false); + } else { + drive.skip_code(drive.peek_code(1) as usize + 1); + } + }, + ), SreOpcode::AT => once(|drive| { let atcode = SreAtCode::try_from(drive.peek_code(1)).unwrap(); if !at(drive, atcode) { @@ -468,9 +547,29 @@ impl OpcodeDispatcher { .set_mark(drive.peek_code(1) as usize, drive.ctx().string_position); drive.skip_code(2); }), + SreOpcode::REPEAT => twice( + // create repeat context. all the hard work is done by the UNTIL + // operator (MAX_UNTIL, MIN_UNTIL) + // <1=min> <2=max> item tail + |drive| { + let repeat = RepeatContext { + count: -1, + code_position: drive.ctx().code_position, + last_position: std::usize::MAX, + }; + drive.state.repeat_stack.push(repeat); + drive.state.string_position = drive.ctx().string_position; + // execute UNTIL operator + drive.push_new_context(drive.peek_code(1) as usize + 1); + }, + |drive| { + drive.state.repeat_stack.pop(); + let child_ctx = drive.state.popped_context.unwrap(); + drive.ctx_mut().has_matched = child_ctx.has_matched; + }, + ), SreOpcode::MAX_UNTIL => Box::new(OpMaxUntil::default()), - SreOpcode::MIN_UNTIL => Box::new(OpMinUntil::default()), - SreOpcode::REPEAT => Box::new(OpRepeat::default()), + SreOpcode::MIN_UNTIL => todo!("min until"), SreOpcode::REPEAT_ONE => Box::new(OpRepeatOne::default()), SreOpcode::MIN_REPEAT_ONE => Box::new(OpMinRepeatOne::default()), SreOpcode::GROUPREF => once(|drive| general_op_groupref(drive, |x| x)), @@ -872,90 +971,12 @@ fn is_utf8_first_byte(b: u8) -> bool { (b & 0b10000000 == 0) || (b & 0b11000000 == 0b11000000) } -struct OpAssert { - child_ctx_id: usize, - jump_id: usize, -} -impl Default for OpAssert { - fn default() -> Self { - Self { - child_ctx_id: 0, - jump_id: 0, - } - } -} -impl OpcodeExecutor for OpAssert { - fn next(&mut self, drive: &mut StackDrive) -> Option<()> { - match self.jump_id { - 0 => self._0(drive), - 1 => self._1(drive), - _ => unreachable!(), - } - } -} -impl OpAssert { - fn _0(&mut self, drive: &mut StackDrive) -> Option<()> { - let back = drive.peek_code(2) as usize; - if back > drive.ctx().string_position { - drive.ctx_mut().has_matched = Some(false); - return None; - } - drive.state.string_position = drive.ctx().string_position - back; - self.child_ctx_id = drive.push_new_context(3); - self.jump_id = 1; - Some(()) - } - fn _1(&mut self, drive: &mut StackDrive) -> Option<()> { - if drive.state.context_stack[self.child_ctx_id].has_matched == Some(true) { - drive.skip_code(drive.peek_code(1) as usize + 1); - } else { - drive.ctx_mut().has_matched = Some(false); - } - None - } -} - -struct OpAssertNot { - child_ctx_id: usize, - jump_id: usize, -} -impl Default for OpAssertNot { - fn default() -> Self { - Self { - child_ctx_id: 0, - jump_id: 0, - } - } -} -impl OpcodeExecutor for OpAssertNot { - fn next(&mut self, drive: &mut StackDrive) -> Option<()> { - match self.jump_id { - 0 => self._0(drive), - 1 => self._1(drive), - _ => unreachable!(), - } - } -} -impl OpAssertNot { - fn _0(&mut self, drive: &mut StackDrive) -> Option<()> { - let back = drive.peek_code(2) as usize; - if back > drive.ctx().string_position { - drive.skip_code(drive.peek_code(1) as usize + 1); - return None; - } - drive.state.string_position = drive.ctx().string_position - back; - self.child_ctx_id = drive.push_new_context(3); - self.jump_id = 1; - Some(()) - } - fn _1(&mut self, drive: &mut StackDrive) -> Option<()> { - if drive.state.context_stack[self.child_ctx_id].has_matched == Some(true) { - drive.ctx_mut().has_matched = Some(false); - } else { - drive.skip_code(drive.peek_code(1) as usize + 1); - } - None - } +#[derive(Debug, Copy, Clone)] +struct RepeatContext { + count: isize, + code_position: usize, + // zero-width match protection + last_position: usize, } struct OpMinRepeatOne { @@ -963,7 +984,6 @@ struct OpMinRepeatOne { mincount: usize, maxcount: usize, count: usize, - child_ctx_id: usize, } impl OpcodeExecutor for OpMinRepeatOne { fn next(&mut self, drive: &mut StackDrive) -> Option<()> { @@ -982,7 +1002,6 @@ impl Default for OpMinRepeatOne { mincount: 0, maxcount: 0, count: 0, - child_ctx_id: 0, } } } @@ -1023,7 +1042,7 @@ impl OpMinRepeatOne { fn _1(&mut self, drive: &mut StackDrive) -> Option<()> { if self.maxcount == MAXREPEAT || self.count <= self.maxcount { drive.state.string_position = drive.ctx().string_position; - self.child_ctx_id = drive.push_new_context(drive.peek_code(1) as usize + 1); + drive.push_new_context(drive.peek_code(1) as usize + 1); self.jump_id = 2; return Some(()); } @@ -1033,7 +1052,8 @@ impl OpMinRepeatOne { None } fn _2(&mut self, drive: &mut StackDrive) -> Option<()> { - if let Some(true) = drive.state.context_stack[self.child_ctx_id].has_matched { + let child_ctx = drive.state.popped_context.unwrap(); + if child_ctx.has_matched == Some(true) { drive.ctx_mut().has_matched = Some(true); return None; } @@ -1050,201 +1070,290 @@ impl OpMinRepeatOne { } } -#[derive(Debug, Copy, Clone)] -struct RepeatContext { - skip: usize, - mincount: usize, - maxcount: usize, - count: isize, - last_position: isize, -} - +// Everything is stored in RepeatContext struct OpMaxUntil { jump_id: usize, count: isize, - save_last_position: isize, - child_ctx_id: usize, + save_last_position: usize, } impl Default for OpMaxUntil { fn default() -> Self { - Self { + OpMaxUntil { jump_id: 0, count: 0, - save_last_position: -1, - child_ctx_id: 0, + save_last_position: 0, } } } impl OpcodeExecutor for OpMaxUntil { fn next(&mut self, drive: &mut StackDrive) -> Option<()> { match self.jump_id { - 0 => { - drive.state.string_position = drive.ctx().string_position; - let repeat = match drive.state.repeat_stack.last_mut() { - Some(repeat) => repeat, - None => { - todo!("Internal re error: MAX_UNTIL without REPEAT."); - } - }; - self.count = repeat.count + 1; - - if self.count < repeat.mincount as isize { - // not enough matches - repeat.count = self.count; - self.child_ctx_id = drive.push_new_context(4); - self.jump_id = 1; - return Some(()); - } - - if (self.count < repeat.maxcount as isize || repeat.maxcount == MAXREPEAT) - && (drive.state.string_position as isize != repeat.last_position) - { - // we may have enough matches, if we can match another item, do so - repeat.count = self.count; - self.save_last_position = repeat.last_position; - repeat.last_position = drive.state.string_position as isize; - drive.state.marks_push(); - self.child_ctx_id = drive.push_new_context(4); - self.jump_id = 2; - return Some(()); - } - - self.child_ctx_id = drive.push_new_context(1); - - self.jump_id = 3; - Some(()) - } - 1 => { - let child_ctx = &drive.state.context_stack[self.child_ctx_id]; - drive.ctx_mut().has_matched = child_ctx.has_matched; - if drive.ctx().has_matched != Some(true) { - drive.state.string_position = drive.ctx().string_position; - let repeat = drive.state.repeat_stack.last_mut().unwrap(); - repeat.count = self.count - 1; - } - None - } - 2 => { - let repeat = drive.state.repeat_stack.last_mut().unwrap(); - repeat.last_position = drive.state.string_position as isize; - let child_ctx = &drive.state.context_stack[self.child_ctx_id]; - if child_ctx.has_matched == Some(true) { - drive.state.marks_pop_discard(); - drive.ctx_mut().has_matched = Some(true); - return None; - } - repeat.count = self.count - 1; - drive.state.marks_pop(); - drive.state.string_position = drive.ctx().string_position; + 0 => self._0(drive), + 1 => self._1(drive), + 2 => self._2(drive), + 3 => self._3(drive), + 4 => self._4(drive), + _ => unreachable!(), + } + } +} +impl OpMaxUntil { + fn _0(&mut self, drive: &mut StackDrive) -> Option<()> { + let RepeatContext { + count, + code_position, + last_position, + } = *drive.repeat_ctx(); + drive.ctx_mut().code_position = code_position; + let mincount = drive.peek_code(2) as usize; + let maxcount = drive.peek_code(3) as usize; + self.count = count + 1; + + if (self.count as usize) < mincount { + // not enough matches + drive.repeat_ctx_mut().count = self.count; + drive.push_new_context(4); + self.jump_id = 1; + return Some(()); + } - self.child_ctx_id = drive.push_new_context(1); + if ((count as usize) < maxcount || maxcount == MAXREPEAT) + && drive.state.string_position != last_position + { + // we may have enough matches, if we can match another item, do so + drive.repeat_ctx_mut().count = self.count; + drive.state.marks_push(); + // self.save_last_position = last_position; + // drive.repeat_ctx_mut().last_position = drive.state.string_position; + drive.push_new_context(4); + self.jump_id = 2; + return Some(()); + } - self.jump_id = 3; - Some(()) - } - 3 => { - // cannot match more repeated items here. make sure the tail matches - let child_ctx = &drive.state.context_stack[self.child_ctx_id]; - drive.ctx_mut().has_matched = child_ctx.has_matched; - if drive.ctx().has_matched != Some(true) { - drive.state.string_position = drive.ctx().string_position; - } else { - drive.state.repeat_stack.pop(); - } - None - } - _ => unreachable!(), + self.jump_id = 3; + self.next(drive) + } + fn _1(&mut self, drive: &mut StackDrive) -> Option<()> { + let child_ctx = drive.state.popped_context.unwrap(); + drive.ctx_mut().has_matched = child_ctx.has_matched; + if drive.ctx().has_matched != Some(true) { + drive.repeat_ctx_mut().count = self.count - 1; + drive.state.string_position = drive.ctx().string_position; } + None + } + fn _2(&mut self, drive: &mut StackDrive) -> Option<()> { + // drive.repeat_ctx_mut().last_position = self.save_last_position; + let child_ctx = drive.state.popped_context.unwrap(); + if child_ctx.has_matched == Some(true) { + drive.state.marks_pop_discard(); + drive.ctx_mut().has_matched = Some(true); + return None; + } + drive.state.marks_pop(); + drive.repeat_ctx_mut().count = self.count - 1; + drive.state.string_position = drive.ctx().string_position; + self.jump_id = 3; + self.next(drive) + } + fn _3(&mut self, drive: &mut StackDrive) -> Option<()> { + // cannot match more repeated items here. make sure the tail matches + drive.skip_code(drive.peek_code(1) as usize + 1); + drive.push_new_context(1); + self.jump_id = 4; + Some(()) + } + fn _4(&mut self, drive: &mut StackDrive) -> Option<()> { + let child_ctx = drive.state.popped_context.unwrap(); + drive.ctx_mut().has_matched = child_ctx.has_matched; + if drive.ctx().has_matched != Some(true) { + drive.state.string_position = drive.ctx().string_position; + } + None } } +// struct OpMaxUntil { +// jump_id: usize, +// count: isize, +// save_last_position: isize, +// } +// impl Default for OpMaxUntil { +// fn default() -> Self { +// Self { +// jump_id: 0, +// count: 0, +// save_last_position: -1, +// } +// } +// } +// impl OpcodeExecutor for OpMaxUntil { +// fn next(&mut self, drive: &mut StackDrive) -> Option<()> { +// match self.jump_id { +// 0 => { +// drive.state.string_position = drive.ctx().string_position; +// let repeat = match drive.state.repeat_stack.last_mut() { +// Some(repeat) => repeat, +// None => { +// panic!("Internal re error: MAX_UNTIL without REPEAT."); +// } +// }; +// self.count = repeat.count + 1; + +// if self.count < repeat.mincount as isize { +// // not enough matches +// repeat.count = self.count; +// drive.push_new_context(4); +// self.jump_id = 1; +// return Some(()); +// } + +// if (self.count < repeat.maxcount as isize || repeat.maxcount == MAXREPEAT) +// && (drive.state.string_position as isize != repeat.last_position) +// { +// // we may have enough matches, if we can match another item, do so +// repeat.count = self.count; +// self.save_last_position = repeat.last_position; +// repeat.last_position = drive.state.string_position as isize; +// drive.state.marks_push(); +// drive.push_new_context(4); +// self.jump_id = 2; +// return Some(()); +// } + +// drive.push_new_context(1); + +// self.jump_id = 3; +// Some(()) +// } +// 1 => { +// let child_ctx = drive.state.popped_context.unwrap(); +// drive.ctx_mut().has_matched = child_ctx.has_matched; +// if drive.ctx().has_matched != Some(true) { +// drive.state.string_position = drive.ctx().string_position; +// let repeat = drive.state.repeat_stack.last_mut().unwrap(); +// repeat.count = self.count - 1; +// } +// None +// } +// 2 => { +// let repeat = drive.state.repeat_stack.last_mut().unwrap(); +// repeat.last_position = drive.state.string_position as isize; +// let child_ctx = drive.state.popped_context.unwrap(); +// if child_ctx.has_matched == Some(true) { +// drive.state.marks_pop_discard(); +// drive.ctx_mut().has_matched = Some(true); +// return None; +// } +// repeat.count = self.count - 1; +// drive.state.marks_pop(); +// drive.state.string_position = drive.ctx().string_position; + +// drive.push_new_context(1); + +// self.jump_id = 3; +// Some(()) +// } +// 3 => { +// // cannot match more repeated items here. make sure the tail matches +// let child_ctx = drive.state.popped_context.unwrap(); +// drive.ctx_mut().has_matched = child_ctx.has_matched; +// if drive.ctx().has_matched != Some(true) { +// drive.state.string_position = drive.ctx().string_position; +// } else { +// drive.state.repeat_stack.pop(); +// } +// None +// } +// _ => unreachable!(), +// } +// } +// } + struct OpMinUntil { jump_id: usize, count: isize, - child_ctx_id: usize, } impl Default for OpMinUntil { fn default() -> Self { Self { jump_id: 0, count: 0, - child_ctx_id: 0, } } } impl OpcodeExecutor for OpMinUntil { fn next(&mut self, drive: &mut StackDrive) -> Option<()> { - match self.jump_id { - 0 => { - drive.state.string_position = drive.ctx().string_position; - let repeat = match drive.state.repeat_stack.last_mut() { - Some(repeat) => repeat, - None => { - todo!("Internal re error: MAX_UNTIL without REPEAT."); - } - }; - self.count = repeat.count + 1; - - if self.count < repeat.mincount as isize { - // not enough matches - repeat.count = self.count; - self.child_ctx_id = drive.push_new_context(4); - self.jump_id = 1; - return Some(()); - } - - // see if the tail matches - drive.state.marks_push(); - self.child_ctx_id = drive.push_new_context(1); - self.jump_id = 2; - Some(()) - } - 1 => { - let child_ctx = &drive.state.context_stack[self.child_ctx_id]; - drive.ctx_mut().has_matched = child_ctx.has_matched; - if drive.ctx().has_matched != Some(true) { - drive.state.string_position = drive.ctx().string_position; - let repeat = drive.state.repeat_stack.last_mut().unwrap(); - repeat.count = self.count - 1; - } - None - } - 2 => { - let child_ctx = &drive.state.context_stack[self.child_ctx_id]; - if child_ctx.has_matched == Some(true) { - drive.state.repeat_stack.pop(); - drive.ctx_mut().has_matched = Some(true); - return None; - } - drive.state.string_position = drive.ctx().string_position; - drive.state.marks_pop(); - - // match more until tail matches - let repeat = drive.state.repeat_stack.last_mut().unwrap(); - if self.count >= repeat.maxcount as isize && repeat.maxcount != MAXREPEAT { - drive.ctx_mut().has_matched = Some(false); - return None; - } - repeat.count = self.count; - self.child_ctx_id = drive.push_new_context(4); - self.jump_id = 1; - Some(()) - } - _ => unreachable!(), - } + None + // match self.jump_id { + // 0 => { + // drive.state.string_position = drive.ctx().string_position; + // let repeat = match drive.state.repeat_stack.last_mut() { + // Some(repeat) => repeat, + // None => { + // todo!("Internal re error: MAX_UNTIL without REPEAT."); + // } + // }; + // self.count = repeat.count + 1; + + // if self.count < repeat.mincount as isize { + // // not enough matches + // repeat.count = self.count; + // drive.push_new_context(4); + // self.jump_id = 1; + // return Some(()); + // } + + // // see if the tail matches + // drive.state.marks_push(); + // drive.push_new_context(1); + // self.jump_id = 2; + // Some(()) + // } + // 1 => { + // let child_ctx = drive.state.popped_context.unwrap(); + // drive.ctx_mut().has_matched = child_ctx.has_matched; + // if drive.ctx().has_matched != Some(true) { + // drive.state.string_position = drive.ctx().string_position; + // let repeat = drive.state.repeat_stack.last_mut().unwrap(); + // repeat.count = self.count - 1; + // } + // None + // } + // 2 => { + // let child_ctx = drive.state.popped_context.unwrap(); + // if child_ctx.has_matched == Some(true) { + // drive.state.repeat_stack.pop(); + // drive.ctx_mut().has_matched = Some(true); + // return None; + // } + // drive.state.string_position = drive.ctx().string_position; + // drive.state.marks_pop(); + + // // match more until tail matches + // let repeat = drive.state.repeat_stack.last_mut().unwrap(); + // if self.count >= repeat.maxcount as isize && repeat.maxcount != MAXREPEAT { + // drive.ctx_mut().has_matched = Some(false); + // return None; + // } + // repeat.count = self.count; + // drive.push_new_context(4); + // self.jump_id = 1; + // Some(()) + // } + // _ => unreachable!(), + // } } } struct OpBranch { jump_id: usize, - child_ctx_id: usize, current_branch_length: usize, } impl Default for OpBranch { fn default() -> Self { Self { jump_id: 0, - child_ctx_id: 0, current_branch_length: 0, } } @@ -1268,12 +1377,12 @@ impl OpcodeExecutor for OpBranch { return None; } drive.state.string_position = drive.ctx().string_position; - self.child_ctx_id = drive.push_new_context(1); + drive.push_new_context(1); self.jump_id = 2; Some(()) } 2 => { - let child_ctx = &drive.state.context_stack[self.child_ctx_id]; + let child_ctx = drive.state.popped_context.unwrap(); if child_ctx.has_matched == Some(true) { drive.ctx_mut().has_matched = Some(true); return None; @@ -1287,48 +1396,8 @@ impl OpcodeExecutor for OpBranch { } } -struct OpRepeat { - jump_id: usize, - child_ctx_id: usize, -} -impl Default for OpRepeat { - fn default() -> Self { - Self { - jump_id: 0, - child_ctx_id: 0, - } - } -} -impl OpcodeExecutor for OpRepeat { - fn next(&mut self, drive: &mut StackDrive) -> Option<()> { - match self.jump_id { - 0 => { - let repeat = RepeatContext { - skip: drive.peek_code(1) as usize, - mincount: drive.peek_code(2) as usize, - maxcount: drive.peek_code(3) as usize, - count: -1, - last_position: -1, - }; - drive.state.repeat_stack.push(repeat); - drive.state.string_position = drive.ctx().string_position; - self.child_ctx_id = drive.push_new_context(drive.peek_code(1) as usize + 1); - self.jump_id = 1; - Some(()) - } - 1 => { - let child_ctx = &drive.state.context_stack[self.child_ctx_id]; - drive.ctx_mut().has_matched = child_ctx.has_matched; - None - } - _ => unreachable!(), - } - } -} - struct OpRepeatOne { jump_id: usize, - child_ctx_id: usize, mincount: usize, maxcount: usize, count: isize, @@ -1337,7 +1406,6 @@ impl Default for OpRepeatOne { fn default() -> Self { Self { jump_id: 0, - child_ctx_id: 0, mincount: 0, maxcount: 0, count: 0, @@ -1382,7 +1450,7 @@ impl OpcodeExecutor for OpRepeatOne { // General case: backtracking if self.count >= self.mincount as isize { drive.state.string_position = drive.ctx().string_position; - self.child_ctx_id = drive.push_new_context(drive.peek_code(1) as usize + 1); + drive.push_new_context(drive.peek_code(1) as usize + 1); self.jump_id = 2; return Some(()); } @@ -1392,7 +1460,7 @@ impl OpcodeExecutor for OpRepeatOne { None } 2 => { - let child_ctx = &drive.state.context_stack[self.child_ctx_id]; + let child_ctx = drive.state.popped_context.unwrap(); if child_ctx.has_matched == Some(true) { drive.ctx_mut().has_matched = Some(true); return None; From 8fba935bba46ade66107a3c7aabb65e4eae00dcb Mon Sep 17 00:00:00 2001 From: Kangzhi Shi Date: Fri, 1 Jan 2021 10:54:44 +0200 Subject: [PATCH 017/960] OpMaxUntil zero-width protection --- interp.rs | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/interp.rs b/interp.rs index cb06ee0b8a..4a015187e9 100644 --- a/interp.rs +++ b/interp.rs @@ -1107,6 +1107,7 @@ impl OpMaxUntil { drive.ctx_mut().code_position = code_position; let mincount = drive.peek_code(2) as usize; let maxcount = drive.peek_code(3) as usize; + drive.state.string_position = drive.ctx().string_position; self.count = count + 1; if (self.count as usize) < mincount { @@ -1123,8 +1124,8 @@ impl OpMaxUntil { // we may have enough matches, if we can match another item, do so drive.repeat_ctx_mut().count = self.count; drive.state.marks_push(); - // self.save_last_position = last_position; - // drive.repeat_ctx_mut().last_position = drive.state.string_position; + self.save_last_position = last_position; + drive.repeat_ctx_mut().last_position = drive.state.string_position; drive.push_new_context(4); self.jump_id = 2; return Some(()); @@ -1143,7 +1144,7 @@ impl OpMaxUntil { None } fn _2(&mut self, drive: &mut StackDrive) -> Option<()> { - // drive.repeat_ctx_mut().last_position = self.save_last_position; + drive.repeat_ctx_mut().last_position = self.save_last_position; let child_ctx = drive.state.popped_context.unwrap(); if child_ctx.has_matched == Some(true) { drive.state.marks_pop_discard(); From af1a53cb0530f27804442f86d3e1e05a4abcacef Mon Sep 17 00:00:00 2001 From: Kangzhi Shi Date: Fri, 1 Jan 2021 11:34:40 +0200 Subject: [PATCH 018/960] impl Match.groups() --- interp.rs | 249 ++++++++++++++++-------------------------------------- 1 file changed, 72 insertions(+), 177 deletions(-) diff --git a/interp.rs b/interp.rs index 4a015187e9..ad0790a31b 100644 --- a/interp.rs +++ b/interp.rs @@ -1070,7 +1070,6 @@ impl OpMinRepeatOne { } } -// Everything is stored in RepeatContext struct OpMaxUntil { jump_id: usize, count: isize, @@ -1088,189 +1087,85 @@ impl Default for OpMaxUntil { impl OpcodeExecutor for OpMaxUntil { fn next(&mut self, drive: &mut StackDrive) -> Option<()> { match self.jump_id { - 0 => self._0(drive), - 1 => self._1(drive), - 2 => self._2(drive), - 3 => self._3(drive), - 4 => self._4(drive), - _ => unreachable!(), - } - } -} -impl OpMaxUntil { - fn _0(&mut self, drive: &mut StackDrive) -> Option<()> { - let RepeatContext { - count, - code_position, - last_position, - } = *drive.repeat_ctx(); - drive.ctx_mut().code_position = code_position; - let mincount = drive.peek_code(2) as usize; - let maxcount = drive.peek_code(3) as usize; - drive.state.string_position = drive.ctx().string_position; - self.count = count + 1; + 0 => { + let RepeatContext { + count, + code_position, + last_position, + } = *drive.repeat_ctx(); + drive.ctx_mut().code_position = code_position; + let mincount = drive.peek_code(2) as usize; + let maxcount = drive.peek_code(3) as usize; + drive.state.string_position = drive.ctx().string_position; + self.count = count + 1; - if (self.count as usize) < mincount { - // not enough matches - drive.repeat_ctx_mut().count = self.count; - drive.push_new_context(4); - self.jump_id = 1; - return Some(()); - } + if (self.count as usize) < mincount { + // not enough matches + drive.repeat_ctx_mut().count = self.count; + drive.push_new_context(4); + self.jump_id = 1; + return Some(()); + } - if ((count as usize) < maxcount || maxcount == MAXREPEAT) - && drive.state.string_position != last_position - { - // we may have enough matches, if we can match another item, do so - drive.repeat_ctx_mut().count = self.count; - drive.state.marks_push(); - self.save_last_position = last_position; - drive.repeat_ctx_mut().last_position = drive.state.string_position; - drive.push_new_context(4); - self.jump_id = 2; - return Some(()); - } + if ((count as usize) < maxcount || maxcount == MAXREPEAT) + && drive.state.string_position != last_position + { + // we may have enough matches, if we can match another item, do so + drive.repeat_ctx_mut().count = self.count; + drive.state.marks_push(); + self.save_last_position = last_position; + drive.repeat_ctx_mut().last_position = drive.state.string_position; + drive.push_new_context(4); + self.jump_id = 2; + return Some(()); + } - self.jump_id = 3; - self.next(drive) - } - fn _1(&mut self, drive: &mut StackDrive) -> Option<()> { - let child_ctx = drive.state.popped_context.unwrap(); - drive.ctx_mut().has_matched = child_ctx.has_matched; - if drive.ctx().has_matched != Some(true) { - drive.repeat_ctx_mut().count = self.count - 1; - drive.state.string_position = drive.ctx().string_position; - } - None - } - fn _2(&mut self, drive: &mut StackDrive) -> Option<()> { - drive.repeat_ctx_mut().last_position = self.save_last_position; - let child_ctx = drive.state.popped_context.unwrap(); - if child_ctx.has_matched == Some(true) { - drive.state.marks_pop_discard(); - drive.ctx_mut().has_matched = Some(true); - return None; - } - drive.state.marks_pop(); - drive.repeat_ctx_mut().count = self.count - 1; - drive.state.string_position = drive.ctx().string_position; - self.jump_id = 3; - self.next(drive) - } - fn _3(&mut self, drive: &mut StackDrive) -> Option<()> { - // cannot match more repeated items here. make sure the tail matches - drive.skip_code(drive.peek_code(1) as usize + 1); - drive.push_new_context(1); - self.jump_id = 4; - Some(()) - } - fn _4(&mut self, drive: &mut StackDrive) -> Option<()> { - let child_ctx = drive.state.popped_context.unwrap(); - drive.ctx_mut().has_matched = child_ctx.has_matched; - if drive.ctx().has_matched != Some(true) { - drive.state.string_position = drive.ctx().string_position; + self.jump_id = 3; + self.next(drive) + } + 1 => { + let child_ctx = drive.state.popped_context.unwrap(); + drive.ctx_mut().has_matched = child_ctx.has_matched; + if drive.ctx().has_matched != Some(true) { + drive.repeat_ctx_mut().count = self.count - 1; + drive.state.string_position = drive.ctx().string_position; + } + None + } + 2 => { + drive.repeat_ctx_mut().last_position = self.save_last_position; + let child_ctx = drive.state.popped_context.unwrap(); + if child_ctx.has_matched == Some(true) { + drive.state.marks_pop_discard(); + drive.ctx_mut().has_matched = Some(true); + return None; + } + drive.state.marks_pop(); + drive.repeat_ctx_mut().count = self.count - 1; + drive.state.string_position = drive.ctx().string_position; + self.jump_id = 3; + self.next(drive) + } + 3 => { + // cannot match more repeated items here. make sure the tail matches + drive.skip_code(drive.peek_code(1) as usize + 1); + drive.push_new_context(1); + self.jump_id = 4; + Some(()) + } + 4 => { + let child_ctx = drive.state.popped_context.unwrap(); + drive.ctx_mut().has_matched = child_ctx.has_matched; + if drive.ctx().has_matched != Some(true) { + drive.state.string_position = drive.ctx().string_position; + } + None + } + _ => unreachable!(), } - None } } -// struct OpMaxUntil { -// jump_id: usize, -// count: isize, -// save_last_position: isize, -// } -// impl Default for OpMaxUntil { -// fn default() -> Self { -// Self { -// jump_id: 0, -// count: 0, -// save_last_position: -1, -// } -// } -// } -// impl OpcodeExecutor for OpMaxUntil { -// fn next(&mut self, drive: &mut StackDrive) -> Option<()> { -// match self.jump_id { -// 0 => { -// drive.state.string_position = drive.ctx().string_position; -// let repeat = match drive.state.repeat_stack.last_mut() { -// Some(repeat) => repeat, -// None => { -// panic!("Internal re error: MAX_UNTIL without REPEAT."); -// } -// }; -// self.count = repeat.count + 1; - -// if self.count < repeat.mincount as isize { -// // not enough matches -// repeat.count = self.count; -// drive.push_new_context(4); -// self.jump_id = 1; -// return Some(()); -// } - -// if (self.count < repeat.maxcount as isize || repeat.maxcount == MAXREPEAT) -// && (drive.state.string_position as isize != repeat.last_position) -// { -// // we may have enough matches, if we can match another item, do so -// repeat.count = self.count; -// self.save_last_position = repeat.last_position; -// repeat.last_position = drive.state.string_position as isize; -// drive.state.marks_push(); -// drive.push_new_context(4); -// self.jump_id = 2; -// return Some(()); -// } - -// drive.push_new_context(1); - -// self.jump_id = 3; -// Some(()) -// } -// 1 => { -// let child_ctx = drive.state.popped_context.unwrap(); -// drive.ctx_mut().has_matched = child_ctx.has_matched; -// if drive.ctx().has_matched != Some(true) { -// drive.state.string_position = drive.ctx().string_position; -// let repeat = drive.state.repeat_stack.last_mut().unwrap(); -// repeat.count = self.count - 1; -// } -// None -// } -// 2 => { -// let repeat = drive.state.repeat_stack.last_mut().unwrap(); -// repeat.last_position = drive.state.string_position as isize; -// let child_ctx = drive.state.popped_context.unwrap(); -// if child_ctx.has_matched == Some(true) { -// drive.state.marks_pop_discard(); -// drive.ctx_mut().has_matched = Some(true); -// return None; -// } -// repeat.count = self.count - 1; -// drive.state.marks_pop(); -// drive.state.string_position = drive.ctx().string_position; - -// drive.push_new_context(1); - -// self.jump_id = 3; -// Some(()) -// } -// 3 => { -// // cannot match more repeated items here. make sure the tail matches -// let child_ctx = drive.state.popped_context.unwrap(); -// drive.ctx_mut().has_matched = child_ctx.has_matched; -// if drive.ctx().has_matched != Some(true) { -// drive.state.string_position = drive.ctx().string_position; -// } else { -// drive.state.repeat_stack.pop(); -// } -// None -// } -// _ => unreachable!(), -// } -// } -// } - struct OpMinUntil { jump_id: usize, count: isize, From db84f329816d812349b49fa843b4d0480674aba2 Mon Sep 17 00:00:00 2001 From: Kangzhi Shi Date: Fri, 1 Jan 2021 13:10:19 +0200 Subject: [PATCH 019/960] fix Opcode::CHARSET --- interp.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/interp.rs b/interp.rs index ad0790a31b..289dbc2ec9 100644 --- a/interp.rs +++ b/interp.rs @@ -741,7 +741,8 @@ fn charset(set: &[u32], c: char) -> bool { } SreOpcode::CHARSET => { /* */ - if ch < 256 && (set[(ch / 32) as usize] & (1 << (32 - 1))) != 0 { + let set = &set[1..]; + if ch < 256 && ((set[(ch >> 5) as usize] & (1u32 << (ch & 31))) != 0) { return ok; } i += 8; From 36433a9f4d026df404e419d604e67295fa4db758 Mon Sep 17 00:00:00 2001 From: Kangzhi Shi Date: Fri, 1 Jan 2021 20:13:06 +0200 Subject: [PATCH 020/960] fix Opcode::BIGCHARSET --- interp.rs | 32 ++++++++++++++------------------ 1 file changed, 14 insertions(+), 18 deletions(-) diff --git a/interp.rs b/interp.rs index 289dbc2ec9..ca5efd9e4f 100644 --- a/interp.rs +++ b/interp.rs @@ -497,22 +497,16 @@ impl OpcodeDispatcher { } }), SreOpcode::IN => once(|drive| { - general_op_in(drive, |x| x); + general_op_in(drive, |set, c| charset(set, c)); }), SreOpcode::IN_IGNORE => once(|drive| { - general_op_in(drive, lower_ascii); + general_op_in(drive, |set, c| charset(set, lower_ascii(c))); }), SreOpcode::IN_UNI_IGNORE => once(|drive| { - general_op_in(drive, lower_unicode); + general_op_in(drive, |set, c| charset(set, lower_unicode(c))); }), SreOpcode::IN_LOC_IGNORE => once(|drive| { - let skip = drive.peek_code(1) as usize; - if drive.at_end() || !charset_loc_ignore(&drive.pattern()[2..], drive.peek_char()) { - drive.ctx_mut().has_matched = Some(false); - } else { - drive.skip_code(skip + 1); - drive.skip_char(1); - } + general_op_in(drive, |set, c| charset_loc_ignore(set, c)); }), SreOpcode::INFO | SreOpcode::JUMP => once(|drive| { drive.skip_code(drive.peek_code(1) as usize + 1); @@ -661,9 +655,9 @@ fn general_op_literal bool>(drive: &mut StackDrive, f: F } } -fn general_op_in char>(drive: &mut StackDrive, f: F) { +fn general_op_in bool>(drive: &mut StackDrive, f: F) { let skip = drive.peek_code(1) as usize; - if drive.at_end() || !charset(&drive.pattern()[2..], f(drive.peek_char())) { + if drive.at_end() || !f(&drive.pattern()[2..], drive.peek_char()) { drive.ctx_mut().has_matched = Some(false); } else { drive.skip_code(skip + 1); @@ -749,18 +743,20 @@ fn charset(set: &[u32], c: char) -> bool { } SreOpcode::BIGCHARSET => { /* <256 blockindices> */ - let count = set[i + 1]; + let count = set[i + 1] as usize; if ch < 0x10000 { - let (_, blockindices, _) = unsafe { set[i + 2..].align_to::() }; - let block = blockindices[(ch >> 8) as usize]; - if set[2 + 64 + ((block as u32 * 256 + (ch & 255)) / 32) as usize] - & (1 << (ch & (32 - 1))) + let set = &set[2..]; + let block_index = ch >> 8; + let (_, blockindices, _) = unsafe { set.align_to::() }; + let blocks = &set[64..]; + let block = blockindices[block_index as usize]; + if blocks[((block as u32 * 256 + (ch & 255)) / 32) as usize] & (1u32 << (ch & 31)) != 0 { return ok; } } - i += 2 + 64 + count as usize * 8; + i += 2 + 64 + count * 8; } SreOpcode::LITERAL => { /* */ From f05f6cb44df000ad7cffcd79b6447c97756d15dd Mon Sep 17 00:00:00 2001 From: Kangzhi Shi Date: Sun, 3 Jan 2021 19:44:40 +0200 Subject: [PATCH 021/960] impl Pattern.sub --- interp.rs | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/interp.rs b/interp.rs index ca5efd9e4f..98844ee9ca 100644 --- a/interp.rs +++ b/interp.rs @@ -142,6 +142,27 @@ pub(crate) fn pymatch( } } +pub(crate) fn search( + string: PyStrRef, + start: usize, + end: usize, + pattern: PyRef, +) -> Option { + // TODO: optimize by op info and skip prefix + let end = std::cmp::min(end, string.char_len()); + for i in start..end { + if let Some(m) = pymatch( + string.clone(), + i, + end, + pattern.clone(), + ) { + return Some(m); + } + } + None +} + #[derive(Debug, Copy, Clone)] struct MatchContext { string_position: usize, @@ -750,7 +771,8 @@ fn charset(set: &[u32], c: char) -> bool { let (_, blockindices, _) = unsafe { set.align_to::() }; let blocks = &set[64..]; let block = blockindices[block_index as usize]; - if blocks[((block as u32 * 256 + (ch & 255)) / 32) as usize] & (1u32 << (ch & 31)) + if blocks[((block as u32 * 256 + (ch & 255)) / 32) as usize] + & (1u32 << (ch & 31)) != 0 { return ok; From 817eb66167810a163eb889ce2626418b51bd6afb Mon Sep 17 00:00:00 2001 From: Kangzhi Shi Date: Mon, 4 Jan 2021 16:01:17 +0200 Subject: [PATCH 022/960] fix OpMinUntil --- interp.rs | 100 +++++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 80 insertions(+), 20 deletions(-) diff --git a/interp.rs b/interp.rs index 98844ee9ca..a3263eddce 100644 --- a/interp.rs +++ b/interp.rs @@ -151,12 +151,7 @@ pub(crate) fn search( // TODO: optimize by op info and skip prefix let end = std::cmp::min(end, string.char_len()); for i in start..end { - if let Some(m) = pymatch( - string.clone(), - i, - end, - pattern.clone(), - ) { + if let Some(m) = pymatch(string.clone(), i, end, pattern.clone()) { return Some(m); } } @@ -294,12 +289,13 @@ impl<'a> StackDrive<'a> { self.state } fn push_new_context(&mut self, pattern_offset: usize) { - let ctx = self.ctx(); - let mut child_ctx = MatchContext { ..*ctx }; + let mut child_ctx = MatchContext { ..*self.ctx() }; child_ctx.code_position += pattern_offset; - if child_ctx.code_position > self.state.pattern_codes.len() { - child_ctx.code_position = self.state.pattern_codes.len(); - } + self.state.context_stack.push(child_ctx); + } + fn push_new_context_at(&mut self, code_position: usize) { + let mut child_ctx = MatchContext { ..*self.ctx() }; + child_ctx.code_position = code_position; self.state.context_stack.push(child_ctx); } fn repeat_ctx_mut(&mut self) -> &mut RepeatContext { @@ -571,6 +567,8 @@ impl OpcodeDispatcher { count: -1, code_position: drive.ctx().code_position, last_position: std::usize::MAX, + mincount: drive.peek_code(2) as usize, + maxcount: drive.peek_code(3) as usize, }; drive.state.repeat_stack.push(repeat); drive.state.string_position = drive.ctx().string_position; @@ -584,7 +582,7 @@ impl OpcodeDispatcher { }, ), SreOpcode::MAX_UNTIL => Box::new(OpMaxUntil::default()), - SreOpcode::MIN_UNTIL => todo!("min until"), + SreOpcode::MIN_UNTIL => Box::new(OpMinUntil::default()), SreOpcode::REPEAT_ONE => Box::new(OpRepeatOne::default()), SreOpcode::MIN_REPEAT_ONE => Box::new(OpMinRepeatOne::default()), SreOpcode::GROUPREF => once(|drive| general_op_groupref(drive, |x| x)), @@ -996,6 +994,8 @@ struct RepeatContext { code_position: usize, // zero-width match protection last_position: usize, + mincount: usize, + maxcount: usize, } struct OpMinRepeatOne { @@ -1111,22 +1111,22 @@ impl OpcodeExecutor for OpMaxUntil { count, code_position, last_position, + mincount, + maxcount, } = *drive.repeat_ctx(); - drive.ctx_mut().code_position = code_position; - let mincount = drive.peek_code(2) as usize; - let maxcount = drive.peek_code(3) as usize; + drive.state.string_position = drive.ctx().string_position; self.count = count + 1; if (self.count as usize) < mincount { // not enough matches drive.repeat_ctx_mut().count = self.count; - drive.push_new_context(4); + drive.push_new_context_at(code_position + 4); self.jump_id = 1; return Some(()); } - if ((count as usize) < maxcount || maxcount == MAXREPEAT) + if ((self.count as usize) < maxcount || maxcount == MAXREPEAT) && drive.state.string_position != last_position { // we may have enough matches, if we can match another item, do so @@ -1134,7 +1134,7 @@ impl OpcodeExecutor for OpMaxUntil { drive.state.marks_push(); self.save_last_position = last_position; drive.repeat_ctx_mut().last_position = drive.state.string_position; - drive.push_new_context(4); + drive.push_new_context_at(code_position + 4); self.jump_id = 2; return Some(()); } @@ -1167,7 +1167,6 @@ impl OpcodeExecutor for OpMaxUntil { } 3 => { // cannot match more repeated items here. make sure the tail matches - drive.skip_code(drive.peek_code(1) as usize + 1); drive.push_new_context(1); self.jump_id = 4; Some(()) @@ -1188,18 +1187,79 @@ impl OpcodeExecutor for OpMaxUntil { struct OpMinUntil { jump_id: usize, count: isize, + save_repeat: Option, } impl Default for OpMinUntil { fn default() -> Self { Self { jump_id: 0, count: 0, + save_repeat: None, } } } impl OpcodeExecutor for OpMinUntil { fn next(&mut self, drive: &mut StackDrive) -> Option<()> { - None + match self.jump_id { + 0 => { + let RepeatContext { + count, + code_position, + last_position: _, + mincount, + maxcount: _, + } = *drive.repeat_ctx(); + drive.state.string_position = drive.ctx().string_position; + self.count = count + 1; + + if (self.count as usize) < mincount { + // not enough matches + drive.repeat_ctx_mut().count = self.count; + drive.push_new_context_at(code_position + 4); + self.jump_id = 1; + return Some(()); + } + + // see if the tail matches + drive.state.marks_push(); + self.save_repeat = drive.state.repeat_stack.pop(); + drive.push_new_context(1); + self.jump_id = 2; + Some(()) + } + 1 => { + let child_ctx = drive.state.popped_context.unwrap(); + drive.ctx_mut().has_matched = child_ctx.has_matched; + if drive.ctx().has_matched != Some(true) { + drive.repeat_ctx_mut().count = self.count - 1; + drive.state.string_position = drive.ctx().string_position; + } + None + } + 2 => { + let child_ctx = drive.state.popped_context.unwrap(); + if child_ctx.has_matched == Some(true) { + drive.ctx_mut().has_matched = Some(true); + return None; + } + drive.state.repeat_stack.push(self.save_repeat.unwrap()); + drive.state.string_position = drive.ctx().string_position; + drive.state.marks_pop(); + + // match more unital tail matches + let maxcount = drive.repeat_ctx().maxcount; + let code_position = drive.repeat_ctx().code_position; + if self.count as usize >= maxcount && maxcount != MAXREPEAT { + drive.ctx_mut().has_matched = Some(false); + return None; + } + drive.repeat_ctx_mut().count = self.count; + drive.push_new_context_at(code_position + 4); + self.jump_id = 1; + Some(()) + } + _ => unreachable!(), + } // match self.jump_id { // 0 => { // drive.state.string_position = drive.ctx().string_position; From 33ef82364516422776beaa498db8d96a167c7b95 Mon Sep 17 00:00:00 2001 From: Kangzhi Shi Date: Wed, 6 Jan 2021 10:30:57 +0200 Subject: [PATCH 023/960] impl Match.groupdict --- interp.rs | 58 ------------------------------------------------------- 1 file changed, 58 deletions(-) diff --git a/interp.rs b/interp.rs index a3263eddce..c118984b38 100644 --- a/interp.rs +++ b/interp.rs @@ -1260,64 +1260,6 @@ impl OpcodeExecutor for OpMinUntil { } _ => unreachable!(), } - // match self.jump_id { - // 0 => { - // drive.state.string_position = drive.ctx().string_position; - // let repeat = match drive.state.repeat_stack.last_mut() { - // Some(repeat) => repeat, - // None => { - // todo!("Internal re error: MAX_UNTIL without REPEAT."); - // } - // }; - // self.count = repeat.count + 1; - - // if self.count < repeat.mincount as isize { - // // not enough matches - // repeat.count = self.count; - // drive.push_new_context(4); - // self.jump_id = 1; - // return Some(()); - // } - - // // see if the tail matches - // drive.state.marks_push(); - // drive.push_new_context(1); - // self.jump_id = 2; - // Some(()) - // } - // 1 => { - // let child_ctx = drive.state.popped_context.unwrap(); - // drive.ctx_mut().has_matched = child_ctx.has_matched; - // if drive.ctx().has_matched != Some(true) { - // drive.state.string_position = drive.ctx().string_position; - // let repeat = drive.state.repeat_stack.last_mut().unwrap(); - // repeat.count = self.count - 1; - // } - // None - // } - // 2 => { - // let child_ctx = drive.state.popped_context.unwrap(); - // if child_ctx.has_matched == Some(true) { - // drive.state.repeat_stack.pop(); - // drive.ctx_mut().has_matched = Some(true); - // return None; - // } - // drive.state.string_position = drive.ctx().string_position; - // drive.state.marks_pop(); - - // // match more until tail matches - // let repeat = drive.state.repeat_stack.last_mut().unwrap(); - // if self.count >= repeat.maxcount as isize && repeat.maxcount != MAXREPEAT { - // drive.ctx_mut().has_matched = Some(false); - // return None; - // } - // repeat.count = self.count; - // drive.push_new_context(4); - // self.jump_id = 1; - // Some(()) - // } - // _ => unreachable!(), - // } } } From 76c95abbb5ecd91e05f216adefdd437b0f87b79b Mon Sep 17 00:00:00 2001 From: Kangzhi Shi Date: Thu, 7 Jan 2021 17:37:18 +0200 Subject: [PATCH 024/960] impl Match.lastgroup --- interp.rs | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/interp.rs b/interp.rs index c118984b38..febbfa2437 100644 --- a/interp.rs +++ b/interp.rs @@ -150,7 +150,7 @@ pub(crate) fn search( ) -> Option { // TODO: optimize by op info and skip prefix let end = std::cmp::min(end, string.char_len()); - for i in start..end { + for i in start..end + 1 { if let Some(m) = pymatch(string.clone(), i, end, pattern.clone()) { return Some(m); } @@ -1382,6 +1382,13 @@ impl OpcodeExecutor for OpRepeatOne { drive.ctx_mut().has_matched = Some(true); return None; } + if self.count <= self.mincount as isize { + drive.state.marks_pop_discard(); + drive.ctx_mut().has_matched = Some(false); + return None; + } + + // TODO: unnesscary double check drive.back_skip_char(1); self.count -= 1; drive.state.marks_pop_keep(); From 13a8b6cc4e38927273774ab96565f2184840f900 Mon Sep 17 00:00:00 2001 From: Kangzhi Shi Date: Sun, 17 Jan 2021 19:55:20 +0200 Subject: [PATCH 025/960] add bytes support and refactor --- interp.rs | 457 +++++++++++++++++++++++++++++++----------------------- 1 file changed, 266 insertions(+), 191 deletions(-) diff --git a/interp.rs b/interp.rs index febbfa2437..7c9246142a 100644 --- a/interp.rs +++ b/interp.rs @@ -1,18 +1,18 @@ // good luck to those that follow; here be dragons -use super::_sre::{Match, Pattern, MAXREPEAT}; +use super::_sre::MAXREPEAT; use super::constants::{SreAtCode, SreCatCode, SreFlag, SreOpcode}; -use crate::builtins::PyStrRef; -use crate::pyobject::PyRef; -use rustpython_common::borrow::BorrowValue; +use crate::builtins::PyBytes; +use crate::bytesinner::is_py_ascii_whitespace; +use crate::pyobject::{IntoPyObject, PyObjectRef}; +use crate::VirtualMachine; use std::collections::HashMap; use std::convert::TryFrom; +use std::unreachable; #[derive(Debug)] pub(crate) struct State<'a> { - string: &'a str, - // chars count - string_len: usize, + pub string: StrDrive<'a>, pub start: usize, pub end: usize, flags: SreFlag, @@ -24,22 +24,21 @@ pub(crate) struct State<'a> { repeat_stack: Vec, pub string_position: usize, popped_context: Option, + pub has_matched: Option, } impl<'a> State<'a> { pub(crate) fn new( - string: &'a str, + string: StrDrive<'a>, start: usize, end: usize, flags: SreFlag, pattern_codes: &'a [u32], ) -> Self { - let string_len = string.chars().count(); - let end = std::cmp::min(end, string_len); + let end = std::cmp::min(end, string.count()); let start = std::cmp::min(start, end); Self { string, - string_len, start, end, flags, @@ -51,16 +50,19 @@ impl<'a> State<'a> { marks: Vec::new(), string_position: start, popped_context: None, + has_matched: None, } } - fn reset(&mut self) { - self.marks.clear(); + pub fn reset(&mut self) { self.lastindex = -1; self.marks_stack.clear(); self.context_stack.clear(); self.repeat_stack.clear(); + self.marks.clear(); + self.string_position = self.start; self.popped_context = None; + self.has_matched = None; } fn set_mark(&mut self, mark_nr: usize, position: usize) { @@ -96,66 +98,133 @@ impl<'a> State<'a> { fn marks_pop_discard(&mut self) { self.marks_stack.pop(); } + + pub fn pymatch(mut self) -> Self { + let ctx = MatchContext { + string_position: self.start, + string_offset: self.string.offset(0, self.start), + code_position: 0, + has_matched: None, + }; + self.context_stack.push(ctx); + + let mut dispatcher = OpcodeDispatcher::new(); + let mut has_matched = None; + + loop { + if self.context_stack.is_empty() { + break; + } + let ctx_id = self.context_stack.len() - 1; + let mut drive = StackDrive::drive(ctx_id, self); + + has_matched = dispatcher.pymatch(&mut drive); + self = drive.take(); + if has_matched.is_some() { + self.popped_context = self.context_stack.pop(); + } + } + + self.has_matched = has_matched; + self + } + + pub fn search(mut self) -> Self { + // TODO: optimize by op info and skip prefix + loop { + self = self.pymatch(); + + if self.has_matched == Some(true) { + return self; + } + self.start += 1; + if self.start > self.end { + return self; + } + self.reset(); + } + } } -pub(crate) fn pymatch( - string: PyStrRef, - start: usize, - end: usize, - pattern: PyRef, -) -> Option { - let mut state = State::new( - string.borrow_value(), - start, - end, - pattern.flags, - &pattern.code, - ); - let ctx = MatchContext { - string_position: state.start, - string_offset: calc_string_offset(state.string, state.start), - code_position: 0, - has_matched: None, - }; - state.context_stack.push(ctx); - let mut dispatcher = OpcodeDispatcher::new(); +#[derive(Debug, Clone, Copy)] +pub(crate) enum StrDrive<'a> { + Str(&'a str), + Bytes(&'a [u8]), +} +impl<'a> StrDrive<'a> { + fn offset(&self, offset: usize, skip: usize) -> usize { + match *self { + StrDrive::Str(s) => s + .get(offset..) + .and_then(|s| s.char_indices().nth(skip).map(|x| x.0 + offset)) + .unwrap_or_else(|| s.len()), + StrDrive::Bytes(b) => std::cmp::min(offset + skip, b.len()), + } + } - let mut has_matched = None; - loop { - if state.context_stack.is_empty() { - break; + pub fn count(&self) -> usize { + match *self { + StrDrive::Str(s) => s.chars().count(), + StrDrive::Bytes(b) => b.len(), } - let ctx_id = state.context_stack.len() - 1; - let mut drive = StackDrive::drive(ctx_id, state); + } - has_matched = dispatcher.pymatch(&mut drive); - state = drive.take(); - if has_matched.is_some() { - state.popped_context = state.context_stack.pop(); + fn peek(&self, offset: usize) -> u32 { + match *self { + StrDrive::Str(s) => unsafe { s.get_unchecked(offset..) }.chars().next().unwrap() as u32, + StrDrive::Bytes(b) => b[offset] as u32, } } - if has_matched != Some(true) { - None - } else { - Some(Match::new(&state, pattern.clone(), string.clone())) + fn back_peek(&self, offset: usize) -> u32 { + match *self { + StrDrive::Str(s) => { + let bytes = s.as_bytes(); + let back_offset = utf8_back_peek_offset(bytes, offset); + match offset - back_offset { + 1 => u32::from_ne_bytes([0, 0, 0, bytes[offset]]), + 2 => u32::from_ne_bytes([0, 0, bytes[offset], bytes[offset + 1]]), + 3 => { + u32::from_ne_bytes([0, bytes[offset], bytes[offset + 1], bytes[offset + 2]]) + } + 4 => u32::from_ne_bytes([ + bytes[offset], + bytes[offset + 1], + bytes[offset + 2], + bytes[offset + 3], + ]), + _ => unreachable!(), + } + } + StrDrive::Bytes(b) => b[offset - 1] as u32, + } } -} -pub(crate) fn search( - string: PyStrRef, - start: usize, - end: usize, - pattern: PyRef, -) -> Option { - // TODO: optimize by op info and skip prefix - let end = std::cmp::min(end, string.char_len()); - for i in start..end + 1 { - if let Some(m) = pymatch(string.clone(), i, end, pattern.clone()) { - return Some(m); + fn back_offset(&self, offset: usize, skip: usize) -> usize { + match *self { + StrDrive::Str(s) => { + let bytes = s.as_bytes(); + let mut back_offset = offset; + for _ in 0..skip { + back_offset = utf8_back_peek_offset(bytes, back_offset); + } + back_offset + } + StrDrive::Bytes(_) => offset - skip, + } + } + + pub fn slice_to_pyobject(&self, start: usize, end: usize, vm: &VirtualMachine) -> PyObjectRef { + match *self { + StrDrive::Str(s) => s + .chars() + .take(end) + .skip(start) + .collect::() + .into_pyobject(vm), + StrDrive::Bytes(b) => PyBytes::from(b[start..end].to_vec()).into_pyobject(vm), } } - None } #[derive(Debug, Copy, Clone)] @@ -173,33 +242,21 @@ trait MatchContextDrive { fn repeat_ctx(&self) -> &RepeatContext { self.state().repeat_stack.last().unwrap() } - fn str(&self) -> &str { - unsafe { - std::str::from_utf8_unchecked( - &self.state().string.as_bytes()[self.ctx().string_offset..], - ) - } - } fn pattern(&self) -> &[u32] { &self.state().pattern_codes[self.ctx().code_position..] } - fn peek_char(&self) -> char { - self.str().chars().next().unwrap() + fn peek_char(&self) -> u32 { + self.state().string.peek(self.ctx().string_offset) } fn peek_code(&self, peek: usize) -> u32 { self.state().pattern_codes[self.ctx().code_position + peek] } fn skip_char(&mut self, skip_count: usize) { - match self.str().char_indices().nth(skip_count).map(|x| x.0) { - Some(skipped) => { - self.ctx_mut().string_position += skip_count; - self.ctx_mut().string_offset += skipped; - } - None => { - self.ctx_mut().string_position = self.state().end; - self.ctx_mut().string_offset = self.state().string.len(); // bytes len - } - } + self.ctx_mut().string_offset = self + .state() + .string + .offset(self.ctx().string_offset, skip_count); + self.ctx_mut().string_position += skip_count; } fn skip_code(&mut self, skip_count: usize) { self.ctx_mut().code_position += skip_count; @@ -222,7 +279,7 @@ trait MatchContextDrive { fn at_linebreak(&self) -> bool { !self.at_end() && is_linebreak(self.peek_char()) } - fn at_boundary bool>(&self, mut word_checker: F) -> bool { + fn at_boundary bool>(&self, mut word_checker: F) -> bool { if self.at_beginning() && self.at_end() { return false; } @@ -230,47 +287,15 @@ trait MatchContextDrive { let this = !self.at_end() && word_checker(self.peek_char()); this != that } - fn back_peek_offset(&self) -> usize { - let bytes = self.state().string.as_bytes(); - let mut offset = self.ctx().string_offset - 1; - if !is_utf8_first_byte(bytes[offset]) { - offset -= 1; - if !is_utf8_first_byte(bytes[offset]) { - offset -= 1; - if !is_utf8_first_byte(bytes[offset]) { - offset -= 1; - if !is_utf8_first_byte(bytes[offset]) { - panic!("not utf-8 code point"); - } - } - } - } - offset - } - fn back_peek_char(&self) -> char { - let bytes = self.state().string.as_bytes(); - let offset = self.back_peek_offset(); - let current_offset = self.ctx().string_offset; - let code = match current_offset - offset { - 1 => u32::from_ne_bytes([0, 0, 0, bytes[offset]]), - 2 => u32::from_ne_bytes([0, 0, bytes[offset], bytes[offset + 1]]), - 3 => u32::from_ne_bytes([0, bytes[offset], bytes[offset + 1], bytes[offset + 2]]), - 4 => u32::from_ne_bytes([ - bytes[offset], - bytes[offset + 1], - bytes[offset + 2], - bytes[offset + 3], - ]), - _ => unreachable!(), - }; - // TODO: char::from_u32_unchecked is stable from 1.5.0 - unsafe { std::mem::transmute(code) } + fn back_peek_char(&self) -> u32 { + self.state().string.back_peek(self.ctx().string_offset) } fn back_skip_char(&mut self, skip_count: usize) { self.ctx_mut().string_position -= skip_count; - for _ in 0..skip_count { - self.ctx_mut().string_offset = self.back_peek_offset(); - } + self.ctx_mut().string_offset = self + .state() + .string + .back_offset(self.ctx().string_offset, skip_count); } } @@ -529,22 +554,22 @@ impl OpcodeDispatcher { drive.skip_code(drive.peek_code(1) as usize + 1); }), SreOpcode::LITERAL => once(|drive| { - general_op_literal(drive, |code, c| code == c as u32); + general_op_literal(drive, |code, c| code == c); }), SreOpcode::NOT_LITERAL => once(|drive| { - general_op_literal(drive, |code, c| code != c as u32); + general_op_literal(drive, |code, c| code != c); }), SreOpcode::LITERAL_IGNORE => once(|drive| { - general_op_literal(drive, |code, c| code == lower_ascii(c) as u32); + general_op_literal(drive, |code, c| code == lower_ascii(c)); }), SreOpcode::NOT_LITERAL_IGNORE => once(|drive| { - general_op_literal(drive, |code, c| code != lower_ascii(c) as u32); + general_op_literal(drive, |code, c| code != lower_ascii(c)); }), SreOpcode::LITERAL_UNI_IGNORE => once(|drive| { - general_op_literal(drive, |code, c| code == lower_unicode(c) as u32); + general_op_literal(drive, |code, c| code == lower_unicode(c)); }), SreOpcode::NOT_LITERAL_UNI_IGNORE => once(|drive| { - general_op_literal(drive, |code, c| code != lower_unicode(c) as u32); + general_op_literal(drive, |code, c| code != lower_unicode(c)); }), SreOpcode::LITERAL_LOC_IGNORE => once(|drive| { general_op_literal(drive, char_loc_ignore); @@ -610,19 +635,11 @@ impl OpcodeDispatcher { } } -fn calc_string_offset(string: &str, position: usize) -> usize { - string - .char_indices() - .nth(position) - .map(|(i, _)| i) - .unwrap_or(0) -} - -fn char_loc_ignore(code: u32, c: char) -> bool { - code == c as u32 || code == lower_locate(c) as u32 || code == upper_locate(c) as u32 +fn char_loc_ignore(code: u32, c: u32) -> bool { + code == c || code == lower_locate(c) || code == upper_locate(c) } -fn charset_loc_ignore(set: &[u32], c: char) -> bool { +fn charset_loc_ignore(set: &[u32], c: u32) -> bool { let lo = lower_locate(c); if charset(set, c) { return true; @@ -631,7 +648,7 @@ fn charset_loc_ignore(set: &[u32], c: char) -> bool { up != lo && charset(set, up) } -fn general_op_groupref char>(drive: &mut StackDrive, mut f: F) { +fn general_op_groupref u32>(drive: &mut StackDrive, mut f: F) { let (group_start, group_end) = drive.state.get_marks(drive.peek_code(1) as usize); let (group_start, group_end) = match (group_start, group_end) { (Some(start), Some(end)) if start <= end => (start, end), @@ -645,7 +662,7 @@ fn general_op_groupref char>(drive: &mut StackDrive, mut f: F) MatchContext { string_position: group_start, // TODO: cache the offset - string_offset: calc_string_offset(drive.state.string, group_start), + string_offset: drive.state.string.offset(0, group_start), ..*drive.ctx() }, &drive, @@ -665,7 +682,7 @@ fn general_op_groupref char>(drive: &mut StackDrive, mut f: F) drive.ctx_mut().string_offset = offset; } -fn general_op_literal bool>(drive: &mut StackDrive, f: F) { +fn general_op_literal bool>(drive: &mut StackDrive, f: F) { if drive.at_end() || !f(drive.peek_code(1), drive.peek_char()) { drive.ctx_mut().has_matched = Some(false); } else { @@ -674,7 +691,7 @@ fn general_op_literal bool>(drive: &mut StackDrive, f: F } } -fn general_op_in bool>(drive: &mut StackDrive, f: F) { +fn general_op_in bool>(drive: &mut StackDrive, f: F) { let skip = drive.peek_code(1) as usize; if drive.at_end() || !f(&drive.pattern()[2..], drive.peek_char()) { drive.ctx_mut().has_matched = Some(false); @@ -700,7 +717,7 @@ fn at(drive: &StackDrive, atcode: SreAtCode) -> bool { } } -fn category(catcode: SreCatCode, c: char) -> bool { +fn category(catcode: SreCatCode, c: u32) -> bool { match catcode { SreCatCode::DIGIT => is_digit(c), SreCatCode::NOT_DIGIT => !is_digit(c), @@ -723,9 +740,8 @@ fn category(catcode: SreCatCode, c: char) -> bool { } } -fn charset(set: &[u32], c: char) -> bool { +fn charset(set: &[u32], ch: u32) -> bool { /* check if character is a member of the given set */ - let ch = c as u32; let mut ok = true; let mut i = 0; while i < set.len() { @@ -747,7 +763,7 @@ fn charset(set: &[u32], c: char) -> bool { break; } }; - if category(catcode, c) { + if category(catcode, ch) { return ok; } i += 2; @@ -801,7 +817,7 @@ fn charset(set: &[u32], c: char) -> bool { if set[i + 1] <= ch && ch <= set[i + 2] { return ok; } - let ch = upper_unicode(c) as u32; + let ch = upper_unicode(ch); if set[i + 1] <= ch && ch <= set[i + 2] { return ok; } @@ -898,86 +914,128 @@ fn _count(stack_drive: &StackDrive, maxcount: usize) -> usize { drive.ctx().string_position - drive.state().string_position } -fn general_count_literal bool>(drive: &mut WrapDrive, end: usize, mut f: F) { +fn general_count_literal bool>(drive: &mut WrapDrive, end: usize, mut f: F) { let ch = drive.peek_code(1); while !drive.ctx().string_position < end && f(ch, drive.peek_char()) { drive.skip_char(1); } } -fn eq_loc_ignore(code: u32, c: char) -> bool { - code == c as u32 || code == lower_locate(c) as u32 || code == upper_locate(c) as u32 +fn eq_loc_ignore(code: u32, ch: u32) -> bool { + code == ch || code == lower_locate(ch) || code == upper_locate(ch) } -fn is_word(c: char) -> bool { - c.is_ascii_alphanumeric() || c == '_' +fn is_word(ch: u32) -> bool { + ch == '_' as u32 + || u8::try_from(ch) + .map(|x| x.is_ascii_alphanumeric()) + .unwrap_or(false) } -fn is_space(c: char) -> bool { - c.is_ascii_whitespace() +fn is_space(ch: u32) -> bool { + u8::try_from(ch) + .map(is_py_ascii_whitespace) + .unwrap_or(false) } -fn is_digit(c: char) -> bool { - c.is_ascii_digit() +fn is_digit(ch: u32) -> bool { + u8::try_from(ch) + .map(|x| x.is_ascii_digit()) + .unwrap_or(false) } -fn is_loc_alnum(c: char) -> bool { +fn is_loc_alnum(ch: u32) -> bool { // TODO: check with cpython - c.is_alphanumeric() + u8::try_from(ch) + .map(|x| x.is_ascii_alphanumeric()) + .unwrap_or(false) } -fn is_loc_word(c: char) -> bool { - is_loc_alnum(c) || c == '_' +fn is_loc_word(ch: u32) -> bool { + ch == '_' as u32 || is_loc_alnum(ch) } -fn is_linebreak(c: char) -> bool { - c == '\n' +fn is_linebreak(ch: u32) -> bool { + ch == '\n' as u32 } -pub(crate) fn lower_ascii(c: char) -> char { - c.to_ascii_lowercase() +pub(crate) fn lower_ascii(ch: u32) -> u32 { + u8::try_from(ch) + .map(|x| x.to_ascii_lowercase() as u32) + .unwrap_or(ch) } -fn lower_locate(c: char) -> char { +fn lower_locate(ch: u32) -> u32 { // TODO: check with cpython // https://doc.rust-lang.org/std/primitive.char.html#method.to_lowercase - c.to_lowercase().next().unwrap() + lower_ascii(ch) } -fn upper_locate(c: char) -> char { +fn upper_locate(ch: u32) -> u32 { // TODO: check with cpython // https://doc.rust-lang.org/std/primitive.char.html#method.to_uppercase - c.to_uppercase().next().unwrap() + u8::try_from(ch) + .map(|x| x.to_ascii_uppercase() as u32) + .unwrap_or(ch) } -fn is_uni_digit(c: char) -> bool { +fn is_uni_digit(ch: u32) -> bool { // TODO: check with cpython - c.is_digit(10) + char::try_from(ch).map(|x| x.is_digit(10)).unwrap_or(false) } -fn is_uni_space(c: char) -> bool { +fn is_uni_space(ch: u32) -> bool { // TODO: check with cpython - c.is_whitespace() -} -fn is_uni_linebreak(c: char) -> bool { + is_space(ch) + || matches!( + ch, + 0x0009 + | 0x000A + | 0x000B + | 0x000C + | 0x000D + | 0x001C + | 0x001D + | 0x001E + | 0x001F + | 0x0020 + | 0x0085 + | 0x00A0 + | 0x1680 + | 0x2000 + | 0x2001 + | 0x2002 + | 0x2003 + | 0x2004 + | 0x2005 + | 0x2006 + | 0x2007 + | 0x2008 + | 0x2009 + | 0x200A + | 0x2028 + | 0x2029 + | 0x202F + | 0x205F + | 0x3000 + ) +} +fn is_uni_linebreak(ch: u32) -> bool { matches!( - c, - '\u{000A}' - | '\u{000B}' - | '\u{000C}' - | '\u{000D}' - | '\u{001C}' - | '\u{001D}' - | '\u{001E}' - | '\u{0085}' - | '\u{2028}' - | '\u{2029}' + ch, + 0x000A | 0x000B | 0x000C | 0x000D | 0x001C | 0x001D | 0x001E | 0x0085 | 0x2028 | 0x2029 ) } -fn is_uni_alnum(c: char) -> bool { +fn is_uni_alnum(ch: u32) -> bool { // TODO: check with cpython - c.is_alphanumeric() + char::try_from(ch) + .map(|x| x.is_alphanumeric()) + .unwrap_or(false) } -fn is_uni_word(c: char) -> bool { - is_uni_alnum(c) || c == '_' +fn is_uni_word(ch: u32) -> bool { + ch == '_' as u32 || is_uni_alnum(ch) } -pub(crate) fn lower_unicode(c: char) -> char { +pub(crate) fn lower_unicode(ch: u32) -> u32 { // TODO: check with cpython - c.to_lowercase().next().unwrap() + char::try_from(ch) + .map(|x| x.to_lowercase().next().unwrap() as u32) + .unwrap_or(ch) } -pub(crate) fn upper_unicode(c: char) -> char { +pub(crate) fn upper_unicode(ch: u32) -> u32 { // TODO: check with cpython - c.to_uppercase().next().unwrap() + char::try_from(ch) + .map(|x| x.to_uppercase().next().unwrap() as u32) + .unwrap_or(ch) } fn is_utf8_first_byte(b: u8) -> bool { @@ -988,6 +1046,23 @@ fn is_utf8_first_byte(b: u8) -> bool { (b & 0b10000000 == 0) || (b & 0b11000000 == 0b11000000) } +fn utf8_back_peek_offset(bytes: &[u8], offset: usize) -> usize { + let mut offset = offset - 1; + if !is_utf8_first_byte(bytes[offset]) { + offset -= 1; + if !is_utf8_first_byte(bytes[offset]) { + offset -= 1; + if !is_utf8_first_byte(bytes[offset]) { + offset -= 1; + if !is_utf8_first_byte(bytes[offset]) { + panic!("not utf-8 code point"); + } + } + } + } + offset +} + #[derive(Debug, Copy, Clone)] struct RepeatContext { count: isize, From 97000fc4e046c5f9d7da42357d8b3290e33972db Mon Sep 17 00:00:00 2001 From: Kangzhi Shi Date: Tue, 19 Jan 2021 17:14:00 +0200 Subject: [PATCH 026/960] fix multiple bugs; skip crash tests --- interp.rs | 220 ++++++++++++++++++++++-------------------------------- 1 file changed, 88 insertions(+), 132 deletions(-) diff --git a/interp.rs b/interp.rs index 7c9246142a..dfb2e04e47 100644 --- a/interp.rs +++ b/interp.rs @@ -131,18 +131,17 @@ impl<'a> State<'a> { pub fn search(mut self) -> Self { // TODO: optimize by op info and skip prefix - loop { + while self.start <= self.end { self = self.pymatch(); if self.has_matched == Some(true) { return self; } self.start += 1; - if self.start > self.end { - return self; - } self.reset(); } + + self } } @@ -182,16 +181,19 @@ impl<'a> StrDrive<'a> { let bytes = s.as_bytes(); let back_offset = utf8_back_peek_offset(bytes, offset); match offset - back_offset { - 1 => u32::from_ne_bytes([0, 0, 0, bytes[offset]]), - 2 => u32::from_ne_bytes([0, 0, bytes[offset], bytes[offset + 1]]), - 3 => { - u32::from_ne_bytes([0, bytes[offset], bytes[offset + 1], bytes[offset + 2]]) - } + 1 => u32::from_ne_bytes([0, 0, 0, bytes[offset - 1]]), + 2 => u32::from_ne_bytes([0, 0, bytes[offset - 2], bytes[offset - 1]]), + 3 => u32::from_ne_bytes([ + 0, + bytes[offset - 3], + bytes[offset - 2], + bytes[offset - 1], + ]), 4 => u32::from_ne_bytes([ - bytes[offset], - bytes[offset + 1], - bytes[offset + 2], - bytes[offset + 3], + bytes[offset - 4], + bytes[offset - 3], + bytes[offset - 2], + bytes[offset - 1], ]), _ => unreachable!(), } @@ -222,7 +224,10 @@ impl<'a> StrDrive<'a> { .skip(start) .collect::() .into_pyobject(vm), - StrDrive::Bytes(b) => PyBytes::from(b[start..end].to_vec()).into_pyobject(vm), + StrDrive::Bytes(b) => { + PyBytes::from(b.iter().take(end).skip(start).cloned().collect::>()) + .into_pyobject(vm) + } } } } @@ -256,13 +261,11 @@ trait MatchContextDrive { .state() .string .offset(self.ctx().string_offset, skip_count); - self.ctx_mut().string_position += skip_count; + self.ctx_mut().string_position = + std::cmp::min(self.ctx().string_position + skip_count, self.state().end); } fn skip_code(&mut self, skip_count: usize) { self.ctx_mut().code_position += skip_count; - if self.ctx().code_position > self.state().pattern_codes.len() { - self.ctx_mut().code_position = self.state().pattern_codes.len(); - } } fn remaining_chars(&self) -> usize { self.state().end - self.ctx().string_position @@ -314,9 +317,7 @@ impl<'a> StackDrive<'a> { self.state } fn push_new_context(&mut self, pattern_offset: usize) { - let mut child_ctx = MatchContext { ..*self.ctx() }; - child_ctx.code_position += pattern_offset; - self.state.context_stack.push(child_ctx); + self.push_new_context_at(self.ctx().code_position + pattern_offset); } fn push_new_context_at(&mut self, code_position: usize) { let mut child_ctx = MatchContext { ..*self.ctx() }; @@ -352,11 +353,9 @@ impl MatchContextDrive for WrapDrive<'_> { fn ctx_mut(&mut self) -> &mut MatchContext { &mut self.ctx } - fn ctx(&self) -> &MatchContext { &self.ctx } - fn state(&self) -> &State { self.stack_drive.state() } @@ -833,6 +832,7 @@ fn charset(set: &[u32], ch: u32) -> bool { false } +/* General case */ fn count(drive: &mut StackDrive, maxcount: usize) -> usize { let mut count = 0; let maxcount = std::cmp::min(maxcount, drive.remaining_chars()); @@ -854,6 +854,8 @@ fn count(drive: &mut StackDrive, maxcount: usize) -> usize { count } +/* TODO: check literal cases should improve the perfermance + fn _count(stack_drive: &StackDrive, maxcount: usize) -> usize { let mut drive = WrapDrive::drive(*stack_drive.ctx(), stack_drive); let maxcount = std::cmp::min(maxcount, drive.remaining_chars()); @@ -924,6 +926,7 @@ fn general_count_literal bool>(drive: &mut WrapDrive, end: fn eq_loc_ignore(code: u32, ch: u32) -> bool { code == ch || code == lower_locate(ch) || code == upper_locate(ch) } +*/ fn is_word(ch: u32) -> bool { ch == '_' as u32 @@ -1073,6 +1076,7 @@ struct RepeatContext { maxcount: usize, } +#[derive(Default)] struct OpMinRepeatOne { jump_id: usize, mincount: usize, @@ -1082,102 +1086,79 @@ struct OpMinRepeatOne { impl OpcodeExecutor for OpMinRepeatOne { fn next(&mut self, drive: &mut StackDrive) -> Option<()> { match self.jump_id { - 0 => self._0(drive), - 1 => self._1(drive), - 2 => self._2(drive), - _ => unreachable!(), - } - } -} -impl Default for OpMinRepeatOne { - fn default() -> Self { - OpMinRepeatOne { - jump_id: 0, - mincount: 0, - maxcount: 0, - count: 0, - } - } -} -impl OpMinRepeatOne { - fn _0(&mut self, drive: &mut StackDrive) -> Option<()> { - self.mincount = drive.peek_code(2) as usize; - self.maxcount = drive.peek_code(3) as usize; + 0 => { + self.mincount = drive.peek_code(2) as usize; + self.maxcount = drive.peek_code(3) as usize; - if drive.remaining_chars() < self.mincount { - drive.ctx_mut().has_matched = Some(false); - return None; - } + if drive.remaining_chars() < self.mincount { + drive.ctx_mut().has_matched = Some(false); + return None; + } - drive.state.string_position = drive.ctx().string_position; + drive.state.string_position = drive.ctx().string_position; - self.count = if self.mincount == 0 { - 0 - } else { - let count = count(drive, self.mincount); - if count < self.mincount { - drive.ctx_mut().has_matched = Some(false); - return None; - } - drive.skip_char(count); - count - }; + self.count = if self.mincount == 0 { + 0 + } else { + let count = count(drive, self.mincount); + if count < self.mincount { + drive.ctx_mut().has_matched = Some(false); + return None; + } + drive.skip_char(count); + count + }; - if drive.peek_code(drive.peek_code(1) as usize + 1) == SreOpcode::SUCCESS as u32 { - drive.state.string_position = drive.ctx().string_position; - drive.ctx_mut().has_matched = Some(true); - return None; - } + if drive.peek_code(drive.peek_code(1) as usize + 1) == SreOpcode::SUCCESS as u32 { + drive.state.string_position = drive.ctx().string_position; + drive.ctx_mut().has_matched = Some(true); + return None; + } - drive.state.marks_push(); - self.jump_id = 1; - self._1(drive) - } - fn _1(&mut self, drive: &mut StackDrive) -> Option<()> { - if self.maxcount == MAXREPEAT || self.count <= self.maxcount { - drive.state.string_position = drive.ctx().string_position; - drive.push_new_context(drive.peek_code(1) as usize + 1); - self.jump_id = 2; - return Some(()); - } + drive.state.marks_push(); + self.jump_id = 1; + self.next(drive) + } + 1 => { + if self.maxcount == MAXREPEAT || self.count <= self.maxcount { + drive.state.string_position = drive.ctx().string_position; + drive.push_new_context(drive.peek_code(1) as usize + 1); + self.jump_id = 2; + return Some(()); + } - drive.state.marks_pop_discard(); - drive.ctx_mut().has_matched = Some(false); - None - } - fn _2(&mut self, drive: &mut StackDrive) -> Option<()> { - let child_ctx = drive.state.popped_context.unwrap(); - if child_ctx.has_matched == Some(true) { - drive.ctx_mut().has_matched = Some(true); - return None; - } - drive.state.string_position = drive.ctx().string_position; - if count(drive, 1) == 0 { - drive.ctx_mut().has_matched = Some(false); - return None; + drive.state.marks_pop_discard(); + drive.ctx_mut().has_matched = Some(false); + None + } + 2 => { + let child_ctx = drive.state.popped_context.unwrap(); + if child_ctx.has_matched == Some(true) { + drive.ctx_mut().has_matched = Some(true); + return None; + } + drive.state.string_position = drive.ctx().string_position; + if count(drive, 1) == 0 { + drive.ctx_mut().has_matched = Some(false); + return None; + } + drive.skip_char(1); + self.count += 1; + drive.state.marks_pop_keep(); + self.jump_id = 1; + self.next(drive) + } + _ => unreachable!(), } - drive.skip_char(1); - self.count += 1; - drive.state.marks_pop_keep(); - self.jump_id = 1; - self._1(drive) } } +#[derive(Default)] struct OpMaxUntil { jump_id: usize, count: isize, save_last_position: usize, } -impl Default for OpMaxUntil { - fn default() -> Self { - OpMaxUntil { - jump_id: 0, - count: 0, - save_last_position: 0, - } - } -} impl OpcodeExecutor for OpMaxUntil { fn next(&mut self, drive: &mut StackDrive) -> Option<()> { match self.jump_id { @@ -1259,20 +1240,12 @@ impl OpcodeExecutor for OpMaxUntil { } } +#[derive(Default)] struct OpMinUntil { jump_id: usize, count: isize, save_repeat: Option, } -impl Default for OpMinUntil { - fn default() -> Self { - Self { - jump_id: 0, - count: 0, - save_repeat: None, - } - } -} impl OpcodeExecutor for OpMinUntil { fn next(&mut self, drive: &mut StackDrive) -> Option<()> { match self.jump_id { @@ -1338,18 +1311,11 @@ impl OpcodeExecutor for OpMinUntil { } } +#[derive(Default)] struct OpBranch { jump_id: usize, current_branch_length: usize, } -impl Default for OpBranch { - fn default() -> Self { - Self { - jump_id: 0, - current_branch_length: 0, - } - } -} impl OpcodeExecutor for OpBranch { fn next(&mut self, drive: &mut StackDrive) -> Option<()> { match self.jump_id { @@ -1388,22 +1354,13 @@ impl OpcodeExecutor for OpBranch { } } +#[derive(Default)] struct OpRepeatOne { jump_id: usize, mincount: usize, maxcount: usize, count: isize, } -impl Default for OpRepeatOne { - fn default() -> Self { - Self { - jump_id: 0, - mincount: 0, - maxcount: 0, - count: 0, - } - } -} impl OpcodeExecutor for OpRepeatOne { fn next(&mut self, drive: &mut StackDrive) -> Option<()> { match self.jump_id { @@ -1413,7 +1370,6 @@ impl OpcodeExecutor for OpRepeatOne { if drive.remaining_chars() < self.mincount { drive.ctx_mut().has_matched = Some(false); - return None; } drive.state.string_position = drive.ctx().string_position; self.count = count(drive, self.maxcount) as isize; From 84113cba2cd66b83106f4564b03ae8cb999197c3 Mon Sep 17 00:00:00 2001 From: Kangzhi Shi Date: Wed, 20 Jan 2021 16:33:30 +0200 Subject: [PATCH 027/960] fix zero width repeat --- interp.rs | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/interp.rs b/interp.rs index dfb2e04e47..8b5a023ffd 100644 --- a/interp.rs +++ b/interp.rs @@ -1245,6 +1245,7 @@ struct OpMinUntil { jump_id: usize, count: isize, save_repeat: Option, + save_last_position: usize, } impl OpcodeExecutor for OpMinUntil { fn next(&mut self, drive: &mut StackDrive) -> Option<()> { @@ -1280,6 +1281,7 @@ impl OpcodeExecutor for OpMinUntil { drive.ctx_mut().has_matched = child_ctx.has_matched; if drive.ctx().has_matched != Some(true) { drive.repeat_ctx_mut().count = self.count - 1; + drive.repeat_ctx_mut().last_position = self.save_last_position; drive.state.string_position = drive.ctx().string_position; } None @@ -1295,13 +1297,26 @@ impl OpcodeExecutor for OpMinUntil { drive.state.marks_pop(); // match more unital tail matches - let maxcount = drive.repeat_ctx().maxcount; - let code_position = drive.repeat_ctx().code_position; - if self.count as usize >= maxcount && maxcount != MAXREPEAT { + let RepeatContext { + count: _, + code_position, + last_position, + mincount: _, + maxcount, + } = *drive.repeat_ctx(); + + if self.count as usize >= maxcount && maxcount != MAXREPEAT + || drive.state.string_position == last_position + { drive.ctx_mut().has_matched = Some(false); return None; } drive.repeat_ctx_mut().count = self.count; + + /* zero-width match protection */ + self.save_last_position = last_position; + drive.repeat_ctx_mut().last_position = drive.state.string_position; + drive.push_new_context_at(code_position + 4); self.jump_id = 1; Some(()) From f2311b56fcbc87431eb0e291b68f44bb3658c2c1 Mon Sep 17 00:00:00 2001 From: Kangzhi Shi Date: Thu, 21 Jan 2021 12:48:32 +0200 Subject: [PATCH 028/960] fix op branch --- interp.rs | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/interp.rs b/interp.rs index 8b5a023ffd..5fa7ca8c0e 100644 --- a/interp.rs +++ b/interp.rs @@ -1329,28 +1329,30 @@ impl OpcodeExecutor for OpMinUntil { #[derive(Default)] struct OpBranch { jump_id: usize, - current_branch_length: usize, + branch_offset: usize, } impl OpcodeExecutor for OpBranch { + // alternation + // <0=skip> code ... fn next(&mut self, drive: &mut StackDrive) -> Option<()> { match self.jump_id { 0 => { drive.state.marks_push(); // jump out the head - self.current_branch_length = 1; + self.branch_offset = 1; self.jump_id = 1; self.next(drive) } 1 => { - drive.skip_code(self.current_branch_length); - self.current_branch_length = drive.peek_code(0) as usize; - if self.current_branch_length == 0 { + let next_branch_length = drive.peek_code(self.branch_offset) as usize; + if next_branch_length == 0 { drive.state.marks_pop_discard(); drive.ctx_mut().has_matched = Some(false); return None; } drive.state.string_position = drive.ctx().string_position; - drive.push_new_context(1); + drive.push_new_context(self.branch_offset + 1); + self.branch_offset += next_branch_length; self.jump_id = 2; Some(()) } From 6a792324b027bea56c39cb04fe072f719fd1efed Mon Sep 17 00:00:00 2001 From: Kangzhi Shi Date: Thu, 21 Jan 2021 16:14:23 +0200 Subject: [PATCH 029/960] fix at_beginning --- interp.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/interp.rs b/interp.rs index 5fa7ca8c0e..11010c7152 100644 --- a/interp.rs +++ b/interp.rs @@ -274,7 +274,8 @@ trait MatchContextDrive { self.state().pattern_codes.len() - self.ctx().code_position } fn at_beginning(&self) -> bool { - self.ctx().string_position == self.state().start + // self.ctx().string_position == self.state().start + self.ctx().string_position == 0 } fn at_end(&self) -> bool { self.ctx().string_position == self.state().end From 7f0dad7901751abcc09ac8308742c91627401c81 Mon Sep 17 00:00:00 2001 From: Kangzhi Shi Date: Thu, 21 Jan 2021 19:24:10 +0200 Subject: [PATCH 030/960] fix back_peek_char --- interp.rs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/interp.rs b/interp.rs index 11010c7152..beb96f1f09 100644 --- a/interp.rs +++ b/interp.rs @@ -181,15 +181,15 @@ impl<'a> StrDrive<'a> { let bytes = s.as_bytes(); let back_offset = utf8_back_peek_offset(bytes, offset); match offset - back_offset { - 1 => u32::from_ne_bytes([0, 0, 0, bytes[offset - 1]]), - 2 => u32::from_ne_bytes([0, 0, bytes[offset - 2], bytes[offset - 1]]), - 3 => u32::from_ne_bytes([ + 1 => u32::from_be_bytes([0, 0, 0, bytes[offset - 1]]), + 2 => u32::from_be_bytes([0, 0, bytes[offset - 2], bytes[offset - 1]]), + 3 => u32::from_be_bytes([ 0, bytes[offset - 3], bytes[offset - 2], bytes[offset - 1], ]), - 4 => u32::from_ne_bytes([ + 4 => u32::from_be_bytes([ bytes[offset - 4], bytes[offset - 3], bytes[offset - 2], From 4416158eceb9dd6931ca25cfc610f50582dd83dc Mon Sep 17 00:00:00 2001 From: Kangzhi Shi Date: Fri, 22 Jan 2021 16:40:33 +0200 Subject: [PATCH 031/960] fix multiple bugs; pass tests --- interp.rs | 97 +++++++++++++++++++++++++++++++++++++++---------------- 1 file changed, 70 insertions(+), 27 deletions(-) diff --git a/interp.rs b/interp.rs index beb96f1f09..71e3ead143 100644 --- a/interp.rs +++ b/interp.rs @@ -25,6 +25,7 @@ pub(crate) struct State<'a> { pub string_position: usize, popped_context: Option, pub has_matched: Option, + pub match_all: bool, } impl<'a> State<'a> { @@ -51,6 +52,7 @@ impl<'a> State<'a> { string_position: start, popped_context: None, has_matched: None, + match_all: false, } } @@ -105,6 +107,7 @@ impl<'a> State<'a> { string_offset: self.string.offset(0, self.start), code_position: 0, has_matched: None, + toplevel: true, }; self.context_stack.push(ctx); @@ -132,6 +135,7 @@ impl<'a> State<'a> { pub fn search(mut self) -> Self { // TODO: optimize by op info and skip prefix while self.start <= self.end { + self.match_all = false; self = self.pymatch(); if self.has_matched == Some(true) { @@ -232,12 +236,13 @@ impl<'a> StrDrive<'a> { } } -#[derive(Debug, Copy, Clone)] +#[derive(Debug, Clone, Copy)] struct MatchContext { string_position: usize, string_offset: usize, code_position: usize, has_matched: Option, + toplevel: bool, } trait MatchContextDrive { @@ -463,8 +468,12 @@ impl OpcodeDispatcher { drive.ctx_mut().has_matched = Some(false); }), SreOpcode::SUCCESS => once(|drive| { - drive.state.string_position = drive.ctx().string_position; - drive.ctx_mut().has_matched = Some(true); + if drive.ctx().toplevel && drive.state.match_all && !drive.at_end() { + drive.ctx_mut().has_matched = Some(false); + } else { + drive.state.string_position = drive.ctx().string_position; + drive.ctx_mut().has_matched = Some(true); + } }), SreOpcode::ANY => once(|drive| { if drive.at_end() || drive.at_linebreak() { @@ -482,15 +491,19 @@ impl OpcodeDispatcher { drive.skip_char(1); } }), + /* assert subpattern */ + /* */ SreOpcode::ASSERT => twice( |drive| { let back = drive.peek_code(2) as usize; - if back > drive.ctx().string_position { + let passed = drive.ctx().string_position - drive.state.start; + if passed < back { drive.ctx_mut().has_matched = Some(false); return; } drive.state.string_position = drive.ctx().string_position - back; drive.push_new_context(3); + drive.state.context_stack.last_mut().unwrap().toplevel = false; }, |drive| { let child_ctx = drive.state.popped_context.unwrap(); @@ -504,12 +517,14 @@ impl OpcodeDispatcher { SreOpcode::ASSERT_NOT => twice( |drive| { let back = drive.peek_code(2) as usize; - if back > drive.ctx().string_position { + let passed = drive.ctx().string_position - drive.state.start; + if passed < back { drive.skip_code(drive.peek_code(1) as usize + 1); return; } drive.state.string_position = drive.ctx().string_position - back; drive.push_new_context(3); + drive.state.context_stack.last_mut().unwrap().toplevel = false; }, |drive| { let child_ctx = drive.state.popped_context.unwrap(); @@ -770,17 +785,17 @@ fn charset(set: &[u32], ch: u32) -> bool { } SreOpcode::CHARSET => { /* */ - let set = &set[1..]; + let set = &set[i + 1..]; if ch < 256 && ((set[(ch >> 5) as usize] & (1u32 << (ch & 31))) != 0) { return ok; } - i += 8; + i += 1 + 8; } SreOpcode::BIGCHARSET => { /* <256 blockindices> */ let count = set[i + 1] as usize; if ch < 0x10000 { - let set = &set[2..]; + let set = &set[i + 2..]; let block_index = ch >> 8; let (_, blockindices, _) = unsafe { set.align_to::() }; let blocks = &set[64..]; @@ -1085,6 +1100,7 @@ struct OpMinRepeatOne { count: usize, } impl OpcodeExecutor for OpMinRepeatOne { + /* <1=min> <2=max> item tail */ fn next(&mut self, drive: &mut StackDrive) -> Option<()> { match self.jump_id { 0 => { @@ -1110,7 +1126,11 @@ impl OpcodeExecutor for OpMinRepeatOne { count }; - if drive.peek_code(drive.peek_code(1) as usize + 1) == SreOpcode::SUCCESS as u32 { + let next_code = drive.peek_code(drive.peek_code(1) as usize + 1); + if next_code == SreOpcode::SUCCESS as u32 + && !(drive.ctx().toplevel && drive.state.match_all && !drive.at_end()) + { + // tail is empty. we're finished drive.state.string_position = drive.ctx().string_position; drive.ctx_mut().has_matched = Some(true); return None; @@ -1377,9 +1397,18 @@ struct OpRepeatOne { jump_id: usize, mincount: usize, maxcount: usize, - count: isize, + count: usize, + following_literal: Option, } impl OpcodeExecutor for OpRepeatOne { + /* match repeated sequence (maximizing regexp) */ + + /* this operator only works if the repeated item is + exactly one character wide, and we're not already + collecting backtracking points. for other cases, + use the MAX_REPEAT operator */ + + /* <1=min> <2=max> item tail */ fn next(&mut self, drive: &mut StackDrive) -> Option<()> { match self.jump_id { 0 => { @@ -1388,17 +1417,21 @@ impl OpcodeExecutor for OpRepeatOne { if drive.remaining_chars() < self.mincount { drive.ctx_mut().has_matched = Some(false); + return None; } + drive.state.string_position = drive.ctx().string_position; - self.count = count(drive, self.maxcount) as isize; - drive.skip_char(self.count as usize); - if self.count < self.mincount as isize { + + self.count = count(drive, self.maxcount); + drive.skip_char(self.count); + if self.count < self.mincount { drive.ctx_mut().has_matched = Some(false); return None; } let next_code = drive.peek_code(drive.peek_code(1) as usize + 1); - if next_code == SreOpcode::SUCCESS as u32 { + if next_code == SreOpcode::SUCCESS as u32 && drive.at_end() && !drive.ctx().toplevel + { // tail is empty. we're finished drive.state.string_position = drive.ctx().string_position; drive.ctx_mut().has_matched = Some(true); @@ -1406,24 +1439,34 @@ impl OpcodeExecutor for OpRepeatOne { } drive.state.marks_push(); - // TODO: + // Special case: Tail starts with a literal. Skip positions where // the rest of the pattern cannot possibly match. + if next_code == SreOpcode::LITERAL as u32 { + self.following_literal = Some(drive.peek_code(drive.peek_code(1) as usize + 2)) + } + self.jump_id = 1; self.next(drive) } 1 => { - // General case: backtracking - if self.count >= self.mincount as isize { - drive.state.string_position = drive.ctx().string_position; - drive.push_new_context(drive.peek_code(1) as usize + 1); - self.jump_id = 2; - return Some(()); + if let Some(c) = self.following_literal { + while drive.at_end() || drive.peek_char() != c { + if self.count <= self.mincount { + drive.state.marks_pop_discard(); + drive.ctx_mut().has_matched = Some(false); + return None; + } + drive.back_skip_char(1); + self.count -= 1; + } } - drive.state.marks_pop_discard(); - drive.ctx_mut().has_matched = Some(false); - None + // General case: backtracking + drive.state.string_position = drive.ctx().string_position; + drive.push_new_context(drive.peek_code(1) as usize + 1); + self.jump_id = 2; + Some(()) } 2 => { let child_ctx = drive.state.popped_context.unwrap(); @@ -1431,19 +1474,19 @@ impl OpcodeExecutor for OpRepeatOne { drive.ctx_mut().has_matched = Some(true); return None; } - if self.count <= self.mincount as isize { + if self.count <= self.mincount { drive.state.marks_pop_discard(); drive.ctx_mut().has_matched = Some(false); return None; } - // TODO: unnesscary double check drive.back_skip_char(1); self.count -= 1; + drive.state.marks_pop_keep(); self.jump_id = 1; - Some(()) + self.next(drive) } _ => unreachable!(), } From db5bd646b928048c7183f46ede40a114f19a82e4 Mon Sep 17 00:00:00 2001 From: Noah <33094578+coolreader18@users.noreply.github.com> Date: Wed, 27 Jan 2021 18:54:38 -0600 Subject: [PATCH 032/960] Initial commit to switch to new repo --- .gitignore | 2 ++ Cargo.toml | 7 +++++++ constants.rs => src/constants.rs | 0 interp.rs => src/engine.rs | 0 src/lib.rs | 2 ++ 5 files changed, 11 insertions(+) create mode 100644 .gitignore create mode 100644 Cargo.toml rename constants.rs => src/constants.rs (100%) rename interp.rs => src/engine.rs (100%) create mode 100644 src/lib.rs diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000000..96ef6c0b94 --- /dev/null +++ b/.gitignore @@ -0,0 +1,2 @@ +/target +Cargo.lock diff --git a/Cargo.toml b/Cargo.toml new file mode 100644 index 0000000000..f9f504966b --- /dev/null +++ b/Cargo.toml @@ -0,0 +1,7 @@ +[package] +name = "sre-engine" +version = "0.1.0" +authors = ["Kangzhi Shi ", "RustPython Team"] +edition = "2018" + +[dependencies] diff --git a/constants.rs b/src/constants.rs similarity index 100% rename from constants.rs rename to src/constants.rs diff --git a/interp.rs b/src/engine.rs similarity index 100% rename from interp.rs rename to src/engine.rs diff --git a/src/lib.rs b/src/lib.rs new file mode 100644 index 0000000000..f305aa094a --- /dev/null +++ b/src/lib.rs @@ -0,0 +1,2 @@ +pub mod constants; +pub mod engine; From 9c95994dab20fea56004940ba50323ee7e327f81 Mon Sep 17 00:00:00 2001 From: Noah <33094578+coolreader18@users.noreply.github.com> Date: Wed, 27 Jan 2021 19:18:42 -0600 Subject: [PATCH 033/960] Modify to work outside of rustpython-vm --- Cargo.toml | 2 ++ src/engine.rs | 38 +++++++++++--------------------------- src/lib.rs | 12 ++++++++++++ 3 files changed, 25 insertions(+), 27 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index f9f504966b..3a82ba73f1 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -5,3 +5,5 @@ authors = ["Kangzhi Shi ", "RustPython Team"] edition = "2018" [dependencies] +num_enum = "0.5" +bitflags = "1.2" diff --git a/src/engine.rs b/src/engine.rs index 71e3ead143..9aff2dcc91 100644 --- a/src/engine.rs +++ b/src/engine.rs @@ -1,17 +1,16 @@ // good luck to those that follow; here be dragons -use super::_sre::MAXREPEAT; use super::constants::{SreAtCode, SreCatCode, SreFlag, SreOpcode}; -use crate::builtins::PyBytes; -use crate::bytesinner::is_py_ascii_whitespace; -use crate::pyobject::{IntoPyObject, PyObjectRef}; -use crate::VirtualMachine; +use super::MAXREPEAT; use std::collections::HashMap; use std::convert::TryFrom; -use std::unreachable; + +const fn is_py_ascii_whitespace(b: u8) -> bool { + matches!(b, b'\t' | b'\n' | b'\x0C' | b'\r' | b' ' | b'\x0B') +} #[derive(Debug)] -pub(crate) struct State<'a> { +pub struct State<'a> { pub string: StrDrive<'a>, pub start: usize, pub end: usize, @@ -29,7 +28,7 @@ pub(crate) struct State<'a> { } impl<'a> State<'a> { - pub(crate) fn new( + pub fn new( string: StrDrive<'a>, start: usize, end: usize, @@ -150,7 +149,7 @@ impl<'a> State<'a> { } #[derive(Debug, Clone, Copy)] -pub(crate) enum StrDrive<'a> { +pub enum StrDrive<'a> { Str(&'a str), Bytes(&'a [u8]), } @@ -219,21 +218,6 @@ impl<'a> StrDrive<'a> { StrDrive::Bytes(_) => offset - skip, } } - - pub fn slice_to_pyobject(&self, start: usize, end: usize, vm: &VirtualMachine) -> PyObjectRef { - match *self { - StrDrive::Str(s) => s - .chars() - .take(end) - .skip(start) - .collect::() - .into_pyobject(vm), - StrDrive::Bytes(b) => { - PyBytes::from(b.iter().take(end).skip(start).cloned().collect::>()) - .into_pyobject(vm) - } - } - } } #[derive(Debug, Clone, Copy)] @@ -972,7 +956,7 @@ fn is_loc_word(ch: u32) -> bool { fn is_linebreak(ch: u32) -> bool { ch == '\n' as u32 } -pub(crate) fn lower_ascii(ch: u32) -> u32 { +pub fn lower_ascii(ch: u32) -> u32 { u8::try_from(ch) .map(|x| x.to_ascii_lowercase() as u32) .unwrap_or(ch) @@ -1044,13 +1028,13 @@ fn is_uni_alnum(ch: u32) -> bool { fn is_uni_word(ch: u32) -> bool { ch == '_' as u32 || is_uni_alnum(ch) } -pub(crate) fn lower_unicode(ch: u32) -> u32 { +pub fn lower_unicode(ch: u32) -> u32 { // TODO: check with cpython char::try_from(ch) .map(|x| x.to_lowercase().next().unwrap() as u32) .unwrap_or(ch) } -pub(crate) fn upper_unicode(ch: u32) -> u32 { +pub fn upper_unicode(ch: u32) -> u32 { // TODO: check with cpython char::try_from(ch) .map(|x| x.to_uppercase().next().unwrap() as u32) diff --git a/src/lib.rs b/src/lib.rs index f305aa094a..4a3ed1b754 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,2 +1,14 @@ pub mod constants; pub mod engine; + +pub const CODESIZE: usize = 4; + +#[cfg(target_pointer_width = "32")] +pub const MAXREPEAT: usize = usize::MAX; +#[cfg(target_pointer_width = "64")] +pub const MAXREPEAT: usize = u32::MAX as usize; + +#[cfg(target_pointer_width = "32")] +pub const MAXGROUPS: usize = MAXREPEAT / 4 / 2; +#[cfg(target_pointer_width = "64")] +pub const MAXGROUPS: usize = MAXREPEAT / 2; From 2592067f95f530850db8451fd45982dc3bf161e0 Mon Sep 17 00:00:00 2001 From: Noah <33094578+coolreader18@users.noreply.github.com> Date: Wed, 27 Jan 2021 19:19:31 -0600 Subject: [PATCH 034/960] Add LICENSE --- Cargo.toml | 1 + LICENSE | 21 +++++++++++++++++++++ 2 files changed, 22 insertions(+) create mode 100644 LICENSE diff --git a/Cargo.toml b/Cargo.toml index 3a82ba73f1..03db7aba4f 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -2,6 +2,7 @@ name = "sre-engine" version = "0.1.0" authors = ["Kangzhi Shi ", "RustPython Team"] +license = "MIT" edition = "2018" [dependencies] diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000000..7213274e0f --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2020 RustPython Team + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. From 2473c3e49f6ba539549a8bf4f87db9146044fc52 Mon Sep 17 00:00:00 2001 From: Kangzhi Shi Date: Mon, 1 Feb 2021 20:25:45 +0200 Subject: [PATCH 035/960] add tests; fix OpAssert panic --- src/engine.rs | 15 ++++++++------- src/lib.rs | 2 ++ src/tests.rs | 19 +++++++++++++++++++ 3 files changed, 29 insertions(+), 7 deletions(-) create mode 100644 src/tests.rs diff --git a/src/engine.rs b/src/engine.rs index 9aff2dcc91..f6a6092f9b 100644 --- a/src/engine.rs +++ b/src/engine.rs @@ -369,20 +369,18 @@ fn once(f: F) -> Box> { Box::new(OpOnce { f: Some(f) }) } -// F1 F2 are same identical, but workaround for closure struct OpTwice { f1: Option, f2: Option, } impl OpcodeExecutor for OpTwice where - F1: FnOnce(&mut StackDrive), + F1: FnOnce(&mut StackDrive) -> Option<()>, F2: FnOnce(&mut StackDrive), { fn next(&mut self, drive: &mut StackDrive) -> Option<()> { if let Some(f1) = self.f1.take() { - f1(drive); - Some(()) + f1(drive) } else if let Some(f2) = self.f2.take() { f2(drive); None @@ -393,7 +391,7 @@ where } fn twice(f1: F1, f2: F2) -> Box> where - F1: FnOnce(&mut StackDrive), + F1: FnOnce(&mut StackDrive) -> Option<()>, F2: FnOnce(&mut StackDrive), { Box::new(OpTwice { @@ -483,11 +481,12 @@ impl OpcodeDispatcher { let passed = drive.ctx().string_position - drive.state.start; if passed < back { drive.ctx_mut().has_matched = Some(false); - return; + return None; } drive.state.string_position = drive.ctx().string_position - back; drive.push_new_context(3); drive.state.context_stack.last_mut().unwrap().toplevel = false; + Some(()) }, |drive| { let child_ctx = drive.state.popped_context.unwrap(); @@ -504,11 +503,12 @@ impl OpcodeDispatcher { let passed = drive.ctx().string_position - drive.state.start; if passed < back { drive.skip_code(drive.peek_code(1) as usize + 1); - return; + return None; } drive.state.string_position = drive.ctx().string_position - back; drive.push_new_context(3); drive.state.context_stack.last_mut().unwrap().toplevel = false; + Some(()) }, |drive| { let child_ctx = drive.state.popped_context.unwrap(); @@ -598,6 +598,7 @@ impl OpcodeDispatcher { drive.state.string_position = drive.ctx().string_position; // execute UNTIL operator drive.push_new_context(drive.peek_code(1) as usize + 1); + Some(()) }, |drive| { drive.state.repeat_stack.pop(); diff --git a/src/lib.rs b/src/lib.rs index 4a3ed1b754..eae3be617d 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,5 +1,7 @@ pub mod constants; pub mod engine; +#[cfg(test)] +mod tests; pub const CODESIZE: usize = 4; diff --git a/src/tests.rs b/src/tests.rs new file mode 100644 index 0000000000..95922e88a7 --- /dev/null +++ b/src/tests.rs @@ -0,0 +1,19 @@ +use engine::{State, StrDrive}; + +use super::*; + +#[test] +fn test_2427() { + let str_drive = StrDrive::Str("x"); + // r'(? = vec![15, 4, 0, 1, 1, 5, 5, 1, 17, 46, 1, 17, 120, 6, 10, 1]; + let mut state = State::new( + str_drive, + 0, + std::usize::MAX, + constants::SreFlag::UNICODE, + &code, + ); + state = state.pymatch(); + assert!(state.has_matched == Some(true)); +} From cc4441b50ffa361c8f52eeecad54a9459c471e9b Mon Sep 17 00:00:00 2001 From: Noah <33094578+coolreader18@users.noreply.github.com> Date: Mon, 1 Feb 2021 14:27:01 -0600 Subject: [PATCH 036/960] Compile regex patterns for tests with a script --- generate_tests.py | 37 +++++++++++++++++++++++++++++++++++++ src/engine.rs | 12 ++++++++++++ src/lib.rs | 2 -- src/tests.rs | 19 ------------------- tests/lookbehind.py | 1 + tests/lookbehind.re | 2 ++ tests/tests.rs | 26 ++++++++++++++++++++++++++ 7 files changed, 78 insertions(+), 21 deletions(-) create mode 100644 generate_tests.py delete mode 100644 src/tests.rs create mode 100644 tests/lookbehind.py create mode 100644 tests/lookbehind.re create mode 100644 tests/tests.rs diff --git a/generate_tests.py b/generate_tests.py new file mode 100644 index 0000000000..49a24792be --- /dev/null +++ b/generate_tests.py @@ -0,0 +1,37 @@ +import os +from pathlib import Path +import re +import sre_constants +import sre_compile +import sre_parse +import json + +m = re.search(r"const SRE_MAGIC: usize = (\d+);", open("src/constants.rs").read()) +sre_engine_magic = int(m.group(1)) +del m + +assert sre_constants.MAGIC == sre_engine_magic + +class CompiledPattern: + @classmethod + def compile(cls, pattern, flags=0): + p = sre_parse.parse(pattern) + code = sre_compile._code(p, flags) + self = cls() + self.pattern = pattern + self.code = code + self.flags = re.RegexFlag(flags | p.state.flags) + return self + +for k, v in re.RegexFlag.__members__.items(): + setattr(CompiledPattern, k, v) + +with os.scandir("tests") as d: + for f in d: + path = Path(f.path) + if path.suffix == ".py": + pattern = eval(path.read_text(), {"re": CompiledPattern}) + path.with_suffix(".re").write_text( + f"// {pattern.pattern!r}, flags={pattern.flags!r}\n" + f"Pattern {{ code: &{json.dumps(pattern.code)}, flags: SreFlag::from_bits_truncate({int(pattern.flags)}) }}" + ) diff --git a/src/engine.rs b/src/engine.rs index f6a6092f9b..a48b799e1b 100644 --- a/src/engine.rs +++ b/src/engine.rs @@ -153,6 +153,18 @@ pub enum StrDrive<'a> { Str(&'a str), Bytes(&'a [u8]), } + +impl<'a> From<&'a str> for StrDrive<'a> { + fn from(s: &'a str) -> Self { + Self::Str(s) + } +} +impl<'a> From<&'a [u8]> for StrDrive<'a> { + fn from(b: &'a [u8]) -> Self { + Self::Bytes(b) + } +} + impl<'a> StrDrive<'a> { fn offset(&self, offset: usize, skip: usize) -> usize { match *self { diff --git a/src/lib.rs b/src/lib.rs index eae3be617d..4a3ed1b754 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,7 +1,5 @@ pub mod constants; pub mod engine; -#[cfg(test)] -mod tests; pub const CODESIZE: usize = 4; diff --git a/src/tests.rs b/src/tests.rs deleted file mode 100644 index 95922e88a7..0000000000 --- a/src/tests.rs +++ /dev/null @@ -1,19 +0,0 @@ -use engine::{State, StrDrive}; - -use super::*; - -#[test] -fn test_2427() { - let str_drive = StrDrive::Str("x"); - // r'(? = vec![15, 4, 0, 1, 1, 5, 5, 1, 17, 46, 1, 17, 120, 6, 10, 1]; - let mut state = State::new( - str_drive, - 0, - std::usize::MAX, - constants::SreFlag::UNICODE, - &code, - ); - state = state.pymatch(); - assert!(state.has_matched == Some(true)); -} diff --git a/tests/lookbehind.py b/tests/lookbehind.py new file mode 100644 index 0000000000..3da6425959 --- /dev/null +++ b/tests/lookbehind.py @@ -0,0 +1 @@ +re.compile(r'(?( + &self, + string: impl Into>, + range: std::ops::Range, + ) -> engine::State<'a> { + engine::State::new(string.into(), range.start, range.end, self.flags, self.code) + } +} + +#[test] +fn test_2427() { + // r'(? Date: Wed, 3 Feb 2021 13:32:48 +0200 Subject: [PATCH 037/960] fix OpAssert positive lookbehind --- src/engine.rs | 28 ++++++++++++++++++++++++---- tests/positive_lookbehind.py | 1 + tests/positive_lookbehind.re | 2 ++ tests/tests.rs | 9 +++++++++ 4 files changed, 36 insertions(+), 4 deletions(-) create mode 100644 tests/positive_lookbehind.py create mode 100644 tests/positive_lookbehind.re diff --git a/src/engine.rs b/src/engine.rs index a48b799e1b..5e0e0f4208 100644 --- a/src/engine.rs +++ b/src/engine.rs @@ -490,14 +490,24 @@ impl OpcodeDispatcher { SreOpcode::ASSERT => twice( |drive| { let back = drive.peek_code(2) as usize; - let passed = drive.ctx().string_position - drive.state.start; + let passed = drive.ctx().string_position; if passed < back { drive.ctx_mut().has_matched = Some(false); return None; } + let back_offset = drive + .state + .string + .back_offset(drive.ctx().string_offset, back); + drive.state.string_position = drive.ctx().string_position - back; + drive.push_new_context(3); - drive.state.context_stack.last_mut().unwrap().toplevel = false; + let child_ctx = drive.state.context_stack.last_mut().unwrap(); + child_ctx.toplevel = false; + child_ctx.string_position -= back; + child_ctx.string_offset = back_offset; + Some(()) }, |drive| { @@ -512,14 +522,24 @@ impl OpcodeDispatcher { SreOpcode::ASSERT_NOT => twice( |drive| { let back = drive.peek_code(2) as usize; - let passed = drive.ctx().string_position - drive.state.start; + let passed = drive.ctx().string_position; if passed < back { drive.skip_code(drive.peek_code(1) as usize + 1); return None; } + let back_offset = drive + .state + .string + .back_offset(drive.ctx().string_offset, back); + drive.state.string_position = drive.ctx().string_position - back; + drive.push_new_context(3); - drive.state.context_stack.last_mut().unwrap().toplevel = false; + let child_ctx = drive.state.context_stack.last_mut().unwrap(); + child_ctx.toplevel = false; + child_ctx.string_position -= back; + child_ctx.string_offset = back_offset; + Some(()) }, |drive| { diff --git a/tests/positive_lookbehind.py b/tests/positive_lookbehind.py new file mode 100644 index 0000000000..2a0ab29253 --- /dev/null +++ b/tests/positive_lookbehind.py @@ -0,0 +1 @@ +re.compile(r'(?<=abc)def') \ No newline at end of file diff --git a/tests/positive_lookbehind.re b/tests/positive_lookbehind.re new file mode 100644 index 0000000000..68923b58ee --- /dev/null +++ b/tests/positive_lookbehind.re @@ -0,0 +1,2 @@ +// '(?<=abc)def', flags=re.UNICODE +Pattern { code: &[15, 4, 0, 3, 3, 4, 9, 3, 17, 97, 17, 98, 17, 99, 1, 17, 100, 17, 101, 17, 102, 1], flags: SreFlag::from_bits_truncate(32) } \ No newline at end of file diff --git a/tests/tests.rs b/tests/tests.rs index 4db110177d..8a18b5f333 100644 --- a/tests/tests.rs +++ b/tests/tests.rs @@ -24,3 +24,12 @@ fn test_2427() { state = state.pymatch(); assert!(state.has_matched == Some(true)); } + +#[test] +fn test_assert() { + // '(?<=abc)def', flags=re.UNICODE + let pattern = include!("positive_lookbehind.re"); + let mut state = pattern.state("abcdef", 0..usize::MAX); + state = state.search(); + assert!(state.has_matched == Some(true)); +} From 2a43d66e11a4a2245c12b24484b14eb01d7033a1 Mon Sep 17 00:00:00 2001 From: Noah <33094578+coolreader18@users.noreply.github.com> Date: Thu, 1 Apr 2021 21:38:14 -0500 Subject: [PATCH 038/960] Fix clippy lint --- src/constants.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/constants.rs b/src/constants.rs index f5ab92c531..f3962b339a 100644 --- a/src/constants.rs +++ b/src/constants.rs @@ -16,7 +16,7 @@ use bitflags::bitflags; pub const SRE_MAGIC: usize = 20171005; #[derive(num_enum::TryFromPrimitive, Debug)] #[repr(u32)] -#[allow(non_camel_case_types)] +#[allow(non_camel_case_types, clippy::upper_case_acronyms)] pub enum SreOpcode { FAILURE = 0, SUCCESS = 1, @@ -62,7 +62,7 @@ pub enum SreOpcode { } #[derive(num_enum::TryFromPrimitive, Debug)] #[repr(u32)] -#[allow(non_camel_case_types)] +#[allow(non_camel_case_types, clippy::upper_case_acronyms)] pub enum SreAtCode { BEGINNING = 0, BEGINNING_LINE = 1, @@ -79,7 +79,7 @@ pub enum SreAtCode { } #[derive(num_enum::TryFromPrimitive, Debug)] #[repr(u32)] -#[allow(non_camel_case_types)] +#[allow(non_camel_case_types, clippy::upper_case_acronyms)] pub enum SreCatCode { DIGIT = 0, NOT_DIGIT = 1, From ca1346ee031624b37e6e5d531a1c5dcaa5b284fc Mon Sep 17 00:00:00 2001 From: Noah <33094578+coolreader18@users.noreply.github.com> Date: Thu, 1 Apr 2021 22:05:45 -0500 Subject: [PATCH 039/960] Have generate_tests.py generate Patterns inline in tests.rs --- generate_tests.py | 21 +++++++++++++++------ tests/lookbehind.py | 1 - tests/lookbehind.re | 2 -- tests/positive_lookbehind.py | 1 - tests/positive_lookbehind.re | 2 -- tests/tests.rs | 16 ++++++++++------ 6 files changed, 25 insertions(+), 18 deletions(-) delete mode 100644 tests/lookbehind.py delete mode 100644 tests/lookbehind.re delete mode 100644 tests/positive_lookbehind.py delete mode 100644 tests/positive_lookbehind.re diff --git a/generate_tests.py b/generate_tests.py index 49a24792be..7af1d2f0c2 100644 --- a/generate_tests.py +++ b/generate_tests.py @@ -26,12 +26,21 @@ def compile(cls, pattern, flags=0): for k, v in re.RegexFlag.__members__.items(): setattr(CompiledPattern, k, v) + +# matches `// pattern {varname} = re.compile(...)` +pattern_pattern = re.compile(r"^((\s*)\/\/\s*pattern\s+(\w+)\s+=\s+(.+?))$(?:.+?END GENERATED)?", re.M | re.S) +def replace_compiled(m): + line, indent, varname, pattern = m.groups() + pattern = eval(pattern, {"re": CompiledPattern}) + pattern = f"Pattern {{ code: &{json.dumps(pattern.code)}, flags: SreFlag::from_bits_truncate({int(pattern.flags)}) }}" + return f'''{line} +{indent}// START GENERATED by generate_tests.py +{indent}#[rustfmt::skip] let {varname} = {pattern}; +{indent}// END GENERATED''' + with os.scandir("tests") as d: for f in d: path = Path(f.path) - if path.suffix == ".py": - pattern = eval(path.read_text(), {"re": CompiledPattern}) - path.with_suffix(".re").write_text( - f"// {pattern.pattern!r}, flags={pattern.flags!r}\n" - f"Pattern {{ code: &{json.dumps(pattern.code)}, flags: SreFlag::from_bits_truncate({int(pattern.flags)}) }}" - ) + if path.suffix == ".rs": + replaced = pattern_pattern.sub(replace_compiled, path.read_text()) + path.write_text(replaced) diff --git a/tests/lookbehind.py b/tests/lookbehind.py deleted file mode 100644 index 3da6425959..0000000000 --- a/tests/lookbehind.py +++ /dev/null @@ -1 +0,0 @@ -re.compile(r'(? Date: Mon, 5 Apr 2021 11:10:32 -0500 Subject: [PATCH 040/960] Add more info to Cargo.toml --- Cargo.toml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/Cargo.toml b/Cargo.toml index 03db7aba4f..8e69ab5235 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -2,8 +2,11 @@ name = "sre-engine" version = "0.1.0" authors = ["Kangzhi Shi ", "RustPython Team"] +description = "A low-level implementation of Python's SRE regex engine" +repository = "https://github.com/RustPython/sre-engine" license = "MIT" edition = "2018" +keywords = ["regex"] [dependencies] num_enum = "0.5" From 9728dd8699a255686deaedbdd0f23f549e62009f Mon Sep 17 00:00:00 2001 From: Kangzhi Shi Date: Fri, 16 Apr 2021 09:35:11 +0200 Subject: [PATCH 041/960] fix test_string_boundaries --- src/engine.rs | 14 +++++++++++--- tests/tests.rs | 11 +++++++++++ 2 files changed, 22 insertions(+), 3 deletions(-) diff --git a/src/engine.rs b/src/engine.rs index 5e0e0f4208..0de9f43844 100644 --- a/src/engine.rs +++ b/src/engine.rs @@ -292,6 +292,14 @@ trait MatchContextDrive { let this = !self.at_end() && word_checker(self.peek_char()); this != that } + fn at_non_boundary bool>(&self, mut word_checker: F) -> bool { + if self.at_beginning() && self.at_end() { + return false; + } + let that = !self.at_beginning() && word_checker(self.back_peek_char()); + let this = !self.at_end() && word_checker(self.peek_char()); + this == that + } fn back_peek_char(&self) -> u32 { self.state().string.back_peek(self.ctx().string_offset) } @@ -738,14 +746,14 @@ fn at(drive: &StackDrive, atcode: SreAtCode) -> bool { SreAtCode::BEGINNING | SreAtCode::BEGINNING_STRING => drive.at_beginning(), SreAtCode::BEGINNING_LINE => drive.at_beginning() || is_linebreak(drive.back_peek_char()), SreAtCode::BOUNDARY => drive.at_boundary(is_word), - SreAtCode::NON_BOUNDARY => !drive.at_boundary(is_word), + SreAtCode::NON_BOUNDARY => drive.at_non_boundary(is_word), SreAtCode::END => (drive.remaining_chars() == 1 && drive.at_linebreak()) || drive.at_end(), SreAtCode::END_LINE => drive.at_linebreak() || drive.at_end(), SreAtCode::END_STRING => drive.at_end(), SreAtCode::LOC_BOUNDARY => drive.at_boundary(is_loc_word), - SreAtCode::LOC_NON_BOUNDARY => !drive.at_boundary(is_loc_word), + SreAtCode::LOC_NON_BOUNDARY => drive.at_non_boundary(is_loc_word), SreAtCode::UNI_BOUNDARY => drive.at_boundary(is_uni_word), - SreAtCode::UNI_NON_BOUNDARY => !drive.at_boundary(is_uni_word), + SreAtCode::UNI_NON_BOUNDARY => drive.at_non_boundary(is_uni_word), } } diff --git a/tests/tests.rs b/tests/tests.rs index f4cd091f0d..690c72861b 100644 --- a/tests/tests.rs +++ b/tests/tests.rs @@ -37,3 +37,14 @@ fn test_assert() { state = state.search(); assert!(state.has_matched == Some(true)); } + +#[test] +fn test_string_boundaries() { + // pattern big_b = re.compile(r'\B') + // START GENERATED by generate_tests.py + #[rustfmt::skip] let big_b = Pattern { code: &[15, 4, 0, 0, 0, 6, 11, 1], flags: SreFlag::from_bits_truncate(32) }; + // END GENERATED + let mut state = big_b.state("", 0..usize::MAX); + state = state.search(); + assert!(state.has_matched == None) +} From d2b48fdea2986e8513d63d19ea30859c5a48a66e Mon Sep 17 00:00:00 2001 From: Noah <33094578+coolreader18@users.noreply.github.com> Date: Fri, 16 Apr 2021 10:40:42 -0500 Subject: [PATCH 042/960] Release 0.1.1 sre-engine@0.1.1 Generated by cargo-workspaces --- Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index 8e69ab5235..cf830a6053 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "sre-engine" -version = "0.1.0" +version = "0.1.1" authors = ["Kangzhi Shi ", "RustPython Team"] description = "A low-level implementation of Python's SRE regex engine" repository = "https://github.com/RustPython/sre-engine" From 73abbace85aebf063eb8c7ce4815cacd2449f522 Mon Sep 17 00:00:00 2001 From: Noah <33094578+coolreader18@users.noreply.github.com> Date: Fri, 16 Apr 2021 10:53:37 -0500 Subject: [PATCH 043/960] Add explicit include for Cargo files --- Cargo.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/Cargo.toml b/Cargo.toml index cf830a6053..f0cd628f0e 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -7,6 +7,7 @@ repository = "https://github.com/RustPython/sre-engine" license = "MIT" edition = "2018" keywords = ["regex"] +include = ["LICENSE", "src/**/*.rs"] [dependencies] num_enum = "0.5" From df8453d387c0a4bc6b84131f40703b46000ba7ee Mon Sep 17 00:00:00 2001 From: Kangzhi Shi Date: Tue, 20 Apr 2021 10:19:27 +0200 Subject: [PATCH 044/960] fix zerowidth search --- Cargo.toml | 2 +- src/engine.rs | 99 +++++++++++++++++++++++++++++++++++--------------- tests/tests.rs | 18 +++++++-- 3 files changed, 85 insertions(+), 34 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index f0cd628f0e..614243eeb1 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "sre-engine" -version = "0.1.1" +version = "0.1.2" authors = ["Kangzhi Shi ", "RustPython Team"] description = "A low-level implementation of Python's SRE regex engine" repository = "https://github.com/RustPython/sre-engine" diff --git a/src/engine.rs b/src/engine.rs index 0de9f43844..bded52448d 100644 --- a/src/engine.rs +++ b/src/engine.rs @@ -23,8 +23,9 @@ pub struct State<'a> { repeat_stack: Vec, pub string_position: usize, popped_context: Option, - pub has_matched: Option, + pub has_matched: bool, pub match_all: bool, + pub must_advance: bool, } impl<'a> State<'a> { @@ -50,8 +51,9 @@ impl<'a> State<'a> { marks: Vec::new(), string_position: start, popped_context: None, - has_matched: None, + has_matched: false, match_all: false, + must_advance: false, } } @@ -63,7 +65,7 @@ impl<'a> State<'a> { self.marks.clear(); self.string_position = self.start; self.popped_context = None; - self.has_matched = None; + self.has_matched = false; } fn set_mark(&mut self, mark_nr: usize, position: usize) { @@ -100,17 +102,7 @@ impl<'a> State<'a> { self.marks_stack.pop(); } - pub fn pymatch(mut self) -> Self { - let ctx = MatchContext { - string_position: self.start, - string_offset: self.string.offset(0, self.start), - code_position: 0, - has_matched: None, - toplevel: true, - }; - self.context_stack.push(ctx); - - let mut dispatcher = OpcodeDispatcher::new(); + fn _match(mut self, dispatcher: &mut OpcodeDispatcher) -> Self { let mut has_matched = None; loop { @@ -127,21 +119,58 @@ impl<'a> State<'a> { } } - self.has_matched = has_matched; + self.has_matched = has_matched == Some(true); self } + pub fn pymatch(mut self) -> Self { + let ctx = MatchContext { + string_position: self.start, + string_offset: self.string.offset(0, self.start), + code_position: 0, + has_matched: None, + toplevel: true, + }; + self.context_stack.push(ctx); + + let mut dispatcher = OpcodeDispatcher::new(); + + self._match(&mut dispatcher) + } + pub fn search(mut self) -> Self { // TODO: optimize by op info and skip prefix - while self.start <= self.end { - self.match_all = false; - self = self.pymatch(); - if self.has_matched == Some(true) { - return self; - } + if self.start > self.end { + return self; + } + + let mut dispatcher = OpcodeDispatcher::new(); + + let ctx = MatchContext { + string_position: self.start, + string_offset: self.string.offset(0, self.start), + code_position: 0, + has_matched: None, + toplevel: true, + }; + self.context_stack.push(ctx); + self = self._match(&mut dispatcher); + + self.must_advance = false; + while !self.has_matched && self.start < self.end { self.start += 1; self.reset(); + dispatcher.clear(); + let ctx = MatchContext { + string_position: self.start, + string_offset: self.string.offset(0, self.start), + code_position: 0, + has_matched: None, + toplevel: false, + }; + self.context_stack.push(ctx); + self = self._match(&mut dispatcher); } self @@ -310,6 +339,18 @@ trait MatchContextDrive { .string .back_offset(self.ctx().string_offset, skip_count); } + fn can_success(&self) -> bool { + if !self.ctx().toplevel { + return true; + } + if self.state().match_all && !self.at_end() { + return false; + } + if self.state().must_advance && self.ctx().string_position == self.state().start { + return false; + } + true + } } struct StackDrive<'a> { @@ -429,6 +470,9 @@ impl OpcodeDispatcher { executing_contexts: HashMap::new(), } } + fn clear(&mut self) { + self.executing_contexts.clear(); + } // Returns True if the current context matches, False if it doesn't and // None if matching is not finished, ie must be resumed after child // contexts have been matched. @@ -470,11 +514,9 @@ impl OpcodeDispatcher { drive.ctx_mut().has_matched = Some(false); }), SreOpcode::SUCCESS => once(|drive| { - if drive.ctx().toplevel && drive.state.match_all && !drive.at_end() { - drive.ctx_mut().has_matched = Some(false); - } else { + drive.ctx_mut().has_matched = Some(drive.can_success()); + if drive.ctx().has_matched == Some(true) { drive.state.string_position = drive.ctx().string_position; - drive.ctx_mut().has_matched = Some(true); } }), SreOpcode::ANY => once(|drive| { @@ -1152,9 +1194,7 @@ impl OpcodeExecutor for OpMinRepeatOne { }; let next_code = drive.peek_code(drive.peek_code(1) as usize + 1); - if next_code == SreOpcode::SUCCESS as u32 - && !(drive.ctx().toplevel && drive.state.match_all && !drive.at_end()) - { + if next_code == SreOpcode::SUCCESS as u32 && drive.can_success() { // tail is empty. we're finished drive.state.string_position = drive.ctx().string_position; drive.ctx_mut().has_matched = Some(true); @@ -1455,8 +1495,7 @@ impl OpcodeExecutor for OpRepeatOne { } let next_code = drive.peek_code(drive.peek_code(1) as usize + 1); - if next_code == SreOpcode::SUCCESS as u32 && drive.at_end() && !drive.ctx().toplevel - { + if next_code == SreOpcode::SUCCESS as u32 && drive.can_success() { // tail is empty. we're finished drive.state.string_position = drive.ctx().string_position; drive.ctx_mut().has_matched = Some(true); diff --git a/tests/tests.rs b/tests/tests.rs index 690c72861b..d76ff3cfb5 100644 --- a/tests/tests.rs +++ b/tests/tests.rs @@ -24,7 +24,7 @@ fn test_2427() { // END GENERATED let mut state = lookbehind.state("x", 0..usize::MAX); state = state.pymatch(); - assert!(state.has_matched == Some(true)); + assert!(state.has_matched); } #[test] @@ -35,7 +35,7 @@ fn test_assert() { // END GENERATED let mut state = positive_lookbehind.state("abcdef", 0..usize::MAX); state = state.search(); - assert!(state.has_matched == Some(true)); + assert!(state.has_matched); } #[test] @@ -46,5 +46,17 @@ fn test_string_boundaries() { // END GENERATED let mut state = big_b.state("", 0..usize::MAX); state = state.search(); - assert!(state.has_matched == None) + assert!(!state.has_matched); +} + +#[test] +fn test_zerowidth() { + // pattern p = re.compile(r'\b|:+') + // START GENERATED by generate_tests.py + #[rustfmt::skip] let p = Pattern { code: &[15, 4, 0, 0, 4294967295, 7, 5, 6, 10, 16, 12, 10, 25, 6, 1, 4294967295, 17, 58, 1, 16, 2, 0, 1], flags: SreFlag::from_bits_truncate(32) }; + // END GENERATED + let mut state = p.state("a:", 0..usize::MAX); + state.must_advance = true; + state = state.search(); + assert!(state.string_position == 1); } From a3c3573d67f94d6119a8bb7126f385c38ba438e8 Mon Sep 17 00:00:00 2001 From: Kangzhi Shi Date: Tue, 20 Apr 2021 17:36:32 +0200 Subject: [PATCH 045/960] optimize count --- src/engine.rs | 29 +++++++---------------------- 1 file changed, 7 insertions(+), 22 deletions(-) diff --git a/src/engine.rs b/src/engine.rs index bded52448d..5409baf66d 100644 --- a/src/engine.rs +++ b/src/engine.rs @@ -916,7 +916,7 @@ fn charset(set: &[u32], ch: u32) -> bool { } /* General case */ -fn count(drive: &mut StackDrive, maxcount: usize) -> usize { +fn general_count(drive: &mut StackDrive, maxcount: usize) -> usize { let mut count = 0; let maxcount = std::cmp::min(maxcount, drive.remaining_chars()); @@ -937,18 +937,11 @@ fn count(drive: &mut StackDrive, maxcount: usize) -> usize { count } -/* TODO: check literal cases should improve the perfermance - -fn _count(stack_drive: &StackDrive, maxcount: usize) -> usize { +fn count(stack_drive: &mut StackDrive, maxcount: usize) -> usize { let mut drive = WrapDrive::drive(*stack_drive.ctx(), stack_drive); let maxcount = std::cmp::min(maxcount, drive.remaining_chars()); let end = drive.ctx().string_position + maxcount; - let opcode = match SreOpcode::try_from(drive.peek_code(1)) { - Ok(code) => code, - Err(_) => { - panic!("FIXME:COUNT1"); - } - }; + let opcode = SreOpcode::try_from(drive.peek_code(0)).unwrap(); match opcode { SreOpcode::ANY => { @@ -960,7 +953,6 @@ fn _count(stack_drive: &StackDrive, maxcount: usize) -> usize { drive.skip_char(maxcount); } SreOpcode::IN => { - // TODO: pattern[2 or 1..]? while !drive.ctx().string_position < end && charset(&drive.pattern()[2..], drive.peek_char()) { @@ -992,7 +984,7 @@ fn _count(stack_drive: &StackDrive, maxcount: usize) -> usize { general_count_literal(&mut drive, end, |code, c| code != lower_unicode(c) as u32); } _ => { - todo!("repeated single character pattern?"); + return general_count(stack_drive, maxcount); } } @@ -1006,11 +998,6 @@ fn general_count_literal bool>(drive: &mut WrapDrive, end: } } -fn eq_loc_ignore(code: u32, ch: u32) -> bool { - code == ch || code == lower_locate(ch) || code == upper_locate(ch) -} -*/ - fn is_word(ch: u32) -> bool { ch == '_' as u32 || u8::try_from(ch) @@ -1028,7 +1015,7 @@ fn is_digit(ch: u32) -> bool { .unwrap_or(false) } fn is_loc_alnum(ch: u32) -> bool { - // TODO: check with cpython + // FIXME: Ignore the locales u8::try_from(ch) .map(|x| x.is_ascii_alphanumeric()) .unwrap_or(false) @@ -1045,13 +1032,11 @@ pub fn lower_ascii(ch: u32) -> u32 { .unwrap_or(ch) } fn lower_locate(ch: u32) -> u32 { - // TODO: check with cpython - // https://doc.rust-lang.org/std/primitive.char.html#method.to_lowercase + // FIXME: Ignore the locales lower_ascii(ch) } fn upper_locate(ch: u32) -> u32 { - // TODO: check with cpython - // https://doc.rust-lang.org/std/primitive.char.html#method.to_uppercase + // FIXME: Ignore the locales u8::try_from(ch) .map(|x| x.to_ascii_uppercase() as u32) .unwrap_or(ch) From 5bd6b672d089fa4dc8db1d5d3d8564f6a98dbacd Mon Sep 17 00:00:00 2001 From: Kangzhi Shi Date: Wed, 21 Apr 2021 11:09:10 +0200 Subject: [PATCH 046/960] optimize opcode that execute only once --- src/engine.rs | 183 ++++++++++++++++++++++++++++---------------------- 1 file changed, 102 insertions(+), 81 deletions(-) diff --git a/src/engine.rs b/src/engine.rs index 5409baf66d..2888b6930c 100644 --- a/src/engine.rs +++ b/src/engine.rs @@ -416,20 +416,6 @@ trait OpcodeExecutor { fn next(&mut self, drive: &mut StackDrive) -> Option<()>; } -struct OpOnce { - f: Option, -} -impl OpcodeExecutor for OpOnce { - fn next(&mut self, drive: &mut StackDrive) -> Option<()> { - let f = self.f.take()?; - f(drive); - None - } -} -fn once(f: F) -> Box> { - Box::new(OpOnce { f: Some(f) }) -} - struct OpTwice { f1: Option, f2: Option, @@ -496,48 +482,58 @@ impl OpcodeDispatcher { // Dispatches a context on a given opcode. Returns True if the context // is done matching, False if it must be resumed when next encountered. fn dispatch(&mut self, opcode: SreOpcode, drive: &mut StackDrive) -> bool { - let mut executor = match self.executing_contexts.remove_entry(&drive.id()) { - Some((_, executor)) => executor, - None => self.dispatch_table(opcode), - }; - if let Some(()) = executor.next(drive) { - self.executing_contexts.insert(drive.id(), executor); - false - } else { - true + let executor = self + .executing_contexts + .remove_entry(&drive.id()) + .map(|(_, x)| x) + .or_else(|| self.dispatch_table(opcode, drive)); + if let Some(mut executor) = executor { + if let Some(()) = executor.next(drive) { + self.executing_contexts.insert(drive.id(), executor); + return false; + } } + true } - fn dispatch_table(&mut self, opcode: SreOpcode) -> Box { + fn dispatch_table( + &mut self, + opcode: SreOpcode, + drive: &mut StackDrive, + ) -> Option> { match opcode { - SreOpcode::FAILURE => once(|drive| { + SreOpcode::FAILURE => { drive.ctx_mut().has_matched = Some(false); - }), - SreOpcode::SUCCESS => once(|drive| { + None + } + SreOpcode::SUCCESS => { drive.ctx_mut().has_matched = Some(drive.can_success()); if drive.ctx().has_matched == Some(true) { drive.state.string_position = drive.ctx().string_position; } - }), - SreOpcode::ANY => once(|drive| { + None + } + SreOpcode::ANY => { if drive.at_end() || drive.at_linebreak() { drive.ctx_mut().has_matched = Some(false); } else { drive.skip_code(1); drive.skip_char(1); } - }), - SreOpcode::ANY_ALL => once(|drive| { + None + } + SreOpcode::ANY_ALL => { if drive.at_end() { drive.ctx_mut().has_matched = Some(false); } else { drive.skip_code(1); drive.skip_char(1); } - }), + None + } /* assert subpattern */ /* */ - SreOpcode::ASSERT => twice( + SreOpcode::ASSERT => Some(twice( |drive| { let back = drive.peek_code(2) as usize; let passed = drive.ctx().string_position; @@ -568,8 +564,8 @@ impl OpcodeDispatcher { drive.ctx_mut().has_matched = Some(false); } }, - ), - SreOpcode::ASSERT_NOT => twice( + )), + SreOpcode::ASSERT_NOT => Some(twice( |drive| { let back = drive.peek_code(2) as usize; let passed = drive.ctx().string_position; @@ -600,17 +596,18 @@ impl OpcodeDispatcher { drive.skip_code(drive.peek_code(1) as usize + 1); } }, - ), - SreOpcode::AT => once(|drive| { + )), + SreOpcode::AT => { let atcode = SreAtCode::try_from(drive.peek_code(1)).unwrap(); if !at(drive, atcode) { drive.ctx_mut().has_matched = Some(false); } else { drive.skip_code(2); } - }), - SreOpcode::BRANCH => Box::new(OpBranch::default()), - SreOpcode::CATEGORY => once(|drive| { + None + } + SreOpcode::BRANCH => Some(Box::new(OpBranch::default())), + SreOpcode::CATEGORY => { let catcode = SreCatCode::try_from(drive.peek_code(1)).unwrap(); if drive.at_end() || !category(catcode, drive.peek_char()) { drive.ctx_mut().has_matched = Some(false); @@ -618,53 +615,68 @@ impl OpcodeDispatcher { drive.skip_code(2); drive.skip_char(1); } - }), - SreOpcode::IN => once(|drive| { + None + } + SreOpcode::IN => { general_op_in(drive, |set, c| charset(set, c)); - }), - SreOpcode::IN_IGNORE => once(|drive| { + None + } + SreOpcode::IN_IGNORE => { general_op_in(drive, |set, c| charset(set, lower_ascii(c))); - }), - SreOpcode::IN_UNI_IGNORE => once(|drive| { + None + } + SreOpcode::IN_UNI_IGNORE => { general_op_in(drive, |set, c| charset(set, lower_unicode(c))); - }), - SreOpcode::IN_LOC_IGNORE => once(|drive| { + None + } + SreOpcode::IN_LOC_IGNORE => { general_op_in(drive, |set, c| charset_loc_ignore(set, c)); - }), - SreOpcode::INFO | SreOpcode::JUMP => once(|drive| { + None + } + SreOpcode::INFO | SreOpcode::JUMP => { drive.skip_code(drive.peek_code(1) as usize + 1); - }), - SreOpcode::LITERAL => once(|drive| { + None + } + SreOpcode::LITERAL => { general_op_literal(drive, |code, c| code == c); - }), - SreOpcode::NOT_LITERAL => once(|drive| { + None + } + SreOpcode::NOT_LITERAL => { general_op_literal(drive, |code, c| code != c); - }), - SreOpcode::LITERAL_IGNORE => once(|drive| { + None + } + SreOpcode::LITERAL_IGNORE => { general_op_literal(drive, |code, c| code == lower_ascii(c)); - }), - SreOpcode::NOT_LITERAL_IGNORE => once(|drive| { + None + } + SreOpcode::NOT_LITERAL_IGNORE => { general_op_literal(drive, |code, c| code != lower_ascii(c)); - }), - SreOpcode::LITERAL_UNI_IGNORE => once(|drive| { + None + } + SreOpcode::LITERAL_UNI_IGNORE => { general_op_literal(drive, |code, c| code == lower_unicode(c)); - }), - SreOpcode::NOT_LITERAL_UNI_IGNORE => once(|drive| { + None + } + SreOpcode::NOT_LITERAL_UNI_IGNORE => { general_op_literal(drive, |code, c| code != lower_unicode(c)); - }), - SreOpcode::LITERAL_LOC_IGNORE => once(|drive| { + None + } + SreOpcode::LITERAL_LOC_IGNORE => { general_op_literal(drive, char_loc_ignore); - }), - SreOpcode::NOT_LITERAL_LOC_IGNORE => once(|drive| { + None + } + SreOpcode::NOT_LITERAL_LOC_IGNORE => { general_op_literal(drive, |code, c| !char_loc_ignore(code, c)); - }), - SreOpcode::MARK => once(|drive| { + None + } + SreOpcode::MARK => { drive .state .set_mark(drive.peek_code(1) as usize, drive.ctx().string_position); drive.skip_code(2); - }), - SreOpcode::REPEAT => twice( + None + } + SreOpcode::REPEAT => Some(twice( // create repeat context. all the hard work is done by the UNTIL // operator (MAX_UNTIL, MIN_UNTIL) // <1=min> <2=max> item tail @@ -687,20 +699,28 @@ impl OpcodeDispatcher { let child_ctx = drive.state.popped_context.unwrap(); drive.ctx_mut().has_matched = child_ctx.has_matched; }, - ), - SreOpcode::MAX_UNTIL => Box::new(OpMaxUntil::default()), - SreOpcode::MIN_UNTIL => Box::new(OpMinUntil::default()), - SreOpcode::REPEAT_ONE => Box::new(OpRepeatOne::default()), - SreOpcode::MIN_REPEAT_ONE => Box::new(OpMinRepeatOne::default()), - SreOpcode::GROUPREF => once(|drive| general_op_groupref(drive, |x| x)), - SreOpcode::GROUPREF_IGNORE => once(|drive| general_op_groupref(drive, lower_ascii)), + )), + SreOpcode::MAX_UNTIL => Some(Box::new(OpMaxUntil::default())), + SreOpcode::MIN_UNTIL => Some(Box::new(OpMinUntil::default())), + SreOpcode::REPEAT_ONE => Some(Box::new(OpRepeatOne::default())), + SreOpcode::MIN_REPEAT_ONE => Some(Box::new(OpMinRepeatOne::default())), + SreOpcode::GROUPREF => { + general_op_groupref(drive, |x| x); + None + } + SreOpcode::GROUPREF_IGNORE => { + general_op_groupref(drive, lower_ascii); + None + } SreOpcode::GROUPREF_LOC_IGNORE => { - once(|drive| general_op_groupref(drive, lower_locate)) + general_op_groupref(drive, lower_locate); + None } SreOpcode::GROUPREF_UNI_IGNORE => { - once(|drive| general_op_groupref(drive, lower_unicode)) + general_op_groupref(drive, lower_unicode); + None } - SreOpcode::GROUPREF_EXISTS => once(|drive| { + SreOpcode::GROUPREF_EXISTS => { let (group_start, group_end) = drive.state.get_marks(drive.peek_code(1) as usize); match (group_start, group_end) { (Some(start), Some(end)) if start <= end => { @@ -708,7 +728,8 @@ impl OpcodeDispatcher { } _ => drive.skip_code(drive.peek_code(2) as usize + 1), } - }), + None + } _ => { // TODO python expcetion? unreachable!("unexpected opcode") From 7324feef89dd692d909c98849e54969617728ecf Mon Sep 17 00:00:00 2001 From: Kangzhi Shi Date: Wed, 21 Apr 2021 11:13:58 +0200 Subject: [PATCH 047/960] optimize search cache the string offset --- src/engine.rs | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/engine.rs b/src/engine.rs index 2888b6930c..2b85ea3514 100644 --- a/src/engine.rs +++ b/src/engine.rs @@ -147,9 +147,11 @@ impl<'a> State<'a> { let mut dispatcher = OpcodeDispatcher::new(); + let mut start_offset = self.string.offset(0, self.start); + let ctx = MatchContext { string_position: self.start, - string_offset: self.string.offset(0, self.start), + string_offset: start_offset, code_position: 0, has_matched: None, toplevel: true, @@ -160,11 +162,12 @@ impl<'a> State<'a> { self.must_advance = false; while !self.has_matched && self.start < self.end { self.start += 1; + start_offset = self.string.offset(start_offset, 1); self.reset(); dispatcher.clear(); let ctx = MatchContext { string_position: self.start, - string_offset: self.string.offset(0, self.start), + string_offset: start_offset, code_position: 0, has_matched: None, toplevel: false, From 86435b8a4b44d0b79109ace6acd66cbfcebaac66 Mon Sep 17 00:00:00 2001 From: Kangzhi Shi Date: Thu, 22 Apr 2021 17:15:03 +0200 Subject: [PATCH 048/960] add benchmark --- benches/benches.rs | 112 +++++++++++++++++++++++++++++++++++++++++++++ generate_tests.py | 5 +- 2 files changed, 115 insertions(+), 2 deletions(-) create mode 100644 benches/benches.rs diff --git a/benches/benches.rs b/benches/benches.rs new file mode 100644 index 0000000000..b86a592967 --- /dev/null +++ b/benches/benches.rs @@ -0,0 +1,112 @@ +#![feature(test)] + +extern crate test; +use test::Bencher; + +use sre_engine::constants::SreFlag; +use sre_engine::engine; +pub struct Pattern { + pub code: &'static [u32], + pub flags: SreFlag, +} + +impl Pattern { + pub fn state<'a>( + &self, + string: impl Into>, + range: std::ops::Range, + ) -> engine::State<'a> { + engine::State::new(string.into(), range.start, range.end, self.flags, self.code) + } +} +#[bench] +fn benchmarks(b: &mut Bencher) { + // # test common prefix + // pattern p1 = re.compile('Python|Perl') # , 'Perl'), # Alternation + // START GENERATED by generate_tests.py + #[rustfmt::skip] let p1 = Pattern { code: &[15, 8, 1, 4, 6, 1, 1, 80, 0, 17, 80, 7, 13, 17, 121, 17, 116, 17, 104, 17, 111, 17, 110, 16, 11, 9, 17, 101, 17, 114, 17, 108, 16, 2, 0, 1], flags: SreFlag::from_bits_truncate(32) }; + // END GENERATED + // pattern p2 = re.compile('(Python|Perl)') #, 'Perl'), # Grouped alternation + // START GENERATED by generate_tests.py + #[rustfmt::skip] let p2 = Pattern { code: &[15, 8, 1, 4, 6, 1, 0, 80, 0, 18, 0, 17, 80, 7, 13, 17, 121, 17, 116, 17, 104, 17, 111, 17, 110, 16, 11, 9, 17, 101, 17, 114, 17, 108, 16, 2, 0, 18, 1, 1], flags: SreFlag::from_bits_truncate(32) }; + // END GENERATED + // pattern pn = re.compile('Python|Perl|Tcl') #, 'Perl'), # Alternation + // START GENERATED by generate_tests.py + #[rustfmt::skip] let p3 = Pattern { code: &[15, 9, 4, 3, 6, 17, 80, 17, 84, 0, 7, 15, 17, 80, 17, 121, 17, 116, 17, 104, 17, 111, 17, 110, 16, 22, 11, 17, 80, 17, 101, 17, 114, 17, 108, 16, 11, 9, 17, 84, 17, 99, 17, 108, 16, 2, 0, 1], flags: SreFlag::from_bits_truncate(32) }; + // END GENERATED + // pattern pn = re.compile('(Python|Perl|Tcl)') #, 'Perl'), # Grouped alternation + // START GENERATED by generate_tests.py + #[rustfmt::skip] let p4 = Pattern { code: &[15, 9, 4, 3, 6, 17, 80, 17, 84, 0, 18, 0, 7, 15, 17, 80, 17, 121, 17, 116, 17, 104, 17, 111, 17, 110, 16, 22, 11, 17, 80, 17, 101, 17, 114, 17, 108, 16, 11, 9, 17, 84, 17, 99, 17, 108, 16, 2, 0, 18, 1, 1], flags: SreFlag::from_bits_truncate(32) }; + // END GENERATED + // pattern pn = re.compile('(Python)\\1') #, 'PythonPython'), # Backreference + // START GENERATED by generate_tests.py + #[rustfmt::skip] let p5 = Pattern { code: &[15, 18, 1, 12, 12, 6, 0, 80, 121, 116, 104, 111, 110, 0, 0, 0, 0, 0, 0, 18, 0, 17, 80, 17, 121, 17, 116, 17, 104, 17, 111, 17, 110, 18, 1, 12, 0, 1], flags: SreFlag::from_bits_truncate(32) }; + // END GENERATED + // pattern pn = re.compile('([0a-z][a-z0-9]*,)+') #, 'a5,b7,c9,'), # Disable the fastmap optimization + // START GENERATED by generate_tests.py + #[rustfmt::skip] let p6 = Pattern { code: &[15, 4, 0, 2, 4294967295, 24, 31, 1, 4294967295, 18, 0, 14, 7, 17, 48, 23, 97, 122, 0, 25, 13, 0, 4294967295, 14, 8, 23, 97, 122, 23, 48, 57, 0, 1, 17, 44, 18, 1, 19, 1], flags: SreFlag::from_bits_truncate(32) }; + // END GENERATED + // pattern pn = re.compile('([a-z][a-z0-9]*,)+') #, 'a5,b7,c9,'), # A few sets + // START GENERATED by generate_tests.py + #[rustfmt::skip] let p7 = Pattern { code: &[15, 4, 0, 2, 4294967295, 24, 29, 1, 4294967295, 18, 0, 14, 5, 23, 97, 122, 0, 25, 13, 0, 4294967295, 14, 8, 23, 97, 122, 23, 48, 57, 0, 1, 17, 44, 18, 1, 19, 1], flags: SreFlag::from_bits_truncate(32) }; + // END GENERATED + // pattern pn = re.compile('Python') #, 'Python'), # Simple text literal + // START GENERATED by generate_tests.py + #[rustfmt::skip] let p8 = Pattern { code: &[15, 18, 3, 6, 6, 6, 6, 80, 121, 116, 104, 111, 110, 0, 0, 0, 0, 0, 0, 17, 80, 17, 121, 17, 116, 17, 104, 17, 111, 17, 110, 1], flags: SreFlag::from_bits_truncate(32) }; + // END GENERATED + // pattern pn = re.compile('.*Python') #, 'Python'), # Bad text literal + // START GENERATED by generate_tests.py + #[rustfmt::skip] let p9 = Pattern { code: &[15, 4, 0, 6, 4294967295, 25, 5, 0, 4294967295, 2, 1, 17, 80, 17, 121, 17, 116, 17, 104, 17, 111, 17, 110, 1], flags: SreFlag::from_bits_truncate(32) }; + // END GENERATED + // pattern pn = re.compile('.*Python.*') #, 'Python'), # Worse text literal + // START GENERATED by generate_tests.py + #[rustfmt::skip] let p10 = Pattern { code: &[15, 4, 0, 6, 4294967295, 25, 5, 0, 4294967295, 2, 1, 17, 80, 17, 121, 17, 116, 17, 104, 17, 111, 17, 110, 25, 5, 0, 4294967295, 2, 1, 1], flags: SreFlag::from_bits_truncate(32) }; + // END GENERATED + // pattern pn = re.compile('.*(Python)') #, 'Python'), # Bad text literal with grouping + // START GENERATED by generate_tests.py + #[rustfmt::skip] let p11 = Pattern { code: &[15, 4, 0, 6, 4294967295, 25, 5, 0, 4294967295, 2, 1, 18, 0, 17, 80, 17, 121, 17, 116, 17, 104, 17, 111, 17, 110, 18, 1, 1], flags: SreFlag::from_bits_truncate(32) }; + // END GENERATED + + let tests = [ + (p1, "Perl"), + (p2, "Perl"), + (p3, "Perl"), + (p4, "Perl"), + (p5, "PythonPython"), + (p6, "a5,b7,c9,"), + (p7, "a5,b7,c9,"), + (p8, "Python"), + (p9, "Python"), + (p10, "Python"), + (p11, "Python"), + ]; + + b.iter(move || { + for (p, s) in &tests { + let mut state = p.state(s.clone(), 0..usize::MAX); + state = state.search(); + assert!(state.has_matched); + state = p.state(s.clone(), 0..usize::MAX); + state = state.pymatch(); + assert!(state.has_matched); + state = p.state(s.clone(), 0..usize::MAX); + state.match_all = true; + state = state.pymatch(); + assert!(state.has_matched); + let s2 = format!("{}{}{}", " ".repeat(10000), s, " ".repeat(10000)); + state = p.state(s2.as_str(), 0..usize::MAX); + state = state.search(); + assert!(state.has_matched); + state = p.state(s2.as_str(), 10000..usize::MAX); + state = state.pymatch(); + assert!(state.has_matched); + state = p.state(s2.as_str(), 10000..10000 + s.len()); + state = state.pymatch(); + assert!(state.has_matched); + state = p.state(s2.as_str(), 10000..10000 + s.len()); + state.match_all = true; + state = state.pymatch(); + assert!(state.has_matched); + } + }) +} diff --git a/generate_tests.py b/generate_tests.py index 7af1d2f0c2..b432720cd1 100644 --- a/generate_tests.py +++ b/generate_tests.py @@ -5,6 +5,7 @@ import sre_compile import sre_parse import json +from itertools import chain m = re.search(r"const SRE_MAGIC: usize = (\d+);", open("src/constants.rs").read()) sre_engine_magic = int(m.group(1)) @@ -38,8 +39,8 @@ def replace_compiled(m): {indent}#[rustfmt::skip] let {varname} = {pattern}; {indent}// END GENERATED''' -with os.scandir("tests") as d: - for f in d: +with os.scandir("tests") as t, os.scandir("benches") as b: + for f in chain(t, b): path = Path(f.path) if path.suffix == ".rs": replaced = pattern_pattern.sub(replace_compiled, path.read_text()) From 58981a41e99cbcb53002b559ea13c7131b597938 Mon Sep 17 00:00:00 2001 From: Kangzhi Shi Date: Thu, 22 Apr 2021 19:27:24 +0200 Subject: [PATCH 049/960] optimize; replace hashmap with btreemap --- src/engine.rs | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/src/engine.rs b/src/engine.rs index 2b85ea3514..3837974838 100644 --- a/src/engine.rs +++ b/src/engine.rs @@ -2,7 +2,7 @@ use super::constants::{SreAtCode, SreCatCode, SreFlag, SreOpcode}; use super::MAXREPEAT; -use std::collections::HashMap; +use std::collections::BTreeMap; use std::convert::TryFrom; const fn is_py_ascii_whitespace(b: u8) -> bool { @@ -204,7 +204,7 @@ impl<'a> StrDrive<'a> { .get(offset..) .and_then(|s| s.char_indices().nth(skip).map(|x| x.0 + offset)) .unwrap_or_else(|| s.len()), - StrDrive::Bytes(b) => std::cmp::min(offset + skip, b.len()), + StrDrive::Bytes(_) => offset + skip, } } @@ -294,8 +294,7 @@ trait MatchContextDrive { .state() .string .offset(self.ctx().string_offset, skip_count); - self.ctx_mut().string_position = - std::cmp::min(self.ctx().string_position + skip_count, self.state().end); + self.ctx_mut().string_position += skip_count; } fn skip_code(&mut self, skip_count: usize) { self.ctx_mut().code_position += skip_count; @@ -451,12 +450,12 @@ where } struct OpcodeDispatcher { - executing_contexts: HashMap>, + executing_contexts: BTreeMap>, } impl OpcodeDispatcher { fn new() -> Self { Self { - executing_contexts: HashMap::new(), + executing_contexts: BTreeMap::new(), } } fn clear(&mut self) { @@ -487,8 +486,7 @@ impl OpcodeDispatcher { fn dispatch(&mut self, opcode: SreOpcode, drive: &mut StackDrive) -> bool { let executor = self .executing_contexts - .remove_entry(&drive.id()) - .map(|(_, x)| x) + .remove(&drive.id()) .or_else(|| self.dispatch_table(opcode, drive)); if let Some(mut executor) = executor { if let Some(()) = executor.next(drive) { From 74ebdaf4e8a50330ea6195333bcb47e086ff3b5e Mon Sep 17 00:00:00 2001 From: Kangzhi Shi Date: Mon, 11 Jul 2022 21:30:48 +0200 Subject: [PATCH 050/960] fix panic OpMinUntil return before restore repeat --- .vscode/launch.json | 21 +++++++++++++++++++++ src/engine.rs | 10 ++++++---- tests/tests.rs | 11 +++++++++++ 3 files changed, 38 insertions(+), 4 deletions(-) create mode 100644 .vscode/launch.json diff --git a/.vscode/launch.json b/.vscode/launch.json new file mode 100644 index 0000000000..5ebfe34f05 --- /dev/null +++ b/.vscode/launch.json @@ -0,0 +1,21 @@ +{ + "version": "0.2.0", + "configurations": [ + { + "type": "lldb", + "request": "launch", + "name": "Debug Unit Test", + "cargo": { + "args": [ + "test", + "--no-run" + ], + "filter": { + "kind": "test" + } + }, + "args": [], + "cwd": "${workspaceFolder}" + } + ] +} \ No newline at end of file diff --git a/src/engine.rs b/src/engine.rs index 3837974838..d4e036ff32 100644 --- a/src/engine.rs +++ b/src/engine.rs @@ -14,7 +14,7 @@ pub struct State<'a> { pub string: StrDrive<'a>, pub start: usize, pub end: usize, - flags: SreFlag, + _flags: SreFlag, pattern_codes: &'a [u32], pub marks: Vec>, pub lastindex: isize, @@ -42,7 +42,7 @@ impl<'a> State<'a> { string, start, end, - flags, + _flags: flags, pattern_codes, lastindex: -1, marks_stack: Vec::new(), @@ -1380,16 +1380,18 @@ impl OpcodeExecutor for OpMinUntil { None } 2 => { + // restore repeat before return + drive.state.repeat_stack.push(self.save_repeat.unwrap()); + let child_ctx = drive.state.popped_context.unwrap(); if child_ctx.has_matched == Some(true) { drive.ctx_mut().has_matched = Some(true); return None; } - drive.state.repeat_stack.push(self.save_repeat.unwrap()); drive.state.string_position = drive.ctx().string_position; drive.state.marks_pop(); - // match more unital tail matches + // match more until tail matches let RepeatContext { count: _, code_position, diff --git a/tests/tests.rs b/tests/tests.rs index d76ff3cfb5..b430947a9b 100644 --- a/tests/tests.rs +++ b/tests/tests.rs @@ -60,3 +60,14 @@ fn test_zerowidth() { state = state.search(); assert!(state.string_position == 1); } + +#[test] +fn test_repeat_context_panic() { + // pattern p = re.compile(r'(?:a*?(xx)??z)*') + // START GENERATED by generate_tests.py + #[rustfmt::skip] let p = Pattern { code: &[15, 4, 0, 0, 4294967295, 24, 25, 0, 4294967295, 27, 6, 0, 4294967295, 17, 97, 1, 24, 11, 0, 1, 18, 0, 17, 120, 17, 120, 18, 1, 20, 17, 122, 19, 1], flags: SreFlag::from_bits_truncate(32) }; + // END GENERATED + let mut state = p.state("axxzaz", 0..usize::MAX); + state = state.pymatch(); + assert!(state.marks == vec![Some(1), Some(3)]); +} \ No newline at end of file From 919e1d7933b62bba0830b5b8dce63c1dfc758056 Mon Sep 17 00:00:00 2001 From: Steve Shi Date: Tue, 26 Jul 2022 20:38:03 +0200 Subject: [PATCH 051/960] Refactor and fix multiple max_until recusion (#10) * wip refactor engine * wip 2 refactor engine * wip 3 refactor engine * wip 3 refactor engine * wip 4 refactor engine * wip 5 refactor engine * refactor seperate Stacks * fix clippy * fix pymatch and search restore _stacks * fix toplevel * fix marks panic * fix double max_until repeat context * clearup * update version to 0.2.0 --- Cargo.toml | 2 +- src/engine.rs | 1700 ++++++++++++++++++++++++------------------------ src/lib.rs | 2 +- tests/tests.rs | 13 +- 4 files changed, 847 insertions(+), 870 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 614243eeb1..6ba3996947 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "sre-engine" -version = "0.1.2" +version = "0.2.0" authors = ["Kangzhi Shi ", "RustPython Team"] description = "A low-level implementation of Python's SRE regex engine" repository = "https://github.com/RustPython/sre-engine" diff --git a/src/engine.rs b/src/engine.rs index d4e036ff32..81903ccfdd 100644 --- a/src/engine.rs +++ b/src/engine.rs @@ -2,7 +2,6 @@ use super::constants::{SreAtCode, SreCatCode, SreFlag, SreOpcode}; use super::MAXREPEAT; -use std::collections::BTreeMap; use std::convert::TryFrom; const fn is_py_ascii_whitespace(b: u8) -> bool { @@ -20,7 +19,7 @@ pub struct State<'a> { pub lastindex: isize, marks_stack: Vec<(Vec>, isize)>, context_stack: Vec, - repeat_stack: Vec, + _stacks: Option>, pub string_position: usize, popped_context: Option, pub has_matched: bool, @@ -44,11 +43,11 @@ impl<'a> State<'a> { end, _flags: flags, pattern_codes, + marks: Vec::new(), lastindex: -1, marks_stack: Vec::new(), context_stack: Vec::new(), - repeat_stack: Vec::new(), - marks: Vec::new(), + _stacks: Default::default(), string_position: start, popped_context: None, has_matched: false, @@ -59,10 +58,12 @@ impl<'a> State<'a> { pub fn reset(&mut self) { self.lastindex = -1; + self.marks.clear(); self.marks_stack.clear(); self.context_stack.clear(); - self.repeat_stack.clear(); - self.marks.clear(); + if let Some(x) = self._stacks.as_mut() { + x.clear() + }; self.string_position = self.start; self.popped_context = None; self.has_matched = false; @@ -102,51 +103,71 @@ impl<'a> State<'a> { self.marks_stack.pop(); } - fn _match(mut self, dispatcher: &mut OpcodeDispatcher) -> Self { - let mut has_matched = None; + fn _match(mut self, stacks: &mut Stacks) -> Self { + while let Some(ctx) = self.context_stack.pop() { + let mut drive = StateContext { + state: self, + ctx, + next_ctx: None, + }; - loop { - if self.context_stack.is_empty() { - break; + if let Some(handler) = drive.ctx.handler { + handler(&mut drive, stacks); + } else if drive.remaining_codes() > 0 { + let code = drive.peek_code(0); + let code = SreOpcode::try_from(code).unwrap(); + dispatch(code, &mut drive, stacks); + } else { + drive.failure(); } - let ctx_id = self.context_stack.len() - 1; - let mut drive = StackDrive::drive(ctx_id, self); - has_matched = dispatcher.pymatch(&mut drive); - self = drive.take(); - if has_matched.is_some() { - self.popped_context = self.context_stack.pop(); + let StateContext { + mut state, + ctx, + next_ctx, + } = drive; + + if ctx.has_matched.is_some() { + state.popped_context = Some(ctx); + } else { + state.context_stack.push(ctx); + if let Some(next_ctx) = next_ctx { + state.context_stack.push(next_ctx); + } } + self = state } - - self.has_matched = has_matched == Some(true); + self.has_matched = self.popped_context.unwrap().has_matched == Some(true); self } pub fn pymatch(mut self) -> Self { + let mut stacks = self._stacks.take().unwrap_or_default(); + let ctx = MatchContext { string_position: self.start, string_offset: self.string.offset(0, self.start), code_position: 0, has_matched: None, toplevel: true, + handler: None, + repeat_ctx_id: usize::MAX, }; self.context_stack.push(ctx); - let mut dispatcher = OpcodeDispatcher::new(); - - self._match(&mut dispatcher) + self = self._match(&mut stacks); + self._stacks = Some(stacks); + self } pub fn search(mut self) -> Self { + let mut stacks = self._stacks.take().unwrap_or_default(); // TODO: optimize by op info and skip prefix if self.start > self.end { return self; } - let mut dispatcher = OpcodeDispatcher::new(); - let mut start_offset = self.string.offset(0, self.start); let ctx = MatchContext { @@ -155,31 +176,664 @@ impl<'a> State<'a> { code_position: 0, has_matched: None, toplevel: true, + handler: None, + repeat_ctx_id: usize::MAX, }; self.context_stack.push(ctx); - self = self._match(&mut dispatcher); + self = self._match(&mut stacks); self.must_advance = false; while !self.has_matched && self.start < self.end { self.start += 1; start_offset = self.string.offset(start_offset, 1); self.reset(); - dispatcher.clear(); + stacks.clear(); + let ctx = MatchContext { string_position: self.start, string_offset: start_offset, code_position: 0, has_matched: None, toplevel: false, + handler: None, + repeat_ctx_id: usize::MAX, }; self.context_stack.push(ctx); - self = self._match(&mut dispatcher); + self = self._match(&mut stacks); } + self._stacks = Some(stacks); self } } +fn dispatch(opcode: SreOpcode, drive: &mut StateContext, stacks: &mut Stacks) { + match opcode { + SreOpcode::FAILURE => { + drive.failure(); + } + SreOpcode::SUCCESS => { + drive.ctx.has_matched = Some(drive.can_success()); + if drive.ctx.has_matched == Some(true) { + drive.state.string_position = drive.ctx.string_position; + } + } + SreOpcode::ANY => { + if drive.at_end() || drive.at_linebreak() { + drive.failure(); + } else { + drive.skip_code(1); + drive.skip_char(1); + } + } + SreOpcode::ANY_ALL => { + if drive.at_end() { + drive.failure(); + } else { + drive.skip_code(1); + drive.skip_char(1); + } + } + SreOpcode::ASSERT => op_assert(drive), + SreOpcode::ASSERT_NOT => op_assert_not(drive), + SreOpcode::AT => { + let atcode = SreAtCode::try_from(drive.peek_code(1)).unwrap(); + if at(drive, atcode) { + drive.skip_code(2); + } else { + drive.failure(); + } + } + SreOpcode::BRANCH => op_branch(drive, stacks), + SreOpcode::CATEGORY => { + let catcode = SreCatCode::try_from(drive.peek_code(1)).unwrap(); + if drive.at_end() || !category(catcode, drive.peek_char()) { + drive.failure(); + } else { + drive.skip_code(2); + drive.skip_char(1); + } + } + SreOpcode::IN => general_op_in(drive, charset), + SreOpcode::IN_IGNORE => general_op_in(drive, |set, c| charset(set, lower_ascii(c))), + SreOpcode::IN_UNI_IGNORE => general_op_in(drive, |set, c| charset(set, lower_unicode(c))), + SreOpcode::IN_LOC_IGNORE => general_op_in(drive, charset_loc_ignore), + SreOpcode::INFO | SreOpcode::JUMP => drive.skip_code_from(1), + SreOpcode::LITERAL => general_op_literal(drive, |code, c| code == c), + SreOpcode::NOT_LITERAL => general_op_literal(drive, |code, c| code != c), + SreOpcode::LITERAL_IGNORE => general_op_literal(drive, |code, c| code == lower_ascii(c)), + SreOpcode::NOT_LITERAL_IGNORE => { + general_op_literal(drive, |code, c| code != lower_ascii(c)) + } + SreOpcode::LITERAL_UNI_IGNORE => { + general_op_literal(drive, |code, c| code == lower_unicode(c)) + } + SreOpcode::NOT_LITERAL_UNI_IGNORE => { + general_op_literal(drive, |code, c| code != lower_unicode(c)) + } + SreOpcode::LITERAL_LOC_IGNORE => general_op_literal(drive, char_loc_ignore), + SreOpcode::NOT_LITERAL_LOC_IGNORE => { + general_op_literal(drive, |code, c| !char_loc_ignore(code, c)) + } + SreOpcode::MARK => { + drive + .state + .set_mark(drive.peek_code(1) as usize, drive.ctx.string_position); + drive.skip_code(2); + } + SreOpcode::MAX_UNTIL => op_max_until(drive, stacks), + SreOpcode::MIN_UNTIL => op_min_until(drive, stacks), + SreOpcode::REPEAT => op_repeat(drive, stacks), + SreOpcode::REPEAT_ONE => op_repeat_one(drive, stacks), + SreOpcode::MIN_REPEAT_ONE => op_min_repeat_one(drive, stacks), + SreOpcode::GROUPREF => general_op_groupref(drive, |x| x), + SreOpcode::GROUPREF_IGNORE => general_op_groupref(drive, lower_ascii), + SreOpcode::GROUPREF_LOC_IGNORE => general_op_groupref(drive, lower_locate), + SreOpcode::GROUPREF_UNI_IGNORE => general_op_groupref(drive, lower_unicode), + SreOpcode::GROUPREF_EXISTS => { + let (group_start, group_end) = drive.state.get_marks(drive.peek_code(1) as usize); + match (group_start, group_end) { + (Some(start), Some(end)) if start <= end => { + drive.skip_code(3); + } + _ => drive.skip_code_from(2), + } + } + _ => unreachable!("unexpected opcode"), + } +} + +/* assert subpattern */ +/* */ +fn op_assert(drive: &mut StateContext) { + let back = drive.peek_code(2) as usize; + + if drive.ctx.string_position < back { + return drive.failure(); + } + + let offset = drive + .state + .string + .back_offset(drive.ctx.string_offset, back); + let position = drive.ctx.string_position - back; + + drive.state.string_position = position; + + let next_ctx = drive.next_ctx(3, |drive, _| { + if drive.popped_ctx().has_matched == Some(true) { + drive.ctx.handler = None; + drive.skip_code_from(1); + } else { + drive.failure(); + } + }); + next_ctx.string_position = position; + next_ctx.string_offset = offset; + next_ctx.toplevel = false; +} + +/* assert not subpattern */ +/* */ +fn op_assert_not(drive: &mut StateContext) { + let back = drive.peek_code(2) as usize; + + if drive.ctx.string_position < back { + return drive.skip_code_from(1); + } + + let offset = drive + .state + .string + .back_offset(drive.ctx.string_offset, back); + let position = drive.ctx.string_position - back; + + drive.state.string_position = position; + + let next_ctx = drive.next_ctx(3, |drive, _| { + if drive.popped_ctx().has_matched == Some(true) { + drive.failure(); + } else { + drive.ctx.handler = None; + drive.skip_code_from(1); + } + }); + next_ctx.string_position = position; + next_ctx.string_offset = offset; + next_ctx.toplevel = false; +} + +#[derive(Debug)] +struct BranchContext { + branch_offset: usize, +} + +// alternation +// <0=skip> code ... +fn op_branch(drive: &mut StateContext, stacks: &mut Stacks) { + drive.state.marks_push(); + stacks.branch.push(BranchContext { branch_offset: 1 }); + create_context(drive, stacks); + + fn create_context(drive: &mut StateContext, stacks: &mut Stacks) { + let branch_offset = stacks.branch_last().branch_offset; + let next_length = drive.peek_code(branch_offset) as usize; + if next_length == 0 { + drive.state.marks_pop_discard(); + stacks.branch.pop(); + return drive.failure(); + } + + drive.sync_string_position(); + + stacks.branch_last().branch_offset += next_length; + drive.next_ctx(branch_offset + 1, callback); + } + + fn callback(drive: &mut StateContext, stacks: &mut Stacks) { + if drive.popped_ctx().has_matched == Some(true) { + stacks.branch.pop(); + return drive.success(); + } + drive.state.marks_pop_keep(); + drive.ctx.handler = Some(create_context) + } +} + +#[derive(Debug, Copy, Clone)] +struct MinRepeatOneContext { + count: usize, + max_count: usize, +} + +/* <1=min> <2=max> item tail */ +fn op_min_repeat_one(drive: &mut StateContext, stacks: &mut Stacks) { + let min_count = drive.peek_code(2) as usize; + let max_count = drive.peek_code(3) as usize; + + if drive.remaining_chars() < min_count { + return drive.failure(); + } + + drive.sync_string_position(); + + let count = if min_count == 0 { + 0 + } else { + let count = count(drive, stacks, min_count); + if count < min_count { + return drive.failure(); + } + drive.skip_char(count); + count + }; + + let next_code = drive.peek_code(drive.peek_code(1) as usize + 1); + if next_code == SreOpcode::SUCCESS as u32 && drive.can_success() { + // tail is empty. we're finished + drive.sync_string_position(); + return drive.success(); + } + + drive.state.marks_push(); + stacks + .min_repeat_one + .push(MinRepeatOneContext { count, max_count }); + create_context(drive, stacks); + + fn create_context(drive: &mut StateContext, stacks: &mut Stacks) { + let MinRepeatOneContext { count, max_count } = *stacks.min_repeat_one_last(); + + if max_count == MAXREPEAT || count <= max_count { + drive.sync_string_position(); + drive.next_ctx_from(1, callback); + } else { + drive.state.marks_pop_discard(); + stacks.min_repeat_one.pop(); + drive.failure(); + } + } + + fn callback(drive: &mut StateContext, stacks: &mut Stacks) { + if drive.popped_ctx().has_matched == Some(true) { + stacks.min_repeat_one.pop(); + return drive.success(); + } + + drive.sync_string_position(); + + if crate::engine::count(drive, stacks, 1) == 0 { + drive.state.marks_pop_discard(); + stacks.min_repeat_one.pop(); + return drive.failure(); + } + + drive.skip_char(1); + stacks.min_repeat_one_last().count += 1; + drive.state.marks_pop_keep(); + create_context(drive, stacks); + } +} + +#[derive(Debug, Copy, Clone)] +struct RepeatOneContext { + count: usize, + min_count: usize, + following_literal: Option, +} + +/* match repeated sequence (maximizing regexp) */ + +/* this operator only works if the repeated item is +exactly one character wide, and we're not already +collecting backtracking points. for other cases, +use the MAX_REPEAT operator */ + +/* <1=min> <2=max> item tail */ +fn op_repeat_one(drive: &mut StateContext, stacks: &mut Stacks) { + let min_count = drive.peek_code(2) as usize; + let max_count = drive.peek_code(3) as usize; + + if drive.remaining_chars() < min_count { + return drive.failure(); + } + + drive.sync_string_position(); + + let count = count(drive, stacks, max_count); + drive.skip_char(count); + if count < min_count { + return drive.failure(); + } + + let next_code = drive.peek_code(drive.peek_code(1) as usize + 1); + if next_code == SreOpcode::SUCCESS as u32 && drive.can_success() { + // tail is empty. we're finished + drive.sync_string_position(); + return drive.success(); + } + + // Special case: Tail starts with a literal. Skip positions where + // the rest of the pattern cannot possibly match. + let following_literal = (next_code == SreOpcode::LITERAL as u32) + .then(|| drive.peek_code(drive.peek_code(1) as usize + 2)); + + drive.state.marks_push(); + stacks.repeat_one.push(RepeatOneContext { + count, + min_count, + following_literal, + }); + create_context(drive, stacks); + + fn create_context(drive: &mut StateContext, stacks: &mut Stacks) { + let RepeatOneContext { + mut count, + min_count, + following_literal, + } = *stacks.repeat_one_last(); + + if let Some(c) = following_literal { + while drive.at_end() || drive.peek_char() != c { + if count <= min_count { + drive.state.marks_pop_discard(); + stacks.repeat_one.pop(); + return drive.failure(); + } + drive.back_skip_char(1); + count -= 1; + } + } + stacks.repeat_one_last().count = count; + + drive.sync_string_position(); + + // General case: backtracking + drive.next_ctx_from(1, callback); + } + + fn callback(drive: &mut StateContext, stacks: &mut Stacks) { + if drive.popped_ctx().has_matched == Some(true) { + stacks.repeat_one.pop(); + return drive.success(); + } + + let RepeatOneContext { + count, + min_count, + following_literal: _, + } = stacks.repeat_one_last(); + + if count <= min_count { + drive.state.marks_pop_discard(); + stacks.repeat_one.pop(); + return drive.failure(); + } + + drive.back_skip_char(1); + *count -= 1; + + drive.state.marks_pop_keep(); + create_context(drive, stacks); + } +} + +#[derive(Debug, Clone, Copy)] +struct RepeatContext { + count: isize, + min_count: usize, + max_count: usize, + code_position: usize, + last_position: usize, + prev_id: usize, +} + +/* create repeat context. all the hard work is done +by the UNTIL operator (MAX_UNTIL, MIN_UNTIL) */ +/* <1=min> <2=max> item tail */ +fn op_repeat(drive: &mut StateContext, stacks: &mut Stacks) { + let repeat_ctx = RepeatContext { + count: -1, + min_count: drive.peek_code(2) as usize, + max_count: drive.peek_code(3) as usize, + code_position: drive.ctx.code_position, + last_position: std::usize::MAX, + prev_id: drive.ctx.repeat_ctx_id, + }; + + stacks.repeat.push(repeat_ctx); + + drive.sync_string_position(); + + let next_ctx = drive.next_ctx_from(1, |drive, stacks| { + drive.ctx.has_matched = drive.popped_ctx().has_matched; + stacks.repeat.pop(); + }); + next_ctx.repeat_ctx_id = stacks.repeat.len() - 1; +} + +#[derive(Debug, Clone, Copy)] +struct MinUntilContext { + count: isize, + save_repeat_ctx: Option, + save_last_position: usize, +} + +/* minimizing repeat */ +fn op_min_until(drive: &mut StateContext, stacks: &mut Stacks) { + let repeat_ctx = stacks.repeat.last_mut().unwrap(); + + drive.sync_string_position(); + + let count = repeat_ctx.count + 1; + + stacks.min_until.push(MinUntilContext { + count, + save_repeat_ctx: None, + save_last_position: repeat_ctx.last_position, + }); + + if (count as usize) < repeat_ctx.min_count { + // not enough matches + repeat_ctx.count = count; + drive.next_ctx_at(repeat_ctx.code_position + 4, |drive, stacks| { + if drive.popped_ctx().has_matched == Some(true) { + stacks.min_until.pop(); + return drive.success(); + } + + stacks.repeat_last().count = stacks.min_until_last().count - 1; + drive.sync_string_position(); + stacks.min_until.pop(); + drive.failure(); + }); + return; + } + + drive.state.marks_push(); + + // see if the tail matches + stacks.min_until_last().save_repeat_ctx = stacks.repeat.pop(); + + drive.next_ctx(1, |drive, stacks| { + let MinUntilContext { + count, + save_repeat_ctx, + save_last_position, + } = stacks.min_until_last(); + let count = *count; + + let mut repeat_ctx = save_repeat_ctx.take().unwrap(); + + if drive.popped_ctx().has_matched == Some(true) { + stacks.min_until.pop(); + // restore repeat before return + stacks.repeat.push(repeat_ctx); + return drive.success(); + } + + drive.sync_string_position(); + + drive.state.marks_pop(); + + // match more until tail matches + + if count as usize >= repeat_ctx.max_count && repeat_ctx.max_count != MAXREPEAT + || drive.state.string_position == repeat_ctx.last_position + { + stacks.min_until.pop(); + // restore repeat before return + stacks.repeat.push(repeat_ctx); + return drive.failure(); + } + + repeat_ctx.count = count; + /* zero-width match protection */ + *save_last_position = repeat_ctx.last_position; + repeat_ctx.last_position = drive.state.string_position; + + stacks.repeat.push(repeat_ctx); + + drive.next_ctx_at(repeat_ctx.code_position + 4, |drive, stacks| { + if drive.popped_ctx().has_matched == Some(true) { + stacks.min_until.pop(); + drive.success(); + } else { + stacks.repeat_last().count = stacks.min_until_last().count - 1; + drive.sync_string_position(); + stacks.min_until.pop(); + drive.failure(); + } + }); + }); +} + +#[derive(Debug, Clone, Copy)] +struct MaxUntilContext { + save_last_position: usize, +} + +/* maximizing repeat */ +fn op_max_until(drive: &mut StateContext, stacks: &mut Stacks) { + // let repeat_ctx = stacks.repeat.last_mut().unwrap(); + let repeat_ctx = &mut stacks.repeat[drive.ctx.repeat_ctx_id]; + + drive.sync_string_position(); + + repeat_ctx.count += 1; + + // let count = repeat_ctx.count + 1; + + if (repeat_ctx.count as usize) < repeat_ctx.min_count { + // not enough matches + // repeat_ctx.count = count; + drive.next_ctx_at(repeat_ctx.code_position + 4, |drive, stacks| { + if drive.popped_ctx().has_matched == Some(true) { + // stacks.max_until.pop(); + drive.success(); + } else { + // let count = stacks.max_until_last().count; + // stacks.repeat_last().count -= 1; + stacks.repeat[drive.ctx.repeat_ctx_id].count -= 1; + drive.sync_string_position(); + // stacks.max_until.pop(); + drive.failure(); + } + }); + return; + } + + stacks.max_until.push(MaxUntilContext { + save_last_position: repeat_ctx.last_position, + }); + + if ((repeat_ctx.count as usize) < repeat_ctx.max_count || repeat_ctx.max_count == MAXREPEAT) + && drive.state.string_position != repeat_ctx.last_position + { + /* we may have enough matches, but if we can + match another item, do so */ + repeat_ctx.last_position = drive.state.string_position; + + drive.state.marks_push(); + + drive.next_ctx_at(repeat_ctx.code_position + 4, |drive, stacks| { + let save_last_position = stacks.max_until_last().save_last_position; + let repeat_ctx = &mut stacks.repeat[drive.ctx.repeat_ctx_id]; + repeat_ctx.last_position = save_last_position; + if drive.popped_ctx().has_matched == Some(true) { + drive.state.marks_pop_discard(); + stacks.max_until.pop(); + return drive.success(); + } + drive.state.marks_pop(); + repeat_ctx.count -= 1; + drive.sync_string_position(); + + /* cannot match more repeated items here. make sure the + tail matches */ + let next_ctx = drive.next_ctx(1, tail_callback); + next_ctx.repeat_ctx_id = repeat_ctx.prev_id; + }); + return; + } + + /* cannot match more repeated items here. make sure the + tail matches */ + let next_ctx = drive.next_ctx(1, tail_callback); + next_ctx.repeat_ctx_id = repeat_ctx.prev_id; + + fn tail_callback(drive: &mut StateContext, stacks: &mut Stacks) { + stacks.max_until.pop(); + + if drive.popped_ctx().has_matched == Some(true) { + drive.success(); + } else { + drive.sync_string_position(); + drive.failure(); + } + } +} + +#[derive(Debug, Default)] +struct Stacks { + branch: Vec, + min_repeat_one: Vec, + repeat_one: Vec, + repeat: Vec, + min_until: Vec, + max_until: Vec, +} + +impl Stacks { + fn clear(&mut self) { + self.branch.clear(); + self.min_repeat_one.clear(); + self.repeat_one.clear(); + self.repeat.clear(); + self.min_until.clear(); + self.max_until.clear(); + } + + fn branch_last(&mut self) -> &mut BranchContext { + self.branch.last_mut().unwrap() + } + fn min_repeat_one_last(&mut self) -> &mut MinRepeatOneContext { + self.min_repeat_one.last_mut().unwrap() + } + fn repeat_one_last(&mut self) -> &mut RepeatOneContext { + self.repeat_one.last_mut().unwrap() + } + fn repeat_last(&mut self) -> &mut RepeatContext { + self.repeat.last_mut().unwrap() + } + fn min_until_last(&mut self) -> &mut MinUntilContext { + self.min_until.last_mut().unwrap() + } + fn max_until_last(&mut self) -> &mut MaxUntilContext { + self.max_until.last_mut().unwrap() + } +} + #[derive(Debug, Clone, Copy)] pub enum StrDrive<'a> { Str(&'a str), @@ -203,7 +857,7 @@ impl<'a> StrDrive<'a> { StrDrive::Str(s) => s .get(offset..) .and_then(|s| s.char_indices().nth(skip).map(|x| x.0 + offset)) - .unwrap_or_else(|| s.len()), + .unwrap_or(s.len()), StrDrive::Bytes(_) => offset + skip, } } @@ -264,31 +918,63 @@ impl<'a> StrDrive<'a> { } } -#[derive(Debug, Clone, Copy)] +type OpcodeHandler = fn(&mut StateContext, &mut Stacks); + +#[derive(Clone, Copy)] struct MatchContext { string_position: usize, string_offset: usize, code_position: usize, has_matched: Option, toplevel: bool, + handler: Option, + repeat_ctx_id: usize, } -trait MatchContextDrive { - fn ctx_mut(&mut self) -> &mut MatchContext; +impl std::fmt::Debug for MatchContext { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("MatchContext") + .field("string_position", &self.string_position) + .field("string_offset", &self.string_offset) + .field("code_position", &self.code_position) + .field("has_matched", &self.has_matched) + .field("toplevel", &self.toplevel) + .field("handler", &self.handler.map(|x| x as usize)) + .finish() + } +} + +trait ContextDrive { fn ctx(&self) -> &MatchContext; + fn ctx_mut(&mut self) -> &mut MatchContext; fn state(&self) -> &State; - fn repeat_ctx(&self) -> &RepeatContext { - self.state().repeat_stack.last().unwrap() + + fn popped_ctx(&self) -> &MatchContext { + self.state().popped_context.as_ref().unwrap() } + fn pattern(&self) -> &[u32] { &self.state().pattern_codes[self.ctx().code_position..] } + fn peek_char(&self) -> u32 { self.state().string.peek(self.ctx().string_offset) } fn peek_code(&self, peek: usize) -> u32 { self.state().pattern_codes[self.ctx().code_position + peek] } + + fn back_peek_char(&self) -> u32 { + self.state().string.back_peek(self.ctx().string_offset) + } + fn back_skip_char(&mut self, skip_count: usize) { + self.ctx_mut().string_position -= skip_count; + self.ctx_mut().string_offset = self + .state() + .string + .back_offset(self.ctx().string_offset, skip_count); + } + fn skip_char(&mut self, skip_count: usize) { self.ctx_mut().string_offset = self .state() @@ -299,12 +985,17 @@ trait MatchContextDrive { fn skip_code(&mut self, skip_count: usize) { self.ctx_mut().code_position += skip_count; } + fn skip_code_from(&mut self, peek: usize) { + self.skip_code(self.peek_code(peek) as usize + 1); + } + fn remaining_chars(&self) -> usize { self.state().end - self.ctx().string_position } fn remaining_codes(&self) -> usize { self.state().pattern_codes.len() - self.ctx().code_position } + fn at_beginning(&self) -> bool { // self.ctx().string_position == self.state().start self.ctx().string_position == 0 @@ -331,16 +1022,7 @@ trait MatchContextDrive { let this = !self.at_end() && word_checker(self.peek_char()); this == that } - fn back_peek_char(&self) -> u32 { - self.state().string.back_peek(self.ctx().string_offset) - } - fn back_skip_char(&mut self, skip_count: usize) { - self.ctx_mut().string_position -= skip_count; - self.ctx_mut().string_offset = self - .state() - .string - .back_offset(self.ctx().string_offset, skip_count); - } + fn can_success(&self) -> bool { if !self.ctx().toplevel { return true; @@ -353,389 +1035,71 @@ trait MatchContextDrive { } true } -} -struct StackDrive<'a> { - state: State<'a>, - ctx_id: usize, -} -impl<'a> StackDrive<'a> { - fn id(&self) -> usize { - self.ctx_id - } - fn drive(ctx_id: usize, state: State<'a>) -> Self { - Self { state, ctx_id } - } - fn take(self) -> State<'a> { - self.state - } - fn push_new_context(&mut self, pattern_offset: usize) { - self.push_new_context_at(self.ctx().code_position + pattern_offset); - } - fn push_new_context_at(&mut self, code_position: usize) { - let mut child_ctx = MatchContext { ..*self.ctx() }; - child_ctx.code_position = code_position; - self.state.context_stack.push(child_ctx); - } - fn repeat_ctx_mut(&mut self) -> &mut RepeatContext { - self.state.repeat_stack.last_mut().unwrap() - } -} -impl MatchContextDrive for StackDrive<'_> { - fn ctx_mut(&mut self) -> &mut MatchContext { - &mut self.state.context_stack[self.ctx_id] + fn success(&mut self) { + self.ctx_mut().has_matched = Some(true); } - fn ctx(&self) -> &MatchContext { - &self.state.context_stack[self.ctx_id] - } - fn state(&self) -> &State { - &self.state + + fn failure(&mut self) { + self.ctx_mut().has_matched = Some(false); } } -struct WrapDrive<'a> { - stack_drive: &'a StackDrive<'a>, +struct StateContext<'a> { + state: State<'a>, ctx: MatchContext, + next_ctx: Option, } -impl<'a> WrapDrive<'a> { - fn drive(ctx: MatchContext, stack_drive: &'a StackDrive<'a>) -> Self { - Self { stack_drive, ctx } + +impl ContextDrive for StateContext<'_> { + fn ctx(&self) -> &MatchContext { + &self.ctx } -} -impl MatchContextDrive for WrapDrive<'_> { fn ctx_mut(&mut self) -> &mut MatchContext { &mut self.ctx } - fn ctx(&self) -> &MatchContext { - &self.ctx - } fn state(&self) -> &State { - self.stack_drive.state() + &self.state } } -trait OpcodeExecutor { - fn next(&mut self, drive: &mut StackDrive) -> Option<()>; -} +impl StateContext<'_> { + fn next_ctx_from(&mut self, peek: usize, handler: OpcodeHandler) -> &mut MatchContext { + self.next_ctx(self.peek_code(peek) as usize + 1, handler) + } + fn next_ctx(&mut self, offset: usize, handler: OpcodeHandler) -> &mut MatchContext { + self.next_ctx_at(self.ctx.code_position + offset, handler) + } + fn next_ctx_at(&mut self, code_position: usize, handler: OpcodeHandler) -> &mut MatchContext { + self.next_ctx = Some(MatchContext { + code_position, + has_matched: None, + handler: None, + ..self.ctx + }); + self.ctx.handler = Some(handler); + self.next_ctx.as_mut().unwrap() + } -struct OpTwice { - f1: Option, - f2: Option, -} -impl OpcodeExecutor for OpTwice -where - F1: FnOnce(&mut StackDrive) -> Option<()>, - F2: FnOnce(&mut StackDrive), -{ - fn next(&mut self, drive: &mut StackDrive) -> Option<()> { - if let Some(f1) = self.f1.take() { - f1(drive) - } else if let Some(f2) = self.f2.take() { - f2(drive); - None - } else { - unreachable!() - } + fn sync_string_position(&mut self) { + self.state.string_position = self.ctx.string_position; } } -fn twice(f1: F1, f2: F2) -> Box> -where - F1: FnOnce(&mut StackDrive) -> Option<()>, - F2: FnOnce(&mut StackDrive), -{ - Box::new(OpTwice { - f1: Some(f1), - f2: Some(f2), - }) -} -struct OpcodeDispatcher { - executing_contexts: BTreeMap>, +struct StateRefContext<'a> { + entity: &'a StateContext<'a>, + ctx: MatchContext, } -impl OpcodeDispatcher { - fn new() -> Self { - Self { - executing_contexts: BTreeMap::new(), - } - } - fn clear(&mut self) { - self.executing_contexts.clear(); - } - // Returns True if the current context matches, False if it doesn't and - // None if matching is not finished, ie must be resumed after child - // contexts have been matched. - fn pymatch(&mut self, drive: &mut StackDrive) -> Option { - while drive.remaining_codes() > 0 && drive.ctx().has_matched.is_none() { - let code = drive.peek_code(0); - let opcode = SreOpcode::try_from(code).unwrap(); - if !self.dispatch(opcode, drive) { - return None; - } - } - match drive.ctx().has_matched { - Some(matched) => Some(matched), - None => { - drive.ctx_mut().has_matched = Some(false); - Some(false) - } - } - } - // Dispatches a context on a given opcode. Returns True if the context - // is done matching, False if it must be resumed when next encountered. - fn dispatch(&mut self, opcode: SreOpcode, drive: &mut StackDrive) -> bool { - let executor = self - .executing_contexts - .remove(&drive.id()) - .or_else(|| self.dispatch_table(opcode, drive)); - if let Some(mut executor) = executor { - if let Some(()) = executor.next(drive) { - self.executing_contexts.insert(drive.id(), executor); - return false; - } - } - true +impl ContextDrive for StateRefContext<'_> { + fn ctx(&self) -> &MatchContext { + &self.ctx } - - fn dispatch_table( - &mut self, - opcode: SreOpcode, - drive: &mut StackDrive, - ) -> Option> { - match opcode { - SreOpcode::FAILURE => { - drive.ctx_mut().has_matched = Some(false); - None - } - SreOpcode::SUCCESS => { - drive.ctx_mut().has_matched = Some(drive.can_success()); - if drive.ctx().has_matched == Some(true) { - drive.state.string_position = drive.ctx().string_position; - } - None - } - SreOpcode::ANY => { - if drive.at_end() || drive.at_linebreak() { - drive.ctx_mut().has_matched = Some(false); - } else { - drive.skip_code(1); - drive.skip_char(1); - } - None - } - SreOpcode::ANY_ALL => { - if drive.at_end() { - drive.ctx_mut().has_matched = Some(false); - } else { - drive.skip_code(1); - drive.skip_char(1); - } - None - } - /* assert subpattern */ - /* */ - SreOpcode::ASSERT => Some(twice( - |drive| { - let back = drive.peek_code(2) as usize; - let passed = drive.ctx().string_position; - if passed < back { - drive.ctx_mut().has_matched = Some(false); - return None; - } - let back_offset = drive - .state - .string - .back_offset(drive.ctx().string_offset, back); - - drive.state.string_position = drive.ctx().string_position - back; - - drive.push_new_context(3); - let child_ctx = drive.state.context_stack.last_mut().unwrap(); - child_ctx.toplevel = false; - child_ctx.string_position -= back; - child_ctx.string_offset = back_offset; - - Some(()) - }, - |drive| { - let child_ctx = drive.state.popped_context.unwrap(); - if child_ctx.has_matched == Some(true) { - drive.skip_code(drive.peek_code(1) as usize + 1); - } else { - drive.ctx_mut().has_matched = Some(false); - } - }, - )), - SreOpcode::ASSERT_NOT => Some(twice( - |drive| { - let back = drive.peek_code(2) as usize; - let passed = drive.ctx().string_position; - if passed < back { - drive.skip_code(drive.peek_code(1) as usize + 1); - return None; - } - let back_offset = drive - .state - .string - .back_offset(drive.ctx().string_offset, back); - - drive.state.string_position = drive.ctx().string_position - back; - - drive.push_new_context(3); - let child_ctx = drive.state.context_stack.last_mut().unwrap(); - child_ctx.toplevel = false; - child_ctx.string_position -= back; - child_ctx.string_offset = back_offset; - - Some(()) - }, - |drive| { - let child_ctx = drive.state.popped_context.unwrap(); - if child_ctx.has_matched == Some(true) { - drive.ctx_mut().has_matched = Some(false); - } else { - drive.skip_code(drive.peek_code(1) as usize + 1); - } - }, - )), - SreOpcode::AT => { - let atcode = SreAtCode::try_from(drive.peek_code(1)).unwrap(); - if !at(drive, atcode) { - drive.ctx_mut().has_matched = Some(false); - } else { - drive.skip_code(2); - } - None - } - SreOpcode::BRANCH => Some(Box::new(OpBranch::default())), - SreOpcode::CATEGORY => { - let catcode = SreCatCode::try_from(drive.peek_code(1)).unwrap(); - if drive.at_end() || !category(catcode, drive.peek_char()) { - drive.ctx_mut().has_matched = Some(false); - } else { - drive.skip_code(2); - drive.skip_char(1); - } - None - } - SreOpcode::IN => { - general_op_in(drive, |set, c| charset(set, c)); - None - } - SreOpcode::IN_IGNORE => { - general_op_in(drive, |set, c| charset(set, lower_ascii(c))); - None - } - SreOpcode::IN_UNI_IGNORE => { - general_op_in(drive, |set, c| charset(set, lower_unicode(c))); - None - } - SreOpcode::IN_LOC_IGNORE => { - general_op_in(drive, |set, c| charset_loc_ignore(set, c)); - None - } - SreOpcode::INFO | SreOpcode::JUMP => { - drive.skip_code(drive.peek_code(1) as usize + 1); - None - } - SreOpcode::LITERAL => { - general_op_literal(drive, |code, c| code == c); - None - } - SreOpcode::NOT_LITERAL => { - general_op_literal(drive, |code, c| code != c); - None - } - SreOpcode::LITERAL_IGNORE => { - general_op_literal(drive, |code, c| code == lower_ascii(c)); - None - } - SreOpcode::NOT_LITERAL_IGNORE => { - general_op_literal(drive, |code, c| code != lower_ascii(c)); - None - } - SreOpcode::LITERAL_UNI_IGNORE => { - general_op_literal(drive, |code, c| code == lower_unicode(c)); - None - } - SreOpcode::NOT_LITERAL_UNI_IGNORE => { - general_op_literal(drive, |code, c| code != lower_unicode(c)); - None - } - SreOpcode::LITERAL_LOC_IGNORE => { - general_op_literal(drive, char_loc_ignore); - None - } - SreOpcode::NOT_LITERAL_LOC_IGNORE => { - general_op_literal(drive, |code, c| !char_loc_ignore(code, c)); - None - } - SreOpcode::MARK => { - drive - .state - .set_mark(drive.peek_code(1) as usize, drive.ctx().string_position); - drive.skip_code(2); - None - } - SreOpcode::REPEAT => Some(twice( - // create repeat context. all the hard work is done by the UNTIL - // operator (MAX_UNTIL, MIN_UNTIL) - // <1=min> <2=max> item tail - |drive| { - let repeat = RepeatContext { - count: -1, - code_position: drive.ctx().code_position, - last_position: std::usize::MAX, - mincount: drive.peek_code(2) as usize, - maxcount: drive.peek_code(3) as usize, - }; - drive.state.repeat_stack.push(repeat); - drive.state.string_position = drive.ctx().string_position; - // execute UNTIL operator - drive.push_new_context(drive.peek_code(1) as usize + 1); - Some(()) - }, - |drive| { - drive.state.repeat_stack.pop(); - let child_ctx = drive.state.popped_context.unwrap(); - drive.ctx_mut().has_matched = child_ctx.has_matched; - }, - )), - SreOpcode::MAX_UNTIL => Some(Box::new(OpMaxUntil::default())), - SreOpcode::MIN_UNTIL => Some(Box::new(OpMinUntil::default())), - SreOpcode::REPEAT_ONE => Some(Box::new(OpRepeatOne::default())), - SreOpcode::MIN_REPEAT_ONE => Some(Box::new(OpMinRepeatOne::default())), - SreOpcode::GROUPREF => { - general_op_groupref(drive, |x| x); - None - } - SreOpcode::GROUPREF_IGNORE => { - general_op_groupref(drive, lower_ascii); - None - } - SreOpcode::GROUPREF_LOC_IGNORE => { - general_op_groupref(drive, lower_locate); - None - } - SreOpcode::GROUPREF_UNI_IGNORE => { - general_op_groupref(drive, lower_unicode); - None - } - SreOpcode::GROUPREF_EXISTS => { - let (group_start, group_end) = drive.state.get_marks(drive.peek_code(1) as usize); - match (group_start, group_end) { - (Some(start), Some(end)) if start <= end => { - drive.skip_code(3); - } - _ => drive.skip_code(drive.peek_code(2) as usize + 1), - } - None - } - _ => { - // TODO python expcetion? - unreachable!("unexpected opcode") - } - } + fn ctx_mut(&mut self) -> &mut MatchContext { + &mut self.ctx + } + fn state(&self) -> &State { + &self.entity.state } } @@ -752,60 +1116,63 @@ fn charset_loc_ignore(set: &[u32], c: u32) -> bool { up != lo && charset(set, up) } -fn general_op_groupref u32>(drive: &mut StackDrive, mut f: F) { +fn general_op_groupref u32>(drive: &mut StateContext, mut f: F) { let (group_start, group_end) = drive.state.get_marks(drive.peek_code(1) as usize); let (group_start, group_end) = match (group_start, group_end) { (Some(start), Some(end)) if start <= end => (start, end), _ => { - drive.ctx_mut().has_matched = Some(false); - return; + return drive.failure(); } }; - let mut wdrive = WrapDrive::drive(*drive.ctx(), &drive); - let mut gdrive = WrapDrive::drive( - MatchContext { + + let mut wdrive = StateRefContext { + entity: drive, + ctx: drive.ctx, + }; + let mut gdrive = StateRefContext { + entity: drive, + ctx: MatchContext { string_position: group_start, // TODO: cache the offset string_offset: drive.state.string.offset(0, group_start), - ..*drive.ctx() + ..drive.ctx }, - &drive, - ); + }; + for _ in group_start..group_end { if wdrive.at_end() || f(wdrive.peek_char()) != f(gdrive.peek_char()) { - drive.ctx_mut().has_matched = Some(false); - return; + return drive.failure(); } wdrive.skip_char(1); gdrive.skip_char(1); } - let position = wdrive.ctx().string_position; - let offset = wdrive.ctx().string_offset; + + let position = wdrive.ctx.string_position; + let offset = wdrive.ctx.string_offset; drive.skip_code(2); - drive.ctx_mut().string_position = position; - drive.ctx_mut().string_offset = offset; + drive.ctx.string_position = position; + drive.ctx.string_offset = offset; } -fn general_op_literal bool>(drive: &mut StackDrive, f: F) { +fn general_op_literal bool>(drive: &mut StateContext, f: F) { if drive.at_end() || !f(drive.peek_code(1), drive.peek_char()) { - drive.ctx_mut().has_matched = Some(false); + drive.failure(); } else { drive.skip_code(2); drive.skip_char(1); } } -fn general_op_in bool>(drive: &mut StackDrive, f: F) { - let skip = drive.peek_code(1) as usize; +fn general_op_in bool>(drive: &mut StateContext, f: F) { if drive.at_end() || !f(&drive.pattern()[2..], drive.peek_char()) { - drive.ctx_mut().has_matched = Some(false); + drive.failure(); } else { - drive.skip_code(skip + 1); + drive.skip_code_from(1); drive.skip_char(1); } } -fn at(drive: &StackDrive, atcode: SreAtCode) -> bool { +fn at(drive: &StateContext, atcode: SreAtCode) -> bool { match atcode { SreAtCode::BEGINNING | SreAtCode::BEGINNING_STRING => drive.at_beginning(), SreAtCode::BEGINNING_LINE => drive.at_beginning() || is_linebreak(drive.back_peek_char()), @@ -938,84 +1305,91 @@ fn charset(set: &[u32], ch: u32) -> bool { } /* General case */ -fn general_count(drive: &mut StackDrive, maxcount: usize) -> usize { +fn general_count(drive: &mut StateContext, stacks: &mut Stacks, max_count: usize) -> usize { let mut count = 0; - let maxcount = std::cmp::min(maxcount, drive.remaining_chars()); + let max_count = std::cmp::min(max_count, drive.remaining_chars()); - let save_ctx = *drive.ctx(); + let save_ctx = drive.ctx; drive.skip_code(4); - let reset_position = drive.ctx().code_position; - - let mut dispatcher = OpcodeDispatcher::new(); - while count < maxcount { - drive.ctx_mut().code_position = reset_position; - dispatcher.dispatch(SreOpcode::try_from(drive.peek_code(0)).unwrap(), drive); - if drive.ctx().has_matched == Some(false) { + let reset_position = drive.ctx.code_position; + + while count < max_count { + drive.ctx.code_position = reset_position; + let code = drive.peek_code(0); + let code = SreOpcode::try_from(code).unwrap(); + dispatch(code, drive, stacks); + if drive.ctx.has_matched == Some(false) { break; } count += 1; } - *drive.ctx_mut() = save_ctx; + drive.ctx = save_ctx; count } -fn count(stack_drive: &mut StackDrive, maxcount: usize) -> usize { - let mut drive = WrapDrive::drive(*stack_drive.ctx(), stack_drive); - let maxcount = std::cmp::min(maxcount, drive.remaining_chars()); - let end = drive.ctx().string_position + maxcount; +fn count(drive: &mut StateContext, stacks: &mut Stacks, max_count: usize) -> usize { + let save_ctx = drive.ctx; + let max_count = std::cmp::min(max_count, drive.remaining_chars()); + let end = drive.ctx.string_position + max_count; let opcode = SreOpcode::try_from(drive.peek_code(0)).unwrap(); match opcode { SreOpcode::ANY => { - while !drive.ctx().string_position < end && !drive.at_linebreak() { + while !drive.ctx.string_position < end && !drive.at_linebreak() { drive.skip_char(1); } } SreOpcode::ANY_ALL => { - drive.skip_char(maxcount); + drive.skip_char(max_count); } SreOpcode::IN => { - while !drive.ctx().string_position < end + while !drive.ctx.string_position < end && charset(&drive.pattern()[2..], drive.peek_char()) { drive.skip_char(1); } } SreOpcode::LITERAL => { - general_count_literal(&mut drive, end, |code, c| code == c as u32); + general_count_literal(drive, end, |code, c| code == c as u32); } SreOpcode::NOT_LITERAL => { - general_count_literal(&mut drive, end, |code, c| code != c as u32); + general_count_literal(drive, end, |code, c| code != c as u32); } SreOpcode::LITERAL_IGNORE => { - general_count_literal(&mut drive, end, |code, c| code == lower_ascii(c) as u32); + general_count_literal(drive, end, |code, c| code == lower_ascii(c) as u32); } SreOpcode::NOT_LITERAL_IGNORE => { - general_count_literal(&mut drive, end, |code, c| code != lower_ascii(c) as u32); + general_count_literal(drive, end, |code, c| code != lower_ascii(c) as u32); } SreOpcode::LITERAL_LOC_IGNORE => { - general_count_literal(&mut drive, end, char_loc_ignore); + general_count_literal(drive, end, char_loc_ignore); } SreOpcode::NOT_LITERAL_LOC_IGNORE => { - general_count_literal(&mut drive, end, |code, c| !char_loc_ignore(code, c)); + general_count_literal(drive, end, |code, c| !char_loc_ignore(code, c)); } SreOpcode::LITERAL_UNI_IGNORE => { - general_count_literal(&mut drive, end, |code, c| code == lower_unicode(c) as u32); + general_count_literal(drive, end, |code, c| code == lower_unicode(c) as u32); } SreOpcode::NOT_LITERAL_UNI_IGNORE => { - general_count_literal(&mut drive, end, |code, c| code != lower_unicode(c) as u32); + general_count_literal(drive, end, |code, c| code != lower_unicode(c) as u32); } _ => { - return general_count(stack_drive, maxcount); + return general_count(drive, stacks, max_count); } } - drive.ctx().string_position - drive.state().string_position + let count = drive.ctx.string_position - drive.state.string_position; + drive.ctx = save_ctx; + count } -fn general_count_literal bool>(drive: &mut WrapDrive, end: usize, mut f: F) { +fn general_count_literal bool>( + drive: &mut StateContext, + end: usize, + mut f: F, +) { let ch = drive.peek_code(1); - while !drive.ctx().string_position < end && f(ch, drive.peek_char()) { + while !drive.ctx.string_position < end && f(ch, drive.peek_char()) { drive.skip_char(1); } } @@ -1065,7 +1439,9 @@ fn upper_locate(ch: u32) -> u32 { } fn is_uni_digit(ch: u32) -> bool { // TODO: check with cpython - char::try_from(ch).map(|x| x.is_digit(10)).unwrap_or(false) + char::try_from(ch) + .map(|x| x.is_ascii_digit()) + .unwrap_or(false) } fn is_uni_space(ch: u32) -> bool { // TODO: check with cpython @@ -1155,413 +1531,3 @@ fn utf8_back_peek_offset(bytes: &[u8], offset: usize) -> usize { } offset } - -#[derive(Debug, Copy, Clone)] -struct RepeatContext { - count: isize, - code_position: usize, - // zero-width match protection - last_position: usize, - mincount: usize, - maxcount: usize, -} - -#[derive(Default)] -struct OpMinRepeatOne { - jump_id: usize, - mincount: usize, - maxcount: usize, - count: usize, -} -impl OpcodeExecutor for OpMinRepeatOne { - /* <1=min> <2=max> item tail */ - fn next(&mut self, drive: &mut StackDrive) -> Option<()> { - match self.jump_id { - 0 => { - self.mincount = drive.peek_code(2) as usize; - self.maxcount = drive.peek_code(3) as usize; - - if drive.remaining_chars() < self.mincount { - drive.ctx_mut().has_matched = Some(false); - return None; - } - - drive.state.string_position = drive.ctx().string_position; - - self.count = if self.mincount == 0 { - 0 - } else { - let count = count(drive, self.mincount); - if count < self.mincount { - drive.ctx_mut().has_matched = Some(false); - return None; - } - drive.skip_char(count); - count - }; - - let next_code = drive.peek_code(drive.peek_code(1) as usize + 1); - if next_code == SreOpcode::SUCCESS as u32 && drive.can_success() { - // tail is empty. we're finished - drive.state.string_position = drive.ctx().string_position; - drive.ctx_mut().has_matched = Some(true); - return None; - } - - drive.state.marks_push(); - self.jump_id = 1; - self.next(drive) - } - 1 => { - if self.maxcount == MAXREPEAT || self.count <= self.maxcount { - drive.state.string_position = drive.ctx().string_position; - drive.push_new_context(drive.peek_code(1) as usize + 1); - self.jump_id = 2; - return Some(()); - } - - drive.state.marks_pop_discard(); - drive.ctx_mut().has_matched = Some(false); - None - } - 2 => { - let child_ctx = drive.state.popped_context.unwrap(); - if child_ctx.has_matched == Some(true) { - drive.ctx_mut().has_matched = Some(true); - return None; - } - drive.state.string_position = drive.ctx().string_position; - if count(drive, 1) == 0 { - drive.ctx_mut().has_matched = Some(false); - return None; - } - drive.skip_char(1); - self.count += 1; - drive.state.marks_pop_keep(); - self.jump_id = 1; - self.next(drive) - } - _ => unreachable!(), - } - } -} - -#[derive(Default)] -struct OpMaxUntil { - jump_id: usize, - count: isize, - save_last_position: usize, -} -impl OpcodeExecutor for OpMaxUntil { - fn next(&mut self, drive: &mut StackDrive) -> Option<()> { - match self.jump_id { - 0 => { - let RepeatContext { - count, - code_position, - last_position, - mincount, - maxcount, - } = *drive.repeat_ctx(); - - drive.state.string_position = drive.ctx().string_position; - self.count = count + 1; - - if (self.count as usize) < mincount { - // not enough matches - drive.repeat_ctx_mut().count = self.count; - drive.push_new_context_at(code_position + 4); - self.jump_id = 1; - return Some(()); - } - - if ((self.count as usize) < maxcount || maxcount == MAXREPEAT) - && drive.state.string_position != last_position - { - // we may have enough matches, if we can match another item, do so - drive.repeat_ctx_mut().count = self.count; - drive.state.marks_push(); - self.save_last_position = last_position; - drive.repeat_ctx_mut().last_position = drive.state.string_position; - drive.push_new_context_at(code_position + 4); - self.jump_id = 2; - return Some(()); - } - - self.jump_id = 3; - self.next(drive) - } - 1 => { - let child_ctx = drive.state.popped_context.unwrap(); - drive.ctx_mut().has_matched = child_ctx.has_matched; - if drive.ctx().has_matched != Some(true) { - drive.repeat_ctx_mut().count = self.count - 1; - drive.state.string_position = drive.ctx().string_position; - } - None - } - 2 => { - drive.repeat_ctx_mut().last_position = self.save_last_position; - let child_ctx = drive.state.popped_context.unwrap(); - if child_ctx.has_matched == Some(true) { - drive.state.marks_pop_discard(); - drive.ctx_mut().has_matched = Some(true); - return None; - } - drive.state.marks_pop(); - drive.repeat_ctx_mut().count = self.count - 1; - drive.state.string_position = drive.ctx().string_position; - self.jump_id = 3; - self.next(drive) - } - 3 => { - // cannot match more repeated items here. make sure the tail matches - drive.push_new_context(1); - self.jump_id = 4; - Some(()) - } - 4 => { - let child_ctx = drive.state.popped_context.unwrap(); - drive.ctx_mut().has_matched = child_ctx.has_matched; - if drive.ctx().has_matched != Some(true) { - drive.state.string_position = drive.ctx().string_position; - } - None - } - _ => unreachable!(), - } - } -} - -#[derive(Default)] -struct OpMinUntil { - jump_id: usize, - count: isize, - save_repeat: Option, - save_last_position: usize, -} -impl OpcodeExecutor for OpMinUntil { - fn next(&mut self, drive: &mut StackDrive) -> Option<()> { - match self.jump_id { - 0 => { - let RepeatContext { - count, - code_position, - last_position: _, - mincount, - maxcount: _, - } = *drive.repeat_ctx(); - drive.state.string_position = drive.ctx().string_position; - self.count = count + 1; - - if (self.count as usize) < mincount { - // not enough matches - drive.repeat_ctx_mut().count = self.count; - drive.push_new_context_at(code_position + 4); - self.jump_id = 1; - return Some(()); - } - - // see if the tail matches - drive.state.marks_push(); - self.save_repeat = drive.state.repeat_stack.pop(); - drive.push_new_context(1); - self.jump_id = 2; - Some(()) - } - 1 => { - let child_ctx = drive.state.popped_context.unwrap(); - drive.ctx_mut().has_matched = child_ctx.has_matched; - if drive.ctx().has_matched != Some(true) { - drive.repeat_ctx_mut().count = self.count - 1; - drive.repeat_ctx_mut().last_position = self.save_last_position; - drive.state.string_position = drive.ctx().string_position; - } - None - } - 2 => { - // restore repeat before return - drive.state.repeat_stack.push(self.save_repeat.unwrap()); - - let child_ctx = drive.state.popped_context.unwrap(); - if child_ctx.has_matched == Some(true) { - drive.ctx_mut().has_matched = Some(true); - return None; - } - drive.state.string_position = drive.ctx().string_position; - drive.state.marks_pop(); - - // match more until tail matches - let RepeatContext { - count: _, - code_position, - last_position, - mincount: _, - maxcount, - } = *drive.repeat_ctx(); - - if self.count as usize >= maxcount && maxcount != MAXREPEAT - || drive.state.string_position == last_position - { - drive.ctx_mut().has_matched = Some(false); - return None; - } - drive.repeat_ctx_mut().count = self.count; - - /* zero-width match protection */ - self.save_last_position = last_position; - drive.repeat_ctx_mut().last_position = drive.state.string_position; - - drive.push_new_context_at(code_position + 4); - self.jump_id = 1; - Some(()) - } - _ => unreachable!(), - } - } -} - -#[derive(Default)] -struct OpBranch { - jump_id: usize, - branch_offset: usize, -} -impl OpcodeExecutor for OpBranch { - // alternation - // <0=skip> code ... - fn next(&mut self, drive: &mut StackDrive) -> Option<()> { - match self.jump_id { - 0 => { - drive.state.marks_push(); - // jump out the head - self.branch_offset = 1; - self.jump_id = 1; - self.next(drive) - } - 1 => { - let next_branch_length = drive.peek_code(self.branch_offset) as usize; - if next_branch_length == 0 { - drive.state.marks_pop_discard(); - drive.ctx_mut().has_matched = Some(false); - return None; - } - drive.state.string_position = drive.ctx().string_position; - drive.push_new_context(self.branch_offset + 1); - self.branch_offset += next_branch_length; - self.jump_id = 2; - Some(()) - } - 2 => { - let child_ctx = drive.state.popped_context.unwrap(); - if child_ctx.has_matched == Some(true) { - drive.ctx_mut().has_matched = Some(true); - return None; - } - drive.state.marks_pop_keep(); - self.jump_id = 1; - Some(()) - } - _ => unreachable!(), - } - } -} - -#[derive(Default)] -struct OpRepeatOne { - jump_id: usize, - mincount: usize, - maxcount: usize, - count: usize, - following_literal: Option, -} -impl OpcodeExecutor for OpRepeatOne { - /* match repeated sequence (maximizing regexp) */ - - /* this operator only works if the repeated item is - exactly one character wide, and we're not already - collecting backtracking points. for other cases, - use the MAX_REPEAT operator */ - - /* <1=min> <2=max> item tail */ - fn next(&mut self, drive: &mut StackDrive) -> Option<()> { - match self.jump_id { - 0 => { - self.mincount = drive.peek_code(2) as usize; - self.maxcount = drive.peek_code(3) as usize; - - if drive.remaining_chars() < self.mincount { - drive.ctx_mut().has_matched = Some(false); - return None; - } - - drive.state.string_position = drive.ctx().string_position; - - self.count = count(drive, self.maxcount); - drive.skip_char(self.count); - if self.count < self.mincount { - drive.ctx_mut().has_matched = Some(false); - return None; - } - - let next_code = drive.peek_code(drive.peek_code(1) as usize + 1); - if next_code == SreOpcode::SUCCESS as u32 && drive.can_success() { - // tail is empty. we're finished - drive.state.string_position = drive.ctx().string_position; - drive.ctx_mut().has_matched = Some(true); - return None; - } - - drive.state.marks_push(); - - // Special case: Tail starts with a literal. Skip positions where - // the rest of the pattern cannot possibly match. - if next_code == SreOpcode::LITERAL as u32 { - self.following_literal = Some(drive.peek_code(drive.peek_code(1) as usize + 2)) - } - - self.jump_id = 1; - self.next(drive) - } - 1 => { - if let Some(c) = self.following_literal { - while drive.at_end() || drive.peek_char() != c { - if self.count <= self.mincount { - drive.state.marks_pop_discard(); - drive.ctx_mut().has_matched = Some(false); - return None; - } - drive.back_skip_char(1); - self.count -= 1; - } - } - - // General case: backtracking - drive.state.string_position = drive.ctx().string_position; - drive.push_new_context(drive.peek_code(1) as usize + 1); - self.jump_id = 2; - Some(()) - } - 2 => { - let child_ctx = drive.state.popped_context.unwrap(); - if child_ctx.has_matched == Some(true) { - drive.ctx_mut().has_matched = Some(true); - return None; - } - if self.count <= self.mincount { - drive.state.marks_pop_discard(); - drive.ctx_mut().has_matched = Some(false); - return None; - } - - drive.back_skip_char(1); - self.count -= 1; - - drive.state.marks_pop_keep(); - - self.jump_id = 1; - self.next(drive) - } - _ => unreachable!(), - } - } -} diff --git a/src/lib.rs b/src/lib.rs index 4a3ed1b754..c23e807501 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -4,7 +4,7 @@ pub mod engine; pub const CODESIZE: usize = 4; #[cfg(target_pointer_width = "32")] -pub const MAXREPEAT: usize = usize::MAX; +pub const MAXREPEAT: usize = usize::MAX - 1; #[cfg(target_pointer_width = "64")] pub const MAXREPEAT: usize = u32::MAX as usize; diff --git a/tests/tests.rs b/tests/tests.rs index b430947a9b..e8ae487029 100644 --- a/tests/tests.rs +++ b/tests/tests.rs @@ -70,4 +70,15 @@ fn test_repeat_context_panic() { let mut state = p.state("axxzaz", 0..usize::MAX); state = state.pymatch(); assert!(state.marks == vec![Some(1), Some(3)]); -} \ No newline at end of file +} + +#[test] +fn test_double_max_until() { + // pattern p = re.compile(r'((1)?)*') + // START GENERATED by generate_tests.py + #[rustfmt::skip] let p = Pattern { code: &[15, 4, 0, 0, 4294967295, 24, 18, 0, 4294967295, 18, 0, 24, 9, 0, 1, 18, 2, 17, 49, 18, 3, 19, 18, 1, 19, 1], flags: SreFlag::from_bits_truncate(32) }; + // END GENERATED + let mut state = p.state("1111", 0..usize::MAX); + state = state.pymatch(); + assert!(state.string_position == 4); +} From 4007f8276550efb6aa1ada429a3e89c042eae86c Mon Sep 17 00:00:00 2001 From: Kangzhi Shi Date: Wed, 27 Jul 2022 21:33:20 +0200 Subject: [PATCH 052/960] optimize max_until and min_until --- src/engine.rs | 96 +++++++++++++++------------------------------------ 1 file changed, 27 insertions(+), 69 deletions(-) diff --git a/src/engine.rs b/src/engine.rs index 81903ccfdd..223aa3425c 100644 --- a/src/engine.rs +++ b/src/engine.rs @@ -420,7 +420,7 @@ fn op_min_repeat_one(drive: &mut StateContext, stacks: &mut Stacks) { let count = if min_count == 0 { 0 } else { - let count = count(drive, stacks, min_count); + let count = _count(drive, stacks, min_count); if count < min_count { return drive.failure(); } @@ -462,7 +462,7 @@ fn op_min_repeat_one(drive: &mut StateContext, stacks: &mut Stacks) { drive.sync_string_position(); - if crate::engine::count(drive, stacks, 1) == 0 { + if _count(drive, stacks, 1) == 0 { drive.state.marks_pop_discard(); stacks.min_repeat_one.pop(); return drive.failure(); @@ -500,7 +500,7 @@ fn op_repeat_one(drive: &mut StateContext, stacks: &mut Stacks) { drive.sync_string_position(); - let count = count(drive, stacks, max_count); + let count = _count(drive, stacks, max_count); drive.skip_char(count); if count < min_count { return drive.failure(); @@ -614,9 +614,7 @@ fn op_repeat(drive: &mut StateContext, stacks: &mut Stacks) { #[derive(Debug, Clone, Copy)] struct MinUntilContext { - count: isize, - save_repeat_ctx: Option, - save_last_position: usize, + save_repeat_ctx_id: usize, } /* minimizing repeat */ @@ -625,50 +623,35 @@ fn op_min_until(drive: &mut StateContext, stacks: &mut Stacks) { drive.sync_string_position(); - let count = repeat_ctx.count + 1; - - stacks.min_until.push(MinUntilContext { - count, - save_repeat_ctx: None, - save_last_position: repeat_ctx.last_position, - }); + repeat_ctx.count += 1; - if (count as usize) < repeat_ctx.min_count { + if (repeat_ctx.count as usize) < repeat_ctx.min_count { // not enough matches - repeat_ctx.count = count; drive.next_ctx_at(repeat_ctx.code_position + 4, |drive, stacks| { if drive.popped_ctx().has_matched == Some(true) { - stacks.min_until.pop(); - return drive.success(); + drive.success(); + } else { + stacks.repeat[drive.ctx.repeat_ctx_id].count -= 1; + drive.sync_string_position(); + drive.failure(); } - - stacks.repeat_last().count = stacks.min_until_last().count - 1; - drive.sync_string_position(); - stacks.min_until.pop(); - drive.failure(); }); return; } drive.state.marks_push(); - // see if the tail matches - stacks.min_until_last().save_repeat_ctx = stacks.repeat.pop(); + stacks.min_until.push(MinUntilContext { + save_repeat_ctx_id: drive.ctx.repeat_ctx_id, + }); - drive.next_ctx(1, |drive, stacks| { - let MinUntilContext { - count, - save_repeat_ctx, - save_last_position, - } = stacks.min_until_last(); - let count = *count; + // see if the tail matches + let next_ctx = drive.next_ctx(1, |drive, stacks| { + drive.ctx.repeat_ctx_id = stacks.min_until.pop().unwrap().save_repeat_ctx_id; - let mut repeat_ctx = save_repeat_ctx.take().unwrap(); + let repeat_ctx = &mut stacks.repeat[drive.ctx.repeat_ctx_id]; if drive.popped_ctx().has_matched == Some(true) { - stacks.min_until.pop(); - // restore repeat before return - stacks.repeat.push(repeat_ctx); return drive.success(); } @@ -678,34 +661,27 @@ fn op_min_until(drive: &mut StateContext, stacks: &mut Stacks) { // match more until tail matches - if count as usize >= repeat_ctx.max_count && repeat_ctx.max_count != MAXREPEAT + if repeat_ctx.count as usize >= repeat_ctx.max_count && repeat_ctx.max_count != MAXREPEAT || drive.state.string_position == repeat_ctx.last_position { - stacks.min_until.pop(); - // restore repeat before return - stacks.repeat.push(repeat_ctx); + repeat_ctx.count -= 1; return drive.failure(); } - repeat_ctx.count = count; /* zero-width match protection */ - *save_last_position = repeat_ctx.last_position; repeat_ctx.last_position = drive.state.string_position; - stacks.repeat.push(repeat_ctx); - drive.next_ctx_at(repeat_ctx.code_position + 4, |drive, stacks| { if drive.popped_ctx().has_matched == Some(true) { - stacks.min_until.pop(); drive.success(); } else { - stacks.repeat_last().count = stacks.min_until_last().count - 1; + stacks.repeat[drive.ctx.repeat_ctx_id].count -= 1; drive.sync_string_position(); - stacks.min_until.pop(); drive.failure(); } }); }); + next_ctx.repeat_ctx_id = repeat_ctx.prev_id; } #[derive(Debug, Clone, Copy)] @@ -715,28 +691,20 @@ struct MaxUntilContext { /* maximizing repeat */ fn op_max_until(drive: &mut StateContext, stacks: &mut Stacks) { - // let repeat_ctx = stacks.repeat.last_mut().unwrap(); let repeat_ctx = &mut stacks.repeat[drive.ctx.repeat_ctx_id]; drive.sync_string_position(); repeat_ctx.count += 1; - // let count = repeat_ctx.count + 1; - if (repeat_ctx.count as usize) < repeat_ctx.min_count { // not enough matches - // repeat_ctx.count = count; drive.next_ctx_at(repeat_ctx.code_position + 4, |drive, stacks| { if drive.popped_ctx().has_matched == Some(true) { - // stacks.max_until.pop(); drive.success(); } else { - // let count = stacks.max_until_last().count; - // stacks.repeat_last().count -= 1; stacks.repeat[drive.ctx.repeat_ctx_id].count -= 1; drive.sync_string_position(); - // stacks.max_until.pop(); drive.failure(); } }); @@ -757,14 +725,15 @@ fn op_max_until(drive: &mut StateContext, stacks: &mut Stacks) { drive.state.marks_push(); drive.next_ctx_at(repeat_ctx.code_position + 4, |drive, stacks| { - let save_last_position = stacks.max_until_last().save_last_position; + let save_last_position = stacks.max_until.pop().unwrap().save_last_position; let repeat_ctx = &mut stacks.repeat[drive.ctx.repeat_ctx_id]; repeat_ctx.last_position = save_last_position; + if drive.popped_ctx().has_matched == Some(true) { drive.state.marks_pop_discard(); - stacks.max_until.pop(); return drive.success(); } + drive.state.marks_pop(); repeat_ctx.count -= 1; drive.sync_string_position(); @@ -782,9 +751,7 @@ fn op_max_until(drive: &mut StateContext, stacks: &mut Stacks) { let next_ctx = drive.next_ctx(1, tail_callback); next_ctx.repeat_ctx_id = repeat_ctx.prev_id; - fn tail_callback(drive: &mut StateContext, stacks: &mut Stacks) { - stacks.max_until.pop(); - + fn tail_callback(drive: &mut StateContext, _stacks: &mut Stacks) { if drive.popped_ctx().has_matched == Some(true) { drive.success(); } else { @@ -823,15 +790,6 @@ impl Stacks { fn repeat_one_last(&mut self) -> &mut RepeatOneContext { self.repeat_one.last_mut().unwrap() } - fn repeat_last(&mut self) -> &mut RepeatContext { - self.repeat.last_mut().unwrap() - } - fn min_until_last(&mut self) -> &mut MinUntilContext { - self.min_until.last_mut().unwrap() - } - fn max_until_last(&mut self) -> &mut MaxUntilContext { - self.max_until.last_mut().unwrap() - } } #[derive(Debug, Clone, Copy)] @@ -1327,7 +1285,7 @@ fn general_count(drive: &mut StateContext, stacks: &mut Stacks, max_count: usize count } -fn count(drive: &mut StateContext, stacks: &mut Stacks, max_count: usize) -> usize { +fn _count(drive: &mut StateContext, stacks: &mut Stacks, max_count: usize) -> usize { let save_ctx = drive.ctx; let max_count = std::cmp::min(max_count, drive.remaining_chars()); let end = drive.ctx.string_position + max_count; From bf57f289bff1ec5633316a924e82fbb5f5ed6eb0 Mon Sep 17 00:00:00 2001 From: Kangzhi Shi Date: Wed, 27 Jul 2022 21:33:43 +0200 Subject: [PATCH 053/960] update version to 0.2.1 --- Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index 6ba3996947..00123d92c5 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "sre-engine" -version = "0.2.0" +version = "0.2.1" authors = ["Kangzhi Shi ", "RustPython Team"] description = "A low-level implementation of Python's SRE regex engine" repository = "https://github.com/RustPython/sre-engine" From 9058f287881af7fc0f004759184a0ff5d0811967 Mon Sep 17 00:00:00 2001 From: Kangzhi Shi Date: Thu, 28 Jul 2022 22:46:28 +0200 Subject: [PATCH 054/960] refactor trait StrDrive instead enum --- src/engine.rs | 2049 +++++++++++++++++++++++++------------------------ 1 file changed, 1055 insertions(+), 994 deletions(-) diff --git a/src/engine.rs b/src/engine.rs index 223aa3425c..8865eb6a39 100644 --- a/src/engine.rs +++ b/src/engine.rs @@ -9,8 +9,8 @@ const fn is_py_ascii_whitespace(b: u8) -> bool { } #[derive(Debug)] -pub struct State<'a> { - pub string: StrDrive<'a>, +pub struct State<'a, S: StrDrive> { + pub string: S, pub start: usize, pub end: usize, _flags: SreFlag, @@ -18,18 +18,25 @@ pub struct State<'a> { pub marks: Vec>, pub lastindex: isize, marks_stack: Vec<(Vec>, isize)>, - context_stack: Vec, - _stacks: Option>, + context_stack: Vec>, + // branch_stack: Vec, + // min_repeat_one_stack: Vec, + // repeat_one_stack: Vec, + // repeat_stack: Vec, + // min_until_stack: Vec, + // max_until_stack: Vec, + // _stacks: Option>, pub string_position: usize, - popped_context: Option, + popped_context: Option>, + next_context: Option>, pub has_matched: bool, pub match_all: bool, pub must_advance: bool, } -impl<'a> State<'a> { +impl<'a, S: StrDrive> State<'a, S> { pub fn new( - string: StrDrive<'a>, + string: S, start: usize, end: usize, flags: SreFlag, @@ -47,9 +54,16 @@ impl<'a> State<'a> { lastindex: -1, marks_stack: Vec::new(), context_stack: Vec::new(), - _stacks: Default::default(), + // branch_stack: Vec::new(), + // min_repeat_one_stack: Vec::new(), + // repeat_one_stack: Vec::new(), + // repeat_stack: Vec::new(), + // min_until_stack: Vec::new(), + // max_until_stack: Vec::new(), + // _stacks: Default::default(), string_position: start, popped_context: None, + next_context: None, has_matched: false, match_all: false, must_advance: false, @@ -61,11 +75,15 @@ impl<'a> State<'a> { self.marks.clear(); self.marks_stack.clear(); self.context_stack.clear(); - if let Some(x) = self._stacks.as_mut() { - x.clear() - }; + // self.branch_stack.clear(); + // self.min_repeat_one_stack.clear(); + // self.repeat_one_stack.clear(); + // self.repeat_stack.clear(); + // self.min_until_stack.clear(); + // self.max_until_stack.clear(); self.string_position = self.start; self.popped_context = None; + self.next_context = None; self.has_matched = false; } @@ -103,47 +121,46 @@ impl<'a> State<'a> { self.marks_stack.pop(); } - fn _match(mut self, stacks: &mut Stacks) -> Self { - while let Some(ctx) = self.context_stack.pop() { - let mut drive = StateContext { - state: self, - ctx, - next_ctx: None, - }; + fn _match(&mut self) { + while let Some(mut ctx) = self.context_stack.pop() { + // let mut drive = StateContext { + // state: self, + // ctx, + // next_ctx: None, + // }; + // let mut state = self; - if let Some(handler) = drive.ctx.handler { - handler(&mut drive, stacks); - } else if drive.remaining_codes() > 0 { - let code = drive.peek_code(0); + if let Some(handler) = ctx.handler { + handler(self, &mut ctx); + } else if ctx.remaining_codes(self) > 0 { + let code = ctx.peek_code(self, 0); let code = SreOpcode::try_from(code).unwrap(); - dispatch(code, &mut drive, stacks); + self.dispatch(code, &mut ctx); } else { - drive.failure(); + ctx.failure(); } - let StateContext { - mut state, - ctx, - next_ctx, - } = drive; + // let StateContext { + // mut state, + // ctx, + // next_ctx, + // } = drive; if ctx.has_matched.is_some() { - state.popped_context = Some(ctx); + self.popped_context = Some(ctx); } else { - state.context_stack.push(ctx); - if let Some(next_ctx) = next_ctx { - state.context_stack.push(next_ctx); + self.context_stack.push(ctx); + if let Some(next_ctx) = self.next_context.take() { + self.context_stack.push(next_ctx); } } - self = state + // self = state } - self.has_matched = self.popped_context.unwrap().has_matched == Some(true); - self + self.has_matched = self.popped_context.take().unwrap().has_matched == Some(true); + // self } pub fn pymatch(mut self) -> Self { - let mut stacks = self._stacks.take().unwrap_or_default(); - let ctx = MatchContext { string_position: self.start, string_offset: self.string.offset(0, self.start), @@ -155,13 +172,11 @@ impl<'a> State<'a> { }; self.context_stack.push(ctx); - self = self._match(&mut stacks); - self._stacks = Some(stacks); + self._match(); self } pub fn search(mut self) -> Self { - let mut stacks = self._stacks.take().unwrap_or_default(); // TODO: optimize by op info and skip prefix if self.start > self.end { @@ -180,14 +195,13 @@ impl<'a> State<'a> { repeat_ctx_id: usize::MAX, }; self.context_stack.push(ctx); - self = self._match(&mut stacks); + self._match(); self.must_advance = false; while !self.has_matched && self.start < self.end { self.start += 1; start_offset = self.string.offset(start_offset, 1); self.reset(); - stacks.clear(); let ctx = MatchContext { string_position: self.start, @@ -199,697 +213,730 @@ impl<'a> State<'a> { repeat_ctx_id: usize::MAX, }; self.context_stack.push(ctx); - self = self._match(&mut stacks); + self._match(); } - self._stacks = Some(stacks); self } -} -fn dispatch(opcode: SreOpcode, drive: &mut StateContext, stacks: &mut Stacks) { - match opcode { - SreOpcode::FAILURE => { - drive.failure(); - } - SreOpcode::SUCCESS => { - drive.ctx.has_matched = Some(drive.can_success()); - if drive.ctx.has_matched == Some(true) { - drive.state.string_position = drive.ctx.string_position; - } - } - SreOpcode::ANY => { - if drive.at_end() || drive.at_linebreak() { - drive.failure(); - } else { - drive.skip_code(1); - drive.skip_char(1); - } - } - SreOpcode::ANY_ALL => { - if drive.at_end() { - drive.failure(); - } else { - drive.skip_code(1); - drive.skip_char(1); - } - } - SreOpcode::ASSERT => op_assert(drive), - SreOpcode::ASSERT_NOT => op_assert_not(drive), - SreOpcode::AT => { - let atcode = SreAtCode::try_from(drive.peek_code(1)).unwrap(); - if at(drive, atcode) { - drive.skip_code(2); - } else { - drive.failure(); - } - } - SreOpcode::BRANCH => op_branch(drive, stacks), - SreOpcode::CATEGORY => { - let catcode = SreCatCode::try_from(drive.peek_code(1)).unwrap(); - if drive.at_end() || !category(catcode, drive.peek_char()) { - drive.failure(); - } else { - drive.skip_code(2); - drive.skip_char(1); - } - } - SreOpcode::IN => general_op_in(drive, charset), - SreOpcode::IN_IGNORE => general_op_in(drive, |set, c| charset(set, lower_ascii(c))), - SreOpcode::IN_UNI_IGNORE => general_op_in(drive, |set, c| charset(set, lower_unicode(c))), - SreOpcode::IN_LOC_IGNORE => general_op_in(drive, charset_loc_ignore), - SreOpcode::INFO | SreOpcode::JUMP => drive.skip_code_from(1), - SreOpcode::LITERAL => general_op_literal(drive, |code, c| code == c), - SreOpcode::NOT_LITERAL => general_op_literal(drive, |code, c| code != c), - SreOpcode::LITERAL_IGNORE => general_op_literal(drive, |code, c| code == lower_ascii(c)), - SreOpcode::NOT_LITERAL_IGNORE => { - general_op_literal(drive, |code, c| code != lower_ascii(c)) - } - SreOpcode::LITERAL_UNI_IGNORE => { - general_op_literal(drive, |code, c| code == lower_unicode(c)) - } - SreOpcode::NOT_LITERAL_UNI_IGNORE => { - general_op_literal(drive, |code, c| code != lower_unicode(c)) - } - SreOpcode::LITERAL_LOC_IGNORE => general_op_literal(drive, char_loc_ignore), - SreOpcode::NOT_LITERAL_LOC_IGNORE => { - general_op_literal(drive, |code, c| !char_loc_ignore(code, c)) - } - SreOpcode::MARK => { - drive - .state - .set_mark(drive.peek_code(1) as usize, drive.ctx.string_position); - drive.skip_code(2); - } - SreOpcode::MAX_UNTIL => op_max_until(drive, stacks), - SreOpcode::MIN_UNTIL => op_min_until(drive, stacks), - SreOpcode::REPEAT => op_repeat(drive, stacks), - SreOpcode::REPEAT_ONE => op_repeat_one(drive, stacks), - SreOpcode::MIN_REPEAT_ONE => op_min_repeat_one(drive, stacks), - SreOpcode::GROUPREF => general_op_groupref(drive, |x| x), - SreOpcode::GROUPREF_IGNORE => general_op_groupref(drive, lower_ascii), - SreOpcode::GROUPREF_LOC_IGNORE => general_op_groupref(drive, lower_locate), - SreOpcode::GROUPREF_UNI_IGNORE => general_op_groupref(drive, lower_unicode), - SreOpcode::GROUPREF_EXISTS => { - let (group_start, group_end) = drive.state.get_marks(drive.peek_code(1) as usize); - match (group_start, group_end) { - (Some(start), Some(end)) if start <= end => { - drive.skip_code(3); - } - _ => drive.skip_code_from(2), + fn dispatch(&mut self, opcode: SreOpcode, ctx: &mut MatchContext<'a, S>) { + match opcode { + SreOpcode::FAILURE => { + ctx.has_matched = Some(false); } + SreOpcode::SUCCESS => todo!(), + SreOpcode::ANY => todo!(), + SreOpcode::ANY_ALL => todo!(), + SreOpcode::ASSERT => todo!(), + SreOpcode::ASSERT_NOT => todo!(), + SreOpcode::AT => todo!(), + SreOpcode::BRANCH => todo!(), + SreOpcode::CALL => todo!(), + SreOpcode::CATEGORY => todo!(), + SreOpcode::CHARSET => todo!(), + SreOpcode::BIGCHARSET => todo!(), + SreOpcode::GROUPREF => todo!(), + SreOpcode::GROUPREF_EXISTS => todo!(), + SreOpcode::IN => todo!(), + SreOpcode::INFO => todo!(), + SreOpcode::JUMP => todo!(), + SreOpcode::LITERAL => todo!(), + SreOpcode::MARK => todo!(), + SreOpcode::MAX_UNTIL => todo!(), + SreOpcode::MIN_UNTIL => todo!(), + SreOpcode::NOT_LITERAL => todo!(), + SreOpcode::NEGATE => todo!(), + SreOpcode::RANGE => todo!(), + SreOpcode::REPEAT => todo!(), + SreOpcode::REPEAT_ONE => todo!(), + SreOpcode::SUBPATTERN => todo!(), + SreOpcode::MIN_REPEAT_ONE => todo!(), + SreOpcode::GROUPREF_IGNORE => todo!(), + SreOpcode::IN_IGNORE => todo!(), + SreOpcode::LITERAL_IGNORE => todo!(), + SreOpcode::NOT_LITERAL_IGNORE => todo!(), + SreOpcode::GROUPREF_LOC_IGNORE => todo!(), + SreOpcode::IN_LOC_IGNORE => todo!(), + SreOpcode::LITERAL_LOC_IGNORE => todo!(), + SreOpcode::NOT_LITERAL_LOC_IGNORE => todo!(), + SreOpcode::GROUPREF_UNI_IGNORE => todo!(), + SreOpcode::IN_UNI_IGNORE => todo!(), + SreOpcode::LITERAL_UNI_IGNORE => todo!(), + SreOpcode::NOT_LITERAL_UNI_IGNORE => todo!(), + SreOpcode::RANGE_UNI_IGNORE => todo!(), } - _ => unreachable!("unexpected opcode"), - } -} - -/* assert subpattern */ -/* */ -fn op_assert(drive: &mut StateContext) { - let back = drive.peek_code(2) as usize; - - if drive.ctx.string_position < back { - return drive.failure(); - } - - let offset = drive - .state - .string - .back_offset(drive.ctx.string_offset, back); - let position = drive.ctx.string_position - back; - - drive.state.string_position = position; - - let next_ctx = drive.next_ctx(3, |drive, _| { - if drive.popped_ctx().has_matched == Some(true) { - drive.ctx.handler = None; - drive.skip_code_from(1); - } else { - drive.failure(); - } - }); - next_ctx.string_position = position; - next_ctx.string_offset = offset; - next_ctx.toplevel = false; -} - -/* assert not subpattern */ -/* */ -fn op_assert_not(drive: &mut StateContext) { - let back = drive.peek_code(2) as usize; - - if drive.ctx.string_position < back { - return drive.skip_code_from(1); - } - - let offset = drive - .state - .string - .back_offset(drive.ctx.string_offset, back); - let position = drive.ctx.string_position - back; - - drive.state.string_position = position; - - let next_ctx = drive.next_ctx(3, |drive, _| { - if drive.popped_ctx().has_matched == Some(true) { - drive.failure(); - } else { - drive.ctx.handler = None; - drive.skip_code_from(1); - } - }); - next_ctx.string_position = position; - next_ctx.string_offset = offset; - next_ctx.toplevel = false; -} - -#[derive(Debug)] -struct BranchContext { - branch_offset: usize, -} - -// alternation -// <0=skip> code ... -fn op_branch(drive: &mut StateContext, stacks: &mut Stacks) { - drive.state.marks_push(); - stacks.branch.push(BranchContext { branch_offset: 1 }); - create_context(drive, stacks); - - fn create_context(drive: &mut StateContext, stacks: &mut Stacks) { - let branch_offset = stacks.branch_last().branch_offset; - let next_length = drive.peek_code(branch_offset) as usize; - if next_length == 0 { - drive.state.marks_pop_discard(); - stacks.branch.pop(); - return drive.failure(); - } - - drive.sync_string_position(); - - stacks.branch_last().branch_offset += next_length; - drive.next_ctx(branch_offset + 1, callback); - } - - fn callback(drive: &mut StateContext, stacks: &mut Stacks) { - if drive.popped_ctx().has_matched == Some(true) { - stacks.branch.pop(); - return drive.success(); - } - drive.state.marks_pop_keep(); - drive.ctx.handler = Some(create_context) - } -} - -#[derive(Debug, Copy, Clone)] -struct MinRepeatOneContext { - count: usize, - max_count: usize, -} - -/* <1=min> <2=max> item tail */ -fn op_min_repeat_one(drive: &mut StateContext, stacks: &mut Stacks) { - let min_count = drive.peek_code(2) as usize; - let max_count = drive.peek_code(3) as usize; - - if drive.remaining_chars() < min_count { - return drive.failure(); - } - - drive.sync_string_position(); - - let count = if min_count == 0 { - 0 - } else { - let count = _count(drive, stacks, min_count); - if count < min_count { - return drive.failure(); - } - drive.skip_char(count); - count - }; - - let next_code = drive.peek_code(drive.peek_code(1) as usize + 1); - if next_code == SreOpcode::SUCCESS as u32 && drive.can_success() { - // tail is empty. we're finished - drive.sync_string_position(); - return drive.success(); - } - - drive.state.marks_push(); - stacks - .min_repeat_one - .push(MinRepeatOneContext { count, max_count }); - create_context(drive, stacks); - - fn create_context(drive: &mut StateContext, stacks: &mut Stacks) { - let MinRepeatOneContext { count, max_count } = *stacks.min_repeat_one_last(); - - if max_count == MAXREPEAT || count <= max_count { - drive.sync_string_position(); - drive.next_ctx_from(1, callback); - } else { - drive.state.marks_pop_discard(); - stacks.min_repeat_one.pop(); - drive.failure(); - } - } - - fn callback(drive: &mut StateContext, stacks: &mut Stacks) { - if drive.popped_ctx().has_matched == Some(true) { - stacks.min_repeat_one.pop(); - return drive.success(); - } - - drive.sync_string_position(); - - if _count(drive, stacks, 1) == 0 { - drive.state.marks_pop_discard(); - stacks.min_repeat_one.pop(); - return drive.failure(); - } - - drive.skip_char(1); - stacks.min_repeat_one_last().count += 1; - drive.state.marks_pop_keep(); - create_context(drive, stacks); } } -#[derive(Debug, Copy, Clone)] -struct RepeatOneContext { - count: usize, - min_count: usize, - following_literal: Option, +// fn dispatch(opcode: SreOpcode, drive: &mut StateContext, stacks: &mut Stacks) { +// match opcode { +// SreOpcode::FAILURE => { +// drive.failure(); +// } +// SreOpcode::SUCCESS => { +// drive.ctx.has_matched = Some(drive.can_success()); +// if drive.ctx.has_matched == Some(true) { +// drive.state.string_position = drive.ctx.string_position; +// } +// } +// SreOpcode::ANY => { +// if drive.at_end() || drive.at_linebreak() { +// drive.failure(); +// } else { +// drive.skip_code(1); +// drive.skip_char(1); +// } +// } +// SreOpcode::ANY_ALL => { +// if drive.at_end() { +// drive.failure(); +// } else { +// drive.skip_code(1); +// drive.skip_char(1); +// } +// } +// SreOpcode::ASSERT => op_assert(drive), +// SreOpcode::ASSERT_NOT => op_assert_not(drive), +// SreOpcode::AT => { +// let atcode = SreAtCode::try_from(drive.peek_code(1)).unwrap(); +// if at(drive, atcode) { +// drive.skip_code(2); +// } else { +// drive.failure(); +// } +// } +// SreOpcode::BRANCH => op_branch(drive, stacks), +// SreOpcode::CATEGORY => { +// let catcode = SreCatCode::try_from(drive.peek_code(1)).unwrap(); +// if drive.at_end() || !category(catcode, drive.peek_char()) { +// drive.failure(); +// } else { +// drive.skip_code(2); +// drive.skip_char(1); +// } +// } +// SreOpcode::IN => general_op_in(drive, charset), +// SreOpcode::IN_IGNORE => general_op_in(drive, |set, c| charset(set, lower_ascii(c))), +// SreOpcode::IN_UNI_IGNORE => general_op_in(drive, |set, c| charset(set, lower_unicode(c))), +// SreOpcode::IN_LOC_IGNORE => general_op_in(drive, charset_loc_ignore), +// SreOpcode::INFO | SreOpcode::JUMP => drive.skip_code_from(1), +// SreOpcode::LITERAL => general_op_literal(drive, |code, c| code == c), +// SreOpcode::NOT_LITERAL => general_op_literal(drive, |code, c| code != c), +// SreOpcode::LITERAL_IGNORE => general_op_literal(drive, |code, c| code == lower_ascii(c)), +// SreOpcode::NOT_LITERAL_IGNORE => { +// general_op_literal(drive, |code, c| code != lower_ascii(c)) +// } +// SreOpcode::LITERAL_UNI_IGNORE => { +// general_op_literal(drive, |code, c| code == lower_unicode(c)) +// } +// SreOpcode::NOT_LITERAL_UNI_IGNORE => { +// general_op_literal(drive, |code, c| code != lower_unicode(c)) +// } +// SreOpcode::LITERAL_LOC_IGNORE => general_op_literal(drive, char_loc_ignore), +// SreOpcode::NOT_LITERAL_LOC_IGNORE => { +// general_op_literal(drive, |code, c| !char_loc_ignore(code, c)) +// } +// SreOpcode::MARK => { +// drive +// .state +// .set_mark(drive.peek_code(1) as usize, drive.ctx.string_position); +// drive.skip_code(2); +// } +// SreOpcode::MAX_UNTIL => op_max_until(drive, stacks), +// SreOpcode::MIN_UNTIL => op_min_until(drive, stacks), +// SreOpcode::REPEAT => op_repeat(drive, stacks), +// SreOpcode::REPEAT_ONE => op_repeat_one(drive, stacks), +// SreOpcode::MIN_REPEAT_ONE => op_min_repeat_one(drive, stacks), +// SreOpcode::GROUPREF => general_op_groupref(drive, |x| x), +// SreOpcode::GROUPREF_IGNORE => general_op_groupref(drive, lower_ascii), +// SreOpcode::GROUPREF_LOC_IGNORE => general_op_groupref(drive, lower_locate), +// SreOpcode::GROUPREF_UNI_IGNORE => general_op_groupref(drive, lower_unicode), +// SreOpcode::GROUPREF_EXISTS => { +// let (group_start, group_end) = drive.state.get_marks(drive.peek_code(1) as usize); +// match (group_start, group_end) { +// (Some(start), Some(end)) if start <= end => { +// drive.skip_code(3); +// } +// _ => drive.skip_code_from(2), +// } +// } +// _ => unreachable!("unexpected opcode"), +// } +// } + +// /* assert subpattern */ +// /* */ +// fn op_assert(drive: &mut StateContext) { +// let back = drive.peek_code(2) as usize; + +// if drive.ctx.string_position < back { +// return drive.failure(); +// } + +// let offset = drive +// .state +// .string +// .back_offset(drive.ctx.string_offset, back); +// let position = drive.ctx.string_position - back; + +// drive.state.string_position = position; + +// let next_ctx = drive.next_ctx(3, |drive, _| { +// if drive.popped_ctx().has_matched == Some(true) { +// drive.ctx.handler = None; +// drive.skip_code_from(1); +// } else { +// drive.failure(); +// } +// }); +// next_ctx.string_position = position; +// next_ctx.string_offset = offset; +// next_ctx.toplevel = false; +// } + +// /* assert not subpattern */ +// /* */ +// fn op_assert_not(drive: &mut StateContext) { +// let back = drive.peek_code(2) as usize; + +// if drive.ctx.string_position < back { +// return drive.skip_code_from(1); +// } + +// let offset = drive +// .state +// .string +// .back_offset(drive.ctx.string_offset, back); +// let position = drive.ctx.string_position - back; + +// drive.state.string_position = position; + +// let next_ctx = drive.next_ctx(3, |drive, _| { +// if drive.popped_ctx().has_matched == Some(true) { +// drive.failure(); +// } else { +// drive.ctx.handler = None; +// drive.skip_code_from(1); +// } +// }); +// next_ctx.string_position = position; +// next_ctx.string_offset = offset; +// next_ctx.toplevel = false; +// } + +// #[derive(Debug)] +// struct BranchContext { +// branch_offset: usize, +// } + +// // alternation +// // <0=skip> code ... +// fn op_branch(drive: &mut StateContext, stacks: &mut Stacks) { +// drive.state.marks_push(); +// stacks.branch.push(BranchContext { branch_offset: 1 }); +// create_context(drive, stacks); + +// fn create_context(drive: &mut StateContext, stacks: &mut Stacks) { +// let branch_offset = stacks.branch_last().branch_offset; +// let next_length = drive.peek_code(branch_offset) as usize; +// if next_length == 0 { +// drive.state.marks_pop_discard(); +// stacks.branch.pop(); +// return drive.failure(); +// } + +// drive.sync_string_position(); + +// stacks.branch_last().branch_offset += next_length; +// drive.next_ctx(branch_offset + 1, callback); +// } + +// fn callback(drive: &mut StateContext, stacks: &mut Stacks) { +// if drive.popped_ctx().has_matched == Some(true) { +// stacks.branch.pop(); +// return drive.success(); +// } +// drive.state.marks_pop_keep(); +// drive.ctx.handler = Some(create_context) +// } +// } + +// #[derive(Debug, Copy, Clone)] +// struct MinRepeatOneContext { +// count: usize, +// max_count: usize, +// } + +// /* <1=min> <2=max> item tail */ +// fn op_min_repeat_one(drive: &mut StateContext, stacks: &mut Stacks) { +// let min_count = drive.peek_code(2) as usize; +// let max_count = drive.peek_code(3) as usize; + +// if drive.remaining_chars() < min_count { +// return drive.failure(); +// } + +// drive.sync_string_position(); + +// let count = if min_count == 0 { +// 0 +// } else { +// let count = _count(drive, stacks, min_count); +// if count < min_count { +// return drive.failure(); +// } +// drive.skip_char(count); +// count +// }; + +// let next_code = drive.peek_code(drive.peek_code(1) as usize + 1); +// if next_code == SreOpcode::SUCCESS as u32 && drive.can_success() { +// // tail is empty. we're finished +// drive.sync_string_position(); +// return drive.success(); +// } + +// drive.state.marks_push(); +// stacks +// .min_repeat_one +// .push(MinRepeatOneContext { count, max_count }); +// create_context(drive, stacks); + +// fn create_context(drive: &mut StateContext, stacks: &mut Stacks) { +// let MinRepeatOneContext { count, max_count } = *stacks.min_repeat_one_last(); + +// if max_count == MAXREPEAT || count <= max_count { +// drive.sync_string_position(); +// drive.next_ctx_from(1, callback); +// } else { +// drive.state.marks_pop_discard(); +// stacks.min_repeat_one.pop(); +// drive.failure(); +// } +// } + +// fn callback(drive: &mut StateContext, stacks: &mut Stacks) { +// if drive.popped_ctx().has_matched == Some(true) { +// stacks.min_repeat_one.pop(); +// return drive.success(); +// } + +// drive.sync_string_position(); + +// if _count(drive, stacks, 1) == 0 { +// drive.state.marks_pop_discard(); +// stacks.min_repeat_one.pop(); +// return drive.failure(); +// } + +// drive.skip_char(1); +// stacks.min_repeat_one_last().count += 1; +// drive.state.marks_pop_keep(); +// create_context(drive, stacks); +// } +// } + +// #[derive(Debug, Copy, Clone)] +// struct RepeatOneContext { +// count: usize, +// min_count: usize, +// following_literal: Option, +// } + +// /* match repeated sequence (maximizing regexp) */ + +// /* this operator only works if the repeated item is +// exactly one character wide, and we're not already +// collecting backtracking points. for other cases, +// use the MAX_REPEAT operator */ + +// /* <1=min> <2=max> item tail */ +// fn op_repeat_one(drive: &mut StateContext, stacks: &mut Stacks) { +// let min_count = drive.peek_code(2) as usize; +// let max_count = drive.peek_code(3) as usize; + +// if drive.remaining_chars() < min_count { +// return drive.failure(); +// } + +// drive.sync_string_position(); + +// let count = _count(drive, stacks, max_count); +// drive.skip_char(count); +// if count < min_count { +// return drive.failure(); +// } + +// let next_code = drive.peek_code(drive.peek_code(1) as usize + 1); +// if next_code == SreOpcode::SUCCESS as u32 && drive.can_success() { +// // tail is empty. we're finished +// drive.sync_string_position(); +// return drive.success(); +// } + +// // Special case: Tail starts with a literal. Skip positions where +// // the rest of the pattern cannot possibly match. +// let following_literal = (next_code == SreOpcode::LITERAL as u32) +// .then(|| drive.peek_code(drive.peek_code(1) as usize + 2)); + +// drive.state.marks_push(); +// stacks.repeat_one.push(RepeatOneContext { +// count, +// min_count, +// following_literal, +// }); +// create_context(drive, stacks); + +// fn create_context(drive: &mut StateContext, stacks: &mut Stacks) { +// let RepeatOneContext { +// mut count, +// min_count, +// following_literal, +// } = *stacks.repeat_one_last(); + +// if let Some(c) = following_literal { +// while drive.at_end() || drive.peek_char() != c { +// if count <= min_count { +// drive.state.marks_pop_discard(); +// stacks.repeat_one.pop(); +// return drive.failure(); +// } +// drive.back_skip_char(1); +// count -= 1; +// } +// } +// stacks.repeat_one_last().count = count; + +// drive.sync_string_position(); + +// // General case: backtracking +// drive.next_ctx_from(1, callback); +// } + +// fn callback(drive: &mut StateContext, stacks: &mut Stacks) { +// if drive.popped_ctx().has_matched == Some(true) { +// stacks.repeat_one.pop(); +// return drive.success(); +// } + +// let RepeatOneContext { +// count, +// min_count, +// following_literal: _, +// } = stacks.repeat_one_last(); + +// if count <= min_count { +// drive.state.marks_pop_discard(); +// stacks.repeat_one.pop(); +// return drive.failure(); +// } + +// drive.back_skip_char(1); +// *count -= 1; + +// drive.state.marks_pop_keep(); +// create_context(drive, stacks); +// } +// } + +// #[derive(Debug, Clone, Copy)] +// struct RepeatContext { +// count: isize, +// min_count: usize, +// max_count: usize, +// code_position: usize, +// last_position: usize, +// prev_id: usize, +// } + +// /* create repeat context. all the hard work is done +// by the UNTIL operator (MAX_UNTIL, MIN_UNTIL) */ +// /* <1=min> <2=max> item tail */ +// fn op_repeat(drive: &mut StateContext, stacks: &mut Stacks) { +// let repeat_ctx = RepeatContext { +// count: -1, +// min_count: drive.peek_code(2) as usize, +// max_count: drive.peek_code(3) as usize, +// code_position: drive.ctx.code_position, +// last_position: std::usize::MAX, +// prev_id: drive.ctx.repeat_ctx_id, +// }; + +// stacks.repeat.push(repeat_ctx); + +// drive.sync_string_position(); + +// let next_ctx = drive.next_ctx_from(1, |drive, stacks| { +// drive.ctx.has_matched = drive.popped_ctx().has_matched; +// stacks.repeat.pop(); +// }); +// next_ctx.repeat_ctx_id = stacks.repeat.len() - 1; +// } + +// #[derive(Debug, Clone, Copy)] +// struct MinUntilContext { +// save_repeat_ctx_id: usize, +// } + +// /* minimizing repeat */ +// fn op_min_until(drive: &mut StateContext, stacks: &mut Stacks) { +// let repeat_ctx = stacks.repeat.last_mut().unwrap(); + +// drive.sync_string_position(); + +// repeat_ctx.count += 1; + +// if (repeat_ctx.count as usize) < repeat_ctx.min_count { +// // not enough matches +// drive.next_ctx_at(repeat_ctx.code_position + 4, |drive, stacks| { +// if drive.popped_ctx().has_matched == Some(true) { +// drive.success(); +// } else { +// stacks.repeat[drive.ctx.repeat_ctx_id].count -= 1; +// drive.sync_string_position(); +// drive.failure(); +// } +// }); +// return; +// } + +// drive.state.marks_push(); + +// stacks.min_until.push(MinUntilContext { +// save_repeat_ctx_id: drive.ctx.repeat_ctx_id, +// }); + +// // see if the tail matches +// let next_ctx = drive.next_ctx(1, |drive, stacks| { +// drive.ctx.repeat_ctx_id = stacks.min_until.pop().unwrap().save_repeat_ctx_id; + +// let repeat_ctx = &mut stacks.repeat[drive.ctx.repeat_ctx_id]; + +// if drive.popped_ctx().has_matched == Some(true) { +// return drive.success(); +// } + +// drive.sync_string_position(); + +// drive.state.marks_pop(); + +// // match more until tail matches + +// if repeat_ctx.count as usize >= repeat_ctx.max_count && repeat_ctx.max_count != MAXREPEAT +// || drive.state.string_position == repeat_ctx.last_position +// { +// repeat_ctx.count -= 1; +// return drive.failure(); +// } + +// /* zero-width match protection */ +// repeat_ctx.last_position = drive.state.string_position; + +// drive.next_ctx_at(repeat_ctx.code_position + 4, |drive, stacks| { +// if drive.popped_ctx().has_matched == Some(true) { +// drive.success(); +// } else { +// stacks.repeat[drive.ctx.repeat_ctx_id].count -= 1; +// drive.sync_string_position(); +// drive.failure(); +// } +// }); +// }); +// next_ctx.repeat_ctx_id = repeat_ctx.prev_id; +// } + +// #[derive(Debug, Clone, Copy)] +// struct MaxUntilContext { +// save_last_position: usize, +// } + +// /* maximizing repeat */ +// fn op_max_until(drive: &mut StateContext, stacks: &mut Stacks) { +// let repeat_ctx = &mut stacks.repeat[drive.ctx.repeat_ctx_id]; + +// drive.sync_string_position(); + +// repeat_ctx.count += 1; + +// if (repeat_ctx.count as usize) < repeat_ctx.min_count { +// // not enough matches +// drive.next_ctx_at(repeat_ctx.code_position + 4, |drive, stacks| { +// if drive.popped_ctx().has_matched == Some(true) { +// drive.success(); +// } else { +// stacks.repeat[drive.ctx.repeat_ctx_id].count -= 1; +// drive.sync_string_position(); +// drive.failure(); +// } +// }); +// return; +// } + +// stacks.max_until.push(MaxUntilContext { +// save_last_position: repeat_ctx.last_position, +// }); + +// if ((repeat_ctx.count as usize) < repeat_ctx.max_count || repeat_ctx.max_count == MAXREPEAT) +// && drive.state.string_position != repeat_ctx.last_position +// { +// /* we may have enough matches, but if we can +// match another item, do so */ +// repeat_ctx.last_position = drive.state.string_position; + +// drive.state.marks_push(); + +// drive.next_ctx_at(repeat_ctx.code_position + 4, |drive, stacks| { +// let save_last_position = stacks.max_until.pop().unwrap().save_last_position; +// let repeat_ctx = &mut stacks.repeat[drive.ctx.repeat_ctx_id]; +// repeat_ctx.last_position = save_last_position; + +// if drive.popped_ctx().has_matched == Some(true) { +// drive.state.marks_pop_discard(); +// return drive.success(); +// } + +// drive.state.marks_pop(); +// repeat_ctx.count -= 1; +// drive.sync_string_position(); + +// /* cannot match more repeated items here. make sure the +// tail matches */ +// let next_ctx = drive.next_ctx(1, tail_callback); +// next_ctx.repeat_ctx_id = repeat_ctx.prev_id; +// }); +// return; +// } + +// /* cannot match more repeated items here. make sure the +// tail matches */ +// let next_ctx = drive.next_ctx(1, tail_callback); +// next_ctx.repeat_ctx_id = repeat_ctx.prev_id; + +// fn tail_callback(drive: &mut StateContext, _stacks: &mut Stacks) { +// if drive.popped_ctx().has_matched == Some(true) { +// drive.success(); +// } else { +// drive.sync_string_position(); +// drive.failure(); +// } +// } +// } + +// #[derive(Debug, Default)] +// struct Stacks { +// } + +// impl Stacks { +// fn clear(&mut self) { +// self.branch.clear(); +// self.min_repeat_one.clear(); +// self.repeat_one.clear(); +// self.repeat.clear(); +// self.min_until.clear(); +// self.max_until.clear(); +// } + +// fn branch_last(&mut self) -> &mut BranchContext { +// self.branch.last_mut().unwrap() +// } +// fn min_repeat_one_last(&mut self) -> &mut MinRepeatOneContext { +// self.min_repeat_one.last_mut().unwrap() +// } +// fn repeat_one_last(&mut self) -> &mut RepeatOneContext { +// self.repeat_one.last_mut().unwrap() +// } +// } + +pub trait StrDrive { + fn offset(&self, offset: usize, skip: usize) -> usize; + fn count(&self) -> usize; + fn peek(&self, offset: usize) -> u32; + fn back_peek(&self, offset: usize) -> u32; + fn back_offset(&self, offset: usize, skip: usize) -> usize; } -/* match repeated sequence (maximizing regexp) */ - -/* this operator only works if the repeated item is -exactly one character wide, and we're not already -collecting backtracking points. for other cases, -use the MAX_REPEAT operator */ - -/* <1=min> <2=max> item tail */ -fn op_repeat_one(drive: &mut StateContext, stacks: &mut Stacks) { - let min_count = drive.peek_code(2) as usize; - let max_count = drive.peek_code(3) as usize; - - if drive.remaining_chars() < min_count { - return drive.failure(); - } - - drive.sync_string_position(); - let count = _count(drive, stacks, max_count); - drive.skip_char(count); - if count < min_count { - return drive.failure(); +impl<'a> StrDrive for &'a str { + fn offset(&self, offset: usize, skip: usize) -> usize { + self.get(offset..) + .and_then(|s| s.char_indices().nth(skip).map(|x| x.0 + offset)) + .unwrap_or(self.len()) } - let next_code = drive.peek_code(drive.peek_code(1) as usize + 1); - if next_code == SreOpcode::SUCCESS as u32 && drive.can_success() { - // tail is empty. we're finished - drive.sync_string_position(); - return drive.success(); + fn count(&self) -> usize { + self.chars().count() } - // Special case: Tail starts with a literal. Skip positions where - // the rest of the pattern cannot possibly match. - let following_literal = (next_code == SreOpcode::LITERAL as u32) - .then(|| drive.peek_code(drive.peek_code(1) as usize + 2)); - - drive.state.marks_push(); - stacks.repeat_one.push(RepeatOneContext { - count, - min_count, - following_literal, - }); - create_context(drive, stacks); - - fn create_context(drive: &mut StateContext, stacks: &mut Stacks) { - let RepeatOneContext { - mut count, - min_count, - following_literal, - } = *stacks.repeat_one_last(); - - if let Some(c) = following_literal { - while drive.at_end() || drive.peek_char() != c { - if count <= min_count { - drive.state.marks_pop_discard(); - stacks.repeat_one.pop(); - return drive.failure(); - } - drive.back_skip_char(1); - count -= 1; - } - } - stacks.repeat_one_last().count = count; - - drive.sync_string_position(); - - // General case: backtracking - drive.next_ctx_from(1, callback); + fn peek(&self, offset: usize) -> u32 { + unsafe { self.get_unchecked(offset..) } + .chars() + .next() + .unwrap() as u32 } - fn callback(drive: &mut StateContext, stacks: &mut Stacks) { - if drive.popped_ctx().has_matched == Some(true) { - stacks.repeat_one.pop(); - return drive.success(); - } - - let RepeatOneContext { - count, - min_count, - following_literal: _, - } = stacks.repeat_one_last(); - - if count <= min_count { - drive.state.marks_pop_discard(); - stacks.repeat_one.pop(); - return drive.failure(); + fn back_peek(&self, offset: usize) -> u32 { + let bytes = self.as_bytes(); + let back_offset = utf8_back_peek_offset(bytes, offset); + match offset - back_offset { + 1 => u32::from_be_bytes([0, 0, 0, bytes[offset - 1]]), + 2 => u32::from_be_bytes([0, 0, bytes[offset - 2], bytes[offset - 1]]), + 3 => u32::from_be_bytes([0, bytes[offset - 3], bytes[offset - 2], bytes[offset - 1]]), + 4 => u32::from_be_bytes([ + bytes[offset - 4], + bytes[offset - 3], + bytes[offset - 2], + bytes[offset - 1], + ]), + _ => unreachable!(), } - - drive.back_skip_char(1); - *count -= 1; - - drive.state.marks_pop_keep(); - create_context(drive, stacks); } -} - -#[derive(Debug, Clone, Copy)] -struct RepeatContext { - count: isize, - min_count: usize, - max_count: usize, - code_position: usize, - last_position: usize, - prev_id: usize, -} - -/* create repeat context. all the hard work is done -by the UNTIL operator (MAX_UNTIL, MIN_UNTIL) */ -/* <1=min> <2=max> item tail */ -fn op_repeat(drive: &mut StateContext, stacks: &mut Stacks) { - let repeat_ctx = RepeatContext { - count: -1, - min_count: drive.peek_code(2) as usize, - max_count: drive.peek_code(3) as usize, - code_position: drive.ctx.code_position, - last_position: std::usize::MAX, - prev_id: drive.ctx.repeat_ctx_id, - }; - - stacks.repeat.push(repeat_ctx); - - drive.sync_string_position(); - - let next_ctx = drive.next_ctx_from(1, |drive, stacks| { - drive.ctx.has_matched = drive.popped_ctx().has_matched; - stacks.repeat.pop(); - }); - next_ctx.repeat_ctx_id = stacks.repeat.len() - 1; -} - -#[derive(Debug, Clone, Copy)] -struct MinUntilContext { - save_repeat_ctx_id: usize, -} - -/* minimizing repeat */ -fn op_min_until(drive: &mut StateContext, stacks: &mut Stacks) { - let repeat_ctx = stacks.repeat.last_mut().unwrap(); - - drive.sync_string_position(); - - repeat_ctx.count += 1; - - if (repeat_ctx.count as usize) < repeat_ctx.min_count { - // not enough matches - drive.next_ctx_at(repeat_ctx.code_position + 4, |drive, stacks| { - if drive.popped_ctx().has_matched == Some(true) { - drive.success(); - } else { - stacks.repeat[drive.ctx.repeat_ctx_id].count -= 1; - drive.sync_string_position(); - drive.failure(); - } - }); - return; - } - - drive.state.marks_push(); - - stacks.min_until.push(MinUntilContext { - save_repeat_ctx_id: drive.ctx.repeat_ctx_id, - }); - - // see if the tail matches - let next_ctx = drive.next_ctx(1, |drive, stacks| { - drive.ctx.repeat_ctx_id = stacks.min_until.pop().unwrap().save_repeat_ctx_id; - - let repeat_ctx = &mut stacks.repeat[drive.ctx.repeat_ctx_id]; - - if drive.popped_ctx().has_matched == Some(true) { - return drive.success(); - } - - drive.sync_string_position(); - - drive.state.marks_pop(); - - // match more until tail matches - - if repeat_ctx.count as usize >= repeat_ctx.max_count && repeat_ctx.max_count != MAXREPEAT - || drive.state.string_position == repeat_ctx.last_position - { - repeat_ctx.count -= 1; - return drive.failure(); - } - - /* zero-width match protection */ - repeat_ctx.last_position = drive.state.string_position; - - drive.next_ctx_at(repeat_ctx.code_position + 4, |drive, stacks| { - if drive.popped_ctx().has_matched == Some(true) { - drive.success(); - } else { - stacks.repeat[drive.ctx.repeat_ctx_id].count -= 1; - drive.sync_string_position(); - drive.failure(); - } - }); - }); - next_ctx.repeat_ctx_id = repeat_ctx.prev_id; -} - -#[derive(Debug, Clone, Copy)] -struct MaxUntilContext { - save_last_position: usize, -} - -/* maximizing repeat */ -fn op_max_until(drive: &mut StateContext, stacks: &mut Stacks) { - let repeat_ctx = &mut stacks.repeat[drive.ctx.repeat_ctx_id]; - - drive.sync_string_position(); - repeat_ctx.count += 1; - - if (repeat_ctx.count as usize) < repeat_ctx.min_count { - // not enough matches - drive.next_ctx_at(repeat_ctx.code_position + 4, |drive, stacks| { - if drive.popped_ctx().has_matched == Some(true) { - drive.success(); - } else { - stacks.repeat[drive.ctx.repeat_ctx_id].count -= 1; - drive.sync_string_position(); - drive.failure(); - } - }); - return; - } - - stacks.max_until.push(MaxUntilContext { - save_last_position: repeat_ctx.last_position, - }); - - if ((repeat_ctx.count as usize) < repeat_ctx.max_count || repeat_ctx.max_count == MAXREPEAT) - && drive.state.string_position != repeat_ctx.last_position - { - /* we may have enough matches, but if we can - match another item, do so */ - repeat_ctx.last_position = drive.state.string_position; - - drive.state.marks_push(); - - drive.next_ctx_at(repeat_ctx.code_position + 4, |drive, stacks| { - let save_last_position = stacks.max_until.pop().unwrap().save_last_position; - let repeat_ctx = &mut stacks.repeat[drive.ctx.repeat_ctx_id]; - repeat_ctx.last_position = save_last_position; - - if drive.popped_ctx().has_matched == Some(true) { - drive.state.marks_pop_discard(); - return drive.success(); - } - - drive.state.marks_pop(); - repeat_ctx.count -= 1; - drive.sync_string_position(); - - /* cannot match more repeated items here. make sure the - tail matches */ - let next_ctx = drive.next_ctx(1, tail_callback); - next_ctx.repeat_ctx_id = repeat_ctx.prev_id; - }); - return; - } - - /* cannot match more repeated items here. make sure the - tail matches */ - let next_ctx = drive.next_ctx(1, tail_callback); - next_ctx.repeat_ctx_id = repeat_ctx.prev_id; - - fn tail_callback(drive: &mut StateContext, _stacks: &mut Stacks) { - if drive.popped_ctx().has_matched == Some(true) { - drive.success(); - } else { - drive.sync_string_position(); - drive.failure(); + fn back_offset(&self, offset: usize, skip: usize) -> usize { + let bytes = self.as_bytes(); + let mut back_offset = offset; + for _ in 0..skip { + back_offset = utf8_back_peek_offset(bytes, back_offset); } + back_offset } } -#[derive(Debug, Default)] -struct Stacks { - branch: Vec, - min_repeat_one: Vec, - repeat_one: Vec, - repeat: Vec, - min_until: Vec, - max_until: Vec, -} - -impl Stacks { - fn clear(&mut self) { - self.branch.clear(); - self.min_repeat_one.clear(); - self.repeat_one.clear(); - self.repeat.clear(); - self.min_until.clear(); - self.max_until.clear(); - } - - fn branch_last(&mut self) -> &mut BranchContext { - self.branch.last_mut().unwrap() - } - fn min_repeat_one_last(&mut self) -> &mut MinRepeatOneContext { - self.min_repeat_one.last_mut().unwrap() - } - fn repeat_one_last(&mut self) -> &mut RepeatOneContext { - self.repeat_one.last_mut().unwrap() - } -} - -#[derive(Debug, Clone, Copy)] -pub enum StrDrive<'a> { - Str(&'a str), - Bytes(&'a [u8]), -} - -impl<'a> From<&'a str> for StrDrive<'a> { - fn from(s: &'a str) -> Self { - Self::Str(s) - } -} -impl<'a> From<&'a [u8]> for StrDrive<'a> { - fn from(b: &'a [u8]) -> Self { - Self::Bytes(b) - } -} - -impl<'a> StrDrive<'a> { +impl<'a> StrDrive for &'a [u8] { fn offset(&self, offset: usize, skip: usize) -> usize { - match *self { - StrDrive::Str(s) => s - .get(offset..) - .and_then(|s| s.char_indices().nth(skip).map(|x| x.0 + offset)) - .unwrap_or(s.len()), - StrDrive::Bytes(_) => offset + skip, - } + offset + skip } - pub fn count(&self) -> usize { - match *self { - StrDrive::Str(s) => s.chars().count(), - StrDrive::Bytes(b) => b.len(), - } + fn count(&self) -> usize { + self.len() } fn peek(&self, offset: usize) -> u32 { - match *self { - StrDrive::Str(s) => unsafe { s.get_unchecked(offset..) }.chars().next().unwrap() as u32, - StrDrive::Bytes(b) => b[offset] as u32, - } + self[offset] as u32 } fn back_peek(&self, offset: usize) -> u32 { - match *self { - StrDrive::Str(s) => { - let bytes = s.as_bytes(); - let back_offset = utf8_back_peek_offset(bytes, offset); - match offset - back_offset { - 1 => u32::from_be_bytes([0, 0, 0, bytes[offset - 1]]), - 2 => u32::from_be_bytes([0, 0, bytes[offset - 2], bytes[offset - 1]]), - 3 => u32::from_be_bytes([ - 0, - bytes[offset - 3], - bytes[offset - 2], - bytes[offset - 1], - ]), - 4 => u32::from_be_bytes([ - bytes[offset - 4], - bytes[offset - 3], - bytes[offset - 2], - bytes[offset - 1], - ]), - _ => unreachable!(), - } - } - StrDrive::Bytes(b) => b[offset - 1] as u32, - } + self[offset - 1] as u32 } fn back_offset(&self, offset: usize, skip: usize) -> usize { - match *self { - StrDrive::Str(s) => { - let bytes = s.as_bytes(); - let mut back_offset = offset; - for _ in 0..skip { - back_offset = utf8_back_peek_offset(bytes, back_offset); - } - back_offset - } - StrDrive::Bytes(_) => offset - skip, - } + offset - skip } } -type OpcodeHandler = fn(&mut StateContext, &mut Stacks); +// type OpcodeHandler = for<'a>fn(&mut StateContext<'a, S>, &mut Stacks); #[derive(Clone, Copy)] -struct MatchContext { +struct MatchContext<'a, S: StrDrive> { string_position: usize, string_offset: usize, code_position: usize, has_matched: Option, toplevel: bool, - handler: Option, + handler: Option, &mut Self)>, repeat_ctx_id: usize, } -impl std::fmt::Debug for MatchContext { +impl<'a, S: StrDrive> std::fmt::Debug for MatchContext<'a, S> { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { f.debug_struct("MatchContext") .field("string_position", &self.string_position) @@ -902,164 +949,178 @@ impl std::fmt::Debug for MatchContext { } } -trait ContextDrive { - fn ctx(&self) -> &MatchContext; - fn ctx_mut(&mut self) -> &mut MatchContext; - fn state(&self) -> &State; - - fn popped_ctx(&self) -> &MatchContext { - self.state().popped_context.as_ref().unwrap() +impl<'a, S: StrDrive> MatchContext<'a, S> { + fn remaining_codes(&self, state: &State<'a, S>) -> usize { + state.pattern_codes.len() - self.code_position } - - fn pattern(&self) -> &[u32] { - &self.state().pattern_codes[self.ctx().code_position..] - } - - fn peek_char(&self) -> u32 { - self.state().string.peek(self.ctx().string_offset) - } - fn peek_code(&self, peek: usize) -> u32 { - self.state().pattern_codes[self.ctx().code_position + peek] - } - - fn back_peek_char(&self) -> u32 { - self.state().string.back_peek(self.ctx().string_offset) - } - fn back_skip_char(&mut self, skip_count: usize) { - self.ctx_mut().string_position -= skip_count; - self.ctx_mut().string_offset = self - .state() - .string - .back_offset(self.ctx().string_offset, skip_count); - } - - fn skip_char(&mut self, skip_count: usize) { - self.ctx_mut().string_offset = self - .state() - .string - .offset(self.ctx().string_offset, skip_count); - self.ctx_mut().string_position += skip_count; - } - fn skip_code(&mut self, skip_count: usize) { - self.ctx_mut().code_position += skip_count; - } - fn skip_code_from(&mut self, peek: usize) { - self.skip_code(self.peek_code(peek) as usize + 1); - } - - fn remaining_chars(&self) -> usize { - self.state().end - self.ctx().string_position - } - fn remaining_codes(&self) -> usize { - self.state().pattern_codes.len() - self.ctx().code_position - } - - fn at_beginning(&self) -> bool { - // self.ctx().string_position == self.state().start - self.ctx().string_position == 0 - } - fn at_end(&self) -> bool { - self.ctx().string_position == self.state().end - } - fn at_linebreak(&self) -> bool { - !self.at_end() && is_linebreak(self.peek_char()) - } - fn at_boundary bool>(&self, mut word_checker: F) -> bool { - if self.at_beginning() && self.at_end() { - return false; - } - let that = !self.at_beginning() && word_checker(self.back_peek_char()); - let this = !self.at_end() && word_checker(self.peek_char()); - this != that - } - fn at_non_boundary bool>(&self, mut word_checker: F) -> bool { - if self.at_beginning() && self.at_end() { - return false; - } - let that = !self.at_beginning() && word_checker(self.back_peek_char()); - let this = !self.at_end() && word_checker(self.peek_char()); - this == that - } - - fn can_success(&self) -> bool { - if !self.ctx().toplevel { - return true; - } - if self.state().match_all && !self.at_end() { - return false; - } - if self.state().must_advance && self.ctx().string_position == self.state().start { - return false; - } - true - } - - fn success(&mut self) { - self.ctx_mut().has_matched = Some(true); + + fn peek_code(&self, state: &State<'a, S>, peek: usize) -> u32 { + state.pattern_codes[self.code_position + peek] } fn failure(&mut self) { - self.ctx_mut().has_matched = Some(false); + self.has_matched = Some(false); } } -struct StateContext<'a> { - state: State<'a>, - ctx: MatchContext, - next_ctx: Option, -} - -impl ContextDrive for StateContext<'_> { - fn ctx(&self) -> &MatchContext { - &self.ctx - } - fn ctx_mut(&mut self) -> &mut MatchContext { - &mut self.ctx - } - fn state(&self) -> &State { - &self.state - } -} - -impl StateContext<'_> { - fn next_ctx_from(&mut self, peek: usize, handler: OpcodeHandler) -> &mut MatchContext { - self.next_ctx(self.peek_code(peek) as usize + 1, handler) - } - fn next_ctx(&mut self, offset: usize, handler: OpcodeHandler) -> &mut MatchContext { - self.next_ctx_at(self.ctx.code_position + offset, handler) - } - fn next_ctx_at(&mut self, code_position: usize, handler: OpcodeHandler) -> &mut MatchContext { - self.next_ctx = Some(MatchContext { - code_position, - has_matched: None, - handler: None, - ..self.ctx - }); - self.ctx.handler = Some(handler); - self.next_ctx.as_mut().unwrap() - } - - fn sync_string_position(&mut self) { - self.state.string_position = self.ctx.string_position; - } -} - -struct StateRefContext<'a> { - entity: &'a StateContext<'a>, - ctx: MatchContext, -} - -impl ContextDrive for StateRefContext<'_> { - fn ctx(&self) -> &MatchContext { - &self.ctx - } - fn ctx_mut(&mut self) -> &mut MatchContext { - &mut self.ctx - } - fn state(&self) -> &State { - &self.entity.state - } -} +// trait ContextDrive<'a, T: StrDrive<'a>> { +// fn ctx(&self) -> &MatchContext; +// fn ctx_mut(&mut self) -> &mut MatchContext; +// fn state(&self) -> &State<'a, T>; + +// fn popped_ctx(&self) -> &MatchContext { +// self.state().popped_context.as_ref().unwrap() +// } + +// fn pattern(&self) -> &[u32] { +// &self.state().pattern_codes[self.ctx().code_position..] +// } + +// fn peek_char(&self) -> u32 { +// self.state().string.peek(self.ctx().string_offset) +// } +// fn peek_code(&self, peek: usize) -> u32 { +// self.state().pattern_codes[self.ctx().code_position + peek] +// } + +// fn back_peek_char(&self) -> u32 { +// self.state().string.back_peek(self.ctx().string_offset) +// } +// fn back_skip_char(&mut self, skip_count: usize) { +// self.ctx_mut().string_position -= skip_count; +// self.ctx_mut().string_offset = self +// .state() +// .string +// .back_offset(self.ctx().string_offset, skip_count); +// } + +// fn skip_char(&mut self, skip_count: usize) { +// self.ctx_mut().string_offset = self +// .state() +// .string +// .offset(self.ctx().string_offset, skip_count); +// self.ctx_mut().string_position += skip_count; +// } +// fn skip_code(&mut self, skip_count: usize) { +// self.ctx_mut().code_position += skip_count; +// } +// fn skip_code_from(&mut self, peek: usize) { +// self.skip_code(self.peek_code(peek) as usize + 1); +// } + +// fn remaining_chars(&self) -> usize { +// self.state().end - self.ctx().string_position +// } +// fn remaining_codes(&self) -> usize { +// self.state().pattern_codes.len() - self.ctx().code_position +// } + +// fn at_beginning(&self) -> bool { +// // self.ctx().string_position == self.state().start +// self.ctx().string_position == 0 +// } +// fn at_end(&self) -> bool { +// self.ctx().string_position == self.state().end +// } +// fn at_linebreak(&self) -> bool { +// !self.at_end() && is_linebreak(self.peek_char()) +// } +// fn at_boundary bool>(&self, mut word_checker: F) -> bool { +// if self.at_beginning() && self.at_end() { +// return false; +// } +// let that = !self.at_beginning() && word_checker(self.back_peek_char()); +// let this = !self.at_end() && word_checker(self.peek_char()); +// this != that +// } +// fn at_non_boundary bool>(&self, mut word_checker: F) -> bool { +// if self.at_beginning() && self.at_end() { +// return false; +// } +// let that = !self.at_beginning() && word_checker(self.back_peek_char()); +// let this = !self.at_end() && word_checker(self.peek_char()); +// this == that +// } + +// fn can_success(&self) -> bool { +// if !self.ctx().toplevel { +// return true; +// } +// if self.state().match_all && !self.at_end() { +// return false; +// } +// if self.state().must_advance && self.ctx().string_position == self.state().start { +// return false; +// } +// true +// } + +// fn success(&mut self) { +// self.ctx_mut().has_matched = Some(true); +// } + +// fn failure(&mut self) { +// self.ctx_mut().has_matched = Some(false); +// } +// } + +// struct StateContext<'a, S: StrDrive<'a>> { +// state: State<'a, S>, +// ctx: MatchContext, +// next_ctx: Option, +// } + +// impl<'a, S: StrDrive<'a>> ContextDrive<'a, S> for StateContext<'a, S> { +// fn ctx(&self) -> &MatchContext { +// &self.ctx +// } +// fn ctx_mut(&mut self) -> &mut MatchContext { +// &mut self.ctx +// } +// fn state(&self) -> &State<'a, S> { +// &self.state +// } +// } + +// impl StateContext<'_> { +// fn next_ctx_from(&mut self, peek: usize, handler: OpcodeHandler) -> &mut MatchContext { +// self.next_ctx(self.peek_code(peek) as usize + 1, handler) +// } +// fn next_ctx(&mut self, offset: usize, handler: OpcodeHandler) -> &mut MatchContext { +// self.next_ctx_at(self.ctx.code_position + offset, handler) +// } +// fn next_ctx_at(&mut self, code_position: usize, handler: OpcodeHandler) -> &mut MatchContext { +// self.next_ctx = Some(MatchContext { +// code_position, +// has_matched: None, +// handler: None, +// ..self.ctx +// }); +// self.ctx.handler = Some(handler); +// self.next_ctx.as_mut().unwrap() +// } + +// fn sync_string_position(&mut self) { +// self.state.string_position = self.ctx.string_position; +// } +// } + +// struct StateRefContext<'a> { +// entity: &'a StateContext<'a>, +// ctx: MatchContext, +// } + +// impl ContextDrive for StateRefContext<'_> { +// fn ctx(&self) -> &MatchContext { +// &self.ctx +// } +// fn ctx_mut(&mut self) -> &mut MatchContext { +// &mut self.ctx +// } +// fn state(&self) -> &State { +// &self.entity.state +// } +// } fn char_loc_ignore(code: u32, c: u32) -> bool { code == c || code == lower_locate(c) || code == upper_locate(c) @@ -1074,77 +1135,77 @@ fn charset_loc_ignore(set: &[u32], c: u32) -> bool { up != lo && charset(set, up) } -fn general_op_groupref u32>(drive: &mut StateContext, mut f: F) { - let (group_start, group_end) = drive.state.get_marks(drive.peek_code(1) as usize); - let (group_start, group_end) = match (group_start, group_end) { - (Some(start), Some(end)) if start <= end => (start, end), - _ => { - return drive.failure(); - } - }; - - let mut wdrive = StateRefContext { - entity: drive, - ctx: drive.ctx, - }; - let mut gdrive = StateRefContext { - entity: drive, - ctx: MatchContext { - string_position: group_start, - // TODO: cache the offset - string_offset: drive.state.string.offset(0, group_start), - ..drive.ctx - }, - }; - - for _ in group_start..group_end { - if wdrive.at_end() || f(wdrive.peek_char()) != f(gdrive.peek_char()) { - return drive.failure(); - } - wdrive.skip_char(1); - gdrive.skip_char(1); - } - - let position = wdrive.ctx.string_position; - let offset = wdrive.ctx.string_offset; - drive.skip_code(2); - drive.ctx.string_position = position; - drive.ctx.string_offset = offset; -} - -fn general_op_literal bool>(drive: &mut StateContext, f: F) { - if drive.at_end() || !f(drive.peek_code(1), drive.peek_char()) { - drive.failure(); - } else { - drive.skip_code(2); - drive.skip_char(1); - } -} - -fn general_op_in bool>(drive: &mut StateContext, f: F) { - if drive.at_end() || !f(&drive.pattern()[2..], drive.peek_char()) { - drive.failure(); - } else { - drive.skip_code_from(1); - drive.skip_char(1); - } -} - -fn at(drive: &StateContext, atcode: SreAtCode) -> bool { - match atcode { - SreAtCode::BEGINNING | SreAtCode::BEGINNING_STRING => drive.at_beginning(), - SreAtCode::BEGINNING_LINE => drive.at_beginning() || is_linebreak(drive.back_peek_char()), - SreAtCode::BOUNDARY => drive.at_boundary(is_word), - SreAtCode::NON_BOUNDARY => drive.at_non_boundary(is_word), - SreAtCode::END => (drive.remaining_chars() == 1 && drive.at_linebreak()) || drive.at_end(), - SreAtCode::END_LINE => drive.at_linebreak() || drive.at_end(), - SreAtCode::END_STRING => drive.at_end(), - SreAtCode::LOC_BOUNDARY => drive.at_boundary(is_loc_word), - SreAtCode::LOC_NON_BOUNDARY => drive.at_non_boundary(is_loc_word), - SreAtCode::UNI_BOUNDARY => drive.at_boundary(is_uni_word), - SreAtCode::UNI_NON_BOUNDARY => drive.at_non_boundary(is_uni_word), - } -} +// fn general_op_groupref u32>(drive: &mut StateContext, mut f: F) { +// let (group_start, group_end) = drive.state.get_marks(drive.peek_code(1) as usize); +// let (group_start, group_end) = match (group_start, group_end) { +// (Some(start), Some(end)) if start <= end => (start, end), +// _ => { +// return drive.failure(); +// } +// }; + +// let mut wdrive = StateRefContext { +// entity: drive, +// ctx: drive.ctx, +// }; +// let mut gdrive = StateRefContext { +// entity: drive, +// ctx: MatchContext { +// string_position: group_start, +// // TODO: cache the offset +// string_offset: drive.state.string.offset(0, group_start), +// ..drive.ctx +// }, +// }; + +// for _ in group_start..group_end { +// if wdrive.at_end() || f(wdrive.peek_char()) != f(gdrive.peek_char()) { +// return drive.failure(); +// } +// wdrive.skip_char(1); +// gdrive.skip_char(1); +// } + +// let position = wdrive.ctx.string_position; +// let offset = wdrive.ctx.string_offset; +// drive.skip_code(2); +// drive.ctx.string_position = position; +// drive.ctx.string_offset = offset; +// } + +// fn general_op_literal bool>(drive: &mut StateContext, f: F) { +// if drive.at_end() || !f(drive.peek_code(1), drive.peek_char()) { +// drive.failure(); +// } else { +// drive.skip_code(2); +// drive.skip_char(1); +// } +// } + +// fn general_op_in bool>(drive: &mut StateContext, f: F) { +// if drive.at_end() || !f(&drive.pattern()[2..], drive.peek_char()) { +// drive.failure(); +// } else { +// drive.skip_code_from(1); +// drive.skip_char(1); +// } +// } + +// fn at(drive: &StateContext, atcode: SreAtCode) -> bool { +// match atcode { +// SreAtCode::BEGINNING | SreAtCode::BEGINNING_STRING => drive.at_beginning(), +// SreAtCode::BEGINNING_LINE => drive.at_beginning() || is_linebreak(drive.back_peek_char()), +// SreAtCode::BOUNDARY => drive.at_boundary(is_word), +// SreAtCode::NON_BOUNDARY => drive.at_non_boundary(is_word), +// SreAtCode::END => (drive.remaining_chars() == 1 && drive.at_linebreak()) || drive.at_end(), +// SreAtCode::END_LINE => drive.at_linebreak() || drive.at_end(), +// SreAtCode::END_STRING => drive.at_end(), +// SreAtCode::LOC_BOUNDARY => drive.at_boundary(is_loc_word), +// SreAtCode::LOC_NON_BOUNDARY => drive.at_non_boundary(is_loc_word), +// SreAtCode::UNI_BOUNDARY => drive.at_boundary(is_uni_word), +// SreAtCode::UNI_NON_BOUNDARY => drive.at_non_boundary(is_uni_word), +// } +// } fn category(catcode: SreCatCode, c: u32) -> bool { match catcode { @@ -1262,95 +1323,95 @@ fn charset(set: &[u32], ch: u32) -> bool { false } -/* General case */ -fn general_count(drive: &mut StateContext, stacks: &mut Stacks, max_count: usize) -> usize { - let mut count = 0; - let max_count = std::cmp::min(max_count, drive.remaining_chars()); - - let save_ctx = drive.ctx; - drive.skip_code(4); - let reset_position = drive.ctx.code_position; - - while count < max_count { - drive.ctx.code_position = reset_position; - let code = drive.peek_code(0); - let code = SreOpcode::try_from(code).unwrap(); - dispatch(code, drive, stacks); - if drive.ctx.has_matched == Some(false) { - break; - } - count += 1; - } - drive.ctx = save_ctx; - count -} - -fn _count(drive: &mut StateContext, stacks: &mut Stacks, max_count: usize) -> usize { - let save_ctx = drive.ctx; - let max_count = std::cmp::min(max_count, drive.remaining_chars()); - let end = drive.ctx.string_position + max_count; - let opcode = SreOpcode::try_from(drive.peek_code(0)).unwrap(); - - match opcode { - SreOpcode::ANY => { - while !drive.ctx.string_position < end && !drive.at_linebreak() { - drive.skip_char(1); - } - } - SreOpcode::ANY_ALL => { - drive.skip_char(max_count); - } - SreOpcode::IN => { - while !drive.ctx.string_position < end - && charset(&drive.pattern()[2..], drive.peek_char()) - { - drive.skip_char(1); - } - } - SreOpcode::LITERAL => { - general_count_literal(drive, end, |code, c| code == c as u32); - } - SreOpcode::NOT_LITERAL => { - general_count_literal(drive, end, |code, c| code != c as u32); - } - SreOpcode::LITERAL_IGNORE => { - general_count_literal(drive, end, |code, c| code == lower_ascii(c) as u32); - } - SreOpcode::NOT_LITERAL_IGNORE => { - general_count_literal(drive, end, |code, c| code != lower_ascii(c) as u32); - } - SreOpcode::LITERAL_LOC_IGNORE => { - general_count_literal(drive, end, char_loc_ignore); - } - SreOpcode::NOT_LITERAL_LOC_IGNORE => { - general_count_literal(drive, end, |code, c| !char_loc_ignore(code, c)); - } - SreOpcode::LITERAL_UNI_IGNORE => { - general_count_literal(drive, end, |code, c| code == lower_unicode(c) as u32); - } - SreOpcode::NOT_LITERAL_UNI_IGNORE => { - general_count_literal(drive, end, |code, c| code != lower_unicode(c) as u32); - } - _ => { - return general_count(drive, stacks, max_count); - } - } - - let count = drive.ctx.string_position - drive.state.string_position; - drive.ctx = save_ctx; - count -} - -fn general_count_literal bool>( - drive: &mut StateContext, - end: usize, - mut f: F, -) { - let ch = drive.peek_code(1); - while !drive.ctx.string_position < end && f(ch, drive.peek_char()) { - drive.skip_char(1); - } -} +// /* General case */ +// fn general_count(drive: &mut StateContext, stacks: &mut Stacks, max_count: usize) -> usize { +// let mut count = 0; +// let max_count = std::cmp::min(max_count, drive.remaining_chars()); + +// let save_ctx = drive.ctx; +// drive.skip_code(4); +// let reset_position = drive.ctx.code_position; + +// while count < max_count { +// drive.ctx.code_position = reset_position; +// let code = drive.peek_code(0); +// let code = SreOpcode::try_from(code).unwrap(); +// dispatch(code, drive, stacks); +// if drive.ctx.has_matched == Some(false) { +// break; +// } +// count += 1; +// } +// drive.ctx = save_ctx; +// count +// } + +// fn _count(drive: &mut StateContext, stacks: &mut Stacks, max_count: usize) -> usize { +// let save_ctx = drive.ctx; +// let max_count = std::cmp::min(max_count, drive.remaining_chars()); +// let end = drive.ctx.string_position + max_count; +// let opcode = SreOpcode::try_from(drive.peek_code(0)).unwrap(); + +// match opcode { +// SreOpcode::ANY => { +// while !drive.ctx.string_position < end && !drive.at_linebreak() { +// drive.skip_char(1); +// } +// } +// SreOpcode::ANY_ALL => { +// drive.skip_char(max_count); +// } +// SreOpcode::IN => { +// while !drive.ctx.string_position < end +// && charset(&drive.pattern()[2..], drive.peek_char()) +// { +// drive.skip_char(1); +// } +// } +// SreOpcode::LITERAL => { +// general_count_literal(drive, end, |code, c| code == c as u32); +// } +// SreOpcode::NOT_LITERAL => { +// general_count_literal(drive, end, |code, c| code != c as u32); +// } +// SreOpcode::LITERAL_IGNORE => { +// general_count_literal(drive, end, |code, c| code == lower_ascii(c) as u32); +// } +// SreOpcode::NOT_LITERAL_IGNORE => { +// general_count_literal(drive, end, |code, c| code != lower_ascii(c) as u32); +// } +// SreOpcode::LITERAL_LOC_IGNORE => { +// general_count_literal(drive, end, char_loc_ignore); +// } +// SreOpcode::NOT_LITERAL_LOC_IGNORE => { +// general_count_literal(drive, end, |code, c| !char_loc_ignore(code, c)); +// } +// SreOpcode::LITERAL_UNI_IGNORE => { +// general_count_literal(drive, end, |code, c| code == lower_unicode(c) as u32); +// } +// SreOpcode::NOT_LITERAL_UNI_IGNORE => { +// general_count_literal(drive, end, |code, c| code != lower_unicode(c) as u32); +// } +// _ => { +// return general_count(drive, stacks, max_count); +// } +// } + +// let count = drive.ctx.string_position - drive.state.string_position; +// drive.ctx = save_ctx; +// count +// } + +// fn general_count_literal bool>( +// drive: &mut StateContext, +// end: usize, +// mut f: F, +// ) { +// let ch = drive.peek_code(1); +// while !drive.ctx.string_position < end && f(ch, drive.peek_char()) { +// drive.skip_char(1); +// } +// } fn is_word(ch: u32) -> bool { ch == '_' as u32 From 34bde45a2c0906a3a3e03dca79a4426b4cf57655 Mon Sep 17 00:00:00 2001 From: Kangzhi Shi Date: Mon, 1 Aug 2022 22:19:49 +0200 Subject: [PATCH 055/960] pass compile --- benches/benches.rs | 22 +- src/engine.rs | 1802 ++++++++++++++++++++------------------------ tests/tests.rs | 20 +- 3 files changed, 821 insertions(+), 1023 deletions(-) diff --git a/benches/benches.rs b/benches/benches.rs index b86a592967..d000ceb62e 100644 --- a/benches/benches.rs +++ b/benches/benches.rs @@ -11,12 +11,12 @@ pub struct Pattern { } impl Pattern { - pub fn state<'a>( + pub fn state<'a, S: engine::StrDrive>( &self, - string: impl Into>, + string: S, range: std::ops::Range, - ) -> engine::State<'a> { - engine::State::new(string.into(), range.start, range.end, self.flags, self.code) + ) -> engine::State<'a, S> { + engine::State::new(string, range.start, range.end, self.flags, self.code) } } #[bench] @@ -84,28 +84,28 @@ fn benchmarks(b: &mut Bencher) { b.iter(move || { for (p, s) in &tests { let mut state = p.state(s.clone(), 0..usize::MAX); - state = state.search(); + state.search(); assert!(state.has_matched); state = p.state(s.clone(), 0..usize::MAX); - state = state.pymatch(); + state.pymatch(); assert!(state.has_matched); state = p.state(s.clone(), 0..usize::MAX); state.match_all = true; - state = state.pymatch(); + state.pymatch(); assert!(state.has_matched); let s2 = format!("{}{}{}", " ".repeat(10000), s, " ".repeat(10000)); state = p.state(s2.as_str(), 0..usize::MAX); - state = state.search(); + state.search(); assert!(state.has_matched); state = p.state(s2.as_str(), 10000..usize::MAX); - state = state.pymatch(); + state.pymatch(); assert!(state.has_matched); state = p.state(s2.as_str(), 10000..10000 + s.len()); - state = state.pymatch(); + state.pymatch(); assert!(state.has_matched); state = p.state(s2.as_str(), 10000..10000 + s.len()); state.match_all = true; - state = state.pymatch(); + state.pymatch(); assert!(state.has_matched); } }) diff --git a/src/engine.rs b/src/engine.rs index 8865eb6a39..b0717e1671 100644 --- a/src/engine.rs +++ b/src/engine.rs @@ -19,21 +19,45 @@ pub struct State<'a, S: StrDrive> { pub lastindex: isize, marks_stack: Vec<(Vec>, isize)>, context_stack: Vec>, - // branch_stack: Vec, - // min_repeat_one_stack: Vec, - // repeat_one_stack: Vec, - // repeat_stack: Vec, - // min_until_stack: Vec, - // max_until_stack: Vec, - // _stacks: Option>, + repeat_stack: Vec, pub string_position: usize, - popped_context: Option>, next_context: Option>, + popped_has_matched: bool, pub has_matched: bool, pub match_all: bool, pub must_advance: bool, } +macro_rules! next_ctx { + (offset $offset:expr, $state:expr, $ctx:expr, $handler:expr) => { + next_ctx!(position $ctx.code_position + $offset, $state, $ctx, $handler) + }; + (from $peek:expr, $state:expr, $ctx:expr, $handler:expr) => { + next_ctx!(position $ctx.peek_code($state, $peek) as usize + 1, $state, $ctx, $handler) + }; + (position $position:expr, $state:expr, $ctx:expr, $handler:expr) => { + {$state.next_context.insert(MatchContext { + code_position: $position, + has_matched: None, + handler: Some($handler), + ..*$ctx + })} + }; +} + +macro_rules! mark { + (push, $state:expr) => { + $state + .marks_stack + .push(($state.marks.clone(), $state.lastindex)) + }; + (pop, $state:expr) => { + let (marks, lastindex) = $state.marks_stack.pop().unwrap(); + $state.marks = marks; + $state.lastindex = lastindex; + }; +} + impl<'a, S: StrDrive> State<'a, S> { pub fn new( string: S, @@ -54,16 +78,10 @@ impl<'a, S: StrDrive> State<'a, S> { lastindex: -1, marks_stack: Vec::new(), context_stack: Vec::new(), - // branch_stack: Vec::new(), - // min_repeat_one_stack: Vec::new(), - // repeat_one_stack: Vec::new(), - // repeat_stack: Vec::new(), - // min_until_stack: Vec::new(), - // max_until_stack: Vec::new(), - // _stacks: Default::default(), + repeat_stack: Vec::new(), string_position: start, - popped_context: None, next_context: None, + popped_has_matched: false, has_matched: false, match_all: false, must_advance: false, @@ -75,15 +93,10 @@ impl<'a, S: StrDrive> State<'a, S> { self.marks.clear(); self.marks_stack.clear(); self.context_stack.clear(); - // self.branch_stack.clear(); - // self.min_repeat_one_stack.clear(); - // self.repeat_one_stack.clear(); - // self.repeat_stack.clear(); - // self.min_until_stack.clear(); - // self.max_until_stack.clear(); + self.repeat_stack.clear(); self.string_position = self.start; - self.popped_context = None; self.next_context = None; + self.popped_has_matched = false; self.has_matched = false; } @@ -104,14 +117,14 @@ impl<'a, S: StrDrive> State<'a, S> { (None, None) } } - fn marks_push(&mut self) { - self.marks_stack.push((self.marks.clone(), self.lastindex)); - } - fn marks_pop(&mut self) { - let (marks, lastindex) = self.marks_stack.pop().unwrap(); - self.marks = marks; - self.lastindex = lastindex; - } + // fn marks_push(&mut self) { + // self.marks_stack.push((self.marks.clone(), self.lastindex)); + // } + // fn marks_pop(&mut self) { + // let (marks, lastindex) = self.marks_stack.pop().unwrap(); + // self.marks = marks; + // self.lastindex = lastindex; + // } fn marks_pop_keep(&mut self) { let (marks, lastindex) = self.marks_stack.last().unwrap().clone(); self.marks = marks; @@ -121,46 +134,31 @@ impl<'a, S: StrDrive> State<'a, S> { self.marks_stack.pop(); } - fn _match(&mut self) { + fn _match(&mut self) { while let Some(mut ctx) = self.context_stack.pop() { - // let mut drive = StateContext { - // state: self, - // ctx, - // next_ctx: None, - // }; - // let mut state = self; - - if let Some(handler) = ctx.handler { + if let Some(handler) = ctx.handler.take() { handler(self, &mut ctx); } else if ctx.remaining_codes(self) > 0 { let code = ctx.peek_code(self, 0); let code = SreOpcode::try_from(code).unwrap(); - self.dispatch(code, &mut ctx); + dispatch(self, &mut ctx, code); } else { ctx.failure(); } - // let StateContext { - // mut state, - // ctx, - // next_ctx, - // } = drive; - - if ctx.has_matched.is_some() { - self.popped_context = Some(ctx); + if let Some(has_matched) = ctx.has_matched { + self.popped_has_matched = has_matched; } else { self.context_stack.push(ctx); if let Some(next_ctx) = self.next_context.take() { self.context_stack.push(next_ctx); } } - // self = state } - self.has_matched = self.popped_context.take().unwrap().has_matched == Some(true); - // self + self.has_matched = self.popped_has_matched; } - pub fn pymatch(mut self) -> Self { + pub fn pymatch(&mut self) { let ctx = MatchContext { string_position: self.start, string_offset: self.string.offset(0, self.start), @@ -169,18 +167,18 @@ impl<'a, S: StrDrive> State<'a, S> { toplevel: true, handler: None, repeat_ctx_id: usize::MAX, + count: -1, }; self.context_stack.push(ctx); self._match(); - self } - pub fn search(mut self) -> Self { + pub fn search(&mut self) { // TODO: optimize by op info and skip prefix if self.start > self.end { - return self; + return; } let mut start_offset = self.string.offset(0, self.start); @@ -193,6 +191,7 @@ impl<'a, S: StrDrive> State<'a, S> { toplevel: true, handler: None, repeat_ctx_id: usize::MAX, + count: -1, }; self.context_stack.push(ctx); self._match(); @@ -211,643 +210,503 @@ impl<'a, S: StrDrive> State<'a, S> { toplevel: false, handler: None, repeat_ctx_id: usize::MAX, + count: -1, }; self.context_stack.push(ctx); self._match(); } + } +} + +fn dispatch<'a, S: StrDrive>( + state: &mut State<'a, S>, + ctx: &mut MatchContext<'a, S>, + opcode: SreOpcode, +) { + match opcode { + SreOpcode::FAILURE => { + ctx.failure(); + } + SreOpcode::SUCCESS => { + if ctx.can_success(state) { + state.string_position = ctx.string_position; + ctx.success(); + } else { + ctx.failure(); + } + } + SreOpcode::ANY => { + if ctx.at_end(state) || ctx.at_linebreak(state) { + ctx.failure(); + } else { + ctx.skip_code(1); + ctx.skip_char(state, 1); + } + } + SreOpcode::ANY_ALL => { + if ctx.at_end(state) { + ctx.failure(); + } else { + ctx.skip_code(1); + ctx.skip_char(state, 1); + } + } + /* assert subpattern */ + /* */ + SreOpcode::ASSERT => op_assert(state, ctx), + SreOpcode::ASSERT_NOT => op_assert_not(state, ctx), + SreOpcode::AT => { + let atcode = SreAtCode::try_from(ctx.peek_code(state, 1)).unwrap(); + if at(state, ctx, atcode) { + ctx.skip_code(2); + } else { + ctx.failure(); + } + } + SreOpcode::BRANCH => op_branch(state, ctx), + SreOpcode::CATEGORY => { + let catcode = SreCatCode::try_from(ctx.peek_code(state, 1)).unwrap(); + if ctx.at_end(state) || !category(catcode, ctx.peek_char(state)) { + ctx.failure(); + } else { + ctx.skip_code(2); + ctx.skip_char(state, 1); + } + } + SreOpcode::IN => general_op_in(state, ctx, charset), + SreOpcode::IN_IGNORE => general_op_in(state, ctx, |set, c| charset(set, lower_ascii(c))), + SreOpcode::IN_UNI_IGNORE => { + general_op_in(state, ctx, |set, c| charset(set, lower_unicode(c))) + } + SreOpcode::IN_LOC_IGNORE => general_op_in(state, ctx, charset_loc_ignore), + SreOpcode::INFO | SreOpcode::JUMP => ctx.skip_code_from(state, 1), + SreOpcode::LITERAL => general_op_literal(state, ctx, |code, c| code == c), + SreOpcode::NOT_LITERAL => general_op_literal(state, ctx, |code, c| code != c), + SreOpcode::LITERAL_IGNORE => { + general_op_literal(state, ctx, |code, c| code == lower_ascii(c)) + } + SreOpcode::NOT_LITERAL_IGNORE => { + general_op_literal(state, ctx, |code, c| code != lower_ascii(c)) + } + SreOpcode::LITERAL_UNI_IGNORE => { + general_op_literal(state, ctx, |code, c| code == lower_unicode(c)) + } + SreOpcode::NOT_LITERAL_UNI_IGNORE => { + general_op_literal(state, ctx, |code, c| code != lower_unicode(c)) + } + SreOpcode::LITERAL_LOC_IGNORE => general_op_literal(state, ctx, char_loc_ignore), + SreOpcode::NOT_LITERAL_LOC_IGNORE => { + general_op_literal(state, ctx, |code, c| !char_loc_ignore(code, c)) + } + SreOpcode::MARK => { + state.set_mark(ctx.peek_code(state, 1) as usize, ctx.string_position); + ctx.skip_code(2); + } + SreOpcode::MAX_UNTIL => op_max_until(state, ctx), + SreOpcode::MIN_UNTIL => op_min_until(state, ctx), + SreOpcode::REPEAT => op_repeat(state, ctx), + SreOpcode::REPEAT_ONE => op_repeat_one(state, ctx), + SreOpcode::MIN_REPEAT_ONE => op_min_repeat_one(state, ctx), + SreOpcode::GROUPREF => general_op_groupref(state, ctx, |x| x), + SreOpcode::GROUPREF_IGNORE => general_op_groupref(state, ctx, lower_ascii), + SreOpcode::GROUPREF_LOC_IGNORE => general_op_groupref(state, ctx, lower_locate), + SreOpcode::GROUPREF_UNI_IGNORE => general_op_groupref(state, ctx, lower_unicode), + SreOpcode::GROUPREF_EXISTS => { + let (group_start, group_end) = state.get_marks(ctx.peek_code(state, 1) as usize); + match (group_start, group_end) { + (Some(start), Some(end)) if start <= end => { + ctx.skip_code(3); + } + _ => ctx.skip_code_from(state, 2), + } + } + _ => unreachable!("unexpected opcode"), + } +} - self +/* assert subpattern */ +/* */ +fn op_assert<'a, S: StrDrive>(state: &mut State<'a, S>, ctx: &mut MatchContext<'a, S>) { + let back = ctx.peek_code(state, 2) as usize; + if ctx.string_position < back { + return ctx.failure(); } - fn dispatch(&mut self, opcode: SreOpcode, ctx: &mut MatchContext<'a, S>) { - match opcode { - SreOpcode::FAILURE => { - ctx.has_matched = Some(false); + // let next_ctx = state.next_ctx(ctx, 3, |state, ctx| { + let next_ctx = next_ctx!(offset 3, state, ctx, |state, ctx| { + if state.popped_has_matched { + ctx.skip_code_from(state, 1); + } else { + ctx.failure(); + } + }); + next_ctx.back_skip_char(&state.string, back); + state.string_position = next_ctx.string_position; + next_ctx.toplevel = false; +} + +/* assert not subpattern */ +/* */ +fn op_assert_not<'a, S: StrDrive>(state: &mut State<'a, S>, ctx: &mut MatchContext<'a, S>) { + let back = ctx.peek_code(state, 2) as usize; + + if ctx.string_position < back { + return ctx.skip_code_from(state, 1); + } + + let next_ctx = next_ctx!(offset 3, state, ctx, |state, ctx| { + if state.popped_has_matched { + ctx.failure(); + } else { + ctx.skip_code_from(state, 1); + } + }); + next_ctx.back_skip_char(&state.string, back); + state.string_position = next_ctx.string_position; + next_ctx.toplevel = false; +} + +// alternation +// <0=skip> code ... +fn op_branch<'a, S: StrDrive>(state: &mut State<'a, S>, ctx: &mut MatchContext<'a, S>) { + // state.marks_push(); + mark!(push, state); + + ctx.count = 1; + create_context(state, ctx); + + fn create_context<'a, S: StrDrive>(state: &mut State<'a, S>, ctx: &mut MatchContext<'a, S>) { + let branch_offset = ctx.count as usize; + let next_length = ctx.peek_code(state, branch_offset) as isize; + if next_length == 0 { + state.marks_pop_discard(); + return ctx.failure(); + } + + state.string_position = ctx.string_position; + + ctx.count += next_length; + next_ctx!(offset branch_offset + 1, state, ctx, callback); + } + + fn callback<'a, S: StrDrive>(state: &mut State<'a, S>, ctx: &mut MatchContext<'a, S>) { + if state.popped_has_matched { + return ctx.success(); + } + state.marks_pop_keep(); + create_context(state, ctx); + } +} + +/* <1=min> <2=max> item tail */ +fn op_min_repeat_one<'a, S: StrDrive>(state: &mut State<'a, S>, ctx: &mut MatchContext<'a, S>) { + let min_count = ctx.peek_code(state, 2) as usize; + // let max_count = ctx.peek_code(state, 3) as usize; + + if ctx.remaining_chars(state) < min_count { + return ctx.failure(); + } + + state.string_position = ctx.string_position; + + ctx.count = if min_count == 0 { + 0 + } else { + let count = _count(state, ctx, min_count); + if count < min_count { + return ctx.failure(); + } + ctx.skip_char(state, count); + count as isize + }; + + let next_code = ctx.peek_code(state, ctx.peek_code(state, 1) as usize + 1); + if next_code == SreOpcode::SUCCESS as u32 && ctx.can_success(state) { + // tail is empty. we're finished + state.string_position = ctx.string_position; + return ctx.success(); + } + + mark!(push, state); + create_context(state, ctx); + + fn create_context<'a, S: StrDrive>(state: &mut State<'a, S>, ctx: &mut MatchContext<'a, S>) { + let max_count = ctx.peek_code(state, 3) as usize; + + if max_count == MAXREPEAT || ctx.count as usize <= max_count { + state.string_position = ctx.string_position; + next_ctx!(from 1, state, ctx, callback); + } else { + state.marks_pop_discard(); + ctx.failure(); + } + } + + fn callback<'a, S: StrDrive>(state: &mut State<'a, S>, ctx: &mut MatchContext<'a, S>) { + if state.popped_has_matched { + return ctx.success(); + } + + state.string_position = ctx.string_position; + + if _count(state, ctx, 1) == 0 { + state.marks_pop_discard(); + return ctx.failure(); + } + + ctx.skip_char(state, 1); + ctx.count += 1; + state.marks_pop_keep(); + create_context(state, ctx); + } +} + +/* match repeated sequence (maximizing regexp) */ +/* this operator only works if the repeated item is +exactly one character wide, and we're not already +collecting backtracking points. for other cases, +use the MAX_REPEAT operator */ +/* <1=min> <2=max> item tail */ +fn op_repeat_one<'a, S: StrDrive>(state: &mut State<'a, S>, ctx: &mut MatchContext<'a, S>) { + let min_count = ctx.peek_code(state, 2) as usize; + let max_count = ctx.peek_code(state, 3) as usize; + + if ctx.remaining_chars(state) < min_count { + return ctx.failure(); + } + + state.string_position = ctx.string_position; + + let count = _count(state, ctx, max_count); + ctx.skip_char(state, count); + if count < min_count { + return ctx.failure(); + } + + let next_code = ctx.peek_code(state, ctx.peek_code(state, 1) as usize + 1); + if next_code == SreOpcode::SUCCESS as u32 && ctx.can_success(state) { + // tail is empty. we're finished + state.string_position = ctx.string_position; + return ctx.success(); + } + + mark!(push, state); + ctx.count = count as isize; + create_context(state, ctx); + + fn create_context<'a, S: StrDrive>(state: &mut State<'a, S>, ctx: &mut MatchContext<'a, S>) { + let min_count = ctx.peek_code(state, 2) as isize; + let next_code = ctx.peek_code(state, ctx.peek_code(state, 1) as usize + 1); + if next_code == SreOpcode::LITERAL as u32 { + // Special case: Tail starts with a literal. Skip positions where + // the rest of the pattern cannot possibly match. + let c = ctx.peek_code(state, ctx.peek_code(state, 1) as usize + 2); + while ctx.at_end(state) || ctx.peek_char(state) != c { + if ctx.count <= min_count { + state.marks_pop_discard(); + return ctx.failure(); + } + ctx.back_skip_char(&state.string, 1); + ctx.count -= 1; + } + } + + state.string_position = ctx.string_position; + + // General case: backtracking + next_ctx!(from 1, state, ctx, callback); + } + + fn callback<'a, S: StrDrive>(state: &mut State<'a, S>, ctx: &mut MatchContext<'a, S>) { + if state.popped_has_matched { + return ctx.success(); + } + + let min_count = ctx.peek_code(state, 2) as isize; + + if ctx.count <= min_count { + state.marks_pop_discard(); + return ctx.failure(); + } + + ctx.back_skip_char(&state.string, 1); + ctx.count -= 1; + + state.marks_pop_keep(); + create_context(state, ctx); + } +} + +#[derive(Debug, Clone, Copy)] +struct RepeatContext { + count: isize, + min_count: usize, + max_count: usize, + code_position: usize, + last_position: usize, + prev_id: usize, +} + +/* create repeat context. all the hard work is done +by the UNTIL operator (MAX_UNTIL, MIN_UNTIL) */ +/* <1=min> <2=max> item tail */ +fn op_repeat<'a, S: StrDrive>(state: &mut State<'a, S>, ctx: &mut MatchContext<'a, S>) { + let repeat_ctx = RepeatContext { + count: -1, + min_count: ctx.peek_code(state, 2) as usize, + max_count: ctx.peek_code(state, 3) as usize, + code_position: ctx.code_position, + last_position: std::usize::MAX, + prev_id: ctx.repeat_ctx_id, + }; + + state.repeat_stack.push(repeat_ctx); + + state.string_position = ctx.string_position; + + let next_ctx = next_ctx!(from 1, state, ctx, |state, ctx| { + ctx.has_matched = Some(state.popped_has_matched); + state.repeat_stack.pop(); + }); + next_ctx.repeat_ctx_id = state.repeat_stack.len() - 1; +} + +/* minimizing repeat */ +fn op_min_until<'a, S: StrDrive>(state: &mut State<'a, S>, ctx: &mut MatchContext<'a, S>) { + let repeat_ctx = state.repeat_stack.last_mut().unwrap(); + + state.string_position = ctx.string_position; + + repeat_ctx.count += 1; + + if (repeat_ctx.count as usize) < repeat_ctx.min_count { + // not enough matches + next_ctx!(position repeat_ctx.code_position + 4, state, ctx, |state, ctx| { + if state.popped_has_matched { + ctx.success(); + } else { + state.repeat_stack[ctx.repeat_ctx_id].count -= 1; + state.string_position = ctx.string_position; + ctx.failure(); + } + }); + return; + } + + mark!(push, state); + + ctx.count = ctx.repeat_ctx_id as isize; + + // see if the tail matches + let next_ctx = next_ctx!(offset 1, state, ctx, |state, ctx| { + if state.popped_has_matched { + return ctx.success(); + } + + ctx.repeat_ctx_id = ctx.count as usize; + + let repeat_ctx = &mut state.repeat_stack[ctx.repeat_ctx_id]; + + state.string_position = ctx.string_position; + + mark!(pop, state); + + // match more until tail matches + + if repeat_ctx.count as usize >= repeat_ctx.max_count && repeat_ctx.max_count != MAXREPEAT + || state.string_position == repeat_ctx.last_position + { + repeat_ctx.count -= 1; + return ctx.failure(); + } + + /* zero-width match protection */ + repeat_ctx.last_position = state.string_position; + + next_ctx!(position repeat_ctx.code_position + 4, state, ctx, |state, ctx| { + if state.popped_has_matched { + ctx.success(); + } else { + state.repeat_stack[ctx.repeat_ctx_id].count -= 1; + state.string_position = ctx.string_position; + ctx.failure(); + } + }); + }); + next_ctx.repeat_ctx_id = repeat_ctx.prev_id; +} + +/* maximizing repeat */ +fn op_max_until<'a, S: StrDrive>(state: &mut State<'a, S>, ctx: &mut MatchContext<'a, S>) { + let repeat_ctx = &mut state.repeat_stack[ctx.repeat_ctx_id]; + + state.string_position = ctx.string_position; + + repeat_ctx.count += 1; + + if (repeat_ctx.count as usize) < repeat_ctx.min_count { + // not enough matches + next_ctx!(position repeat_ctx.code_position + 4, state, ctx, |state, ctx| { + if state.popped_has_matched { + ctx.success(); + } else { + state.repeat_stack[ctx.repeat_ctx_id].count -= 1; + state.string_position = ctx.string_position; + ctx.failure(); + } + }); + return; + } + + if ((repeat_ctx.count as usize) < repeat_ctx.max_count || repeat_ctx.max_count == MAXREPEAT) + && state.string_position != repeat_ctx.last_position + { + /* we may have enough matches, but if we can + match another item, do so */ + mark!(push, state); + + ctx.count = repeat_ctx.last_position as isize; + repeat_ctx.last_position = state.string_position; + + next_ctx!(position repeat_ctx.code_position + 4, state, ctx, |state, ctx| { + let save_last_position = ctx.count as usize; + let repeat_ctx = &mut state.repeat_stack[ctx.repeat_ctx_id]; + repeat_ctx.last_position = save_last_position; + + if state.popped_has_matched { + state.marks_pop_discard(); + return ctx.success(); } - SreOpcode::SUCCESS => todo!(), - SreOpcode::ANY => todo!(), - SreOpcode::ANY_ALL => todo!(), - SreOpcode::ASSERT => todo!(), - SreOpcode::ASSERT_NOT => todo!(), - SreOpcode::AT => todo!(), - SreOpcode::BRANCH => todo!(), - SreOpcode::CALL => todo!(), - SreOpcode::CATEGORY => todo!(), - SreOpcode::CHARSET => todo!(), - SreOpcode::BIGCHARSET => todo!(), - SreOpcode::GROUPREF => todo!(), - SreOpcode::GROUPREF_EXISTS => todo!(), - SreOpcode::IN => todo!(), - SreOpcode::INFO => todo!(), - SreOpcode::JUMP => todo!(), - SreOpcode::LITERAL => todo!(), - SreOpcode::MARK => todo!(), - SreOpcode::MAX_UNTIL => todo!(), - SreOpcode::MIN_UNTIL => todo!(), - SreOpcode::NOT_LITERAL => todo!(), - SreOpcode::NEGATE => todo!(), - SreOpcode::RANGE => todo!(), - SreOpcode::REPEAT => todo!(), - SreOpcode::REPEAT_ONE => todo!(), - SreOpcode::SUBPATTERN => todo!(), - SreOpcode::MIN_REPEAT_ONE => todo!(), - SreOpcode::GROUPREF_IGNORE => todo!(), - SreOpcode::IN_IGNORE => todo!(), - SreOpcode::LITERAL_IGNORE => todo!(), - SreOpcode::NOT_LITERAL_IGNORE => todo!(), - SreOpcode::GROUPREF_LOC_IGNORE => todo!(), - SreOpcode::IN_LOC_IGNORE => todo!(), - SreOpcode::LITERAL_LOC_IGNORE => todo!(), - SreOpcode::NOT_LITERAL_LOC_IGNORE => todo!(), - SreOpcode::GROUPREF_UNI_IGNORE => todo!(), - SreOpcode::IN_UNI_IGNORE => todo!(), - SreOpcode::LITERAL_UNI_IGNORE => todo!(), - SreOpcode::NOT_LITERAL_UNI_IGNORE => todo!(), - SreOpcode::RANGE_UNI_IGNORE => todo!(), + + mark!(pop, state); + repeat_ctx.count -= 1; + + state.string_position = ctx.string_position; + + /* cannot match more repeated items here. make sure the + tail matches */ + let next_ctx = next_ctx!(offset 1, state, ctx, tail_callback); + next_ctx.repeat_ctx_id = repeat_ctx.prev_id; + }); + return; + } + + /* cannot match more repeated items here. make sure the + tail matches */ + let next_ctx = next_ctx!(offset 1, state, ctx, tail_callback); + next_ctx.repeat_ctx_id = repeat_ctx.prev_id; + + fn tail_callback<'a, S: StrDrive>(state: &mut State<'a, S>, ctx: &mut MatchContext<'a, S>) { + if state.popped_has_matched { + ctx.success(); + } else { + state.string_position = ctx.string_position; + ctx.failure(); } } } -// fn dispatch(opcode: SreOpcode, drive: &mut StateContext, stacks: &mut Stacks) { -// match opcode { -// SreOpcode::FAILURE => { -// drive.failure(); -// } -// SreOpcode::SUCCESS => { -// drive.ctx.has_matched = Some(drive.can_success()); -// if drive.ctx.has_matched == Some(true) { -// drive.state.string_position = drive.ctx.string_position; -// } -// } -// SreOpcode::ANY => { -// if drive.at_end() || drive.at_linebreak() { -// drive.failure(); -// } else { -// drive.skip_code(1); -// drive.skip_char(1); -// } -// } -// SreOpcode::ANY_ALL => { -// if drive.at_end() { -// drive.failure(); -// } else { -// drive.skip_code(1); -// drive.skip_char(1); -// } -// } -// SreOpcode::ASSERT => op_assert(drive), -// SreOpcode::ASSERT_NOT => op_assert_not(drive), -// SreOpcode::AT => { -// let atcode = SreAtCode::try_from(drive.peek_code(1)).unwrap(); -// if at(drive, atcode) { -// drive.skip_code(2); -// } else { -// drive.failure(); -// } -// } -// SreOpcode::BRANCH => op_branch(drive, stacks), -// SreOpcode::CATEGORY => { -// let catcode = SreCatCode::try_from(drive.peek_code(1)).unwrap(); -// if drive.at_end() || !category(catcode, drive.peek_char()) { -// drive.failure(); -// } else { -// drive.skip_code(2); -// drive.skip_char(1); -// } -// } -// SreOpcode::IN => general_op_in(drive, charset), -// SreOpcode::IN_IGNORE => general_op_in(drive, |set, c| charset(set, lower_ascii(c))), -// SreOpcode::IN_UNI_IGNORE => general_op_in(drive, |set, c| charset(set, lower_unicode(c))), -// SreOpcode::IN_LOC_IGNORE => general_op_in(drive, charset_loc_ignore), -// SreOpcode::INFO | SreOpcode::JUMP => drive.skip_code_from(1), -// SreOpcode::LITERAL => general_op_literal(drive, |code, c| code == c), -// SreOpcode::NOT_LITERAL => general_op_literal(drive, |code, c| code != c), -// SreOpcode::LITERAL_IGNORE => general_op_literal(drive, |code, c| code == lower_ascii(c)), -// SreOpcode::NOT_LITERAL_IGNORE => { -// general_op_literal(drive, |code, c| code != lower_ascii(c)) -// } -// SreOpcode::LITERAL_UNI_IGNORE => { -// general_op_literal(drive, |code, c| code == lower_unicode(c)) -// } -// SreOpcode::NOT_LITERAL_UNI_IGNORE => { -// general_op_literal(drive, |code, c| code != lower_unicode(c)) -// } -// SreOpcode::LITERAL_LOC_IGNORE => general_op_literal(drive, char_loc_ignore), -// SreOpcode::NOT_LITERAL_LOC_IGNORE => { -// general_op_literal(drive, |code, c| !char_loc_ignore(code, c)) -// } -// SreOpcode::MARK => { -// drive -// .state -// .set_mark(drive.peek_code(1) as usize, drive.ctx.string_position); -// drive.skip_code(2); -// } -// SreOpcode::MAX_UNTIL => op_max_until(drive, stacks), -// SreOpcode::MIN_UNTIL => op_min_until(drive, stacks), -// SreOpcode::REPEAT => op_repeat(drive, stacks), -// SreOpcode::REPEAT_ONE => op_repeat_one(drive, stacks), -// SreOpcode::MIN_REPEAT_ONE => op_min_repeat_one(drive, stacks), -// SreOpcode::GROUPREF => general_op_groupref(drive, |x| x), -// SreOpcode::GROUPREF_IGNORE => general_op_groupref(drive, lower_ascii), -// SreOpcode::GROUPREF_LOC_IGNORE => general_op_groupref(drive, lower_locate), -// SreOpcode::GROUPREF_UNI_IGNORE => general_op_groupref(drive, lower_unicode), -// SreOpcode::GROUPREF_EXISTS => { -// let (group_start, group_end) = drive.state.get_marks(drive.peek_code(1) as usize); -// match (group_start, group_end) { -// (Some(start), Some(end)) if start <= end => { -// drive.skip_code(3); -// } -// _ => drive.skip_code_from(2), -// } -// } -// _ => unreachable!("unexpected opcode"), -// } -// } - -// /* assert subpattern */ -// /* */ -// fn op_assert(drive: &mut StateContext) { -// let back = drive.peek_code(2) as usize; - -// if drive.ctx.string_position < back { -// return drive.failure(); -// } - -// let offset = drive -// .state -// .string -// .back_offset(drive.ctx.string_offset, back); -// let position = drive.ctx.string_position - back; - -// drive.state.string_position = position; - -// let next_ctx = drive.next_ctx(3, |drive, _| { -// if drive.popped_ctx().has_matched == Some(true) { -// drive.ctx.handler = None; -// drive.skip_code_from(1); -// } else { -// drive.failure(); -// } -// }); -// next_ctx.string_position = position; -// next_ctx.string_offset = offset; -// next_ctx.toplevel = false; -// } - -// /* assert not subpattern */ -// /* */ -// fn op_assert_not(drive: &mut StateContext) { -// let back = drive.peek_code(2) as usize; - -// if drive.ctx.string_position < back { -// return drive.skip_code_from(1); -// } - -// let offset = drive -// .state -// .string -// .back_offset(drive.ctx.string_offset, back); -// let position = drive.ctx.string_position - back; - -// drive.state.string_position = position; - -// let next_ctx = drive.next_ctx(3, |drive, _| { -// if drive.popped_ctx().has_matched == Some(true) { -// drive.failure(); -// } else { -// drive.ctx.handler = None; -// drive.skip_code_from(1); -// } -// }); -// next_ctx.string_position = position; -// next_ctx.string_offset = offset; -// next_ctx.toplevel = false; -// } - -// #[derive(Debug)] -// struct BranchContext { -// branch_offset: usize, -// } - -// // alternation -// // <0=skip> code ... -// fn op_branch(drive: &mut StateContext, stacks: &mut Stacks) { -// drive.state.marks_push(); -// stacks.branch.push(BranchContext { branch_offset: 1 }); -// create_context(drive, stacks); - -// fn create_context(drive: &mut StateContext, stacks: &mut Stacks) { -// let branch_offset = stacks.branch_last().branch_offset; -// let next_length = drive.peek_code(branch_offset) as usize; -// if next_length == 0 { -// drive.state.marks_pop_discard(); -// stacks.branch.pop(); -// return drive.failure(); -// } - -// drive.sync_string_position(); - -// stacks.branch_last().branch_offset += next_length; -// drive.next_ctx(branch_offset + 1, callback); -// } - -// fn callback(drive: &mut StateContext, stacks: &mut Stacks) { -// if drive.popped_ctx().has_matched == Some(true) { -// stacks.branch.pop(); -// return drive.success(); -// } -// drive.state.marks_pop_keep(); -// drive.ctx.handler = Some(create_context) -// } -// } - -// #[derive(Debug, Copy, Clone)] -// struct MinRepeatOneContext { -// count: usize, -// max_count: usize, -// } - -// /* <1=min> <2=max> item tail */ -// fn op_min_repeat_one(drive: &mut StateContext, stacks: &mut Stacks) { -// let min_count = drive.peek_code(2) as usize; -// let max_count = drive.peek_code(3) as usize; - -// if drive.remaining_chars() < min_count { -// return drive.failure(); -// } - -// drive.sync_string_position(); - -// let count = if min_count == 0 { -// 0 -// } else { -// let count = _count(drive, stacks, min_count); -// if count < min_count { -// return drive.failure(); -// } -// drive.skip_char(count); -// count -// }; - -// let next_code = drive.peek_code(drive.peek_code(1) as usize + 1); -// if next_code == SreOpcode::SUCCESS as u32 && drive.can_success() { -// // tail is empty. we're finished -// drive.sync_string_position(); -// return drive.success(); -// } - -// drive.state.marks_push(); -// stacks -// .min_repeat_one -// .push(MinRepeatOneContext { count, max_count }); -// create_context(drive, stacks); - -// fn create_context(drive: &mut StateContext, stacks: &mut Stacks) { -// let MinRepeatOneContext { count, max_count } = *stacks.min_repeat_one_last(); - -// if max_count == MAXREPEAT || count <= max_count { -// drive.sync_string_position(); -// drive.next_ctx_from(1, callback); -// } else { -// drive.state.marks_pop_discard(); -// stacks.min_repeat_one.pop(); -// drive.failure(); -// } -// } - -// fn callback(drive: &mut StateContext, stacks: &mut Stacks) { -// if drive.popped_ctx().has_matched == Some(true) { -// stacks.min_repeat_one.pop(); -// return drive.success(); -// } - -// drive.sync_string_position(); - -// if _count(drive, stacks, 1) == 0 { -// drive.state.marks_pop_discard(); -// stacks.min_repeat_one.pop(); -// return drive.failure(); -// } - -// drive.skip_char(1); -// stacks.min_repeat_one_last().count += 1; -// drive.state.marks_pop_keep(); -// create_context(drive, stacks); -// } -// } - -// #[derive(Debug, Copy, Clone)] -// struct RepeatOneContext { -// count: usize, -// min_count: usize, -// following_literal: Option, -// } - -// /* match repeated sequence (maximizing regexp) */ - -// /* this operator only works if the repeated item is -// exactly one character wide, and we're not already -// collecting backtracking points. for other cases, -// use the MAX_REPEAT operator */ - -// /* <1=min> <2=max> item tail */ -// fn op_repeat_one(drive: &mut StateContext, stacks: &mut Stacks) { -// let min_count = drive.peek_code(2) as usize; -// let max_count = drive.peek_code(3) as usize; - -// if drive.remaining_chars() < min_count { -// return drive.failure(); -// } - -// drive.sync_string_position(); - -// let count = _count(drive, stacks, max_count); -// drive.skip_char(count); -// if count < min_count { -// return drive.failure(); -// } - -// let next_code = drive.peek_code(drive.peek_code(1) as usize + 1); -// if next_code == SreOpcode::SUCCESS as u32 && drive.can_success() { -// // tail is empty. we're finished -// drive.sync_string_position(); -// return drive.success(); -// } - -// // Special case: Tail starts with a literal. Skip positions where -// // the rest of the pattern cannot possibly match. -// let following_literal = (next_code == SreOpcode::LITERAL as u32) -// .then(|| drive.peek_code(drive.peek_code(1) as usize + 2)); - -// drive.state.marks_push(); -// stacks.repeat_one.push(RepeatOneContext { -// count, -// min_count, -// following_literal, -// }); -// create_context(drive, stacks); - -// fn create_context(drive: &mut StateContext, stacks: &mut Stacks) { -// let RepeatOneContext { -// mut count, -// min_count, -// following_literal, -// } = *stacks.repeat_one_last(); - -// if let Some(c) = following_literal { -// while drive.at_end() || drive.peek_char() != c { -// if count <= min_count { -// drive.state.marks_pop_discard(); -// stacks.repeat_one.pop(); -// return drive.failure(); -// } -// drive.back_skip_char(1); -// count -= 1; -// } -// } -// stacks.repeat_one_last().count = count; - -// drive.sync_string_position(); - -// // General case: backtracking -// drive.next_ctx_from(1, callback); -// } - -// fn callback(drive: &mut StateContext, stacks: &mut Stacks) { -// if drive.popped_ctx().has_matched == Some(true) { -// stacks.repeat_one.pop(); -// return drive.success(); -// } - -// let RepeatOneContext { -// count, -// min_count, -// following_literal: _, -// } = stacks.repeat_one_last(); - -// if count <= min_count { -// drive.state.marks_pop_discard(); -// stacks.repeat_one.pop(); -// return drive.failure(); -// } - -// drive.back_skip_char(1); -// *count -= 1; - -// drive.state.marks_pop_keep(); -// create_context(drive, stacks); -// } -// } - -// #[derive(Debug, Clone, Copy)] -// struct RepeatContext { -// count: isize, -// min_count: usize, -// max_count: usize, -// code_position: usize, -// last_position: usize, -// prev_id: usize, -// } - -// /* create repeat context. all the hard work is done -// by the UNTIL operator (MAX_UNTIL, MIN_UNTIL) */ -// /* <1=min> <2=max> item tail */ -// fn op_repeat(drive: &mut StateContext, stacks: &mut Stacks) { -// let repeat_ctx = RepeatContext { -// count: -1, -// min_count: drive.peek_code(2) as usize, -// max_count: drive.peek_code(3) as usize, -// code_position: drive.ctx.code_position, -// last_position: std::usize::MAX, -// prev_id: drive.ctx.repeat_ctx_id, -// }; - -// stacks.repeat.push(repeat_ctx); - -// drive.sync_string_position(); - -// let next_ctx = drive.next_ctx_from(1, |drive, stacks| { -// drive.ctx.has_matched = drive.popped_ctx().has_matched; -// stacks.repeat.pop(); -// }); -// next_ctx.repeat_ctx_id = stacks.repeat.len() - 1; -// } - -// #[derive(Debug, Clone, Copy)] -// struct MinUntilContext { -// save_repeat_ctx_id: usize, -// } - -// /* minimizing repeat */ -// fn op_min_until(drive: &mut StateContext, stacks: &mut Stacks) { -// let repeat_ctx = stacks.repeat.last_mut().unwrap(); - -// drive.sync_string_position(); - -// repeat_ctx.count += 1; - -// if (repeat_ctx.count as usize) < repeat_ctx.min_count { -// // not enough matches -// drive.next_ctx_at(repeat_ctx.code_position + 4, |drive, stacks| { -// if drive.popped_ctx().has_matched == Some(true) { -// drive.success(); -// } else { -// stacks.repeat[drive.ctx.repeat_ctx_id].count -= 1; -// drive.sync_string_position(); -// drive.failure(); -// } -// }); -// return; -// } - -// drive.state.marks_push(); - -// stacks.min_until.push(MinUntilContext { -// save_repeat_ctx_id: drive.ctx.repeat_ctx_id, -// }); - -// // see if the tail matches -// let next_ctx = drive.next_ctx(1, |drive, stacks| { -// drive.ctx.repeat_ctx_id = stacks.min_until.pop().unwrap().save_repeat_ctx_id; - -// let repeat_ctx = &mut stacks.repeat[drive.ctx.repeat_ctx_id]; - -// if drive.popped_ctx().has_matched == Some(true) { -// return drive.success(); -// } - -// drive.sync_string_position(); - -// drive.state.marks_pop(); - -// // match more until tail matches - -// if repeat_ctx.count as usize >= repeat_ctx.max_count && repeat_ctx.max_count != MAXREPEAT -// || drive.state.string_position == repeat_ctx.last_position -// { -// repeat_ctx.count -= 1; -// return drive.failure(); -// } - -// /* zero-width match protection */ -// repeat_ctx.last_position = drive.state.string_position; - -// drive.next_ctx_at(repeat_ctx.code_position + 4, |drive, stacks| { -// if drive.popped_ctx().has_matched == Some(true) { -// drive.success(); -// } else { -// stacks.repeat[drive.ctx.repeat_ctx_id].count -= 1; -// drive.sync_string_position(); -// drive.failure(); -// } -// }); -// }); -// next_ctx.repeat_ctx_id = repeat_ctx.prev_id; -// } - -// #[derive(Debug, Clone, Copy)] -// struct MaxUntilContext { -// save_last_position: usize, -// } - -// /* maximizing repeat */ -// fn op_max_until(drive: &mut StateContext, stacks: &mut Stacks) { -// let repeat_ctx = &mut stacks.repeat[drive.ctx.repeat_ctx_id]; - -// drive.sync_string_position(); - -// repeat_ctx.count += 1; - -// if (repeat_ctx.count as usize) < repeat_ctx.min_count { -// // not enough matches -// drive.next_ctx_at(repeat_ctx.code_position + 4, |drive, stacks| { -// if drive.popped_ctx().has_matched == Some(true) { -// drive.success(); -// } else { -// stacks.repeat[drive.ctx.repeat_ctx_id].count -= 1; -// drive.sync_string_position(); -// drive.failure(); -// } -// }); -// return; -// } - -// stacks.max_until.push(MaxUntilContext { -// save_last_position: repeat_ctx.last_position, -// }); - -// if ((repeat_ctx.count as usize) < repeat_ctx.max_count || repeat_ctx.max_count == MAXREPEAT) -// && drive.state.string_position != repeat_ctx.last_position -// { -// /* we may have enough matches, but if we can -// match another item, do so */ -// repeat_ctx.last_position = drive.state.string_position; - -// drive.state.marks_push(); - -// drive.next_ctx_at(repeat_ctx.code_position + 4, |drive, stacks| { -// let save_last_position = stacks.max_until.pop().unwrap().save_last_position; -// let repeat_ctx = &mut stacks.repeat[drive.ctx.repeat_ctx_id]; -// repeat_ctx.last_position = save_last_position; - -// if drive.popped_ctx().has_matched == Some(true) { -// drive.state.marks_pop_discard(); -// return drive.success(); -// } - -// drive.state.marks_pop(); -// repeat_ctx.count -= 1; -// drive.sync_string_position(); - -// /* cannot match more repeated items here. make sure the -// tail matches */ -// let next_ctx = drive.next_ctx(1, tail_callback); -// next_ctx.repeat_ctx_id = repeat_ctx.prev_id; -// }); -// return; -// } - -// /* cannot match more repeated items here. make sure the -// tail matches */ -// let next_ctx = drive.next_ctx(1, tail_callback); -// next_ctx.repeat_ctx_id = repeat_ctx.prev_id; - -// fn tail_callback(drive: &mut StateContext, _stacks: &mut Stacks) { -// if drive.popped_ctx().has_matched == Some(true) { -// drive.success(); -// } else { -// drive.sync_string_position(); -// drive.failure(); -// } -// } -// } - -// #[derive(Debug, Default)] -// struct Stacks { -// } - -// impl Stacks { -// fn clear(&mut self) { -// self.branch.clear(); -// self.min_repeat_one.clear(); -// self.repeat_one.clear(); -// self.repeat.clear(); -// self.min_until.clear(); -// self.max_until.clear(); -// } - -// fn branch_last(&mut self) -> &mut BranchContext { -// self.branch.last_mut().unwrap() -// } -// fn min_repeat_one_last(&mut self) -> &mut MinRepeatOneContext { -// self.min_repeat_one.last_mut().unwrap() -// } -// fn repeat_one_last(&mut self) -> &mut RepeatOneContext { -// self.repeat_one.last_mut().unwrap() -// } -// } - -pub trait StrDrive { +pub trait StrDrive: Copy { fn offset(&self, offset: usize, skip: usize) -> usize; fn count(&self) -> usize; fn peek(&self, offset: usize) -> u32; @@ -855,7 +714,6 @@ pub trait StrDrive { fn back_offset(&self, offset: usize, skip: usize) -> usize; } - impl<'a> StrDrive for &'a str { fn offset(&self, offset: usize, skip: usize) -> usize { self.get(offset..) @@ -934,6 +792,7 @@ struct MatchContext<'a, S: StrDrive> { toplevel: bool, handler: Option, &mut Self)>, repeat_ctx_id: usize, + count: isize, } impl<'a, S: StrDrive> std::fmt::Debug for MatchContext<'a, S> { @@ -945,182 +804,188 @@ impl<'a, S: StrDrive> std::fmt::Debug for MatchContext<'a, S> { .field("has_matched", &self.has_matched) .field("toplevel", &self.toplevel) .field("handler", &self.handler.map(|x| x as usize)) + .field("count", &self.count) .finish() } } impl<'a, S: StrDrive> MatchContext<'a, S> { + fn pattern(&self, state: &State<'a, S>) -> &[u32] { + &state.pattern_codes[self.code_position..] + } + fn remaining_codes(&self, state: &State<'a, S>) -> usize { state.pattern_codes.len() - self.code_position } - + + fn remaining_chars(&self, state: &State<'a, S>) -> usize { + state.end - self.string_position + } + + fn peek_char(&self, state: &State<'a, S>) -> u32 { + state.string.peek(self.string_offset) + } + + fn skip_char(&mut self, state: &State<'a, S>, skip: usize) { + self.string_position += skip; + self.string_offset = state.string.offset(self.string_offset, skip); + } + + fn back_peek_char(&self, state: &State<'a, S>) -> u32 { + state.string.back_peek(self.string_offset) + } + + fn back_skip_char(&mut self, string: &S, skip: usize) { + self.string_position -= skip; + self.string_offset = string.back_offset(self.string_offset, skip); + } + fn peek_code(&self, state: &State<'a, S>, peek: usize) -> u32 { state.pattern_codes[self.code_position + peek] } + fn skip_code(&mut self, skip: usize) { + self.code_position += skip; + } + + fn skip_code_from(&mut self, state: &State<'a, S>, peek: usize) { + self.skip_code(self.peek_code(state, peek) as usize + 1); + } + + fn at_beginning(&self) -> bool { + // self.ctx().string_position == self.state().start + self.string_position == 0 + } + + fn at_end(&self, state: &State<'a, S>) -> bool { + self.string_position == state.end + } + + fn at_linebreak(&self, state: &State<'a, S>) -> bool { + !self.at_end(state) && is_linebreak(self.peek_char(state)) + } + + fn at_boundary bool>( + &self, + state: &State<'a, S>, + mut word_checker: F, + ) -> bool { + if self.at_beginning() && self.at_end(state) { + return false; + } + let that = !self.at_beginning() && word_checker(self.back_peek_char(state)); + let this = !self.at_end(state) && word_checker(self.peek_char(state)); + this != that + } + + fn at_non_boundary bool>( + &self, + state: &State<'a, S>, + mut word_checker: F, + ) -> bool { + if self.at_beginning() && self.at_end(state) { + return false; + } + let that = !self.at_beginning() && word_checker(self.back_peek_char(state)); + let this = !self.at_end(state) && word_checker(self.peek_char(state)); + this == that + } + + fn can_success(&self, state: &State<'a, S>) -> bool { + if !self.toplevel { + return true; + } + if state.match_all && !self.at_end(state) { + return false; + } + if state.must_advance && self.string_position == state.start { + return false; + } + true + } + + fn success(&mut self) { + self.has_matched = Some(true); + } + fn failure(&mut self) { self.has_matched = Some(false); } } -// trait ContextDrive<'a, T: StrDrive<'a>> { -// fn ctx(&self) -> &MatchContext; -// fn ctx_mut(&mut self) -> &mut MatchContext; -// fn state(&self) -> &State<'a, T>; - -// fn popped_ctx(&self) -> &MatchContext { -// self.state().popped_context.as_ref().unwrap() -// } - -// fn pattern(&self) -> &[u32] { -// &self.state().pattern_codes[self.ctx().code_position..] -// } - -// fn peek_char(&self) -> u32 { -// self.state().string.peek(self.ctx().string_offset) -// } -// fn peek_code(&self, peek: usize) -> u32 { -// self.state().pattern_codes[self.ctx().code_position + peek] -// } - -// fn back_peek_char(&self) -> u32 { -// self.state().string.back_peek(self.ctx().string_offset) -// } -// fn back_skip_char(&mut self, skip_count: usize) { -// self.ctx_mut().string_position -= skip_count; -// self.ctx_mut().string_offset = self -// .state() -// .string -// .back_offset(self.ctx().string_offset, skip_count); -// } - -// fn skip_char(&mut self, skip_count: usize) { -// self.ctx_mut().string_offset = self -// .state() -// .string -// .offset(self.ctx().string_offset, skip_count); -// self.ctx_mut().string_position += skip_count; -// } -// fn skip_code(&mut self, skip_count: usize) { -// self.ctx_mut().code_position += skip_count; -// } -// fn skip_code_from(&mut self, peek: usize) { -// self.skip_code(self.peek_code(peek) as usize + 1); -// } - -// fn remaining_chars(&self) -> usize { -// self.state().end - self.ctx().string_position -// } -// fn remaining_codes(&self) -> usize { -// self.state().pattern_codes.len() - self.ctx().code_position -// } - -// fn at_beginning(&self) -> bool { -// // self.ctx().string_position == self.state().start -// self.ctx().string_position == 0 -// } -// fn at_end(&self) -> bool { -// self.ctx().string_position == self.state().end -// } -// fn at_linebreak(&self) -> bool { -// !self.at_end() && is_linebreak(self.peek_char()) -// } -// fn at_boundary bool>(&self, mut word_checker: F) -> bool { -// if self.at_beginning() && self.at_end() { -// return false; -// } -// let that = !self.at_beginning() && word_checker(self.back_peek_char()); -// let this = !self.at_end() && word_checker(self.peek_char()); -// this != that -// } -// fn at_non_boundary bool>(&self, mut word_checker: F) -> bool { -// if self.at_beginning() && self.at_end() { -// return false; -// } -// let that = !self.at_beginning() && word_checker(self.back_peek_char()); -// let this = !self.at_end() && word_checker(self.peek_char()); -// this == that -// } - -// fn can_success(&self) -> bool { -// if !self.ctx().toplevel { -// return true; -// } -// if self.state().match_all && !self.at_end() { -// return false; -// } -// if self.state().must_advance && self.ctx().string_position == self.state().start { -// return false; -// } -// true -// } - -// fn success(&mut self) { -// self.ctx_mut().has_matched = Some(true); -// } - -// fn failure(&mut self) { -// self.ctx_mut().has_matched = Some(false); -// } -// } - -// struct StateContext<'a, S: StrDrive<'a>> { -// state: State<'a, S>, -// ctx: MatchContext, -// next_ctx: Option, -// } - -// impl<'a, S: StrDrive<'a>> ContextDrive<'a, S> for StateContext<'a, S> { -// fn ctx(&self) -> &MatchContext { -// &self.ctx -// } -// fn ctx_mut(&mut self) -> &mut MatchContext { -// &mut self.ctx -// } -// fn state(&self) -> &State<'a, S> { -// &self.state -// } -// } - -// impl StateContext<'_> { -// fn next_ctx_from(&mut self, peek: usize, handler: OpcodeHandler) -> &mut MatchContext { -// self.next_ctx(self.peek_code(peek) as usize + 1, handler) -// } -// fn next_ctx(&mut self, offset: usize, handler: OpcodeHandler) -> &mut MatchContext { -// self.next_ctx_at(self.ctx.code_position + offset, handler) -// } -// fn next_ctx_at(&mut self, code_position: usize, handler: OpcodeHandler) -> &mut MatchContext { -// self.next_ctx = Some(MatchContext { -// code_position, -// has_matched: None, -// handler: None, -// ..self.ctx -// }); -// self.ctx.handler = Some(handler); -// self.next_ctx.as_mut().unwrap() -// } - -// fn sync_string_position(&mut self) { -// self.state.string_position = self.ctx.string_position; -// } -// } - -// struct StateRefContext<'a> { -// entity: &'a StateContext<'a>, -// ctx: MatchContext, -// } - -// impl ContextDrive for StateRefContext<'_> { -// fn ctx(&self) -> &MatchContext { -// &self.ctx -// } -// fn ctx_mut(&mut self) -> &mut MatchContext { -// &mut self.ctx -// } -// fn state(&self) -> &State { -// &self.entity.state -// } -// } +fn at<'a, S: StrDrive>(state: &State<'a, S>, ctx: &MatchContext<'a, S>, atcode: SreAtCode) -> bool { + match atcode { + SreAtCode::BEGINNING | SreAtCode::BEGINNING_STRING => ctx.at_beginning(), + SreAtCode::BEGINNING_LINE => ctx.at_beginning() || is_linebreak(ctx.back_peek_char(state)), + SreAtCode::BOUNDARY => ctx.at_boundary(state, is_word), + SreAtCode::NON_BOUNDARY => ctx.at_non_boundary(state, is_word), + SreAtCode::END => { + (ctx.remaining_chars(state) == 1 && ctx.at_linebreak(state)) || ctx.at_end(state) + } + SreAtCode::END_LINE => ctx.at_linebreak(state) || ctx.at_end(state), + SreAtCode::END_STRING => ctx.at_end(state), + SreAtCode::LOC_BOUNDARY => ctx.at_boundary(state, is_loc_word), + SreAtCode::LOC_NON_BOUNDARY => ctx.at_non_boundary(state, is_loc_word), + SreAtCode::UNI_BOUNDARY => ctx.at_boundary(state, is_uni_word), + SreAtCode::UNI_NON_BOUNDARY => ctx.at_non_boundary(state, is_uni_word), + } +} + +fn general_op_literal<'a, S: StrDrive, F: FnOnce(u32, u32) -> bool>( + state: &State<'a, S>, + ctx: &mut MatchContext<'a, S>, + f: F, +) { + if ctx.at_end(state) || !f(ctx.peek_code(state, 1), ctx.peek_char(state)) { + ctx.failure(); + } else { + ctx.skip_code(2); + ctx.skip_char(state, 1); + } +} + +fn general_op_in<'a, S: StrDrive, F: FnOnce(&[u32], u32) -> bool>( + state: &State<'a, S>, + ctx: &mut MatchContext<'a, S>, + f: F, +) { + if ctx.at_end(state) || !f(&ctx.pattern(state)[2..], ctx.peek_char(state)) { + ctx.failure(); + } else { + ctx.skip_code_from(state, 1); + ctx.skip_char(state, 1); + } +} + +fn general_op_groupref<'a, S: StrDrive, F: FnMut(u32) -> u32>( + state: &State<'a, S>, + ctx: &mut MatchContext<'a, S>, + mut f: F, +) { + let (group_start, group_end) = state.get_marks(ctx.peek_code(state, 1) as usize); + let (group_start, group_end) = match (group_start, group_end) { + (Some(start), Some(end)) if start <= end => (start, end), + _ => { + return ctx.failure(); + } + }; + + let mut gctx = MatchContext { + string_position: group_start, + string_offset: state.string.offset(0, group_start), + ..*ctx + }; + + for _ in group_start..group_end { + if ctx.at_end(state) || f(ctx.peek_char(state)) != f(gctx.peek_char(state)) { + return ctx.failure(); + } + ctx.skip_char(state, 1); + gctx.skip_char(state, 1); + } + + ctx.skip_code(2); +} fn char_loc_ignore(code: u32, c: u32) -> bool { code == c || code == lower_locate(c) || code == upper_locate(c) @@ -1135,78 +1000,6 @@ fn charset_loc_ignore(set: &[u32], c: u32) -> bool { up != lo && charset(set, up) } -// fn general_op_groupref u32>(drive: &mut StateContext, mut f: F) { -// let (group_start, group_end) = drive.state.get_marks(drive.peek_code(1) as usize); -// let (group_start, group_end) = match (group_start, group_end) { -// (Some(start), Some(end)) if start <= end => (start, end), -// _ => { -// return drive.failure(); -// } -// }; - -// let mut wdrive = StateRefContext { -// entity: drive, -// ctx: drive.ctx, -// }; -// let mut gdrive = StateRefContext { -// entity: drive, -// ctx: MatchContext { -// string_position: group_start, -// // TODO: cache the offset -// string_offset: drive.state.string.offset(0, group_start), -// ..drive.ctx -// }, -// }; - -// for _ in group_start..group_end { -// if wdrive.at_end() || f(wdrive.peek_char()) != f(gdrive.peek_char()) { -// return drive.failure(); -// } -// wdrive.skip_char(1); -// gdrive.skip_char(1); -// } - -// let position = wdrive.ctx.string_position; -// let offset = wdrive.ctx.string_offset; -// drive.skip_code(2); -// drive.ctx.string_position = position; -// drive.ctx.string_offset = offset; -// } - -// fn general_op_literal bool>(drive: &mut StateContext, f: F) { -// if drive.at_end() || !f(drive.peek_code(1), drive.peek_char()) { -// drive.failure(); -// } else { -// drive.skip_code(2); -// drive.skip_char(1); -// } -// } - -// fn general_op_in bool>(drive: &mut StateContext, f: F) { -// if drive.at_end() || !f(&drive.pattern()[2..], drive.peek_char()) { -// drive.failure(); -// } else { -// drive.skip_code_from(1); -// drive.skip_char(1); -// } -// } - -// fn at(drive: &StateContext, atcode: SreAtCode) -> bool { -// match atcode { -// SreAtCode::BEGINNING | SreAtCode::BEGINNING_STRING => drive.at_beginning(), -// SreAtCode::BEGINNING_LINE => drive.at_beginning() || is_linebreak(drive.back_peek_char()), -// SreAtCode::BOUNDARY => drive.at_boundary(is_word), -// SreAtCode::NON_BOUNDARY => drive.at_non_boundary(is_word), -// SreAtCode::END => (drive.remaining_chars() == 1 && drive.at_linebreak()) || drive.at_end(), -// SreAtCode::END_LINE => drive.at_linebreak() || drive.at_end(), -// SreAtCode::END_STRING => drive.at_end(), -// SreAtCode::LOC_BOUNDARY => drive.at_boundary(is_loc_word), -// SreAtCode::LOC_NON_BOUNDARY => drive.at_non_boundary(is_loc_word), -// SreAtCode::UNI_BOUNDARY => drive.at_boundary(is_uni_word), -// SreAtCode::UNI_NON_BOUNDARY => drive.at_non_boundary(is_uni_word), -// } -// } - fn category(catcode: SreCatCode, c: u32) -> bool { match catcode { SreCatCode::DIGIT => is_digit(c), @@ -1323,95 +1116,100 @@ fn charset(set: &[u32], ch: u32) -> bool { false } -// /* General case */ -// fn general_count(drive: &mut StateContext, stacks: &mut Stacks, max_count: usize) -> usize { -// let mut count = 0; -// let max_count = std::cmp::min(max_count, drive.remaining_chars()); - -// let save_ctx = drive.ctx; -// drive.skip_code(4); -// let reset_position = drive.ctx.code_position; - -// while count < max_count { -// drive.ctx.code_position = reset_position; -// let code = drive.peek_code(0); -// let code = SreOpcode::try_from(code).unwrap(); -// dispatch(code, drive, stacks); -// if drive.ctx.has_matched == Some(false) { -// break; -// } -// count += 1; -// } -// drive.ctx = save_ctx; -// count -// } - -// fn _count(drive: &mut StateContext, stacks: &mut Stacks, max_count: usize) -> usize { -// let save_ctx = drive.ctx; -// let max_count = std::cmp::min(max_count, drive.remaining_chars()); -// let end = drive.ctx.string_position + max_count; -// let opcode = SreOpcode::try_from(drive.peek_code(0)).unwrap(); - -// match opcode { -// SreOpcode::ANY => { -// while !drive.ctx.string_position < end && !drive.at_linebreak() { -// drive.skip_char(1); -// } -// } -// SreOpcode::ANY_ALL => { -// drive.skip_char(max_count); -// } -// SreOpcode::IN => { -// while !drive.ctx.string_position < end -// && charset(&drive.pattern()[2..], drive.peek_char()) -// { -// drive.skip_char(1); -// } -// } -// SreOpcode::LITERAL => { -// general_count_literal(drive, end, |code, c| code == c as u32); -// } -// SreOpcode::NOT_LITERAL => { -// general_count_literal(drive, end, |code, c| code != c as u32); -// } -// SreOpcode::LITERAL_IGNORE => { -// general_count_literal(drive, end, |code, c| code == lower_ascii(c) as u32); -// } -// SreOpcode::NOT_LITERAL_IGNORE => { -// general_count_literal(drive, end, |code, c| code != lower_ascii(c) as u32); -// } -// SreOpcode::LITERAL_LOC_IGNORE => { -// general_count_literal(drive, end, char_loc_ignore); -// } -// SreOpcode::NOT_LITERAL_LOC_IGNORE => { -// general_count_literal(drive, end, |code, c| !char_loc_ignore(code, c)); -// } -// SreOpcode::LITERAL_UNI_IGNORE => { -// general_count_literal(drive, end, |code, c| code == lower_unicode(c) as u32); -// } -// SreOpcode::NOT_LITERAL_UNI_IGNORE => { -// general_count_literal(drive, end, |code, c| code != lower_unicode(c) as u32); -// } -// _ => { -// return general_count(drive, stacks, max_count); -// } -// } - -// let count = drive.ctx.string_position - drive.state.string_position; -// drive.ctx = save_ctx; -// count -// } - -// fn general_count_literal bool>( -// drive: &mut StateContext, -// end: usize, -// mut f: F, -// ) { -// let ch = drive.peek_code(1); -// while !drive.ctx.string_position < end && f(ch, drive.peek_char()) { -// drive.skip_char(1); -// } -// } +fn _count<'a, S: StrDrive>( + state: &mut State<'a, S>, + ctx: &MatchContext<'a, S>, + max_count: usize, +) -> usize { + let mut ctx = *ctx; + let max_count = std::cmp::min(max_count, ctx.remaining_chars(state)); + let end = ctx.string_position + max_count; + let opcode = SreOpcode::try_from(ctx.peek_code(state, 0)).unwrap(); + + match opcode { + SreOpcode::ANY => { + while !ctx.string_position < end && !ctx.at_linebreak(state) { + ctx.skip_char(state, 1); + } + } + SreOpcode::ANY_ALL => { + ctx.skip_char(state, max_count); + } + SreOpcode::IN => { + while !ctx.string_position < end + && charset(&ctx.pattern(state)[2..], ctx.peek_char(state)) + { + ctx.skip_char(state, 1); + } + } + SreOpcode::LITERAL => { + general_count_literal(state, &mut ctx, end, |code, c| code == c as u32); + } + SreOpcode::NOT_LITERAL => { + general_count_literal(state, &mut ctx, end, |code, c| code != c as u32); + } + SreOpcode::LITERAL_IGNORE => { + general_count_literal(state, &mut ctx, end, |code, c| { + code == lower_ascii(c) as u32 + }); + } + SreOpcode::NOT_LITERAL_IGNORE => { + general_count_literal(state, &mut ctx, end, |code, c| { + code != lower_ascii(c) as u32 + }); + } + SreOpcode::LITERAL_LOC_IGNORE => { + general_count_literal(state, &mut ctx, end, char_loc_ignore); + } + SreOpcode::NOT_LITERAL_LOC_IGNORE => { + general_count_literal(state, &mut ctx, end, |code, c| !char_loc_ignore(code, c)); + } + SreOpcode::LITERAL_UNI_IGNORE => { + general_count_literal(state, &mut ctx, end, |code, c| { + code == lower_unicode(c) as u32 + }); + } + SreOpcode::NOT_LITERAL_UNI_IGNORE => { + general_count_literal(state, &mut ctx, end, |code, c| { + code != lower_unicode(c) as u32 + }); + } + _ => { + /* General case */ + let mut count = 0; + + ctx.skip_code(4); + let reset_position = ctx.code_position; + + while count < max_count { + ctx.code_position = reset_position; + let code = ctx.peek_code(state, 0); + let code = SreOpcode::try_from(code).unwrap(); + dispatch(state, &mut ctx, code); + if ctx.has_matched == Some(false) { + break; + } + count += 1; + } + return count; + } + } + + // TODO: return offset + ctx.string_position - state.string_position +} + +fn general_count_literal<'a, S: StrDrive, F: FnMut(u32, u32) -> bool>( + state: &State<'a, S>, + ctx: &mut MatchContext<'a, S>, + end: usize, + mut f: F, +) { + let ch = ctx.peek_code(state, 1); + while !ctx.string_position < end && f(ch, ctx.peek_char(state)) { + ctx.skip_char(state, 1); + } +} fn is_word(ch: u32) -> bool { ch == '_' as u32 diff --git a/tests/tests.rs b/tests/tests.rs index e8ae487029..cc5c4d1f38 100644 --- a/tests/tests.rs +++ b/tests/tests.rs @@ -7,12 +7,12 @@ struct Pattern { } impl Pattern { - fn state<'a>( + fn state<'a, S: engine::StrDrive>( &self, - string: impl Into>, + string: S, range: std::ops::Range, - ) -> engine::State<'a> { - engine::State::new(string.into(), range.start, range.end, self.flags, self.code) + ) -> engine::State<'a, S> { + engine::State::new(string, range.start, range.end, self.flags, self.code) } } @@ -23,7 +23,7 @@ fn test_2427() { #[rustfmt::skip] let lookbehind = Pattern { code: &[15, 4, 0, 1, 1, 5, 5, 1, 17, 46, 1, 17, 120, 6, 10, 1], flags: SreFlag::from_bits_truncate(32) }; // END GENERATED let mut state = lookbehind.state("x", 0..usize::MAX); - state = state.pymatch(); + state.pymatch(); assert!(state.has_matched); } @@ -34,7 +34,7 @@ fn test_assert() { #[rustfmt::skip] let positive_lookbehind = Pattern { code: &[15, 4, 0, 3, 3, 4, 9, 3, 17, 97, 17, 98, 17, 99, 1, 17, 100, 17, 101, 17, 102, 1], flags: SreFlag::from_bits_truncate(32) }; // END GENERATED let mut state = positive_lookbehind.state("abcdef", 0..usize::MAX); - state = state.search(); + state.search(); assert!(state.has_matched); } @@ -45,7 +45,7 @@ fn test_string_boundaries() { #[rustfmt::skip] let big_b = Pattern { code: &[15, 4, 0, 0, 0, 6, 11, 1], flags: SreFlag::from_bits_truncate(32) }; // END GENERATED let mut state = big_b.state("", 0..usize::MAX); - state = state.search(); + state.search(); assert!(!state.has_matched); } @@ -57,7 +57,7 @@ fn test_zerowidth() { // END GENERATED let mut state = p.state("a:", 0..usize::MAX); state.must_advance = true; - state = state.search(); + state.search(); assert!(state.string_position == 1); } @@ -68,7 +68,7 @@ fn test_repeat_context_panic() { #[rustfmt::skip] let p = Pattern { code: &[15, 4, 0, 0, 4294967295, 24, 25, 0, 4294967295, 27, 6, 0, 4294967295, 17, 97, 1, 24, 11, 0, 1, 18, 0, 17, 120, 17, 120, 18, 1, 20, 17, 122, 19, 1], flags: SreFlag::from_bits_truncate(32) }; // END GENERATED let mut state = p.state("axxzaz", 0..usize::MAX); - state = state.pymatch(); + state.pymatch(); assert!(state.marks == vec![Some(1), Some(3)]); } @@ -79,6 +79,6 @@ fn test_double_max_until() { #[rustfmt::skip] let p = Pattern { code: &[15, 4, 0, 0, 4294967295, 24, 18, 0, 4294967295, 18, 0, 24, 9, 0, 1, 18, 2, 17, 49, 18, 3, 19, 18, 1, 19, 1], flags: SreFlag::from_bits_truncate(32) }; // END GENERATED let mut state = p.state("1111", 0..usize::MAX); - state = state.pymatch(); + state.pymatch(); assert!(state.string_position == 4); } From 982d8f53f2eea5eb80d37a0d22c807aebd694445 Mon Sep 17 00:00:00 2001 From: Kangzhi Shi Date: Tue, 2 Aug 2022 21:10:31 +0200 Subject: [PATCH 056/960] fix next_ctx --- src/engine.rs | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/src/engine.rs b/src/engine.rs index b0717e1671..64043cee53 100644 --- a/src/engine.rs +++ b/src/engine.rs @@ -35,14 +35,16 @@ macro_rules! next_ctx { (from $peek:expr, $state:expr, $ctx:expr, $handler:expr) => { next_ctx!(position $ctx.peek_code($state, $peek) as usize + 1, $state, $ctx, $handler) }; - (position $position:expr, $state:expr, $ctx:expr, $handler:expr) => { - {$state.next_context.insert(MatchContext { + (position $position:expr, $state:expr, $ctx:expr, $handler:expr) => {{ + $ctx.handler = Some($handler); + $state.next_context.insert(MatchContext { code_position: $position, has_matched: None, - handler: Some($handler), + handler: None, + count: -1, ..*$ctx - })} - }; + }) + }}; } macro_rules! mark { @@ -678,7 +680,7 @@ fn op_max_until<'a, S: StrDrive>(state: &mut State<'a, S>, ctx: &mut MatchContex return ctx.success(); } - mark!(pop, state); + mark!(pop, state); repeat_ctx.count -= 1; state.string_position = ctx.string_position; From ccae898885496a1390a10fa6fca4cf394445dd35 Mon Sep 17 00:00:00 2001 From: Kangzhi Shi Date: Tue, 2 Aug 2022 21:24:11 +0200 Subject: [PATCH 057/960] fix next_ctx bug --- src/engine.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/engine.rs b/src/engine.rs index 64043cee53..4acc172bc6 100644 --- a/src/engine.rs +++ b/src/engine.rs @@ -33,7 +33,7 @@ macro_rules! next_ctx { next_ctx!(position $ctx.code_position + $offset, $state, $ctx, $handler) }; (from $peek:expr, $state:expr, $ctx:expr, $handler:expr) => { - next_ctx!(position $ctx.peek_code($state, $peek) as usize + 1, $state, $ctx, $handler) + next_ctx!(offset $ctx.peek_code($state, $peek) as usize + 1, $state, $ctx, $handler) }; (position $position:expr, $state:expr, $ctx:expr, $handler:expr) => {{ $ctx.handler = Some($handler); From f3b30443aab82095ed2ce786482309e659f9c107 Mon Sep 17 00:00:00 2001 From: Kangzhi Shi Date: Fri, 5 Aug 2022 21:05:10 +0200 Subject: [PATCH 058/960] fix lifetime --- src/engine.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/engine.rs b/src/engine.rs index 4acc172bc6..810e011c95 100644 --- a/src/engine.rs +++ b/src/engine.rs @@ -716,7 +716,7 @@ pub trait StrDrive: Copy { fn back_offset(&self, offset: usize, skip: usize) -> usize; } -impl<'a> StrDrive for &'a str { +impl StrDrive for &str { fn offset(&self, offset: usize, skip: usize) -> usize { self.get(offset..) .and_then(|s| s.char_indices().nth(skip).map(|x| x.0 + offset)) From ca20b5951d7092cf0ec25198f01d3620a69e61e2 Mon Sep 17 00:00:00 2001 From: Kangzhi Shi Date: Fri, 5 Aug 2022 21:08:14 +0200 Subject: [PATCH 059/960] update version to 0.3.0 --- Cargo.toml | 2 +- src/engine.rs | 5 ----- 2 files changed, 1 insertion(+), 6 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 00123d92c5..98b632a4dc 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "sre-engine" -version = "0.2.1" +version = "0.3.0" authors = ["Kangzhi Shi ", "RustPython Team"] description = "A low-level implementation of Python's SRE regex engine" repository = "https://github.com/RustPython/sre-engine" diff --git a/src/engine.rs b/src/engine.rs index 810e011c95..1302667d1d 100644 --- a/src/engine.rs +++ b/src/engine.rs @@ -253,8 +253,6 @@ fn dispatch<'a, S: StrDrive>( ctx.skip_char(state, 1); } } - /* assert subpattern */ - /* */ SreOpcode::ASSERT => op_assert(state, ctx), SreOpcode::ASSERT_NOT => op_assert_not(state, ctx), SreOpcode::AT => { @@ -334,7 +332,6 @@ fn op_assert<'a, S: StrDrive>(state: &mut State<'a, S>, ctx: &mut MatchContext<' return ctx.failure(); } - // let next_ctx = state.next_ctx(ctx, 3, |state, ctx| { let next_ctx = next_ctx!(offset 3, state, ctx, |state, ctx| { if state.popped_has_matched { ctx.skip_code_from(state, 1); @@ -371,7 +368,6 @@ fn op_assert_not<'a, S: StrDrive>(state: &mut State<'a, S>, ctx: &mut MatchConte // alternation // <0=skip> code ... fn op_branch<'a, S: StrDrive>(state: &mut State<'a, S>, ctx: &mut MatchContext<'a, S>) { - // state.marks_push(); mark!(push, state); ctx.count = 1; @@ -403,7 +399,6 @@ fn op_branch<'a, S: StrDrive>(state: &mut State<'a, S>, ctx: &mut MatchContext<' /* <1=min> <2=max> item tail */ fn op_min_repeat_one<'a, S: StrDrive>(state: &mut State<'a, S>, ctx: &mut MatchContext<'a, S>) { let min_count = ctx.peek_code(state, 2) as usize; - // let max_count = ctx.peek_code(state, 3) as usize; if ctx.remaining_chars(state) < min_count { return ctx.failure(); From a48f5b07c5671b690e262b935b81f0a7a832b566 Mon Sep 17 00:00:00 2001 From: Kangzhi Shi Date: Fri, 5 Aug 2022 21:49:46 +0200 Subject: [PATCH 060/960] impl op_info --- src/engine.rs | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/src/engine.rs b/src/engine.rs index 1302667d1d..fcade829f2 100644 --- a/src/engine.rs +++ b/src/engine.rs @@ -279,7 +279,15 @@ fn dispatch<'a, S: StrDrive>( general_op_in(state, ctx, |set, c| charset(set, lower_unicode(c))) } SreOpcode::IN_LOC_IGNORE => general_op_in(state, ctx, charset_loc_ignore), - SreOpcode::INFO | SreOpcode::JUMP => ctx.skip_code_from(state, 1), + SreOpcode::INFO => { + let min = ctx.peek_code(state, 3) as usize; + if ctx.remaining_chars(state) < min { + ctx.failure(); + } else { + ctx.skip_code_from(state, 1); + } + } + SreOpcode::JUMP => ctx.skip_code_from(state, 1), SreOpcode::LITERAL => general_op_literal(state, ctx, |code, c| code == c), SreOpcode::NOT_LITERAL => general_op_literal(state, ctx, |code, c| code != c), SreOpcode::LITERAL_IGNORE => { From 8b1fcea7ec27aa22f698d485ee203dbe2a552334 Mon Sep 17 00:00:00 2001 From: Kangzhi Shi Date: Sun, 7 Aug 2022 08:10:52 +0200 Subject: [PATCH 061/960] update version to 0.3.1 --- Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index 98b632a4dc..4b403f2861 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "sre-engine" -version = "0.3.0" +version = "0.3.1" authors = ["Kangzhi Shi ", "RustPython Team"] description = "A low-level implementation of Python's SRE regex engine" repository = "https://github.com/RustPython/sre-engine" From c31462d51b7d3adbf8f121403c4e8b305a9dab6f Mon Sep 17 00:00:00 2001 From: Kangzhi Shi Date: Tue, 9 Aug 2022 16:29:51 +0200 Subject: [PATCH 062/960] refactor split State with Request --- src/engine.rs | 669 +++++++++++++++++++++++++++++++------------------- 1 file changed, 420 insertions(+), 249 deletions(-) diff --git a/src/engine.rs b/src/engine.rs index fcade829f2..5e1f1457ec 100644 --- a/src/engine.rs +++ b/src/engine.rs @@ -1,6 +1,6 @@ // good luck to those that follow; here be dragons -use super::constants::{SreAtCode, SreCatCode, SreFlag, SreOpcode}; +use super::constants::{SreAtCode, SreCatCode, SreInfo, SreOpcode}; use super::MAXREPEAT; use std::convert::TryFrom; @@ -8,26 +8,37 @@ const fn is_py_ascii_whitespace(b: u8) -> bool { matches!(b, b'\t' | b'\n' | b'\x0C' | b'\r' | b' ' | b'\x0B') } -#[derive(Debug)] -pub struct State<'a, S: StrDrive> { +pub struct Request<'a, S: StrDrive> { pub string: S, pub start: usize, pub end: usize, - _flags: SreFlag, - pattern_codes: &'a [u32], - pub marks: Vec>, - pub lastindex: isize, - marks_stack: Vec<(Vec>, isize)>, - context_stack: Vec>, - repeat_stack: Vec, - pub string_position: usize, - next_context: Option>, - popped_has_matched: bool, - pub has_matched: bool, + pub pattern_codes: &'a [u32], pub match_all: bool, pub must_advance: bool, } +impl<'a, S: StrDrive> Request<'a, S> { + pub fn new( + string: S, + start: usize, + end: usize, + pattern_codes: &'a [u32], + match_all: bool, + ) -> Self { + let end = std::cmp::min(end, string.count()); + let start = std::cmp::min(start, end); + + Self { + string, + start, + end, + pattern_codes, + match_all, + must_advance: false, + } + } +} + macro_rules! next_ctx { (offset $offset:expr, $state:expr, $ctx:expr, $handler:expr) => { next_ctx!(position $ctx.code_position + $offset, $state, $ctx, $handler) @@ -60,43 +71,41 @@ macro_rules! mark { }; } +#[derive(Debug)] +pub struct State<'a, S: StrDrive> { + pub marks: Vec>, + pub lastindex: isize, + marks_stack: Vec<(Vec>, isize)>, + context_stack: Vec>, + repeat_stack: Vec, + pub string_position: usize, + next_context: Option>, + popped_has_matched: bool, + has_matched: bool, +} + impl<'a, S: StrDrive> State<'a, S> { - pub fn new( - string: S, - start: usize, - end: usize, - flags: SreFlag, - pattern_codes: &'a [u32], - ) -> Self { - let end = std::cmp::min(end, string.count()); - let start = std::cmp::min(start, end); + pub fn new(string_position: usize) -> Self { Self { - string, - start, - end, - _flags: flags, - pattern_codes, marks: Vec::new(), lastindex: -1, marks_stack: Vec::new(), context_stack: Vec::new(), repeat_stack: Vec::new(), - string_position: start, + string_position, next_context: None, popped_has_matched: false, has_matched: false, - match_all: false, - must_advance: false, } } - pub fn reset(&mut self) { + pub fn reset(&mut self, string_position: usize) { self.lastindex = -1; self.marks.clear(); self.marks_stack.clear(); self.context_stack.clear(); self.repeat_stack.clear(); - self.string_position = self.start; + self.string_position = string_position; self.next_context = None; self.popped_has_matched = false; self.has_matched = false; @@ -136,14 +145,14 @@ impl<'a, S: StrDrive> State<'a, S> { self.marks_stack.pop(); } - fn _match(&mut self) { + fn _match(&mut self, req: &mut Request<'a, S>) { while let Some(mut ctx) = self.context_stack.pop() { if let Some(handler) = ctx.handler.take() { - handler(self, &mut ctx); - } else if ctx.remaining_codes(self) > 0 { - let code = ctx.peek_code(self, 0); + handler(req, self, &mut ctx); + } else if ctx.remaining_codes(req) > 0 { + let code = ctx.peek_code(req, 0); let code = SreOpcode::try_from(code).unwrap(); - dispatch(self, &mut ctx, code); + dispatch(req, self, &mut ctx, code); } else { ctx.failure(); } @@ -160,10 +169,10 @@ impl<'a, S: StrDrive> State<'a, S> { self.has_matched = self.popped_has_matched; } - pub fn pymatch(&mut self) { + pub fn pymatch(&mut self, req: &mut Request<'a, S>) { let ctx = MatchContext { - string_position: self.start, - string_offset: self.string.offset(0, self.start), + string_position: req.start, + string_offset: req.string.offset(0, req.start), code_position: 0, has_matched: None, toplevel: true, @@ -173,20 +182,22 @@ impl<'a, S: StrDrive> State<'a, S> { }; self.context_stack.push(ctx); - self._match(); + self._match(req); } - pub fn search(&mut self) { + pub fn search(&mut self, req: &mut Request<'a, S>) { // TODO: optimize by op info and skip prefix - - if self.start > self.end { + if req.start > req.end { return; } - let mut start_offset = self.string.offset(0, self.start); + // let start = self.start; + // let end = self.end; - let ctx = MatchContext { - string_position: self.start, + let mut start_offset = req.string.offset(0, req.start); + + let mut ctx = MatchContext { + string_position: req.start, string_offset: start_offset, code_position: 0, has_matched: None, @@ -195,17 +206,26 @@ impl<'a, S: StrDrive> State<'a, S> { repeat_ctx_id: usize::MAX, count: -1, }; + + // if ctx.peek_code(self, 0) == SreOpcode::INFO as u32 { + // search_op_info(self, &mut ctx); + // if let Some(has_matched) = ctx.has_matched { + // self.has_matched = has_matched; + // return; + // } + // } + self.context_stack.push(ctx); - self._match(); + self._match(req); - self.must_advance = false; - while !self.has_matched && self.start < self.end { - self.start += 1; - start_offset = self.string.offset(start_offset, 1); - self.reset(); + req.must_advance = false; + while !self.has_matched && req.start < req.end { + req.start += 1; + start_offset = req.string.offset(start_offset, 1); + self.reset(req.start); let ctx = MatchContext { - string_position: self.start, + string_position: req.start, string_offset: start_offset, code_position: 0, has_matched: None, @@ -215,12 +235,13 @@ impl<'a, S: StrDrive> State<'a, S> { count: -1, }; self.context_stack.push(ctx); - self._match(); + self._match(req); } } } fn dispatch<'a, S: StrDrive>( + req: &Request<'a, S>, state: &mut State<'a, S>, ctx: &mut MatchContext<'a, S>, opcode: SreOpcode, @@ -230,7 +251,7 @@ fn dispatch<'a, S: StrDrive>( ctx.failure(); } SreOpcode::SUCCESS => { - if ctx.can_success(state) { + if ctx.can_success(req) { state.string_position = ctx.string_position; ctx.success(); } else { @@ -238,152 +259,224 @@ fn dispatch<'a, S: StrDrive>( } } SreOpcode::ANY => { - if ctx.at_end(state) || ctx.at_linebreak(state) { + if ctx.at_end(req) || ctx.at_linebreak(req) { ctx.failure(); } else { ctx.skip_code(1); - ctx.skip_char(state, 1); + ctx.skip_char(req, 1); } } SreOpcode::ANY_ALL => { - if ctx.at_end(state) { + if ctx.at_end(req) { ctx.failure(); } else { ctx.skip_code(1); - ctx.skip_char(state, 1); + ctx.skip_char(req, 1); } } - SreOpcode::ASSERT => op_assert(state, ctx), - SreOpcode::ASSERT_NOT => op_assert_not(state, ctx), + SreOpcode::ASSERT => op_assert(req, state, ctx), + SreOpcode::ASSERT_NOT => op_assert_not(req, state, ctx), SreOpcode::AT => { - let atcode = SreAtCode::try_from(ctx.peek_code(state, 1)).unwrap(); - if at(state, ctx, atcode) { + let atcode = SreAtCode::try_from(ctx.peek_code(req, 1)).unwrap(); + if at(req, ctx, atcode) { ctx.skip_code(2); } else { ctx.failure(); } } - SreOpcode::BRANCH => op_branch(state, ctx), + SreOpcode::BRANCH => op_branch(req, state, ctx), SreOpcode::CATEGORY => { - let catcode = SreCatCode::try_from(ctx.peek_code(state, 1)).unwrap(); - if ctx.at_end(state) || !category(catcode, ctx.peek_char(state)) { + let catcode = SreCatCode::try_from(ctx.peek_code(req, 1)).unwrap(); + if ctx.at_end(req) || !category(catcode, ctx.peek_char(req)) { ctx.failure(); } else { ctx.skip_code(2); - ctx.skip_char(state, 1); + ctx.skip_char(req, 1); } } - SreOpcode::IN => general_op_in(state, ctx, charset), - SreOpcode::IN_IGNORE => general_op_in(state, ctx, |set, c| charset(set, lower_ascii(c))), + SreOpcode::IN => general_op_in(req, ctx, charset), + SreOpcode::IN_IGNORE => general_op_in(req, ctx, |set, c| charset(set, lower_ascii(c))), SreOpcode::IN_UNI_IGNORE => { - general_op_in(state, ctx, |set, c| charset(set, lower_unicode(c))) + general_op_in(req, ctx, |set, c| charset(set, lower_unicode(c))) } - SreOpcode::IN_LOC_IGNORE => general_op_in(state, ctx, charset_loc_ignore), + SreOpcode::IN_LOC_IGNORE => general_op_in(req, ctx, charset_loc_ignore), SreOpcode::INFO => { - let min = ctx.peek_code(state, 3) as usize; - if ctx.remaining_chars(state) < min { + let min = ctx.peek_code(req, 3) as usize; + if ctx.remaining_chars(req) < min { ctx.failure(); } else { - ctx.skip_code_from(state, 1); + ctx.skip_code_from(req, 1); } } - SreOpcode::JUMP => ctx.skip_code_from(state, 1), - SreOpcode::LITERAL => general_op_literal(state, ctx, |code, c| code == c), - SreOpcode::NOT_LITERAL => general_op_literal(state, ctx, |code, c| code != c), - SreOpcode::LITERAL_IGNORE => { - general_op_literal(state, ctx, |code, c| code == lower_ascii(c)) - } + SreOpcode::JUMP => ctx.skip_code_from(req, 1), + SreOpcode::LITERAL => general_op_literal(req, ctx, |code, c| code == c), + SreOpcode::NOT_LITERAL => general_op_literal(req, ctx, |code, c| code != c), + SreOpcode::LITERAL_IGNORE => general_op_literal(req, ctx, |code, c| code == lower_ascii(c)), SreOpcode::NOT_LITERAL_IGNORE => { - general_op_literal(state, ctx, |code, c| code != lower_ascii(c)) + general_op_literal(req, ctx, |code, c| code != lower_ascii(c)) } SreOpcode::LITERAL_UNI_IGNORE => { - general_op_literal(state, ctx, |code, c| code == lower_unicode(c)) + general_op_literal(req, ctx, |code, c| code == lower_unicode(c)) } SreOpcode::NOT_LITERAL_UNI_IGNORE => { - general_op_literal(state, ctx, |code, c| code != lower_unicode(c)) + general_op_literal(req, ctx, |code, c| code != lower_unicode(c)) } - SreOpcode::LITERAL_LOC_IGNORE => general_op_literal(state, ctx, char_loc_ignore), + SreOpcode::LITERAL_LOC_IGNORE => general_op_literal(req, ctx, char_loc_ignore), SreOpcode::NOT_LITERAL_LOC_IGNORE => { - general_op_literal(state, ctx, |code, c| !char_loc_ignore(code, c)) + general_op_literal(req, ctx, |code, c| !char_loc_ignore(code, c)) } SreOpcode::MARK => { - state.set_mark(ctx.peek_code(state, 1) as usize, ctx.string_position); + state.set_mark(ctx.peek_code(req, 1) as usize, ctx.string_position); ctx.skip_code(2); } SreOpcode::MAX_UNTIL => op_max_until(state, ctx), SreOpcode::MIN_UNTIL => op_min_until(state, ctx), - SreOpcode::REPEAT => op_repeat(state, ctx), - SreOpcode::REPEAT_ONE => op_repeat_one(state, ctx), - SreOpcode::MIN_REPEAT_ONE => op_min_repeat_one(state, ctx), - SreOpcode::GROUPREF => general_op_groupref(state, ctx, |x| x), - SreOpcode::GROUPREF_IGNORE => general_op_groupref(state, ctx, lower_ascii), - SreOpcode::GROUPREF_LOC_IGNORE => general_op_groupref(state, ctx, lower_locate), - SreOpcode::GROUPREF_UNI_IGNORE => general_op_groupref(state, ctx, lower_unicode), + SreOpcode::REPEAT => op_repeat(req, state, ctx), + SreOpcode::REPEAT_ONE => op_repeat_one(req, state, ctx), + SreOpcode::MIN_REPEAT_ONE => op_min_repeat_one(req, state, ctx), + SreOpcode::GROUPREF => general_op_groupref(req, state, ctx, |x| x), + SreOpcode::GROUPREF_IGNORE => general_op_groupref(req, state, ctx, lower_ascii), + SreOpcode::GROUPREF_LOC_IGNORE => general_op_groupref(req, state, ctx, lower_locate), + SreOpcode::GROUPREF_UNI_IGNORE => general_op_groupref(req, state, ctx, lower_unicode), SreOpcode::GROUPREF_EXISTS => { - let (group_start, group_end) = state.get_marks(ctx.peek_code(state, 1) as usize); + let (group_start, group_end) = state.get_marks(ctx.peek_code(req, 1) as usize); match (group_start, group_end) { (Some(start), Some(end)) if start <= end => { ctx.skip_code(3); } - _ => ctx.skip_code_from(state, 2), + _ => ctx.skip_code_from(req, 2), } } _ => unreachable!("unexpected opcode"), } } +/* optimization info block */ +/* <1=skip> <2=flags> <3=min> <4=max> <5=prefix info> */ +// fn search_op_info<'a, S: StrDrive>(state: &mut State<'a, S>, ctx: &mut MatchContext<'a, S>) { +// let min = ctx.peek_code(state, 3) as usize; + +// if ctx.remaining_chars(state) < min { +// return ctx.failure(); +// } + +// if min > 1 { +// /* adjust end point (but make sure we leave at least one +// character in there, so literal search will work) */ +// // no overflow can happen as remaining chars >= min +// state.end -= min - 1; + +// // adjust ctx position +// if state.end < ctx.string_position { +// ctx.string_position = state.end; +// ctx.string_offset = state.string.offset(0, ctx.string_position); +// } +// } + +// let flags = SreInfo::from_bits_truncate(ctx.peek_code(state, 2)); + +// if flags.contains(SreInfo::PREFIX) { +// /* pattern starts with a known prefix */ +// /* */ +// let len = ctx.peek_code(state, 5) as usize; +// let skip = ctx.peek_code(state, 6) as usize; +// let prefix = &ctx.pattern(state)[7..]; +// let overlap = &prefix[len - 1..]; + +// ctx.skip_code_from(state, 1); + +// if len == 1 { +// // pattern starts with a literal character +// let c = prefix[0]; +// let end = state.end; + +// while (!ctx.at_end(state)) { +// // find the next matched literal +// while (ctx.peek_char(state) != c) { +// ctx.skip_char(state, 1); +// if (ctx.at_end(state)) { +// return ctx.failure(); +// } +// } + +// // literal only +// if flags.contains(SreInfo::LITERAL) { +// return ctx.success(); +// } +// } +// } +// } +// } + /* assert subpattern */ /* */ -fn op_assert<'a, S: StrDrive>(state: &mut State<'a, S>, ctx: &mut MatchContext<'a, S>) { - let back = ctx.peek_code(state, 2) as usize; +fn op_assert<'a, S: StrDrive>( + req: &Request<'a, S>, + state: &mut State<'a, S>, + ctx: &mut MatchContext<'a, S>, +) { + let back = ctx.peek_code(req, 2) as usize; if ctx.string_position < back { return ctx.failure(); } - let next_ctx = next_ctx!(offset 3, state, ctx, |state, ctx| { + // let next_ctx = next_ctx!(offset 3, state, ctx, |req, state, ctx| { + let next_ctx = ctx.next_offset(3, state, |req, state, ctx| { if state.popped_has_matched { - ctx.skip_code_from(state, 1); + ctx.skip_code_from(req, 1); } else { ctx.failure(); } }); - next_ctx.back_skip_char(&state.string, back); - state.string_position = next_ctx.string_position; next_ctx.toplevel = false; + next_ctx.back_skip_char(req, back); + state.string_position = next_ctx.string_position; } /* assert not subpattern */ /* */ -fn op_assert_not<'a, S: StrDrive>(state: &mut State<'a, S>, ctx: &mut MatchContext<'a, S>) { - let back = ctx.peek_code(state, 2) as usize; +fn op_assert_not<'a, S: StrDrive>( + req: &Request<'a, S>, + state: &mut State<'a, S>, + ctx: &mut MatchContext<'a, S>, +) { + let back = ctx.peek_code(req, 2) as usize; if ctx.string_position < back { - return ctx.skip_code_from(state, 1); + return ctx.skip_code_from(req, 1); } - let next_ctx = next_ctx!(offset 3, state, ctx, |state, ctx| { + let next_ctx = next_ctx!(offset 3, state, ctx, |req, state, ctx| { if state.popped_has_matched { ctx.failure(); } else { - ctx.skip_code_from(state, 1); + ctx.skip_code_from(req, 1); } }); - next_ctx.back_skip_char(&state.string, back); - state.string_position = next_ctx.string_position; next_ctx.toplevel = false; + next_ctx.back_skip_char(req, back); + state.string_position = next_ctx.string_position; } // alternation // <0=skip> code ... -fn op_branch<'a, S: StrDrive>(state: &mut State<'a, S>, ctx: &mut MatchContext<'a, S>) { +fn op_branch<'a, S: StrDrive>( + req: &Request<'a, S>, + state: &mut State<'a, S>, + ctx: &mut MatchContext<'a, S>, +) { mark!(push, state); ctx.count = 1; - create_context(state, ctx); + create_context(req, state, ctx); - fn create_context<'a, S: StrDrive>(state: &mut State<'a, S>, ctx: &mut MatchContext<'a, S>) { + fn create_context<'a, S: StrDrive>( + req: &Request<'a, S>, + state: &mut State<'a, S>, + ctx: &mut MatchContext<'a, S>, + ) { let branch_offset = ctx.count as usize; - let next_length = ctx.peek_code(state, branch_offset) as isize; + let next_length = ctx.peek_code(req, branch_offset) as isize; if next_length == 0 { state.marks_pop_discard(); return ctx.failure(); @@ -395,20 +488,28 @@ fn op_branch<'a, S: StrDrive>(state: &mut State<'a, S>, ctx: &mut MatchContext<' next_ctx!(offset branch_offset + 1, state, ctx, callback); } - fn callback<'a, S: StrDrive>(state: &mut State<'a, S>, ctx: &mut MatchContext<'a, S>) { + fn callback<'a, S: StrDrive>( + req: &Request<'a, S>, + state: &mut State<'a, S>, + ctx: &mut MatchContext<'a, S>, + ) { if state.popped_has_matched { return ctx.success(); } state.marks_pop_keep(); - create_context(state, ctx); + create_context(req, state, ctx); } } /* <1=min> <2=max> item tail */ -fn op_min_repeat_one<'a, S: StrDrive>(state: &mut State<'a, S>, ctx: &mut MatchContext<'a, S>) { - let min_count = ctx.peek_code(state, 2) as usize; +fn op_min_repeat_one<'a, S: StrDrive>( + req: &Request<'a, S>, + state: &mut State<'a, S>, + ctx: &mut MatchContext<'a, S>, +) { + let min_count = ctx.peek_code(req, 2) as usize; - if ctx.remaining_chars(state) < min_count { + if ctx.remaining_chars(req) < min_count { return ctx.failure(); } @@ -417,52 +518,61 @@ fn op_min_repeat_one<'a, S: StrDrive>(state: &mut State<'a, S>, ctx: &mut MatchC ctx.count = if min_count == 0 { 0 } else { - let count = _count(state, ctx, min_count); + let count = _count(req, state, ctx, min_count); if count < min_count { return ctx.failure(); } - ctx.skip_char(state, count); + ctx.skip_char(req, count); count as isize }; - let next_code = ctx.peek_code(state, ctx.peek_code(state, 1) as usize + 1); - if next_code == SreOpcode::SUCCESS as u32 && ctx.can_success(state) { + let next_code = ctx.peek_code(req, ctx.peek_code(req, 1) as usize + 1); + if next_code == SreOpcode::SUCCESS as u32 && ctx.can_success(req) { // tail is empty. we're finished state.string_position = ctx.string_position; return ctx.success(); } mark!(push, state); - create_context(state, ctx); + create_context(req, state, ctx); - fn create_context<'a, S: StrDrive>(state: &mut State<'a, S>, ctx: &mut MatchContext<'a, S>) { - let max_count = ctx.peek_code(state, 3) as usize; + fn create_context<'a, S: StrDrive>( + req: &Request<'a, S>, + state: &mut State<'a, S>, + ctx: &mut MatchContext<'a, S>, + ) { + let max_count = ctx.peek_code(req, 3) as usize; if max_count == MAXREPEAT || ctx.count as usize <= max_count { state.string_position = ctx.string_position; - next_ctx!(from 1, state, ctx, callback); + // next_ctx!(from 1, state, ctx, callback); + ctx.next_from(1, req, state, callback); } else { state.marks_pop_discard(); ctx.failure(); } } - fn callback<'a, S: StrDrive>(state: &mut State<'a, S>, ctx: &mut MatchContext<'a, S>) { + fn callback<'a, S: StrDrive>( + req: &Request<'a, S>, + state: &mut State<'a, S>, + ctx: &mut MatchContext<'a, S>, + ) { if state.popped_has_matched { return ctx.success(); } state.string_position = ctx.string_position; - if _count(state, ctx, 1) == 0 { + if _count(req, state, ctx, 1) == 0 { state.marks_pop_discard(); return ctx.failure(); } - ctx.skip_char(state, 1); + ctx.skip_char(req, 1); ctx.count += 1; state.marks_pop_keep(); - create_context(state, ctx); + create_context(req, state, ctx); } } @@ -472,24 +582,28 @@ exactly one character wide, and we're not already collecting backtracking points. for other cases, use the MAX_REPEAT operator */ /* <1=min> <2=max> item tail */ -fn op_repeat_one<'a, S: StrDrive>(state: &mut State<'a, S>, ctx: &mut MatchContext<'a, S>) { - let min_count = ctx.peek_code(state, 2) as usize; - let max_count = ctx.peek_code(state, 3) as usize; +fn op_repeat_one<'a, S: StrDrive>( + req: &Request<'a, S>, + state: &mut State<'a, S>, + ctx: &mut MatchContext<'a, S>, +) { + let min_count = ctx.peek_code(req, 2) as usize; + let max_count = ctx.peek_code(req, 3) as usize; - if ctx.remaining_chars(state) < min_count { + if ctx.remaining_chars(req) < min_count { return ctx.failure(); } state.string_position = ctx.string_position; - let count = _count(state, ctx, max_count); - ctx.skip_char(state, count); + let count = _count(req, state, ctx, max_count); + ctx.skip_char(req, count); if count < min_count { return ctx.failure(); } - let next_code = ctx.peek_code(state, ctx.peek_code(state, 1) as usize + 1); - if next_code == SreOpcode::SUCCESS as u32 && ctx.can_success(state) { + let next_code = ctx.peek_code(req, ctx.peek_code(req, 1) as usize + 1); + if next_code == SreOpcode::SUCCESS as u32 && ctx.can_success(req) { // tail is empty. we're finished state.string_position = ctx.string_position; return ctx.success(); @@ -497,21 +611,25 @@ fn op_repeat_one<'a, S: StrDrive>(state: &mut State<'a, S>, ctx: &mut MatchConte mark!(push, state); ctx.count = count as isize; - create_context(state, ctx); - - fn create_context<'a, S: StrDrive>(state: &mut State<'a, S>, ctx: &mut MatchContext<'a, S>) { - let min_count = ctx.peek_code(state, 2) as isize; - let next_code = ctx.peek_code(state, ctx.peek_code(state, 1) as usize + 1); + create_context(req, state, ctx); + + fn create_context<'a, S: StrDrive>( + req: &Request<'a, S>, + state: &mut State<'a, S>, + ctx: &mut MatchContext<'a, S>, + ) { + let min_count = ctx.peek_code(req, 2) as isize; + let next_code = ctx.peek_code(req, ctx.peek_code(req, 1) as usize + 1); if next_code == SreOpcode::LITERAL as u32 { // Special case: Tail starts with a literal. Skip positions where // the rest of the pattern cannot possibly match. - let c = ctx.peek_code(state, ctx.peek_code(state, 1) as usize + 2); - while ctx.at_end(state) || ctx.peek_char(state) != c { + let c = ctx.peek_code(req, ctx.peek_code(req, 1) as usize + 2); + while ctx.at_end(req) || ctx.peek_char(req) != c { if ctx.count <= min_count { state.marks_pop_discard(); return ctx.failure(); } - ctx.back_skip_char(&state.string, 1); + ctx.back_skip_char(req, 1); ctx.count -= 1; } } @@ -519,26 +637,31 @@ fn op_repeat_one<'a, S: StrDrive>(state: &mut State<'a, S>, ctx: &mut MatchConte state.string_position = ctx.string_position; // General case: backtracking - next_ctx!(from 1, state, ctx, callback); + // next_ctx!(from 1, state, ctx, callback); + ctx.next_from(1, req, state, callback); } - fn callback<'a, S: StrDrive>(state: &mut State<'a, S>, ctx: &mut MatchContext<'a, S>) { + fn callback<'a, S: StrDrive>( + req: &Request<'a, S>, + state: &mut State<'a, S>, + ctx: &mut MatchContext<'a, S>, + ) { if state.popped_has_matched { return ctx.success(); } - let min_count = ctx.peek_code(state, 2) as isize; + let min_count = ctx.peek_code(req, 2) as isize; if ctx.count <= min_count { state.marks_pop_discard(); return ctx.failure(); } - ctx.back_skip_char(&state.string, 1); + ctx.back_skip_char(req, 1); ctx.count -= 1; state.marks_pop_keep(); - create_context(state, ctx); + create_context(req, state, ctx); } } @@ -555,11 +678,15 @@ struct RepeatContext { /* create repeat context. all the hard work is done by the UNTIL operator (MAX_UNTIL, MIN_UNTIL) */ /* <1=min> <2=max> item tail */ -fn op_repeat<'a, S: StrDrive>(state: &mut State<'a, S>, ctx: &mut MatchContext<'a, S>) { +fn op_repeat<'a, S: StrDrive>( + req: &Request<'a, S>, + state: &mut State<'a, S>, + ctx: &mut MatchContext<'a, S>, +) { let repeat_ctx = RepeatContext { count: -1, - min_count: ctx.peek_code(state, 2) as usize, - max_count: ctx.peek_code(state, 3) as usize, + min_count: ctx.peek_code(req, 2) as usize, + max_count: ctx.peek_code(req, 3) as usize, code_position: ctx.code_position, last_position: std::usize::MAX, prev_id: ctx.repeat_ctx_id, @@ -569,11 +696,14 @@ fn op_repeat<'a, S: StrDrive>(state: &mut State<'a, S>, ctx: &mut MatchContext<' state.string_position = ctx.string_position; - let next_ctx = next_ctx!(from 1, state, ctx, |state, ctx| { + let repeat_ctx_id = state.repeat_stack.len(); + + // let next_ctx = next_ctx!(from 1, state, ctx, |state, ctx| { + let next_ctx = ctx.next_from(1, req, state, |req, state, ctx| { ctx.has_matched = Some(state.popped_has_matched); state.repeat_stack.pop(); }); - next_ctx.repeat_ctx_id = state.repeat_stack.len() - 1; + next_ctx.repeat_ctx_id = repeat_ctx_id; } /* minimizing repeat */ @@ -586,7 +716,8 @@ fn op_min_until<'a, S: StrDrive>(state: &mut State<'a, S>, ctx: &mut MatchContex if (repeat_ctx.count as usize) < repeat_ctx.min_count { // not enough matches - next_ctx!(position repeat_ctx.code_position + 4, state, ctx, |state, ctx| { + // next_ctx!(position repeat_ctx.code_position + 4, state, ctx, |state, ctx| { + ctx.next_at(repeat_ctx.code_position + 4, state, |req, state, ctx| { if state.popped_has_matched { ctx.success(); } else { @@ -602,8 +733,11 @@ fn op_min_until<'a, S: StrDrive>(state: &mut State<'a, S>, ctx: &mut MatchContex ctx.count = ctx.repeat_ctx_id as isize; + let repeat_ctx_prev_id = repeat_ctx.prev_id; + // see if the tail matches - let next_ctx = next_ctx!(offset 1, state, ctx, |state, ctx| { + // let next_ctx = next_ctx!(offset 1, state, ctx, |state, ctx| { + let next_ctx = ctx.next_offset(1, state, |req, state, ctx| { if state.popped_has_matched { return ctx.success(); } @@ -628,7 +762,8 @@ fn op_min_until<'a, S: StrDrive>(state: &mut State<'a, S>, ctx: &mut MatchContex /* zero-width match protection */ repeat_ctx.last_position = state.string_position; - next_ctx!(position repeat_ctx.code_position + 4, state, ctx, |state, ctx| { + // next_ctx!(position repeat_ctx.code_position + 4, state, ctx, |state, ctx| { + ctx.next_at(repeat_ctx.code_position + 4, state, |req, state, ctx| { if state.popped_has_matched { ctx.success(); } else { @@ -638,7 +773,7 @@ fn op_min_until<'a, S: StrDrive>(state: &mut State<'a, S>, ctx: &mut MatchContex } }); }); - next_ctx.repeat_ctx_id = repeat_ctx.prev_id; + next_ctx.repeat_ctx_id = repeat_ctx_prev_id; } /* maximizing repeat */ @@ -651,7 +786,8 @@ fn op_max_until<'a, S: StrDrive>(state: &mut State<'a, S>, ctx: &mut MatchContex if (repeat_ctx.count as usize) < repeat_ctx.min_count { // not enough matches - next_ctx!(position repeat_ctx.code_position + 4, state, ctx, |state, ctx| { + // next_ctx!(position repeat_ctx.code_position + 4, state, ctx, |state, ctx| { + ctx.next_at(repeat_ctx.code_position + 4, state, |req, state, ctx| { if state.popped_has_matched { ctx.success(); } else { @@ -673,7 +809,7 @@ fn op_max_until<'a, S: StrDrive>(state: &mut State<'a, S>, ctx: &mut MatchContex ctx.count = repeat_ctx.last_position as isize; repeat_ctx.last_position = state.string_position; - next_ctx!(position repeat_ctx.code_position + 4, state, ctx, |state, ctx| { + ctx.next_at(repeat_ctx.code_position + 4, state, |req, state, ctx| { let save_last_position = ctx.count as usize; let repeat_ctx = &mut state.repeat_stack[ctx.repeat_ctx_id]; repeat_ctx.last_position = save_last_position; @@ -701,7 +837,11 @@ fn op_max_until<'a, S: StrDrive>(state: &mut State<'a, S>, ctx: &mut MatchContex let next_ctx = next_ctx!(offset 1, state, ctx, tail_callback); next_ctx.repeat_ctx_id = repeat_ctx.prev_id; - fn tail_callback<'a, S: StrDrive>(state: &mut State<'a, S>, ctx: &mut MatchContext<'a, S>) { + fn tail_callback<'a, S: StrDrive>( + req: &Request<'a, S>, + state: &mut State<'a, S>, + ctx: &mut MatchContext<'a, S>, + ) { if state.popped_has_matched { ctx.success(); } else { @@ -786,8 +926,6 @@ impl<'a> StrDrive for &'a [u8] { } } -// type OpcodeHandler = for<'a>fn(&mut StateContext<'a, S>, &mut Stacks); - #[derive(Clone, Copy)] struct MatchContext<'a, S: StrDrive> { string_position: usize, @@ -795,7 +933,7 @@ struct MatchContext<'a, S: StrDrive> { code_position: usize, has_matched: Option, toplevel: bool, - handler: Option, &mut Self)>, + handler: Option, &mut State<'a, S>, &mut Self)>, repeat_ctx_id: usize, count: isize, } @@ -809,52 +947,53 @@ impl<'a, S: StrDrive> std::fmt::Debug for MatchContext<'a, S> { .field("has_matched", &self.has_matched) .field("toplevel", &self.toplevel) .field("handler", &self.handler.map(|x| x as usize)) + .field("repeat_ctx_id", &self.repeat_ctx_id) .field("count", &self.count) .finish() } } impl<'a, S: StrDrive> MatchContext<'a, S> { - fn pattern(&self, state: &State<'a, S>) -> &[u32] { - &state.pattern_codes[self.code_position..] + fn pattern(&self, req: &Request<'a, S>) -> &'a [u32] { + &req.pattern_codes[self.code_position..] } - fn remaining_codes(&self, state: &State<'a, S>) -> usize { - state.pattern_codes.len() - self.code_position + fn remaining_codes(&self, req: &Request<'a, S>) -> usize { + req.pattern_codes.len() - self.code_position } - fn remaining_chars(&self, state: &State<'a, S>) -> usize { - state.end - self.string_position + fn remaining_chars(&self, req: &Request<'a, S>) -> usize { + req.end - self.string_position } - fn peek_char(&self, state: &State<'a, S>) -> u32 { - state.string.peek(self.string_offset) + fn peek_char(&self, req: &Request<'a, S>) -> u32 { + req.string.peek(self.string_offset) } - fn skip_char(&mut self, state: &State<'a, S>, skip: usize) { + fn skip_char(&mut self, req: &Request<'a, S>, skip: usize) { self.string_position += skip; - self.string_offset = state.string.offset(self.string_offset, skip); + self.string_offset = req.string.offset(self.string_offset, skip); } - fn back_peek_char(&self, state: &State<'a, S>) -> u32 { - state.string.back_peek(self.string_offset) + fn back_peek_char(&self, req: &Request<'a, S>) -> u32 { + req.string.back_peek(self.string_offset) } - fn back_skip_char(&mut self, string: &S, skip: usize) { + fn back_skip_char(&mut self, req: &Request<'a, S>, skip: usize) { self.string_position -= skip; - self.string_offset = string.back_offset(self.string_offset, skip); + self.string_offset = req.string.back_offset(self.string_offset, skip); } - fn peek_code(&self, state: &State<'a, S>, peek: usize) -> u32 { - state.pattern_codes[self.code_position + peek] + fn peek_code(&self, req: &Request<'a, S>, peek: usize) -> u32 { + req.pattern_codes[self.code_position + peek] } fn skip_code(&mut self, skip: usize) { self.code_position += skip; } - fn skip_code_from(&mut self, state: &State<'a, S>, peek: usize) { - self.skip_code(self.peek_code(state, peek) as usize + 1); + fn skip_code_from(&mut self, req: &Request<'a, S>, peek: usize) { + self.skip_code(self.peek_code(req, peek) as usize + 1); } fn at_beginning(&self) -> bool { @@ -862,48 +1001,48 @@ impl<'a, S: StrDrive> MatchContext<'a, S> { self.string_position == 0 } - fn at_end(&self, state: &State<'a, S>) -> bool { - self.string_position == state.end + fn at_end(&self, req: &Request<'a, S>) -> bool { + self.string_position == req.end } - fn at_linebreak(&self, state: &State<'a, S>) -> bool { - !self.at_end(state) && is_linebreak(self.peek_char(state)) + fn at_linebreak(&self, req: &Request<'a, S>) -> bool { + !self.at_end(req) && is_linebreak(self.peek_char(req)) } fn at_boundary bool>( &self, - state: &State<'a, S>, + req: &Request<'a, S>, mut word_checker: F, ) -> bool { - if self.at_beginning() && self.at_end(state) { + if self.at_beginning() && self.at_end(req) { return false; } - let that = !self.at_beginning() && word_checker(self.back_peek_char(state)); - let this = !self.at_end(state) && word_checker(self.peek_char(state)); + let that = !self.at_beginning() && word_checker(self.back_peek_char(req)); + let this = !self.at_end(req) && word_checker(self.peek_char(req)); this != that } fn at_non_boundary bool>( &self, - state: &State<'a, S>, + req: &Request<'a, S>, mut word_checker: F, ) -> bool { - if self.at_beginning() && self.at_end(state) { + if self.at_beginning() && self.at_end(req) { return false; } - let that = !self.at_beginning() && word_checker(self.back_peek_char(state)); - let this = !self.at_end(state) && word_checker(self.peek_char(state)); + let that = !self.at_beginning() && word_checker(self.back_peek_char(req)); + let this = !self.at_end(req) && word_checker(self.peek_char(req)); this == that } - fn can_success(&self, state: &State<'a, S>) -> bool { + fn can_success(&self, req: &Request<'a, S>) -> bool { if !self.toplevel { return true; } - if state.match_all && !self.at_end(state) { + if req.match_all && !self.at_end(req) { return false; } - if state.must_advance && self.string_position == state.start { + if req.must_advance && self.string_position == req.start { return false; } true @@ -916,58 +1055,94 @@ impl<'a, S: StrDrive> MatchContext<'a, S> { fn failure(&mut self) { self.has_matched = Some(false); } + + fn next_from<'b>( + &mut self, + peek: usize, + req: &Request<'a, S>, + state: &'b mut State<'a, S>, + f: fn(&Request<'a, S>, &mut State<'a, S>, &mut Self), + ) -> &'b mut Self { + self.next_offset(self.peek_code(req, peek) as usize + 1, state, f) + } + + fn next_offset<'b>( + &mut self, + offset: usize, + state: &'b mut State<'a, S>, + f: fn(&Request<'a, S>, &mut State<'a, S>, &mut Self), + ) -> &'b mut Self { + self.next_at(self.code_position + offset, state, f) + } + + fn next_at<'b>( + &mut self, + code_position: usize, + state: &'b mut State<'a, S>, + f: fn(&Request<'a, S>, &mut State<'a, S>, &mut Self), + ) -> &'b mut Self { + self.handler = Some(f); + state.next_context.insert(MatchContext { + code_position, + has_matched: None, + handler: None, + count: -1, + ..*self + }) + } } -fn at<'a, S: StrDrive>(state: &State<'a, S>, ctx: &MatchContext<'a, S>, atcode: SreAtCode) -> bool { +fn at<'a, S: StrDrive>(req: &Request<'a, S>, ctx: &MatchContext<'a, S>, atcode: SreAtCode) -> bool { match atcode { SreAtCode::BEGINNING | SreAtCode::BEGINNING_STRING => ctx.at_beginning(), - SreAtCode::BEGINNING_LINE => ctx.at_beginning() || is_linebreak(ctx.back_peek_char(state)), - SreAtCode::BOUNDARY => ctx.at_boundary(state, is_word), - SreAtCode::NON_BOUNDARY => ctx.at_non_boundary(state, is_word), + SreAtCode::BEGINNING_LINE => ctx.at_beginning() || is_linebreak(ctx.back_peek_char(req)), + SreAtCode::BOUNDARY => ctx.at_boundary(req, is_word), + SreAtCode::NON_BOUNDARY => ctx.at_non_boundary(req, is_word), SreAtCode::END => { - (ctx.remaining_chars(state) == 1 && ctx.at_linebreak(state)) || ctx.at_end(state) + (ctx.remaining_chars(req) == 1 && ctx.at_linebreak(req)) || ctx.at_end(req) } - SreAtCode::END_LINE => ctx.at_linebreak(state) || ctx.at_end(state), - SreAtCode::END_STRING => ctx.at_end(state), - SreAtCode::LOC_BOUNDARY => ctx.at_boundary(state, is_loc_word), - SreAtCode::LOC_NON_BOUNDARY => ctx.at_non_boundary(state, is_loc_word), - SreAtCode::UNI_BOUNDARY => ctx.at_boundary(state, is_uni_word), - SreAtCode::UNI_NON_BOUNDARY => ctx.at_non_boundary(state, is_uni_word), + SreAtCode::END_LINE => ctx.at_linebreak(req) || ctx.at_end(req), + SreAtCode::END_STRING => ctx.at_end(req), + SreAtCode::LOC_BOUNDARY => ctx.at_boundary(req, is_loc_word), + SreAtCode::LOC_NON_BOUNDARY => ctx.at_non_boundary(req, is_loc_word), + SreAtCode::UNI_BOUNDARY => ctx.at_boundary(req, is_uni_word), + SreAtCode::UNI_NON_BOUNDARY => ctx.at_non_boundary(req, is_uni_word), } } fn general_op_literal<'a, S: StrDrive, F: FnOnce(u32, u32) -> bool>( - state: &State<'a, S>, + req: &Request<'a, S>, ctx: &mut MatchContext<'a, S>, f: F, ) { - if ctx.at_end(state) || !f(ctx.peek_code(state, 1), ctx.peek_char(state)) { + if ctx.at_end(req) || !f(ctx.peek_code(req, 1), ctx.peek_char(req)) { ctx.failure(); } else { ctx.skip_code(2); - ctx.skip_char(state, 1); + ctx.skip_char(req, 1); } } fn general_op_in<'a, S: StrDrive, F: FnOnce(&[u32], u32) -> bool>( - state: &State<'a, S>, + req: &Request<'a, S>, ctx: &mut MatchContext<'a, S>, f: F, ) { - if ctx.at_end(state) || !f(&ctx.pattern(state)[2..], ctx.peek_char(state)) { + if ctx.at_end(req) || !f(&ctx.pattern(req)[2..], ctx.peek_char(req)) { ctx.failure(); } else { - ctx.skip_code_from(state, 1); - ctx.skip_char(state, 1); + ctx.skip_code_from(req, 1); + ctx.skip_char(req, 1); } } fn general_op_groupref<'a, S: StrDrive, F: FnMut(u32) -> u32>( + req: &Request<'a, S>, state: &State<'a, S>, ctx: &mut MatchContext<'a, S>, mut f: F, ) { - let (group_start, group_end) = state.get_marks(ctx.peek_code(state, 1) as usize); + let (group_start, group_end) = state.get_marks(ctx.peek_code(req, 1) as usize); let (group_start, group_end) = match (group_start, group_end) { (Some(start), Some(end)) if start <= end => (start, end), _ => { @@ -977,16 +1152,16 @@ fn general_op_groupref<'a, S: StrDrive, F: FnMut(u32) -> u32>( let mut gctx = MatchContext { string_position: group_start, - string_offset: state.string.offset(0, group_start), + string_offset: req.string.offset(0, group_start), ..*ctx }; for _ in group_start..group_end { - if ctx.at_end(state) || f(ctx.peek_char(state)) != f(gctx.peek_char(state)) { + if ctx.at_end(req) || f(ctx.peek_char(req)) != f(gctx.peek_char(req)) { return ctx.failure(); } - ctx.skip_char(state, 1); - gctx.skip_char(state, 1); + ctx.skip_char(req, 1); + gctx.skip_char(req, 1); } ctx.skip_code(2); @@ -1122,60 +1297,56 @@ fn charset(set: &[u32], ch: u32) -> bool { } fn _count<'a, S: StrDrive>( + req: &Request<'a, S>, state: &mut State<'a, S>, ctx: &MatchContext<'a, S>, max_count: usize, ) -> usize { let mut ctx = *ctx; - let max_count = std::cmp::min(max_count, ctx.remaining_chars(state)); + let max_count = std::cmp::min(max_count, ctx.remaining_chars(req)); let end = ctx.string_position + max_count; - let opcode = SreOpcode::try_from(ctx.peek_code(state, 0)).unwrap(); + let opcode = SreOpcode::try_from(ctx.peek_code(req, 0)).unwrap(); match opcode { SreOpcode::ANY => { - while !ctx.string_position < end && !ctx.at_linebreak(state) { - ctx.skip_char(state, 1); + while !ctx.string_position < end && !ctx.at_linebreak(req) { + ctx.skip_char(req, 1); } } SreOpcode::ANY_ALL => { - ctx.skip_char(state, max_count); + ctx.skip_char(req, max_count); } SreOpcode::IN => { - while !ctx.string_position < end - && charset(&ctx.pattern(state)[2..], ctx.peek_char(state)) + while !ctx.string_position < end && charset(&ctx.pattern(req)[2..], ctx.peek_char(req)) { - ctx.skip_char(state, 1); + ctx.skip_char(req, 1); } } SreOpcode::LITERAL => { - general_count_literal(state, &mut ctx, end, |code, c| code == c as u32); + general_count_literal(req, &mut ctx, end, |code, c| code == c as u32); } SreOpcode::NOT_LITERAL => { - general_count_literal(state, &mut ctx, end, |code, c| code != c as u32); + general_count_literal(req, &mut ctx, end, |code, c| code != c as u32); } SreOpcode::LITERAL_IGNORE => { - general_count_literal(state, &mut ctx, end, |code, c| { - code == lower_ascii(c) as u32 - }); + general_count_literal(req, &mut ctx, end, |code, c| code == lower_ascii(c) as u32); } SreOpcode::NOT_LITERAL_IGNORE => { - general_count_literal(state, &mut ctx, end, |code, c| { - code != lower_ascii(c) as u32 - }); + general_count_literal(req, &mut ctx, end, |code, c| code != lower_ascii(c) as u32); } SreOpcode::LITERAL_LOC_IGNORE => { - general_count_literal(state, &mut ctx, end, char_loc_ignore); + general_count_literal(req, &mut ctx, end, char_loc_ignore); } SreOpcode::NOT_LITERAL_LOC_IGNORE => { - general_count_literal(state, &mut ctx, end, |code, c| !char_loc_ignore(code, c)); + general_count_literal(req, &mut ctx, end, |code, c| !char_loc_ignore(code, c)); } SreOpcode::LITERAL_UNI_IGNORE => { - general_count_literal(state, &mut ctx, end, |code, c| { + general_count_literal(req, &mut ctx, end, |code, c| { code == lower_unicode(c) as u32 }); } SreOpcode::NOT_LITERAL_UNI_IGNORE => { - general_count_literal(state, &mut ctx, end, |code, c| { + general_count_literal(req, &mut ctx, end, |code, c| { code != lower_unicode(c) as u32 }); } @@ -1188,9 +1359,9 @@ fn _count<'a, S: StrDrive>( while count < max_count { ctx.code_position = reset_position; - let code = ctx.peek_code(state, 0); + let code = ctx.peek_code(req, 0); let code = SreOpcode::try_from(code).unwrap(); - dispatch(state, &mut ctx, code); + dispatch(req, state, &mut ctx, code); if ctx.has_matched == Some(false) { break; } @@ -1205,14 +1376,14 @@ fn _count<'a, S: StrDrive>( } fn general_count_literal<'a, S: StrDrive, F: FnMut(u32, u32) -> bool>( - state: &State<'a, S>, + req: &Request<'a, S>, ctx: &mut MatchContext<'a, S>, end: usize, mut f: F, ) { - let ch = ctx.peek_code(state, 1); - while !ctx.string_position < end && f(ch, ctx.peek_char(state)) { - ctx.skip_char(state, 1); + let ch = ctx.peek_code(req, 1); + while !ctx.string_position < end && f(ch, ctx.peek_char(req)) { + ctx.skip_char(req, 1); } } From c15387e97289386bbf0891a7ed18367220b59a15 Mon Sep 17 00:00:00 2001 From: Kangzhi Shi Date: Tue, 9 Aug 2022 16:44:47 +0200 Subject: [PATCH 063/960] refactor tests --- generate_tests.py | 2 +- src/engine.rs | 4 ++-- tests/tests.rs | 47 +++++++++++++++++++++++------------------------ 3 files changed, 26 insertions(+), 27 deletions(-) diff --git a/generate_tests.py b/generate_tests.py index b432720cd1..8adf043f29 100644 --- a/generate_tests.py +++ b/generate_tests.py @@ -33,7 +33,7 @@ def compile(cls, pattern, flags=0): def replace_compiled(m): line, indent, varname, pattern = m.groups() pattern = eval(pattern, {"re": CompiledPattern}) - pattern = f"Pattern {{ code: &{json.dumps(pattern.code)}, flags: SreFlag::from_bits_truncate({int(pattern.flags)}) }}" + pattern = f"Pattern {{ code: &{json.dumps(pattern.code)} }}" return f'''{line} {indent}// START GENERATED by generate_tests.py {indent}#[rustfmt::skip] let {varname} = {pattern}; diff --git a/src/engine.rs b/src/engine.rs index 5e1f1457ec..b9487a2fd2 100644 --- a/src/engine.rs +++ b/src/engine.rs @@ -81,7 +81,7 @@ pub struct State<'a, S: StrDrive> { pub string_position: usize, next_context: Option>, popped_has_matched: bool, - has_matched: bool, + pub has_matched: bool, } impl<'a, S: StrDrive> State<'a, S> { @@ -696,7 +696,7 @@ fn op_repeat<'a, S: StrDrive>( state.string_position = ctx.string_position; - let repeat_ctx_id = state.repeat_stack.len(); + let repeat_ctx_id = state.repeat_stack.len() - 1; // let next_ctx = next_ctx!(from 1, state, ctx, |state, ctx| { let next_ctx = ctx.next_from(1, req, state, |req, state, ctx| { diff --git a/tests/tests.rs b/tests/tests.rs index cc5c4d1f38..b4ad09f7be 100644 --- a/tests/tests.rs +++ b/tests/tests.rs @@ -1,18 +1,17 @@ -use sre_engine::constants::SreFlag; use sre_engine::engine; struct Pattern { code: &'static [u32], - flags: SreFlag, } impl Pattern { fn state<'a, S: engine::StrDrive>( &self, string: S, - range: std::ops::Range, - ) -> engine::State<'a, S> { - engine::State::new(string, range.start, range.end, self.flags, self.code) + ) -> (engine::Request<'a, S>, engine::State<'a, S>) { + let req = engine::Request::new(string, 0, usize::MAX, self.code, false); + let state = engine::State::new(0); + (req, state) } } @@ -20,10 +19,10 @@ impl Pattern { fn test_2427() { // pattern lookbehind = re.compile(r'(? Date: Tue, 9 Aug 2022 16:54:41 +0200 Subject: [PATCH 064/960] refactor benches --- benches/benches.rs | 95 +++++++++++++++++++++++++--------------------- 1 file changed, 52 insertions(+), 43 deletions(-) diff --git a/benches/benches.rs b/benches/benches.rs index d000ceb62e..e24ea2f972 100644 --- a/benches/benches.rs +++ b/benches/benches.rs @@ -3,68 +3,77 @@ extern crate test; use test::Bencher; -use sre_engine::constants::SreFlag; use sre_engine::engine; -pub struct Pattern { - pub code: &'static [u32], - pub flags: SreFlag, + +struct Pattern { + code: &'static [u32], } impl Pattern { - pub fn state<'a, S: engine::StrDrive>( + fn state<'a, S: engine::StrDrive>( + &self, + string: S, + ) -> (engine::Request<'a, S>, engine::State<'a, S>) { + self.state_range(string, 0..usize::MAX) + } + + fn state_range<'a, S: engine::StrDrive>( &self, string: S, range: std::ops::Range, - ) -> engine::State<'a, S> { - engine::State::new(string, range.start, range.end, self.flags, self.code) + ) -> (engine::Request<'a, S>, engine::State<'a, S>) { + let req = engine::Request::new(string, range.start, range.end, self.code, false); + let state = engine::State::new(0); + (req, state) } } + #[bench] fn benchmarks(b: &mut Bencher) { // # test common prefix // pattern p1 = re.compile('Python|Perl') # , 'Perl'), # Alternation // START GENERATED by generate_tests.py - #[rustfmt::skip] let p1 = Pattern { code: &[15, 8, 1, 4, 6, 1, 1, 80, 0, 17, 80, 7, 13, 17, 121, 17, 116, 17, 104, 17, 111, 17, 110, 16, 11, 9, 17, 101, 17, 114, 17, 108, 16, 2, 0, 1], flags: SreFlag::from_bits_truncate(32) }; + #[rustfmt::skip] let p1 = Pattern { code: &[15, 8, 1, 4, 6, 1, 1, 80, 0, 17, 80, 7, 13, 17, 121, 17, 116, 17, 104, 17, 111, 17, 110, 16, 11, 9, 17, 101, 17, 114, 17, 108, 16, 2, 0, 1] }; // END GENERATED // pattern p2 = re.compile('(Python|Perl)') #, 'Perl'), # Grouped alternation // START GENERATED by generate_tests.py - #[rustfmt::skip] let p2 = Pattern { code: &[15, 8, 1, 4, 6, 1, 0, 80, 0, 18, 0, 17, 80, 7, 13, 17, 121, 17, 116, 17, 104, 17, 111, 17, 110, 16, 11, 9, 17, 101, 17, 114, 17, 108, 16, 2, 0, 18, 1, 1], flags: SreFlag::from_bits_truncate(32) }; + #[rustfmt::skip] let p2 = Pattern { code: &[15, 8, 1, 4, 6, 1, 0, 80, 0, 18, 0, 17, 80, 7, 13, 17, 121, 17, 116, 17, 104, 17, 111, 17, 110, 16, 11, 9, 17, 101, 17, 114, 17, 108, 16, 2, 0, 18, 1, 1] }; // END GENERATED - // pattern pn = re.compile('Python|Perl|Tcl') #, 'Perl'), # Alternation + // pattern p3 = re.compile('Python|Perl|Tcl') #, 'Perl'), # Alternation // START GENERATED by generate_tests.py - #[rustfmt::skip] let p3 = Pattern { code: &[15, 9, 4, 3, 6, 17, 80, 17, 84, 0, 7, 15, 17, 80, 17, 121, 17, 116, 17, 104, 17, 111, 17, 110, 16, 22, 11, 17, 80, 17, 101, 17, 114, 17, 108, 16, 11, 9, 17, 84, 17, 99, 17, 108, 16, 2, 0, 1], flags: SreFlag::from_bits_truncate(32) }; + #[rustfmt::skip] let p3 = Pattern { code: &[15, 9, 4, 3, 6, 17, 80, 17, 84, 0, 7, 15, 17, 80, 17, 121, 17, 116, 17, 104, 17, 111, 17, 110, 16, 22, 11, 17, 80, 17, 101, 17, 114, 17, 108, 16, 11, 9, 17, 84, 17, 99, 17, 108, 16, 2, 0, 1] }; // END GENERATED - // pattern pn = re.compile('(Python|Perl|Tcl)') #, 'Perl'), # Grouped alternation + // pattern p4 = re.compile('(Python|Perl|Tcl)') #, 'Perl'), # Grouped alternation // START GENERATED by generate_tests.py - #[rustfmt::skip] let p4 = Pattern { code: &[15, 9, 4, 3, 6, 17, 80, 17, 84, 0, 18, 0, 7, 15, 17, 80, 17, 121, 17, 116, 17, 104, 17, 111, 17, 110, 16, 22, 11, 17, 80, 17, 101, 17, 114, 17, 108, 16, 11, 9, 17, 84, 17, 99, 17, 108, 16, 2, 0, 18, 1, 1], flags: SreFlag::from_bits_truncate(32) }; + #[rustfmt::skip] let p4 = Pattern { code: &[15, 9, 4, 3, 6, 17, 80, 17, 84, 0, 18, 0, 7, 15, 17, 80, 17, 121, 17, 116, 17, 104, 17, 111, 17, 110, 16, 22, 11, 17, 80, 17, 101, 17, 114, 17, 108, 16, 11, 9, 17, 84, 17, 99, 17, 108, 16, 2, 0, 18, 1, 1] }; // END GENERATED - // pattern pn = re.compile('(Python)\\1') #, 'PythonPython'), # Backreference + // pattern p5 = re.compile('(Python)\\1') #, 'PythonPython'), # Backreference // START GENERATED by generate_tests.py - #[rustfmt::skip] let p5 = Pattern { code: &[15, 18, 1, 12, 12, 6, 0, 80, 121, 116, 104, 111, 110, 0, 0, 0, 0, 0, 0, 18, 0, 17, 80, 17, 121, 17, 116, 17, 104, 17, 111, 17, 110, 18, 1, 12, 0, 1], flags: SreFlag::from_bits_truncate(32) }; + #[rustfmt::skip] let p5 = Pattern { code: &[15, 18, 1, 12, 12, 6, 0, 80, 121, 116, 104, 111, 110, 0, 0, 0, 0, 0, 0, 18, 0, 17, 80, 17, 121, 17, 116, 17, 104, 17, 111, 17, 110, 18, 1, 12, 0, 1] }; // END GENERATED - // pattern pn = re.compile('([0a-z][a-z0-9]*,)+') #, 'a5,b7,c9,'), # Disable the fastmap optimization + // pattern p6 = re.compile('([0a-z][a-z0-9]*,)+') #, 'a5,b7,c9,'), # Disable the fastmap optimization // START GENERATED by generate_tests.py - #[rustfmt::skip] let p6 = Pattern { code: &[15, 4, 0, 2, 4294967295, 24, 31, 1, 4294967295, 18, 0, 14, 7, 17, 48, 23, 97, 122, 0, 25, 13, 0, 4294967295, 14, 8, 23, 97, 122, 23, 48, 57, 0, 1, 17, 44, 18, 1, 19, 1], flags: SreFlag::from_bits_truncate(32) }; + #[rustfmt::skip] let p6 = Pattern { code: &[15, 4, 0, 2, 4294967295, 24, 31, 1, 4294967295, 18, 0, 14, 7, 17, 48, 23, 97, 122, 0, 25, 13, 0, 4294967295, 14, 8, 23, 97, 122, 23, 48, 57, 0, 1, 17, 44, 18, 1, 19, 1] }; // END GENERATED - // pattern pn = re.compile('([a-z][a-z0-9]*,)+') #, 'a5,b7,c9,'), # A few sets + // pattern p7 = re.compile('([a-z][a-z0-9]*,)+') #, 'a5,b7,c9,'), # A few sets // START GENERATED by generate_tests.py - #[rustfmt::skip] let p7 = Pattern { code: &[15, 4, 0, 2, 4294967295, 24, 29, 1, 4294967295, 18, 0, 14, 5, 23, 97, 122, 0, 25, 13, 0, 4294967295, 14, 8, 23, 97, 122, 23, 48, 57, 0, 1, 17, 44, 18, 1, 19, 1], flags: SreFlag::from_bits_truncate(32) }; + #[rustfmt::skip] let p7 = Pattern { code: &[15, 4, 0, 2, 4294967295, 24, 29, 1, 4294967295, 18, 0, 14, 5, 23, 97, 122, 0, 25, 13, 0, 4294967295, 14, 8, 23, 97, 122, 23, 48, 57, 0, 1, 17, 44, 18, 1, 19, 1] }; // END GENERATED - // pattern pn = re.compile('Python') #, 'Python'), # Simple text literal + // pattern p8 = re.compile('Python') #, 'Python'), # Simple text literal // START GENERATED by generate_tests.py - #[rustfmt::skip] let p8 = Pattern { code: &[15, 18, 3, 6, 6, 6, 6, 80, 121, 116, 104, 111, 110, 0, 0, 0, 0, 0, 0, 17, 80, 17, 121, 17, 116, 17, 104, 17, 111, 17, 110, 1], flags: SreFlag::from_bits_truncate(32) }; + #[rustfmt::skip] let p8 = Pattern { code: &[15, 18, 3, 6, 6, 6, 6, 80, 121, 116, 104, 111, 110, 0, 0, 0, 0, 0, 0, 17, 80, 17, 121, 17, 116, 17, 104, 17, 111, 17, 110, 1] }; // END GENERATED - // pattern pn = re.compile('.*Python') #, 'Python'), # Bad text literal + // pattern p9 = re.compile('.*Python') #, 'Python'), # Bad text literal // START GENERATED by generate_tests.py - #[rustfmt::skip] let p9 = Pattern { code: &[15, 4, 0, 6, 4294967295, 25, 5, 0, 4294967295, 2, 1, 17, 80, 17, 121, 17, 116, 17, 104, 17, 111, 17, 110, 1], flags: SreFlag::from_bits_truncate(32) }; + #[rustfmt::skip] let p9 = Pattern { code: &[15, 4, 0, 6, 4294967295, 25, 5, 0, 4294967295, 2, 1, 17, 80, 17, 121, 17, 116, 17, 104, 17, 111, 17, 110, 1] }; // END GENERATED - // pattern pn = re.compile('.*Python.*') #, 'Python'), # Worse text literal + // pattern p10 = re.compile('.*Python.*') #, 'Python'), # Worse text literal // START GENERATED by generate_tests.py - #[rustfmt::skip] let p10 = Pattern { code: &[15, 4, 0, 6, 4294967295, 25, 5, 0, 4294967295, 2, 1, 17, 80, 17, 121, 17, 116, 17, 104, 17, 111, 17, 110, 25, 5, 0, 4294967295, 2, 1, 1], flags: SreFlag::from_bits_truncate(32) }; + #[rustfmt::skip] let p10 = Pattern { code: &[15, 4, 0, 6, 4294967295, 25, 5, 0, 4294967295, 2, 1, 17, 80, 17, 121, 17, 116, 17, 104, 17, 111, 17, 110, 25, 5, 0, 4294967295, 2, 1, 1] }; // END GENERATED - // pattern pn = re.compile('.*(Python)') #, 'Python'), # Bad text literal with grouping + // pattern p11 = re.compile('.*(Python)') #, 'Python'), # Bad text literal with grouping // START GENERATED by generate_tests.py - #[rustfmt::skip] let p11 = Pattern { code: &[15, 4, 0, 6, 4294967295, 25, 5, 0, 4294967295, 2, 1, 18, 0, 17, 80, 17, 121, 17, 116, 17, 104, 17, 111, 17, 110, 18, 1, 1], flags: SreFlag::from_bits_truncate(32) }; + #[rustfmt::skip] let p11 = Pattern { code: &[15, 4, 0, 6, 4294967295, 25, 5, 0, 4294967295, 2, 1, 18, 0, 17, 80, 17, 121, 17, 116, 17, 104, 17, 111, 17, 110, 18, 1, 1] }; // END GENERATED let tests = [ @@ -83,29 +92,29 @@ fn benchmarks(b: &mut Bencher) { b.iter(move || { for (p, s) in &tests { - let mut state = p.state(s.clone(), 0..usize::MAX); - state.search(); + let (mut req, mut state) = p.state(s.clone()); + state.search(&mut req); assert!(state.has_matched); - state = p.state(s.clone(), 0..usize::MAX); - state.pymatch(); + let (mut req, mut state) = p.state(s.clone()); + state.pymatch(&mut req); assert!(state.has_matched); - state = p.state(s.clone(), 0..usize::MAX); - state.match_all = true; - state.pymatch(); + let (mut req, mut state) = p.state(s.clone()); + req.match_all = true; + state.pymatch(&mut req); assert!(state.has_matched); let s2 = format!("{}{}{}", " ".repeat(10000), s, " ".repeat(10000)); - state = p.state(s2.as_str(), 0..usize::MAX); - state.search(); + let (mut req, mut state) = p.state_range(s2.as_str(), 0..usize::MAX); + state.search(&mut req); assert!(state.has_matched); - state = p.state(s2.as_str(), 10000..usize::MAX); - state.pymatch(); + let (mut req, mut state) = p.state_range(s2.as_str(), 10000..usize::MAX); + state.pymatch(&mut req); assert!(state.has_matched); - state = p.state(s2.as_str(), 10000..10000 + s.len()); - state.pymatch(); + let (mut req, mut state) = p.state_range(s2.as_str(), 10000..10000 + s.len()); + state.pymatch(&mut req); assert!(state.has_matched); - state = p.state(s2.as_str(), 10000..10000 + s.len()); - state.match_all = true; - state.pymatch(); + let (mut req, mut state) = p.state_range(s2.as_str(), 10000..10000 + s.len()); + req.match_all = true; + state.pymatch(&mut req); assert!(state.has_matched); } }) From de8973d77a40303693e8e15da70fe63f6f974546 Mon Sep 17 00:00:00 2001 From: Kangzhi Shi Date: Tue, 9 Aug 2022 17:34:03 +0200 Subject: [PATCH 065/960] simplify lifetime --- benches/benches.rs | 6 +- src/engine.rs | 265 ++++++++++++++++++--------------------------- tests/tests.rs | 4 +- 3 files changed, 110 insertions(+), 165 deletions(-) diff --git a/benches/benches.rs b/benches/benches.rs index e24ea2f972..8e0e87935a 100644 --- a/benches/benches.rs +++ b/benches/benches.rs @@ -13,7 +13,7 @@ impl Pattern { fn state<'a, S: engine::StrDrive>( &self, string: S, - ) -> (engine::Request<'a, S>, engine::State<'a, S>) { + ) -> (engine::Request<'a, S>, engine::State) { self.state_range(string, 0..usize::MAX) } @@ -21,9 +21,9 @@ impl Pattern { &self, string: S, range: std::ops::Range, - ) -> (engine::Request<'a, S>, engine::State<'a, S>) { + ) -> (engine::Request<'a, S>, engine::State) { let req = engine::Request::new(string, range.start, range.end, self.code, false); - let state = engine::State::new(0); + let state = engine::State::new(); (req, state) } } diff --git a/src/engine.rs b/src/engine.rs index b9487a2fd2..ace75d1a36 100644 --- a/src/engine.rs +++ b/src/engine.rs @@ -39,25 +39,6 @@ impl<'a, S: StrDrive> Request<'a, S> { } } -macro_rules! next_ctx { - (offset $offset:expr, $state:expr, $ctx:expr, $handler:expr) => { - next_ctx!(position $ctx.code_position + $offset, $state, $ctx, $handler) - }; - (from $peek:expr, $state:expr, $ctx:expr, $handler:expr) => { - next_ctx!(offset $ctx.peek_code($state, $peek) as usize + 1, $state, $ctx, $handler) - }; - (position $position:expr, $state:expr, $ctx:expr, $handler:expr) => {{ - $ctx.handler = Some($handler); - $state.next_context.insert(MatchContext { - code_position: $position, - has_matched: None, - handler: None, - count: -1, - ..*$ctx - }) - }}; -} - macro_rules! mark { (push, $state:expr) => { $state @@ -72,27 +53,27 @@ macro_rules! mark { } #[derive(Debug)] -pub struct State<'a, S: StrDrive> { +pub struct State { pub marks: Vec>, pub lastindex: isize, marks_stack: Vec<(Vec>, isize)>, - context_stack: Vec>, + context_stack: Vec>, repeat_stack: Vec, pub string_position: usize, - next_context: Option>, + next_context: Option>, popped_has_matched: bool, pub has_matched: bool, } -impl<'a, S: StrDrive> State<'a, S> { - pub fn new(string_position: usize) -> Self { +impl State { + pub fn new() -> Self { Self { marks: Vec::new(), lastindex: -1, marks_stack: Vec::new(), context_stack: Vec::new(), repeat_stack: Vec::new(), - string_position, + string_position: 0, next_context: None, popped_has_matched: false, has_matched: false, @@ -145,7 +126,7 @@ impl<'a, S: StrDrive> State<'a, S> { self.marks_stack.pop(); } - fn _match(&mut self, req: &mut Request<'a, S>) { + fn _match(&mut self, req: &mut Request) { while let Some(mut ctx) = self.context_stack.pop() { if let Some(handler) = ctx.handler.take() { handler(req, self, &mut ctx); @@ -169,7 +150,9 @@ impl<'a, S: StrDrive> State<'a, S> { self.has_matched = self.popped_has_matched; } - pub fn pymatch(&mut self, req: &mut Request<'a, S>) { + pub fn pymatch(&mut self, req: &mut Request) { + self.string_position = req.start; + let ctx = MatchContext { string_position: req.start, string_offset: req.string.offset(0, req.start), @@ -185,7 +168,9 @@ impl<'a, S: StrDrive> State<'a, S> { self._match(req); } - pub fn search(&mut self, req: &mut Request<'a, S>) { + pub fn search(&mut self, req: &mut Request) { + self.string_position = req.start; + // TODO: optimize by op info and skip prefix if req.start > req.end { return; @@ -196,7 +181,7 @@ impl<'a, S: StrDrive> State<'a, S> { let mut start_offset = req.string.offset(0, req.start); - let mut ctx = MatchContext { + let ctx = MatchContext { string_position: req.start, string_offset: start_offset, code_position: 0, @@ -240,10 +225,10 @@ impl<'a, S: StrDrive> State<'a, S> { } } -fn dispatch<'a, S: StrDrive>( - req: &Request<'a, S>, - state: &mut State<'a, S>, - ctx: &mut MatchContext<'a, S>, +fn dispatch( + req: &Request, + state: &mut State, + ctx: &mut MatchContext, opcode: SreOpcode, ) { match opcode { @@ -410,11 +395,7 @@ fn dispatch<'a, S: StrDrive>( /* assert subpattern */ /* */ -fn op_assert<'a, S: StrDrive>( - req: &Request<'a, S>, - state: &mut State<'a, S>, - ctx: &mut MatchContext<'a, S>, -) { +fn op_assert(req: &Request, state: &mut State, ctx: &mut MatchContext) { let back = ctx.peek_code(req, 2) as usize; if ctx.string_position < back { return ctx.failure(); @@ -435,18 +416,14 @@ fn op_assert<'a, S: StrDrive>( /* assert not subpattern */ /* */ -fn op_assert_not<'a, S: StrDrive>( - req: &Request<'a, S>, - state: &mut State<'a, S>, - ctx: &mut MatchContext<'a, S>, -) { +fn op_assert_not(req: &Request, state: &mut State, ctx: &mut MatchContext) { let back = ctx.peek_code(req, 2) as usize; if ctx.string_position < back { return ctx.skip_code_from(req, 1); } - let next_ctx = next_ctx!(offset 3, state, ctx, |req, state, ctx| { + let next_ctx = ctx.next_offset(3, state, |req, state, ctx| { if state.popped_has_matched { ctx.failure(); } else { @@ -460,20 +437,16 @@ fn op_assert_not<'a, S: StrDrive>( // alternation // <0=skip> code ... -fn op_branch<'a, S: StrDrive>( - req: &Request<'a, S>, - state: &mut State<'a, S>, - ctx: &mut MatchContext<'a, S>, -) { +fn op_branch(req: &Request, state: &mut State, ctx: &mut MatchContext) { mark!(push, state); ctx.count = 1; create_context(req, state, ctx); - fn create_context<'a, S: StrDrive>( - req: &Request<'a, S>, - state: &mut State<'a, S>, - ctx: &mut MatchContext<'a, S>, + fn create_context( + req: &Request, + state: &mut State, + ctx: &mut MatchContext, ) { let branch_offset = ctx.count as usize; let next_length = ctx.peek_code(req, branch_offset) as isize; @@ -485,14 +458,10 @@ fn op_branch<'a, S: StrDrive>( state.string_position = ctx.string_position; ctx.count += next_length; - next_ctx!(offset branch_offset + 1, state, ctx, callback); + ctx.next_offset(branch_offset + 1, state, callback); } - fn callback<'a, S: StrDrive>( - req: &Request<'a, S>, - state: &mut State<'a, S>, - ctx: &mut MatchContext<'a, S>, - ) { + fn callback(req: &Request, state: &mut State, ctx: &mut MatchContext) { if state.popped_has_matched { return ctx.success(); } @@ -502,10 +471,10 @@ fn op_branch<'a, S: StrDrive>( } /* <1=min> <2=max> item tail */ -fn op_min_repeat_one<'a, S: StrDrive>( - req: &Request<'a, S>, - state: &mut State<'a, S>, - ctx: &mut MatchContext<'a, S>, +fn op_min_repeat_one( + req: &Request, + state: &mut State, + ctx: &mut MatchContext, ) { let min_count = ctx.peek_code(req, 2) as usize; @@ -536,10 +505,10 @@ fn op_min_repeat_one<'a, S: StrDrive>( mark!(push, state); create_context(req, state, ctx); - fn create_context<'a, S: StrDrive>( - req: &Request<'a, S>, - state: &mut State<'a, S>, - ctx: &mut MatchContext<'a, S>, + fn create_context( + req: &Request, + state: &mut State, + ctx: &mut MatchContext, ) { let max_count = ctx.peek_code(req, 3) as usize; @@ -553,11 +522,7 @@ fn op_min_repeat_one<'a, S: StrDrive>( } } - fn callback<'a, S: StrDrive>( - req: &Request<'a, S>, - state: &mut State<'a, S>, - ctx: &mut MatchContext<'a, S>, - ) { + fn callback(req: &Request, state: &mut State, ctx: &mut MatchContext) { if state.popped_has_matched { return ctx.success(); } @@ -582,11 +547,7 @@ exactly one character wide, and we're not already collecting backtracking points. for other cases, use the MAX_REPEAT operator */ /* <1=min> <2=max> item tail */ -fn op_repeat_one<'a, S: StrDrive>( - req: &Request<'a, S>, - state: &mut State<'a, S>, - ctx: &mut MatchContext<'a, S>, -) { +fn op_repeat_one(req: &Request, state: &mut State, ctx: &mut MatchContext) { let min_count = ctx.peek_code(req, 2) as usize; let max_count = ctx.peek_code(req, 3) as usize; @@ -613,10 +574,10 @@ fn op_repeat_one<'a, S: StrDrive>( ctx.count = count as isize; create_context(req, state, ctx); - fn create_context<'a, S: StrDrive>( - req: &Request<'a, S>, - state: &mut State<'a, S>, - ctx: &mut MatchContext<'a, S>, + fn create_context( + req: &Request, + state: &mut State, + ctx: &mut MatchContext, ) { let min_count = ctx.peek_code(req, 2) as isize; let next_code = ctx.peek_code(req, ctx.peek_code(req, 1) as usize + 1); @@ -641,11 +602,7 @@ fn op_repeat_one<'a, S: StrDrive>( ctx.next_from(1, req, state, callback); } - fn callback<'a, S: StrDrive>( - req: &Request<'a, S>, - state: &mut State<'a, S>, - ctx: &mut MatchContext<'a, S>, - ) { + fn callback(req: &Request, state: &mut State, ctx: &mut MatchContext) { if state.popped_has_matched { return ctx.success(); } @@ -678,11 +635,7 @@ struct RepeatContext { /* create repeat context. all the hard work is done by the UNTIL operator (MAX_UNTIL, MIN_UNTIL) */ /* <1=min> <2=max> item tail */ -fn op_repeat<'a, S: StrDrive>( - req: &Request<'a, S>, - state: &mut State<'a, S>, - ctx: &mut MatchContext<'a, S>, -) { +fn op_repeat(req: &Request, state: &mut State, ctx: &mut MatchContext) { let repeat_ctx = RepeatContext { count: -1, min_count: ctx.peek_code(req, 2) as usize, @@ -698,8 +651,7 @@ fn op_repeat<'a, S: StrDrive>( let repeat_ctx_id = state.repeat_stack.len() - 1; - // let next_ctx = next_ctx!(from 1, state, ctx, |state, ctx| { - let next_ctx = ctx.next_from(1, req, state, |req, state, ctx| { + let next_ctx = ctx.next_from(1, req, state, |_, state, ctx| { ctx.has_matched = Some(state.popped_has_matched); state.repeat_stack.pop(); }); @@ -707,7 +659,7 @@ fn op_repeat<'a, S: StrDrive>( } /* minimizing repeat */ -fn op_min_until<'a, S: StrDrive>(state: &mut State<'a, S>, ctx: &mut MatchContext<'a, S>) { +fn op_min_until(state: &mut State, ctx: &mut MatchContext) { let repeat_ctx = state.repeat_stack.last_mut().unwrap(); state.string_position = ctx.string_position; @@ -716,8 +668,7 @@ fn op_min_until<'a, S: StrDrive>(state: &mut State<'a, S>, ctx: &mut MatchContex if (repeat_ctx.count as usize) < repeat_ctx.min_count { // not enough matches - // next_ctx!(position repeat_ctx.code_position + 4, state, ctx, |state, ctx| { - ctx.next_at(repeat_ctx.code_position + 4, state, |req, state, ctx| { + ctx.next_at(repeat_ctx.code_position + 4, state, |_, state, ctx| { if state.popped_has_matched { ctx.success(); } else { @@ -736,8 +687,7 @@ fn op_min_until<'a, S: StrDrive>(state: &mut State<'a, S>, ctx: &mut MatchContex let repeat_ctx_prev_id = repeat_ctx.prev_id; // see if the tail matches - // let next_ctx = next_ctx!(offset 1, state, ctx, |state, ctx| { - let next_ctx = ctx.next_offset(1, state, |req, state, ctx| { + let next_ctx = ctx.next_offset(1, state, |_, state, ctx| { if state.popped_has_matched { return ctx.success(); } @@ -762,8 +712,7 @@ fn op_min_until<'a, S: StrDrive>(state: &mut State<'a, S>, ctx: &mut MatchContex /* zero-width match protection */ repeat_ctx.last_position = state.string_position; - // next_ctx!(position repeat_ctx.code_position + 4, state, ctx, |state, ctx| { - ctx.next_at(repeat_ctx.code_position + 4, state, |req, state, ctx| { + ctx.next_at(repeat_ctx.code_position + 4, state, |_, state, ctx| { if state.popped_has_matched { ctx.success(); } else { @@ -777,7 +726,7 @@ fn op_min_until<'a, S: StrDrive>(state: &mut State<'a, S>, ctx: &mut MatchContex } /* maximizing repeat */ -fn op_max_until<'a, S: StrDrive>(state: &mut State<'a, S>, ctx: &mut MatchContext<'a, S>) { +fn op_max_until(state: &mut State, ctx: &mut MatchContext) { let repeat_ctx = &mut state.repeat_stack[ctx.repeat_ctx_id]; state.string_position = ctx.string_position; @@ -786,8 +735,7 @@ fn op_max_until<'a, S: StrDrive>(state: &mut State<'a, S>, ctx: &mut MatchContex if (repeat_ctx.count as usize) < repeat_ctx.min_count { // not enough matches - // next_ctx!(position repeat_ctx.code_position + 4, state, ctx, |state, ctx| { - ctx.next_at(repeat_ctx.code_position + 4, state, |req, state, ctx| { + ctx.next_at(repeat_ctx.code_position + 4, state, |_, state, ctx| { if state.popped_has_matched { ctx.success(); } else { @@ -809,7 +757,7 @@ fn op_max_until<'a, S: StrDrive>(state: &mut State<'a, S>, ctx: &mut MatchContex ctx.count = repeat_ctx.last_position as isize; repeat_ctx.last_position = state.string_position; - ctx.next_at(repeat_ctx.code_position + 4, state, |req, state, ctx| { + ctx.next_at(repeat_ctx.code_position + 4, state, |_, state, ctx| { let save_last_position = ctx.count as usize; let repeat_ctx = &mut state.repeat_stack[ctx.repeat_ctx_id]; repeat_ctx.last_position = save_last_position; @@ -826,22 +774,21 @@ fn op_max_until<'a, S: StrDrive>(state: &mut State<'a, S>, ctx: &mut MatchContex /* cannot match more repeated items here. make sure the tail matches */ - let next_ctx = next_ctx!(offset 1, state, ctx, tail_callback); - next_ctx.repeat_ctx_id = repeat_ctx.prev_id; + let repeat_ctx_prev_id = repeat_ctx.prev_id; + let next_ctx = ctx.next_offset(1, state, tail_callback); + next_ctx.repeat_ctx_id = repeat_ctx_prev_id; }); return; } /* cannot match more repeated items here. make sure the tail matches */ - let next_ctx = next_ctx!(offset 1, state, ctx, tail_callback); - next_ctx.repeat_ctx_id = repeat_ctx.prev_id; + // let next_ctx = next_ctx!(offset 1, state, ctx, tail_callback); + let repeat_ctx_prev_id = repeat_ctx.prev_id; + let next_ctx = ctx.next_offset(1, state, tail_callback); + next_ctx.repeat_ctx_id = repeat_ctx_prev_id; - fn tail_callback<'a, S: StrDrive>( - req: &Request<'a, S>, - state: &mut State<'a, S>, - ctx: &mut MatchContext<'a, S>, - ) { + fn tail_callback(_: &Request, state: &mut State, ctx: &mut MatchContext) { if state.popped_has_matched { ctx.success(); } else { @@ -926,19 +873,21 @@ impl<'a> StrDrive for &'a [u8] { } } +type OpFunc = for<'a> fn(&Request<'a, S>, &mut State, &mut MatchContext); + #[derive(Clone, Copy)] -struct MatchContext<'a, S: StrDrive> { +struct MatchContext { string_position: usize, string_offset: usize, code_position: usize, has_matched: Option, toplevel: bool, - handler: Option, &mut State<'a, S>, &mut Self)>, + handler: Option>, repeat_ctx_id: usize, count: isize, } -impl<'a, S: StrDrive> std::fmt::Debug for MatchContext<'a, S> { +impl<'a, S: StrDrive> std::fmt::Debug for MatchContext { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { f.debug_struct("MatchContext") .field("string_position", &self.string_position) @@ -953,38 +902,38 @@ impl<'a, S: StrDrive> std::fmt::Debug for MatchContext<'a, S> { } } -impl<'a, S: StrDrive> MatchContext<'a, S> { - fn pattern(&self, req: &Request<'a, S>) -> &'a [u32] { +impl MatchContext { + fn pattern<'a>(&self, req: &Request<'a, S>) -> &'a [u32] { &req.pattern_codes[self.code_position..] } - fn remaining_codes(&self, req: &Request<'a, S>) -> usize { + fn remaining_codes(&self, req: &Request) -> usize { req.pattern_codes.len() - self.code_position } - fn remaining_chars(&self, req: &Request<'a, S>) -> usize { + fn remaining_chars(&self, req: &Request) -> usize { req.end - self.string_position } - fn peek_char(&self, req: &Request<'a, S>) -> u32 { + fn peek_char(&self, req: &Request) -> u32 { req.string.peek(self.string_offset) } - fn skip_char(&mut self, req: &Request<'a, S>, skip: usize) { + fn skip_char(&mut self, req: &Request, skip: usize) { self.string_position += skip; self.string_offset = req.string.offset(self.string_offset, skip); } - fn back_peek_char(&self, req: &Request<'a, S>) -> u32 { + fn back_peek_char(&self, req: &Request) -> u32 { req.string.back_peek(self.string_offset) } - fn back_skip_char(&mut self, req: &Request<'a, S>, skip: usize) { + fn back_skip_char(&mut self, req: &Request, skip: usize) { self.string_position -= skip; self.string_offset = req.string.back_offset(self.string_offset, skip); } - fn peek_code(&self, req: &Request<'a, S>, peek: usize) -> u32 { + fn peek_code(&self, req: &Request, peek: usize) -> u32 { req.pattern_codes[self.code_position + peek] } @@ -992,7 +941,7 @@ impl<'a, S: StrDrive> MatchContext<'a, S> { self.code_position += skip; } - fn skip_code_from(&mut self, req: &Request<'a, S>, peek: usize) { + fn skip_code_from(&mut self, req: &Request, peek: usize) { self.skip_code(self.peek_code(req, peek) as usize + 1); } @@ -1001,19 +950,15 @@ impl<'a, S: StrDrive> MatchContext<'a, S> { self.string_position == 0 } - fn at_end(&self, req: &Request<'a, S>) -> bool { + fn at_end(&self, req: &Request) -> bool { self.string_position == req.end } - fn at_linebreak(&self, req: &Request<'a, S>) -> bool { + fn at_linebreak(&self, req: &Request) -> bool { !self.at_end(req) && is_linebreak(self.peek_char(req)) } - fn at_boundary bool>( - &self, - req: &Request<'a, S>, - mut word_checker: F, - ) -> bool { + fn at_boundary bool>(&self, req: &Request, mut word_checker: F) -> bool { if self.at_beginning() && self.at_end(req) { return false; } @@ -1024,7 +969,7 @@ impl<'a, S: StrDrive> MatchContext<'a, S> { fn at_non_boundary bool>( &self, - req: &Request<'a, S>, + req: &Request, mut word_checker: F, ) -> bool { if self.at_beginning() && self.at_end(req) { @@ -1035,7 +980,7 @@ impl<'a, S: StrDrive> MatchContext<'a, S> { this == that } - fn can_success(&self, req: &Request<'a, S>) -> bool { + fn can_success(&self, req: &Request) -> bool { if !self.toplevel { return true; } @@ -1059,9 +1004,9 @@ impl<'a, S: StrDrive> MatchContext<'a, S> { fn next_from<'b>( &mut self, peek: usize, - req: &Request<'a, S>, - state: &'b mut State<'a, S>, - f: fn(&Request<'a, S>, &mut State<'a, S>, &mut Self), + req: &Request, + state: &'b mut State, + f: OpFunc, ) -> &'b mut Self { self.next_offset(self.peek_code(req, peek) as usize + 1, state, f) } @@ -1069,8 +1014,8 @@ impl<'a, S: StrDrive> MatchContext<'a, S> { fn next_offset<'b>( &mut self, offset: usize, - state: &'b mut State<'a, S>, - f: fn(&Request<'a, S>, &mut State<'a, S>, &mut Self), + state: &'b mut State, + f: OpFunc, ) -> &'b mut Self { self.next_at(self.code_position + offset, state, f) } @@ -1078,8 +1023,8 @@ impl<'a, S: StrDrive> MatchContext<'a, S> { fn next_at<'b>( &mut self, code_position: usize, - state: &'b mut State<'a, S>, - f: fn(&Request<'a, S>, &mut State<'a, S>, &mut Self), + state: &'b mut State, + f: OpFunc, ) -> &'b mut Self { self.handler = Some(f); state.next_context.insert(MatchContext { @@ -1092,7 +1037,7 @@ impl<'a, S: StrDrive> MatchContext<'a, S> { } } -fn at<'a, S: StrDrive>(req: &Request<'a, S>, ctx: &MatchContext<'a, S>, atcode: SreAtCode) -> bool { +fn at(req: &Request, ctx: &MatchContext, atcode: SreAtCode) -> bool { match atcode { SreAtCode::BEGINNING | SreAtCode::BEGINNING_STRING => ctx.at_beginning(), SreAtCode::BEGINNING_LINE => ctx.at_beginning() || is_linebreak(ctx.back_peek_char(req)), @@ -1110,9 +1055,9 @@ fn at<'a, S: StrDrive>(req: &Request<'a, S>, ctx: &MatchContext<'a, S>, atcode: } } -fn general_op_literal<'a, S: StrDrive, F: FnOnce(u32, u32) -> bool>( - req: &Request<'a, S>, - ctx: &mut MatchContext<'a, S>, +fn general_op_literal bool>( + req: &Request, + ctx: &mut MatchContext, f: F, ) { if ctx.at_end(req) || !f(ctx.peek_code(req, 1), ctx.peek_char(req)) { @@ -1123,9 +1068,9 @@ fn general_op_literal<'a, S: StrDrive, F: FnOnce(u32, u32) -> bool>( } } -fn general_op_in<'a, S: StrDrive, F: FnOnce(&[u32], u32) -> bool>( - req: &Request<'a, S>, - ctx: &mut MatchContext<'a, S>, +fn general_op_in bool>( + req: &Request, + ctx: &mut MatchContext, f: F, ) { if ctx.at_end(req) || !f(&ctx.pattern(req)[2..], ctx.peek_char(req)) { @@ -1136,10 +1081,10 @@ fn general_op_in<'a, S: StrDrive, F: FnOnce(&[u32], u32) -> bool>( } } -fn general_op_groupref<'a, S: StrDrive, F: FnMut(u32) -> u32>( - req: &Request<'a, S>, - state: &State<'a, S>, - ctx: &mut MatchContext<'a, S>, +fn general_op_groupref u32>( + req: &Request, + state: &State, + ctx: &mut MatchContext, mut f: F, ) { let (group_start, group_end) = state.get_marks(ctx.peek_code(req, 1) as usize); @@ -1296,10 +1241,10 @@ fn charset(set: &[u32], ch: u32) -> bool { false } -fn _count<'a, S: StrDrive>( - req: &Request<'a, S>, - state: &mut State<'a, S>, - ctx: &MatchContext<'a, S>, +fn _count( + req: &Request, + state: &mut State, + ctx: &MatchContext, max_count: usize, ) -> usize { let mut ctx = *ctx; @@ -1375,9 +1320,9 @@ fn _count<'a, S: StrDrive>( ctx.string_position - state.string_position } -fn general_count_literal<'a, S: StrDrive, F: FnMut(u32, u32) -> bool>( - req: &Request<'a, S>, - ctx: &mut MatchContext<'a, S>, +fn general_count_literal bool>( + req: &Request, + ctx: &mut MatchContext, end: usize, mut f: F, ) { diff --git a/tests/tests.rs b/tests/tests.rs index b4ad09f7be..ead111c74a 100644 --- a/tests/tests.rs +++ b/tests/tests.rs @@ -8,9 +8,9 @@ impl Pattern { fn state<'a, S: engine::StrDrive>( &self, string: S, - ) -> (engine::Request<'a, S>, engine::State<'a, S>) { + ) -> (engine::Request<'a, S>, engine::State) { let req = engine::Request::new(string, 0, usize::MAX, self.code, false); - let state = engine::State::new(0); + let state = engine::State::new(); (req, state) } } From c494feb7f776e8e15185710f72d41b603db995d8 Mon Sep 17 00:00:00 2001 From: Kangzhi Shi Date: Tue, 9 Aug 2022 21:34:06 +0200 Subject: [PATCH 066/960] refactor split Marks --- Cargo.toml | 1 + benches/benches.rs | 2 +- src/engine.rs | 242 ++++++++++++++++++++++++++++++--------------- tests/tests.rs | 5 +- 4 files changed, 166 insertions(+), 84 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 4b403f2861..8993c1e71d 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -12,3 +12,4 @@ include = ["LICENSE", "src/**/*.rs"] [dependencies] num_enum = "0.5" bitflags = "1.2" +optional = "0.5" diff --git a/benches/benches.rs b/benches/benches.rs index 8e0e87935a..f19b92d64b 100644 --- a/benches/benches.rs +++ b/benches/benches.rs @@ -23,7 +23,7 @@ impl Pattern { range: std::ops::Range, ) -> (engine::Request<'a, S>, engine::State) { let req = engine::Request::new(string, range.start, range.end, self.code, false); - let state = engine::State::new(); + let state = engine::State::default(); (req, state) } } diff --git a/src/engine.rs b/src/engine.rs index ace75d1a36..087d64e8cd 100644 --- a/src/engine.rs +++ b/src/engine.rs @@ -2,7 +2,9 @@ use super::constants::{SreAtCode, SreCatCode, SreInfo, SreOpcode}; use super::MAXREPEAT; +use optional::Optioned; use std::convert::TryFrom; +use std::ops::Deref; const fn is_py_ascii_whitespace(b: u8) -> bool { matches!(b, b'\t' | b'\n' | b'\x0C' | b'\r' | b' ' | b'\x0B') @@ -39,24 +41,98 @@ impl<'a, S: StrDrive> Request<'a, S> { } } -macro_rules! mark { - (push, $state:expr) => { - $state - .marks_stack - .push(($state.marks.clone(), $state.lastindex)) - }; - (pop, $state:expr) => { - let (marks, lastindex) = $state.marks_stack.pop().unwrap(); - $state.marks = marks; - $state.lastindex = lastindex; - }; +// macro_rules! mark { +// (push, $state:expr) => { +// $state +// .marks_stack +// .push(($state.marks.clone(), $state.lastindex)) +// }; +// (pop, $state:expr) => { +// let (marks, lastindex) = $state.marks_stack.pop().unwrap(); +// $state.marks = marks; +// $state.lastindex = lastindex; +// }; +// } + +#[derive(Debug)] +pub struct Marks { + last_index: isize, + marks: Vec>, + marks_stack: Vec<(Vec>, isize)>, +} + +impl Default for Marks { + fn default() -> Self { + Self { + last_index: -1, + marks: Vec::new(), + marks_stack: Vec::new(), + } + } +} + +impl Deref for Marks { + type Target = Vec>; + + fn deref(&self) -> &Self::Target { + &self.marks + } +} + +impl Marks { + pub fn get(&self, group_index: usize) -> (Optioned, Optioned) { + let marks_index = 2 * group_index; + if marks_index + 1 < self.marks.len() { + (self.marks[marks_index], self.marks[marks_index + 1]) + } else { + (Optioned::none(), Optioned::none()) + } + } + + pub fn last_index(&self) -> isize { + self.last_index + } + + fn set(&mut self, mark_nr: usize, position: usize) { + if mark_nr & 1 != 0 { + self.last_index = mark_nr as isize / 2 + 1; + } + if mark_nr >= self.marks.len() { + self.marks.resize(mark_nr + 1, Optioned::none()); + } + self.marks[mark_nr] = Optioned::some(position); + } + + fn push(&mut self) { + self.marks_stack.push((self.marks.clone(), self.last_index)); + } + + fn pop(&mut self) { + let (marks, last_index) = self.marks_stack.pop().unwrap(); + self.marks = marks; + self.last_index = last_index; + } + + fn pop_keep(&mut self) { + let (marks, last_index) = self.marks_stack.last().unwrap().clone(); + self.marks = marks; + self.last_index = last_index; + } + + fn pop_discard(&mut self) { + self.marks_stack.pop(); + } + + fn clear(&mut self) { + self.last_index = -1; + self.marks.clear(); + self.marks_stack.clear(); + } } #[derive(Debug)] pub struct State { - pub marks: Vec>, - pub lastindex: isize, - marks_stack: Vec<(Vec>, isize)>, + pub marks: Marks, context_stack: Vec>, repeat_stack: Vec, pub string_position: usize, @@ -65,25 +141,23 @@ pub struct State { pub has_matched: bool, } -impl State { - pub fn new() -> Self { +impl Default for State { + fn default() -> Self { Self { - marks: Vec::new(), - lastindex: -1, - marks_stack: Vec::new(), - context_stack: Vec::new(), - repeat_stack: Vec::new(), - string_position: 0, - next_context: None, - popped_has_matched: false, - has_matched: false, + marks: Default::default(), + context_stack: Default::default(), + repeat_stack: Default::default(), + string_position: Default::default(), + next_context: Default::default(), + popped_has_matched: Default::default(), + has_matched: Default::default(), } } +} +impl State { pub fn reset(&mut self, string_position: usize) { - self.lastindex = -1; self.marks.clear(); - self.marks_stack.clear(); self.context_stack.clear(); self.repeat_stack.clear(); self.string_position = string_position; @@ -92,23 +166,23 @@ impl State { self.has_matched = false; } - fn set_mark(&mut self, mark_nr: usize, position: usize) { - if mark_nr & 1 != 0 { - self.lastindex = mark_nr as isize / 2 + 1; - } - if mark_nr >= self.marks.len() { - self.marks.resize(mark_nr + 1, None); - } - self.marks[mark_nr] = Some(position); - } - fn get_marks(&self, group_index: usize) -> (Option, Option) { - let marks_index = 2 * group_index; - if marks_index + 1 < self.marks.len() { - (self.marks[marks_index], self.marks[marks_index + 1]) - } else { - (None, None) - } - } + // fn set_mark(&mut self, mark_nr: usize, position: usize) { + // if mark_nr & 1 != 0 { + // self.lastindex = mark_nr as isize / 2 + 1; + // } + // if mark_nr >= self.marks.len() { + // self.marks.resize(mark_nr + 1, None); + // } + // self.marks[mark_nr] = Some(position); + // } + // fn get_marks(&self, group_index: usize) -> (Option, Option) { + // let marks_index = 2 * group_index; + // if marks_index + 1 < self.marks.len() { + // (self.marks[marks_index], self.marks[marks_index + 1]) + // } else { + // (None, None) + // } + // } // fn marks_push(&mut self) { // self.marks_stack.push((self.marks.clone(), self.lastindex)); // } @@ -117,14 +191,14 @@ impl State { // self.marks = marks; // self.lastindex = lastindex; // } - fn marks_pop_keep(&mut self) { - let (marks, lastindex) = self.marks_stack.last().unwrap().clone(); - self.marks = marks; - self.lastindex = lastindex; - } - fn marks_pop_discard(&mut self) { - self.marks_stack.pop(); - } + // fn marks_pop_keep(&mut self) { + // let (marks, lastindex) = self.marks_stack.last().unwrap().clone(); + // self.marks = marks; + // self.lastindex = lastindex; + // } + // fn marks_pop_discard(&mut self) { + // self.marks_stack.pop(); + // } fn _match(&mut self, req: &mut Request) { while let Some(mut ctx) = self.context_stack.pop() { @@ -311,7 +385,9 @@ fn dispatch( general_op_literal(req, ctx, |code, c| !char_loc_ignore(code, c)) } SreOpcode::MARK => { - state.set_mark(ctx.peek_code(req, 1) as usize, ctx.string_position); + state + .marks + .set(ctx.peek_code(req, 1) as usize, ctx.string_position); ctx.skip_code(2); } SreOpcode::MAX_UNTIL => op_max_until(state, ctx), @@ -324,12 +400,14 @@ fn dispatch( SreOpcode::GROUPREF_LOC_IGNORE => general_op_groupref(req, state, ctx, lower_locate), SreOpcode::GROUPREF_UNI_IGNORE => general_op_groupref(req, state, ctx, lower_unicode), SreOpcode::GROUPREF_EXISTS => { - let (group_start, group_end) = state.get_marks(ctx.peek_code(req, 1) as usize); - match (group_start, group_end) { - (Some(start), Some(end)) if start <= end => { - ctx.skip_code(3); - } - _ => ctx.skip_code_from(req, 2), + let (group_start, group_end) = state.marks.get(ctx.peek_code(req, 1) as usize); + if group_start.is_some() + && group_end.is_some() + && group_start.unpack() <= group_end.unpack() + { + ctx.skip_code(3); + } else { + ctx.skip_code_from(req, 2) } } _ => unreachable!("unexpected opcode"), @@ -438,7 +516,7 @@ fn op_assert_not(req: &Request, state: &mut State, ctx: &mut // alternation // <0=skip> code ... fn op_branch(req: &Request, state: &mut State, ctx: &mut MatchContext) { - mark!(push, state); + state.marks.push(); ctx.count = 1; create_context(req, state, ctx); @@ -451,7 +529,7 @@ fn op_branch(req: &Request, state: &mut State, ctx: &mut Matc let branch_offset = ctx.count as usize; let next_length = ctx.peek_code(req, branch_offset) as isize; if next_length == 0 { - state.marks_pop_discard(); + state.marks.pop_discard(); return ctx.failure(); } @@ -465,7 +543,7 @@ fn op_branch(req: &Request, state: &mut State, ctx: &mut Matc if state.popped_has_matched { return ctx.success(); } - state.marks_pop_keep(); + state.marks.pop_keep(); create_context(req, state, ctx); } } @@ -502,7 +580,7 @@ fn op_min_repeat_one( return ctx.success(); } - mark!(push, state); + state.marks.push(); create_context(req, state, ctx); fn create_context( @@ -517,7 +595,7 @@ fn op_min_repeat_one( // next_ctx!(from 1, state, ctx, callback); ctx.next_from(1, req, state, callback); } else { - state.marks_pop_discard(); + state.marks.pop_discard(); ctx.failure(); } } @@ -530,13 +608,13 @@ fn op_min_repeat_one( state.string_position = ctx.string_position; if _count(req, state, ctx, 1) == 0 { - state.marks_pop_discard(); + state.marks.pop_discard(); return ctx.failure(); } ctx.skip_char(req, 1); ctx.count += 1; - state.marks_pop_keep(); + state.marks.pop_keep(); create_context(req, state, ctx); } } @@ -570,7 +648,7 @@ fn op_repeat_one(req: &Request, state: &mut State, ctx: &mut return ctx.success(); } - mark!(push, state); + state.marks.push(); ctx.count = count as isize; create_context(req, state, ctx); @@ -587,7 +665,7 @@ fn op_repeat_one(req: &Request, state: &mut State, ctx: &mut let c = ctx.peek_code(req, ctx.peek_code(req, 1) as usize + 2); while ctx.at_end(req) || ctx.peek_char(req) != c { if ctx.count <= min_count { - state.marks_pop_discard(); + state.marks.pop_discard(); return ctx.failure(); } ctx.back_skip_char(req, 1); @@ -610,14 +688,14 @@ fn op_repeat_one(req: &Request, state: &mut State, ctx: &mut let min_count = ctx.peek_code(req, 2) as isize; if ctx.count <= min_count { - state.marks_pop_discard(); + state.marks.pop_discard(); return ctx.failure(); } ctx.back_skip_char(req, 1); ctx.count -= 1; - state.marks_pop_keep(); + state.marks.pop_keep(); create_context(req, state, ctx); } } @@ -680,7 +758,7 @@ fn op_min_until(state: &mut State, ctx: &mut MatchContext) { return; } - mark!(push, state); + state.marks.push(); ctx.count = ctx.repeat_ctx_id as isize; @@ -698,7 +776,7 @@ fn op_min_until(state: &mut State, ctx: &mut MatchContext) { state.string_position = ctx.string_position; - mark!(pop, state); + state.marks.pop(); // match more until tail matches @@ -752,7 +830,7 @@ fn op_max_until(state: &mut State, ctx: &mut MatchContext) { { /* we may have enough matches, but if we can match another item, do so */ - mark!(push, state); + state.marks.push(); ctx.count = repeat_ctx.last_position as isize; repeat_ctx.last_position = state.string_position; @@ -763,11 +841,11 @@ fn op_max_until(state: &mut State, ctx: &mut MatchContext) { repeat_ctx.last_position = save_last_position; if state.popped_has_matched { - state.marks_pop_discard(); + state.marks.pop_discard(); return ctx.success(); } - mark!(pop, state); + state.marks.pop(); repeat_ctx.count -= 1; state.string_position = ctx.string_position; @@ -1087,12 +1165,14 @@ fn general_op_groupref u32>( ctx: &mut MatchContext, mut f: F, ) { - let (group_start, group_end) = state.get_marks(ctx.peek_code(req, 1) as usize); - let (group_start, group_end) = match (group_start, group_end) { - (Some(start), Some(end)) if start <= end => (start, end), - _ => { - return ctx.failure(); - } + let (group_start, group_end) = state.marks.get(ctx.peek_code(req, 1) as usize); + let (group_start, group_end) = if group_start.is_some() + && group_end.is_some() + && group_start.unpack() <= group_end.unpack() + { + (group_start.unpack(), group_end.unpack()) + } else { + return ctx.failure(); }; let mut gctx = MatchContext { diff --git a/tests/tests.rs b/tests/tests.rs index ead111c74a..cb11db3483 100644 --- a/tests/tests.rs +++ b/tests/tests.rs @@ -10,7 +10,7 @@ impl Pattern { string: S, ) -> (engine::Request<'a, S>, engine::State) { let req = engine::Request::new(string, 0, usize::MAX, self.code, false); - let state = engine::State::new(); + let state = engine::State::default(); (req, state) } } @@ -62,13 +62,14 @@ fn test_zerowidth() { #[test] fn test_repeat_context_panic() { + use optional::Optioned; // pattern p = re.compile(r'(?:a*?(xx)??z)*') // START GENERATED by generate_tests.py #[rustfmt::skip] let p = Pattern { code: &[15, 4, 0, 0, 4294967295, 24, 25, 0, 4294967295, 27, 6, 0, 4294967295, 17, 97, 1, 24, 11, 0, 1, 18, 0, 17, 120, 17, 120, 18, 1, 20, 17, 122, 19, 1] }; // END GENERATED let (mut req, mut state) = p.state("axxzaz"); state.pymatch(&mut req); - assert!(state.marks == vec![Some(1), Some(3)]); + assert!(*state.marks == vec![Optioned::some(1), Optioned::some(3)]); } #[test] From 18258000cde2c848198b745e92f74a89d48c7fe8 Mon Sep 17 00:00:00 2001 From: Kangzhi Shi Date: Tue, 9 Aug 2022 22:17:01 +0200 Subject: [PATCH 067/960] clearup --- src/engine.rs | 78 +++++++++++---------------------------------------- 1 file changed, 16 insertions(+), 62 deletions(-) diff --git a/src/engine.rs b/src/engine.rs index 087d64e8cd..652ca04c27 100644 --- a/src/engine.rs +++ b/src/engine.rs @@ -1,6 +1,6 @@ // good luck to those that follow; here be dragons -use super::constants::{SreAtCode, SreCatCode, SreInfo, SreOpcode}; +use super::constants::{SreAtCode, SreCatCode, SreOpcode}; use super::MAXREPEAT; use optional::Optioned; use std::convert::TryFrom; @@ -41,19 +41,6 @@ impl<'a, S: StrDrive> Request<'a, S> { } } -// macro_rules! mark { -// (push, $state:expr) => { -// $state -// .marks_stack -// .push(($state.marks.clone(), $state.lastindex)) -// }; -// (pop, $state:expr) => { -// let (marks, lastindex) = $state.marks_stack.pop().unwrap(); -// $state.marks = marks; -// $state.lastindex = lastindex; -// }; -// } - #[derive(Debug)] pub struct Marks { last_index: isize, @@ -135,6 +122,7 @@ pub struct State { pub marks: Marks, context_stack: Vec>, repeat_stack: Vec, + pub start: usize, pub string_position: usize, next_context: Option>, popped_has_matched: bool, @@ -144,62 +132,30 @@ pub struct State { impl Default for State { fn default() -> Self { Self { - marks: Default::default(), - context_stack: Default::default(), - repeat_stack: Default::default(), - string_position: Default::default(), - next_context: Default::default(), - popped_has_matched: Default::default(), - has_matched: Default::default(), + marks: Marks::default(), + context_stack: Vec::new(), + repeat_stack: Vec::new(), + start: 0, + string_position: 0, + next_context: None, + popped_has_matched: false, + has_matched: false, } } } impl State { - pub fn reset(&mut self, string_position: usize) { + pub fn reset(&mut self, start: usize) { self.marks.clear(); self.context_stack.clear(); self.repeat_stack.clear(); - self.string_position = string_position; + self.start = start; + self.string_position = start; self.next_context = None; self.popped_has_matched = false; self.has_matched = false; } - // fn set_mark(&mut self, mark_nr: usize, position: usize) { - // if mark_nr & 1 != 0 { - // self.lastindex = mark_nr as isize / 2 + 1; - // } - // if mark_nr >= self.marks.len() { - // self.marks.resize(mark_nr + 1, None); - // } - // self.marks[mark_nr] = Some(position); - // } - // fn get_marks(&self, group_index: usize) -> (Option, Option) { - // let marks_index = 2 * group_index; - // if marks_index + 1 < self.marks.len() { - // (self.marks[marks_index], self.marks[marks_index + 1]) - // } else { - // (None, None) - // } - // } - // fn marks_push(&mut self) { - // self.marks_stack.push((self.marks.clone(), self.lastindex)); - // } - // fn marks_pop(&mut self) { - // let (marks, lastindex) = self.marks_stack.pop().unwrap(); - // self.marks = marks; - // self.lastindex = lastindex; - // } - // fn marks_pop_keep(&mut self) { - // let (marks, lastindex) = self.marks_stack.last().unwrap().clone(); - // self.marks = marks; - // self.lastindex = lastindex; - // } - // fn marks_pop_discard(&mut self) { - // self.marks_stack.pop(); - // } - fn _match(&mut self, req: &mut Request) { while let Some(mut ctx) = self.context_stack.pop() { if let Some(handler) = ctx.handler.take() { @@ -225,6 +181,7 @@ impl State { } pub fn pymatch(&mut self, req: &mut Request) { + self.start = req.start; self.string_position = req.start; let ctx = MatchContext { @@ -243,6 +200,7 @@ impl State { } pub fn search(&mut self, req: &mut Request) { + self.start = req.start; self.string_position = req.start; // TODO: optimize by op info and skip prefix @@ -479,7 +437,6 @@ fn op_assert(req: &Request, state: &mut State, ctx: &mut Matc return ctx.failure(); } - // let next_ctx = next_ctx!(offset 3, state, ctx, |req, state, ctx| { let next_ctx = ctx.next_offset(3, state, |req, state, ctx| { if state.popped_has_matched { ctx.skip_code_from(req, 1); @@ -592,7 +549,6 @@ fn op_min_repeat_one( if max_count == MAXREPEAT || ctx.count as usize <= max_count { state.string_position = ctx.string_position; - // next_ctx!(from 1, state, ctx, callback); ctx.next_from(1, req, state, callback); } else { state.marks.pop_discard(); @@ -676,7 +632,6 @@ fn op_repeat_one(req: &Request, state: &mut State, ctx: &mut state.string_position = ctx.string_position; // General case: backtracking - // next_ctx!(from 1, state, ctx, callback); ctx.next_from(1, req, state, callback); } @@ -861,7 +816,6 @@ fn op_max_until(state: &mut State, ctx: &mut MatchContext) { /* cannot match more repeated items here. make sure the tail matches */ - // let next_ctx = next_ctx!(offset 1, state, ctx, tail_callback); let repeat_ctx_prev_id = repeat_ctx.prev_id; let next_ctx = ctx.next_offset(1, state, tail_callback); next_ctx.repeat_ctx_id = repeat_ctx_prev_id; @@ -965,7 +919,7 @@ struct MatchContext { count: isize, } -impl<'a, S: StrDrive> std::fmt::Debug for MatchContext { +impl std::fmt::Debug for MatchContext { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { f.debug_struct("MatchContext") .field("string_position", &self.string_position) From e42df1d8597bf964fb7f70b606df9e0be696c624 Mon Sep 17 00:00:00 2001 From: Kangzhi Shi Date: Tue, 9 Aug 2022 22:18:01 +0200 Subject: [PATCH 068/960] update version to 0.4.0 --- Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index 8993c1e71d..373166d6db 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "sre-engine" -version = "0.3.1" +version = "0.4.0" authors = ["Kangzhi Shi ", "RustPython Team"] description = "A low-level implementation of Python's SRE regex engine" repository = "https://github.com/RustPython/sre-engine" From c4f10edc95ab7dc1f20bed2c9b4bddfc520fa660 Mon Sep 17 00:00:00 2001 From: Kangzhi Shi Date: Sun, 14 Aug 2022 20:46:20 +0200 Subject: [PATCH 069/960] impl opinfo single literal --- benches/benches.rs | 24 +++---- src/engine.rs | 173 +++++++++++++++++++++++---------------------- tests/tests.rs | 51 +++++++++---- 3 files changed, 139 insertions(+), 109 deletions(-) diff --git a/benches/benches.rs b/benches/benches.rs index f19b92d64b..604cf91f42 100644 --- a/benches/benches.rs +++ b/benches/benches.rs @@ -92,29 +92,29 @@ fn benchmarks(b: &mut Bencher) { b.iter(move || { for (p, s) in &tests { - let (mut req, mut state) = p.state(s.clone()); - state.search(&mut req); + let (req, mut state) = p.state(s.clone()); + state.search(req); assert!(state.has_matched); - let (mut req, mut state) = p.state(s.clone()); - state.pymatch(&mut req); + let (req, mut state) = p.state(s.clone()); + state.pymatch(req); assert!(state.has_matched); let (mut req, mut state) = p.state(s.clone()); req.match_all = true; - state.pymatch(&mut req); + state.pymatch(req); assert!(state.has_matched); let s2 = format!("{}{}{}", " ".repeat(10000), s, " ".repeat(10000)); - let (mut req, mut state) = p.state_range(s2.as_str(), 0..usize::MAX); - state.search(&mut req); + let (req, mut state) = p.state_range(s2.as_str(), 0..usize::MAX); + state.search(req); assert!(state.has_matched); - let (mut req, mut state) = p.state_range(s2.as_str(), 10000..usize::MAX); - state.pymatch(&mut req); + let (req, mut state) = p.state_range(s2.as_str(), 10000..usize::MAX); + state.pymatch(req); assert!(state.has_matched); - let (mut req, mut state) = p.state_range(s2.as_str(), 10000..10000 + s.len()); - state.pymatch(&mut req); + let (req, mut state) = p.state_range(s2.as_str(), 10000..10000 + s.len()); + state.pymatch(req); assert!(state.has_matched); let (mut req, mut state) = p.state_range(s2.as_str(), 10000..10000 + s.len()); req.match_all = true; - state.pymatch(&mut req); + state.pymatch(req); assert!(state.has_matched); } }) diff --git a/src/engine.rs b/src/engine.rs index 652ca04c27..0d645e046d 100644 --- a/src/engine.rs +++ b/src/engine.rs @@ -1,5 +1,7 @@ // good luck to those that follow; here be dragons +use crate::constants::SreInfo; + use super::constants::{SreAtCode, SreCatCode, SreOpcode}; use super::MAXREPEAT; use optional::Optioned; @@ -10,6 +12,7 @@ const fn is_py_ascii_whitespace(b: u8) -> bool { matches!(b, b'\t' | b'\n' | b'\x0C' | b'\r' | b' ' | b'\x0B') } +#[derive(Debug, Clone, Copy)] pub struct Request<'a, S: StrDrive> { pub string: S, pub start: usize, @@ -180,7 +183,7 @@ impl State { self.has_matched = self.popped_has_matched; } - pub fn pymatch(&mut self, req: &mut Request) { + pub fn pymatch(&mut self, mut req: Request) { self.start = req.start; self.string_position = req.start; @@ -196,10 +199,10 @@ impl State { }; self.context_stack.push(ctx); - self._match(req); + self._match(&mut req); } - pub fn search(&mut self, req: &mut Request) { + pub fn search(&mut self, mut req: Request) { self.start = req.start; self.string_position = req.start; @@ -208,12 +211,11 @@ impl State { return; } - // let start = self.start; - // let end = self.end; + let mut end = req.end; let mut start_offset = req.string.offset(0, req.start); - let ctx = MatchContext { + let mut ctx = MatchContext { string_position: req.start, string_offset: start_offset, code_position: 0, @@ -224,35 +226,97 @@ impl State { count: -1, }; - // if ctx.peek_code(self, 0) == SreOpcode::INFO as u32 { - // search_op_info(self, &mut ctx); - // if let Some(has_matched) = ctx.has_matched { - // self.has_matched = has_matched; - // return; - // } - // } + if ctx.peek_code(&req, 0) == SreOpcode::INFO as u32 { + /* optimization info block */ + /* <1=skip> <2=flags> <3=min> <4=max> <5=prefix info> */ + let req = &mut req; + let min = ctx.peek_code(req, 3) as usize; + + if ctx.remaining_chars(req) < min { + return; + } + + if min > 1 { + /* adjust end point (but make sure we leave at least one + character in there, so literal search will work) */ + // no overflow can happen as remaining chars >= min + end -= min - 1; + + // adjust ctx position + if end < ctx.string_position { + ctx.string_position = end; + ctx.string_offset = req.string.offset(0, ctx.string_position); + } + } + + let flags = SreInfo::from_bits_truncate(ctx.peek_code(req, 2)); + + if flags.contains(SreInfo::PREFIX) { + /* pattern starts with a known prefix */ + /* */ + let len = ctx.peek_code(req, 5) as usize; + let skip = ctx.peek_code(req, 6) as usize; + let prefix = &ctx.pattern(req)[7..]; + let overlap = &prefix[len - 1..]; + + if len == 1 { + // pattern starts with a literal character + ctx.skip_code_from(req, 1); + let c = prefix[0]; + req.must_advance = false; + + while !ctx.at_end(req) { + // find the next matched literal + while ctx.peek_char(req) != c { + ctx.skip_char(req, 1); + if ctx.at_end(req) { + return; + } + } + + req.start = ctx.string_position; + self.reset(req.start); + // self.start = ctx.string_position; + self.string_position += skip; + + // literal only + if flags.contains(SreInfo::LITERAL) { + self.has_matched = true; + return; + } + + let mut next_ctx = ctx; + next_ctx.skip_char(req, skip); + next_ctx.skip_code(2 * skip); + + self.context_stack.push(next_ctx); + self._match(req); + + if self.has_matched { + return; + } + + ctx.skip_char(req, 1); + } + return; + } + } + } self.context_stack.push(ctx); - self._match(req); + self._match(&mut req); req.must_advance = false; - while !self.has_matched && req.start < req.end { + ctx.toplevel = false; + while !self.has_matched && req.start < end { req.start += 1; start_offset = req.string.offset(start_offset, 1); self.reset(req.start); + ctx.string_position = req.start; + ctx.string_offset = start_offset; - let ctx = MatchContext { - string_position: req.start, - string_offset: start_offset, - code_position: 0, - has_matched: None, - toplevel: false, - handler: None, - repeat_ctx_id: usize::MAX, - count: -1, - }; self.context_stack.push(ctx); - self._match(req); + self._match(&mut req); } } } @@ -372,63 +436,6 @@ fn dispatch( } } -/* optimization info block */ -/* <1=skip> <2=flags> <3=min> <4=max> <5=prefix info> */ -// fn search_op_info<'a, S: StrDrive>(state: &mut State<'a, S>, ctx: &mut MatchContext<'a, S>) { -// let min = ctx.peek_code(state, 3) as usize; - -// if ctx.remaining_chars(state) < min { -// return ctx.failure(); -// } - -// if min > 1 { -// /* adjust end point (but make sure we leave at least one -// character in there, so literal search will work) */ -// // no overflow can happen as remaining chars >= min -// state.end -= min - 1; - -// // adjust ctx position -// if state.end < ctx.string_position { -// ctx.string_position = state.end; -// ctx.string_offset = state.string.offset(0, ctx.string_position); -// } -// } - -// let flags = SreInfo::from_bits_truncate(ctx.peek_code(state, 2)); - -// if flags.contains(SreInfo::PREFIX) { -// /* pattern starts with a known prefix */ -// /* */ -// let len = ctx.peek_code(state, 5) as usize; -// let skip = ctx.peek_code(state, 6) as usize; -// let prefix = &ctx.pattern(state)[7..]; -// let overlap = &prefix[len - 1..]; - -// ctx.skip_code_from(state, 1); - -// if len == 1 { -// // pattern starts with a literal character -// let c = prefix[0]; -// let end = state.end; - -// while (!ctx.at_end(state)) { -// // find the next matched literal -// while (ctx.peek_char(state) != c) { -// ctx.skip_char(state, 1); -// if (ctx.at_end(state)) { -// return ctx.failure(); -// } -// } - -// // literal only -// if flags.contains(SreInfo::LITERAL) { -// return ctx.success(); -// } -// } -// } -// } -// } - /* assert subpattern */ /* */ fn op_assert(req: &Request, state: &mut State, ctx: &mut MatchContext) { diff --git a/tests/tests.rs b/tests/tests.rs index cb11db3483..31f032f0a4 100644 --- a/tests/tests.rs +++ b/tests/tests.rs @@ -21,8 +21,8 @@ fn test_2427() { // START GENERATED by generate_tests.py #[rustfmt::skip] let lookbehind = Pattern { code: &[15, 4, 0, 1, 1, 5, 5, 1, 17, 46, 1, 17, 120, 6, 10, 1] }; // END GENERATED - let (mut req, mut state) = lookbehind.state("x"); - state.pymatch(&mut req); + let (req, mut state) = lookbehind.state("x"); + state.pymatch(req); assert!(state.has_matched); } @@ -32,8 +32,8 @@ fn test_assert() { // START GENERATED by generate_tests.py #[rustfmt::skip] let positive_lookbehind = Pattern { code: &[15, 4, 0, 3, 3, 4, 9, 3, 17, 97, 17, 98, 17, 99, 1, 17, 100, 17, 101, 17, 102, 1] }; // END GENERATED - let (mut req, mut state) = positive_lookbehind.state("abcdef"); - state.search(&mut req); + let (req, mut state) = positive_lookbehind.state("abcdef"); + state.search(req); assert!(state.has_matched); } @@ -43,8 +43,8 @@ fn test_string_boundaries() { // START GENERATED by generate_tests.py #[rustfmt::skip] let big_b = Pattern { code: &[15, 4, 0, 0, 0, 6, 11, 1] }; // END GENERATED - let (mut req, mut state) = big_b.state(""); - state.search(&mut req); + let (req, mut state) = big_b.state(""); + state.search(req); assert!(!state.has_matched); } @@ -56,8 +56,8 @@ fn test_zerowidth() { // END GENERATED let (mut req, mut state) = p.state("a:"); req.must_advance = true; - state.search(&mut req); - assert!(state.string_position == 1); + state.search(req); + assert_eq!(state.string_position, 1); } #[test] @@ -67,9 +67,9 @@ fn test_repeat_context_panic() { // START GENERATED by generate_tests.py #[rustfmt::skip] let p = Pattern { code: &[15, 4, 0, 0, 4294967295, 24, 25, 0, 4294967295, 27, 6, 0, 4294967295, 17, 97, 1, 24, 11, 0, 1, 18, 0, 17, 120, 17, 120, 18, 1, 20, 17, 122, 19, 1] }; // END GENERATED - let (mut req, mut state) = p.state("axxzaz"); - state.pymatch(&mut req); - assert!(*state.marks == vec![Optioned::some(1), Optioned::some(3)]); + let (req, mut state) = p.state("axxzaz"); + state.pymatch(req); + assert_eq!(*state.marks, vec![Optioned::some(1), Optioned::some(3)]); } #[test] @@ -78,7 +78,30 @@ fn test_double_max_until() { // START GENERATED by generate_tests.py #[rustfmt::skip] let p = Pattern { code: &[15, 4, 0, 0, 4294967295, 24, 18, 0, 4294967295, 18, 0, 24, 9, 0, 1, 18, 2, 17, 49, 18, 3, 19, 18, 1, 19, 1] }; // END GENERATED - let (mut req, mut state) = p.state("1111"); - state.pymatch(&mut req); - assert!(state.string_position == 4); + let (req, mut state) = p.state("1111"); + state.pymatch(req); + assert_eq!(state.string_position, 4); +} + +#[test] +fn test_info_single() { + // pattern p = re.compile(r'aa*') + // START GENERATED by generate_tests.py + #[rustfmt::skip] let p = Pattern { code: &[15, 8, 1, 1, 4294967295, 1, 1, 97, 0, 17, 97, 25, 6, 0, 4294967295, 17, 97, 1, 1] }; + // END GENERATED + let (req, mut state) = p.state("baaaa"); + state.search(req); + assert_eq!(state.start, 1); + assert_eq!(state.string_position, 5); +} + +#[test] +fn test_info_single2() { + // pattern p = re.compile(r'Python|Perl') + // START GENERATED by generate_tests.py + #[rustfmt::skip] let p = Pattern { code: &[15, 8, 1, 4, 6, 1, 1, 80, 0, 17, 80, 7, 13, 17, 121, 17, 116, 17, 104, 17, 111, 17, 110, 16, 11, 9, 17, 101, 17, 114, 17, 108, 16, 2, 0, 1] }; + // END GENERATED + let (req, mut state) = p.state("Perl"); + state.search(req); + assert!(state.has_matched); } From 236631141fa3d2681e9a53eb3550c96219cb0cf7 Mon Sep 17 00:00:00 2001 From: Kangzhi Shi Date: Mon, 15 Aug 2022 19:55:48 +0200 Subject: [PATCH 070/960] impl opinfo literal --- src/engine.rs | 81 +++++++++++++++++++++++++++++++++++++++++++++----- tests/tests.rs | 22 ++++++++++++++ 2 files changed, 96 insertions(+), 7 deletions(-) diff --git a/src/engine.rs b/src/engine.rs index 0d645e046d..53181c5043 100644 --- a/src/engine.rs +++ b/src/engine.rs @@ -256,13 +256,17 @@ impl State { /* */ let len = ctx.peek_code(req, 5) as usize; let skip = ctx.peek_code(req, 6) as usize; - let prefix = &ctx.pattern(req)[7..]; - let overlap = &prefix[len - 1..]; + let prefix = &ctx.pattern(req)[7..7 + len]; + let overlap = &ctx.pattern(req)[7 + len - 1..7 + len * 2]; if len == 1 { // pattern starts with a literal character - ctx.skip_code_from(req, 1); let c = prefix[0]; + + // code_position ready for tail match + ctx.skip_code_from(req, 1); + ctx.skip_code(2 * skip); + req.must_advance = false; while !ctx.at_end(req) { @@ -275,9 +279,8 @@ impl State { } req.start = ctx.string_position; - self.reset(req.start); - // self.start = ctx.string_position; - self.string_position += skip; + self.start = ctx.string_position; + self.string_position = ctx.string_position + skip; // literal only if flags.contains(SreInfo::LITERAL) { @@ -287,7 +290,6 @@ impl State { let mut next_ctx = ctx; next_ctx.skip_char(req, skip); - next_ctx.skip_code(2 * skip); self.context_stack.push(next_ctx); self._match(req); @@ -297,6 +299,71 @@ impl State { } ctx.skip_char(req, 1); + self.marks.clear(); + } + return; + } else if len > 1 { + // code_position ready for tail match + ctx.skip_code_from(req, 1); + ctx.skip_code(2 * skip); + + req.must_advance = false; + + while !ctx.at_end(req) { + let c = prefix[0]; + while ctx.peek_char(req) != c { + ctx.skip_char(req, 1); + if ctx.at_end(req) { + return; + } + } + ctx.skip_char(req, 1); + if ctx.at_end(req) { + return; + } + + let mut i = 1; + loop { + if ctx.peek_char(req) == prefix[i] { + i += 1; + if i != len { + ctx.skip_char(req, 1); + if ctx.at_end(req) { + return; + } + continue; + } + + req.start = ctx.string_position - (len - 1); + self.start = req.start; + self.string_position = self.start + skip; + + if flags.contains(SreInfo::LITERAL) { + self.has_matched = true; + return; + } + + let mut next_ctx = ctx; + // next_ctx.skip_char(req, 1); + next_ctx.string_position = self.string_position; + next_ctx.string_offset = req.string.offset(0, self.string_position); + self.context_stack.push(next_ctx); + self._match(req); + if self.has_matched { + return; + } + + ctx.skip_char(req, 1); + if ctx.at_end(req) { + return; + } + self.marks.clear(); + } + i = overlap[i] as usize; + if i == 0 { + break; + } + } } return; } diff --git a/tests/tests.rs b/tests/tests.rs index 31f032f0a4..21bc89d40c 100644 --- a/tests/tests.rs +++ b/tests/tests.rs @@ -105,3 +105,25 @@ fn test_info_single2() { state.search(req); assert!(state.has_matched); } + +#[test] +fn test_info_literal() { + // pattern p = re.compile(r'ababc+') + // START GENERATED by generate_tests.py + #[rustfmt::skip] let p = Pattern { code: &[15, 14, 1, 5, 4294967295, 4, 4, 97, 98, 97, 98, 0, 0, 1, 2, 17, 97, 17, 98, 17, 97, 17, 98, 25, 6, 1, 4294967295, 17, 99, 1, 1] }; + // END GENERATED + let (req, mut state) = p.state("!ababc"); + state.search(req); + assert!(state.has_matched); +} + +#[test] +fn test_info_literal2() { + // pattern p = re.compile(r'(python)\1') + // START GENERATED by generate_tests.py + #[rustfmt::skip] let p = Pattern { code: &[15, 18, 1, 12, 12, 6, 0, 112, 121, 116, 104, 111, 110, 0, 0, 0, 0, 0, 0, 18, 0, 17, 112, 17, 121, 17, 116, 17, 104, 17, 111, 17, 110, 18, 1, 12, 0, 1] }; + // END GENERATED + let (req, mut state) = p.state("pythonpython"); + state.search(req); + assert!(state.has_matched); +} \ No newline at end of file From 646c8ac6578977b847e8f871e1f83fb422b3e39a Mon Sep 17 00:00:00 2001 From: Kangzhi Shi Date: Mon, 15 Aug 2022 20:09:51 +0200 Subject: [PATCH 071/960] impl opinfo charset --- src/engine.rs | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/src/engine.rs b/src/engine.rs index 53181c5043..fe7ee438e9 100644 --- a/src/engine.rs +++ b/src/engine.rs @@ -367,6 +367,31 @@ impl State { } return; } + } else if flags.contains(SreInfo::CHARSET) { + let set = &ctx.pattern(req)[5..]; + ctx.skip_code_from(req, 1); + req.must_advance = false; + loop { + while !ctx.at_end(req) && !charset(set, ctx.peek_char(req)) { + ctx.skip_char(req, 1); + } + if ctx.at_end(req) { + return; + } + req.start = ctx.string_position; + self.start = ctx.string_position; + self.string_position = ctx.string_position; + + self.context_stack.push(ctx); + self._match(req); + + if self.has_matched { + return; + } + + ctx.skip_char(req, 1); + self.marks.clear(); + } } } From 7e7b9734810947155876cc52028d873ba953b5f4 Mon Sep 17 00:00:00 2001 From: Kangzhi Shi Date: Mon, 15 Aug 2022 20:36:32 +0200 Subject: [PATCH 072/960] clearup --- src/engine.rs | 301 +++++++++++++++++++++++++++----------------------- 1 file changed, 163 insertions(+), 138 deletions(-) diff --git a/src/engine.rs b/src/engine.rs index fe7ee438e9..7a644f0671 100644 --- a/src/engine.rs +++ b/src/engine.rs @@ -252,147 +252,16 @@ impl State { let flags = SreInfo::from_bits_truncate(ctx.peek_code(req, 2)); if flags.contains(SreInfo::PREFIX) { - /* pattern starts with a known prefix */ - /* */ - let len = ctx.peek_code(req, 5) as usize; - let skip = ctx.peek_code(req, 6) as usize; - let prefix = &ctx.pattern(req)[7..7 + len]; - let overlap = &ctx.pattern(req)[7 + len - 1..7 + len * 2]; - - if len == 1 { - // pattern starts with a literal character - let c = prefix[0]; - - // code_position ready for tail match - ctx.skip_code_from(req, 1); - ctx.skip_code(2 * skip); - - req.must_advance = false; - - while !ctx.at_end(req) { - // find the next matched literal - while ctx.peek_char(req) != c { - ctx.skip_char(req, 1); - if ctx.at_end(req) { - return; - } - } - - req.start = ctx.string_position; - self.start = ctx.string_position; - self.string_position = ctx.string_position + skip; - - // literal only - if flags.contains(SreInfo::LITERAL) { - self.has_matched = true; - return; - } - - let mut next_ctx = ctx; - next_ctx.skip_char(req, skip); - - self.context_stack.push(next_ctx); - self._match(req); - - if self.has_matched { - return; - } - - ctx.skip_char(req, 1); - self.marks.clear(); - } - return; - } else if len > 1 { - // code_position ready for tail match - ctx.skip_code_from(req, 1); - ctx.skip_code(2 * skip); - - req.must_advance = false; - - while !ctx.at_end(req) { - let c = prefix[0]; - while ctx.peek_char(req) != c { - ctx.skip_char(req, 1); - if ctx.at_end(req) { - return; - } - } - ctx.skip_char(req, 1); - if ctx.at_end(req) { - return; - } - - let mut i = 1; - loop { - if ctx.peek_char(req) == prefix[i] { - i += 1; - if i != len { - ctx.skip_char(req, 1); - if ctx.at_end(req) { - return; - } - continue; - } - - req.start = ctx.string_position - (len - 1); - self.start = req.start; - self.string_position = self.start + skip; - - if flags.contains(SreInfo::LITERAL) { - self.has_matched = true; - return; - } - - let mut next_ctx = ctx; - // next_ctx.skip_char(req, 1); - next_ctx.string_position = self.string_position; - next_ctx.string_offset = req.string.offset(0, self.string_position); - self.context_stack.push(next_ctx); - self._match(req); - if self.has_matched { - return; - } - - ctx.skip_char(req, 1); - if ctx.at_end(req) { - return; - } - self.marks.clear(); - } - i = overlap[i] as usize; - if i == 0 { - break; - } - } - } - return; + if flags.contains(SreInfo::LITERAL) { + search_info_literal::(req, self, ctx); + } else { + search_info_literal::(req, self, ctx); } + return; } else if flags.contains(SreInfo::CHARSET) { - let set = &ctx.pattern(req)[5..]; - ctx.skip_code_from(req, 1); - req.must_advance = false; - loop { - while !ctx.at_end(req) && !charset(set, ctx.peek_char(req)) { - ctx.skip_char(req, 1); - } - if ctx.at_end(req) { - return; - } - req.start = ctx.string_position; - self.start = ctx.string_position; - self.string_position = ctx.string_position; - - self.context_stack.push(ctx); - self._match(req); - - if self.has_matched { - return; - } - - ctx.skip_char(req, 1); - self.marks.clear(); - } + return search_info_charset(req, self, ctx); } + // fallback to general search } self.context_stack.push(ctx); @@ -528,6 +397,162 @@ fn dispatch( } } +fn search_info_literal( + req: &mut Request, + state: &mut State, + mut ctx: MatchContext, +) { + /* pattern starts with a known prefix */ + /* */ + let len = ctx.peek_code(req, 5) as usize; + let skip = ctx.peek_code(req, 6) as usize; + let prefix = &ctx.pattern(req)[7..7 + len]; + let overlap = &ctx.pattern(req)[7 + len - 1..7 + len * 2]; + + // code_position ready for tail match + ctx.skip_code_from(req, 1); + ctx.skip_code(2 * skip); + + req.must_advance = false; + + if len == 1 { + // pattern starts with a literal character + let c = prefix[0]; + + while !ctx.at_end(req) { + // find the next matched literal + while ctx.peek_char(req) != c { + ctx.skip_char(req, 1); + if ctx.at_end(req) { + return; + } + } + + req.start = ctx.string_position; + state.start = ctx.string_position; + state.string_position = ctx.string_position + skip; + + // literal only + if LITERAL { + state.has_matched = true; + return; + } + + let mut next_ctx = ctx; + next_ctx.skip_char(req, skip); + + state.context_stack.push(next_ctx); + state._match(req); + + if state.has_matched { + return; + } + + ctx.skip_char(req, 1); + state.marks.clear(); + } + } else { + while !ctx.at_end(req) { + let c = prefix[0]; + while ctx.peek_char(req) != c { + ctx.skip_char(req, 1); + if ctx.at_end(req) { + return; + } + } + ctx.skip_char(req, 1); + if ctx.at_end(req) { + return; + } + + let mut i = 1; + loop { + if ctx.peek_char(req) == prefix[i] { + i += 1; + if i != len { + ctx.skip_char(req, 1); + if ctx.at_end(req) { + return; + } + continue; + } + + req.start = ctx.string_position - (len - 1); + state.start = req.start; + state.string_position = state.start + skip; + + // literal only + if LITERAL { + state.has_matched = true; + return; + } + + let mut next_ctx = ctx; + if skip != 0 { + next_ctx.skip_char(req, 1); + } else { + next_ctx.string_position = state.string_position; + next_ctx.string_offset = req.string.offset(0, state.string_position); + } + + state.context_stack.push(next_ctx); + state._match(req); + + if state.has_matched { + return; + } + + ctx.skip_char(req, 1); + if ctx.at_end(req) { + return; + } + state.marks.clear(); + } + + i = overlap[i] as usize; + if i == 0 { + break; + } + } + } + } +} + +fn search_info_charset( + req: &mut Request, + state: &mut State, + mut ctx: MatchContext, +) { + let set = &ctx.pattern(req)[5..]; + + ctx.skip_code_from(req, 1); + + req.must_advance = false; + + loop { + while !ctx.at_end(req) && !charset(set, ctx.peek_char(req)) { + ctx.skip_char(req, 1); + } + if ctx.at_end(req) { + return; + } + + req.start = ctx.string_position; + state.start = ctx.string_position; + state.string_position = ctx.string_position; + + state.context_stack.push(ctx); + state._match(req); + + if state.has_matched { + return; + } + + ctx.skip_char(req, 1); + state.marks.clear(); + } +} + /* assert subpattern */ /* */ fn op_assert(req: &Request, state: &mut State, ctx: &mut MatchContext) { From 26a78dbaa4e4e78f1e5d8dcea9c32054ae07fdd3 Mon Sep 17 00:00:00 2001 From: Kangzhi Shi Date: Mon, 15 Aug 2022 20:36:46 +0200 Subject: [PATCH 073/960] update to 0.4.1 --- Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index 373166d6db..53524f1446 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "sre-engine" -version = "0.4.0" +version = "0.4.1" authors = ["Kangzhi Shi ", "RustPython Team"] description = "A low-level implementation of Python's SRE regex engine" repository = "https://github.com/RustPython/sre-engine" From 4e6b27144a407bb4daf341048310cad26570a7b3 Mon Sep 17 00:00:00 2001 From: Kangzhi Shi Date: Mon, 15 Aug 2022 21:30:40 +0200 Subject: [PATCH 074/960] introduce SearchIter --- src/engine.rs | 26 ++++++++++++++++++++++++++ tests/tests.rs | 2 +- 2 files changed, 27 insertions(+), 1 deletion(-) diff --git a/src/engine.rs b/src/engine.rs index 7a644f0671..7334516c2f 100644 --- a/src/engine.rs +++ b/src/engine.rs @@ -282,6 +282,32 @@ impl State { } } +pub struct SearchIter<'a, S: StrDrive> { + pub req: Request<'a, S>, + pub state: State, +} + +impl<'a, S: StrDrive> Iterator for SearchIter<'a, S> { + type Item = (); + + fn next(&mut self) -> Option { + if self.req.start > self.req.end { + return None; + } + + self.state.reset(self.req.start); + self.state.search(self.req); + if !self.state.has_matched { + return None; + } + + self.req.must_advance = self.state.string_position == self.state.start; + self.req.start = self.state.string_position; + + Some(()) + } +} + fn dispatch( req: &Request, state: &mut State, diff --git a/tests/tests.rs b/tests/tests.rs index 21bc89d40c..5212226f4e 100644 --- a/tests/tests.rs +++ b/tests/tests.rs @@ -126,4 +126,4 @@ fn test_info_literal2() { let (req, mut state) = p.state("pythonpython"); state.search(req); assert!(state.has_matched); -} \ No newline at end of file +} From df6831694b4f4b2c0672280b66b0d7e8500c9f9c Mon Sep 17 00:00:00 2001 From: Jeong YunWon Date: Fri, 21 Apr 2023 18:18:11 +0900 Subject: [PATCH 075/960] Adding more issue templates --- .github/ISSUE_TEMPLATE/empty.md | 16 +++++++++++++ .github/ISSUE_TEMPLATE/feature-request.md | 16 +++++++++++++ .github/ISSUE_TEMPLATE/report-bug.md | 24 +++++++++++++++++++ .../ISSUE_TEMPLATE/report-incompatibility.md | 4 ++-- 4 files changed, 58 insertions(+), 2 deletions(-) create mode 100644 .github/ISSUE_TEMPLATE/empty.md create mode 100644 .github/ISSUE_TEMPLATE/feature-request.md create mode 100644 .github/ISSUE_TEMPLATE/report-bug.md diff --git a/.github/ISSUE_TEMPLATE/empty.md b/.github/ISSUE_TEMPLATE/empty.md new file mode 100644 index 0000000000..6cdafc6653 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/empty.md @@ -0,0 +1,16 @@ +--- +name: Generic issue template +about: which is not covered by other templates +title: '' +labels: +assignees: '' + +--- + +## Summary + + + +## Details + + diff --git a/.github/ISSUE_TEMPLATE/feature-request.md b/.github/ISSUE_TEMPLATE/feature-request.md new file mode 100644 index 0000000000..cb47cb1744 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/feature-request.md @@ -0,0 +1,16 @@ +--- +name: Feature request +about: Request a feature to use RustPython (as a Rust library) +title: '' +labels: C-enhancement +assignees: 'youknowone' + +--- + +## Summary + + + +## Expected use case + + diff --git a/.github/ISSUE_TEMPLATE/report-bug.md b/.github/ISSUE_TEMPLATE/report-bug.md new file mode 100644 index 0000000000..f25b035232 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/report-bug.md @@ -0,0 +1,24 @@ +--- +name: Report bugs +about: Report a bug not related to CPython compatibility +title: '' +labels: C-bug +assignees: '' + +--- + +## Summary + + + +## Expected + + + +## Actual + + + +## Python Documentation + + diff --git a/.github/ISSUE_TEMPLATE/report-incompatibility.md b/.github/ISSUE_TEMPLATE/report-incompatibility.md index e917e94326..d8e50a75ce 100644 --- a/.github/ISSUE_TEMPLATE/report-incompatibility.md +++ b/.github/ISSUE_TEMPLATE/report-incompatibility.md @@ -2,7 +2,7 @@ name: Report incompatibility about: Report an incompatibility between RustPython and CPython title: '' -labels: feat +labels: C-compat assignees: '' --- @@ -11,6 +11,6 @@ assignees: '' -## Python Documentation +## Python Documentation or reference to CPython source code From f2cfa5f0a7766f756e8d6d9d85fa5c441137149b Mon Sep 17 00:00:00 2001 From: Jeong YunWon Date: Sat, 6 May 2023 03:09:01 +0900 Subject: [PATCH 076/960] Split parser to new repository https://github.com/RustPython/Parser/commit/6b60f85cc4ca248af9b787697c41be2adb7a3ec8 --- .github/workflows/ci.yaml | 87 +- Cargo.lock | 36 +- Cargo.toml | 15 +- common/Cargo.toml | 2 +- compiler/Cargo.toml | 4 +- compiler/ast/Cargo.toml | 20 - compiler/ast/Python.asdl | 145 - compiler/ast/asdl.py | 385 - compiler/ast/asdl_rs.py | 749 -- compiler/ast/src/ast_gen.rs | 1320 --- compiler/ast/src/constant.rs | 239 - compiler/ast/src/fold_helpers.rs | 65 - compiler/ast/src/impls.rs | 60 - compiler/ast/src/lib.rs | 12 - compiler/ast/src/unparse.rs | 536 -- compiler/codegen/Cargo.toml | 6 +- compiler/core/Cargo.toml | 18 - compiler/core/src/bytecode.rs | 1610 ---- compiler/core/src/error.rs | 116 - compiler/core/src/lib.rs | 13 - compiler/core/src/location.rs | 128 - compiler/core/src/marshal.rs | 628 -- compiler/core/src/mode.rs | 30 - compiler/literal/Cargo.toml | 18 - compiler/literal/src/char.rs | 15 - compiler/literal/src/escape.rs | 414 - compiler/literal/src/float.rs | 310 - compiler/literal/src/format.rs | 5 - compiler/literal/src/lib.rs | 4 - compiler/parser/Cargo.toml | 40 - compiler/parser/README.md | 66 - compiler/parser/build.rs | 156 - compiler/parser/python.lalrpop | 1884 ---- compiler/parser/src/context.rs | 177 - compiler/parser/src/function.rs | 229 - compiler/parser/src/lexer.rs | 1794 ---- compiler/parser/src/lib.rs | 135 - compiler/parser/src/mode.rs | 55 - compiler/parser/src/parser.rs | 828 -- compiler/parser/src/python.rs | 3 - ...rser__context__tests__ann_assign_name.snap | 77 - ...ser__context__tests__assign_attribute.snap | 133 - ...on_parser__context__tests__assign_for.snap | 131 - ...n_parser__context__tests__assign_list.snap | 151 - ...ser__context__tests__assign_list_comp.snap | 171 - ...n_parser__context__tests__assign_name.snap | 116 - ...er__context__tests__assign_named_expr.snap | 89 - ...rser__context__tests__assign_set_comp.snap | 171 - ...arser__context__tests__assign_starred.snap | 167 - ...ser__context__tests__assign_subscript.snap | 149 - ..._parser__context__tests__assign_tuple.snap | 151 - ...n_parser__context__tests__assign_with.snap | 80 - ..._context__tests__aug_assign_attribute.snap | 131 - ...rser__context__tests__aug_assign_name.snap | 58 - ..._context__tests__aug_assign_subscript.snap | 147 - ...parser__context__tests__del_attribute.snap | 57 - ...thon_parser__context__tests__del_name.snap | 40 - ...parser__context__tests__del_subscript.snap | 73 - ...unction__tests__function_kw_only_args.snap | 107 - ...__function_kw_only_args_with_defaults.snap | 146 - ...er__function__tests__function_no_args.snap | 52 - ..._tests__function_pos_and_kw_only_args.snap | 162 - ...on_pos_and_kw_only_args_with_defaults.snap | 201 - ...w_only_args_with_defaults_and_varargs.snap | 220 - ..._with_defaults_and_varargs_and_kwargs.snap | 239 - ...r__function__tests__function_pos_args.snap | 107 - ...ests__function_pos_args_with_defaults.snap | 146 - ..._function__tests__lambda_kw_only_args.snap | 121 - ...ts__lambda_kw_only_args_with_defaults.snap | 160 - ...rser__function__tests__lambda_no_args.snap | 66 - ...n__tests__lambda_pos_and_kw_only_args.snap | 158 - ...ser__function__tests__lambda_pos_args.snap | 121 - ..._tests__lambda_pos_args_with_defaults.snap | 160 - ...parser__parser__tests__dict_unpacking.snap | 121 - ..._tests__generator_expression_argument.snap | 333 - ...stpython_parser__parser__tests__match.snap | 949 -- ...r__parser__tests__match_as_identifier.snap | 1910 ---- ...ser__parser__tests__parse_bool_op_and.snap | 56 - ...rser__parser__tests__parse_bool_op_or.snap | 56 - ...on_parser__parser__tests__parse_class.snap | 226 - ...rser__tests__parse_dict_comprehension.snap | 93 - ...ests__parse_double_list_comprehension.snap | 262 - ...on_parser__parser__tests__parse_empty.snap | 5 - ...parser__parser__tests__parse_f_string.snap | 57 - ..._tests__parse_generator_comprehension.snap | 76 - ...er__parser__tests__parse_if_elif_else.snap | 184 - ...parse_if_else_generator_comprehension.snap | 125 - ...n_parser__parser__tests__parse_kwargs.snap | 113 - ...n_parser__parser__tests__parse_lambda.snap | 132 - ...rser__tests__parse_list_comprehension.snap | 76 - ...ed_expression_generator_comprehension.snap | 143 - ..._parser__parser__tests__parse_print_2.snap | 94 - ...ser__parser__tests__parse_print_hello.snap | 75 - ...n_parser__parser__tests__parse_string.snap | 40 - ...n_parser__parser__tests__parse_tuples.snap | 132 - ...stpython_parser__parser__tests__patma.snap | 8254 ----------------- ...stpython_parser__parser__tests__slice.snap | 115 - ...hon_parser__parser__tests__star_index.snap | 641 -- ...rustpython_parser__parser__tests__try.snap | 487 - ...ython_parser__parser__tests__try_star.snap | 861 -- ...ser__parser__tests__variadic_generics.snap | 188 - ...parser__parser__tests__with_statement.snap | 2485 ----- ...arser__string__tests__backspace_alias.snap | 40 - ...hon_parser__string__tests__bell_alias.snap | 40 - ..._string__tests__carriage_return_alias.snap | 40 - ...r_tabulation_with_justification_alias.snap | 40 - ...n_parser__string__tests__delete_alias.snap | 40 - ...er__string__tests__double_quoted_byte.snap | 297 - ...n_parser__string__tests__escape_alias.snap | 40 - ...g__tests__escape_char_in_byte_literal.snap | 51 - ...n_parser__string__tests__escape_octet.snap | 46 - ...arser__string__tests__form_feed_alias.snap | 40 - ...ing__tests__fstring_escaped_character.snap | 91 - ...tring__tests__fstring_escaped_newline.snap | 91 - ...ing__tests__fstring_line_continuation.snap | 91 - ...__fstring_parse_self_documenting_base.snap | 78 - ...ring_parse_self_documenting_base_more.snap | 188 - ...fstring_parse_self_documenting_format.snap | 115 - ...ing__tests__fstring_unescaped_newline.snap | 91 - ...thon_parser__string__tests__hts_alias.snap | 40 - ...r__string__tests__parse_empty_fstring.snap | 5 - ...tring__tests__parse_f_string_concat_1.snap | 57 - ...tring__tests__parse_f_string_concat_2.snap | 57 - ...tring__tests__parse_f_string_concat_3.snap | 93 - ..._parser__string__tests__parse_fstring.snap | 93 - ...__string__tests__parse_fstring_equals.snap | 81 - ...ing__tests__parse_fstring_nested_spec.snap | 92 - ...ring__tests__parse_fstring_not_equals.snap | 81 - ..._tests__parse_fstring_not_nested_spec.snap | 77 - ...ts__parse_fstring_self_doc_prec_space.snap | 78 - ...parse_fstring_self_doc_trailing_space.snap | 78 - ...ring__tests__parse_fstring_yield_expr.snap | 39 - ...r__string__tests__parse_string_concat.snap | 40 - ..._parse_string_triple_quotes_with_kind.snap | 42 - ...ing__tests__parse_u_f_string_concat_1.snap | 59 - ...ing__tests__parse_u_f_string_concat_2.snap | 59 - ...tring__tests__parse_u_string_concat_1.snap | 40 - ...tring__tests__parse_u_string_concat_2.snap | 42 - ...er__string__tests__raw_byte_literal_1.snap | 45 - ...er__string__tests__raw_byte_literal_2.snap | 43 - ...on_parser__string__tests__raw_fstring.snap | 72 - ...er__string__tests__single_quoted_byte.snap | 297 - ...ing__tests__triple_quoted_raw_fstring.snap | 72 - compiler/parser/src/soft_keywords.rs | 118 - compiler/parser/src/string.rs | 1075 --- compiler/parser/src/token.rs | 412 - compiler/src/lib.rs | 10 +- derive-impl/Cargo.toml | 2 +- jit/Cargo.toml | 2 +- pylib/Cargo.toml | 2 +- vm/Cargo.toml | 9 +- wasm/lib/Cargo.toml | 3 +- 152 files changed, 54 insertions(+), 39584 deletions(-) delete mode 100644 compiler/ast/Cargo.toml delete mode 100644 compiler/ast/Python.asdl delete mode 100644 compiler/ast/asdl.py delete mode 100755 compiler/ast/asdl_rs.py delete mode 100644 compiler/ast/src/ast_gen.rs delete mode 100644 compiler/ast/src/constant.rs delete mode 100644 compiler/ast/src/fold_helpers.rs delete mode 100644 compiler/ast/src/impls.rs delete mode 100644 compiler/ast/src/lib.rs delete mode 100644 compiler/ast/src/unparse.rs delete mode 100644 compiler/core/Cargo.toml delete mode 100644 compiler/core/src/bytecode.rs delete mode 100644 compiler/core/src/error.rs delete mode 100644 compiler/core/src/lib.rs delete mode 100644 compiler/core/src/location.rs delete mode 100644 compiler/core/src/marshal.rs delete mode 100644 compiler/core/src/mode.rs delete mode 100644 compiler/literal/Cargo.toml delete mode 100644 compiler/literal/src/char.rs delete mode 100644 compiler/literal/src/escape.rs delete mode 100644 compiler/literal/src/float.rs delete mode 100644 compiler/literal/src/format.rs delete mode 100644 compiler/literal/src/lib.rs delete mode 100644 compiler/parser/Cargo.toml delete mode 100644 compiler/parser/README.md delete mode 100644 compiler/parser/build.rs delete mode 100644 compiler/parser/python.lalrpop delete mode 100644 compiler/parser/src/context.rs delete mode 100644 compiler/parser/src/function.rs delete mode 100644 compiler/parser/src/lexer.rs delete mode 100644 compiler/parser/src/lib.rs delete mode 100644 compiler/parser/src/mode.rs delete mode 100644 compiler/parser/src/parser.rs delete mode 100644 compiler/parser/src/python.rs delete mode 100644 compiler/parser/src/snapshots/rustpython_parser__context__tests__ann_assign_name.snap delete mode 100644 compiler/parser/src/snapshots/rustpython_parser__context__tests__assign_attribute.snap delete mode 100644 compiler/parser/src/snapshots/rustpython_parser__context__tests__assign_for.snap delete mode 100644 compiler/parser/src/snapshots/rustpython_parser__context__tests__assign_list.snap delete mode 100644 compiler/parser/src/snapshots/rustpython_parser__context__tests__assign_list_comp.snap delete mode 100644 compiler/parser/src/snapshots/rustpython_parser__context__tests__assign_name.snap delete mode 100644 compiler/parser/src/snapshots/rustpython_parser__context__tests__assign_named_expr.snap delete mode 100644 compiler/parser/src/snapshots/rustpython_parser__context__tests__assign_set_comp.snap delete mode 100644 compiler/parser/src/snapshots/rustpython_parser__context__tests__assign_starred.snap delete mode 100644 compiler/parser/src/snapshots/rustpython_parser__context__tests__assign_subscript.snap delete mode 100644 compiler/parser/src/snapshots/rustpython_parser__context__tests__assign_tuple.snap delete mode 100644 compiler/parser/src/snapshots/rustpython_parser__context__tests__assign_with.snap delete mode 100644 compiler/parser/src/snapshots/rustpython_parser__context__tests__aug_assign_attribute.snap delete mode 100644 compiler/parser/src/snapshots/rustpython_parser__context__tests__aug_assign_name.snap delete mode 100644 compiler/parser/src/snapshots/rustpython_parser__context__tests__aug_assign_subscript.snap delete mode 100644 compiler/parser/src/snapshots/rustpython_parser__context__tests__del_attribute.snap delete mode 100644 compiler/parser/src/snapshots/rustpython_parser__context__tests__del_name.snap delete mode 100644 compiler/parser/src/snapshots/rustpython_parser__context__tests__del_subscript.snap delete mode 100644 compiler/parser/src/snapshots/rustpython_parser__function__tests__function_kw_only_args.snap delete mode 100644 compiler/parser/src/snapshots/rustpython_parser__function__tests__function_kw_only_args_with_defaults.snap delete mode 100644 compiler/parser/src/snapshots/rustpython_parser__function__tests__function_no_args.snap delete mode 100644 compiler/parser/src/snapshots/rustpython_parser__function__tests__function_pos_and_kw_only_args.snap delete mode 100644 compiler/parser/src/snapshots/rustpython_parser__function__tests__function_pos_and_kw_only_args_with_defaults.snap delete mode 100644 compiler/parser/src/snapshots/rustpython_parser__function__tests__function_pos_and_kw_only_args_with_defaults_and_varargs.snap delete mode 100644 compiler/parser/src/snapshots/rustpython_parser__function__tests__function_pos_and_kw_only_args_with_defaults_and_varargs_and_kwargs.snap delete mode 100644 compiler/parser/src/snapshots/rustpython_parser__function__tests__function_pos_args.snap delete mode 100644 compiler/parser/src/snapshots/rustpython_parser__function__tests__function_pos_args_with_defaults.snap delete mode 100644 compiler/parser/src/snapshots/rustpython_parser__function__tests__lambda_kw_only_args.snap delete mode 100644 compiler/parser/src/snapshots/rustpython_parser__function__tests__lambda_kw_only_args_with_defaults.snap delete mode 100644 compiler/parser/src/snapshots/rustpython_parser__function__tests__lambda_no_args.snap delete mode 100644 compiler/parser/src/snapshots/rustpython_parser__function__tests__lambda_pos_and_kw_only_args.snap delete mode 100644 compiler/parser/src/snapshots/rustpython_parser__function__tests__lambda_pos_args.snap delete mode 100644 compiler/parser/src/snapshots/rustpython_parser__function__tests__lambda_pos_args_with_defaults.snap delete mode 100644 compiler/parser/src/snapshots/rustpython_parser__parser__tests__dict_unpacking.snap delete mode 100644 compiler/parser/src/snapshots/rustpython_parser__parser__tests__generator_expression_argument.snap delete mode 100644 compiler/parser/src/snapshots/rustpython_parser__parser__tests__match.snap delete mode 100644 compiler/parser/src/snapshots/rustpython_parser__parser__tests__match_as_identifier.snap delete mode 100644 compiler/parser/src/snapshots/rustpython_parser__parser__tests__parse_bool_op_and.snap delete mode 100644 compiler/parser/src/snapshots/rustpython_parser__parser__tests__parse_bool_op_or.snap delete mode 100644 compiler/parser/src/snapshots/rustpython_parser__parser__tests__parse_class.snap delete mode 100644 compiler/parser/src/snapshots/rustpython_parser__parser__tests__parse_dict_comprehension.snap delete mode 100644 compiler/parser/src/snapshots/rustpython_parser__parser__tests__parse_double_list_comprehension.snap delete mode 100644 compiler/parser/src/snapshots/rustpython_parser__parser__tests__parse_empty.snap delete mode 100644 compiler/parser/src/snapshots/rustpython_parser__parser__tests__parse_f_string.snap delete mode 100644 compiler/parser/src/snapshots/rustpython_parser__parser__tests__parse_generator_comprehension.snap delete mode 100644 compiler/parser/src/snapshots/rustpython_parser__parser__tests__parse_if_elif_else.snap delete mode 100644 compiler/parser/src/snapshots/rustpython_parser__parser__tests__parse_if_else_generator_comprehension.snap delete mode 100644 compiler/parser/src/snapshots/rustpython_parser__parser__tests__parse_kwargs.snap delete mode 100644 compiler/parser/src/snapshots/rustpython_parser__parser__tests__parse_lambda.snap delete mode 100644 compiler/parser/src/snapshots/rustpython_parser__parser__tests__parse_list_comprehension.snap delete mode 100644 compiler/parser/src/snapshots/rustpython_parser__parser__tests__parse_named_expression_generator_comprehension.snap delete mode 100644 compiler/parser/src/snapshots/rustpython_parser__parser__tests__parse_print_2.snap delete mode 100644 compiler/parser/src/snapshots/rustpython_parser__parser__tests__parse_print_hello.snap delete mode 100644 compiler/parser/src/snapshots/rustpython_parser__parser__tests__parse_string.snap delete mode 100644 compiler/parser/src/snapshots/rustpython_parser__parser__tests__parse_tuples.snap delete mode 100644 compiler/parser/src/snapshots/rustpython_parser__parser__tests__patma.snap delete mode 100644 compiler/parser/src/snapshots/rustpython_parser__parser__tests__slice.snap delete mode 100644 compiler/parser/src/snapshots/rustpython_parser__parser__tests__star_index.snap delete mode 100644 compiler/parser/src/snapshots/rustpython_parser__parser__tests__try.snap delete mode 100644 compiler/parser/src/snapshots/rustpython_parser__parser__tests__try_star.snap delete mode 100644 compiler/parser/src/snapshots/rustpython_parser__parser__tests__variadic_generics.snap delete mode 100644 compiler/parser/src/snapshots/rustpython_parser__parser__tests__with_statement.snap delete mode 100644 compiler/parser/src/snapshots/rustpython_parser__string__tests__backspace_alias.snap delete mode 100644 compiler/parser/src/snapshots/rustpython_parser__string__tests__bell_alias.snap delete mode 100644 compiler/parser/src/snapshots/rustpython_parser__string__tests__carriage_return_alias.snap delete mode 100644 compiler/parser/src/snapshots/rustpython_parser__string__tests__character_tabulation_with_justification_alias.snap delete mode 100644 compiler/parser/src/snapshots/rustpython_parser__string__tests__delete_alias.snap delete mode 100644 compiler/parser/src/snapshots/rustpython_parser__string__tests__double_quoted_byte.snap delete mode 100644 compiler/parser/src/snapshots/rustpython_parser__string__tests__escape_alias.snap delete mode 100644 compiler/parser/src/snapshots/rustpython_parser__string__tests__escape_char_in_byte_literal.snap delete mode 100644 compiler/parser/src/snapshots/rustpython_parser__string__tests__escape_octet.snap delete mode 100644 compiler/parser/src/snapshots/rustpython_parser__string__tests__form_feed_alias.snap delete mode 100644 compiler/parser/src/snapshots/rustpython_parser__string__tests__fstring_escaped_character.snap delete mode 100644 compiler/parser/src/snapshots/rustpython_parser__string__tests__fstring_escaped_newline.snap delete mode 100644 compiler/parser/src/snapshots/rustpython_parser__string__tests__fstring_line_continuation.snap delete mode 100644 compiler/parser/src/snapshots/rustpython_parser__string__tests__fstring_parse_self_documenting_base.snap delete mode 100644 compiler/parser/src/snapshots/rustpython_parser__string__tests__fstring_parse_self_documenting_base_more.snap delete mode 100644 compiler/parser/src/snapshots/rustpython_parser__string__tests__fstring_parse_self_documenting_format.snap delete mode 100644 compiler/parser/src/snapshots/rustpython_parser__string__tests__fstring_unescaped_newline.snap delete mode 100644 compiler/parser/src/snapshots/rustpython_parser__string__tests__hts_alias.snap delete mode 100644 compiler/parser/src/snapshots/rustpython_parser__string__tests__parse_empty_fstring.snap delete mode 100644 compiler/parser/src/snapshots/rustpython_parser__string__tests__parse_f_string_concat_1.snap delete mode 100644 compiler/parser/src/snapshots/rustpython_parser__string__tests__parse_f_string_concat_2.snap delete mode 100644 compiler/parser/src/snapshots/rustpython_parser__string__tests__parse_f_string_concat_3.snap delete mode 100644 compiler/parser/src/snapshots/rustpython_parser__string__tests__parse_fstring.snap delete mode 100644 compiler/parser/src/snapshots/rustpython_parser__string__tests__parse_fstring_equals.snap delete mode 100644 compiler/parser/src/snapshots/rustpython_parser__string__tests__parse_fstring_nested_spec.snap delete mode 100644 compiler/parser/src/snapshots/rustpython_parser__string__tests__parse_fstring_not_equals.snap delete mode 100644 compiler/parser/src/snapshots/rustpython_parser__string__tests__parse_fstring_not_nested_spec.snap delete mode 100644 compiler/parser/src/snapshots/rustpython_parser__string__tests__parse_fstring_self_doc_prec_space.snap delete mode 100644 compiler/parser/src/snapshots/rustpython_parser__string__tests__parse_fstring_self_doc_trailing_space.snap delete mode 100644 compiler/parser/src/snapshots/rustpython_parser__string__tests__parse_fstring_yield_expr.snap delete mode 100644 compiler/parser/src/snapshots/rustpython_parser__string__tests__parse_string_concat.snap delete mode 100644 compiler/parser/src/snapshots/rustpython_parser__string__tests__parse_string_triple_quotes_with_kind.snap delete mode 100644 compiler/parser/src/snapshots/rustpython_parser__string__tests__parse_u_f_string_concat_1.snap delete mode 100644 compiler/parser/src/snapshots/rustpython_parser__string__tests__parse_u_f_string_concat_2.snap delete mode 100644 compiler/parser/src/snapshots/rustpython_parser__string__tests__parse_u_string_concat_1.snap delete mode 100644 compiler/parser/src/snapshots/rustpython_parser__string__tests__parse_u_string_concat_2.snap delete mode 100644 compiler/parser/src/snapshots/rustpython_parser__string__tests__raw_byte_literal_1.snap delete mode 100644 compiler/parser/src/snapshots/rustpython_parser__string__tests__raw_byte_literal_2.snap delete mode 100644 compiler/parser/src/snapshots/rustpython_parser__string__tests__raw_fstring.snap delete mode 100644 compiler/parser/src/snapshots/rustpython_parser__string__tests__single_quoted_byte.snap delete mode 100644 compiler/parser/src/snapshots/rustpython_parser__string__tests__triple_quoted_raw_fstring.snap delete mode 100644 compiler/parser/src/soft_keywords.rs delete mode 100644 compiler/parser/src/string.rs delete mode 100644 compiler/parser/src/token.rs diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 731e629f67..45a21e35bb 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -18,10 +18,8 @@ env: CARGO_ARGS: --no-default-features --features stdlib,zlib,importlib,encodings,ssl,jit NON_WASM_PACKAGES: >- -p rustpython-common - -p rustpython-compiler-core -p rustpython-compiler -p rustpython-codegen - -p rustpython-parser -p rustpython-vm -p rustpython-stdlib -p rustpython-jit @@ -105,7 +103,6 @@ jobs: env: RUST_BACKTRACE: full name: Run rust tests - needs: lalrpop runs-on: ${{ matrix.os }} strategy: matrix: @@ -113,11 +110,6 @@ jobs: fail-fast: false steps: - uses: actions/checkout@v3 - - name: Cache generated parser - uses: actions/cache@v3 - with: - path: compiler/parser/python.rs - key: lalrpop-${{ hashFiles('compiler/parser/python.lalrpop') }} - uses: dtolnay/rust-toolchain@stable with: components: clippy @@ -167,16 +159,9 @@ jobs: exotic_targets: if: ${{ !contains(github.event.pull_request.labels.*.name, 'skip:ci') }} name: Ensure compilation on various targets - needs: lalrpop runs-on: ubuntu-latest steps: - uses: actions/checkout@v3 - - name: Cache generated parser - uses: actions/cache@v3 - with: - path: compiler/parser/python.rs - key: lalrpop-${{ hashFiles('compiler/parser/python.lalrpop') }} - - uses: dtolnay/rust-toolchain@stable with: target: i686-unknown-linux-gnu @@ -239,7 +224,6 @@ jobs: snippets_cpython: if: ${{ !contains(github.event.pull_request.labels.*.name, 'skip:ci') }} - needs: lalrpop env: RUST_BACKTRACE: full name: Run snippets and cpython tests @@ -250,12 +234,6 @@ jobs: fail-fast: false steps: - uses: actions/checkout@v3 - - name: Cache generated parser - uses: actions/cache@v3 - with: - path: compiler/parser/python.rs - key: lalrpop-${{ hashFiles('compiler/parser/python.lalrpop') }} - - uses: dtolnay/rust-toolchain@stable - uses: actions/setup-python@v4 with: @@ -316,50 +294,11 @@ jobs: - name: Check whats_left is not broken run: python -I whats_left.py - lalrpop: - if: ${{ !contains(github.event.pull_request.labels.*.name, 'skip:ci') }} - name: Generate parser with lalrpop - strategy: - matrix: - os: [ubuntu-latest, windows-latest] - runs-on: ${{ matrix.os }} - steps: - - uses: actions/checkout@v3 - - name: Cache generated parser - uses: actions/cache@v3 - with: - path: compiler/parser/python.rs - key: lalrpop-${{ hashFiles('compiler/parser/python.lalrpop') }} - - name: Check if cached generated parser exists - id: generated_parser - uses: andstor/file-existence-action@v2 - with: - files: "compiler/parser/python.rs" - - if: runner.os == 'Windows' - name: Force python.lalrpop to be lf # actions@checkout ignore .gitattributes - run: | - set file compiler/parser/python.lalrpop; ((Get-Content $file) -join "`n") + "`n" | Set-Content -NoNewline $file - - name: Install lalrpop - if: steps.generated_parser.outputs.files_exists == 'false' - uses: baptiste0928/cargo-install@v2 - with: - crate: lalrpop - version: "0.19.9" - - name: Run lalrpop - if: steps.generated_parser.outputs.files_exists == 'false' - run: lalrpop compiler/parser/python.lalrpop - lint: name: Check Rust code with rustfmt and clippy - needs: lalrpop runs-on: ubuntu-latest steps: - uses: actions/checkout@v3 - - name: Cache generated parser - uses: actions/cache@v3 - with: - path: compiler/parser/python.rs - key: lalrpop-${{ hashFiles('compiler/parser/python.lalrpop') }} - uses: dtolnay/rust-toolchain@stable with: components: rustfmt, clippy @@ -372,28 +311,22 @@ jobs: python-version: "3.11" - name: install ruff run: python -m pip install ruff - - name: run lint - run: ruff extra_tests wasm examples compiler/ast --exclude='./.*',./Lib,./vm/Lib,./benches/ --select=E9,F63,F7,F82 --show-source + - name: run python lint + run: ruff extra_tests wasm examples --exclude='./.*',./Lib,./vm/Lib,./benches/ --select=E9,F63,F7,F82 --show-source - name: install prettier run: yarn global add prettier && echo "$(yarn global bin)" >>$GITHUB_PATH - name: check wasm code with prettier # prettier doesn't handle ignore files very well: https://github.com/prettier/prettier/issues/8506 run: cd wasm && git ls-files -z | xargs -0 prettier --check -u - - name: Check update_asdl.sh consistency - run: bash scripts/update_asdl.sh && git diff --exit-code + # - name: Check update_asdl.sh consistency + # run: bash scripts/update_asdl.sh && git diff --exit-code miri: if: ${{ !contains(github.event.pull_request.labels.*.name, 'skip:ci') }} name: Run tests under miri - needs: lalrpop runs-on: ubuntu-latest steps: - uses: actions/checkout@v3 - - name: Cache generated parser - uses: actions/cache@v3 - with: - path: compiler/parser/python.rs - key: lalrpop-${{ hashFiles('compiler/parser/python.lalrpop') }} - uses: dtolnay/rust-toolchain@master with: toolchain: nightly @@ -408,15 +341,9 @@ jobs: wasm: if: ${{ !contains(github.event.pull_request.labels.*.name, 'skip:ci') }} name: Check the WASM package and demo - needs: lalrpop runs-on: ubuntu-latest steps: - uses: actions/checkout@v3 - - name: Cache generated parser - uses: actions/cache@v3 - with: - path: compiler/parser/python.rs - key: lalrpop-${{ hashFiles('compiler/parser/python.lalrpop') }} - uses: dtolnay/rust-toolchain@stable - uses: Swatinem/rust-cache@v2 @@ -462,15 +389,9 @@ jobs: wasm-wasi: if: ${{ !contains(github.event.pull_request.labels.*.name, 'skip:ci') }} name: Run snippets and cpython tests on wasm-wasi - needs: lalrpop runs-on: ubuntu-latest steps: - uses: actions/checkout@v3 - - name: Cache generated parser - uses: actions/cache@v3 - with: - path: compiler/parser/python.rs - key: lalrpop-${{ hashFiles('compiler/parser/python.lalrpop') }} - uses: dtolnay/rust-toolchain@stable with: target: wasm32-wasi diff --git a/Cargo.lock b/Cargo.lock index f3fd953bdf..e844f5343c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1077,9 +1077,9 @@ dependencies = [ [[package]] name = "lalrpop" -version = "0.19.9" +version = "0.20.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f34313ec00c2eb5c3c87ca6732ea02dcf3af99c3ff7a8fb622ffb99c9d860a87" +checksum = "da4081d44f4611b66c6dd725e6de3169f9f63905421e8626fcb86b6a898998b8" dependencies = [ "ascii-canvas", "bit-set", @@ -1089,9 +1089,8 @@ dependencies = [ "itertools", "lalrpop-util", "petgraph", - "pico-args", "regex", - "regex-syntax", + "regex-syntax 0.7.1", "string_cache", "term", "tiny-keccak", @@ -1100,12 +1099,9 @@ dependencies = [ [[package]] name = "lalrpop-util" -version = "0.19.9" +version = "0.20.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e5c1f7869c94d214466c5fd432dfed12c379fd87786768d36455892d46b18edd" -dependencies = [ - "regex", -] +checksum = "3f35c735096c0293d313e8f2a641627472b83d01b937177fe76e5e2708d31e0d" [[package]] name = "lazy_static" @@ -1639,12 +1635,6 @@ dependencies = [ "siphasher", ] -[[package]] -name = "pico-args" -version = "0.4.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "db8bcd96cb740d03149cbad5518db9fd87126a10ab519c011893b1754134c468" - [[package]] name = "pin-utils" version = "0.1.0" @@ -1866,7 +1856,7 @@ checksum = "48aaa5748ba571fb95cd2c85c09f629215d3a6ece942baa100950af03a34f733" dependencies = [ "aho-corasick", "memchr", - "regex-syntax", + "regex-syntax 0.6.28", ] [[package]] @@ -1881,6 +1871,12 @@ version = "0.6.28" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "456c603be3e8d448b072f410900c09faf164fbce2d480456f50eea6e25f9c848" +[[package]] +name = "regex-syntax" +version = "0.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a5996294f19bd3aae0453a862ad728f60e6600695733dd5df01da90c54363a3c" + [[package]] name = "region" version = "2.2.0" @@ -1971,6 +1967,7 @@ dependencies = [ [[package]] name = "rustpython-ast" version = "0.2.0" +source = "git+https://github.com/RustPython/Parser.git?rev=6b60f85cc4ca248af9b787697c41be2adb7a3ec8#6b60f85cc4ca248af9b787697c41be2adb7a3ec8" dependencies = [ "num-bigint", "rustpython-compiler-core", @@ -2030,6 +2027,7 @@ dependencies = [ [[package]] name = "rustpython-compiler-core" version = "0.2.0" +source = "git+https://github.com/RustPython/Parser.git?rev=6b60f85cc4ca248af9b787697c41be2adb7a3ec8#6b60f85cc4ca248af9b787697c41be2adb7a3ec8" dependencies = [ "bitflags", "bstr", @@ -2037,7 +2035,6 @@ dependencies = [ "lz4_flex", "num-bigint", "num-complex", - "serde", ] [[package]] @@ -2092,21 +2089,21 @@ dependencies = [ [[package]] name = "rustpython-literal" version = "0.2.0" +source = "git+https://github.com/RustPython/Parser.git?rev=6b60f85cc4ca248af9b787697c41be2adb7a3ec8#6b60f85cc4ca248af9b787697c41be2adb7a3ec8" dependencies = [ "hexf-parse", "lexical-parse-float", "num-traits", - "rand", "unic-ucd-category", ] [[package]] name = "rustpython-parser" version = "0.2.0" +source = "git+https://github.com/RustPython/Parser.git?rev=6b60f85cc4ca248af9b787697c41be2adb7a3ec8#6b60f85cc4ca248af9b787697c41be2adb7a3ec8" dependencies = [ "ahash", "anyhow", - "insta", "itertools", "lalrpop", "lalrpop-util", @@ -2118,7 +2115,6 @@ dependencies = [ "rustc-hash", "rustpython-ast", "rustpython-compiler-core", - "serde", "tiny-keccak", "unic-emoji-char", "unic-ucd-ident", diff --git a/Cargo.toml b/Cargo.toml index 31056168c4..4428fa44ff 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -12,16 +12,24 @@ include = ["LICENSE", "Cargo.toml", "src/**/*.rs"] [workspace] resolver = "2" members = [ - "compiler", "compiler/ast", "compiler/core", "compiler/literal", "compiler/codegen", "compiler/parser", + "compiler", "compiler/codegen", ".", "common", "derive", "jit", "vm", "pylib", "stdlib", "wasm/lib", "derive-impl", ] [workspace.dependencies] +rustpython-literal = { git = "https://github.com/RustPython/Parser.git", rev = "6b60f85cc4ca248af9b787697c41be2adb7a3ec8" } +rustpython-compiler-core = { git = "https://github.com/RustPython/Parser.git", rev = "6b60f85cc4ca248af9b787697c41be2adb7a3ec8" } +rustpython-parser = { git = "https://github.com/RustPython/Parser.git", rev = "6b60f85cc4ca248af9b787697c41be2adb7a3ec8" } +rustpython-ast = { git = "https://github.com/RustPython/Parser.git", rev = "6b60f85cc4ca248af9b787697c41be2adb7a3ec8" } +# rustpython-literal = { path = "../RustPython-parser/literal" } +# rustpython-compiler-core = { path = "../RustPython-parser/core" } +# rustpython-parser = { path = "../RustPython-parser/parser" } +# rustpython-ast = { path = "../RustPython-parser/ast" } + ahash = "0.7.6" anyhow = "1.0.45" ascii = "1.0" atty = "0.2.14" -bincode = "1.3.3" bitflags = "1.3.2" bstr = "0.2.17" cfg-if = "1.0" @@ -72,11 +80,12 @@ ssl-vendor = ["rustpython-stdlib/ssl-vendor"] [dependencies] rustpython-compiler = { path = "compiler", version = "0.2.0" } -rustpython-parser = { path = "compiler/parser", version = "0.2.0" } rustpython-pylib = { path = "pylib", optional = true, default-features = false } rustpython-stdlib = { path = "stdlib", optional = true, default-features = false } rustpython-vm = { path = "vm", version = "0.2.0", default-features = false, features = ["compiler"] } +rustpython-parser = { workspace = true } + atty = { workspace = true } cfg-if = { workspace = true } log = { workspace = true } diff --git a/common/Cargo.toml b/common/Cargo.toml index 7395f2e08c..f358b37ab6 100644 --- a/common/Cargo.toml +++ b/common/Cargo.toml @@ -11,7 +11,7 @@ license = "MIT" threading = ["parking_lot"] [dependencies] -rustpython-literal = { path = "../compiler/literal" } +rustpython-literal = { workspace = true } ascii = { workspace = true } bitflags = { workspace = true } diff --git a/compiler/Cargo.toml b/compiler/Cargo.toml index ff6ea77508..22c182fafb 100644 --- a/compiler/Cargo.toml +++ b/compiler/Cargo.toml @@ -6,6 +6,6 @@ authors = ["RustPython Team"] edition = "2021" [dependencies] -rustpython-compiler-core = { path = "core" } rustpython-codegen = { path = "codegen" } -rustpython-parser = { path = "parser" } +rustpython-compiler-core = { workspace = true } +rustpython-parser = { workspace = true } diff --git a/compiler/ast/Cargo.toml b/compiler/ast/Cargo.toml deleted file mode 100644 index 5de211dd72..0000000000 --- a/compiler/ast/Cargo.toml +++ /dev/null @@ -1,20 +0,0 @@ -[package] -name = "rustpython-ast" -version = "0.2.0" -description = "AST definitions for RustPython" -authors = ["RustPython Team"] -edition = "2021" -repository = "https://github.com/RustPython/RustPython" -license = "MIT" - -[features] -default = ["constant-optimization", "fold"] -constant-optimization = ["fold"] -fold = [] -unparse = ["rustpython-literal"] - -[dependencies] -rustpython-compiler-core = { path = "../core", version = "0.2.0" } -rustpython-literal = { path = "../literal", version = "0.2.0", optional = true } - -num-bigint = { workspace = true } diff --git a/compiler/ast/Python.asdl b/compiler/ast/Python.asdl deleted file mode 100644 index e9423a7c98..0000000000 --- a/compiler/ast/Python.asdl +++ /dev/null @@ -1,145 +0,0 @@ --- ASDL's 4 builtin types are: --- identifier, int, string, constant - -module Python -{ - mod = Module(stmt* body, type_ignore* type_ignores) - | Interactive(stmt* body) - | Expression(expr body) - | FunctionType(expr* argtypes, expr returns) - - stmt = FunctionDef(identifier name, arguments args, - stmt* body, expr* decorator_list, expr? returns, - string? type_comment) - | AsyncFunctionDef(identifier name, arguments args, - stmt* body, expr* decorator_list, expr? returns, - string? type_comment) - - | ClassDef(identifier name, - expr* bases, - keyword* keywords, - stmt* body, - expr* decorator_list) - | Return(expr? value) - - | Delete(expr* targets) - | Assign(expr* targets, expr value, string? type_comment) - | AugAssign(expr target, operator op, expr value) - -- 'simple' indicates that we annotate simple name without parens - | AnnAssign(expr target, expr annotation, expr? value, int simple) - - -- use 'orelse' because else is a keyword in target languages - | For(expr target, expr iter, stmt* body, stmt* orelse, string? type_comment) - | AsyncFor(expr target, expr iter, stmt* body, stmt* orelse, string? type_comment) - | While(expr test, stmt* body, stmt* orelse) - | If(expr test, stmt* body, stmt* orelse) - | With(withitem* items, stmt* body, string? type_comment) - | AsyncWith(withitem* items, stmt* body, string? type_comment) - - | Match(expr subject, match_case* cases) - - | Raise(expr? exc, expr? cause) - | Try(stmt* body, excepthandler* handlers, stmt* orelse, stmt* finalbody) - | TryStar(stmt* body, excepthandler* handlers, stmt* orelse, stmt* finalbody) - | Assert(expr test, expr? msg) - - | Import(alias* names) - | ImportFrom(identifier? module, alias* names, int? level) - - | Global(identifier* names) - | Nonlocal(identifier* names) - | Expr(expr value) - | Pass | Break | Continue - - -- col_offset is the byte offset in the utf8 string the parser uses - attributes (int lineno, int col_offset, int? end_lineno, int? end_col_offset) - - -- BoolOp() can use left & right? - expr = BoolOp(boolop op, expr* values) - | NamedExpr(expr target, expr value) - | BinOp(expr left, operator op, expr right) - | UnaryOp(unaryop op, expr operand) - | Lambda(arguments args, expr body) - | IfExp(expr test, expr body, expr orelse) - | Dict(expr* keys, expr* values) - | Set(expr* elts) - | ListComp(expr elt, comprehension* generators) - | SetComp(expr elt, comprehension* generators) - | DictComp(expr key, expr value, comprehension* generators) - | GeneratorExp(expr elt, comprehension* generators) - -- the grammar constrains where yield expressions can occur - | Await(expr value) - | Yield(expr? value) - | YieldFrom(expr value) - -- need sequences for compare to distinguish between - -- x < 4 < 3 and (x < 4) < 3 - | Compare(expr left, cmpop* ops, expr* comparators) - | Call(expr func, expr* args, keyword* keywords) - | FormattedValue(expr value, int conversion, expr? format_spec) - | JoinedStr(expr* values) - | Constant(constant value, string? kind) - - -- the following expression can appear in assignment context - | Attribute(expr value, identifier attr, expr_context ctx) - | Subscript(expr value, expr slice, expr_context ctx) - | Starred(expr value, expr_context ctx) - | Name(identifier id, expr_context ctx) - | List(expr* elts, expr_context ctx) - | Tuple(expr* elts, expr_context ctx) - - -- can appear only in Subscript - | Slice(expr? lower, expr? upper, expr? step) - - -- col_offset is the byte offset in the utf8 string the parser uses - attributes (int lineno, int col_offset, int? end_lineno, int? end_col_offset) - - expr_context = Load | Store | Del - - boolop = And | Or - - operator = Add | Sub | Mult | MatMult | Div | Mod | Pow | LShift - | RShift | BitOr | BitXor | BitAnd | FloorDiv - - unaryop = Invert | Not | UAdd | USub - - cmpop = Eq | NotEq | Lt | LtE | Gt | GtE | Is | IsNot | In | NotIn - - comprehension = (expr target, expr iter, expr* ifs, int is_async) - - excepthandler = ExceptHandler(expr? type, identifier? name, stmt* body) - attributes (int lineno, int col_offset, int? end_lineno, int? end_col_offset) - - arguments = (arg* posonlyargs, arg* args, arg? vararg, arg* kwonlyargs, - expr* kw_defaults, arg? kwarg, expr* defaults) - - arg = (identifier arg, expr? annotation, string? type_comment) - attributes (int lineno, int col_offset, int? end_lineno, int? end_col_offset) - - -- keyword arguments supplied to call (NULL identifier for **kwargs) - keyword = (identifier? arg, expr value) - attributes (int lineno, int col_offset, int? end_lineno, int? end_col_offset) - - -- import name with optional 'as' alias. - alias = (identifier name, identifier? asname) - attributes (int lineno, int col_offset, int? end_lineno, int? end_col_offset) - - withitem = (expr context_expr, expr? optional_vars) - - match_case = (pattern pattern, expr? guard, stmt* body) - - pattern = MatchValue(expr value) - | MatchSingleton(constant value) - | MatchSequence(pattern* patterns) - | MatchMapping(expr* keys, pattern* patterns, identifier? rest) - | MatchClass(expr cls, pattern* patterns, identifier* kwd_attrs, pattern* kwd_patterns) - - | MatchStar(identifier? name) - -- The optional "rest" MatchMapping parameter handles capturing extra mapping keys - - | MatchAs(pattern? pattern, identifier? name) - | MatchOr(pattern* patterns) - - attributes (int lineno, int col_offset, int end_lineno, int end_col_offset) - - type_ignore = TypeIgnore(int lineno, string tag) -} diff --git a/compiler/ast/asdl.py b/compiler/ast/asdl.py deleted file mode 100644 index e3e6c34d2a..0000000000 --- a/compiler/ast/asdl.py +++ /dev/null @@ -1,385 +0,0 @@ -#------------------------------------------------------------------------------- -# Parser for ASDL [1] definition files. Reads in an ASDL description and parses -# it into an AST that describes it. -# -# The EBNF we're parsing here: Figure 1 of the paper [1]. Extended to support -# modules and attributes after a product. Words starting with Capital letters -# are terminals. Literal tokens are in "double quotes". Others are -# non-terminals. Id is either TokenId or ConstructorId. -# -# module ::= "module" Id "{" [definitions] "}" -# definitions ::= { TypeId "=" type } -# type ::= product | sum -# product ::= fields ["attributes" fields] -# fields ::= "(" { field, "," } field ")" -# field ::= TypeId ["?" | "*"] [Id] -# sum ::= constructor { "|" constructor } ["attributes" fields] -# constructor ::= ConstructorId [fields] -# -# [1] "The Zephyr Abstract Syntax Description Language" by Wang, et. al. See -# http://asdl.sourceforge.net/ -#------------------------------------------------------------------------------- -from collections import namedtuple -import re - -__all__ = [ - 'builtin_types', 'parse', 'AST', 'Module', 'Type', 'Constructor', - 'Field', 'Sum', 'Product', 'VisitorBase', 'Check', 'check'] - -# The following classes define nodes into which the ASDL description is parsed. -# Note: this is a "meta-AST". ASDL files (such as Python.asdl) describe the AST -# structure used by a programming language. But ASDL files themselves need to be -# parsed. This module parses ASDL files and uses a simple AST to represent them. -# See the EBNF at the top of the file to understand the logical connection -# between the various node types. - -builtin_types = {'identifier', 'string', 'int', 'constant'} - -class AST: - def __repr__(self): - raise NotImplementedError - -class Module(AST): - def __init__(self, name, dfns): - self.name = name - self.dfns = dfns - self.types = {type.name: type.value for type in dfns} - - def __repr__(self): - return 'Module({0.name}, {0.dfns})'.format(self) - -class Type(AST): - def __init__(self, name, value): - self.name = name - self.value = value - - def __repr__(self): - return 'Type({0.name}, {0.value})'.format(self) - -class Constructor(AST): - def __init__(self, name, fields=None): - self.name = name - self.fields = fields or [] - - def __repr__(self): - return 'Constructor({0.name}, {0.fields})'.format(self) - -class Field(AST): - def __init__(self, type, name=None, seq=False, opt=False): - self.type = type - self.name = name - self.seq = seq - self.opt = opt - - def __str__(self): - if self.seq: - extra = "*" - elif self.opt: - extra = "?" - else: - extra = "" - - return "{}{} {}".format(self.type, extra, self.name) - - def __repr__(self): - if self.seq: - extra = ", seq=True" - elif self.opt: - extra = ", opt=True" - else: - extra = "" - if self.name is None: - return 'Field({0.type}{1})'.format(self, extra) - else: - return 'Field({0.type}, {0.name}{1})'.format(self, extra) - -class Sum(AST): - def __init__(self, types, attributes=None): - self.types = types - self.attributes = attributes or [] - - def __repr__(self): - if self.attributes: - return 'Sum({0.types}, {0.attributes})'.format(self) - else: - return 'Sum({0.types})'.format(self) - -class Product(AST): - def __init__(self, fields, attributes=None): - self.fields = fields - self.attributes = attributes or [] - - def __repr__(self): - if self.attributes: - return 'Product({0.fields}, {0.attributes})'.format(self) - else: - return 'Product({0.fields})'.format(self) - -# A generic visitor for the meta-AST that describes ASDL. This can be used by -# emitters. Note that this visitor does not provide a generic visit method, so a -# subclass needs to define visit methods from visitModule to as deep as the -# interesting node. -# We also define a Check visitor that makes sure the parsed ASDL is well-formed. - -class VisitorBase(object): - """Generic tree visitor for ASTs.""" - def __init__(self): - self.cache = {} - - def visit(self, obj, *args): - klass = obj.__class__ - meth = self.cache.get(klass) - if meth is None: - methname = "visit" + klass.__name__ - meth = getattr(self, methname, None) - self.cache[klass] = meth - if meth: - try: - meth(obj, *args) - except Exception as e: - print("Error visiting %r: %s" % (obj, e)) - raise - -class Check(VisitorBase): - """A visitor that checks a parsed ASDL tree for correctness. - - Errors are printed and accumulated. - """ - def __init__(self): - super(Check, self).__init__() - self.cons = {} - self.errors = 0 - self.types = {} - - def visitModule(self, mod): - for dfn in mod.dfns: - self.visit(dfn) - - def visitType(self, type): - self.visit(type.value, str(type.name)) - - def visitSum(self, sum, name): - for t in sum.types: - self.visit(t, name) - - def visitConstructor(self, cons, name): - key = str(cons.name) - conflict = self.cons.get(key) - if conflict is None: - self.cons[key] = name - else: - print('Redefinition of constructor {}'.format(key)) - print('Defined in {} and {}'.format(conflict, name)) - self.errors += 1 - for f in cons.fields: - self.visit(f, key) - - def visitField(self, field, name): - key = str(field.type) - l = self.types.setdefault(key, []) - l.append(name) - - def visitProduct(self, prod, name): - for f in prod.fields: - self.visit(f, name) - -def check(mod): - """Check the parsed ASDL tree for correctness. - - Return True if success. For failure, the errors are printed out and False - is returned. - """ - v = Check() - v.visit(mod) - - for t in v.types: - if t not in mod.types and not t in builtin_types: - v.errors += 1 - uses = ", ".join(v.types[t]) - print('Undefined type {}, used in {}'.format(t, uses)) - return not v.errors - -# The ASDL parser itself comes next. The only interesting external interface -# here is the top-level parse function. - -def parse(filename): - """Parse ASDL from the given file and return a Module node describing it.""" - with open(filename, encoding="utf-8") as f: - parser = ASDLParser() - return parser.parse(f.read()) - -# Types for describing tokens in an ASDL specification. -class TokenKind: - """TokenKind is provides a scope for enumerated token kinds.""" - (ConstructorId, TypeId, Equals, Comma, Question, Pipe, Asterisk, - LParen, RParen, LBrace, RBrace) = range(11) - - operator_table = { - '=': Equals, ',': Comma, '?': Question, '|': Pipe, '(': LParen, - ')': RParen, '*': Asterisk, '{': LBrace, '}': RBrace} - -Token = namedtuple('Token', 'kind value lineno') - -class ASDLSyntaxError(Exception): - def __init__(self, msg, lineno=None): - self.msg = msg - self.lineno = lineno or '' - - def __str__(self): - return 'Syntax error on line {0.lineno}: {0.msg}'.format(self) - -def tokenize_asdl(buf): - """Tokenize the given buffer. Yield Token objects.""" - for lineno, line in enumerate(buf.splitlines(), 1): - for m in re.finditer(r'\s*(\w+|--.*|.)', line.strip()): - c = m.group(1) - if c[0].isalpha(): - # Some kind of identifier - if c[0].isupper(): - yield Token(TokenKind.ConstructorId, c, lineno) - else: - yield Token(TokenKind.TypeId, c, lineno) - elif c[:2] == '--': - # Comment - break - else: - # Operators - try: - op_kind = TokenKind.operator_table[c] - except KeyError: - raise ASDLSyntaxError('Invalid operator %s' % c, lineno) - yield Token(op_kind, c, lineno) - -class ASDLParser: - """Parser for ASDL files. - - Create, then call the parse method on a buffer containing ASDL. - This is a simple recursive descent parser that uses tokenize_asdl for the - lexing. - """ - def __init__(self): - self._tokenizer = None - self.cur_token = None - - def parse(self, buf): - """Parse the ASDL in the buffer and return an AST with a Module root. - """ - self._tokenizer = tokenize_asdl(buf) - self._advance() - return self._parse_module() - - def _parse_module(self): - if self._at_keyword('module'): - self._advance() - else: - raise ASDLSyntaxError( - 'Expected "module" (found {})'.format(self.cur_token.value), - self.cur_token.lineno) - name = self._match(self._id_kinds) - self._match(TokenKind.LBrace) - defs = self._parse_definitions() - self._match(TokenKind.RBrace) - return Module(name, defs) - - def _parse_definitions(self): - defs = [] - while self.cur_token.kind == TokenKind.TypeId: - typename = self._advance() - self._match(TokenKind.Equals) - type = self._parse_type() - defs.append(Type(typename, type)) - return defs - - def _parse_type(self): - if self.cur_token.kind == TokenKind.LParen: - # If we see a (, it's a product - return self._parse_product() - else: - # Otherwise it's a sum. Look for ConstructorId - sumlist = [Constructor(self._match(TokenKind.ConstructorId), - self._parse_optional_fields())] - while self.cur_token.kind == TokenKind.Pipe: - # More constructors - self._advance() - sumlist.append(Constructor( - self._match(TokenKind.ConstructorId), - self._parse_optional_fields())) - return Sum(sumlist, self._parse_optional_attributes()) - - def _parse_product(self): - return Product(self._parse_fields(), self._parse_optional_attributes()) - - def _parse_fields(self): - fields = [] - self._match(TokenKind.LParen) - while self.cur_token.kind == TokenKind.TypeId: - typename = self._advance() - is_seq, is_opt = self._parse_optional_field_quantifier() - id = (self._advance() if self.cur_token.kind in self._id_kinds - else None) - fields.append(Field(typename, id, seq=is_seq, opt=is_opt)) - if self.cur_token.kind == TokenKind.RParen: - break - elif self.cur_token.kind == TokenKind.Comma: - self._advance() - self._match(TokenKind.RParen) - return fields - - def _parse_optional_fields(self): - if self.cur_token.kind == TokenKind.LParen: - return self._parse_fields() - else: - return None - - def _parse_optional_attributes(self): - if self._at_keyword('attributes'): - self._advance() - return self._parse_fields() - else: - return None - - def _parse_optional_field_quantifier(self): - is_seq, is_opt = False, False - if self.cur_token.kind == TokenKind.Asterisk: - is_seq = True - self._advance() - elif self.cur_token.kind == TokenKind.Question: - is_opt = True - self._advance() - return is_seq, is_opt - - def _advance(self): - """ Return the value of the current token and read the next one into - self.cur_token. - """ - cur_val = None if self.cur_token is None else self.cur_token.value - try: - self.cur_token = next(self._tokenizer) - except StopIteration: - self.cur_token = None - return cur_val - - _id_kinds = (TokenKind.ConstructorId, TokenKind.TypeId) - - def _match(self, kind): - """The 'match' primitive of RD parsers. - - * Verifies that the current token is of the given kind (kind can - be a tuple, in which the kind must match one of its members). - * Returns the value of the current token - * Reads in the next token - """ - if (isinstance(kind, tuple) and self.cur_token.kind in kind or - self.cur_token.kind == kind - ): - value = self.cur_token.value - self._advance() - return value - else: - raise ASDLSyntaxError( - 'Unmatched {} (found {})'.format(kind, self.cur_token.kind), - self.cur_token.lineno) - - def _at_keyword(self, keyword): - return (self.cur_token.kind == TokenKind.TypeId and - self.cur_token.value == keyword) diff --git a/compiler/ast/asdl_rs.py b/compiler/ast/asdl_rs.py deleted file mode 100755 index c9b819dae2..0000000000 --- a/compiler/ast/asdl_rs.py +++ /dev/null @@ -1,749 +0,0 @@ -#! /usr/bin/env python -"""Generate Rust code from an ASDL description.""" - -import sys -import json -import textwrap - -from argparse import ArgumentParser -from pathlib import Path - -import asdl - -TABSIZE = 4 -AUTOGEN_MESSAGE = "// File automatically generated by {}.\n" - -builtin_type_mapping = { - "identifier": "Ident", - "string": "String", - "int": "usize", - "constant": "Constant", -} -assert builtin_type_mapping.keys() == asdl.builtin_types - - -def get_rust_type(name): - """Return a string for the C name of the type. - - This function special cases the default types provided by asdl. - """ - if name in asdl.builtin_types: - return builtin_type_mapping[name] - elif name.islower(): - return "".join(part.capitalize() for part in name.split("_")) - else: - return name - - -def is_simple(sum): - """Return True if a sum is a simple. - - A sum is simple if its types have no fields, e.g. - unaryop = Invert | Not | UAdd | USub - """ - for t in sum.types: - if t.fields: - return False - return True - - -def asdl_of(name, obj): - if isinstance(obj, asdl.Product) or isinstance(obj, asdl.Constructor): - fields = ", ".join(map(str, obj.fields)) - if fields: - fields = "({})".format(fields) - return "{}{}".format(name, fields) - else: - if is_simple(obj): - types = " | ".join(type.name for type in obj.types) - else: - sep = "\n{}| ".format(" " * (len(name) + 1)) - types = sep.join(asdl_of(type.name, type) for type in obj.types) - return "{} = {}".format(name, types) - - -class EmitVisitor(asdl.VisitorBase): - """Visit that emits lines""" - - def __init__(self, file): - self.file = file - self.identifiers = set() - super(EmitVisitor, self).__init__() - - def emit_identifier(self, name): - name = str(name) - if name in self.identifiers: - return - self.emit("_Py_IDENTIFIER(%s);" % name, 0) - self.identifiers.add(name) - - def emit(self, line, depth): - if line: - line = (" " * TABSIZE * depth) + line - self.file.write(line + "\n") - - -class TypeInfo: - def __init__(self, name): - self.name = name - self.has_userdata = None - self.children = set() - self.boxed = False - self.product = False - - def __repr__(self): - return f"" - - def determine_userdata(self, typeinfo, stack): - if self.name in stack: - return None - stack.add(self.name) - for child, child_seq in self.children: - if child in asdl.builtin_types: - continue - childinfo = typeinfo[child] - child_has_userdata = childinfo.determine_userdata(typeinfo, stack) - if self.has_userdata is None and child_has_userdata is True: - self.has_userdata = True - - stack.remove(self.name) - return self.has_userdata - - -class FindUserdataTypesVisitor(asdl.VisitorBase): - def __init__(self, typeinfo): - self.typeinfo = typeinfo - super().__init__() - - def visitModule(self, mod): - for dfn in mod.dfns: - self.visit(dfn) - stack = set() - for info in self.typeinfo.values(): - info.determine_userdata(self.typeinfo, stack) - - def visitType(self, type): - self.typeinfo[type.name] = TypeInfo(type.name) - self.visit(type.value, type.name) - - def visitSum(self, sum, name): - info = self.typeinfo[name] - if is_simple(sum): - info.has_userdata = False - else: - if len(sum.types) > 1: - info.boxed = True - if sum.attributes: - # attributes means Located, which has the `custom: U` field - info.has_userdata = True - for variant in sum.types: - self.add_children(name, variant.fields) - - def visitProduct(self, product, name): - info = self.typeinfo[name] - if product.attributes: - # attributes means Located, which has the `custom: U` field - info.has_userdata = True - if len(product.fields) > 2: - info.boxed = True - info.product = True - self.add_children(name, product.fields) - - def add_children(self, name, fields): - self.typeinfo[name].children.update((field.type, field.seq) for field in fields) - - -def rust_field(field_name): - if field_name == "type": - return "type_" - else: - return field_name - - -class TypeInfoEmitVisitor(EmitVisitor): - def __init__(self, file, typeinfo): - self.typeinfo = typeinfo - super().__init__(file) - - def has_userdata(self, typ): - return self.typeinfo[typ].has_userdata - - def get_generics(self, typ, *generics): - if self.has_userdata(typ): - return [f"<{g}>" for g in generics] - else: - return ["" for g in generics] - - -class StructVisitor(TypeInfoEmitVisitor): - """Visitor to generate typedefs for AST.""" - - def visitModule(self, mod): - for dfn in mod.dfns: - self.visit(dfn) - - def visitType(self, type, depth=0): - self.visit(type.value, type.name, depth) - - def visitSum(self, sum, name, depth): - if is_simple(sum): - self.simple_sum(sum, name, depth) - else: - self.sum_with_constructors(sum, name, depth) - - def emit_attrs(self, depth): - self.emit("#[derive(Clone, Debug, PartialEq)]", depth) - - def simple_sum(self, sum, name, depth): - rustname = get_rust_type(name) - self.emit_attrs(depth) - self.emit(f"pub enum {rustname} {{", depth) - for variant in sum.types: - self.emit(f"{variant.name},", depth + 1) - self.emit("}", depth) - self.emit("", depth) - - def sum_with_constructors(self, sum, name, depth): - typeinfo = self.typeinfo[name] - generics, generics_applied = self.get_generics(name, "U = ()", "U") - enumname = rustname = get_rust_type(name) - # all the attributes right now are for location, so if it has attrs we - # can just wrap it in Located<> - if sum.attributes: - enumname = rustname + "Kind" - self.emit_attrs(depth) - self.emit(f"pub enum {enumname}{generics} {{", depth) - for t in sum.types: - self.visit(t, typeinfo, depth + 1) - self.emit("}", depth) - if sum.attributes: - self.emit( - f"pub type {rustname} = Located<{enumname}{generics_applied}, U>;", - depth, - ) - self.emit("", depth) - - def visitConstructor(self, cons, parent, depth): - if cons.fields: - self.emit(f"{cons.name} {{", depth) - for f in cons.fields: - self.visit(f, parent, "", depth + 1, cons.name) - self.emit("},", depth) - else: - self.emit(f"{cons.name},", depth) - - def visitField(self, field, parent, vis, depth, constructor=None): - typ = get_rust_type(field.type) - fieldtype = self.typeinfo.get(field.type) - if fieldtype and fieldtype.has_userdata: - typ = f"{typ}" - # don't box if we're doing Vec, but do box if we're doing Vec>> - if fieldtype and fieldtype.boxed and (not (parent.product or field.seq) or field.opt): - typ = f"Box<{typ}>" - if field.opt or ( - # When a dictionary literal contains dictionary unpacking (e.g., `{**d}`), - # the expression to be unpacked goes in `values` with a `None` at the corresponding - # position in `keys`. To handle this, the type of `keys` needs to be `Option>`. - constructor == "Dict" and field.name == "keys" - ): - typ = f"Option<{typ}>" - if field.seq: - typ = f"Vec<{typ}>" - name = rust_field(field.name) - self.emit(f"{vis}{name}: {typ},", depth) - - def visitProduct(self, product, name, depth): - typeinfo = self.typeinfo[name] - generics, generics_applied = self.get_generics(name, "U = ()", "U") - dataname = rustname = get_rust_type(name) - if product.attributes: - dataname = rustname + "Data" - self.emit_attrs(depth) - has_expr = any(f.type != "identifier" for f in product.fields) - if has_expr: - datadef = f"{dataname}{generics}" - else: - datadef = dataname - self.emit(f"pub struct {datadef} {{", depth) - for f in product.fields: - self.visit(f, typeinfo, "pub ", depth + 1) - self.emit("}", depth) - if product.attributes: - # attributes should just be location info - if not has_expr: - generics_applied = "" - self.emit( - f"pub type {rustname} = Located<{dataname}{generics_applied}, U>;", - depth, - ) - self.emit("", depth) - - -class FoldTraitDefVisitor(TypeInfoEmitVisitor): - def visitModule(self, mod, depth): - self.emit("pub trait Fold {", depth) - self.emit("type TargetU;", depth + 1) - self.emit("type Error;", depth + 1) - self.emit( - "fn map_user(&mut self, user: U) -> Result;", - depth + 2, - ) - for dfn in mod.dfns: - self.visit(dfn, depth + 2) - self.emit("}", depth) - - def visitType(self, type, depth): - name = type.name - apply_u, apply_target_u = self.get_generics(name, "U", "Self::TargetU") - enumname = get_rust_type(name) - self.emit( - f"fn fold_{name}(&mut self, node: {enumname}{apply_u}) -> Result<{enumname}{apply_target_u}, Self::Error> {{", - depth, - ) - self.emit(f"fold_{name}(self, node)", depth + 1) - self.emit("}", depth) - - -class FoldImplVisitor(TypeInfoEmitVisitor): - def visitModule(self, mod, depth): - self.emit( - "fn fold_located + ?Sized, T, MT>(folder: &mut F, node: Located, f: impl FnOnce(&mut F, T) -> Result) -> Result, F::Error> {", - depth, - ) - self.emit( - "Ok(Located { custom: folder.map_user(node.custom)?, location: node.location, end_location: node.end_location, node: f(folder, node.node)? })", - depth + 1, - ) - self.emit("}", depth) - for dfn in mod.dfns: - self.visit(dfn, depth) - - def visitType(self, type, depth=0): - self.visit(type.value, type.name, depth) - - def visitSum(self, sum, name, depth): - apply_t, apply_u, apply_target_u = self.get_generics( - name, "T", "U", "F::TargetU" - ) - enumname = get_rust_type(name) - is_located = bool(sum.attributes) - - self.emit(f"impl Foldable for {enumname}{apply_t} {{", depth) - self.emit(f"type Mapped = {enumname}{apply_u};", depth + 1) - self.emit( - "fn fold + ?Sized>(self, folder: &mut F) -> Result {", - depth + 1, - ) - self.emit(f"folder.fold_{name}(self)", depth + 2) - self.emit("}", depth + 1) - self.emit("}", depth) - - self.emit( - f"pub fn fold_{name} + ?Sized>(#[allow(unused)] folder: &mut F, node: {enumname}{apply_u}) -> Result<{enumname}{apply_target_u}, F::Error> {{", - depth, - ) - if is_located: - self.emit("fold_located(folder, node, |folder, node| {", depth) - enumname += "Kind" - self.emit("match node {", depth + 1) - for cons in sum.types: - fields_pattern = self.make_pattern(cons.fields) - self.emit( - f"{enumname}::{cons.name} {{ {fields_pattern} }} => {{", depth + 2 - ) - self.gen_construction(f"{enumname}::{cons.name}", cons.fields, depth + 3) - self.emit("}", depth + 2) - self.emit("}", depth + 1) - if is_located: - self.emit("})", depth) - self.emit("}", depth) - - def visitProduct(self, product, name, depth): - apply_t, apply_u, apply_target_u = self.get_generics( - name, "T", "U", "F::TargetU" - ) - structname = get_rust_type(name) - is_located = bool(product.attributes) - - self.emit(f"impl Foldable for {structname}{apply_t} {{", depth) - self.emit(f"type Mapped = {structname}{apply_u};", depth + 1) - self.emit( - "fn fold + ?Sized>(self, folder: &mut F) -> Result {", - depth + 1, - ) - self.emit(f"folder.fold_{name}(self)", depth + 2) - self.emit("}", depth + 1) - self.emit("}", depth) - - self.emit( - f"pub fn fold_{name} + ?Sized>(#[allow(unused)] folder: &mut F, node: {structname}{apply_u}) -> Result<{structname}{apply_target_u}, F::Error> {{", - depth, - ) - if is_located: - self.emit("fold_located(folder, node, |folder, node| {", depth) - structname += "Data" - fields_pattern = self.make_pattern(product.fields) - self.emit(f"let {structname} {{ {fields_pattern} }} = node;", depth + 1) - self.gen_construction(structname, product.fields, depth + 1) - if is_located: - self.emit("})", depth) - self.emit("}", depth) - - def make_pattern(self, fields): - return ",".join(rust_field(f.name) for f in fields) - - def gen_construction(self, cons_path, fields, depth): - self.emit(f"Ok({cons_path} {{", depth) - for field in fields: - name = rust_field(field.name) - self.emit(f"{name}: Foldable::fold({name}, folder)?,", depth + 1) - self.emit("})", depth) - - -class FoldModuleVisitor(TypeInfoEmitVisitor): - def visitModule(self, mod): - depth = 0 - self.emit('#[cfg(feature = "fold")]', depth) - self.emit("pub mod fold {", depth) - self.emit("use super::*;", depth + 1) - self.emit("use crate::fold_helpers::Foldable;", depth + 1) - FoldTraitDefVisitor(self.file, self.typeinfo).visit(mod, depth + 1) - FoldImplVisitor(self.file, self.typeinfo).visit(mod, depth + 1) - self.emit("}", depth) - - -class ClassDefVisitor(EmitVisitor): - def visitModule(self, mod): - for dfn in mod.dfns: - self.visit(dfn) - - def visitType(self, type, depth=0): - self.visit(type.value, type.name, depth) - - def visitSum(self, sum, name, depth): - structname = "NodeKind" + get_rust_type(name) - self.emit( - f'#[pyclass(module = "_ast", name = {json.dumps(name)}, base = "AstNode")]', - depth, - ) - self.emit(f"struct {structname};", depth) - self.emit("#[pyclass(flags(HAS_DICT, BASETYPE))]", depth) - self.emit(f"impl {structname} {{}}", depth) - for cons in sum.types: - self.visit(cons, sum.attributes, structname, depth) - - def visitConstructor(self, cons, attrs, base, depth): - self.gen_classdef(cons.name, cons.fields, attrs, depth, base) - - def visitProduct(self, product, name, depth): - self.gen_classdef(name, product.fields, product.attributes, depth) - - def gen_classdef(self, name, fields, attrs, depth, base="AstNode"): - structname = "Node" + get_rust_type(name) - self.emit( - f'#[pyclass(module = "_ast", name = {json.dumps(name)}, base = {json.dumps(base)})]', - depth, - ) - self.emit(f"struct {structname};", depth) - self.emit("#[pyclass(flags(HAS_DICT, BASETYPE))]", depth) - self.emit(f"impl {structname} {{", depth) - self.emit(f"#[extend_class]", depth + 1) - self.emit( - "fn extend_class_with_fields(ctx: &Context, class: &'static Py) {", - depth + 1, - ) - fields = ",".join( - f"ctx.new_str(ascii!({json.dumps(f.name)})).into()" for f in fields - ) - self.emit( - f"class.set_attr(identifier!(ctx, _fields), ctx.new_tuple(vec![{fields}]).into());", - depth + 2, - ) - attrs = ",".join( - f"ctx.new_str(ascii!({json.dumps(attr.name)})).into()" for attr in attrs - ) - self.emit( - f"class.set_attr(identifier!(ctx, _attributes), ctx.new_list(vec![{attrs}]).into());", - depth + 2, - ) - self.emit("}", depth + 1) - self.emit("}", depth) - - -class ExtendModuleVisitor(EmitVisitor): - def visitModule(self, mod): - depth = 0 - self.emit( - "pub fn extend_module_nodes(vm: &VirtualMachine, module: &Py) {", - depth, - ) - self.emit("extend_module!(vm, module, {", depth + 1) - for dfn in mod.dfns: - self.visit(dfn, depth + 2) - self.emit("})", depth + 1) - self.emit("}", depth) - - def visitType(self, type, depth): - self.visit(type.value, type.name, depth) - - def visitSum(self, sum, name, depth): - rust_name = get_rust_type(name) - self.emit( - f"{json.dumps(name)} => NodeKind{rust_name}::make_class(&vm.ctx),", depth - ) - for cons in sum.types: - self.visit(cons, depth) - - def visitConstructor(self, cons, depth): - self.gen_extension(cons.name, depth) - - def visitProduct(self, product, name, depth): - self.gen_extension(name, depth) - - def gen_extension(self, name, depth): - rust_name = get_rust_type(name) - self.emit(f"{json.dumps(name)} => Node{rust_name}::make_class(&vm.ctx),", depth) - - -class TraitImplVisitor(EmitVisitor): - def visitModule(self, mod): - for dfn in mod.dfns: - self.visit(dfn) - - def visitType(self, type, depth=0): - self.visit(type.value, type.name, depth) - - def visitSum(self, sum, name, depth): - enumname = get_rust_type(name) - if sum.attributes: - enumname += "Kind" - - self.emit(f"impl NamedNode for ast::{enumname} {{", depth) - self.emit(f"const NAME: &'static str = {json.dumps(name)};", depth + 1) - self.emit("}", depth) - self.emit(f"impl Node for ast::{enumname} {{", depth) - self.emit( - "fn ast_to_object(self, _vm: &VirtualMachine) -> PyObjectRef {", depth + 1 - ) - self.emit("match self {", depth + 2) - for variant in sum.types: - self.constructor_to_object(variant, enumname, depth + 3) - self.emit("}", depth + 2) - self.emit("}", depth + 1) - self.emit( - "fn ast_from_object(_vm: &VirtualMachine, _object: PyObjectRef) -> PyResult {", - depth + 1, - ) - self.gen_sum_fromobj(sum, name, enumname, depth + 2) - self.emit("}", depth + 1) - self.emit("}", depth) - - def constructor_to_object(self, cons, enumname, depth): - fields_pattern = self.make_pattern(cons.fields) - self.emit(f"ast::{enumname}::{cons.name} {{ {fields_pattern} }} => {{", depth) - self.make_node(cons.name, cons.fields, depth + 1) - self.emit("}", depth) - - def visitProduct(self, product, name, depth): - structname = get_rust_type(name) - if product.attributes: - structname += "Data" - - self.emit(f"impl NamedNode for ast::{structname} {{", depth) - self.emit(f"const NAME: &'static str = {json.dumps(name)};", depth + 1) - self.emit("}", depth) - self.emit(f"impl Node for ast::{structname} {{", depth) - self.emit( - "fn ast_to_object(self, _vm: &VirtualMachine) -> PyObjectRef {", depth + 1 - ) - fields_pattern = self.make_pattern(product.fields) - self.emit(f"let ast::{structname} {{ {fields_pattern} }} = self;", depth + 2) - self.make_node(name, product.fields, depth + 2) - self.emit("}", depth + 1) - self.emit( - "fn ast_from_object(_vm: &VirtualMachine, _object: PyObjectRef) -> PyResult {", - depth + 1, - ) - self.gen_product_fromobj(product, name, structname, depth + 2) - self.emit("}", depth + 1) - self.emit("}", depth) - - def make_node(self, variant, fields, depth): - rust_variant = get_rust_type(variant) - self.emit( - f"let _node = AstNode.into_ref_with_type(_vm, Node{rust_variant}::static_type().to_owned()).unwrap();", - depth, - ) - if fields: - self.emit("let _dict = _node.as_object().dict().unwrap();", depth) - for f in fields: - self.emit( - f"_dict.set_item({json.dumps(f.name)}, {rust_field(f.name)}.ast_to_object(_vm), _vm).unwrap();", - depth, - ) - self.emit("_node.into()", depth) - - def make_pattern(self, fields): - return ",".join(rust_field(f.name) for f in fields) - - def gen_sum_fromobj(self, sum, sumname, enumname, depth): - if sum.attributes: - self.extract_location(sumname, depth) - - self.emit("let _cls = _object.class();", depth) - self.emit("Ok(", depth) - for cons in sum.types: - self.emit(f"if _cls.is(Node{cons.name}::static_type()) {{", depth) - self.gen_construction(f"{enumname}::{cons.name}", cons, sumname, depth + 1) - self.emit("} else", depth) - - self.emit("{", depth) - msg = f'format!("expected some sort of {sumname}, but got {{}}",_object.repr(_vm)?)' - self.emit(f"return Err(_vm.new_type_error({msg}));", depth + 1) - self.emit("})", depth) - - def gen_product_fromobj(self, product, prodname, structname, depth): - if product.attributes: - self.extract_location(prodname, depth) - - self.emit("Ok(", depth) - self.gen_construction(structname, product, prodname, depth + 1) - self.emit(")", depth) - - def gen_construction(self, cons_path, cons, name, depth): - self.emit(f"ast::{cons_path} {{", depth) - for field in cons.fields: - self.emit( - f"{rust_field(field.name)}: {self.decode_field(field, name)},", - depth + 1, - ) - self.emit("}", depth) - - def extract_location(self, typename, depth): - row = self.decode_field(asdl.Field("int", "lineno"), typename) - column = self.decode_field(asdl.Field("int", "col_offset"), typename) - self.emit(f"let _location = ast::Location::new({row}, {column});", depth) - - def decode_field(self, field, typename): - name = json.dumps(field.name) - if field.opt and not field.seq: - return f"get_node_field_opt(_vm, &_object, {name})?.map(|obj| Node::ast_from_object(_vm, obj)).transpose()?" - else: - return f"Node::ast_from_object(_vm, get_node_field(_vm, &_object, {name}, {json.dumps(typename)})?)?" - - -class ChainOfVisitors: - def __init__(self, *visitors): - self.visitors = visitors - - def visit(self, object): - for v in self.visitors: - v.visit(object) - v.emit("", 0) - - -def write_ast_def(mod, typeinfo, f): - f.write( - textwrap.dedent( - """ - #![allow(clippy::derive_partial_eq_without_eq)] - - pub use crate::constant::*; - pub use crate::Location; - - type Ident = String; - \n - """ - ) - ) - StructVisitor(f, typeinfo).emit_attrs(0) - f.write( - textwrap.dedent( - """ - pub struct Located { - pub location: Location, - pub end_location: Option, - pub custom: U, - pub node: T, - } - - impl Located { - pub fn new(location: Location, end_location: Location, node: T) -> Self { - Self { location, end_location: Some(end_location), custom: (), node } - } - - pub const fn start(&self) -> Location { - self.location - } - - /// Returns the node's [`end_location`](Located::end_location) or [`location`](Located::start) if - /// [`end_location`](Located::end_location) is `None`. - pub fn end(&self) -> Location { - self.end_location.unwrap_or(self.location) - } - } - - impl std::ops::Deref for Located { - type Target = T; - - fn deref(&self) -> &Self::Target { - &self.node - } - } - \n - """.lstrip() - ) - ) - - c = ChainOfVisitors(StructVisitor(f, typeinfo), FoldModuleVisitor(f, typeinfo)) - c.visit(mod) - - -def write_ast_mod(mod, f): - f.write( - textwrap.dedent( - """ - #![allow(clippy::all)] - - use super::*; - use crate::common::ascii; - - """ - ) - ) - - c = ChainOfVisitors(ClassDefVisitor(f), TraitImplVisitor(f), ExtendModuleVisitor(f)) - c.visit(mod) - - -def main(input_filename, ast_mod_filename, ast_def_filename, dump_module=False): - auto_gen_msg = AUTOGEN_MESSAGE.format("/".join(Path(__file__).parts[-2:])) - mod = asdl.parse(input_filename) - if dump_module: - print("Parsed Module:") - print(mod) - if not asdl.check(mod): - sys.exit(1) - - typeinfo = {} - FindUserdataTypesVisitor(typeinfo).visit(mod) - - with ast_def_filename.open("w") as def_file, ast_mod_filename.open("w") as mod_file: - def_file.write(auto_gen_msg) - write_ast_def(mod, typeinfo, def_file) - - mod_file.write(auto_gen_msg) - write_ast_mod(mod, mod_file) - - print(f"{ast_def_filename}, {ast_mod_filename} regenerated.") - - -if __name__ == "__main__": - parser = ArgumentParser() - parser.add_argument("input_file", type=Path) - parser.add_argument("-M", "--mod-file", type=Path, required=True) - parser.add_argument("-D", "--def-file", type=Path, required=True) - parser.add_argument("-d", "--dump-module", action="store_true") - - args = parser.parse_args() - main(args.input_file, args.mod_file, args.def_file, args.dump_module) diff --git a/compiler/ast/src/ast_gen.rs b/compiler/ast/src/ast_gen.rs deleted file mode 100644 index 6771dd0eb5..0000000000 --- a/compiler/ast/src/ast_gen.rs +++ /dev/null @@ -1,1320 +0,0 @@ -// File automatically generated by ast/asdl_rs.py. - -#![allow(clippy::derive_partial_eq_without_eq)] - -pub use crate::constant::*; -pub use crate::Location; - -type Ident = String; - -#[derive(Clone, Debug, PartialEq)] -pub struct Located { - pub location: Location, - pub end_location: Option, - pub custom: U, - pub node: T, -} - -impl Located { - pub fn new(location: Location, end_location: Location, node: T) -> Self { - Self { - location, - end_location: Some(end_location), - custom: (), - node, - } - } - - pub const fn start(&self) -> Location { - self.location - } - - /// Returns the node's [`end_location`](Located::end_location) or [`location`](Located::start) if - /// [`end_location`](Located::end_location) is `None`. - pub fn end(&self) -> Location { - self.end_location.unwrap_or(self.location) - } -} - -impl std::ops::Deref for Located { - type Target = T; - - fn deref(&self) -> &Self::Target { - &self.node - } -} - -#[derive(Clone, Debug, PartialEq)] -pub enum Mod { - Module { - body: Vec>, - type_ignores: Vec, - }, - Interactive { - body: Vec>, - }, - Expression { - body: Box>, - }, - FunctionType { - argtypes: Vec>, - returns: Box>, - }, -} - -#[derive(Clone, Debug, PartialEq)] -pub enum StmtKind { - FunctionDef { - name: Ident, - args: Box>, - body: Vec>, - decorator_list: Vec>, - returns: Option>>, - type_comment: Option, - }, - AsyncFunctionDef { - name: Ident, - args: Box>, - body: Vec>, - decorator_list: Vec>, - returns: Option>>, - type_comment: Option, - }, - ClassDef { - name: Ident, - bases: Vec>, - keywords: Vec>, - body: Vec>, - decorator_list: Vec>, - }, - Return { - value: Option>>, - }, - Delete { - targets: Vec>, - }, - Assign { - targets: Vec>, - value: Box>, - type_comment: Option, - }, - AugAssign { - target: Box>, - op: Operator, - value: Box>, - }, - AnnAssign { - target: Box>, - annotation: Box>, - value: Option>>, - simple: usize, - }, - For { - target: Box>, - iter: Box>, - body: Vec>, - orelse: Vec>, - type_comment: Option, - }, - AsyncFor { - target: Box>, - iter: Box>, - body: Vec>, - orelse: Vec>, - type_comment: Option, - }, - While { - test: Box>, - body: Vec>, - orelse: Vec>, - }, - If { - test: Box>, - body: Vec>, - orelse: Vec>, - }, - With { - items: Vec>, - body: Vec>, - type_comment: Option, - }, - AsyncWith { - items: Vec>, - body: Vec>, - type_comment: Option, - }, - Match { - subject: Box>, - cases: Vec>, - }, - Raise { - exc: Option>>, - cause: Option>>, - }, - Try { - body: Vec>, - handlers: Vec>, - orelse: Vec>, - finalbody: Vec>, - }, - TryStar { - body: Vec>, - handlers: Vec>, - orelse: Vec>, - finalbody: Vec>, - }, - Assert { - test: Box>, - msg: Option>>, - }, - Import { - names: Vec>, - }, - ImportFrom { - module: Option, - names: Vec>, - level: Option, - }, - Global { - names: Vec, - }, - Nonlocal { - names: Vec, - }, - Expr { - value: Box>, - }, - Pass, - Break, - Continue, -} -pub type Stmt = Located, U>; - -#[derive(Clone, Debug, PartialEq)] -pub enum ExprKind { - BoolOp { - op: Boolop, - values: Vec>, - }, - NamedExpr { - target: Box>, - value: Box>, - }, - BinOp { - left: Box>, - op: Operator, - right: Box>, - }, - UnaryOp { - op: Unaryop, - operand: Box>, - }, - Lambda { - args: Box>, - body: Box>, - }, - IfExp { - test: Box>, - body: Box>, - orelse: Box>, - }, - Dict { - keys: Vec>>, - values: Vec>, - }, - Set { - elts: Vec>, - }, - ListComp { - elt: Box>, - generators: Vec>, - }, - SetComp { - elt: Box>, - generators: Vec>, - }, - DictComp { - key: Box>, - value: Box>, - generators: Vec>, - }, - GeneratorExp { - elt: Box>, - generators: Vec>, - }, - Await { - value: Box>, - }, - Yield { - value: Option>>, - }, - YieldFrom { - value: Box>, - }, - Compare { - left: Box>, - ops: Vec, - comparators: Vec>, - }, - Call { - func: Box>, - args: Vec>, - keywords: Vec>, - }, - FormattedValue { - value: Box>, - conversion: usize, - format_spec: Option>>, - }, - JoinedStr { - values: Vec>, - }, - Constant { - value: Constant, - kind: Option, - }, - Attribute { - value: Box>, - attr: Ident, - ctx: ExprContext, - }, - Subscript { - value: Box>, - slice: Box>, - ctx: ExprContext, - }, - Starred { - value: Box>, - ctx: ExprContext, - }, - Name { - id: Ident, - ctx: ExprContext, - }, - List { - elts: Vec>, - ctx: ExprContext, - }, - Tuple { - elts: Vec>, - ctx: ExprContext, - }, - Slice { - lower: Option>>, - upper: Option>>, - step: Option>>, - }, -} -pub type Expr = Located, U>; - -#[derive(Clone, Debug, PartialEq)] -pub enum ExprContext { - Load, - Store, - Del, -} - -#[derive(Clone, Debug, PartialEq)] -pub enum Boolop { - And, - Or, -} - -#[derive(Clone, Debug, PartialEq)] -pub enum Operator { - Add, - Sub, - Mult, - MatMult, - Div, - Mod, - Pow, - LShift, - RShift, - BitOr, - BitXor, - BitAnd, - FloorDiv, -} - -#[derive(Clone, Debug, PartialEq)] -pub enum Unaryop { - Invert, - Not, - UAdd, - USub, -} - -#[derive(Clone, Debug, PartialEq)] -pub enum Cmpop { - Eq, - NotEq, - Lt, - LtE, - Gt, - GtE, - Is, - IsNot, - In, - NotIn, -} - -#[derive(Clone, Debug, PartialEq)] -pub struct Comprehension { - pub target: Expr, - pub iter: Expr, - pub ifs: Vec>, - pub is_async: usize, -} - -#[derive(Clone, Debug, PartialEq)] -pub enum ExcepthandlerKind { - ExceptHandler { - type_: Option>>, - name: Option, - body: Vec>, - }, -} -pub type Excepthandler = Located, U>; - -#[derive(Clone, Debug, PartialEq)] -pub struct Arguments { - pub posonlyargs: Vec>, - pub args: Vec>, - pub vararg: Option>>, - pub kwonlyargs: Vec>, - pub kw_defaults: Vec>, - pub kwarg: Option>>, - pub defaults: Vec>, -} - -#[derive(Clone, Debug, PartialEq)] -pub struct ArgData { - pub arg: Ident, - pub annotation: Option>>, - pub type_comment: Option, -} -pub type Arg = Located, U>; - -#[derive(Clone, Debug, PartialEq)] -pub struct KeywordData { - pub arg: Option, - pub value: Expr, -} -pub type Keyword = Located, U>; - -#[derive(Clone, Debug, PartialEq)] -pub struct AliasData { - pub name: Ident, - pub asname: Option, -} -pub type Alias = Located; - -#[derive(Clone, Debug, PartialEq)] -pub struct Withitem { - pub context_expr: Expr, - pub optional_vars: Option>>, -} - -#[derive(Clone, Debug, PartialEq)] -pub struct MatchCase { - pub pattern: Pattern, - pub guard: Option>>, - pub body: Vec>, -} - -#[derive(Clone, Debug, PartialEq)] -pub enum PatternKind { - MatchValue { - value: Box>, - }, - MatchSingleton { - value: Constant, - }, - MatchSequence { - patterns: Vec>, - }, - MatchMapping { - keys: Vec>, - patterns: Vec>, - rest: Option, - }, - MatchClass { - cls: Box>, - patterns: Vec>, - kwd_attrs: Vec, - kwd_patterns: Vec>, - }, - MatchStar { - name: Option, - }, - MatchAs { - pattern: Option>>, - name: Option, - }, - MatchOr { - patterns: Vec>, - }, -} -pub type Pattern = Located, U>; - -#[derive(Clone, Debug, PartialEq)] -pub enum TypeIgnore { - TypeIgnore { lineno: usize, tag: String }, -} - -#[cfg(feature = "fold")] -pub mod fold { - use super::*; - use crate::fold_helpers::Foldable; - pub trait Fold { - type TargetU; - type Error; - fn map_user(&mut self, user: U) -> Result; - fn fold_mod(&mut self, node: Mod) -> Result, Self::Error> { - fold_mod(self, node) - } - fn fold_stmt(&mut self, node: Stmt) -> Result, Self::Error> { - fold_stmt(self, node) - } - fn fold_expr(&mut self, node: Expr) -> Result, Self::Error> { - fold_expr(self, node) - } - fn fold_expr_context(&mut self, node: ExprContext) -> Result { - fold_expr_context(self, node) - } - fn fold_boolop(&mut self, node: Boolop) -> Result { - fold_boolop(self, node) - } - fn fold_operator(&mut self, node: Operator) -> Result { - fold_operator(self, node) - } - fn fold_unaryop(&mut self, node: Unaryop) -> Result { - fold_unaryop(self, node) - } - fn fold_cmpop(&mut self, node: Cmpop) -> Result { - fold_cmpop(self, node) - } - fn fold_comprehension( - &mut self, - node: Comprehension, - ) -> Result, Self::Error> { - fold_comprehension(self, node) - } - fn fold_excepthandler( - &mut self, - node: Excepthandler, - ) -> Result, Self::Error> { - fold_excepthandler(self, node) - } - fn fold_arguments( - &mut self, - node: Arguments, - ) -> Result, Self::Error> { - fold_arguments(self, node) - } - fn fold_arg(&mut self, node: Arg) -> Result, Self::Error> { - fold_arg(self, node) - } - fn fold_keyword( - &mut self, - node: Keyword, - ) -> Result, Self::Error> { - fold_keyword(self, node) - } - fn fold_alias(&mut self, node: Alias) -> Result, Self::Error> { - fold_alias(self, node) - } - fn fold_withitem( - &mut self, - node: Withitem, - ) -> Result, Self::Error> { - fold_withitem(self, node) - } - fn fold_match_case( - &mut self, - node: MatchCase, - ) -> Result, Self::Error> { - fold_match_case(self, node) - } - fn fold_pattern( - &mut self, - node: Pattern, - ) -> Result, Self::Error> { - fold_pattern(self, node) - } - fn fold_type_ignore(&mut self, node: TypeIgnore) -> Result { - fold_type_ignore(self, node) - } - } - fn fold_located + ?Sized, T, MT>( - folder: &mut F, - node: Located, - f: impl FnOnce(&mut F, T) -> Result, - ) -> Result, F::Error> { - Ok(Located { - custom: folder.map_user(node.custom)?, - location: node.location, - end_location: node.end_location, - node: f(folder, node.node)?, - }) - } - impl Foldable for Mod { - type Mapped = Mod; - fn fold + ?Sized>( - self, - folder: &mut F, - ) -> Result { - folder.fold_mod(self) - } - } - pub fn fold_mod + ?Sized>( - #[allow(unused)] folder: &mut F, - node: Mod, - ) -> Result, F::Error> { - match node { - Mod::Module { body, type_ignores } => Ok(Mod::Module { - body: Foldable::fold(body, folder)?, - type_ignores: Foldable::fold(type_ignores, folder)?, - }), - Mod::Interactive { body } => Ok(Mod::Interactive { - body: Foldable::fold(body, folder)?, - }), - Mod::Expression { body } => Ok(Mod::Expression { - body: Foldable::fold(body, folder)?, - }), - Mod::FunctionType { argtypes, returns } => Ok(Mod::FunctionType { - argtypes: Foldable::fold(argtypes, folder)?, - returns: Foldable::fold(returns, folder)?, - }), - } - } - impl Foldable for Stmt { - type Mapped = Stmt; - fn fold + ?Sized>( - self, - folder: &mut F, - ) -> Result { - folder.fold_stmt(self) - } - } - pub fn fold_stmt + ?Sized>( - #[allow(unused)] folder: &mut F, - node: Stmt, - ) -> Result, F::Error> { - fold_located(folder, node, |folder, node| match node { - StmtKind::FunctionDef { - name, - args, - body, - decorator_list, - returns, - type_comment, - } => Ok(StmtKind::FunctionDef { - name: Foldable::fold(name, folder)?, - args: Foldable::fold(args, folder)?, - body: Foldable::fold(body, folder)?, - decorator_list: Foldable::fold(decorator_list, folder)?, - returns: Foldable::fold(returns, folder)?, - type_comment: Foldable::fold(type_comment, folder)?, - }), - StmtKind::AsyncFunctionDef { - name, - args, - body, - decorator_list, - returns, - type_comment, - } => Ok(StmtKind::AsyncFunctionDef { - name: Foldable::fold(name, folder)?, - args: Foldable::fold(args, folder)?, - body: Foldable::fold(body, folder)?, - decorator_list: Foldable::fold(decorator_list, folder)?, - returns: Foldable::fold(returns, folder)?, - type_comment: Foldable::fold(type_comment, folder)?, - }), - StmtKind::ClassDef { - name, - bases, - keywords, - body, - decorator_list, - } => Ok(StmtKind::ClassDef { - name: Foldable::fold(name, folder)?, - bases: Foldable::fold(bases, folder)?, - keywords: Foldable::fold(keywords, folder)?, - body: Foldable::fold(body, folder)?, - decorator_list: Foldable::fold(decorator_list, folder)?, - }), - StmtKind::Return { value } => Ok(StmtKind::Return { - value: Foldable::fold(value, folder)?, - }), - StmtKind::Delete { targets } => Ok(StmtKind::Delete { - targets: Foldable::fold(targets, folder)?, - }), - StmtKind::Assign { - targets, - value, - type_comment, - } => Ok(StmtKind::Assign { - targets: Foldable::fold(targets, folder)?, - value: Foldable::fold(value, folder)?, - type_comment: Foldable::fold(type_comment, folder)?, - }), - StmtKind::AugAssign { target, op, value } => Ok(StmtKind::AugAssign { - target: Foldable::fold(target, folder)?, - op: Foldable::fold(op, folder)?, - value: Foldable::fold(value, folder)?, - }), - StmtKind::AnnAssign { - target, - annotation, - value, - simple, - } => Ok(StmtKind::AnnAssign { - target: Foldable::fold(target, folder)?, - annotation: Foldable::fold(annotation, folder)?, - value: Foldable::fold(value, folder)?, - simple: Foldable::fold(simple, folder)?, - }), - StmtKind::For { - target, - iter, - body, - orelse, - type_comment, - } => Ok(StmtKind::For { - target: Foldable::fold(target, folder)?, - iter: Foldable::fold(iter, folder)?, - body: Foldable::fold(body, folder)?, - orelse: Foldable::fold(orelse, folder)?, - type_comment: Foldable::fold(type_comment, folder)?, - }), - StmtKind::AsyncFor { - target, - iter, - body, - orelse, - type_comment, - } => Ok(StmtKind::AsyncFor { - target: Foldable::fold(target, folder)?, - iter: Foldable::fold(iter, folder)?, - body: Foldable::fold(body, folder)?, - orelse: Foldable::fold(orelse, folder)?, - type_comment: Foldable::fold(type_comment, folder)?, - }), - StmtKind::While { test, body, orelse } => Ok(StmtKind::While { - test: Foldable::fold(test, folder)?, - body: Foldable::fold(body, folder)?, - orelse: Foldable::fold(orelse, folder)?, - }), - StmtKind::If { test, body, orelse } => Ok(StmtKind::If { - test: Foldable::fold(test, folder)?, - body: Foldable::fold(body, folder)?, - orelse: Foldable::fold(orelse, folder)?, - }), - StmtKind::With { - items, - body, - type_comment, - } => Ok(StmtKind::With { - items: Foldable::fold(items, folder)?, - body: Foldable::fold(body, folder)?, - type_comment: Foldable::fold(type_comment, folder)?, - }), - StmtKind::AsyncWith { - items, - body, - type_comment, - } => Ok(StmtKind::AsyncWith { - items: Foldable::fold(items, folder)?, - body: Foldable::fold(body, folder)?, - type_comment: Foldable::fold(type_comment, folder)?, - }), - StmtKind::Match { subject, cases } => Ok(StmtKind::Match { - subject: Foldable::fold(subject, folder)?, - cases: Foldable::fold(cases, folder)?, - }), - StmtKind::Raise { exc, cause } => Ok(StmtKind::Raise { - exc: Foldable::fold(exc, folder)?, - cause: Foldable::fold(cause, folder)?, - }), - StmtKind::Try { - body, - handlers, - orelse, - finalbody, - } => Ok(StmtKind::Try { - body: Foldable::fold(body, folder)?, - handlers: Foldable::fold(handlers, folder)?, - orelse: Foldable::fold(orelse, folder)?, - finalbody: Foldable::fold(finalbody, folder)?, - }), - StmtKind::TryStar { - body, - handlers, - orelse, - finalbody, - } => Ok(StmtKind::TryStar { - body: Foldable::fold(body, folder)?, - handlers: Foldable::fold(handlers, folder)?, - orelse: Foldable::fold(orelse, folder)?, - finalbody: Foldable::fold(finalbody, folder)?, - }), - StmtKind::Assert { test, msg } => Ok(StmtKind::Assert { - test: Foldable::fold(test, folder)?, - msg: Foldable::fold(msg, folder)?, - }), - StmtKind::Import { names } => Ok(StmtKind::Import { - names: Foldable::fold(names, folder)?, - }), - StmtKind::ImportFrom { - module, - names, - level, - } => Ok(StmtKind::ImportFrom { - module: Foldable::fold(module, folder)?, - names: Foldable::fold(names, folder)?, - level: Foldable::fold(level, folder)?, - }), - StmtKind::Global { names } => Ok(StmtKind::Global { - names: Foldable::fold(names, folder)?, - }), - StmtKind::Nonlocal { names } => Ok(StmtKind::Nonlocal { - names: Foldable::fold(names, folder)?, - }), - StmtKind::Expr { value } => Ok(StmtKind::Expr { - value: Foldable::fold(value, folder)?, - }), - StmtKind::Pass {} => Ok(StmtKind::Pass {}), - StmtKind::Break {} => Ok(StmtKind::Break {}), - StmtKind::Continue {} => Ok(StmtKind::Continue {}), - }) - } - impl Foldable for Expr { - type Mapped = Expr; - fn fold + ?Sized>( - self, - folder: &mut F, - ) -> Result { - folder.fold_expr(self) - } - } - pub fn fold_expr + ?Sized>( - #[allow(unused)] folder: &mut F, - node: Expr, - ) -> Result, F::Error> { - fold_located(folder, node, |folder, node| match node { - ExprKind::BoolOp { op, values } => Ok(ExprKind::BoolOp { - op: Foldable::fold(op, folder)?, - values: Foldable::fold(values, folder)?, - }), - ExprKind::NamedExpr { target, value } => Ok(ExprKind::NamedExpr { - target: Foldable::fold(target, folder)?, - value: Foldable::fold(value, folder)?, - }), - ExprKind::BinOp { left, op, right } => Ok(ExprKind::BinOp { - left: Foldable::fold(left, folder)?, - op: Foldable::fold(op, folder)?, - right: Foldable::fold(right, folder)?, - }), - ExprKind::UnaryOp { op, operand } => Ok(ExprKind::UnaryOp { - op: Foldable::fold(op, folder)?, - operand: Foldable::fold(operand, folder)?, - }), - ExprKind::Lambda { args, body } => Ok(ExprKind::Lambda { - args: Foldable::fold(args, folder)?, - body: Foldable::fold(body, folder)?, - }), - ExprKind::IfExp { test, body, orelse } => Ok(ExprKind::IfExp { - test: Foldable::fold(test, folder)?, - body: Foldable::fold(body, folder)?, - orelse: Foldable::fold(orelse, folder)?, - }), - ExprKind::Dict { keys, values } => Ok(ExprKind::Dict { - keys: Foldable::fold(keys, folder)?, - values: Foldable::fold(values, folder)?, - }), - ExprKind::Set { elts } => Ok(ExprKind::Set { - elts: Foldable::fold(elts, folder)?, - }), - ExprKind::ListComp { elt, generators } => Ok(ExprKind::ListComp { - elt: Foldable::fold(elt, folder)?, - generators: Foldable::fold(generators, folder)?, - }), - ExprKind::SetComp { elt, generators } => Ok(ExprKind::SetComp { - elt: Foldable::fold(elt, folder)?, - generators: Foldable::fold(generators, folder)?, - }), - ExprKind::DictComp { - key, - value, - generators, - } => Ok(ExprKind::DictComp { - key: Foldable::fold(key, folder)?, - value: Foldable::fold(value, folder)?, - generators: Foldable::fold(generators, folder)?, - }), - ExprKind::GeneratorExp { elt, generators } => Ok(ExprKind::GeneratorExp { - elt: Foldable::fold(elt, folder)?, - generators: Foldable::fold(generators, folder)?, - }), - ExprKind::Await { value } => Ok(ExprKind::Await { - value: Foldable::fold(value, folder)?, - }), - ExprKind::Yield { value } => Ok(ExprKind::Yield { - value: Foldable::fold(value, folder)?, - }), - ExprKind::YieldFrom { value } => Ok(ExprKind::YieldFrom { - value: Foldable::fold(value, folder)?, - }), - ExprKind::Compare { - left, - ops, - comparators, - } => Ok(ExprKind::Compare { - left: Foldable::fold(left, folder)?, - ops: Foldable::fold(ops, folder)?, - comparators: Foldable::fold(comparators, folder)?, - }), - ExprKind::Call { - func, - args, - keywords, - } => Ok(ExprKind::Call { - func: Foldable::fold(func, folder)?, - args: Foldable::fold(args, folder)?, - keywords: Foldable::fold(keywords, folder)?, - }), - ExprKind::FormattedValue { - value, - conversion, - format_spec, - } => Ok(ExprKind::FormattedValue { - value: Foldable::fold(value, folder)?, - conversion: Foldable::fold(conversion, folder)?, - format_spec: Foldable::fold(format_spec, folder)?, - }), - ExprKind::JoinedStr { values } => Ok(ExprKind::JoinedStr { - values: Foldable::fold(values, folder)?, - }), - ExprKind::Constant { value, kind } => Ok(ExprKind::Constant { - value: Foldable::fold(value, folder)?, - kind: Foldable::fold(kind, folder)?, - }), - ExprKind::Attribute { value, attr, ctx } => Ok(ExprKind::Attribute { - value: Foldable::fold(value, folder)?, - attr: Foldable::fold(attr, folder)?, - ctx: Foldable::fold(ctx, folder)?, - }), - ExprKind::Subscript { value, slice, ctx } => Ok(ExprKind::Subscript { - value: Foldable::fold(value, folder)?, - slice: Foldable::fold(slice, folder)?, - ctx: Foldable::fold(ctx, folder)?, - }), - ExprKind::Starred { value, ctx } => Ok(ExprKind::Starred { - value: Foldable::fold(value, folder)?, - ctx: Foldable::fold(ctx, folder)?, - }), - ExprKind::Name { id, ctx } => Ok(ExprKind::Name { - id: Foldable::fold(id, folder)?, - ctx: Foldable::fold(ctx, folder)?, - }), - ExprKind::List { elts, ctx } => Ok(ExprKind::List { - elts: Foldable::fold(elts, folder)?, - ctx: Foldable::fold(ctx, folder)?, - }), - ExprKind::Tuple { elts, ctx } => Ok(ExprKind::Tuple { - elts: Foldable::fold(elts, folder)?, - ctx: Foldable::fold(ctx, folder)?, - }), - ExprKind::Slice { lower, upper, step } => Ok(ExprKind::Slice { - lower: Foldable::fold(lower, folder)?, - upper: Foldable::fold(upper, folder)?, - step: Foldable::fold(step, folder)?, - }), - }) - } - impl Foldable for ExprContext { - type Mapped = ExprContext; - fn fold + ?Sized>( - self, - folder: &mut F, - ) -> Result { - folder.fold_expr_context(self) - } - } - pub fn fold_expr_context + ?Sized>( - #[allow(unused)] folder: &mut F, - node: ExprContext, - ) -> Result { - match node { - ExprContext::Load {} => Ok(ExprContext::Load {}), - ExprContext::Store {} => Ok(ExprContext::Store {}), - ExprContext::Del {} => Ok(ExprContext::Del {}), - } - } - impl Foldable for Boolop { - type Mapped = Boolop; - fn fold + ?Sized>( - self, - folder: &mut F, - ) -> Result { - folder.fold_boolop(self) - } - } - pub fn fold_boolop + ?Sized>( - #[allow(unused)] folder: &mut F, - node: Boolop, - ) -> Result { - match node { - Boolop::And {} => Ok(Boolop::And {}), - Boolop::Or {} => Ok(Boolop::Or {}), - } - } - impl Foldable for Operator { - type Mapped = Operator; - fn fold + ?Sized>( - self, - folder: &mut F, - ) -> Result { - folder.fold_operator(self) - } - } - pub fn fold_operator + ?Sized>( - #[allow(unused)] folder: &mut F, - node: Operator, - ) -> Result { - match node { - Operator::Add {} => Ok(Operator::Add {}), - Operator::Sub {} => Ok(Operator::Sub {}), - Operator::Mult {} => Ok(Operator::Mult {}), - Operator::MatMult {} => Ok(Operator::MatMult {}), - Operator::Div {} => Ok(Operator::Div {}), - Operator::Mod {} => Ok(Operator::Mod {}), - Operator::Pow {} => Ok(Operator::Pow {}), - Operator::LShift {} => Ok(Operator::LShift {}), - Operator::RShift {} => Ok(Operator::RShift {}), - Operator::BitOr {} => Ok(Operator::BitOr {}), - Operator::BitXor {} => Ok(Operator::BitXor {}), - Operator::BitAnd {} => Ok(Operator::BitAnd {}), - Operator::FloorDiv {} => Ok(Operator::FloorDiv {}), - } - } - impl Foldable for Unaryop { - type Mapped = Unaryop; - fn fold + ?Sized>( - self, - folder: &mut F, - ) -> Result { - folder.fold_unaryop(self) - } - } - pub fn fold_unaryop + ?Sized>( - #[allow(unused)] folder: &mut F, - node: Unaryop, - ) -> Result { - match node { - Unaryop::Invert {} => Ok(Unaryop::Invert {}), - Unaryop::Not {} => Ok(Unaryop::Not {}), - Unaryop::UAdd {} => Ok(Unaryop::UAdd {}), - Unaryop::USub {} => Ok(Unaryop::USub {}), - } - } - impl Foldable for Cmpop { - type Mapped = Cmpop; - fn fold + ?Sized>( - self, - folder: &mut F, - ) -> Result { - folder.fold_cmpop(self) - } - } - pub fn fold_cmpop + ?Sized>( - #[allow(unused)] folder: &mut F, - node: Cmpop, - ) -> Result { - match node { - Cmpop::Eq {} => Ok(Cmpop::Eq {}), - Cmpop::NotEq {} => Ok(Cmpop::NotEq {}), - Cmpop::Lt {} => Ok(Cmpop::Lt {}), - Cmpop::LtE {} => Ok(Cmpop::LtE {}), - Cmpop::Gt {} => Ok(Cmpop::Gt {}), - Cmpop::GtE {} => Ok(Cmpop::GtE {}), - Cmpop::Is {} => Ok(Cmpop::Is {}), - Cmpop::IsNot {} => Ok(Cmpop::IsNot {}), - Cmpop::In {} => Ok(Cmpop::In {}), - Cmpop::NotIn {} => Ok(Cmpop::NotIn {}), - } - } - impl Foldable for Comprehension { - type Mapped = Comprehension; - fn fold + ?Sized>( - self, - folder: &mut F, - ) -> Result { - folder.fold_comprehension(self) - } - } - pub fn fold_comprehension + ?Sized>( - #[allow(unused)] folder: &mut F, - node: Comprehension, - ) -> Result, F::Error> { - let Comprehension { - target, - iter, - ifs, - is_async, - } = node; - Ok(Comprehension { - target: Foldable::fold(target, folder)?, - iter: Foldable::fold(iter, folder)?, - ifs: Foldable::fold(ifs, folder)?, - is_async: Foldable::fold(is_async, folder)?, - }) - } - impl Foldable for Excepthandler { - type Mapped = Excepthandler; - fn fold + ?Sized>( - self, - folder: &mut F, - ) -> Result { - folder.fold_excepthandler(self) - } - } - pub fn fold_excepthandler + ?Sized>( - #[allow(unused)] folder: &mut F, - node: Excepthandler, - ) -> Result, F::Error> { - fold_located(folder, node, |folder, node| match node { - ExcepthandlerKind::ExceptHandler { type_, name, body } => { - Ok(ExcepthandlerKind::ExceptHandler { - type_: Foldable::fold(type_, folder)?, - name: Foldable::fold(name, folder)?, - body: Foldable::fold(body, folder)?, - }) - } - }) - } - impl Foldable for Arguments { - type Mapped = Arguments; - fn fold + ?Sized>( - self, - folder: &mut F, - ) -> Result { - folder.fold_arguments(self) - } - } - pub fn fold_arguments + ?Sized>( - #[allow(unused)] folder: &mut F, - node: Arguments, - ) -> Result, F::Error> { - let Arguments { - posonlyargs, - args, - vararg, - kwonlyargs, - kw_defaults, - kwarg, - defaults, - } = node; - Ok(Arguments { - posonlyargs: Foldable::fold(posonlyargs, folder)?, - args: Foldable::fold(args, folder)?, - vararg: Foldable::fold(vararg, folder)?, - kwonlyargs: Foldable::fold(kwonlyargs, folder)?, - kw_defaults: Foldable::fold(kw_defaults, folder)?, - kwarg: Foldable::fold(kwarg, folder)?, - defaults: Foldable::fold(defaults, folder)?, - }) - } - impl Foldable for Arg { - type Mapped = Arg; - fn fold + ?Sized>( - self, - folder: &mut F, - ) -> Result { - folder.fold_arg(self) - } - } - pub fn fold_arg + ?Sized>( - #[allow(unused)] folder: &mut F, - node: Arg, - ) -> Result, F::Error> { - fold_located(folder, node, |folder, node| { - let ArgData { - arg, - annotation, - type_comment, - } = node; - Ok(ArgData { - arg: Foldable::fold(arg, folder)?, - annotation: Foldable::fold(annotation, folder)?, - type_comment: Foldable::fold(type_comment, folder)?, - }) - }) - } - impl Foldable for Keyword { - type Mapped = Keyword; - fn fold + ?Sized>( - self, - folder: &mut F, - ) -> Result { - folder.fold_keyword(self) - } - } - pub fn fold_keyword + ?Sized>( - #[allow(unused)] folder: &mut F, - node: Keyword, - ) -> Result, F::Error> { - fold_located(folder, node, |folder, node| { - let KeywordData { arg, value } = node; - Ok(KeywordData { - arg: Foldable::fold(arg, folder)?, - value: Foldable::fold(value, folder)?, - }) - }) - } - impl Foldable for Alias { - type Mapped = Alias; - fn fold + ?Sized>( - self, - folder: &mut F, - ) -> Result { - folder.fold_alias(self) - } - } - pub fn fold_alias + ?Sized>( - #[allow(unused)] folder: &mut F, - node: Alias, - ) -> Result, F::Error> { - fold_located(folder, node, |folder, node| { - let AliasData { name, asname } = node; - Ok(AliasData { - name: Foldable::fold(name, folder)?, - asname: Foldable::fold(asname, folder)?, - }) - }) - } - impl Foldable for Withitem { - type Mapped = Withitem; - fn fold + ?Sized>( - self, - folder: &mut F, - ) -> Result { - folder.fold_withitem(self) - } - } - pub fn fold_withitem + ?Sized>( - #[allow(unused)] folder: &mut F, - node: Withitem, - ) -> Result, F::Error> { - let Withitem { - context_expr, - optional_vars, - } = node; - Ok(Withitem { - context_expr: Foldable::fold(context_expr, folder)?, - optional_vars: Foldable::fold(optional_vars, folder)?, - }) - } - impl Foldable for MatchCase { - type Mapped = MatchCase; - fn fold + ?Sized>( - self, - folder: &mut F, - ) -> Result { - folder.fold_match_case(self) - } - } - pub fn fold_match_case + ?Sized>( - #[allow(unused)] folder: &mut F, - node: MatchCase, - ) -> Result, F::Error> { - let MatchCase { - pattern, - guard, - body, - } = node; - Ok(MatchCase { - pattern: Foldable::fold(pattern, folder)?, - guard: Foldable::fold(guard, folder)?, - body: Foldable::fold(body, folder)?, - }) - } - impl Foldable for Pattern { - type Mapped = Pattern; - fn fold + ?Sized>( - self, - folder: &mut F, - ) -> Result { - folder.fold_pattern(self) - } - } - pub fn fold_pattern + ?Sized>( - #[allow(unused)] folder: &mut F, - node: Pattern, - ) -> Result, F::Error> { - fold_located(folder, node, |folder, node| match node { - PatternKind::MatchValue { value } => Ok(PatternKind::MatchValue { - value: Foldable::fold(value, folder)?, - }), - PatternKind::MatchSingleton { value } => Ok(PatternKind::MatchSingleton { - value: Foldable::fold(value, folder)?, - }), - PatternKind::MatchSequence { patterns } => Ok(PatternKind::MatchSequence { - patterns: Foldable::fold(patterns, folder)?, - }), - PatternKind::MatchMapping { - keys, - patterns, - rest, - } => Ok(PatternKind::MatchMapping { - keys: Foldable::fold(keys, folder)?, - patterns: Foldable::fold(patterns, folder)?, - rest: Foldable::fold(rest, folder)?, - }), - PatternKind::MatchClass { - cls, - patterns, - kwd_attrs, - kwd_patterns, - } => Ok(PatternKind::MatchClass { - cls: Foldable::fold(cls, folder)?, - patterns: Foldable::fold(patterns, folder)?, - kwd_attrs: Foldable::fold(kwd_attrs, folder)?, - kwd_patterns: Foldable::fold(kwd_patterns, folder)?, - }), - PatternKind::MatchStar { name } => Ok(PatternKind::MatchStar { - name: Foldable::fold(name, folder)?, - }), - PatternKind::MatchAs { pattern, name } => Ok(PatternKind::MatchAs { - pattern: Foldable::fold(pattern, folder)?, - name: Foldable::fold(name, folder)?, - }), - PatternKind::MatchOr { patterns } => Ok(PatternKind::MatchOr { - patterns: Foldable::fold(patterns, folder)?, - }), - }) - } - impl Foldable for TypeIgnore { - type Mapped = TypeIgnore; - fn fold + ?Sized>( - self, - folder: &mut F, - ) -> Result { - folder.fold_type_ignore(self) - } - } - pub fn fold_type_ignore + ?Sized>( - #[allow(unused)] folder: &mut F, - node: TypeIgnore, - ) -> Result { - match node { - TypeIgnore::TypeIgnore { lineno, tag } => Ok(TypeIgnore::TypeIgnore { - lineno: Foldable::fold(lineno, folder)?, - tag: Foldable::fold(tag, folder)?, - }), - } - } -} diff --git a/compiler/ast/src/constant.rs b/compiler/ast/src/constant.rs deleted file mode 100644 index 3514e52178..0000000000 --- a/compiler/ast/src/constant.rs +++ /dev/null @@ -1,239 +0,0 @@ -use num_bigint::BigInt; -pub use rustpython_compiler_core::ConversionFlag; - -#[derive(Clone, Debug, PartialEq)] -pub enum Constant { - None, - Bool(bool), - Str(String), - Bytes(Vec), - Int(BigInt), - Tuple(Vec), - Float(f64), - Complex { real: f64, imag: f64 }, - Ellipsis, -} - -impl From for Constant { - fn from(s: String) -> Constant { - Self::Str(s) - } -} -impl From> for Constant { - fn from(b: Vec) -> Constant { - Self::Bytes(b) - } -} -impl From for Constant { - fn from(b: bool) -> Constant { - Self::Bool(b) - } -} -impl From for Constant { - fn from(i: BigInt) -> Constant { - Self::Int(i) - } -} - -#[cfg(feature = "rustpython-literal")] -impl std::fmt::Display for Constant { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - match self { - Constant::None => f.pad("None"), - Constant::Bool(b) => f.pad(if *b { "True" } else { "False" }), - Constant::Str(s) => rustpython_literal::escape::UnicodeEscape::new_repr(s.as_str()) - .str_repr() - .write(f), - Constant::Bytes(b) => { - let escape = rustpython_literal::escape::AsciiEscape::new_repr(b); - let repr = escape.bytes_repr().to_string().unwrap(); - f.pad(&repr) - } - Constant::Int(i) => i.fmt(f), - Constant::Tuple(tup) => { - if let [elt] = &**tup { - write!(f, "({elt},)") - } else { - f.write_str("(")?; - for (i, elt) in tup.iter().enumerate() { - if i != 0 { - f.write_str(", ")?; - } - elt.fmt(f)?; - } - f.write_str(")") - } - } - Constant::Float(fp) => f.pad(&rustpython_literal::float::to_string(*fp)), - Constant::Complex { real, imag } => { - if *real == 0.0 { - write!(f, "{imag}j") - } else { - write!(f, "({real}{imag:+}j)") - } - } - Constant::Ellipsis => f.pad("..."), - } - } -} - -#[cfg(feature = "constant-optimization")] -#[non_exhaustive] -#[derive(Default)] -pub struct ConstantOptimizer {} - -#[cfg(feature = "constant-optimization")] -impl ConstantOptimizer { - #[inline] - pub fn new() -> Self { - Self {} - } -} - -#[cfg(feature = "constant-optimization")] -impl crate::fold::Fold for ConstantOptimizer { - type TargetU = U; - type Error = std::convert::Infallible; - #[inline] - fn map_user(&mut self, user: U) -> Result { - Ok(user) - } - fn fold_expr(&mut self, node: crate::Expr) -> Result, Self::Error> { - match node.node { - crate::ExprKind::Tuple { elts, ctx } => { - let elts = elts - .into_iter() - .map(|x| self.fold_expr(x)) - .collect::, _>>()?; - let expr = if elts - .iter() - .all(|e| matches!(e.node, crate::ExprKind::Constant { .. })) - { - let tuple = elts - .into_iter() - .map(|e| match e.node { - crate::ExprKind::Constant { value, .. } => value, - _ => unreachable!(), - }) - .collect(); - crate::ExprKind::Constant { - value: Constant::Tuple(tuple), - kind: None, - } - } else { - crate::ExprKind::Tuple { elts, ctx } - }; - Ok(crate::Expr { - node: expr, - custom: node.custom, - location: node.location, - end_location: node.end_location, - }) - } - _ => crate::fold::fold_expr(self, node), - } - } -} - -#[cfg(test)] -mod tests { - use super::*; - - #[cfg(feature = "constant-optimization")] - #[test] - fn test_constant_opt() { - use crate::{fold::Fold, *}; - - let start = Default::default(); - let end = None; - let custom = (); - let ast = Located { - location: start, - end_location: end, - custom, - node: ExprKind::Tuple { - ctx: ExprContext::Load, - elts: vec![ - Located { - location: start, - end_location: end, - custom, - node: ExprKind::Constant { - value: BigInt::from(1).into(), - kind: None, - }, - }, - Located { - location: start, - end_location: end, - custom, - node: ExprKind::Constant { - value: BigInt::from(2).into(), - kind: None, - }, - }, - Located { - location: start, - end_location: end, - custom, - node: ExprKind::Tuple { - ctx: ExprContext::Load, - elts: vec![ - Located { - location: start, - end_location: end, - custom, - node: ExprKind::Constant { - value: BigInt::from(3).into(), - kind: None, - }, - }, - Located { - location: start, - end_location: end, - custom, - node: ExprKind::Constant { - value: BigInt::from(4).into(), - kind: None, - }, - }, - Located { - location: start, - end_location: end, - custom, - node: ExprKind::Constant { - value: BigInt::from(5).into(), - kind: None, - }, - }, - ], - }, - }, - ], - }, - }; - let new_ast = ConstantOptimizer::new() - .fold_expr(ast) - .unwrap_or_else(|e| match e {}); - assert_eq!( - new_ast, - Located { - location: start, - end_location: end, - custom, - node: ExprKind::Constant { - value: Constant::Tuple(vec![ - BigInt::from(1).into(), - BigInt::from(2).into(), - Constant::Tuple(vec![ - BigInt::from(3).into(), - BigInt::from(4).into(), - BigInt::from(5).into(), - ]) - ]), - kind: None - }, - } - ); - } -} diff --git a/compiler/ast/src/fold_helpers.rs b/compiler/ast/src/fold_helpers.rs deleted file mode 100644 index 969ea4e587..0000000000 --- a/compiler/ast/src/fold_helpers.rs +++ /dev/null @@ -1,65 +0,0 @@ -use crate::{constant, fold::Fold}; - -pub(crate) trait Foldable { - type Mapped; - fn fold + ?Sized>( - self, - folder: &mut F, - ) -> Result; -} - -impl Foldable for Vec -where - X: Foldable, -{ - type Mapped = Vec; - fn fold + ?Sized>( - self, - folder: &mut F, - ) -> Result { - self.into_iter().map(|x| x.fold(folder)).collect() - } -} - -impl Foldable for Option -where - X: Foldable, -{ - type Mapped = Option; - fn fold + ?Sized>( - self, - folder: &mut F, - ) -> Result { - self.map(|x| x.fold(folder)).transpose() - } -} - -impl Foldable for Box -where - X: Foldable, -{ - type Mapped = Box; - fn fold + ?Sized>( - self, - folder: &mut F, - ) -> Result { - (*self).fold(folder).map(Box::new) - } -} - -macro_rules! simple_fold { - ($($t:ty),+$(,)?) => { - $(impl $crate::fold_helpers::Foldable for $t { - type Mapped = Self; - #[inline] - fn fold + ?Sized>( - self, - _folder: &mut F, - ) -> Result { - Ok(self) - } - })+ - }; -} - -simple_fold!(usize, String, bool, constant::Constant); diff --git a/compiler/ast/src/impls.rs b/compiler/ast/src/impls.rs deleted file mode 100644 index a93028874c..0000000000 --- a/compiler/ast/src/impls.rs +++ /dev/null @@ -1,60 +0,0 @@ -use crate::{Constant, ExprKind}; - -impl ExprKind { - /// Returns a short name for the node suitable for use in error messages. - pub fn name(&self) -> &'static str { - match self { - ExprKind::BoolOp { .. } | ExprKind::BinOp { .. } | ExprKind::UnaryOp { .. } => { - "operator" - } - ExprKind::Subscript { .. } => "subscript", - ExprKind::Await { .. } => "await expression", - ExprKind::Yield { .. } | ExprKind::YieldFrom { .. } => "yield expression", - ExprKind::Compare { .. } => "comparison", - ExprKind::Attribute { .. } => "attribute", - ExprKind::Call { .. } => "function call", - ExprKind::Constant { value, .. } => match value { - Constant::Str(_) - | Constant::Int(_) - | Constant::Float(_) - | Constant::Complex { .. } - | Constant::Bytes(_) => "literal", - Constant::Tuple(_) => "tuple", - Constant::Bool(b) => { - if *b { - "True" - } else { - "False" - } - } - Constant::None => "None", - Constant::Ellipsis => "ellipsis", - }, - ExprKind::List { .. } => "list", - ExprKind::Tuple { .. } => "tuple", - ExprKind::Dict { .. } => "dict display", - ExprKind::Set { .. } => "set display", - ExprKind::ListComp { .. } => "list comprehension", - ExprKind::DictComp { .. } => "dict comprehension", - ExprKind::SetComp { .. } => "set comprehension", - ExprKind::GeneratorExp { .. } => "generator expression", - ExprKind::Starred { .. } => "starred", - ExprKind::Slice { .. } => "slice", - ExprKind::JoinedStr { values } => { - if values - .iter() - .any(|e| matches!(e.node, ExprKind::JoinedStr { .. })) - { - "f-string expression" - } else { - "literal" - } - } - ExprKind::FormattedValue { .. } => "f-string expression", - ExprKind::Name { .. } => "name", - ExprKind::Lambda { .. } => "lambda", - ExprKind::IfExp { .. } => "conditional expression", - ExprKind::NamedExpr { .. } => "named expression", - } - } -} diff --git a/compiler/ast/src/lib.rs b/compiler/ast/src/lib.rs deleted file mode 100644 index d668bede69..0000000000 --- a/compiler/ast/src/lib.rs +++ /dev/null @@ -1,12 +0,0 @@ -mod ast_gen; -mod constant; -#[cfg(feature = "fold")] -mod fold_helpers; -mod impls; -#[cfg(feature = "unparse")] -mod unparse; - -pub use ast_gen::*; -pub use rustpython_compiler_core::Location; - -pub type Suite = Vec>; diff --git a/compiler/ast/src/unparse.rs b/compiler/ast/src/unparse.rs deleted file mode 100644 index 40570fc7ff..0000000000 --- a/compiler/ast/src/unparse.rs +++ /dev/null @@ -1,536 +0,0 @@ -use crate::{ - Arg, Arguments, Boolop, Cmpop, Comprehension, Constant, ConversionFlag, Expr, ExprKind, - Operator, -}; -use std::fmt; - -mod precedence { - macro_rules! precedence { - ($($op:ident,)*) => { - precedence!(@0, $($op,)*); - }; - (@$i:expr, $op1:ident, $($op:ident,)*) => { - pub const $op1: u8 = $i; - precedence!(@$i + 1, $($op,)*); - }; - (@$i:expr,) => {}; - } - precedence!( - TUPLE, TEST, OR, AND, NOT, CMP, // "EXPR" = - BOR, BXOR, BAND, SHIFT, ARITH, TERM, FACTOR, POWER, AWAIT, ATOM, - ); - pub const EXPR: u8 = BOR; -} - -#[repr(transparent)] -struct Unparser<'a> { - f: fmt::Formatter<'a>, -} -impl<'a> Unparser<'a> { - fn new<'b>(f: &'b mut fmt::Formatter<'a>) -> &'b mut Unparser<'a> { - unsafe { &mut *(f as *mut fmt::Formatter<'a> as *mut Unparser<'a>) } - } - - fn p(&mut self, s: &str) -> fmt::Result { - self.f.write_str(s) - } - fn p_if(&mut self, cond: bool, s: &str) -> fmt::Result { - if cond { - self.f.write_str(s)?; - } - Ok(()) - } - fn p_delim(&mut self, first: &mut bool, s: &str) -> fmt::Result { - self.p_if(!std::mem::take(first), s) - } - fn write_fmt(&mut self, f: fmt::Arguments<'_>) -> fmt::Result { - self.f.write_fmt(f) - } - - fn unparse_expr(&mut self, ast: &Expr, level: u8) -> fmt::Result { - macro_rules! op_prec { - ($op_ty:ident, $x:expr, $enu:path, $($var:ident($op:literal, $prec:ident)),*$(,)?) => { - match $x { - $(<$enu>::$var => (op_prec!(@space $op_ty, $op), precedence::$prec),)* - } - }; - (@space bin, $op:literal) => { - concat!(" ", $op, " ") - }; - (@space un, $op:literal) => { - $op - }; - } - macro_rules! group_if { - ($lvl:expr, $body:block) => {{ - let group = level > $lvl; - self.p_if(group, "(")?; - let ret = $body; - self.p_if(group, ")")?; - ret - }}; - } - match &ast.node { - ExprKind::BoolOp { op, values } => { - let (op, prec) = op_prec!(bin, op, Boolop, And("and", AND), Or("or", OR)); - group_if!(prec, { - let mut first = true; - for val in values { - self.p_delim(&mut first, op)?; - self.unparse_expr(val, prec + 1)?; - } - }) - } - ExprKind::NamedExpr { target, value } => { - group_if!(precedence::TUPLE, { - self.unparse_expr(target, precedence::ATOM)?; - self.p(" := ")?; - self.unparse_expr(value, precedence::ATOM)?; - }) - } - ExprKind::BinOp { left, op, right } => { - let right_associative = matches!(op, Operator::Pow); - let (op, prec) = op_prec!( - bin, - op, - Operator, - Add("+", ARITH), - Sub("-", ARITH), - Mult("*", TERM), - MatMult("@", TERM), - Div("/", TERM), - Mod("%", TERM), - Pow("**", POWER), - LShift("<<", SHIFT), - RShift(">>", SHIFT), - BitOr("|", BOR), - BitXor("^", BXOR), - BitAnd("&", BAND), - FloorDiv("//", TERM), - ); - group_if!(prec, { - self.unparse_expr(left, prec + right_associative as u8)?; - self.p(op)?; - self.unparse_expr(right, prec + !right_associative as u8)?; - }) - } - ExprKind::UnaryOp { op, operand } => { - let (op, prec) = op_prec!( - un, - op, - crate::Unaryop, - Invert("~", FACTOR), - Not("not ", NOT), - UAdd("+", FACTOR), - USub("-", FACTOR) - ); - group_if!(prec, { - self.p(op)?; - self.unparse_expr(operand, prec)?; - }) - } - ExprKind::Lambda { args, body } => { - group_if!(precedence::TEST, { - let pos = args.args.len() + args.posonlyargs.len(); - self.p(if pos > 0 { "lambda " } else { "lambda" })?; - self.unparse_args(args)?; - write!(self, ": {}", **body)?; - }) - } - ExprKind::IfExp { test, body, orelse } => { - group_if!(precedence::TEST, { - self.unparse_expr(body, precedence::TEST + 1)?; - self.p(" if ")?; - self.unparse_expr(test, precedence::TEST + 1)?; - self.p(" else ")?; - self.unparse_expr(orelse, precedence::TEST)?; - }) - } - ExprKind::Dict { keys, values } => { - self.p("{")?; - let mut first = true; - let (packed, unpacked) = values.split_at(keys.len()); - for (k, v) in keys.iter().zip(packed) { - self.p_delim(&mut first, ", ")?; - if let Some(k) = k { - write!(self, "{}: {}", *k, *v)?; - } else { - write!(self, "**{}", *v)?; - } - } - for d in unpacked { - self.p_delim(&mut first, ", ")?; - write!(self, "**{}", *d)?; - } - self.p("}")?; - } - ExprKind::Set { elts } => { - self.p("{")?; - let mut first = true; - for v in elts { - self.p_delim(&mut first, ", ")?; - self.unparse_expr(v, precedence::TEST)?; - } - self.p("}")?; - } - ExprKind::ListComp { elt, generators } => { - self.p("[")?; - self.unparse_expr(elt, precedence::TEST)?; - self.unparse_comp(generators)?; - self.p("]")?; - } - ExprKind::SetComp { elt, generators } => { - self.p("{")?; - self.unparse_expr(elt, precedence::TEST)?; - self.unparse_comp(generators)?; - self.p("}")?; - } - ExprKind::DictComp { - key, - value, - generators, - } => { - self.p("{")?; - self.unparse_expr(key, precedence::TEST)?; - self.p(": ")?; - self.unparse_expr(value, precedence::TEST)?; - self.unparse_comp(generators)?; - self.p("}")?; - } - ExprKind::GeneratorExp { elt, generators } => { - self.p("(")?; - self.unparse_expr(elt, precedence::TEST)?; - self.unparse_comp(generators)?; - self.p(")")?; - } - ExprKind::Await { value } => { - group_if!(precedence::AWAIT, { - self.p("await ")?; - self.unparse_expr(value, precedence::ATOM)?; - }) - } - ExprKind::Yield { value } => { - if let Some(value) = value { - write!(self, "(yield {})", **value)?; - } else { - self.p("(yield)")?; - } - } - ExprKind::YieldFrom { value } => { - write!(self, "(yield from {})", **value)?; - } - ExprKind::Compare { - left, - ops, - comparators, - } => { - group_if!(precedence::CMP, { - let new_lvl = precedence::CMP + 1; - self.unparse_expr(left, new_lvl)?; - for (op, cmp) in ops.iter().zip(comparators) { - let op = match op { - Cmpop::Eq => " == ", - Cmpop::NotEq => " != ", - Cmpop::Lt => " < ", - Cmpop::LtE => " <= ", - Cmpop::Gt => " > ", - Cmpop::GtE => " >= ", - Cmpop::Is => " is ", - Cmpop::IsNot => " is not ", - Cmpop::In => " in ", - Cmpop::NotIn => " not in ", - }; - self.p(op)?; - self.unparse_expr(cmp, new_lvl)?; - } - }) - } - ExprKind::Call { - func, - args, - keywords, - } => { - self.unparse_expr(func, precedence::ATOM)?; - self.p("(")?; - if let ( - [Expr { - node: ExprKind::GeneratorExp { elt, generators }, - .. - }], - [], - ) = (&**args, &**keywords) - { - // make sure a single genexpr doesn't get double parens - self.unparse_expr(elt, precedence::TEST)?; - self.unparse_comp(generators)?; - } else { - let mut first = true; - for arg in args { - self.p_delim(&mut first, ", ")?; - self.unparse_expr(arg, precedence::TEST)?; - } - for kw in keywords { - self.p_delim(&mut first, ", ")?; - if let Some(arg) = &kw.node.arg { - self.p(arg)?; - self.p("=")?; - } else { - self.p("**")?; - } - self.unparse_expr(&kw.node.value, precedence::TEST)?; - } - } - self.p(")")?; - } - ExprKind::FormattedValue { - value, - conversion, - format_spec, - } => self.unparse_formatted(value, *conversion, format_spec.as_deref())?, - ExprKind::JoinedStr { values } => self.unparse_joined_str(values, false)?, - ExprKind::Constant { value, kind } => { - if let Some(kind) = kind { - self.p(kind)?; - } - assert_eq!(f64::MAX_10_EXP, 308); - let inf_str = "1e309"; - match value { - Constant::Float(f) if f.is_infinite() => self.p(inf_str)?, - Constant::Complex { real, imag } - if real.is_infinite() || imag.is_infinite() => - { - self.p(&value.to_string().replace("inf", inf_str))? - } - _ => fmt::Display::fmt(value, &mut self.f)?, - } - } - ExprKind::Attribute { value, attr, .. } => { - self.unparse_expr(value, precedence::ATOM)?; - let period = if let ExprKind::Constant { - value: Constant::Int(_), - .. - } = &value.node - { - " ." - } else { - "." - }; - self.p(period)?; - self.p(attr)?; - } - ExprKind::Subscript { value, slice, .. } => { - self.unparse_expr(value, precedence::ATOM)?; - let mut lvl = precedence::TUPLE; - if let ExprKind::Tuple { elts, .. } = &slice.node { - if elts - .iter() - .any(|expr| matches!(expr.node, ExprKind::Starred { .. })) - { - lvl += 1 - } - } - self.p("[")?; - self.unparse_expr(slice, lvl)?; - self.p("]")?; - } - ExprKind::Starred { value, .. } => { - self.p("*")?; - self.unparse_expr(value, precedence::EXPR)?; - } - ExprKind::Name { id, .. } => self.p(id)?, - ExprKind::List { elts, .. } => { - self.p("[")?; - let mut first = true; - for elt in elts { - self.p_delim(&mut first, ", ")?; - self.unparse_expr(elt, precedence::TEST)?; - } - self.p("]")?; - } - ExprKind::Tuple { elts, .. } => { - if elts.is_empty() { - self.p("()")?; - } else { - group_if!(precedence::TUPLE, { - let mut first = true; - for elt in elts { - self.p_delim(&mut first, ", ")?; - self.unparse_expr(elt, precedence::TEST)?; - } - self.p_if(elts.len() == 1, ",")?; - }) - } - } - ExprKind::Slice { lower, upper, step } => { - if let Some(lower) = lower { - self.unparse_expr(lower, precedence::TEST)?; - } - self.p(":")?; - if let Some(upper) = upper { - self.unparse_expr(upper, precedence::TEST)?; - } - if let Some(step) = step { - self.p(":")?; - self.unparse_expr(step, precedence::TEST)?; - } - } - } - Ok(()) - } - - fn unparse_args(&mut self, args: &Arguments) -> fmt::Result { - let mut first = true; - let defaults_start = args.posonlyargs.len() + args.args.len() - args.defaults.len(); - for (i, arg) in args.posonlyargs.iter().chain(&args.args).enumerate() { - self.p_delim(&mut first, ", ")?; - self.unparse_arg(arg)?; - if let Some(i) = i.checked_sub(defaults_start) { - write!(self, "={}", &args.defaults[i])?; - } - self.p_if(i + 1 == args.posonlyargs.len(), ", /")?; - } - if args.vararg.is_some() || !args.kwonlyargs.is_empty() { - self.p_delim(&mut first, ", ")?; - self.p("*")?; - } - if let Some(vararg) = &args.vararg { - self.unparse_arg(vararg)?; - } - let defaults_start = args.kwonlyargs.len() - args.kw_defaults.len(); - for (i, kwarg) in args.kwonlyargs.iter().enumerate() { - self.p_delim(&mut first, ", ")?; - self.unparse_arg(kwarg)?; - if let Some(default) = i - .checked_sub(defaults_start) - .and_then(|i| args.kw_defaults.get(i)) - { - write!(self, "={default}")?; - } - } - if let Some(kwarg) = &args.kwarg { - self.p_delim(&mut first, ", ")?; - self.p("**")?; - self.unparse_arg(kwarg)?; - } - Ok(()) - } - fn unparse_arg(&mut self, arg: &Arg) -> fmt::Result { - self.p(&arg.node.arg)?; - if let Some(ann) = &arg.node.annotation { - write!(self, ": {}", **ann)?; - } - Ok(()) - } - - fn unparse_comp(&mut self, generators: &[Comprehension]) -> fmt::Result { - for comp in generators { - self.p(if comp.is_async > 0 { - " async for " - } else { - " for " - })?; - self.unparse_expr(&comp.target, precedence::TUPLE)?; - self.p(" in ")?; - self.unparse_expr(&comp.iter, precedence::TEST + 1)?; - for cond in &comp.ifs { - self.p(" if ")?; - self.unparse_expr(cond, precedence::TEST + 1)?; - } - } - Ok(()) - } - - fn unparse_fstring_body(&mut self, values: &[Expr], is_spec: bool) -> fmt::Result { - for value in values { - self.unparse_fstring_elem(value, is_spec)?; - } - Ok(()) - } - - fn unparse_formatted( - &mut self, - val: &Expr, - conversion: usize, - spec: Option<&Expr>, - ) -> fmt::Result { - let buffered = to_string_fmt(|f| Unparser::new(f).unparse_expr(val, precedence::TEST + 1)); - let brace = if buffered.starts_with('{') { - // put a space to avoid escaping the bracket - "{ " - } else { - "{" - }; - self.p(brace)?; - self.p(&buffered)?; - drop(buffered); - - if conversion != ConversionFlag::None as usize { - self.p("!")?; - let buf = &[conversion as u8]; - let c = std::str::from_utf8(buf).unwrap(); - self.p(c)?; - } - - if let Some(spec) = spec { - self.p(":")?; - self.unparse_fstring_elem(spec, true)?; - } - - self.p("}")?; - - Ok(()) - } - - fn unparse_fstring_elem(&mut self, expr: &Expr, is_spec: bool) -> fmt::Result { - match &expr.node { - ExprKind::Constant { value, .. } => { - if let Constant::Str(s) = value { - self.unparse_fstring_str(s) - } else { - unreachable!() - } - } - ExprKind::JoinedStr { values } => self.unparse_joined_str(values, is_spec), - ExprKind::FormattedValue { - value, - conversion, - format_spec, - } => self.unparse_formatted(value, *conversion, format_spec.as_deref()), - _ => unreachable!(), - } - } - - fn unparse_fstring_str(&mut self, s: &str) -> fmt::Result { - let s = s.replace('{', "{{").replace('}', "}}"); - self.p(&s) - } - - fn unparse_joined_str(&mut self, values: &[Expr], is_spec: bool) -> fmt::Result { - if is_spec { - self.unparse_fstring_body(values, is_spec) - } else { - self.p("f")?; - let body = to_string_fmt(|f| Unparser::new(f).unparse_fstring_body(values, is_spec)); - rustpython_literal::escape::UnicodeEscape::new_repr(&body) - .str_repr() - .write(&mut self.f) - } - } -} - -impl fmt::Display for Expr { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - Unparser::new(f).unparse_expr(self, precedence::TEST) - } -} - -fn to_string_fmt(f: impl FnOnce(&mut fmt::Formatter) -> fmt::Result) -> String { - use std::cell::Cell; - struct Fmt(Cell>); - impl fmt::Result> fmt::Display for Fmt { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - self.0.take().unwrap()(f) - } - } - Fmt(Cell::new(Some(f))).to_string() -} diff --git a/compiler/codegen/Cargo.toml b/compiler/codegen/Cargo.toml index 3bb891ed3f..861c9fba1a 100644 --- a/compiler/codegen/Cargo.toml +++ b/compiler/codegen/Cargo.toml @@ -8,8 +8,8 @@ license = "MIT" edition = "2021" [dependencies] -rustpython-ast = { path = "../ast", features = ["unparse"] } -rustpython-compiler-core = { path = "../core", version = "0.2.0" } +rustpython-ast = { workspace = true, features=["unparse"] } +rustpython-compiler-core = { workspace = true } ahash = { workspace = true } bitflags = { workspace = true } @@ -20,6 +20,6 @@ num-complex = { workspace = true } num-traits = { workspace = true } [dev-dependencies] -rustpython-parser = { path = "../parser" } +rustpython-parser = { workspace = true } insta = { workspace = true } diff --git a/compiler/core/Cargo.toml b/compiler/core/Cargo.toml deleted file mode 100644 index 79622a9569..0000000000 --- a/compiler/core/Cargo.toml +++ /dev/null @@ -1,18 +0,0 @@ -[package] -name = "rustpython-compiler-core" -description = "RustPython specific bytecode." -version = "0.2.0" -authors = ["RustPython Team"] -edition = "2021" -repository = "https://github.com/RustPython/RustPython" -license = "MIT" - -[dependencies] -bitflags = { workspace = true } -bstr = { workspace = true } -itertools = { workspace = true } -num-bigint = { workspace = true } -num-complex = { workspace = true } -serde = { version = "1.0.133", optional = true, default-features = false, features = ["derive"] } - -lz4_flex = "0.9.2" diff --git a/compiler/core/src/bytecode.rs b/compiler/core/src/bytecode.rs deleted file mode 100644 index a522d3fbf1..0000000000 --- a/compiler/core/src/bytecode.rs +++ /dev/null @@ -1,1610 +0,0 @@ -//! Implement python as a virtual machine with bytecode. This module -//! implements bytecode structure. - -use crate::{marshal, Location}; -use bitflags::bitflags; -use itertools::Itertools; -use num_bigint::BigInt; -use num_complex::Complex64; -use std::marker::PhantomData; -use std::{collections::BTreeSet, fmt, hash, mem}; - -pub trait Constant: Sized { - type Name: AsRef; - - /// Transforms the given Constant to a BorrowedConstant - fn borrow_constant(&self) -> BorrowedConstant; -} - -impl Constant for ConstantData { - type Name = String; - fn borrow_constant(&self) -> BorrowedConstant { - use BorrowedConstant::*; - match self { - ConstantData::Integer { value } => Integer { value }, - ConstantData::Float { value } => Float { value: *value }, - ConstantData::Complex { value } => Complex { value: *value }, - ConstantData::Boolean { value } => Boolean { value: *value }, - ConstantData::Str { value } => Str { value }, - ConstantData::Bytes { value } => Bytes { value }, - ConstantData::Code { code } => Code { code }, - ConstantData::Tuple { elements } => Tuple { elements }, - ConstantData::None => None, - ConstantData::Ellipsis => Ellipsis, - } - } -} - -/// A Constant Bag -pub trait ConstantBag: Sized + Copy { - type Constant: Constant; - fn make_constant(&self, constant: BorrowedConstant) -> Self::Constant; - fn make_int(&self, value: BigInt) -> Self::Constant; - fn make_tuple(&self, elements: impl Iterator) -> Self::Constant; - fn make_code(&self, code: CodeObject) -> Self::Constant; - fn make_name(&self, name: &str) -> ::Name; -} - -pub trait AsBag { - type Bag: ConstantBag; - #[allow(clippy::wrong_self_convention)] - fn as_bag(self) -> Self::Bag; -} - -impl AsBag for Bag { - type Bag = Self; - fn as_bag(self) -> Self { - self - } -} - -#[derive(Clone, Copy)] -pub struct BasicBag; - -impl ConstantBag for BasicBag { - type Constant = ConstantData; - fn make_constant(&self, constant: BorrowedConstant) -> Self::Constant { - constant.to_owned() - } - fn make_int(&self, value: BigInt) -> Self::Constant { - ConstantData::Integer { value } - } - fn make_tuple(&self, elements: impl Iterator) -> Self::Constant { - ConstantData::Tuple { - elements: elements.collect(), - } - } - fn make_code(&self, code: CodeObject) -> Self::Constant { - ConstantData::Code { - code: Box::new(code), - } - } - fn make_name(&self, name: &str) -> ::Name { - name.to_owned() - } -} - -/// Primary container of a single code object. Each python function has -/// a code object. Also a module has a code object. -#[derive(Clone)] -pub struct CodeObject { - pub instructions: Box<[CodeUnit]>, - pub locations: Box<[Location]>, - pub flags: CodeFlags, - pub posonlyarg_count: u32, - // Number of positional-only arguments - pub arg_count: u32, - pub kwonlyarg_count: u32, - pub source_path: C::Name, - pub first_line_number: u32, - pub max_stackdepth: u32, - pub obj_name: C::Name, - // Name of the object that created this code object - pub cell2arg: Option>, - pub constants: Box<[C]>, - pub names: Box<[C::Name]>, - pub varnames: Box<[C::Name]>, - pub cellvars: Box<[C::Name]>, - pub freevars: Box<[C::Name]>, -} - -bitflags! { - pub struct CodeFlags: u16 { - const NEW_LOCALS = 0x01; - const IS_GENERATOR = 0x02; - const IS_COROUTINE = 0x04; - const HAS_VARARGS = 0x08; - const HAS_VARKEYWORDS = 0x10; - const IS_OPTIMIZED = 0x20; - } -} - -impl CodeFlags { - pub const NAME_MAPPING: &'static [(&'static str, CodeFlags)] = &[ - ("GENERATOR", CodeFlags::IS_GENERATOR), - ("COROUTINE", CodeFlags::IS_COROUTINE), - ( - "ASYNC_GENERATOR", - Self::from_bits_truncate(Self::IS_GENERATOR.bits | Self::IS_COROUTINE.bits), - ), - ("VARARGS", CodeFlags::HAS_VARARGS), - ("VARKEYWORDS", CodeFlags::HAS_VARKEYWORDS), - ]; -} - -/// an opcode argument that may be extended by a prior ExtendedArg -#[derive(Copy, Clone, PartialEq, Eq)] -#[repr(transparent)] -pub struct OpArgByte(pub u8); -impl OpArgByte { - pub const fn null() -> Self { - OpArgByte(0) - } -} -impl fmt::Debug for OpArgByte { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - self.0.fmt(f) - } -} - -/// a full 32-bit op_arg, including any possible ExtendedArg extension -#[derive(Copy, Clone, Debug)] -#[repr(transparent)] -pub struct OpArg(pub u32); -impl OpArg { - pub const fn null() -> Self { - OpArg(0) - } - - /// Returns how many CodeUnits a instruction with this op_arg will be encoded as - #[inline] - pub fn instr_size(self) -> usize { - (self.0 > 0xff) as usize + (self.0 > 0xff_ff) as usize + (self.0 > 0xff_ff_ff) as usize + 1 - } - - /// returns the arg split into any necessary ExtendedArg components (in big-endian order) and - /// the arg for the real opcode itself - #[inline(always)] - pub fn split(self) -> (impl ExactSizeIterator, OpArgByte) { - let mut it = self - .0 - .to_le_bytes() - .map(OpArgByte) - .into_iter() - .take(self.instr_size()); - let lo = it.next().unwrap(); - (it.rev(), lo) - } -} - -#[derive(Default, Copy, Clone)] -#[repr(transparent)] -pub struct OpArgState { - state: u32, -} - -impl OpArgState { - #[inline(always)] - pub fn get(&mut self, ins: CodeUnit) -> (Instruction, OpArg) { - let arg = self.extend(ins.arg); - if ins.op != Instruction::ExtendedArg { - self.reset(); - } - (ins.op, arg) - } - #[inline(always)] - pub fn extend(&mut self, arg: OpArgByte) -> OpArg { - self.state = self.state << 8 | u32::from(arg.0); - OpArg(self.state) - } - #[inline(always)] - pub fn reset(&mut self) { - self.state = 0 - } -} - -pub trait OpArgType: Copy { - fn from_op_arg(x: u32) -> Option; - fn to_op_arg(self) -> u32; -} - -impl OpArgType for u32 { - #[inline(always)] - fn from_op_arg(x: u32) -> Option { - Some(x) - } - #[inline(always)] - fn to_op_arg(self) -> u32 { - self - } -} - -impl OpArgType for bool { - #[inline(always)] - fn from_op_arg(x: u32) -> Option { - Some(x != 0) - } - #[inline(always)] - fn to_op_arg(self) -> u32 { - self as u32 - } -} - -macro_rules! op_arg_enum { - ($(#[$attr:meta])* $vis:vis enum $name:ident { $($(#[$var_attr:meta])* $var:ident = $value:literal,)* }) => { - $(#[$attr])* - $vis enum $name { - $($(#[$var_attr])* $var = $value,)* - } - - impl OpArgType for $name { - fn to_op_arg(self) -> u32 { - self as u32 - } - fn from_op_arg(x: u32) -> Option { - Some(match u8::try_from(x).ok()? { - $($value => Self::$var,)* - _ => return None, - }) - } - } - }; -} - -#[derive(Copy, Clone)] -pub struct Arg(PhantomData); - -impl Arg { - #[inline] - pub fn marker() -> Self { - Arg(PhantomData) - } - #[inline] - pub fn new(arg: T) -> (Self, OpArg) { - (Self(PhantomData), OpArg(arg.to_op_arg())) - } - #[inline] - pub fn new_single(arg: T) -> (Self, OpArgByte) - where - T: Into, - { - (Self(PhantomData), OpArgByte(arg.into())) - } - #[inline(always)] - pub fn get(self, arg: OpArg) -> T { - self.try_get(arg).unwrap() - } - #[inline(always)] - pub fn try_get(self, arg: OpArg) -> Option { - T::from_op_arg(arg.0) - } - #[inline(always)] - /// # Safety - /// T::from_op_arg(self) must succeed - pub unsafe fn get_unchecked(self, arg: OpArg) -> T { - match T::from_op_arg(arg.0) { - Some(t) => t, - None => std::hint::unreachable_unchecked(), - } - } -} - -impl PartialEq for Arg { - fn eq(&self, _: &Self) -> bool { - true - } -} -impl Eq for Arg {} - -impl fmt::Debug for Arg { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - write!(f, "Arg<{}>", std::any::type_name::()) - } -} - -#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Ord, PartialOrd)] -#[repr(transparent)] -// XXX: if you add a new instruction that stores a Label, make sure to add it in -// Instruction::label_arg -pub struct Label(pub u32); - -impl OpArgType for Label { - #[inline(always)] - fn from_op_arg(x: u32) -> Option { - Some(Label(x)) - } - #[inline(always)] - fn to_op_arg(self) -> u32 { - self.0 - } -} - -impl fmt::Display for Label { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - self.0.fmt(f) - } -} - -op_arg_enum!( - /// Transforms a value prior to formatting it. - #[derive(Copy, Clone, Debug, PartialEq, Eq)] - #[repr(u8)] - pub enum ConversionFlag { - /// No conversion - None = 0, // CPython uses -1 but not pleasure for us - /// Converts by calling `str()`. - Str = b's', - /// Converts by calling `ascii()`. - Ascii = b'a', - /// Converts by calling `repr()`. - Repr = b'r', - } -); - -impl TryFrom for ConversionFlag { - type Error = usize; - fn try_from(b: usize) -> Result { - u32::try_from(b).ok().and_then(Self::from_op_arg).ok_or(b) - } -} - -op_arg_enum!( - /// The kind of Raise that occurred. - #[derive(Copy, Clone, Debug, PartialEq, Eq)] - #[repr(u8)] - pub enum RaiseKind { - Reraise = 0, - Raise = 1, - RaiseCause = 2, - } -); - -pub type NameIdx = u32; - -/// A Single bytecode instruction. -#[derive(Debug, Copy, Clone, PartialEq, Eq)] -#[repr(u8)] -pub enum Instruction { - /// Importing by name - ImportName { - idx: Arg, - }, - /// Importing without name - ImportNameless, - /// Import * - ImportStar, - /// from ... import ... - ImportFrom { - idx: Arg, - }, - LoadFast(Arg), - LoadNameAny(Arg), - LoadGlobal(Arg), - LoadDeref(Arg), - LoadClassDeref(Arg), - StoreFast(Arg), - StoreLocal(Arg), - StoreGlobal(Arg), - StoreDeref(Arg), - DeleteFast(Arg), - DeleteLocal(Arg), - DeleteGlobal(Arg), - DeleteDeref(Arg), - LoadClosure(Arg), - Subscript, - StoreSubscript, - DeleteSubscript, - StoreAttr { - idx: Arg, - }, - DeleteAttr { - idx: Arg, - }, - LoadConst { - /// index into constants vec - idx: Arg, - }, - UnaryOperation { - op: Arg, - }, - BinaryOperation { - op: Arg, - }, - BinaryOperationInplace { - op: Arg, - }, - LoadAttr { - idx: Arg, - }, - TestOperation { - op: Arg, - }, - CompareOperation { - op: Arg, - }, - Pop, - Rotate2, - Rotate3, - Duplicate, - Duplicate2, - GetIter, - Continue { - target: Arg