Skip to content

Import sre-engine repository to main RustPython #5202

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 105 commits into from
Mar 22, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
105 commits
Select commit Hold shift + click to select a range
c2ee9ca
WIP - native _sre
coolreader18 Oct 1, 2020
e1362ea
WIP structure
qingshi163 Dec 1, 2020
82922bf
upgrade re version; implement helper functions;
qingshi163 Dec 23, 2020
4e03fb3
upgrade sre_parse.py; impl marks count
qingshi163 Dec 25, 2020
78485fd
create _sre.Match
qingshi163 Dec 26, 2020
aa0f20b
impl Pattern.fullmatch, Pattern.search
qingshi163 Dec 26, 2020
312e5b8
impl opcode groupref and assert_not
qingshi163 Dec 27, 2020
0b2c8d1
impl OpMaxUntil
qingshi163 Dec 29, 2020
93c2b8b
impl OpBranch
qingshi163 Dec 29, 2020
5a44598
impl OpRepeat
qingshi163 Dec 29, 2020
af7901d
impl OpMinUntil
qingshi163 Dec 29, 2020
fa2adaf
Impl OpRepeatONe
qingshi163 Dec 29, 2020
ae44580
general case for count
qingshi163 Dec 30, 2020
f728755
impl re.Match object
qingshi163 Dec 30, 2020
04bb80f
impl Match.group
qingshi163 Dec 31, 2020
8c442f5
rework OpMaxUntil; restruct popping context; add tests;
qingshi163 Jan 1, 2021
8fba935
OpMaxUntil zero-width protection
qingshi163 Jan 1, 2021
af1a53c
impl Match.groups()
qingshi163 Jan 1, 2021
db84f32
fix Opcode::CHARSET
qingshi163 Jan 1, 2021
36433a9
fix Opcode::BIGCHARSET
qingshi163 Jan 1, 2021
f05f6cb
impl Pattern.sub
qingshi163 Jan 3, 2021
817eb66
fix OpMinUntil
qingshi163 Jan 4, 2021
33ef823
impl Match.groupdict
qingshi163 Jan 6, 2021
76c95ab
impl Match.lastgroup
qingshi163 Jan 7, 2021
13a8b6c
add bytes support and refactor
qingshi163 Jan 17, 2021
97000fc
fix multiple bugs; skip crash tests
qingshi163 Jan 19, 2021
84113cb
fix zero width repeat
qingshi163 Jan 20, 2021
f2311b5
fix op branch
qingshi163 Jan 21, 2021
6a79232
fix at_beginning
qingshi163 Jan 21, 2021
7f0dad7
fix back_peek_char
qingshi163 Jan 21, 2021
4416158
fix multiple bugs; pass tests
qingshi163 Jan 22, 2021
db5bd64
Initial commit to switch to new repo
coolreader18 Jan 28, 2021
9c95994
Modify to work outside of rustpython-vm
coolreader18 Jan 28, 2021
2592067
Add LICENSE
coolreader18 Jan 28, 2021
2473c3e
add tests; fix OpAssert panic
qingshi163 Feb 1, 2021
174c932
Merge pull request #1 from qingshi163/master
coolreader18 Feb 1, 2021
cc4441b
Compile regex patterns for tests with a script
coolreader18 Feb 1, 2021
11532c5
Merge pull request #2 from RustPython/gen-tests
coolreader18 Feb 2, 2021
39a648b
fix OpAssert positive lookbehind
qingshi163 Feb 3, 2021
65c4774
Merge pull request #3 from qingshi163/master
coolreader18 Feb 3, 2021
2a43d66
Fix clippy lint
coolreader18 Apr 2, 2021
ca1346e
Have generate_tests.py generate Patterns inline in tests.rs
coolreader18 Apr 2, 2021
211a66c
Merge pull request #4 from RustPython/codegen-inline
coolreader18 Apr 5, 2021
b497b22
Add more info to Cargo.toml
coolreader18 Apr 5, 2021
9728dd8
fix test_string_boundaries
qingshi163 Apr 16, 2021
caef94d
Merge pull request #5 from qingshi163/master
coolreader18 Apr 16, 2021
d2b48fd
Release 0.1.1
coolreader18 Apr 16, 2021
73abbac
Add explicit include for Cargo files
coolreader18 Apr 16, 2021
df8453d
fix zerowidth search
qingshi163 Apr 20, 2021
b4f0b05
Merge pull request #6 from qingshi163/zerowidth
qingshi163 Apr 20, 2021
a3c3573
optimize count
qingshi163 Apr 20, 2021
869d80a
Merge pull request #7 from qingshi163/master
qingshi163 Apr 21, 2021
5bd6b67
optimize opcode that execute only once
qingshi163 Apr 21, 2021
7324fee
optimize search cache the string offset
qingshi163 Apr 21, 2021
9808633
Merge pull request #8 from qingshi163/master
qingshi163 Apr 22, 2021
86435b8
add benchmark
qingshi163 Apr 22, 2021
58981a4
optimize; replace hashmap with btreemap
qingshi163 Apr 22, 2021
c5871f4
Merge pull request #9 from qingshi163/master
coolreader18 May 1, 2021
74ebdaf
fix panic OpMinUntil return before restore repeat
qingshi163 Jul 11, 2022
919e1d7
Refactor and fix multiple max_until recusion (#10)
qingshi163 Jul 26, 2022
4007f82
optimize max_until and min_until
qingshi163 Jul 27, 2022
bf57f28
update version to 0.2.1
qingshi163 Jul 27, 2022
9058f28
refactor trait StrDrive instead enum
qingshi163 Jul 28, 2022
34bde45
pass compile
qingshi163 Aug 1, 2022
982d8f5
fix next_ctx
qingshi163 Aug 2, 2022
ccae898
fix next_ctx bug
qingshi163 Aug 2, 2022
f3b3044
fix lifetime
qingshi163 Aug 5, 2022
ca20b59
update version to 0.3.0
qingshi163 Aug 5, 2022
a48f5b0
impl op_info
qingshi163 Aug 5, 2022
8b1fcea
update version to 0.3.1
qingshi163 Aug 7, 2022
c31462d
refactor split State with Request
qingshi163 Aug 9, 2022
c15387e
refactor tests
qingshi163 Aug 9, 2022
942063d
refactor benches
qingshi163 Aug 9, 2022
de8973d
simplify lifetime
qingshi163 Aug 9, 2022
c494feb
refactor split Marks
qingshi163 Aug 9, 2022
1825800
clearup
qingshi163 Aug 9, 2022
e42df1d
update version to 0.4.0
qingshi163 Aug 9, 2022
c4f10ed
impl opinfo single literal
qingshi163 Aug 14, 2022
2366311
impl opinfo literal
qingshi163 Aug 15, 2022
646c8ac
impl opinfo charset
qingshi163 Aug 15, 2022
7e7b973
clearup
qingshi163 Aug 15, 2022
26a78db
update to 0.4.1
qingshi163 Aug 15, 2022
4e6b271
introduce SearchIter
qingshi163 Aug 15, 2022
285ba76
0.4.2 with dependency update
youknowone Oct 7, 2023
a777d22
fix _count
qingshi163 Dec 8, 2023
d73cc5f
update version and dependency
qingshi163 Dec 8, 2023
9070e12
refactor _match with nest loop
qingshi163 Dec 10, 2023
39c0106
update to cpython 3.12 op code
qingshi163 Dec 11, 2023
a9ed7de
fix _count general case
qingshi163 Dec 11, 2023
99ed744
impl atomic group & possessive repeat
qingshi163 Dec 13, 2023
9378497
clearup
qingshi163 Dec 13, 2023
003c45d
remove unneccesary INFO logic on main dispatch
qingshi163 Dec 13, 2023
41bdcfe
bump version to 0.5.0
qingshi163 Dec 13, 2023
169368b
fix some clippy
qingshi163 Dec 13, 2023
454aa4b
fix count not advance
qingshi163 Jan 3, 2024
17e1152
fix assert not mark
qingshi163 Jan 3, 2024
2fe1292
improve ctx in _match lazy create stack vec
qingshi163 Jan 4, 2024
f9b2d10
improve search at_beginning
qingshi163 Jan 5, 2024
118a00c
refactor _count general case
qingshi163 Jan 7, 2024
c93ea30
improve use StringCursor replace index based position
qingshi163 Jan 13, 2024
10e51ba
improve: use adjust_cursor reduce double calc
qingshi163 Jan 14, 2024
21fc205
improve: fix double count on _count
qingshi163 Jan 14, 2024
b3a606d
Add 'vm/sre_engine/' from commit '21fc2059b70ebd5bf4a7c524c40e7d4347e…
youknowone Mar 18, 2024
12601d0
integrate sre_engine crate to workspace
youknowone Mar 18, 2024
1dd9a2f
suppress clippy warnings
youknowone Mar 22, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
38 changes: 34 additions & 4 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

5 changes: 3 additions & 2 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ include = ["LICENSE", "Cargo.toml", "src/**/*.rs"]
resolver = "2"
members = [
"compiler", "compiler/core", "compiler/codegen",
".", "common", "derive", "jit", "vm", "pylib", "stdlib", "wasm/lib", "derive-impl",
".", "common", "derive", "jit", "vm", "vm/sre_engine", "pylib", "stdlib", "wasm/lib", "derive-impl",
]

[workspace.dependencies]
Expand All @@ -27,6 +27,7 @@ rustpython-jit = { path = "jit", version = "0.3.0" }
rustpython-vm = { path = "vm", default-features = false, version = "0.3.0" }
rustpython-pylib = { path = "pylib", version = "0.3.0" }
rustpython-stdlib = { path = "stdlib", default-features = false, version = "0.3.0" }
rustpython-sre_engine = { path = "vm/sre_engine", version = "0.6.0" }
rustpython-doc = { git = "https://github.com/RustPython/__doc__", tag = "0.3.0", version = "0.3.0" }

rustpython-literal = { git = "https://github.com/RustPython/Parser.git", rev = "29c4728dbedc7e69cc2560b9b34058bbba9b1303" }
Expand Down Expand Up @@ -64,7 +65,7 @@ malachite-base = "0.4.4"
num-complex = "0.4.0"
num-integer = "0.1.44"
num-traits = "0.2"
num_enum = "0.5.7"
num_enum = "0.7"
once_cell = "1.18"
parking_lot = "0.12.1"
paste = "1.0.7"
Expand Down
2 changes: 2 additions & 0 deletions vm/sre_engine/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
/target
Cargo.lock
21 changes: 21 additions & 0 deletions vm/sre_engine/.vscode/launch.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
{
"version": "0.2.0",
"configurations": [
{
"type": "lldb",
"request": "launch",
"name": "Debug Unit Test",
"cargo": {
"args": [
"test",
"--no-run"
],
"filter": {
"kind": "test"
}
},
"args": [],
"cwd": "${workspaceFolder}"
}
]
}
15 changes: 15 additions & 0 deletions vm/sre_engine/Cargo.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
[package]
name = "rustpython-sre_engine"
version = "0.6.0"
authors = ["Kangzhi Shi <shikangzhi@gmail.com>", "RustPython Team"]
description = "A low-level implementation of Python's SRE regex engine"
repository = "https://github.com/RustPython/RustPython"
license = "MIT"
edition = "2021"
keywords = ["regex"]
include = ["LICENSE", "src/**/*.rs"]

[dependencies]
num_enum = { workspace = true }
bitflags = { workspace = true }
optional = "0.5"
21 changes: 21 additions & 0 deletions vm/sre_engine/LICENSE
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2020 RustPython Team

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
111 changes: 111 additions & 0 deletions vm/sre_engine/benches/benches.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,111 @@
#![feature(test)]

extern crate test;
use test::Bencher;

use sre_engine::{Request, State, StrDrive};

struct Pattern {
code: &'static [u32],
}

impl Pattern {
fn state<'a, S: StrDrive>(&self, string: S) -> (Request<'a, S>, State) {
self.state_range(string, 0..usize::MAX)
}

fn state_range<'a, S: StrDrive>(
&self,
string: S,
range: std::ops::Range<usize>,
) -> (Request<'a, S>, State) {
let req = Request::new(string, range.start, range.end, self.code, false);
let state = State::default();
(req, state)
}
}

#[bench]
fn benchmarks(b: &mut Bencher) {
// # test common prefix
// pattern p1 = re.compile('Python|Perl') # , 'Perl'), # Alternation
// START GENERATED by generate_tests.py
#[rustfmt::skip] let p1 = Pattern { code: &[14, 8, 1, 4, 6, 1, 1, 80, 0, 16, 80, 7, 13, 16, 121, 16, 116, 16, 104, 16, 111, 16, 110, 15, 11, 9, 16, 101, 16, 114, 16, 108, 15, 2, 0, 1] };
// END GENERATED
// pattern p2 = re.compile('(Python|Perl)') #, 'Perl'), # Grouped alternation
// START GENERATED by generate_tests.py
#[rustfmt::skip] let p2 = Pattern { code: &[14, 8, 1, 4, 6, 1, 0, 80, 0, 17, 0, 16, 80, 7, 13, 16, 121, 16, 116, 16, 104, 16, 111, 16, 110, 15, 11, 9, 16, 101, 16, 114, 16, 108, 15, 2, 0, 17, 1, 1] };
// END GENERATED
// pattern p3 = re.compile('Python|Perl|Tcl') #, 'Perl'), # Alternation
// START GENERATED by generate_tests.py
#[rustfmt::skip] let p3 = Pattern { code: &[14, 9, 4, 3, 6, 16, 80, 16, 84, 0, 7, 15, 16, 80, 16, 121, 16, 116, 16, 104, 16, 111, 16, 110, 15, 22, 11, 16, 80, 16, 101, 16, 114, 16, 108, 15, 11, 9, 16, 84, 16, 99, 16, 108, 15, 2, 0, 1] };
// END GENERATED
// pattern p4 = re.compile('(Python|Perl|Tcl)') #, 'Perl'), # Grouped alternation
// START GENERATED by generate_tests.py
#[rustfmt::skip] let p4 = Pattern { code: &[14, 9, 4, 3, 6, 16, 80, 16, 84, 0, 17, 0, 7, 15, 16, 80, 16, 121, 16, 116, 16, 104, 16, 111, 16, 110, 15, 22, 11, 16, 80, 16, 101, 16, 114, 16, 108, 15, 11, 9, 16, 84, 16, 99, 16, 108, 15, 2, 0, 17, 1, 1] };
// END GENERATED
// pattern p5 = re.compile('(Python)\\1') #, 'PythonPython'), # Backreference
// START GENERATED by generate_tests.py
#[rustfmt::skip] let p5 = Pattern { code: &[14, 18, 1, 12, 12, 6, 0, 80, 121, 116, 104, 111, 110, 0, 0, 0, 0, 0, 0, 17, 0, 16, 80, 16, 121, 16, 116, 16, 104, 16, 111, 16, 110, 17, 1, 11, 0, 1] };
// END GENERATED
// pattern p6 = re.compile('([0a-z][a-z0-9]*,)+') #, 'a5,b7,c9,'), # Disable the fastmap optimization
// START GENERATED by generate_tests.py
#[rustfmt::skip] let p6 = Pattern { code: &[14, 4, 0, 2, 4294967295, 23, 31, 1, 4294967295, 17, 0, 13, 7, 16, 48, 22, 97, 122, 0, 24, 13, 0, 4294967295, 13, 8, 22, 97, 122, 22, 48, 57, 0, 1, 16, 44, 17, 1, 18, 1] };
// END GENERATED
// pattern p7 = re.compile('([a-z][a-z0-9]*,)+') #, 'a5,b7,c9,'), # A few sets
// START GENERATED by generate_tests.py
#[rustfmt::skip] let p7 = Pattern { code: &[14, 4, 0, 2, 4294967295, 23, 29, 1, 4294967295, 17, 0, 13, 5, 22, 97, 122, 0, 24, 13, 0, 4294967295, 13, 8, 22, 97, 122, 22, 48, 57, 0, 1, 16, 44, 17, 1, 18, 1] };
// END GENERATED
// pattern p8 = re.compile('Python') #, 'Python'), # Simple text literal
// START GENERATED by generate_tests.py
#[rustfmt::skip] let p8 = Pattern { code: &[14, 18, 3, 6, 6, 6, 6, 80, 121, 116, 104, 111, 110, 0, 0, 0, 0, 0, 0, 16, 80, 16, 121, 16, 116, 16, 104, 16, 111, 16, 110, 1] };
// END GENERATED
// pattern p9 = re.compile('.*Python') #, 'Python'), # Bad text literal
// START GENERATED by generate_tests.py
#[rustfmt::skip] let p9 = Pattern { code: &[14, 4, 0, 6, 4294967295, 24, 5, 0, 4294967295, 2, 1, 16, 80, 16, 121, 16, 116, 16, 104, 16, 111, 16, 110, 1] };
// END GENERATED
// pattern p10 = re.compile('.*Python.*') #, 'Python'), # Worse text literal
// START GENERATED by generate_tests.py
#[rustfmt::skip] let p10 = Pattern { code: &[14, 4, 0, 6, 4294967295, 24, 5, 0, 4294967295, 2, 1, 16, 80, 16, 121, 16, 116, 16, 104, 16, 111, 16, 110, 24, 5, 0, 4294967295, 2, 1, 1] };
// END GENERATED
// pattern p11 = re.compile('.*(Python)') #, 'Python'), # Bad text literal with grouping
// START GENERATED by generate_tests.py
#[rustfmt::skip] let p11 = Pattern { code: &[14, 4, 0, 6, 4294967295, 24, 5, 0, 4294967295, 2, 1, 17, 0, 16, 80, 16, 121, 16, 116, 16, 104, 16, 111, 16, 110, 17, 1, 1] };
// END GENERATED

let tests = [
(p1, "Perl"),
(p2, "Perl"),
(p3, "Perl"),
(p4, "Perl"),
(p5, "PythonPython"),
(p6, "a5,b7,c9,"),
(p7, "a5,b7,c9,"),
(p8, "Python"),
(p9, "Python"),
(p10, "Python"),
(p11, "Python"),
];

b.iter(move || {
for (p, s) in &tests {
let (req, mut state) = p.state(s.clone());
assert!(state.search(req));
let (req, mut state) = p.state(s.clone());
assert!(state.pymatch(&req));
let (mut req, mut state) = p.state(s.clone());
req.match_all = true;
assert!(state.pymatch(&req));
let s2 = format!("{}{}{}", " ".repeat(10000), s, " ".repeat(10000));
let (req, mut state) = p.state_range(s2.as_str(), 0..usize::MAX);
assert!(state.search(req));
let (req, mut state) = p.state_range(s2.as_str(), 10000..usize::MAX);
assert!(state.pymatch(&req));
let (req, mut state) = p.state_range(s2.as_str(), 10000..10000 + s.len());
assert!(state.pymatch(&req));
let (mut req, mut state) = p.state_range(s2.as_str(), 10000..10000 + s.len());
req.match_all = true;
assert!(state.pymatch(&req));
}
})
}
47 changes: 47 additions & 0 deletions vm/sre_engine/generate_tests.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
import os
from pathlib import Path
import re
import sre_constants
import sre_compile
import sre_parse
import json
from itertools import chain

m = re.search(r"const SRE_MAGIC: usize = (\d+);", open("src/constants.rs").read())
sre_engine_magic = int(m.group(1))
del m

assert sre_constants.MAGIC == sre_engine_magic

class CompiledPattern:
@classmethod
def compile(cls, pattern, flags=0):
p = sre_parse.parse(pattern)
code = sre_compile._code(p, flags)
self = cls()
self.pattern = pattern
self.code = code
self.flags = re.RegexFlag(flags | p.state.flags)
return self

for k, v in re.RegexFlag.__members__.items():
setattr(CompiledPattern, k, v)


# matches `// pattern {varname} = re.compile(...)`
pattern_pattern = re.compile(r"^((\s*)\/\/\s*pattern\s+(\w+)\s+=\s+(.+?))$(?:.+?END GENERATED)?", re.M | re.S)
def replace_compiled(m):
line, indent, varname, pattern = m.groups()
pattern = eval(pattern, {"re": CompiledPattern})
pattern = f"Pattern {{ code: &{json.dumps(pattern.code)} }}"
return f'''{line}
{indent}// START GENERATED by generate_tests.py
{indent}#[rustfmt::skip] let {varname} = {pattern};
{indent}// END GENERATED'''

with os.scandir("tests") as t, os.scandir("benches") as b:
for f in chain(t, b):
path = Path(f.path)
if path.suffix == ".rs":
replaced = pattern_pattern.sub(replace_compiled, path.read_text())
path.write_text(replaced)
Loading