Skip to content

Improved performance and readability of split_idents_on_dot: #6059

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
184 changes: 79 additions & 105 deletions src/shell/helper.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
#![cfg_attr(target_arch = "wasm32", allow(dead_code))]

use rustpython_vm::{
AsObject, PyResult, TryFromObject, VirtualMachine,
builtins::{PyDictRef, PyStrRef},
Expand All @@ -11,138 +12,113 @@ pub struct ShellHelper<'vm> {
globals: PyDictRef,
}

fn reverse_string(s: &mut String) {
let rev = s.chars().rev().collect();
*s = rev;
}

fn split_idents_on_dot(line: &str) -> Option<(usize, Vec<String>)> {
let mut words = vec![String::new()];
let mut startpos = 0;
for (i, c) in line.chars().rev().enumerate() {
match c {
'.' => {
// check for a double dot
if i != 0 && words.last().is_some_and(|s| s.is_empty()) {
return None;
}
reverse_string(words.last_mut().unwrap());
if words.len() == 1 {
startpos = line.len() - i;
}
words.push(String::new());
}
c if c.is_alphanumeric() || c == '_' => words.last_mut().unwrap().push(c),
_ => {
if words.len() == 1 {
if words.last().unwrap().is_empty() {
return None;
}
startpos = line.len() - i;
}
break;
}
}
}
if words == [String::new()] {
return None;
}
reverse_string(words.last_mut().unwrap());
words.reverse();

Some((startpos, words))
}

impl<'vm> ShellHelper<'vm> {
pub const fn new(vm: &'vm VirtualMachine, globals: PyDictRef) -> Self {
ShellHelper { vm, globals }
Self { vm, globals }
}

fn get_available_completions<'w>(
&self,
words: &'w [String],
) -> Option<(&'w str, impl Iterator<Item = PyResult<PyStrRef>> + 'vm)> {
// the very first word and then all the ones after the dot
let (first, rest) = words.split_first().unwrap();
let (first, rest) = words.split_first()?;

let str_iter_method = |obj, name| {
let get_str_iter = |obj, name| {
let iter = self.vm.call_special_method(obj, name, ())?;
ArgIterable::<PyStrRef>::try_from_object(self.vm, iter)?.iter(self.vm)
};

let (word_start, iter1, iter2) = if let Some((last, parents)) = rest.split_last() {
// we need to get an attribute based off of the dir() of an object

// last: the last word, could be empty if it ends with a dot
// parents: the words before the dot

let mut current = self.globals.get_item_opt(first.as_str(), self.vm).ok()??;

if let Some((last, parents)) = rest.split_last() {
let mut current = self.globals.get_item_opt(first, self.vm).ok()??;
for attr in parents {
let attr = self.vm.ctx.new_str(attr.as_str());
current = current.get_attr(&attr, self.vm).ok()?;
current = current.get_attr(self.vm.ctx.new_str(attr), self.vm).ok()?;
}

let current_iter = str_iter_method(&current, identifier!(self.vm, __dir__)).ok()?;

(last, current_iter, None)
let iter = get_str_iter(&current, identifier!(self.vm, __dir__)).ok()?;
Some((last, iter))
} else {
// we need to get a variable based off of globals/builtins

let globals =
str_iter_method(self.globals.as_object(), identifier!(self.vm, keys)).ok()?;
let builtins =
str_iter_method(self.vm.builtins.as_object(), identifier!(self.vm, __dir__))
.ok()?;
(first, globals, Some(builtins))
};
Some((word_start, iter1.chain(iter2.into_iter().flatten())))
let globals = get_str_iter(self.globals.as_object(), identifier!(self.vm, keys)).ok()?;
let builtins = get_str_iter(self.vm.builtins.as_object(), identifier!(self.vm, __dir__)).ok()?;
Some((first, globals.chain(builtins)))
}
}

fn complete_opt(&self, line: &str) -> Option<(usize, Vec<String>)> {
let (startpos, words) = split_idents_on_dot(line)?;
let (prefix, completions) = self.get_available_completions(&words)?;

let (word_start, iter) = self.get_available_completions(&words)?;

let all_completions = iter
.filter(|res| {
res.as_ref()
.ok()
.is_none_or(|s| s.as_str().starts_with(word_start))
})
.collect::<Result<Vec<_>, _>>()
.ok()?;
let mut completions = if word_start.starts_with('_') {
// if they're already looking for something starting with a '_', just give
// them all the completions
all_completions
let completions = completions
.filter_map(Result::ok)
.filter(|s| prefix.is_empty() || s.as_str().starts_with(prefix))
.collect::<Vec<_>>();

let filtered = if prefix.starts_with('_') {
completions
} else {
// only the completions that don't start with a '_'
let no_underscore = all_completions
let no_underscore: Vec<_> = completions
.iter()
.filter(|&s| !s.as_str().starts_with('_'))
.filter(|s| !s.as_str().starts_with('_'))
.cloned()
.collect::<Vec<_>>();

// if there are only completions that start with a '_', give them all of the
// completions, otherwise only the ones that don't start with '_'
.collect();
if no_underscore.is_empty() {
all_completions
completions
} else {
no_underscore
}
};

// sort the completions alphabetically
completions.sort_by(|a, b| std::cmp::Ord::cmp(a.as_str(), b.as_str()));
let mut result = filtered
.into_iter()
.map(|s| s.as_str().to_owned())
.collect::<Vec<_>>();
result.sort_unstable();

Some((
startpos,
completions
.into_iter()
.map(|s| s.as_str().to_owned())
.collect(),
))
Some((startpos, result))
}
}

fn split_idents_on_dot(line: &str) -> Option<(usize, Vec<String>)> {
let mut idents = vec![String::new()];
let mut startpos = 0;
let mut rev_chars = line.chars().rev().enumerate();

while let Some((i, c)) = rev_chars.next() {
match c {
'.' => {
if idents.last().map_or(false, |s| s.is_empty()) && i != 0 {
return None; // Double dot
}
idents.last_mut().map(|s| reverse_string(s));
if idents.len() == 1 {
startpos = line.len() - i;
}
idents.push(String::new());
}
c if c.is_alphanumeric() || c == '_' => {
idents.last_mut()?.push(c);
}
_ => {
if idents.len() == 1 && idents.last().map_or(true, |s| s.is_empty()) {
return None;
}
startpos = line.len() - i;
break;
}
}
}

if idents == [String::new()] {
return None;
}

idents.last_mut().map(|s| reverse_string(s));
idents.reverse();
Some((startpos, idents))
}

fn reverse_string(s: &mut String) {
unsafe {
let bytes = s.as_bytes_mut();
bytes.reverse();
}
}
Comment on lines +118 to 123
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue

Critical: Unsafe string reversal breaks UTF-8 invariants

This unsafe optimization is incorrect and dangerous. Reversing UTF-8 bytes directly will corrupt multi-byte Unicode characters, violating Rust's UTF-8 string invariants and potentially causing undefined behavior.

For example, the string "café" (with é as U+00E9, encoded as bytes [0xC3, 0xA9]) would become invalid UTF-8 after byte reversal.

Revert to the safe character-based approach:

-fn reverse_string(s: &mut String) {
-    unsafe {
-        let bytes = s.as_bytes_mut();
-        bytes.reverse();
-    }
-}
+fn reverse_string(s: &mut String) {
+    let reversed: String = s.chars().rev().collect();
+    *s = reversed;
+}

If performance is critical, consider using a safer approach like s.as_mut_str().chars().rev().collect() or validate that all input is ASCII-only.

📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
fn reverse_string(s: &mut String) {
unsafe {
let bytes = s.as_bytes_mut();
bytes.reverse();
}
}
fn reverse_string(s: &mut String) {
let reversed: String = s.chars().rev().collect();
*s = reversed;
}
🤖 Prompt for AI Agents
In src/shell/helper.rs around lines 118 to 123, the current unsafe method
reverses the string by directly reversing its bytes, which breaks UTF-8 encoding
and can cause undefined behavior. Replace this unsafe byte reversal with a safe
character-based reversal by converting the string to characters, reversing them,
and collecting back into the string. This ensures UTF-8 validity is maintained.
If performance is a concern, ensure input is ASCII-only before using byte
reversal or stick to the safe char-based method.


Expand All @@ -152,6 +128,7 @@ cfg_if::cfg_if! {
completion::Completer, highlight::Highlighter, hint::Hinter, validate::Validator, Context,
Helper,
};

impl Completer for ShellHelper<'_> {
type Candidate = String;

Expand All @@ -161,10 +138,7 @@ cfg_if::cfg_if! {
pos: usize,
_ctx: &Context,
) -> rustyline::Result<(usize, Vec<String>)> {
Ok(self
.complete_opt(&line[0..pos])
// as far as I can tell, there's no better way to do both completion
// and indentation (or even just indentation)
Ok(self.complete_opt(&line[..pos])
.unwrap_or_else(|| (pos, vec!["\t".to_owned()])))
}
}
Expand Down
Loading