Skip to content

uniq: fix multibyte input #7046

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Jan 1, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
61 changes: 31 additions & 30 deletions src/uu/uniq/src/uniq.rs
Original file line number Diff line number Diff line change
Expand Up @@ -154,43 +154,44 @@ impl Uniq {

fn cmp_key<F>(&self, line: &[u8], mut closure: F) -> bool
where
F: FnMut(&mut dyn Iterator<Item = u8>) -> bool,
F: FnMut(&mut dyn Iterator<Item = char>) -> bool,
{
let fields_to_check = self.skip_fields(line);
let len = fields_to_check.len();
let slice_start = self.slice_start.unwrap_or(0);
let slice_stop = self.slice_stop.unwrap_or(len);
if len > 0 {
// fast path: avoid doing any work if there is no need to skip or map to lower-case
if !self.ignore_case && slice_start == 0 && slice_stop == len {
return closure(&mut fields_to_check.iter().copied());
}

// fast path: avoid skipping
if self.ignore_case && slice_start == 0 && slice_stop == len {
return closure(&mut fields_to_check.iter().map(|u| u.to_ascii_lowercase()));
}
// Skip self.slice_start bytes (if -s was used).
// self.slice_start is how many characters to skip, but historically
// uniq’s `-s N` means “skip N *bytes*,” so do that literally:
let skip_bytes = self.slice_start.unwrap_or(0);
let fields_to_check = if skip_bytes < fields_to_check.len() {
&fields_to_check[skip_bytes..]
} else {
// If skipping beyond end-of-line, leftover is empty => effectively ""
&[]
};

// fast path: we can avoid mapping chars to lower-case, if we don't want to ignore the case
if !self.ignore_case {
return closure(
&mut fields_to_check
.iter()
.skip(slice_start)
.take(slice_stop)
.copied(),
);
// Convert the leftover bytes to UTF-8 for character-based -w
// If invalid UTF-8, just compare them as individual bytes (fallback).
let string_after_skip = match std::str::from_utf8(fields_to_check) {
Ok(s) => s,
Err(_) => {
// Fallback: if invalid UTF-8, treat them as single-byte “chars”
return closure(&mut fields_to_check.iter().map(|&b| b as char));
}
};

closure(
&mut fields_to_check
.iter()
.skip(slice_start)
.take(slice_stop)
.map(|u| u.to_ascii_lowercase()),
)
let total_chars = string_after_skip.chars().count();

// `-w N` => Compare no more than N characters
let slice_stop = self.slice_stop.unwrap_or(total_chars);
let slice_start = slice_stop.min(total_chars);

let mut iter = string_after_skip.chars().take(slice_start);

if self.ignore_case {
// We can do ASCII-lowercase or full Unicode-lowercase. For minimal changes, do ASCII:
closure(&mut iter.map(|c| c.to_ascii_lowercase()))
} else {
closure(&mut fields_to_check.iter().copied())
closure(&mut iter)
}
}

Expand Down
10 changes: 10 additions & 0 deletions tests/by-util/test_uniq.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1172,3 +1172,13 @@ fn gnu_tests() {
}
}
}

/// Runs `uniq -w1` on two lines that differ only in their accented first
/// character and expects both lines to be printed, i.e. to be treated as
/// distinct.
///
/// NOTE(review): presumably guards against `-w1` comparing just the first
/// byte, which these two characters would share when encoded as precomposed
/// UTF-8 — confirm against the `cmp_key` implementation.
#[test]
fn test_stdin_w1_multibyte() {
    new_ucmd!()
        .args(&["-w1"])
        .pipe_in("à\ná\n")
        .run()
        .stdout_is("à\ná\n");
}
Loading