diff --git a/src/uu/uniq/src/uniq.rs b/src/uu/uniq/src/uniq.rs
index b9090cd50cf..4995f8c198e 100644
--- a/src/uu/uniq/src/uniq.rs
+++ b/src/uu/uniq/src/uniq.rs
@@ -154,43 +154,53 @@ impl Uniq {
 
+    /// Builds the comparison key for `line` and hands it to `closure` as a
+    /// stream of characters, returning the closure's result.
+    ///
+    /// Starting from what `skip_fields` returns, the key drops the first
+    /// `slice_start` bytes (`-s`), keeps at most `slice_stop` characters
+    /// (`-w`) and, when `ignore_case` is set, is ASCII-lowercased.
     fn cmp_key<F>(&self, line: &[u8], mut closure: F) -> bool
     where
-        F: FnMut(&mut dyn Iterator<Item = u8>) -> bool,
+        F: FnMut(&mut dyn Iterator<Item = char>) -> bool,
     {
         let fields_to_check = self.skip_fields(line);
-        let len = fields_to_check.len();
-        let slice_start = self.slice_start.unwrap_or(0);
-        let slice_stop = self.slice_stop.unwrap_or(len);
-        if len > 0 {
-            // fast path: avoid doing any work if there is no need to skip or map to lower-case
-            if !self.ignore_case && slice_start == 0 && slice_stop == len {
-                return closure(&mut fields_to_check.iter().copied());
-            }
-
-            // fast path: avoid skipping
-            if self.ignore_case && slice_start == 0 && slice_stop == len {
-                return closure(&mut fields_to_check.iter().map(|u| u.to_ascii_lowercase()));
-            }
-
-            // fast path: we can avoid mapping chars to lower-case, if we don't want to ignore the case
-            if !self.ignore_case {
-                return closure(
-                    &mut fields_to_check
-                        .iter()
-                        .skip(slice_start)
-                        .take(slice_stop)
-                        .copied(),
-                );
-            }
-
-            closure(
-                &mut fields_to_check
-                    .iter()
-                    .skip(slice_start)
-                    .take(slice_stop)
-                    .map(|u| u.to_ascii_lowercase()),
-            )
-        } else {
-            closure(&mut fields_to_check.iter().copied())
-        }
+
+        // `-s N` skips N *bytes* (not characters), as uniq has historically
+        // done. Skipping past the end of the line leaves an empty key.
+        let skip_bytes = self.slice_start.unwrap_or(0);
+        let fields_to_check = fields_to_check.get(skip_bytes..).unwrap_or(&[]);
+
+        // `-w N` compares at most N characters; without it the whole key is
+        // used. `take` stops at the end of the key on its own, so the
+        // characters never need to be counted up front.
+        let max_chars = self.slice_stop.unwrap_or(usize::MAX);
+
+        // `ignore_case` folds ASCII letters only; other characters pass through.
+        let ignore_case = self.ignore_case;
+        let fold_case = move |c: char| {
+            if ignore_case {
+                c.to_ascii_lowercase()
+            } else {
+                c
+            }
+        };
+
+        match std::str::from_utf8(fields_to_check) {
+            // Valid UTF-8: `-w` counts characters, so a multi-byte character is
+            // never split in the middle.
+            Ok(text) => closure(&mut text.chars().take(max_chars).map(fold_case)),
+            // Invalid UTF-8: fall back to one "character" per byte. `-w` and
+            // `ignore_case` still apply, otherwise such lines would be compared
+            // in full and case-sensitively whatever options were given.
+            // NOTE(review): bytes 0x80..=0xFF become U+0080..=U+00FF here and may
+            // collide with the same characters decoded from a valid line; confirm
+            // that this is acceptable for the callers.
+            Err(_) => closure(
+                &mut fields_to_check
+                    .iter()
+                    .map(|&byte| char::from(byte))
+                    .take(max_chars)
+                    .map(fold_case),
+            ),
+        }
     }
 
diff --git a/tests/by-util/test_uniq.rs b/tests/by-util/test_uniq.rs
index 30cf73b8436..18f226f07dd 100644
--- a/tests/by-util/test_uniq.rs
+++ b/tests/by-util/test_uniq.rs
@@ -1172,3 +1172,15 @@ fn gnu_tests() {
         }
     }
 }
+
+// "à" (C3 A0) and "á" (C3 A1) share their first UTF-8 byte, so a byte-based
+// `-w1` would wrongly treat the two lines as duplicates.
+#[test]
+fn test_stdin_w1_multibyte() {
+    let input = "à\ná\n";
+    new_ucmd!()
+        .args(&["-w1"])
+        .pipe_in(input)
+        .run()
+        .stdout_is("à\ná\n");
+}