From f7f5a690b905f9de4d0a350a65b7e294934bd37b Mon Sep 17 00:00:00 2001 From: Sylvestre Ledru Date: Tue, 10 Mar 2026 16:04:47 +0100 Subject: [PATCH] numfmt: support explicit --unit-separator for suffix parsing --- src/uu/numfmt/src/format.rs | 198 ++++++++++++++++++++++------------- src/uu/numfmt/src/numfmt.rs | 14 +-- src/uu/numfmt/src/options.rs | 2 +- tests/by-util/test_numfmt.rs | 23 ++++ 4 files changed, 156 insertions(+), 81 deletions(-) diff --git a/src/uu/numfmt/src/format.rs b/src/uu/numfmt/src/format.rs index 57abc84588f..b0ff60876b8 100644 --- a/src/uu/numfmt/src/format.rs +++ b/src/uu/numfmt/src/format.rs @@ -11,59 +11,6 @@ use uucore::translate; use crate::options::{NumfmtOptions, RoundMethod, TransformOptions}; use crate::units::{DisplayableSuffix, IEC_BASES, RawSuffix, Result, SI_BASES, Suffix, Unit}; -/// Iterate over a line's fields, where each field is a contiguous sequence of -/// non-whitespace, optionally prefixed with one or more characters of leading -/// whitespace. Fields are returned as tuples of `(prefix, field)`. -/// -/// # Examples: -/// -/// ``` -/// let mut fields = uu_numfmt::format::WhitespaceSplitter { s: Some(" 1234 5") }; -/// -/// assert_eq!(Some((" ", "1234")), fields.next()); -/// assert_eq!(Some((" ", "5")), fields.next()); -/// assert_eq!(None, fields.next()); -/// ``` -/// -/// Delimiters are included in the results; `prefix` will be empty only for -/// the first field of the line (including the case where the input line is -/// empty): -/// -/// ``` -/// let mut fields = uu_numfmt::format::WhitespaceSplitter { s: Some("first second") }; -/// -/// assert_eq!(Some(("", "first")), fields.next()); -/// assert_eq!(Some((" ", "second")), fields.next()); -/// -/// let mut fields = uu_numfmt::format::WhitespaceSplitter { s: Some("") }; -/// -/// assert_eq!(Some(("", "")), fields.next()); -/// ``` -pub struct WhitespaceSplitter<'a> { - pub s: Option<&'a str>, -} - -impl<'a> Iterator for WhitespaceSplitter<'a> { - type Item = (&'a str, &'a str); - - /// Yield the next field in the input string as a tuple `(prefix, field)`. - fn next(&mut self) -> Option { - let haystack = self.s?; - - let (prefix, field) = haystack.split_at( - haystack - .find(|c: char| !c.is_whitespace()) - .unwrap_or(haystack.len()), - ); - - let (field, rest) = field.split_at(field.find(char::is_whitespace).unwrap_or(field.len())); - - self.s = if rest.is_empty() { None } else { Some(rest) }; - - Some((prefix, field)) - } -} - fn find_numeric_beginning(s: &str) -> Option<&str> { let mut decimal_point_seen = false; if s.is_empty() { @@ -130,6 +77,9 @@ fn detailed_error_message(s: &str, unit: Unit) -> Option { if valid_part != s && valid_part.parse::().is_ok() { return match s.chars().nth(valid_part.len()) { + Some('+' | '-') => { + Some(translate!("numfmt-error-invalid-number", "input" => s.quote())) + } Some(v) if RawSuffix::try_from(&v).is_ok() => Some( translate!("numfmt-error-rejecting-suffix", "number" => valid_part, "suffix" => s[valid_part.len()..]), ), @@ -146,7 +96,12 @@ fn detailed_error_message(s: &str, unit: Unit) -> Option { None } -fn parse_suffix(s: &str, unit: Unit, max_whitespace: usize) -> Result<(f64, Option)> { +fn parse_suffix( + s: &str, + unit: Unit, + unit_separator: &str, + explicit_unit_separator: bool, +) -> Result<(f64, Option)> { let trimmed = s.trim_end(); if trimmed.is_empty() { return Err(translate!("numfmt-error-invalid-number-empty")); @@ -185,23 +140,122 @@ fn parse_suffix(s: &str, unit: Unit, max_whitespace: usize) -> Result<(f64, Opti }; let number_part = &trimmed[..trimmed.len() - suffix_len]; - let number_trimmed = number_part.trim_end(); - // Validate whitespace between number and suffix if suffix.is_some() { - let whitespace = number_part.len() - number_trimmed.len(); - if whitespace > max_whitespace { - return Err(translate!("numfmt-error-invalid-suffix", "input" => s.quote())); - } + let separator_len = if explicit_unit_separator { + if number_part.ends_with(unit_separator) { + unit_separator.len() + } else if unit_separator.is_empty() { + 0 + } else { + return Err(translate!("numfmt-error-invalid-suffix", "input" => s.quote())); + } + } else { + let number_trimmed = number_part.trim_end(); + let whitespace = number_part.len() - number_trimmed.len(); + if whitespace > 1 { + return Err(translate!("numfmt-error-invalid-suffix", "input" => s.quote())); + } + whitespace + }; + + let number = number_part[..number_part.len() - separator_len] + .parse::() + .map_err(|_| translate!("numfmt-error-invalid-number", "input" => s.quote()))?; + + return Ok((number, suffix)); } - let number = number_trimmed + let number = number_part .parse::() .map_err(|_| translate!("numfmt-error-invalid-number", "input" => s.quote()))?; Ok((number, suffix)) } +fn next_field_index(s: &str) -> usize { + s.find(char::is_whitespace).unwrap_or(s.len()) +} + +fn split_next_field<'a>(s: &'a str) -> (&'a str, &'a str, &'a str) { + let prefix_len = s.find(|c: char| !c.is_whitespace()).unwrap_or(s.len()); + let prefix = &s[..prefix_len]; + let field_start = prefix_len; + let field_len = next_field_index(&s[field_start..]); + let field_end = field_start + field_len; + let field = &s[field_start..field_end]; + let rest = &s[field_end..]; + (prefix, field, rest) +} + +/// When an explicit whitespace unit separator is set (e.g. `--unit-separator=" "`), +/// a suffix like "K" may appear as a separate whitespace-delimited field. Detect +/// this case so the caller can merge the suffix back into the preceding number field. +fn split_mergeable_suffix<'a>(s: &'a str, options: &NumfmtOptions) -> Option<(&'a str, &'a str)> { + if !options.explicit_unit_separator + || options.unit_separator.is_empty() + || !options.unit_separator.chars().all(char::is_whitespace) + { + return None; + } + + if !s.starts_with(&options.unit_separator) { + return None; + } + + let (prefix, field, _) = split_next_field(s); + if prefix != options.unit_separator { + return None; + } + + match field.len() { + 1 => { + let _ = field + .chars() + .next() + .filter(|c| RawSuffix::try_from(c).is_ok())?; + } + 2 if field.ends_with('i') => { + let _ = field + .chars() + .next() + .filter(|c| RawSuffix::try_from(c).is_ok())?; + } + _ => return None, + } + + Some((prefix, field)) +} + +struct WhitespaceSplitter<'a, 'b> { + s: Option<&'a str>, + options: &'b NumfmtOptions, +} + +impl<'a> Iterator for WhitespaceSplitter<'a, '_> { + type Item = (&'a str, &'a str); + + fn next(&mut self) -> Option { + let haystack = self.s?; + let (prefix, field, rest) = split_next_field(haystack); + + if field.is_empty() { + self.s = None; + return Some((prefix, field)); + } + + if let Some((suffix_prefix, suffix_field)) = split_mergeable_suffix(rest, self.options) { + let merged_len = prefix.len() + field.len() + suffix_prefix.len() + suffix_field.len(); + let merged_field = &haystack[prefix.len()..merged_len]; + self.s = Some(&haystack[merged_len..]).filter(|rest| !rest.is_empty()); + return Some((prefix, merged_field)); + } + + self.s = Some(rest).filter(|rest| !rest.is_empty()); + Some((prefix, field)) + } +} + /// Returns the implicit precision of a number, which is the count of digits after the dot. For /// example, 1.23 has an implicit precision of 2. fn parse_implicit_precision(s: &str) -> usize { @@ -252,9 +306,14 @@ fn remove_suffix(i: f64, s: Option, u: Unit) -> Result { } } -fn transform_from(s: &str, opts: &TransformOptions, max_whitespace: usize) -> Result { - let (i, suffix) = parse_suffix(s, opts.from, max_whitespace) - .map_err(|original| detailed_error_message(s, opts.from).unwrap_or(original))?; +fn transform_from(s: &str, opts: &TransformOptions, options: &NumfmtOptions) -> Result { + let (i, suffix) = parse_suffix( + s, + opts.from, + &options.unit_separator, + options.explicit_unit_separator, + ) + .map_err(|original| detailed_error_message(s, opts.from).unwrap_or(original))?; let i = i * (opts.from_unit as f64); remove_suffix(i, suffix, opts.from).map(|n| { @@ -409,11 +468,7 @@ fn format_string( }; let number = transform_to( - transform_from( - source_without_suffix, - &options.transform, - options.max_whitespace, - )?, + transform_from(source_without_suffix, &options.transform, options)?, &options.transform, options.round, precision, @@ -531,7 +586,10 @@ pub fn write_formatted_with_whitespace( options: &NumfmtOptions, eol: Option, ) -> Result<()> { - for (n, (prefix, field)) in (1..).zip(WhitespaceSplitter { s: Some(s) }) { + for (n, (prefix, field)) in (1..).zip(WhitespaceSplitter { + s: Some(s), + options, + }) { let field_selected = uucore::ranges::contain(&options.fields, n); if field_selected { diff --git a/src/uu/numfmt/src/numfmt.rs b/src/uu/numfmt/src/numfmt.rs index e2e86b8e1b3..a779a7cd554 100644 --- a/src/uu/numfmt/src/numfmt.rs +++ b/src/uu/numfmt/src/numfmt.rs @@ -285,14 +285,8 @@ fn parse_options(args: &ArgMatches) -> Result { .cloned() .unwrap_or_default(); - // Max whitespace between number and suffix: length of separator if provided, default one - let max_whitespace = if args.contains_id(UNIT_SEPARATOR) - && args.value_source(UNIT_SEPARATOR) == Some(ValueSource::CommandLine) - { - unit_separator.len() - } else { - 1 - }; + let explicit_unit_separator = args.contains_id(UNIT_SEPARATOR) + && args.value_source(UNIT_SEPARATOR) == Some(ValueSource::CommandLine); let invalid = InvalidModes::from_str(args.get_one::(INVALID).unwrap()).unwrap(); @@ -309,7 +303,7 @@ fn parse_options(args: &ArgMatches) -> Result { round, suffix, unit_separator, - max_whitespace, + explicit_unit_separator, format, invalid, zero_terminated, @@ -530,7 +524,7 @@ mod tests { round: RoundMethod::Nearest, suffix: None, unit_separator: String::new(), - max_whitespace: 1, + explicit_unit_separator: false, format: FormatOptions::default(), invalid: InvalidModes::Abort, zero_terminated: false, diff --git a/src/uu/numfmt/src/options.rs b/src/uu/numfmt/src/options.rs index f6db4ff9f89..21d6e7ff9ec 100644 --- a/src/uu/numfmt/src/options.rs +++ b/src/uu/numfmt/src/options.rs @@ -55,7 +55,7 @@ pub struct NumfmtOptions { pub round: RoundMethod, pub suffix: Option, pub unit_separator: String, - pub max_whitespace: usize, + pub explicit_unit_separator: bool, pub format: FormatOptions, pub invalid: InvalidModes, pub zero_terminated: bool, diff --git a/tests/by-util/test_numfmt.rs b/tests/by-util/test_numfmt.rs index c9ff617c1e9..e9c9994a574 100644 --- a/tests/by-util/test_numfmt.rs +++ b/tests/by-util/test_numfmt.rs @@ -316,6 +316,14 @@ fn test_should_report_invalid_number_with_interior_junk() { .stderr_is("numfmt: invalid suffix in input: '1x0K'\n"); } +#[test] +fn test_should_report_invalid_number_with_sign_after_decimal() { + new_ucmd!() + .args(&["--", "-0.-1"]) + .fails_with_code(2) + .stderr_is("numfmt: invalid number: '-0.-1'\n"); +} + #[test] fn test_should_skip_leading_space_from_stdin() { new_ucmd!() @@ -1237,6 +1245,21 @@ fn test_empty_delimiter_multi_char_unit_separator() { .stdout_only("1000\n2000000\n3000000000\n"); } +#[test] +fn test_whitespace_mode_parses_custom_unit_separator_inputs() { + new_ucmd!() + .args(&["--from=iec", "--unit-separator=::"]) + .pipe_in("4::K\n") + .succeeds() + .stdout_only("4096\n"); + + new_ucmd!() + .args(&["--from=iec", "--unit-separator=\u{a0}"]) + .pipe_in("4\u{a0}K\n") + .succeeds() + .stdout_only("4096\n"); +} + #[test] fn test_empty_delimiter_whitespace_rejection() { new_ucmd!()