From f7f5a690b905f9de4d0a350a65b7e294934bd37b Mon Sep 17 00:00:00 2001
From: Sylvestre Ledru <sylvestre@debian.org>
Date: Tue, 10 Mar 2026 16:04:47 +0100
Subject: [PATCH] numfmt: support explicit --unit-separator for suffix parsing

---
 src/uu/numfmt/src/format.rs  | 198 ++++++++++++++++++++++-------------
 src/uu/numfmt/src/numfmt.rs  |  14 +--
 src/uu/numfmt/src/options.rs |   2 +-
 tests/by-util/test_numfmt.rs |  23 ++++
 4 files changed, 156 insertions(+), 81 deletions(-)
diff --git a/src/uu/numfmt/src/format.rs b/src/uu/numfmt/src/format.rs
index 57abc84588f..b0ff60876b8 100644
--- a/src/uu/numfmt/src/format.rs
+++ b/src/uu/numfmt/src/format.rs
@@ -11,59 +11,6 @@ use uucore::translate;
 use crate::options::{NumfmtOptions, RoundMethod, TransformOptions};
 use crate::units::{DisplayableSuffix, IEC_BASES, RawSuffix, Result, SI_BASES, Suffix, Unit};
 
-/// Iterate over a line's fields, where each field is a contiguous sequence of
-/// non-whitespace, optionally prefixed with one or more characters of leading
-/// whitespace. Fields are returned as tuples of `(prefix, field)`.
-///
-/// # Examples:
-///
-/// ```
-/// let mut fields = uu_numfmt::format::WhitespaceSplitter { s: Some("    1234 5") };
-///
-/// assert_eq!(Some(("    ", "1234")), fields.next());
-/// assert_eq!(Some((" ", "5")), fields.next());
-/// assert_eq!(None, fields.next());
-/// ```
-///
-/// Delimiters are included in the results; `prefix` will be empty only for
-/// the first field of the line (including the case where the input line is
-/// empty):
-///
-/// ```
-/// let mut fields = uu_numfmt::format::WhitespaceSplitter { s: Some("first second") };
-///
-/// assert_eq!(Some(("", "first")), fields.next());
-/// assert_eq!(Some((" ", "second")), fields.next());
-///
-/// let mut fields = uu_numfmt::format::WhitespaceSplitter { s: Some("") };
-///
-/// assert_eq!(Some(("", "")), fields.next());
-/// ```
-pub struct WhitespaceSplitter<'a> {
-    pub s: Option<&'a str>,
-}
-
-impl<'a> Iterator for WhitespaceSplitter<'a> {
-    type Item = (&'a str, &'a str);
-
-    /// Yield the next field in the input string as a tuple `(prefix, field)`.
-    fn next(&mut self) -> Option<Self::Item> {
-        let haystack = self.s?;
-
-        let (prefix, field) = haystack.split_at(
-            haystack
-                .find(|c: char| !c.is_whitespace())
-                .unwrap_or(haystack.len()),
-        );
-
-        let (field, rest) = field.split_at(field.find(char::is_whitespace).unwrap_or(field.len()));
-
-        self.s = if rest.is_empty() { None } else { Some(rest) };
-
-        Some((prefix, field))
-    }
-}
-
 fn find_numeric_beginning(s: &str) -> Option<&str> {
     let mut decimal_point_seen = false;
     if s.is_empty() {
@@ -130,6 +77,9 @@ fn detailed_error_message(s: &str, unit: Unit) -> Option<String> {
 
     if valid_part != s && valid_part.parse::<f64>().is_ok() {
         return match s.chars().nth(valid_part.len()) {
+            Some('+' | '-') => {
+                Some(translate!("numfmt-error-invalid-number", "input" => s.quote()))
+            }
             Some(v) if RawSuffix::try_from(&v).is_ok() => Some(
                 translate!("numfmt-error-rejecting-suffix", "number" => valid_part, "suffix" => s[valid_part.len()..]),
             ),
@@ -146,7 +96,12 @@ fn detailed_error_message(s: &str, unit: Unit) -> Option<String> {
     None
 }
 
-fn parse_suffix(s: &str, unit: Unit, max_whitespace: usize) -> Result<(f64, Option<Suffix>)> {
+fn parse_suffix(
+    s: &str,
+    unit: Unit,
+    unit_separator: &str,
+    explicit_unit_separator: bool,
+) -> Result<(f64, Option<Suffix>)> {
     let trimmed = s.trim_end();
     if trimmed.is_empty() {
         return Err(translate!("numfmt-error-invalid-number-empty"));
@@ -185,23 +140,122 @@ fn parse_suffix(s: &str, unit: Unit, max_whitespace: usize) -> Result<(f64, Opti
     };
 
     let number_part = &trimmed[..trimmed.len() - suffix_len];
-    let number_trimmed = number_part.trim_end();
 
-    // Validate whitespace between number and suffix
     if suffix.is_some() {
-        let whitespace = number_part.len() - number_trimmed.len();
-        if whitespace > max_whitespace {
-            return Err(translate!("numfmt-error-invalid-suffix", "input" => s.quote()));
-        }
+        let separator_len = if explicit_unit_separator {
+            if number_part.ends_with(unit_separator) {
+                unit_separator.len()
+            } else if unit_separator.is_empty() {
+                0
+            } else {
+                return Err(translate!("numfmt-error-invalid-suffix", "input" => s.quote()));
+            }
+        } else {
+            let number_trimmed = number_part.trim_end();
+            let whitespace = number_part.len() - number_trimmed.len();
+            if whitespace > 1 {
+                return Err(translate!("numfmt-error-invalid-suffix", "input" => s.quote()));
+            }
+            whitespace
+        };
+
+        let number = number_part[..number_part.len() - separator_len]
+            .parse::<f64>()
+            .map_err(|_| translate!("numfmt-error-invalid-number", "input" => s.quote()))?;
+
+        return Ok((number, suffix));
     }
 
-    let number = number_trimmed
+    let number = number_part
         .parse::<f64>()
         .map_err(|_| translate!("numfmt-error-invalid-number", "input" => s.quote()))?;
 
     Ok((number, suffix))
 }
 
+fn next_field_index(s: &str) -> usize {
+    s.find(char::is_whitespace).unwrap_or(s.len())
+}
+
+fn split_next_field<'a>(s: &'a str) -> (&'a str, &'a str, &'a str) {
+    let prefix_len = s.find(|c: char| !c.is_whitespace()).unwrap_or(s.len());
+    let prefix = &s[..prefix_len];
+    let field_start = prefix_len;
+    let field_len = next_field_index(&s[field_start..]);
+    let field_end = field_start + field_len;
+    let field = &s[field_start..field_end];
+    let rest = &s[field_end..];
+    (prefix, field, rest)
+}
+
+/// When an explicit whitespace unit separator is set (e.g. `--unit-separator=" "`),
+/// a suffix like "K" may appear as a separate whitespace-delimited field.  Detect
+/// this case so the caller can merge the suffix back into the preceding number field.
+fn split_mergeable_suffix<'a>(s: &'a str, options: &NumfmtOptions) -> Option<(&'a str, &'a str)> {
+    if !options.explicit_unit_separator
+        || options.unit_separator.is_empty()
+        || !options.unit_separator.chars().all(char::is_whitespace)
+    {
+        return None;
+    }
+
+    if !s.starts_with(&options.unit_separator) {
+        return None;
+    }
+
+    let (prefix, field, _) = split_next_field(s);
+    if prefix != options.unit_separator {
+        return None;
+    }
+
+    match field.len() {
+        1 => {
+            let _ = field
+                .chars()
+                .next()
+                .filter(|c| RawSuffix::try_from(c).is_ok())?;
+        }
+        2 if field.ends_with('i') => {
+            let _ = field
+                .chars()
+                .next()
+                .filter(|c| RawSuffix::try_from(c).is_ok())?;
+        }
+        _ => return None,
+    }
+
+    Some((prefix, field))
+}
+
+struct WhitespaceSplitter<'a, 'b> {
+    s: Option<&'a str>,
+    options: &'b NumfmtOptions,
+}
+
+impl<'a> Iterator for WhitespaceSplitter<'a, '_> {
+    type Item = (&'a str, &'a str);
+
+    fn next(&mut self) -> Option<Self::Item> {
+        let haystack = self.s?;
+        let (prefix, field, rest) = split_next_field(haystack);
+
+        if field.is_empty() {
+            self.s = None;
+            return Some((prefix, field));
+        }
+
+        if let Some((suffix_prefix, suffix_field)) = split_mergeable_suffix(rest, self.options) {
+            let merged_len = prefix.len() + field.len() + suffix_prefix.len() + suffix_field.len();
+            let merged_field = &haystack[prefix.len()..merged_len];
+            self.s = Some(&haystack[merged_len..]).filter(|rest| !rest.is_empty());
+            return Some((prefix, merged_field));
+        }
+
+        self.s = Some(rest).filter(|rest| !rest.is_empty());
+        Some((prefix, field))
+    }
+}
+
 /// Returns the implicit precision of a number, which is the count of digits after the dot. For
 /// example, 1.23 has an implicit precision of 2.
 fn parse_implicit_precision(s: &str) -> usize {
@@ -252,9 +306,14 @@ fn remove_suffix(i: f64, s: Option<Suffix>, u: Unit) -> Result<f64> {
     }
 }
 
-fn transform_from(s: &str, opts: &TransformOptions, max_whitespace: usize) -> Result<f64> {
-    let (i, suffix) = parse_suffix(s, opts.from, max_whitespace)
-        .map_err(|original| detailed_error_message(s, opts.from).unwrap_or(original))?;
+fn transform_from(s: &str, opts: &TransformOptions, options: &NumfmtOptions) -> Result<f64> {
+    let (i, suffix) = parse_suffix(
+        s,
+        opts.from,
+        &options.unit_separator,
+        options.explicit_unit_separator,
+    )
+    .map_err(|original| detailed_error_message(s, opts.from).unwrap_or(original))?;
     let i = i * (opts.from_unit as f64);
 
     remove_suffix(i, suffix, opts.from).map(|n| {
@@ -409,11 +468,7 @@ fn format_string(
     };
 
     let number = transform_to(
-        transform_from(
-            source_without_suffix,
-            &options.transform,
-            options.max_whitespace,
-        )?,
+        transform_from(source_without_suffix, &options.transform, options)?,
         &options.transform,
         options.round,
         precision,
@@ -531,7 +586,10 @@ pub fn write_formatted_with_whitespace<W: std::io::Write>(
     options: &NumfmtOptions,
     eol: Option<u8>,
 ) -> Result<()> {
-    for (n, (prefix, field)) in (1..).zip(WhitespaceSplitter { s: Some(s) }) {
+    for (n, (prefix, field)) in (1..).zip(WhitespaceSplitter {
+        s: Some(s),
+        options,
+    }) {
         let field_selected = uucore::ranges::contain(&options.fields, n);
 
         if field_selected {
diff --git a/src/uu/numfmt/src/numfmt.rs b/src/uu/numfmt/src/numfmt.rs
index e2e86b8e1b3..a779a7cd554 100644
--- a/src/uu/numfmt/src/numfmt.rs
+++ b/src/uu/numfmt/src/numfmt.rs
@@ -285,14 +285,8 @@ fn parse_options(args: &ArgMatches) -> Result<NumfmtOptions> {
         .cloned()
         .unwrap_or_default();
 
-    // Max whitespace between number and suffix: length of separator if provided, default one
-    let max_whitespace = if args.contains_id(UNIT_SEPARATOR)
-        && args.value_source(UNIT_SEPARATOR) == Some(ValueSource::CommandLine)
-    {
-        unit_separator.len()
-    } else {
-        1
-    };
+    let explicit_unit_separator = args.contains_id(UNIT_SEPARATOR)
+        && args.value_source(UNIT_SEPARATOR) == Some(ValueSource::CommandLine);
 
     let invalid = InvalidModes::from_str(args.get_one::<String>(INVALID).unwrap()).unwrap();
 
@@ -309,7 +303,7 @@ fn parse_options(args: &ArgMatches) -> Result<NumfmtOptions> {
         round,
         suffix,
         unit_separator,
-        max_whitespace,
+        explicit_unit_separator,
         format,
         invalid,
         zero_terminated,
@@ -530,7 +524,7 @@ mod tests {
             round: RoundMethod::Nearest,
             suffix: None,
             unit_separator: String::new(),
-            max_whitespace: 1,
+            explicit_unit_separator: false,
             format: FormatOptions::default(),
             invalid: InvalidModes::Abort,
             zero_terminated: false,
diff --git a/src/uu/numfmt/src/options.rs b/src/uu/numfmt/src/options.rs
index f6db4ff9f89..21d6e7ff9ec 100644
--- a/src/uu/numfmt/src/options.rs
+++ b/src/uu/numfmt/src/options.rs
@@ -55,7 +55,7 @@ pub struct NumfmtOptions {
     pub round: RoundMethod,
     pub suffix: Option<String>,
     pub unit_separator: String,
-    pub max_whitespace: usize,
+    pub explicit_unit_separator: bool,
     pub format: FormatOptions,
     pub invalid: InvalidModes,
     pub zero_terminated: bool,
diff --git a/tests/by-util/test_numfmt.rs b/tests/by-util/test_numfmt.rs
index c9ff617c1e9..e9c9994a574 100644
--- a/tests/by-util/test_numfmt.rs
+++ b/tests/by-util/test_numfmt.rs
@@ -316,6 +316,14 @@ fn test_should_report_invalid_number_with_interior_junk() {
         .stderr_is("numfmt: invalid suffix in input: '1x0K'\n");
 }
 
+#[test]
+fn test_should_report_invalid_number_with_sign_after_decimal() {
+    new_ucmd!()
+        .args(&["--", "-0.-1"])
+        .fails_with_code(2)
+        .stderr_is("numfmt: invalid number: '-0.-1'\n");
+}
+
 #[test]
 fn test_should_skip_leading_space_from_stdin() {
     new_ucmd!()
@@ -1237,6 +1245,21 @@ fn test_empty_delimiter_multi_char_unit_separator() {
         .stdout_only("1000\n2000000\n3000000000\n");
 }
 
+#[test]
+fn test_whitespace_mode_parses_custom_unit_separator_inputs() {
+    new_ucmd!()
+        .args(&["--from=iec", "--unit-separator=::"])
+        .pipe_in("4::K\n")
+        .succeeds()
+        .stdout_only("4096\n");
+
+    new_ucmd!()
+        .args(&["--from=iec", "--unit-separator=\u{a0}"])
+        .pipe_in("4\u{a0}K\n")
+        .succeeds()
+        .stdout_only("4096\n");
+}
+
 #[test]
 fn test_empty_delimiter_whitespace_rejection() {
     new_ucmd!()