From 523d612a56c2542fad536c60a239433bb81cfdeb Mon Sep 17 00:00:00 2001
From: Sylvestre Ledru <sylvestre@debian.org>
Date: Sun, 28 Jun 2026 23:47:39 +0200
Subject: [PATCH] cut: support multibyte characters in non-UTF-8 locales

Add locale-aware character handling so -c counts characters, -b -n keeps
multibyte characters whole, and -d accepts a single multibyte delimiter.

This should make the test tests/cut/mb-non-utf8.sh pass.
---
 src/uu/cut/Cargo.toml        |   2 +-
 src/uu/cut/locales/en-US.ftl |   1 +
 src/uu/cut/locales/fr-FR.ftl |   1 +
 src/uu/cut/src/cut.rs        | 137 +++++++++++++++++++++++++++--
 tests/by-util/test_cut.rs    | 162 +++++++++++++++++++++++++++++++++++
 5 files changed, 293 insertions(+), 10 deletions(-)

diff --git a/src/uu/cut/Cargo.toml b/src/uu/cut/Cargo.toml
index 66165a48e83..16b6e03d567 100644
--- a/src/uu/cut/Cargo.toml
+++ b/src/uu/cut/Cargo.toml
@@ -20,7 +20,7 @@ doctest = false
 
 [dependencies]
 clap = { workspace = true }
-uucore = { workspace = true, features = ["ranges"] }
+uucore = { workspace = true, features = ["ranges", "i18n-charmap"] }
 memchr = { workspace = true }
 bstr = { workspace = true }
 fluent = { workspace = true }
diff --git a/src/uu/cut/locales/en-US.ftl b/src/uu/cut/locales/en-US.ftl
index d320fc86d11..47f38525533 100644
--- a/src/uu/cut/locales/en-US.ftl
+++ b/src/uu/cut/locales/en-US.ftl
@@ -101,6 +101,7 @@ cut-help-complement = invert the filter - instead of displaying only the filtere
 cut-help-only-delimited = in field mode, only print lines which contain the delimiter
 cut-help-zero-terminated = instead of filtering columns based on line, filter columns based on \\0 (NULL character)
 cut-help-output-delimiter = in field mode, replace the delimiter in output lines with this option's argument
+cut-help-no-split-multibyte = in byte mode, do not split multibyte characters
 
 # Error messages
 cut-error-is-directory = Is a directory
diff --git a/src/uu/cut/locales/fr-FR.ftl b/src/uu/cut/locales/fr-FR.ftl
index a95773099d6..dc96f4aa202 100644
--- a/src/uu/cut/locales/fr-FR.ftl
+++ b/src/uu/cut/locales/fr-FR.ftl
@@ -101,6 +101,7 @@ cut-help-complement = inverser le filtre - au lieu d'afficher seulement les colo
 cut-help-only-delimited = en mode champ, afficher seulement les lignes qui contiennent le délimiteur
 cut-help-zero-terminated = au lieu de filtrer les colonnes basées sur la ligne, filtrer les colonnes basées sur \\0 (caractère NULL)
 cut-help-output-delimiter = en mode champ, remplacer le délimiteur dans les lignes de sortie avec l'argument de cette option
+cut-help-no-split-multibyte = en mode octet, ne pas couper les caractères multioctets
 
 # Messages d'erreur
 cut-error-is-directory = Est un répertoire
diff --git a/src/uu/cut/src/cut.rs b/src/uu/cut/src/cut.rs
index c87c7e8f7fe..eb99d4331f1 100644
--- a/src/uu/cut/src/cut.rs
+++ b/src/uu/cut/src/cut.rs
@@ -13,6 +13,7 @@ use std::io::{BufRead, BufReader, BufWriter, IsTerminal, Read, Write, stdin, std
 use std::path::Path;
 use uucore::display::Quotable;
 use uucore::error::{FromIo, UResult, USimpleError, set_exit_code};
+use uucore::i18n::charmap::mb_char_len;
 use uucore::line_ending::LineEnding;
 use uucore::os_str_as_bytes;
 
@@ -29,6 +30,8 @@ struct Options<'a> {
     out_delimiter: Option<&'a [u8]>,
     line_ending: LineEnding,
     field_opts: Option<FieldOptions<'a>>,
+    /// `-n`: in byte mode, do not split multi-byte characters.
+    byte_no_split: bool,
 }
 
 enum Delimiter<'a> {
@@ -104,6 +107,107 @@ fn cut_bytes<R: Read, W: Write>(
     Ok(())
 }
 
+/// Fill `spans` with the byte spans `[start, end)` of `line`'s characters, using
+/// the current locale's encoding. Invalid/incomplete sequences count as one
+/// byte. `spans` is cleared first; its capacity is reused across calls.
+fn char_spans_into(line: &[u8], spans: &mut Vec<(usize, usize)>) {
+    spans.clear();
+    let mut i = 0;
+    while i < line.len() {
+        let len = mb_char_len(&line[i..]).clamp(1, line.len() - i);
+        spans.push((i, i + len));
+        i += len;
+    }
+}
+
+/// Character mode (`-c`): ranges index whole (possibly multi-byte) characters.
+fn cut_characters<R: Read, W: Write>(
+    reader: R,
+    out: &mut W,
+    ranges: &[Range],
+    opts: &Options,
+) -> UResult<()> {
+    let newline_char = opts.line_ending.into();
+    let mut buf_in = BufReader::new(reader);
+    let out_delim = opts.out_delimiter.unwrap_or(b"\t");
+    let mut spans: Vec<(usize, usize)> = Vec::new();
+
+    let result = buf_in.for_byte_record(newline_char, |line| {
+        char_spans_into(line, &mut spans);
+        let mut print_delim = false;
+        for &Range { low, high } in ranges {
+            if low > spans.len() {
+                break;
+            }
+            if print_delim {
+                out.write_all(out_delim)?;
+            } else if opts.out_delimiter.is_some() {
+                print_delim = true;
+            }
+            let high = high.min(spans.len());
+            out.write_all(&line[spans[low - 1].0..spans[high - 1].1])?;
+        }
+        out.write_all(&[newline_char])?;
+        Ok(true)
+    });
+
+    if let Err(e) = result {
+        return Err(USimpleError::new(1, e.to_string()));
+    }
+
+    Ok(())
+}
+
+/// Byte mode with `-n`: ranges index bytes, but a multi-byte character is
+/// emitted in full when (and only when) the range includes its last byte.
+fn cut_bytes_no_split<R: Read, W: Write>(
+    reader: R,
+    out: &mut W,
+    ranges: &[Range],
+    opts: &Options,
+) -> UResult<()> {
+    let newline_char = opts.line_ending.into();
+    let mut buf_in = BufReader::new(reader);
+    let out_delim = opts.out_delimiter.unwrap_or(b"\t");
+    let mut spans: Vec<(usize, usize)> = Vec::new();
+
+    let result = buf_in.for_byte_record(newline_char, |line| {
+        char_spans_into(line, &mut spans);
+        let mut print_delim = false;
+        for &Range { low, high } in ranges {
+            if low > line.len() {
+                break;
+            }
+            let high = high.min(line.len());
+            // A character's last byte is at 1-based position `end` (exclusive 0-based end).
+            // Emit the output delimiter lazily, only once this range has actually
+            // selected a character, so a range that matches nothing adds no delimiter.
+            let mut range_emitted = false;
+            for &(start, end) in &spans {
+                if end >= low && end <= high {
+                    if !range_emitted {
+                        if print_delim {
+                            out.write_all(out_delim)?;
+                        } else if opts.out_delimiter.is_some() {
+                            print_delim = true;
+                        }
+                        range_emitted = true;
+                    }
+                    out.write_all(&line[start..end])?;
+                }
+            }
+        }
+        out.write_all(&[newline_char])?;
+        Ok(true)
+    });
+
+    if let Err(e) = result {
+        return Err(USimpleError::new(1, e.to_string()));
+    }
+
+    Ok(())
+}
+
 /// Output delimiter is explicitly specified
 fn cut_fields_explicit_out_delim<R: Read, W: Write, M: Matcher>(
     reader: R,
@@ -458,8 +562,10 @@ where
             }
 
             show_if_err!(match mode {
+                Mode::Bytes(ranges, opts) if opts.byte_no_split =>
+                    cut_bytes_no_split(stdin(), &mut out, ranges, opts),
                 Mode::Bytes(ranges, opts) => cut_bytes(stdin(), &mut out, ranges, opts),
-                Mode::Characters(ranges, opts) => cut_bytes(stdin(), &mut out, ranges, opts),
+                Mode::Characters(ranges, opts) => cut_characters(stdin(), &mut out, ranges, opts),
                 Mode::Fields(ranges, opts) => cut_fields(stdin(), &mut out, ranges, opts),
             });
 
@@ -482,8 +588,12 @@ where
                     .map_err_context(|| filename.maybe_quote().to_string())
                     .and_then(|file| {
                         match &mode {
-                            Mode::Bytes(ranges, opts) | Mode::Characters(ranges, opts) => {
-                                cut_bytes(file, &mut out, ranges, opts)
+                            Mode::Bytes(ranges, opts) if opts.byte_no_split => {
+                                cut_bytes_no_split(file, &mut out, ranges, opts)
+                            }
+                            Mode::Bytes(ranges, opts) => cut_bytes(file, &mut out, ranges, opts),
+                            Mode::Characters(ranges, opts) => {
+                                cut_characters(file, &mut out, ranges, opts)
                             }
                             Mode::Fields(ranges, opts) => cut_fields(file, &mut out, ranges, opts),
                         }
@@ -514,12 +624,16 @@ fn get_delimiters(matches: &ArgMatches) -> UResult<(Delimiter<'_>, Option<&[u8]>
             if os_string.is_empty() {
                 Delimiter::Slice(b"\0")
             } else {
-                // For delimiter `-d` option value - allow both UTF-8 (possibly multi-byte) characters
-                // and Non UTF-8 (and not ASCII) single byte "characters", like `b"\xAD"` to align with GNU behavior
+                // For delimiter `-d` option value - allow a single character: a UTF-8
+                // character in a UTF-8 locale, or a single (possibly multi-byte)
+                // character of the current locale's encoding, e.g. a 2-byte GB18030
+                // character or any single byte like `b"\xAD"`, to align with GNU.
                 let bytes = os_str_as_bytes(os_string)?;
-                if os_string.to_str().is_some_and(|s| s.chars().count() > 1)
-                    || os_string.to_str().is_none() && bytes.len() > 1
-                {
+                let is_single_char = match os_string.to_str() {
+                    Some(s) => s.chars().count() == 1,
+                    None => mb_char_len(bytes) == bytes.len(),
+                };
+                if !is_single_char {
                     return Err(USimpleError::new(
                         1,
                         translate!("cut-error-delimiter-must-be-single-character"),
@@ -583,6 +697,8 @@ pub fn uumain(args: impl uucore::Args) -> UResult<()> {
 
     let (delimiter, out_delimiter) = get_delimiters(&matches)?;
     let line_ending = LineEnding::from_zero_flag(matches.get_flag(options::ZERO_TERMINATED));
+    // `-n`: only meaningful with `-b`; keeps multi-byte characters intact.
+    let byte_no_split = matches.get_flag(options::NOTHING);
 
     // Only one, and only one of cutting mode arguments, i.e. `-b`, `-c`, `-f`,
     // is expected. The number of those arguments is used for parsing a cutting
@@ -610,6 +726,7 @@ pub fn uumain(args: impl uucore::Args) -> UResult<()> {
                         out_delimiter,
                         line_ending,
                         field_opts: None,
+                        byte_no_split,
                     },
                 )
             })
@@ -623,6 +740,7 @@ pub fn uumain(args: impl uucore::Args) -> UResult<()> {
                         out_delimiter,
                         line_ending,
                         field_opts: None,
+                        byte_no_split,
                     },
                 )
             })
@@ -639,6 +757,7 @@ pub fn uumain(args: impl uucore::Args) -> UResult<()> {
                             delimiter,
                             only_delimited,
                         }),
+                        byte_no_split,
                     },
                 )
             })
@@ -776,7 +895,7 @@ pub fn uu_app() -> Command {
         .arg(
             Arg::new(options::NOTHING)
                 .short('n')
-                .help("(ignored)")
+                .help(translate!("cut-help-no-split-multibyte"))
                 .action(ArgAction::SetTrue),
         )
 }
diff --git a/tests/by-util/test_cut.rs b/tests/by-util/test_cut.rs
index 26d52a6a5fd..eb000dc3807 100644
--- a/tests/by-util/test_cut.rs
+++ b/tests/by-util/test_cut.rs
@@ -627,6 +627,168 @@ fn test_emoji_delim() {
         .stdout_only("🌹\n");
 }
 
+// 你 (U+4F60) encodes as the two bytes 0xC4 0xE3 in GB18030.
+#[cfg(target_os = "linux")]
+const GB18030_NI: &[u8] = b"\xC4\xE3";
+// 好 (U+597D) encodes as the two bytes 0xBA 0xC3 in GB18030.
+#[cfg(target_os = "linux")]
+const GB18030_HAO: &[u8] = b"\xBA\xC3";
+
+#[test]
+#[cfg(target_os = "linux")]
+#[cfg_attr(wasi_runner, ignore = "WASI: argv must be valid UTF-8")]
+fn test_gb18030_multibyte_delimiter() {
+    use std::ffi::OsString;
+    use std::os::unix::ffi::OsStringExt;
+    // Use a 2-byte GB18030 character as the field delimiter.
+    let delim = OsString::from_vec(GB18030_NI.to_vec());
+
+    // Drop the middle field: "a<delim>b<delim>c" keeping fields 1 and 3.
+    new_ucmd!()
+        .env("LC_ALL", "zh_CN.gb18030")
+        .arg("-d")
+        .arg(&delim)
+        .args(&["-f1,3", "--output-delimiter=-"])
+        .pipe_in(&b"a\xC4\xE3b\xC4\xE3c\n"[..])
+        .succeeds()
+        .stdout_only_bytes(b"a-c\n");
+
+    // Leading empty fields: two delimiters then a trailing field, all selected.
+    new_ucmd!()
+        .env("LC_ALL", "zh_CN.gb18030")
+        .arg("-d")
+        .arg(&delim)
+        .args(&["-f1-3", "--output-delimiter=|"])
+        .pipe_in(&b"\xC4\xE3\xC4\xE3z\n"[..])
+        .succeeds()
+        .stdout_only_bytes(b"||z\n");
+}
+
+#[test]
+#[cfg(target_os = "linux")]
+#[cfg_attr(wasi_runner, ignore = "WASI: argv must be valid UTF-8")]
+fn test_gb18030_complement_multibyte_delimiter() {
+    use std::ffi::OsString;
+    use std::os::unix::ffi::OsStringExt;
+    let delim = OsString::from_vec(GB18030_NI.to_vec());
+
+    // --complement of field 2 keeps the surrounding fields and the delimiters.
+    new_ucmd!()
+        .env("LC_ALL", "zh_CN.gb18030")
+        .arg("-d")
+        .arg(&delim)
+        .args(&["--complement", "-f2"])
+        .pipe_in(&b"a\xC4\xE3b\xC4\xE3c\n"[..])
+        .succeeds()
+        .stdout_only_bytes(b"a\xC4\xE3c\n");
+}
+
+#[test]
+#[cfg(target_os = "linux")]
+#[cfg_attr(wasi_runner, ignore = "WASI: argv must be valid UTF-8")]
+fn test_gb18030_single_byte_delimiter_is_accepted() {
+    use std::ffi::OsString;
+    use std::os::unix::ffi::OsStringExt;
+    // A lone 0xFE is only a GB18030 lead byte, not a complete character, yet
+    // any single byte is still a legal delimiter.
+    let delim = OsString::from_vec(vec![0xFE]);
+    new_ucmd!()
+        .env("LC_ALL", "zh_CN.gb18030")
+        .arg("-d")
+        .arg(&delim)
+        .args(&["-f2,3", "--output-delimiter=-"])
+        .pipe_in(&b"p\xFEq\xFEr\n"[..])
+        .succeeds()
+        .stdout_only_bytes(b"q-r\n");
+}
+
+#[test]
+#[cfg(target_os = "linux")]
+#[cfg_attr(wasi_runner, ignore = "WASI: LC_ALL is not propagated to the guest")]
+fn test_gb18030_character_mode() {
+    // Input: the 2-byte character 好 followed by the ASCII byte 'y'.
+    let mut input = GB18030_HAO.to_vec();
+    input.extend_from_slice(b"y\n");
+
+    // Character 1 is the whole multibyte character.
+    new_ucmd!()
+        .env("LC_ALL", "zh_CN.gb18030")
+        .args(&["-c1"])
+        .pipe_in(&input[..])
+        .succeeds()
+        .stdout_only_bytes(b"\xBA\xC3\n");
+    // Character 2 is the trailing ASCII byte.
+    new_ucmd!()
+        .env("LC_ALL", "zh_CN.gb18030")
+        .args(&["-c2"])
+        .pipe_in(&input[..])
+        .succeeds()
+        .stdout_only_bytes(b"y\n");
+}
+
+#[test]
+#[cfg(target_os = "linux")]
+#[cfg_attr(wasi_runner, ignore = "WASI: LC_ALL is not propagated to the guest")]
+fn test_gb18030_byte_mode_no_split() {
+    // With -n a multibyte character is never split: it is printed only when the
+    // selected byte range reaches its final byte.
+    let mut input = GB18030_HAO.to_vec();
+    input.extend_from_slice(b"y\n");
+
+    // Byte 1 is the lead byte of 好, not its final byte, so nothing is emitted for it.
+    new_ucmd!()
+        .env("LC_ALL", "zh_CN.gb18030")
+        .args(&["-b1", "-n"])
+        .pipe_in(&input[..])
+        .succeeds()
+        .stdout_only_bytes(b"\n");
+    // Byte 2 completes 好, so the full character is emitted.
+    new_ucmd!()
+        .env("LC_ALL", "zh_CN.gb18030")
+        .args(&["-b2", "-n"])
+        .pipe_in(&input[..])
+        .succeeds()
+        .stdout_only_bytes(b"\xBA\xC3\n");
+}
+
+#[test]
+#[cfg(target_os = "linux")]
+#[cfg_attr(wasi_runner, ignore = "WASI: LC_ALL is not propagated to the guest")]
+fn test_gb18030_byte_mode_no_split_output_delimiter() {
+    // A -n range that selects no complete character must not emit an output
+    // delimiter: byte 1 (mid-character) yields nothing, so no leading delimiter
+    // precedes the 'z' selected by byte 3.
+    let mut input = GB18030_HAO.to_vec();
+    input.extend_from_slice(b"z\n");
+    new_ucmd!()
+        .env("LC_ALL", "zh_CN.gb18030")
+        .args(&["-b1,3", "-n", "--output-delimiter=|"])
+        .pipe_in(&input[..])
+        .succeeds()
+        .stdout_only_bytes(b"z\n");
+
+    // Two ranges that both select nothing produce an empty line, not a stray
+    // delimiter.
+    let mut two = GB18030_HAO.to_vec();
+    two.extend_from_slice(GB18030_HAO);
+    two.push(b'\n');
+    new_ucmd!()
+        .env("LC_ALL", "zh_CN.gb18030")
+        .args(&["-b1,3", "-n", "--output-delimiter=|"])
+        .pipe_in(&two[..])
+        .succeeds()
+        .stdout_only_bytes(b"\n");
+
+    // The delimiter is still emitted between two ranges that each select a full
+    // character.
+    new_ucmd!()
+        .env("LC_ALL", "zh_CN.gb18030")
+        .args(&["-b2,4", "-n", "--output-delimiter=|"])
+        .pipe_in(&two[..])
+        .succeeds()
+        .stdout_only_bytes(b"\xBA\xC3|\xBA\xC3\n");
+}
+
 #[cfg(target_os = "linux")]
 #[test]
 fn test_failed_write_is_reported() {