uutils · sylvestre · Jun 28, 2026
diff --git a/src/uu/cut/Cargo.toml b/src/uu/cut/Cargo.toml
@@ -20,7 +20,7 @@ doctest = false
 
 [dependencies]
 clap = { workspace = true }
-uucore = { workspace = true, features = ["ranges"] }
+uucore = { workspace = true, features = ["ranges", "i18n-charmap"] }
 memchr = { workspace = true }
 bstr = { workspace = true }
 fluent = { workspace = true }

diff --git a/src/uu/cut/locales/en-US.ftl b/src/uu/cut/locales/en-US.ftl
@@ -101,6 +101,7 @@ cut-help-complement = invert the filter - instead of displaying only the filtere
 cut-help-only-delimited = in field mode, only print lines which contain the delimiter
 cut-help-zero-terminated = instead of filtering columns based on line, filter columns based on \\0 (NULL character)
 cut-help-output-delimiter = in field mode, replace the delimiter in output lines with this option's argument
+cut-help-no-split-multibyte = in byte mode, do not split multibyte characters
 
 # Error messages
 cut-error-is-directory = Is a directory

diff --git a/src/uu/cut/locales/fr-FR.ftl b/src/uu/cut/locales/fr-FR.ftl
@@ -101,6 +101,7 @@ cut-help-complement = inverser le filtre - au lieu d'afficher seulement les colo
 cut-help-only-delimited = en mode champ, afficher seulement les lignes qui contiennent le délimiteur
 cut-help-zero-terminated = au lieu de filtrer les colonnes basées sur la ligne, filtrer les colonnes basées sur \\0 (caractère NULL)
 cut-help-output-delimiter = en mode champ, remplacer le délimiteur dans les lignes de sortie avec l'argument de cette option
+cut-help-no-split-multibyte = en mode octet, ne pas couper les caractères multioctets
 
 # Messages d'erreur
 cut-error-is-directory = Est un répertoire

diff --git a/src/uu/cut/src/cut.rs b/src/uu/cut/src/cut.rs
@@ -13,6 +13,7 @@ use std::io::{BufRead, BufReader, BufWriter, IsTerminal, Read, Write, stdin, std
 use std::path::Path;
 use uucore::display::Quotable;
 use uucore::error::{FromIo, UResult, USimpleError, set_exit_code};
+use uucore::i18n::charmap::mb_char_len;
 use uucore::line_ending::LineEnding;
 use uucore::os_str_as_bytes;
 
@@ -29,6 +30,8 @@ struct Options<'a> {
     out_delimiter: Option<&'a [u8]>,
     line_ending: LineEnding,
     field_opts: Option<FieldOptions<'a>>,
+    /// `-n`: in byte mode, do not split multi-byte characters.
+    byte_no_split: bool,
 }
 
 enum Delimiter<'a> {
@@ -104,6 +107,101 @@ fn cut_bytes<R: Read, W: Write>(
     Ok(())
 }
 
+/// Partition `line` into consecutive half-open byte spans `(start, end)`, one per
+/// character as measured by `mb_char_len`; each length is forced into `1..=remaining`.
+fn char_spans(line: &[u8]) -> Vec<(usize, usize)> {
+    let mut start = 0;
+    std::iter::from_fn(|| {
+        let rest = line.get(start..).filter(|tail| !tail.is_empty())?;
+        let span = (start, start + mb_char_len(rest).clamp(1, rest.len()));
+        start = span.1;
+        Some(span)
+    })
+    .collect()
+}
+
+/// Character mode (`-c`): ranges index whole (possibly multi-byte) characters.
+///
+/// Each record is split with `char_spans`; a range is written as the bytes from the
+/// start of its first character to the end of its last one, so no character is cut.
+/// I/O failures are reported as a `USimpleError` with exit code 1.
+fn cut_characters<R: Read, W: Write>(
+    reader: R,
+    out: &mut W,
+    ranges: &[Range],
+    opts: &Options,
+) -> UResult<()> {
+    let terminator = opts.line_ending.into();
+    let separator = opts.out_delimiter.unwrap_or(b"\t");
+    let mut input = BufReader::new(reader);
+
+    input
+        .for_byte_record(terminator, |record| {
+            let chars = char_spans(record);
+            // With an explicit output delimiter the first selected range only arms
+            // this flag; without one it stays false and nothing is ever inserted.
+            let mut need_separator = false;
+            // The first range starting beyond the record's last character ends the scan.
+            for range in ranges.iter().take_while(|r| r.low <= chars.len()) {
+                if need_separator {
+                    out.write_all(separator)?;
+                } else {
+                    need_separator = opts.out_delimiter.is_some();
+                }
+                // `low`/`high` are 1-based and inclusive; clip `high` to the record.
+                let first = chars[range.low - 1];
+                let last = chars[range.high.min(chars.len()) - 1];
+                out.write_all(&record[first.0..last.1])?;
+            }
+            out.write_all(&[terminator])?;
+            Ok(true)
+        })
+        .map_err(|e| USimpleError::new(1, e.to_string()))
+}
+
+/// Byte mode with `-n`: ranges index bytes, but a multi-byte character is
+/// emitted in full when (and only when) the range includes its last byte.
+///
+/// Character boundaries come from `char_spans`, whose spans are contiguous with
+/// strictly increasing ends, so the characters ending inside a byte range form one
+/// run that is located by binary search and written as a single slice.
+fn cut_bytes_no_split<R: Read, W: Write>(
+    reader: R,
+    out: &mut W,
+    ranges: &[Range],
+    opts: &Options,
+) -> UResult<()> {
+    let newline_char = opts.line_ending.into();
+    let mut buf_in = BufReader::new(reader);
+    let out_delim = opts.out_delimiter.unwrap_or(b"\t");
+
+    let result = buf_in.for_byte_record(newline_char, |line| {
+        let spans = char_spans(line);
+        let mut print_delim = false;
+        for &Range { low, high } in ranges {
+            if low > line.len() {
+                break;
+            }
+            if print_delim {
+                out.write_all(out_delim)?;
+            } else if opts.out_delimiter.is_some() {
+                print_delim = true;
+            }
+            let high = high.min(line.len());
+            // A span's exclusive 0-based `end` is the 1-based index of its last byte.
+            let first = spans.partition_point(|&(_, end)| end < low);
+            let last = spans.partition_point(|&(_, end)| end <= high);
+            if first < last {
+                out.write_all(&line[spans[first].0..spans[last - 1].1])?;
+            }
+        }
+        out.write_all(&[newline_char])?;
+        Ok(true)
+    });
+
+    result.map_err(|e| USimpleError::new(1, e.to_string()))
+}
+
 /// Output delimiter is explicitly specified
 fn cut_fields_explicit_out_delim<R: Read, W: Write, M: Matcher>(
     reader: R,
@@ -458,8 +556,10 @@ where
             }
 
             show_if_err!(match mode {
+                Mode::Bytes(ranges, opts) if opts.byte_no_split =>
+                    cut_bytes_no_split(stdin(), &mut out, ranges, opts),
                 Mode::Bytes(ranges, opts) => cut_bytes(stdin(), &mut out, ranges, opts),
-                Mode::Characters(ranges, opts) => cut_bytes(stdin(), &mut out, ranges, opts),
+                Mode::Characters(ranges, opts) => cut_characters(stdin(), &mut out, ranges, opts),
                 Mode::Fields(ranges, opts) => cut_fields(stdin(), &mut out, ranges, opts),
             });
 
@@ -482,8 +582,12 @@ where
                     .map_err_context(|| filename.maybe_quote().to_string())
                     .and_then(|file| {
                         match &mode {
-                            Mode::Bytes(ranges, opts) | Mode::Characters(ranges, opts) => {
-                                cut_bytes(file, &mut out, ranges, opts)
+                            Mode::Bytes(ranges, opts) if opts.byte_no_split => {
+                                cut_bytes_no_split(file, &mut out, ranges, opts)
+                            }
+                            Mode::Bytes(ranges, opts) => cut_bytes(file, &mut out, ranges, opts),
+                            Mode::Characters(ranges, opts) => {
+                                cut_characters(file, &mut out, ranges, opts)
                             }
                             Mode::Fields(ranges, opts) => cut_fields(file, &mut out, ranges, opts),
                         }
@@ -514,12 +618,16 @@ fn get_delimiters(matches: &ArgMatches) -> UResult<(Delimiter<'_>, Option<&[u8]>
             if os_string.is_empty() {
                 Delimiter::Slice(b"\0")
             } else {
-                // For delimiter `-d` option value - allow both UTF-8 (possibly multi-byte) characters
-                // and Non UTF-8 (and not ASCII) single byte "characters", like `b"\xAD"` to align with GNU behavior
+                // For delimiter `-d` option value - allow a single character: a UTF-8
+                // character in a UTF-8 locale, or a single (possibly multi-byte)
+                // character of the current locale's encoding, e.g. a 2-byte GB18030
+                // character or any single byte like `b"\xAD"`, to align with GNU.
                 let bytes = os_str_as_bytes(os_string)?;
-                if os_string.to_str().is_some_and(|s| s.chars().count() > 1)
-                    || os_string.to_str().is_none() && bytes.len() > 1
-                {
+                let is_single_char = match os_string.to_str() {
+                    Some(s) => s.chars().count() == 1,
+                    None => mb_char_len(bytes) == bytes.len(),
+                };
+                if !is_single_char {
                     return Err(USimpleError::new(
                         1,
                         translate!("cut-error-delimiter-must-be-single-character"),
@@ -583,6 +691,8 @@ pub fn uumain(args: impl uucore::Args) -> UResult<()> {
 
     let (delimiter, out_delimiter) = get_delimiters(&matches)?;
     let line_ending = LineEnding::from_zero_flag(matches.get_flag(options::ZERO_TERMINATED));
+    // `-n`: only meaningful with `-b`; keeps multi-byte characters intact.
+    let byte_no_split = matches.get_flag(options::NOTHING);
 
     // Only one, and only one of cutting mode arguments, i.e. `-b`, `-c`, `-f`,
     // is expected. The number of those arguments is used for parsing a cutting
@@ -610,6 +720,7 @@ pub fn uumain(args: impl uucore::Args) -> UResult<()> {
                         out_delimiter,
                         line_ending,
                         field_opts: None,
+                        byte_no_split,
                     },
                 )
             })
@@ -623,6 +734,7 @@ pub fn uumain(args: impl uucore::Args) -> UResult<()> {
                         out_delimiter,
                         line_ending,
                         field_opts: None,
+                        byte_no_split,
                     },
                 )
             })
@@ -639,6 +751,7 @@ pub fn uumain(args: impl uucore::Args) -> UResult<()> {
                             delimiter,
                             only_delimited,
                         }),
+                        byte_no_split,
                     },
                 )
             })
@@ -776,7 +889,7 @@ pub fn uu_app() -> Command {
         .arg(
             Arg::new(options::NOTHING)
                 .short('n')
-                .help("(ignored)")
+                .help(translate!("cut-help-no-split-multibyte"))
                 .action(ArgAction::SetTrue),
         )
 }
diff --git a/tests/by-util/test_cut.rs b/tests/by-util/test_cut.rs
@@ -627,6 +627,128 @@ fn test_emoji_delim() {
         .stdout_only("🌹\n");
 }
 
+// GB18030 encoding of 你 (U+4F60): the byte pair 0xC4 0xE3.
+#[cfg(target_os = "linux")]
+const GB18030_NI: &[u8] = &[0xC4, 0xE3];
+// GB18030 encoding of 好 (U+597D): the byte pair 0xBA 0xC3.
+#[cfg(target_os = "linux")]
+const GB18030_HAO: &[u8] = &[0xBA, 0xC3];
+
+#[test]
+#[cfg(target_os = "linux")]
+#[cfg_attr(wasi_runner, ignore = "WASI: argv must be valid UTF-8")]
+fn test_gb18030_multibyte_delimiter() {
+    use std::ffi::OsStr;
+    use std::os::unix::ffi::OsStrExt;
+    // The field delimiter is one 2-byte GB18030 character, passed as raw bytes
+    // because it is not valid UTF-8.
+    let delim = OsStr::from_bytes(GB18030_NI);
+
+    // (field selection, output delimiter option, stdin, expected stdout)
+    let cases: [(&str, &str, &[u8], &[u8]); 2] = [
+        // "a<delim>b<delim>c": keep fields 1 and 3, dropping the middle one.
+        ("-f1,3", "--output-delimiter=-", b"a\xC4\xE3b\xC4\xE3c\n", b"a-c\n"),
+        // "<delim><delim>z": two empty leading fields, then "z", all selected.
+        ("-f1-3", "--output-delimiter=|", b"\xC4\xE3\xC4\xE3z\n", b"||z\n"),
+    ];
+    // Every case runs under the GB18030 locale.
+    for (fields, out_delim, stdin, expected) in cases {
+        new_ucmd!()
+            .env("LC_ALL", "zh_CN.gb18030")
+            .arg("-d")
+            .arg(delim)
+            .args(&[fields, out_delim])
+            .pipe_in(stdin)
+            .succeeds()
+            .stdout_only_bytes(expected);
+    }
+}
+
+#[test]
+#[cfg(target_os = "linux")]
+#[cfg_attr(wasi_runner, ignore = "WASI: argv must be valid UTF-8")]
+fn test_gb18030_complement_multibyte_delimiter() {
+    use std::ffi::OsStr;
+    use std::os::unix::ffi::OsStrExt;
+    // Complementing field 2 leaves fields 1 and 3, still joined by the delimiter.
+    let stdin: &[u8] = b"a\xC4\xE3b\xC4\xE3c\n";
+    let expected: &[u8] = b"a\xC4\xE3c\n";
+    new_ucmd!()
+        .env("LC_ALL", "zh_CN.gb18030")
+        .arg("-d")
+        .arg(OsStr::from_bytes(GB18030_NI))
+        .args(&["--complement", "-f2"])
+        .pipe_in(stdin)
+        .succeeds()
+        .stdout_only_bytes(expected);
+}
+
+#[test]
+#[cfg(target_os = "linux")]
+#[cfg_attr(wasi_runner, ignore = "WASI: argv must be valid UTF-8")]
+fn test_gb18030_single_byte_delimiter_is_accepted() {
+    use std::ffi::OsString;
+    use std::os::unix::ffi::OsStringExt;
+    // A lone 0xFE is only a GB18030 lead byte, not a complete character, yet any
+    // single byte is still a legal delimiter.
+    let delim = OsString::from_vec(vec![0xFE]);
+    new_ucmd!()
+        .env("LC_ALL", "zh_CN.gb18030")
+        .arg("-d")
+        .arg(&delim)
+        .args(&["-f2,3", "--output-delimiter=-"])
+        .pipe_in(&b"p\xFEq\xFEr\n"[..])
+        .succeeds()
+        .stdout_only_bytes(b"q-r\n");
+}
+
+#[test]
+#[cfg(target_os = "linux")]
+fn test_gb18030_character_mode() {
+    // Input line: the 2-byte character 好, then the ASCII byte 'y'.
+    let input: Vec<u8> = GB18030_HAO.iter().chain(b"y\n").copied().collect();
+
+    // (character selection, expected stdout)
+    let cases: [(&str, &[u8]); 2] = [
+        // Character 1 is the whole multibyte character.
+        ("-c1", b"\xBA\xC3\n"),
+        // Character 2 is the trailing ASCII byte.
+        ("-c2", b"y\n"),
+    ];
+    for (selection, expected) in cases {
+        new_ucmd!()
+            .env("LC_ALL", "zh_CN.gb18030")
+            .args(&[selection])
+            .pipe_in(&input[..])
+            .succeeds()
+            .stdout_only_bytes(expected);
+    }
+}
+
+#[test]
+#[cfg(target_os = "linux")]
+fn test_gb18030_byte_mode_no_split() {
+    // `-n` never splits a multibyte character: it is written only when the
+    // selected byte range covers its final byte.
+    let input: Vec<u8> = GB18030_HAO.iter().chain(b"y\n").copied().collect();
+
+    // (byte selection, expected stdout)
+    let cases: [(&str, &[u8]); 2] = [
+        // Byte 1 is only the first half of 好, so just the newline is printed.
+        ("-b1", b"\n"),
+        // Byte 2 is the last byte of 好, so the whole character is printed.
+        ("-b2", b"\xBA\xC3\n"),
+    ];
+    for (selection, expected) in cases {
+        new_ucmd!()
+            .env("LC_ALL", "zh_CN.gb18030")
+            .args(&[selection, "-n"])
+            .pipe_in(&input[..])
+            .succeeds()
+            .stdout_only_bytes(expected);
+    }
+}
+
 #[cfg(target_os = "linux")]
 #[test]
 fn test_failed_write_is_reported() {