From 523d612a56c2542fad536c60a239433bb81cfdeb Mon Sep 17 00:00:00 2001 From: Sylvestre Ledru Date: Sun, 28 Jun 2026 23:47:39 +0200 Subject: [PATCH] cut: support multibyte characters in non-UTF-8 locales Add locale-aware character handling so -c counts characters, -b -n keeps multibyte characters whole, and -d accepts a single multibyte delimiter. Should make test tests/cut/mb-non-utf8.sh pass --- src/uu/cut/Cargo.toml | 2 +- src/uu/cut/locales/en-US.ftl | 1 + src/uu/cut/locales/fr-FR.ftl | 1 + src/uu/cut/src/cut.rs | 137 +++++++++++++++++++++++++++-- tests/by-util/test_cut.rs | 162 +++++++++++++++++++++++++++++++++++ 5 files changed, 293 insertions(+), 10 deletions(-) diff --git a/src/uu/cut/Cargo.toml b/src/uu/cut/Cargo.toml index 66165a48e83..16b6e03d567 100644 --- a/src/uu/cut/Cargo.toml +++ b/src/uu/cut/Cargo.toml @@ -20,7 +20,7 @@ doctest = false [dependencies] clap = { workspace = true } -uucore = { workspace = true, features = ["ranges"] } +uucore = { workspace = true, features = ["ranges", "i18n-charmap"] } memchr = { workspace = true } bstr = { workspace = true } fluent = { workspace = true } diff --git a/src/uu/cut/locales/en-US.ftl b/src/uu/cut/locales/en-US.ftl index d320fc86d11..47f38525533 100644 --- a/src/uu/cut/locales/en-US.ftl +++ b/src/uu/cut/locales/en-US.ftl @@ -101,6 +101,7 @@ cut-help-complement = invert the filter - instead of displaying only the filtere cut-help-only-delimited = in field mode, only print lines which contain the delimiter cut-help-zero-terminated = instead of filtering columns based on line, filter columns based on \\0 (NULL character) cut-help-output-delimiter = in field mode, replace the delimiter in output lines with this option's argument +cut-help-no-split-multibyte = in byte mode, do not split multibyte characters # Error messages cut-error-is-directory = Is a directory diff --git a/src/uu/cut/locales/fr-FR.ftl b/src/uu/cut/locales/fr-FR.ftl index a95773099d6..dc96f4aa202 100644 --- a/src/uu/cut/locales/fr-FR.ftl +++ 
b/src/uu/cut/locales/fr-FR.ftl @@ -101,6 +101,7 @@ cut-help-complement = inverser le filtre - au lieu d'afficher seulement les colo cut-help-only-delimited = en mode champ, afficher seulement les lignes qui contiennent le délimiteur cut-help-zero-terminated = au lieu de filtrer les colonnes basées sur la ligne, filtrer les colonnes basées sur \\0 (caractère NULL) cut-help-output-delimiter = en mode champ, remplacer le délimiteur dans les lignes de sortie avec l'argument de cette option +cut-help-no-split-multibyte = en mode octet, ne pas couper les caractères multioctets # Messages d'erreur cut-error-is-directory = Est un répertoire diff --git a/src/uu/cut/src/cut.rs b/src/uu/cut/src/cut.rs index c87c7e8f7fe..eb99d4331f1 100644 --- a/src/uu/cut/src/cut.rs +++ b/src/uu/cut/src/cut.rs @@ -13,6 +13,7 @@ use std::io::{BufRead, BufReader, BufWriter, IsTerminal, Read, Write, stdin, std use std::path::Path; use uucore::display::Quotable; use uucore::error::{FromIo, UResult, USimpleError, set_exit_code}; +use uucore::i18n::charmap::mb_char_len; use uucore::line_ending::LineEnding; use uucore::os_str_as_bytes; @@ -29,6 +30,8 @@ struct Options<'a> { out_delimiter: Option<&'a [u8]>, line_ending: LineEnding, field_opts: Option>, + /// `-n`: in byte mode, do not split multi-byte characters. + byte_no_split: bool, } enum Delimiter<'a> { @@ -104,6 +107,107 @@ fn cut_bytes( Ok(()) } +/// Fill `spans` with the byte spans `[start, end)` of `line`'s characters, using +/// the current locale's encoding. Invalid/incomplete sequences count as one +/// byte. `spans` is cleared first; its capacity is reused across calls. +fn char_spans_into(line: &[u8], spans: &mut Vec<(usize, usize)>) { + spans.clear(); + let mut i = 0; + while i < line.len() { + let len = mb_char_len(&line[i..]).clamp(1, line.len() - i); + spans.push((i, i + len)); + i += len; + } +} + +/// Character mode (`-c`): ranges index whole (possibly multi-byte) characters. 
+fn cut_characters( + reader: R, + out: &mut W, + ranges: &[Range], + opts: &Options, +) -> UResult<()> { + let newline_char = opts.line_ending.into(); + let mut buf_in = BufReader::new(reader); + let out_delim = opts.out_delimiter.unwrap_or(b"\t"); + let mut spans: Vec<(usize, usize)> = Vec::new(); + + let result = buf_in.for_byte_record(newline_char, |line| { + char_spans_into(line, &mut spans); + let mut print_delim = false; + for &Range { low, high } in ranges { + if low > spans.len() { + break; + } + if print_delim { + out.write_all(out_delim)?; + } else if opts.out_delimiter.is_some() { + print_delim = true; + } + let high = high.min(spans.len()); + out.write_all(&line[spans[low - 1].0..spans[high - 1].1])?; + } + out.write_all(&[newline_char])?; + Ok(true) + }); + + if let Err(e) = result { + return Err(USimpleError::new(1, e.to_string())); + } + + Ok(()) +} + +/// Byte mode with `-n`: ranges index bytes, but a multi-byte character is +/// emitted in full when (and only when) the range includes its last byte. +fn cut_bytes_no_split( + reader: R, + out: &mut W, + ranges: &[Range], + opts: &Options, +) -> UResult<()> { + let newline_char = opts.line_ending.into(); + let mut buf_in = BufReader::new(reader); + let out_delim = opts.out_delimiter.unwrap_or(b"\t"); + let mut spans: Vec<(usize, usize)> = Vec::new(); + + let result = buf_in.for_byte_record(newline_char, |line| { + char_spans_into(line, &mut spans); + let mut print_delim = false; + for &Range { low, high } in ranges { + if low > line.len() { + break; + } + let high = high.min(line.len()); + // A character's last byte is at 1-based position `end` (exclusive 0-based end). + // Emit the output delimiter lazily, only once this range has actually + // selected a character, so a range that matches nothing adds no delimiter. 
+ let mut range_emitted = false; + for &(start, end) in &spans { + if end >= low && end <= high { + if !range_emitted { + if print_delim { + out.write_all(out_delim)?; + } else if opts.out_delimiter.is_some() { + print_delim = true; + } + range_emitted = true; + } + out.write_all(&line[start..end])?; + } + } + } + out.write_all(&[newline_char])?; + Ok(true) + }); + + if let Err(e) = result { + return Err(USimpleError::new(1, e.to_string())); + } + + Ok(()) +} + /// Output delimiter is explicitly specified fn cut_fields_explicit_out_delim( reader: R, @@ -458,8 +562,10 @@ where } show_if_err!(match mode { + Mode::Bytes(ranges, opts) if opts.byte_no_split => + cut_bytes_no_split(stdin(), &mut out, ranges, opts), Mode::Bytes(ranges, opts) => cut_bytes(stdin(), &mut out, ranges, opts), - Mode::Characters(ranges, opts) => cut_bytes(stdin(), &mut out, ranges, opts), + Mode::Characters(ranges, opts) => cut_characters(stdin(), &mut out, ranges, opts), Mode::Fields(ranges, opts) => cut_fields(stdin(), &mut out, ranges, opts), }); @@ -482,8 +588,12 @@ where .map_err_context(|| filename.maybe_quote().to_string()) .and_then(|file| { match &mode { - Mode::Bytes(ranges, opts) | Mode::Characters(ranges, opts) => { - cut_bytes(file, &mut out, ranges, opts) + Mode::Bytes(ranges, opts) if opts.byte_no_split => { + cut_bytes_no_split(file, &mut out, ranges, opts) + } + Mode::Bytes(ranges, opts) => cut_bytes(file, &mut out, ranges, opts), + Mode::Characters(ranges, opts) => { + cut_characters(file, &mut out, ranges, opts) } Mode::Fields(ranges, opts) => cut_fields(file, &mut out, ranges, opts), } @@ -514,12 +624,16 @@ fn get_delimiters(matches: &ArgMatches) -> UResult<(Delimiter<'_>, Option<&[u8]> if os_string.is_empty() { Delimiter::Slice(b"\0") } else { - // For delimiter `-d` option value - allow both UTF-8 (possibly multi-byte) characters - // and Non UTF-8 (and not ASCII) single byte "characters", like `b"\xAD"` to align with GNU behavior + // For delimiter `-d` option value - 
allow a single character: a UTF-8 + // character in a UTF-8 locale, or a single (possibly multi-byte) + // character of the current locale's encoding, e.g. a 2-byte GB18030 + // character or any single byte like `b"\xAD"`, to align with GNU. let bytes = os_str_as_bytes(os_string)?; - if os_string.to_str().is_some_and(|s| s.chars().count() > 1) - || os_string.to_str().is_none() && bytes.len() > 1 - { + let is_single_char = match os_string.to_str() { + Some(s) => s.chars().count() == 1, + None => mb_char_len(bytes) == bytes.len(), + }; + if !is_single_char { return Err(USimpleError::new( 1, translate!("cut-error-delimiter-must-be-single-character"), @@ -583,6 +697,8 @@ pub fn uumain(args: impl uucore::Args) -> UResult<()> { let (delimiter, out_delimiter) = get_delimiters(&matches)?; let line_ending = LineEnding::from_zero_flag(matches.get_flag(options::ZERO_TERMINATED)); + // `-n`: only meaningful with `-b`; keeps multi-byte characters intact. + let byte_no_split = matches.get_flag(options::NOTHING); // Only one, and only one of cutting mode arguments, i.e. `-b`, `-c`, `-f`, // is expected. 
The number of those arguments is used for parsing a cutting @@ -610,6 +726,7 @@ pub fn uumain(args: impl uucore::Args) -> UResult<()> { out_delimiter, line_ending, field_opts: None, + byte_no_split, }, ) }) @@ -623,6 +740,7 @@ pub fn uumain(args: impl uucore::Args) -> UResult<()> { out_delimiter, line_ending, field_opts: None, + byte_no_split, }, ) }) @@ -639,6 +757,7 @@ pub fn uumain(args: impl uucore::Args) -> UResult<()> { delimiter, only_delimited, }), + byte_no_split, }, ) }) @@ -776,7 +895,7 @@ pub fn uu_app() -> Command { .arg( Arg::new(options::NOTHING) .short('n') - .help("(ignored)") + .help(translate!("cut-help-no-split-multibyte")) .action(ArgAction::SetTrue), ) } diff --git a/tests/by-util/test_cut.rs b/tests/by-util/test_cut.rs index 26d52a6a5fd..eb000dc3807 100644 --- a/tests/by-util/test_cut.rs +++ b/tests/by-util/test_cut.rs @@ -627,6 +627,168 @@ fn test_emoji_delim() { .stdout_only("🌹\n"); } +// 你 (U+4F60) encodes as the two bytes 0xC4 0xE3 in GB18030. +#[cfg(target_os = "linux")] +const GB18030_NI: &[u8] = b"\xC4\xE3"; +// 好 (U+597D) encodes as the two bytes 0xBA 0xC3 in GB18030. +#[cfg(target_os = "linux")] +const GB18030_HAO: &[u8] = b"\xBA\xC3"; + +#[test] +#[cfg(target_os = "linux")] +#[cfg_attr(wasi_runner, ignore = "WASI: argv must be valid UTF-8")] +fn test_gb18030_multibyte_delimiter() { + use std::ffi::OsString; + use std::os::unix::ffi::OsStringExt; + // Use a 2-byte GB18030 character as the field delimiter. + let delim = OsString::from_vec(GB18030_NI.to_vec()); + + // Drop the middle field: "abc" keeping fields 1 and 3. + new_ucmd!() + .env("LC_ALL", "zh_CN.gb18030") + .arg("-d") + .arg(&delim) + .args(&["-f1,3", "--output-delimiter=-"]) + .pipe_in(&b"a\xC4\xE3b\xC4\xE3c\n"[..]) + .succeeds() + .stdout_only_bytes(b"a-c\n"); + + // Leading empty fields: two delimiters then a trailing field, all selected. 
+ new_ucmd!() + .env("LC_ALL", "zh_CN.gb18030") + .arg("-d") + .arg(&delim) + .args(&["-f1-3", "--output-delimiter=|"]) + .pipe_in(&b"\xC4\xE3\xC4\xE3z\n"[..]) + .succeeds() + .stdout_only_bytes(b"||z\n"); +} + +#[test] +#[cfg(target_os = "linux")] +#[cfg_attr(wasi_runner, ignore = "WASI: argv must be valid UTF-8")] +fn test_gb18030_complement_multibyte_delimiter() { + use std::ffi::OsString; + use std::os::unix::ffi::OsStringExt; + let delim = OsString::from_vec(GB18030_NI.to_vec()); + + // --complement of field 2 keeps the surrounding fields and the delimiters. + new_ucmd!() + .env("LC_ALL", "zh_CN.gb18030") + .arg("-d") + .arg(&delim) + .args(&["--complement", "-f2"]) + .pipe_in(&b"a\xC4\xE3b\xC4\xE3c\n"[..]) + .succeeds() + .stdout_only_bytes(b"a\xC4\xE3c\n"); +} + +#[test] +#[cfg(target_os = "linux")] +#[cfg_attr(wasi_runner, ignore = "WASI: argv must be valid UTF-8")] +fn test_gb18030_single_byte_delimiter_is_accepted() { + use std::ffi::OsString; + use std::os::unix::ffi::OsStringExt; + // A lone 0xFE is not a complete GB18030 character (it is only a lead byte), + // yet any single byte is still a legal delimiter. + let delim = OsString::from_vec(vec![0xFE]); + new_ucmd!() + .env("LC_ALL", "zh_CN.gb18030") + .arg("-d") + .arg(&delim) + .args(&["-f2,3", "--output-delimiter=-"]) + .pipe_in(&b"p\xFEq\xFEr\n"[..]) + .succeeds() + .stdout_only_bytes(b"q-r\n"); +} + +#[test] +#[cfg(target_os = "linux")] +#[cfg_attr(wasi_runner, ignore = "WASI: LC_ALL is not propagated to the guest")] +fn test_gb18030_character_mode() { + // Input: the 2-byte character 好 followed by the ASCII byte 'y'. + let mut input = GB18030_HAO.to_vec(); + input.extend_from_slice(b"y\n"); + + // Character 1 is the whole multibyte character. + new_ucmd!() + .env("LC_ALL", "zh_CN.gb18030") + .args(&["-c1"]) + .pipe_in(&input[..]) + .succeeds() + .stdout_only_bytes(b"\xBA\xC3\n"); + // Character 2 is the trailing ASCII byte.
+ new_ucmd!() + .env("LC_ALL", "zh_CN.gb18030") + .args(&["-c2"]) + .pipe_in(&input[..]) + .succeeds() + .stdout_only_bytes(b"y\n"); +} + +#[test] +#[cfg(target_os = "linux")] +#[cfg_attr(wasi_runner, ignore = "WASI: LC_ALL is not propagated to the guest")] +fn test_gb18030_byte_mode_no_split() { + // With -n a multibyte character is never split: it is printed only when the + // selected byte range reaches its final byte. + let mut input = GB18030_HAO.to_vec(); + input.extend_from_slice(b"y\n"); + + // Byte 1 falls in the middle of 好, so nothing is emitted for it. + new_ucmd!() + .env("LC_ALL", "zh_CN.gb18030") + .args(&["-b1", "-n"]) + .pipe_in(&input[..]) + .succeeds() + .stdout_only_bytes(b"\n"); + // Byte 2 completes 好, so the full character is emitted. + new_ucmd!() + .env("LC_ALL", "zh_CN.gb18030") + .args(&["-b2", "-n"]) + .pipe_in(&input[..]) + .succeeds() + .stdout_only_bytes(b"\xBA\xC3\n"); +} + +#[test] +#[cfg(target_os = "linux")] +#[cfg_attr(wasi_runner, ignore = "WASI: LC_ALL is not propagated to the guest")] +fn test_gb18030_byte_mode_no_split_output_delimiter() { + // A -n range that selects no complete character must not emit an output + // delimiter: byte 1 (mid-character) yields nothing, so no leading delimiter + // precedes the 'z' selected by byte 3. + let mut input = GB18030_HAO.to_vec(); + input.extend_from_slice(b"z\n"); + new_ucmd!() + .env("LC_ALL", "zh_CN.gb18030") + .args(&["-b1,3", "-n", "--output-delimiter=|"]) + .pipe_in(&input[..]) + .succeeds() + .stdout_only_bytes(b"z\n"); + + // Two ranges that both select nothing produce an empty line, not a stray + // delimiter. + let mut two = GB18030_HAO.to_vec(); + two.extend_from_slice(GB18030_HAO); + two.push(b'\n'); + new_ucmd!() + .env("LC_ALL", "zh_CN.gb18030") + .args(&["-b1,3", "-n", "--output-delimiter=|"]) + .pipe_in(&two[..]) + .succeeds() + .stdout_only_bytes(b"\n"); + + // The delimiter is still emitted between two ranges that each select a full + // character. 
+ new_ucmd!() + .env("LC_ALL", "zh_CN.gb18030") + .args(&["-b2,4", "-n", "--output-delimiter=|"]) + .pipe_in(&two[..]) + .succeeds() + .stdout_only_bytes(b"\xBA\xC3|\xBA\xC3\n"); +} + #[cfg(target_os = "linux")] #[test] fn test_failed_write_is_reported() {