Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion src/uu/cut/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ doctest = false

[dependencies]
clap = { workspace = true }
uucore = { workspace = true, features = ["ranges"] }
uucore = { workspace = true, features = ["ranges", "i18n-charmap"] }
memchr = { workspace = true }
bstr = { workspace = true }
fluent = { workspace = true }
Expand Down
1 change: 1 addition & 0 deletions src/uu/cut/locales/en-US.ftl
Original file line number Diff line number Diff line change
Expand Up @@ -101,6 +101,7 @@ cut-help-complement = invert the filter - instead of displaying only the filtere
cut-help-only-delimited = in field mode, only print lines which contain the delimiter
cut-help-zero-terminated = instead of filtering columns based on line, filter columns based on \\0 (NULL character)
cut-help-output-delimiter = in field mode, replace the delimiter in output lines with this option's argument
cut-help-no-split-multibyte = in byte mode, do not split multibyte characters

# Error messages
cut-error-is-directory = Is a directory
Expand Down
1 change: 1 addition & 0 deletions src/uu/cut/locales/fr-FR.ftl
Original file line number Diff line number Diff line change
Expand Up @@ -101,6 +101,7 @@ cut-help-complement = inverser le filtre - au lieu d'afficher seulement les colo
cut-help-only-delimited = en mode champ, afficher seulement les lignes qui contiennent le délimiteur
cut-help-zero-terminated = au lieu de filtrer les colonnes basées sur la ligne, filtrer les colonnes basées sur \\0 (caractère NULL)
cut-help-output-delimiter = en mode champ, remplacer le délimiteur dans les lignes de sortie avec l'argument de cette option
cut-help-no-split-multibyte = en mode octet, ne pas couper les caractères multioctets

# Messages d'erreur
cut-error-is-directory = Est un répertoire
Expand Down
131 changes: 122 additions & 9 deletions src/uu/cut/src/cut.rs
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ use std::io::{BufRead, BufReader, BufWriter, IsTerminal, Read, Write, stdin, std
use std::path::Path;
use uucore::display::Quotable;
use uucore::error::{FromIo, UResult, USimpleError, set_exit_code};
use uucore::i18n::charmap::mb_char_len;
use uucore::line_ending::LineEnding;
use uucore::os_str_as_bytes;

Expand All @@ -29,6 +30,8 @@ struct Options<'a> {
out_delimiter: Option<&'a [u8]>,
line_ending: LineEnding,
field_opts: Option<FieldOptions<'a>>,
/// `-n`: in byte mode, do not split multi-byte characters.
byte_no_split: bool,
}

enum Delimiter<'a> {
Expand Down Expand Up @@ -104,6 +107,101 @@ fn cut_bytes<R: Read, W: Write>(
Ok(())
}

/// Partition `line` into consecutive half-open byte spans `[start, end)`, one
/// per character, where each character's width is taken from `mb_char_len`
/// (i.e. the current locale's encoding).
///
/// A reported width is clamped to at least one byte and at most the number of
/// bytes remaining, so invalid or truncated sequences advance by a single byte
/// and no span ever runs past the end of `line`. An empty `line` yields an
/// empty vector.
fn char_spans(line: &[u8]) -> Vec<(usize, usize)> {
    let mut start = 0;
    std::iter::from_fn(|| {
        // Stop once every byte of the line has been assigned to a span.
        let rest = line.get(start..).filter(|rest| !rest.is_empty())?;
        let width = mb_char_len(rest).clamp(1, rest.len());
        let span = (start, start + width);
        start = span.1;
        Some(span)
    })
    .collect()
}

/// Character mode (`-c`): the selected ranges count whole characters, each of
/// which may occupy several bytes.
///
/// Every record read from `reader` (split on the configured line ending) is
/// divided into character spans with `char_spans`; each range is then mapped
/// to the bytes from the start of its first character to the end of its last
/// one. Processing of a record stops at the first range that begins past the
/// record's last character. When an output delimiter was given explicitly it
/// is written between consecutive ranges. Each record is terminated with the
/// line-ending byte.
///
/// # Errors
///
/// Any I/O error raised while reading or writing is reported as a
/// `USimpleError` with exit code 1.
fn cut_characters<R: Read, W: Write>(
    reader: R,
    out: &mut W,
    ranges: &[Range],
    opts: &Options,
) -> UResult<()> {
    let terminator = opts.line_ending.into();
    let separator = opts.out_delimiter.unwrap_or(b"\t");
    let mut input = BufReader::new(reader);

    input
        .for_byte_record(terminator, |record| {
            let chars = char_spans(record);
            let mut need_separator = false;
            for range in ranges {
                if range.low > chars.len() {
                    break;
                }
                if need_separator {
                    out.write_all(separator)?;
                } else {
                    // Separators are only emitted when one was requested, and
                    // never before the first range of a record.
                    need_separator = opts.out_delimiter.is_some();
                }
                // Ranges are 1-based and inclusive; spans are 0-based.
                let first_byte = chars[range.low - 1].0;
                let end_byte = chars[range.high.min(chars.len()) - 1].1;
                out.write_all(&record[first_byte..end_byte])?;
            }
            out.write_all(&[terminator])?;
            Ok(true)
        })
        .map_err(|e| USimpleError::new(1, e.to_string()))
}

/// Byte mode with `-n`: ranges index bytes, but a multi-byte character is
/// emitted in full when (and only when) the range includes its last byte.
///
/// Every record read from `reader` (split on the configured line ending) is
/// divided into character spans with `char_spans`. For each range, the
/// characters whose final byte lies inside the range are written out. Because
/// the spans are contiguous and their ends strictly increase, those characters
/// always form one contiguous run, so they are located with two binary
/// searches and written as a single slice instead of rescanning every
/// character for every range.
///
/// Processing of a record stops at the first range that begins past the
/// record's last byte. When an output delimiter was given explicitly it is
/// written between consecutive ranges. Each record is terminated with the
/// line-ending byte.
///
/// # Errors
///
/// Any I/O error raised while reading or writing is reported as a
/// `USimpleError` with exit code 1.
fn cut_bytes_no_split<R: Read, W: Write>(
    reader: R,
    out: &mut W,
    ranges: &[Range],
    opts: &Options,
) -> UResult<()> {
    let newline_char = opts.line_ending.into();
    let mut buf_in = BufReader::new(reader);
    let out_delim = opts.out_delimiter.unwrap_or(b"\t");

    let result = buf_in.for_byte_record(newline_char, |line| {
        let spans = char_spans(line);
        let mut print_delim = false;
        for &Range { low, high } in ranges {
            if low > line.len() {
                break;
            }
            if print_delim {
                out.write_all(out_delim)?;
            } else if opts.out_delimiter.is_some() {
                print_delim = true;
            }
            let high = high.min(line.len());
            // A character's last byte is at 1-based position `end` (its
            // exclusive 0-based end), so a character is selected when
            // `low <= end <= high`.
            let first = spans.partition_point(|&(_, end)| end < low);
            let past_last = spans.partition_point(|&(_, end)| end <= high);
            if first < past_last {
                let start = spans[first].0;
                let end = spans[past_last - 1].1;
                out.write_all(&line[start..end])?;
            }
        }
        out.write_all(&[newline_char])?;
        Ok(true)
    });

    if let Err(e) = result {
        return Err(USimpleError::new(1, e.to_string()));
    }

    Ok(())
}

/// Output delimiter is explicitly specified
fn cut_fields_explicit_out_delim<R: Read, W: Write, M: Matcher>(
reader: R,
Expand Down Expand Up @@ -458,8 +556,10 @@ where
}

show_if_err!(match mode {
Mode::Bytes(ranges, opts) if opts.byte_no_split =>
cut_bytes_no_split(stdin(), &mut out, ranges, opts),
Mode::Bytes(ranges, opts) => cut_bytes(stdin(), &mut out, ranges, opts),
Mode::Characters(ranges, opts) => cut_bytes(stdin(), &mut out, ranges, opts),
Mode::Characters(ranges, opts) => cut_characters(stdin(), &mut out, ranges, opts),
Mode::Fields(ranges, opts) => cut_fields(stdin(), &mut out, ranges, opts),
});

Expand All @@ -482,8 +582,12 @@ where
.map_err_context(|| filename.maybe_quote().to_string())
.and_then(|file| {
match &mode {
Mode::Bytes(ranges, opts) | Mode::Characters(ranges, opts) => {
cut_bytes(file, &mut out, ranges, opts)
Mode::Bytes(ranges, opts) if opts.byte_no_split => {
cut_bytes_no_split(file, &mut out, ranges, opts)
}
Mode::Bytes(ranges, opts) => cut_bytes(file, &mut out, ranges, opts),
Mode::Characters(ranges, opts) => {
cut_characters(file, &mut out, ranges, opts)
}
Mode::Fields(ranges, opts) => cut_fields(file, &mut out, ranges, opts),
}
Expand Down Expand Up @@ -514,12 +618,16 @@ fn get_delimiters(matches: &ArgMatches) -> UResult<(Delimiter<'_>, Option<&[u8]>
if os_string.is_empty() {
Delimiter::Slice(b"\0")
} else {
// For delimiter `-d` option value - allow both UTF-8 (possibly multi-byte) characters
// and Non UTF-8 (and not ASCII) single byte "characters", like `b"\xAD"` to align with GNU behavior
// For delimiter `-d` option value - allow a single character: a UTF-8
// character in a UTF-8 locale, or a single (possibly multi-byte)
// character of the current locale's encoding, e.g. a 2-byte GB18030
// character or any single byte like `b"\xAD"`, to align with GNU.
let bytes = os_str_as_bytes(os_string)?;
if os_string.to_str().is_some_and(|s| s.chars().count() > 1)
|| os_string.to_str().is_none() && bytes.len() > 1
{
let is_single_char = match os_string.to_str() {
Some(s) => s.chars().count() == 1,
None => mb_char_len(bytes) == bytes.len(),
};
if !is_single_char {
return Err(USimpleError::new(
1,
translate!("cut-error-delimiter-must-be-single-character"),
Expand Down Expand Up @@ -583,6 +691,8 @@ pub fn uumain(args: impl uucore::Args) -> UResult<()> {

let (delimiter, out_delimiter) = get_delimiters(&matches)?;
let line_ending = LineEnding::from_zero_flag(matches.get_flag(options::ZERO_TERMINATED));
// `-n`: only meaningful with `-b`; keeps multi-byte characters intact.
let byte_no_split = matches.get_flag(options::NOTHING);

// Only one, and only one of cutting mode arguments, i.e. `-b`, `-c`, `-f`,
// is expected. The number of those arguments is used for parsing a cutting
Expand Down Expand Up @@ -610,6 +720,7 @@ pub fn uumain(args: impl uucore::Args) -> UResult<()> {
out_delimiter,
line_ending,
field_opts: None,
byte_no_split,
},
)
})
Expand All @@ -623,6 +734,7 @@ pub fn uumain(args: impl uucore::Args) -> UResult<()> {
out_delimiter,
line_ending,
field_opts: None,
byte_no_split,
},
)
})
Expand All @@ -639,6 +751,7 @@ pub fn uumain(args: impl uucore::Args) -> UResult<()> {
delimiter,
only_delimited,
}),
byte_no_split,
},
)
})
Expand Down Expand Up @@ -776,7 +889,7 @@ pub fn uu_app() -> Command {
.arg(
Arg::new(options::NOTHING)
.short('n')
.help("(ignored)")
.help(translate!("cut-help-no-split-multibyte"))
.action(ArgAction::SetTrue),
)
}
122 changes: 122 additions & 0 deletions tests/by-util/test_cut.rs
Original file line number Diff line number Diff line change
Expand Up @@ -627,6 +627,128 @@ fn test_emoji_delim() {
.stdout_only("🌹\n");
}

/// 你 (U+4F60) encodes as the two bytes 0xC4 0xE3 in GB18030.
/// Used as the multi-byte field delimiter in the GB18030 delimiter tests.
#[cfg(target_os = "linux")]
const GB18030_NI: &[u8] = b"\xC4\xE3";
/// 好 (U+597D) encodes as the two bytes 0xBA 0xC3 in GB18030.
/// Used as the multi-byte input character in the `-c` and `-b -n` tests.
#[cfg(target_os = "linux")]
const GB18030_HAO: &[u8] = b"\xBA\xC3";

#[test]
#[cfg(target_os = "linux")]
#[cfg_attr(wasi_runner, ignore = "WASI: argv must be valid UTF-8")]
fn test_gb18030_multibyte_delimiter() {
    use std::ffi::OsString;
    use std::os::unix::ffi::OsStringExt;

    // The field delimiter is a single two-byte GB18030 character.
    let delim = OsString::from_vec(Vec::from(GB18030_NI));

    // (field selection, output delimiter option, stdin, expected stdout)
    let cases = [
        // "a<delim>b<delim>c": keep fields 1 and 3, dropping the middle one.
        (
            "-f1,3",
            "--output-delimiter=-",
            &b"a\xC4\xE3b\xC4\xE3c\n"[..],
            &b"a-c\n"[..],
        ),
        // Two leading delimiters give two empty fields before "z"; all three
        // fields are selected.
        (
            "-f1-3",
            "--output-delimiter=|",
            &b"\xC4\xE3\xC4\xE3z\n"[..],
            &b"||z\n"[..],
        ),
    ];

    for (fields, out_delim, input, expected) in cases {
        new_ucmd!()
            .env("LC_ALL", "zh_CN.gb18030")
            .arg("-d")
            .arg(&delim)
            .args(&[fields, out_delim])
            .pipe_in(input)
            .succeeds()
            .stdout_only_bytes(expected);
    }
}

#[test]
#[cfg(target_os = "linux")]
#[cfg_attr(wasi_runner, ignore = "WASI: argv must be valid UTF-8")]
fn test_gb18030_complement_multibyte_delimiter() {
    use std::ffi::OsString;
    use std::os::unix::ffi::OsStringExt;

    let delim = OsString::from_vec(Vec::from(GB18030_NI));
    // Three fields "a", "b", "c" separated by the two-byte delimiter.
    let input: &[u8] = b"a\xC4\xE3b\xC4\xE3c\n";
    // Complementing field 2 leaves fields 1 and 3, still joined by the
    // original (input) delimiter.
    let expected: &[u8] = b"a\xC4\xE3c\n";

    new_ucmd!()
        .env("LC_ALL", "zh_CN.gb18030")
        .arg("-d")
        .arg(&delim)
        .args(&["--complement", "-f2"])
        .pipe_in(input)
        .succeeds()
        .stdout_only_bytes(expected);
}

// Checks that a delimiter consisting of one lone high byte is accepted under a
// GB18030 locale and that fields are split on that byte.
#[test]
#[cfg(target_os = "linux")]
#[cfg_attr(wasi_runner, ignore = "WASI: argv must be valid UTF-8")]
fn test_gb18030_single_byte_delimiter_is_accepted() {
use std::ffi::OsString;
use std::os::unix::ffi::OsStringExt;
// The delimiter is the lone byte 0xFE; the test expects cut to accept it and
// to split the input on it.
// NOTE(review): the rationale originally given here was "0xFE never starts a
// GB18030 character", but GB18030 lead bytes appear to span 0x81..=0xFE, so
// 0xFE can begin a multi-byte sequence (and `\xFEq` in the input below would
// itself look like a two-byte character) - confirm. 0xFF would be an
// unambiguous non-character byte if that is what this test intends to cover.
let delim = OsString::from_vec(vec![0xFE]);
new_ucmd!()
.env("LC_ALL", "zh_CN.gb18030")
.arg("-d")
.arg(&delim)
.args(&["-f2,3", "--output-delimiter=-"])
.pipe_in(&b"p\xFEq\xFEr\n"[..])
.succeeds()
.stdout_only_bytes(b"q-r\n");
}
Comment thread
sylvestre marked this conversation as resolved.

#[test]
#[cfg(target_os = "linux")]
fn test_gb18030_character_mode() {
    // Input: the two-byte character 好, then ASCII 'y', then a newline.
    let input: Vec<u8> = GB18030_HAO.iter().chain(b"y\n").copied().collect();

    // (character selection, expected stdout)
    let cases = [
        // Character 1 is the complete multi-byte character.
        ("-c1", &b"\xBA\xC3\n"[..]),
        // Character 2 is the ASCII byte that follows it.
        ("-c2", &b"y\n"[..]),
    ];

    for (selection, expected) in cases {
        new_ucmd!()
            .env("LC_ALL", "zh_CN.gb18030")
            .args(&[selection])
            .pipe_in(&input[..])
            .succeeds()
            .stdout_only_bytes(expected);
    }
}

#[test]
#[cfg(target_os = "linux")]
fn test_gb18030_byte_mode_no_split() {
    // With `-n`, byte mode never splits a multi-byte character: the character
    // is printed whole, and only when the selection reaches its final byte.
    //
    // Input: the two-byte character 好, then ASCII 'y', then a newline.
    let input: Vec<u8> = GB18030_HAO.iter().chain(b"y\n").copied().collect();

    // (byte selection, expected stdout)
    let cases = [
        // Byte 1 is only the first byte of 好; its last byte is not selected,
        // so nothing but the line terminator is printed.
        ("-b1", &b"\n"[..]),
        // Byte 2 is the last byte of 好, so the whole character is printed.
        ("-b2", &b"\xBA\xC3\n"[..]),
    ];

    for (selection, expected) in cases {
        new_ucmd!()
            .env("LC_ALL", "zh_CN.gb18030")
            .args(&[selection, "-n"])
            .pipe_in(&input[..])
            .succeeds()
            .stdout_only_bytes(expected);
    }
}
Comment thread
sylvestre marked this conversation as resolved.

#[cfg(target_os = "linux")]
#[test]
fn test_failed_write_is_reported() {
Expand Down
Loading