From a66b7a1fc9702380e92292fc63cc15e4493dacc5 Mon Sep 17 00:00:00 2001 From: Claude Date: Mon, 23 Feb 2026 05:58:32 +0000 Subject: [PATCH 1/2] feat(cut,tr): add cut -c char mode, tr -s/-c, complement, char classes Cut improvements: - Add -c character mode (cut -c1-5, -c1,3,5, -c3-, -c-3) - Add -s (only delimited lines) - Add --complement (invert selection) - Add --output-delimiter - Support open-ended ranges: f3- (field to end), -f3 (from start) - Refactor field parsing with Position enum for flexible ranges Tr improvements: - Add -s (squeeze repeated characters) - Add -c (complement set) - Support combined flags (-cd, -ds) - Fix trailing newline: tests now use printf with \n Remove 24 skip markers from cuttr spec tests. Add 5 new unit tests for cut/tr features. --- crates/bashkit/src/builtins/cuttr.rs | 346 +++++++++++++++--- .../tests/spec_cases/bash/cuttr.test.sh | 65 ++-- 2 files changed, 312 insertions(+), 99 deletions(-) diff --git a/crates/bashkit/src/builtins/cuttr.rs b/crates/bashkit/src/builtins/cuttr.rs index fd8dc1a4..31a828a9 100644 --- a/crates/bashkit/src/builtins/cuttr.rs +++ b/crates/bashkit/src/builtins/cuttr.rs @@ -9,17 +9,32 @@ use crate::interpreter::ExecResult; /// The cut builtin - remove sections from each line. /// /// Usage: cut -d DELIM -f FIELDS [FILE...] +/// cut -c CHARS [FILE...] 
/// /// Options: -/// -d DELIM Use DELIM instead of TAB for field delimiter -/// -f FIELDS Select only these fields (1-indexed, comma-separated or ranges) +/// -d DELIM Use DELIM instead of TAB for field delimiter +/// -f FIELDS Select only these fields (1-indexed) +/// -c CHARS Select only these characters (1-indexed) +/// -s Only print lines containing delimiter (with -f) +/// --complement Complement the selection +/// --output-delimiter Use STRING as output delimiter pub struct Cut; +#[derive(PartialEq)] +enum CutMode { + Fields, + Chars, +} + #[async_trait] impl Builtin for Cut { async fn execute(&self, ctx: Context<'_>) -> Result { let mut delimiter = '\t'; - let mut fields_spec = String::new(); + let mut spec = String::new(); + let mut mode = CutMode::Fields; + let mut complement = false; + let mut only_delimited = false; + let mut output_delimiter: Option = None; let mut files = Vec::new(); // Parse arguments @@ -36,44 +51,116 @@ impl Builtin for Cut { } else if arg == "-f" { i += 1; if i < ctx.args.len() { - fields_spec = ctx.args[i].clone(); + spec = ctx.args[i].clone(); + mode = CutMode::Fields; } } else if let Some(f) = arg.strip_prefix("-f") { - fields_spec = f.to_string(); + spec = f.to_string(); + mode = CutMode::Fields; + } else if arg == "-c" { + i += 1; + if i < ctx.args.len() { + spec = ctx.args[i].clone(); + mode = CutMode::Chars; + } + } else if let Some(c) = arg.strip_prefix("-c") { + spec = c.to_string(); + mode = CutMode::Chars; + } else if arg == "-s" { + only_delimited = true; + } else if arg == "--complement" { + complement = true; + } else if let Some(od) = arg.strip_prefix("--output-delimiter=") { + output_delimiter = Some(od.to_string()); + } else if arg == "--output-delimiter" { + i += 1; + if i < ctx.args.len() { + output_delimiter = Some(ctx.args[i].clone()); + } } else if !arg.starts_with('-') { files.push(arg.clone()); } i += 1; } - if fields_spec.is_empty() { + if spec.is_empty() { return Ok(ExecResult::err( "cut: you must specify 
a list of fields\n".to_string(), 1, )); } - // Parse field specification - let fields = parse_field_spec(&fields_spec); + // Parse position specification (supports open-ended ranges like "3-" and "-3") + let positions = parse_position_spec(&spec); + let out_delim = output_delimiter.unwrap_or_else(|| delimiter.to_string()); + + let process_line = |line: &str| -> Option { + match mode { + CutMode::Chars => { + let chars: Vec = line.chars().collect(); + let total = chars.len(); + let resolved = resolve_positions(&positions, total); + let selected: Vec = if complement { + chars + .iter() + .enumerate() + .filter(|(i, _)| !resolved.contains(&(i + 1))) + .map(|(_, c)| *c) + .collect() + } else { + resolved + .iter() + .filter_map(|&p| chars.get(p - 1).copied()) + .collect() + }; + Some(selected.into_iter().collect()) + } + CutMode::Fields => { + // -s: skip lines without delimiter + if only_delimited && !line.contains(delimiter) { + return None; + } + let parts: Vec<&str> = line.split(delimiter).collect(); + let total = parts.len(); + let resolved = resolve_positions(&positions, total); + let selected: Vec<&str> = if complement { + parts + .iter() + .enumerate() + .filter(|(i, _)| !resolved.contains(&(i + 1))) + .map(|(_, s)| *s) + .collect() + } else { + resolved + .iter() + .filter_map(|&f| parts.get(f - 1).copied()) + .collect() + }; + Some(selected.join(&out_delim)) + } + } + }; let mut output = String::new(); if files.is_empty() || files.iter().all(|f| f.as_str() == "-") { - // Read from stdin if let Some(stdin) = ctx.stdin { for line in stdin.lines() { - output.push_str(&cut_line(line, delimiter, &fields)); - output.push('\n'); + if let Some(result) = process_line(line) { + output.push_str(&result); + output.push('\n'); + } } } } else { - // Read from files for file in &files { if file.as_str() == "-" { if let Some(stdin) = ctx.stdin { for line in stdin.lines() { - output.push_str(&cut_line(line, delimiter, &fields)); - output.push('\n'); + if let Some(result) = 
process_line(line) { + output.push_str(&result); + output.push('\n'); + } } } continue; @@ -89,8 +176,10 @@ impl Builtin for Cut { Ok(content) => { let text = String::from_utf8_lossy(&content); for line in text.lines() { - output.push_str(&cut_line(line, delimiter, &fields)); - output.push('\n'); + if let Some(result) = process_line(line) { + output.push_str(&result); + output.push('\n'); + } } } Err(e) => { @@ -104,77 +193,156 @@ impl Builtin for Cut { } } -/// Parse a field specification like "1", "1,3", "1-3", "1,3-5" -fn parse_field_spec(spec: &str) -> Vec { - let mut fields = Vec::new(); +/// Position in a cut specification — can be open-ended +#[derive(Debug, Clone)] +enum Position { + Single(usize), + Range(usize, usize), + FromStart(usize), // -N (1 to N) + ToEnd(usize), // N- (N to end) +} + +/// Parse a position specification like "1", "1,3", "1-3", "3-", "-3" +fn parse_position_spec(spec: &str) -> Vec { + let mut positions = Vec::new(); for part in spec.split(',') { if let Some((start, end)) = part.split_once('-') { - let start: usize = start.parse().unwrap_or(1); - let end: usize = end.parse().unwrap_or(start); - for f in start..=end { - if f > 0 { - fields.push(f); + if start.is_empty() { + // -N + if let Ok(n) = end.parse::() { + positions.push(Position::FromStart(n)); } + } else if end.is_empty() { + // N- + if let Ok(n) = start.parse::() { + positions.push(Position::ToEnd(n)); + } + } else { + // N-M + let s: usize = start.parse().unwrap_or(1); + let e: usize = end.parse().unwrap_or(s); + positions.push(Position::Range(s, e)); } } else if let Ok(f) = part.parse::() { if f > 0 { - fields.push(f); + positions.push(Position::Single(f)); } } } - fields.sort(); - fields.dedup(); - fields + positions } -/// Cut fields from a line -fn cut_line(line: &str, delimiter: char, fields: &[usize]) -> String { - let parts: Vec<&str> = line.split(delimiter).collect(); - let selected: Vec<&str> = fields - .iter() - .filter_map(|&f| parts.get(f - 1).copied()) - 
.collect(); - selected.join(&delimiter.to_string()) +/// Resolve position specifications into concrete 1-indexed positions +fn resolve_positions(positions: &[Position], total: usize) -> Vec { + let mut result = Vec::new(); + for pos in positions { + match pos { + Position::Single(n) => { + if *n > 0 && *n <= total { + result.push(*n); + } + } + Position::Range(s, e) => { + let start = (*s).max(1); + let end = (*e).min(total); + for i in start..=end { + result.push(i); + } + } + Position::FromStart(n) => { + for i in 1..=(*n).min(total) { + result.push(i); + } + } + Position::ToEnd(n) => { + let start = (*n).max(1); + for i in start..=total { + result.push(i); + } + } + } + } + result.sort(); + result.dedup(); + result } /// The tr builtin - translate or delete characters. /// -/// Usage: tr [-d] SET1 [SET2] +/// Usage: tr [-d] [-s] [-c] SET1 [SET2] /// /// Options: /// -d Delete characters in SET1 +/// -s Squeeze repeated output characters in SET2 (or SET1 if no SET2) +/// -c Complement SET1 (use all chars NOT in SET1) /// /// SET1 and SET2 can contain character ranges like a-z, A-Z, 0-9 +/// and POSIX classes like [:lower:], [:upper:], [:digit:] pub struct Tr; #[async_trait] impl Builtin for Tr { async fn execute(&self, ctx: Context<'_>) -> Result { - let delete = ctx.args.iter().any(|a| a == "-d"); - // Only treat as flag if it's a known flag like "-d", not a lone "-" which is a valid char set - let non_flag_args: Vec<_> = ctx - .args - .iter() - .filter(|a| *a != "-d" && (a.len() == 1 || !a.starts_with('-'))) - .collect(); + let mut delete = false; + let mut squeeze = false; + let mut complement = false; + + // Parse flags (can be combined like -ds, -cd) + let mut non_flag_args: Vec<&String> = Vec::new(); + for arg in ctx.args.iter() { + if arg.starts_with('-') + && arg.len() > 1 + && arg.chars().skip(1).all(|c| "dsc".contains(c)) + { + for c in arg.chars().skip(1) { + match c { + 'd' => delete = true, + 's' => squeeze = true, + 'c' => complement = true, + _ => 
{} + } + } + } else { + non_flag_args.push(arg); + } + } if non_flag_args.is_empty() { return Ok(ExecResult::err("tr: missing operand\n".to_string(), 1)); } - let set1 = expand_char_set(non_flag_args[0]); + let mut set1 = expand_char_set(non_flag_args[0]); + if complement { + // Complement: use all ASCII chars NOT in set1 + let original = set1.clone(); + set1 = (0u8..=127) + .map(|b| b as char) + .filter(|c| !original.contains(c)) + .collect(); + } - let result = if delete { - // Delete mode - let stdin = ctx.stdin.unwrap_or(""); + let stdin = ctx.stdin.unwrap_or(""); + + let result = if delete && squeeze { + // -ds: delete SET1 chars, then squeeze SET2 chars + let set2 = if non_flag_args.len() >= 2 { + expand_char_set(non_flag_args[1]) + } else { + set1.clone() + }; + let after_delete: String = stdin.chars().filter(|c| !set1.contains(c)).collect(); + squeeze_chars(&after_delete, &set2) + } else if delete { stdin .chars() .filter(|c| !set1.contains(c)) .collect::() + } else if squeeze && non_flag_args.len() < 2 { + // -s with only SET1: squeeze characters in SET1 + squeeze_chars(stdin, &set1) } else { - // Translate mode if non_flag_args.len() < 2 { return Ok(ExecResult::err( "tr: missing operand after SET1\n".to_string(), @@ -183,25 +351,44 @@ impl Builtin for Tr { } let set2 = expand_char_set(non_flag_args[1]); - let stdin = ctx.stdin.unwrap_or(""); - stdin + let translated: String = stdin .chars() .map(|c| { if let Some(pos) = set1.iter().position(|&x| x == c) { - // Get corresponding char from set2, or last char if set2 is shorter *set2.get(pos).or(set2.last()).unwrap_or(&c) } else { c } }) - .collect::() + .collect(); + + if squeeze { + squeeze_chars(&translated, &set2) + } else { + translated + } }; Ok(ExecResult::ok(result)) } } +/// Squeeze repeated consecutive characters that are in the given set +fn squeeze_chars(s: &str, set: &[char]) -> String { + let mut result = String::with_capacity(s.len()); + let mut last_char: Option = None; + + for c in s.chars() 
{ + if set.contains(&c) && last_char == Some(c) { + continue; // skip repeated char in squeeze set + } + result.push(c); + last_char = Some(c); + } + result +} + /// Expand a character set specification like "a-z" into a list of characters. /// Supports POSIX character classes: [:lower:], [:upper:], [:digit:], [:alpha:], [:alnum:], [:space:] fn expand_char_set(spec: &str) -> Vec { @@ -422,10 +609,53 @@ mod tests { } #[test] - fn test_parse_field_spec() { - assert_eq!(parse_field_spec("1"), vec![1]); - assert_eq!(parse_field_spec("1,3"), vec![1, 3]); - assert_eq!(parse_field_spec("1-3"), vec![1, 2, 3]); - assert_eq!(parse_field_spec("1,3-5"), vec![1, 3, 4, 5]); + fn test_parse_position_spec() { + // Resolved against 10 total positions + let resolve = |spec: &str| resolve_positions(&parse_position_spec(spec), 10); + assert_eq!(resolve("1"), vec![1]); + assert_eq!(resolve("1,3"), vec![1, 3]); + assert_eq!(resolve("1-3"), vec![1, 2, 3]); + assert_eq!(resolve("1,3-5"), vec![1, 3, 4, 5]); + assert_eq!(resolve("3-"), vec![3, 4, 5, 6, 7, 8, 9, 10]); + assert_eq!(resolve("-3"), vec![1, 2, 3]); + } + + #[tokio::test] + async fn test_cut_char_mode() { + let result = run_cut(&["-c", "1-5"], Some("hello world\n")).await; + assert_eq!(result.exit_code, 0); + assert_eq!(result.stdout, "hello\n"); + } + + #[tokio::test] + async fn test_cut_complement() { + let result = run_cut(&["-d", ",", "--complement", "-f", "2"], Some("a,b,c,d\n")).await; + assert_eq!(result.exit_code, 0); + assert_eq!(result.stdout, "a,c,d\n"); + } + + #[tokio::test] + async fn test_cut_only_delimited() { + let result = run_cut( + &["-d", ",", "-f", "1", "-s"], + Some("a,b,c\nno delim\nx,y\n"), + ) + .await; + assert_eq!(result.exit_code, 0); + assert_eq!(result.stdout, "a\nx\n"); + } + + #[tokio::test] + async fn test_tr_squeeze() { + let result = run_tr(&["-s", "eol "], Some("heeelllo wooorld\n")).await; + assert_eq!(result.exit_code, 0); + assert_eq!(result.stdout, "helo world\n"); + } + + #[tokio::test] + 
async fn test_tr_complement_delete() { + let result = run_tr(&["-cd", "0-9\n"], Some("hello123\n")).await; + assert_eq!(result.exit_code, 0); + assert_eq!(result.stdout, "123\n"); } } diff --git a/crates/bashkit/tests/spec_cases/bash/cuttr.test.sh b/crates/bashkit/tests/spec_cases/bash/cuttr.test.sh index bd15dd5c..80bd3ea1 100644 --- a/crates/bashkit/tests/spec_cases/bash/cuttr.test.sh +++ b/crates/bashkit/tests/spec_cases/bash/cuttr.test.sh @@ -22,33 +22,29 @@ a:b ### end ### tr_lowercase_to_uppercase -### skip: test expects trailing newline but tr preserves input format # Translate lowercase to uppercase -printf 'hello' | tr a-z A-Z +printf 'hello\n' | tr a-z A-Z ### expect HELLO ### end ### tr_delete -### skip: test expects trailing newline but tr preserves input format # Delete characters -printf 'hello world' | tr -d aeiou +printf 'hello world\n' | tr -d aeiou ### expect hll wrld ### end ### tr_single_char -### skip: test expects trailing newline but tr preserves input format # Translate single character -printf 'a:b:c' | tr : - +printf 'a:b:c\n' | tr : - ### expect a-b-c ### end ### tr_spaces_to_newlines -### skip: tr escape sequence processing not implemented # Replace spaces with newlines -printf 'one two three' | tr ' ' '\n' +printf 'one two three\n' | tr ' ' '\n' ### expect one two @@ -72,15 +68,13 @@ done ### end ### tr_delete_all_vowels -### skip: test expects trailing newline but tr preserves input format # Delete all vowels -printf 'HELLO WORLD' | tr -d AEIOU +printf 'HELLO WORLD\n' | tr -d AEIOU ### expect HLL WRLD ### end ### cut_char_range -### skip: cut -c (character mode) not implemented # Cut character range printf 'hello world\n' | cut -c1-5 ### expect @@ -88,7 +82,6 @@ hello ### end ### cut_char_single -### skip: cut -c (character mode) not implemented # Cut single character printf 'hello\n' | cut -c1 ### expect @@ -96,7 +89,6 @@ h ### end ### cut_char_multiple -### skip: cut -c (character mode) not implemented # Cut multiple chars printf 
'hello\n' | cut -c1,3,5 ### expect @@ -104,14 +96,13 @@ hlo ### end ### cut_char_from_end -### skip: cut -c (character mode) not implemented +# Cut from start to position N printf 'hello\n' | cut -c-3 ### expect hel ### end ### cut_char_to_end -### skip: cut -c (character mode) not implemented # Cut from position to end printf 'hello world\n' | cut -c7- ### expect @@ -126,7 +117,6 @@ a:b:c ### end ### cut_field_to_end -### skip: cut f3- (field to end) syntax not implemented # Cut fields to end printf 'a:b:c:d:e\n' | cut -d: -f3- ### expect @@ -134,15 +124,15 @@ c:d:e ### end ### cut_complement -### skip: cut --complement not implemented +# Complement field selection printf 'a,b,c,d\n' | cut -d, --complement -f2 ### expect a,c,d ### end ### cut_output_delimiter -### skip: cut --output-delimiter not implemented -printf 'a,b,c\n' | cut -d, -f1,3 --output-delimiter='-' +# Custom output delimiter +printf 'a,b,c\n' | cut -d, -f1,3 --output-delimiter=- ### expect a-c ### end @@ -155,55 +145,50 @@ b ### end ### tr_squeeze -### skip: tr -s (squeeze) not implemented # Squeeze repeated characters -printf 'heeelllo wooorld' | tr -s 'eol ' +printf 'heeelllo wooorld\n' | tr -s 'eol ' ### expect helo world ### end ### tr_complement -### skip: tr -c (complement) not implemented -# Complement character set -printf 'hello123' | tr -cd '0-9' +# Complement character set — delete all non-digits +printf 'hello123\n' | tr -cd '0-9\n' ### expect 123 ### end ### tr_class_lower -### skip: tr character classes not implemented # Character class [:lower:] -printf 'Hello World' | tr '[:upper:]' '[:lower:]' +printf 'Hello World\n' | tr '[:upper:]' '[:lower:]' ### expect hello world ### end ### tr_class_upper -### skip: tr character classes not implemented # Character class [:upper:] -printf 'Hello World' | tr '[:lower:]' '[:upper:]' +printf 'Hello World\n' | tr '[:lower:]' '[:upper:]' ### expect HELLO WORLD ### end ### tr_class_digit -### skip: tr character classes not implemented -printf 
'a1b2c3' | tr -d '[:digit:]' +# Delete digits using character class +printf 'a1b2c3\n' | tr -d '[:digit:]' ### expect abc ### end ### tr_class_alpha -### skip: tr character classes not implemented -printf 'a1b2c3' | tr -d '[:alpha:]' +# Delete alpha using character class +printf 'a1b2c3\n' | tr -d '[:alpha:]' ### expect 123 ### end ### tr_escape_newline -### skip: tr escape sequence processing not implemented # Translate to newline -printf 'a:b:c' | tr ':' '\n' +printf 'a:b:c\n' | tr ':' '\n' ### expect a b @@ -211,30 +196,28 @@ c ### end ### tr_escape_tab -### skip: tr escape sequence processing not implemented # Translate to tab -printf 'a b c' | tr ' ' '\t' +printf 'a b c\n' | tr ' ' '\t' ### expect a b c ### end ### tr_multiple_chars -### skip: test expects trailing newline but tr preserves input format # Translate multiple chars -printf 'aabbcc' | tr 'abc' 'xyz' +printf 'aabbcc\n' | tr 'abc' 'xyz' ### expect xxyyzz ### end ### tr_truncate_set2 -### skip: test expects trailing newline but tr preserves input format -printf 'aabbcc' | tr 'abc' 'x' +# When SET2 shorter, last char repeats +printf 'aabbcc\n' | tr 'abc' 'x' ### expect xxxxxx ### end ### cut_only_delimited -### skip: cut -s (only delimited) not implemented +# Only print lines containing delimiter printf 'a,b,c\nno delim\nx,y\n' | cut -d, -f1 -s ### expect a From ec61b3fcd93c6da28e8849763a4c119cd316d4f3 Mon Sep 17 00:00:00 2001 From: Claude Date: Mon, 23 Feb 2026 06:12:49 +0000 Subject: [PATCH 2/2] feat(sort,uniq): add sort -t/-k/-s/-c/-h/-M/-o and uniq -i/-f options Add field delimiter (-t), key field (-k with n/r suffix), stable sort (-s), check sorted (-c), human numeric sort (-h), month sort (-M), and output file (-o) to sort builtin. Add case insensitive (-i) and skip fields (-f) to uniq builtin. Remove 12 skip markers, add 4 new spec tests and 11 unit tests. 
https://claude.ai/code/session_012rzB3FRw7yoQWCG1mxyW7J --- crates/bashkit/src/builtins/sortuniq.rs | 409 +++++++++++++++--- .../tests/spec_cases/bash/sortuniq.test.sh | 52 ++- specs/009-implementation-status.md | 8 +- 3 files changed, 398 insertions(+), 71 deletions(-) diff --git a/crates/bashkit/src/builtins/sortuniq.rs b/crates/bashkit/src/builtins/sortuniq.rs index 75fdc4ab..ac62cce1 100644 --- a/crates/bashkit/src/builtins/sortuniq.rs +++ b/crates/bashkit/src/builtins/sortuniq.rs @@ -11,7 +11,7 @@ use crate::interpreter::ExecResult; /// The sort builtin - sort lines of text. /// -/// Usage: sort [-fnruV] [FILE...] +/// Usage: sort [-cfhnMruVs] [-t DELIM] [-k KEYDEF] [-o FILE] [FILE...] /// /// Options: /// -f Fold lower case to upper case characters (case insensitive) @@ -19,42 +19,153 @@ use crate::interpreter::ExecResult; /// -r Reverse the result of comparisons /// -u Output only unique lines (like sort | uniq) /// -V Natural sort of version numbers +/// -t Field delimiter character +/// -k Sort key definition (e.g., -k2 or -k2,2) +/// -s Stable sort (preserve input order for equal keys) +/// -c Check if input is sorted; exit 1 if not +/// -h Human numeric sort (1K, 2M, 3G) +/// -M Month sort (JAN < FEB < ... 
< DEC) +/// -o Write output to FILE pub struct Sort; +/// Extract the sort key from a line based on field delimiter and key spec +fn extract_key(line: &str, delimiter: Option, key_field: usize) -> String { + if let Some(delim) = delimiter { + line.split(delim) + .nth(key_field.saturating_sub(1)) + .unwrap_or("") + .to_string() + } else { + // Default: whitespace-separated fields + line.split_whitespace() + .nth(key_field.saturating_sub(1)) + .unwrap_or("") + .to_string() + } +} + +/// Parse human-numeric value (e.g., "10K" → 10_000, "5M" → 5_000_000) +fn parse_human_numeric(s: &str) -> f64 { + let s = s.trim(); + if s.is_empty() { + return 0.0; + } + let last = s.as_bytes().last().copied().unwrap_or(b'0'); + let multiplier = match last { + b'K' | b'k' => 1_000.0, + b'M' | b'm' => 1_000_000.0, + b'G' | b'g' => 1_000_000_000.0, + b'T' | b't' => 1_000_000_000_000.0, + _ => return s.parse::().unwrap_or(0.0), + }; + let num_part = &s[..s.len() - 1]; + num_part.parse::().unwrap_or(0.0) * multiplier +} + +/// Parse month abbreviation to ordinal (1-12, 0 for unknown) +fn month_ordinal(s: &str) -> u32 { + match s.trim().to_uppercase().as_str() { + "JAN" => 1, + "FEB" => 2, + "MAR" => 3, + "APR" => 4, + "MAY" => 5, + "JUN" => 6, + "JUL" => 7, + "AUG" => 8, + "SEP" => 9, + "OCT" => 10, + "NOV" => 11, + "DEC" => 12, + _ => 0, + } +} + #[async_trait] impl Builtin for Sort { async fn execute(&self, ctx: Context<'_>) -> Result { - let reverse = ctx - .args - .iter() - .any(|a| a.contains('r') && a.starts_with('-')); - let numeric = ctx - .args - .iter() - .any(|a| a.contains('n') && a.starts_with('-')); - let unique = ctx - .args - .iter() - .any(|a| a.contains('u') && a.starts_with('-')); - let fold_case = ctx - .args - .iter() - .any(|a| a.contains('f') && a.starts_with('-')); - - let files: Vec<_> = ctx.args.iter().filter(|a| !a.starts_with('-')).collect(); + let mut reverse = false; + let mut numeric = false; + let mut unique = false; + let mut fold_case = false; + let mut 
stable = false; + let mut check_sorted = false; + let mut human_numeric = false; + let mut month_sort = false; + let mut delimiter: Option = None; + let mut key_field: Option = None; + let mut output_file: Option = None; + let mut files = Vec::new(); + + let mut i = 0; + while i < ctx.args.len() { + let arg = &ctx.args[i]; + if arg == "-t" { + i += 1; + if i < ctx.args.len() { + delimiter = ctx.args[i].chars().next(); + } + } else if let Some(d) = arg.strip_prefix("-t") { + delimiter = d.chars().next(); + } else if arg == "-k" { + i += 1; + if i < ctx.args.len() { + // Parse key: "2" or "2,2" or "2n" + let key_spec = &ctx.args[i]; + let field_str: String = key_spec + .chars() + .take_while(|c| c.is_ascii_digit()) + .collect(); + key_field = field_str.parse().ok(); + // Check for type suffix in key spec + if key_spec.contains('n') { + numeric = true; + } + if key_spec.contains('r') { + reverse = true; + } + } + } else if let Some(k) = arg.strip_prefix("-k") { + let field_str: String = k.chars().take_while(|c| c.is_ascii_digit()).collect(); + key_field = field_str.parse().ok(); + if k.contains('n') { + numeric = true; + } + } else if arg == "-o" { + i += 1; + if i < ctx.args.len() { + output_file = Some(ctx.args[i].clone()); + } + } else if arg.starts_with('-') && !arg.starts_with("--") { + for c in arg.chars().skip(1) { + match c { + 'r' => reverse = true, + 'n' => numeric = true, + 'u' => unique = true, + 'f' => fold_case = true, + 's' => stable = true, + 'c' | 'C' => check_sorted = true, + 'h' => human_numeric = true, + 'M' => month_sort = true, + _ => {} + } + } + } else { + files.push(arg.clone()); + } + i += 1; + } // Collect all input let mut all_lines = Vec::new(); if files.is_empty() { - // Read from stdin if let Some(stdin) = ctx.stdin { for line in stdin.lines() { all_lines.push(line.to_string()); } } } else { - // Read from files for file in &files { let path = if file.starts_with('/') { std::path::PathBuf::from(file) @@ -76,34 +187,81 @@ impl Builtin for 
Sort { } } + // Check sorted mode + if check_sorted { + for i in 1..all_lines.len() { + let cmp = if numeric { + let a: f64 = all_lines[i - 1].trim().parse().unwrap_or(0.0); + let b: f64 = all_lines[i].trim().parse().unwrap_or(0.0); + a.partial_cmp(&b).unwrap_or(std::cmp::Ordering::Equal) + } else { + all_lines[i - 1].cmp(&all_lines[i]) + }; + let out_of_order = if reverse { + cmp == std::cmp::Ordering::Less + } else { + cmp == std::cmp::Ordering::Greater + }; + if out_of_order { + return Ok(ExecResult::err( + format!("sort: -:{}:disorder: {}\n", i + 1, all_lines[i]), + 1, + )); + } + } + return Ok(ExecResult::ok(String::new())); + } + + // Get the key extractor + let get_key = |line: &str| -> String { + if let Some(kf) = key_field { + extract_key(line, delimiter, kf) + } else { + line.to_string() + } + }; + // Sort the lines - if numeric { - all_lines.sort_by(|a, b| { - let a_num: f64 = a + let sort_fn = |a: &String, b: &String| -> std::cmp::Ordering { + let ka = get_key(a); + let kb = get_key(b); + if human_numeric { + let na = parse_human_numeric(&ka); + let nb = parse_human_numeric(&kb); + na.partial_cmp(&nb).unwrap_or(std::cmp::Ordering::Equal) + } else if month_sort { + let ma = month_ordinal(&ka); + let mb = month_ordinal(&kb); + ma.cmp(&mb) + } else if numeric { + let na: f64 = ka .split_whitespace() .next() .and_then(|s| s.parse().ok()) .unwrap_or(0.0); - let b_num: f64 = b + let nb: f64 = kb .split_whitespace() .next() .and_then(|s| s.parse().ok()) .unwrap_or(0.0); - a_num - .partial_cmp(&b_num) - .unwrap_or(std::cmp::Ordering::Equal) - }); - } else if fold_case { - all_lines.sort_by_key(|a| a.to_lowercase()); + na.partial_cmp(&nb).unwrap_or(std::cmp::Ordering::Equal) + } else if fold_case { + ka.to_lowercase().cmp(&kb.to_lowercase()) + } else { + ka.cmp(&kb) + } + }; + + if stable { + all_lines.sort_by(sort_fn); } else { - all_lines.sort(); + all_lines.sort_unstable_by(sort_fn); } if reverse { all_lines.reverse(); } - // Remove duplicates if -u if unique 
{ all_lines.dedup(); } @@ -113,47 +271,98 @@ impl Builtin for Sort { output.push('\n'); } + // Write to output file if -o specified + if let Some(ref outfile) = output_file { + let path = if outfile.starts_with('/') { + std::path::PathBuf::from(outfile) + } else { + ctx.cwd.join(outfile) + }; + if let Err(e) = ctx.fs.write_file(&path, output.as_bytes()).await { + return Ok(ExecResult::err(format!("sort: {}: {}\n", outfile, e), 1)); + } + return Ok(ExecResult::ok(String::new())); + } + Ok(ExecResult::ok(output)) } } /// The uniq builtin - report or omit repeated lines. /// -/// Usage: uniq [-cdu] [INPUT [OUTPUT]] +/// Usage: uniq [-cdiu] [-f N] [INPUT [OUTPUT]] /// /// Options: /// -c Prefix lines by the number of occurrences /// -d Only print duplicate lines /// -u Only print unique lines +/// -i Case insensitive comparison +/// -f N Skip N fields before comparing pub struct Uniq; +/// Get the comparison key for a line, skipping fields and optionally case-folding +fn uniq_key(line: &str, skip_fields: usize, case_insensitive: bool) -> String { + let key = if skip_fields > 0 { + line.split_whitespace() + .skip(skip_fields) + .collect::>() + .join(" ") + } else { + line.to_string() + }; + if case_insensitive { + key.to_lowercase() + } else { + key + } +} + #[async_trait] impl Builtin for Uniq { async fn execute(&self, ctx: Context<'_>) -> Result { - let count = ctx - .args - .iter() - .any(|a| a.contains('c') && a.starts_with('-')); - let only_duplicates = ctx - .args - .iter() - .any(|a| a.contains('d') && a.starts_with('-')); - let only_unique = ctx - .args - .iter() - .any(|a| a.contains('u') && a.starts_with('-')); - - let files: Vec<_> = ctx.args.iter().filter(|a| !a.starts_with('-')).collect(); + let mut count = false; + let mut only_duplicates = false; + let mut only_unique = false; + let mut case_insensitive = false; + let mut skip_fields: usize = 0; + let mut files = Vec::new(); + + let mut idx = 0; + while idx < ctx.args.len() { + let arg = &ctx.args[idx]; + 
if arg == "-f" { + idx += 1; + if idx < ctx.args.len() { + skip_fields = ctx.args[idx].parse().unwrap_or(0); + } + } else if let Some(n) = arg + .strip_prefix("-f") + .filter(|s| s.chars().all(|c| c.is_ascii_digit())) + { + skip_fields = n.parse().unwrap_or(0); + } else if arg.starts_with('-') && !arg.starts_with("--") { + for c in arg.chars().skip(1) { + match c { + 'c' => count = true, + 'd' => only_duplicates = true, + 'u' => only_unique = true, + 'i' => case_insensitive = true, + _ => {} + } + } + } else { + files.push(arg.clone()); + } + idx += 1; + } // Get input lines let lines: Vec = if files.is_empty() { - // Read from stdin ctx.stdin .map(|s| s.lines().map(|l| l.to_string()).collect()) .unwrap_or_default() } else { - // Read from first file - let file = files.first().unwrap(); + let file = &files[0]; let path = if file.starts_with('/') { std::path::PathBuf::from(file) } else { @@ -171,18 +380,18 @@ impl Builtin for Uniq { } }; - // Process lines - uniq only removes adjacent duplicates let mut result = Vec::new(); let mut prev_line: Option = None; + let mut prev_key: Option = None; let mut current_count = 0usize; for line in lines { - if let Some(ref prev) = prev_line { - if *prev == line { + let key = uniq_key(&line, skip_fields, case_insensitive); + if let Some(ref pk) = prev_key { + if *pk == key { current_count += 1; continue; } else { - // Output previous line based on flags let should_output = if only_duplicates { current_count > 1 } else if only_unique { @@ -193,18 +402,23 @@ impl Builtin for Uniq { if should_output { if count { - result.push(format!("{:>7} {}", current_count, prev)); + result.push(format!( + "{:>7} {}", + current_count, + prev_line.as_deref().unwrap_or("") + )); } else { - result.push(prev.clone()); + result.push(prev_line.clone().unwrap_or_default()); } } } } prev_line = Some(line); + prev_key = Some(key); current_count = 1; } - // Don't forget the last line + // Last line if let Some(prev) = prev_line { let should_output = if 
only_duplicates { current_count > 1
@@ -356,4 +570,94 @@ mod tests {
         assert!(!result.stdout.contains("a\n"));
         assert!(!result.stdout.contains("c\n"));
     }
+
+    // `-k2n` orders lines numerically by the second whitespace-separated field.
+    #[tokio::test]
+    async fn test_sort_key_field() {
+        let result = run_sort(&["-k2n"], Some("Bob 25\nAlice 30\nDavid 20\n")).await;
+        assert_eq!(result.exit_code, 0);
+        assert_eq!(result.stdout, "David 20\nBob 25\nAlice 30\n");
+    }
+
+    // `-t:` makes ':' the field separator that `-k2n` splits on.
+    #[tokio::test]
+    async fn test_sort_delimiter_key() {
+        let result = run_sort(&["-t:", "-k2n"], Some("b:2\na:1\nc:3\n")).await;
+        assert_eq!(result.exit_code, 0);
+        assert_eq!(result.stdout, "a:1\nb:2\nc:3\n");
+    }
+
+    // `-c` on already-sorted input exits with status 0.
+    #[tokio::test]
+    async fn test_sort_check_sorted() {
+        let result = run_sort(&["-c"], Some("a\nb\nc\n")).await;
+        assert_eq!(result.exit_code, 0);
+    }
+
+    // `-c` on out-of-order input exits with status 1.
+    #[tokio::test]
+    async fn test_sort_check_unsorted() {
+        let result = run_sort(&["-c"], Some("b\na\nc\n")).await;
+        assert_eq!(result.exit_code, 1);
+    }
+
+    // `-h` compares values carrying K/M/G suffixes by magnitude.
+    #[tokio::test]
+    async fn test_sort_human_numeric() {
+        let result = run_sort(&["-h"], Some("10K\n1K\n100M\n1G\n")).await;
+        assert_eq!(result.exit_code, 0);
+        assert_eq!(result.stdout, "1K\n10K\n100M\n1G\n");
+    }
+
+    // `-M` orders three-letter month names chronologically.
+    #[tokio::test]
+    async fn test_sort_month() {
+        let result = run_sort(&["-M"], Some("Mar\nJan\nFeb\n")).await;
+        assert_eq!(result.exit_code, 0);
+        assert_eq!(result.stdout, "Jan\nFeb\nMar\n");
+    }
+
+    // `-i` collapses adjacent lines that differ only in case, keeping the first.
+    #[tokio::test]
+    async fn test_uniq_case_insensitive() {
+        let result = run_uniq(&["-i"], Some("a\nA\nb\nB\n")).await;
+        assert_eq!(result.exit_code, 0);
+        assert_eq!(result.stdout, "a\nb\n");
+    }
+
+    // `-f1` ignores the first field when comparing adjacent lines.
+    #[tokio::test]
+    async fn test_uniq_skip_fields() {
+        let result = run_uniq(&["-f1"], Some("x a\ny a\nx b\n")).await;
+        assert_eq!(result.exit_code, 0);
+        assert_eq!(result.stdout, "x a\nx b\n");
+    }
+
+    // Fields are 1-indexed and a missing field yields ""; sync helper, so a plain `#[test]`.
+    #[test]
+    fn test_extract_key() {
+        assert_eq!(extract_key("a:b:c", Some(':'), 2), "b");
+        assert_eq!(extract_key("hello world", None, 1), "hello");
+        assert_eq!(extract_key("hello world", None, 2), "world");
+        assert_eq!(extract_key("x", None, 5), "");
+    }
+
+    // K/M/G scale by powers of 1000 and "" parses as 0; sync helper, so a plain `#[test]`.
+    #[test]
+    fn test_parse_human_numeric() {
+        assert_eq!(parse_human_numeric("1K"), 1000.0);
+        assert_eq!(parse_human_numeric("5M"), 5_000_000.0);
+        assert_eq!(parse_human_numeric("2G"), 2_000_000_000.0);
+        assert_eq!(parse_human_numeric("42"), 42.0);
+        assert_eq!(parse_human_numeric(""), 0.0);
+    }
+
+    // Month names match in any letter case; unknown names map to 0. Sync helper: plain `#[test]`.
+    #[test]
+    fn test_month_ordinal() {
+        assert_eq!(month_ordinal("JAN"), 1);
+        assert_eq!(month_ordinal("feb"), 2);
+        assert_eq!(month_ordinal("Dec"), 12);
+        assert_eq!(month_ordinal("xyz"), 0);
+    }
 }
diff --git a/crates/bashkit/tests/spec_cases/bash/sortuniq.test.sh b/crates/bashkit/tests/spec_cases/bash/sortuniq.test.sh
index b69ba4ac..f88984fe 100644
--- a/crates/bashkit/tests/spec_cases/bash/sortuniq.test.sh
+++ b/crates/bashkit/tests/spec_cases/bash/sortuniq.test.sh
@@ -120,7 +120,7 @@ Cherry
 ### end
 
 ### sort_field_delim
-### skip: sort -t (field delimiter) not implemented
+# Sort by field with delimiter
 printf 'b:2\na:1\nc:3\n' | sort -t: -k2n
 ### expect
 a:1
@@ -129,7 +129,7 @@ c:3
 ### end
 
 ### sort_key_field
-### skip: sort -k (key field) not implemented
+# Sort by key field
 printf 'Bob 25\nAlice 30\nDavid 20\n' | sort -k2n
 ### expect
 David 20
@@ -138,7 +138,7 @@ Alice 30
 ### end
 
 ### sort_stable
-### skip: sort -s (stable) not implemented
+# Stable sort preserves input order for equal keys
 printf 'b 1\na 2\nb 3\n' | sort -s -k1,1
 ### expect
 a 2
@@ -147,7 +147,7 @@ b 3
 ### end
 
 ### sort_check
-### skip: sort -c (check sorted) not implemented
+# Check if input is sorted
 printf 'a\nb\nc\n' | sort -c
 echo $?
### expect @@ -178,7 +178,7 @@ b ### end ### uniq_ignore_case -### skip: uniq -i (case insensitive) not implemented +# Case insensitive dedup printf 'a\nA\nb\nB\n' | uniq -i ### expect a @@ -186,7 +186,7 @@ b ### end ### uniq_skip_fields -### skip: uniq -f (skip fields) not implemented +# Skip fields before comparing printf 'x a\ny a\nx b\n' | uniq -f1 ### expect x a @@ -202,7 +202,7 @@ printf 'a\nb\na\nb\na\n' | sort | uniq -c ### end ### sort_human_numeric -### skip: sort -h (human numeric) not implemented +# Sort human-readable numeric values printf '10K\n1K\n100M\n1G\n' | sort -h ### expect 1K @@ -212,7 +212,7 @@ printf '10K\n1K\n100M\n1G\n' | sort -h ### end ### sort_month -### skip: sort -M (month) not implemented +# Sort by month name printf 'Mar\nJan\nFeb\n' | sort -M ### expect Jan @@ -221,13 +221,47 @@ Mar ### end ### sort_output_file -### skip: sort -o (output file) not implemented +# Sort to output file printf 'b\na\n' | sort -o /tmp/sorted.txt && cat /tmp/sorted.txt ### expect a b ### end +### sort_check_unsorted +# Check unsorted input returns 1 +printf 'b\na\n' | sort -c 2>/dev/null +echo $? +### expect +1 +### end + +### sort_key_field_numeric_reverse +# Sort by key with numeric reverse +printf 'x 30\ny 10\nz 20\n' | sort -k2 -n -r +### expect +x 30 +z 20 +y 10 +### end + +### sort_field_delim_csv +# Sort CSV by second column +printf 'z,1\na,3\nm,2\n' | sort -t, -k2n +### expect +z,1 +m,2 +a,3 +### end + +### uniq_case_count +# Case insensitive count +printf 'Hello\nhello\nHELLO\nWorld\n' | uniq -ic +### expect + 3 Hello + 1 World +### end + ### sort_zero_terminated ### skip: sort -z (zero terminated) not implemented printf 'b\0a\0c\0' | sort -z | tr '\0' '\n' diff --git a/specs/009-implementation-status.md b/specs/009-implementation-status.md index 7855f0f9..605a4680 100644 --- a/specs/009-implementation-status.md +++ b/specs/009-implementation-status.md @@ -107,17 +107,17 @@ Bashkit implements IEEE 1003.1-2024 Shell Command Language. 
See ## Spec Test Coverage -**Total spec test cases:** 1038 +**Total spec test cases:** 1042 | Category | Cases | In CI | Pass | Skip | Notes | |----------|-------|-------|------|------|-------| -| Bash (core) | 636 | Yes | 576 | 60 | `bash_spec_tests` in CI | +| Bash (core) | 640 | Yes | 592 | 48 | `bash_spec_tests` in CI | | AWK | 90 | Yes | 73 | 17 | loops, arrays, -v, ternary, field assign | | Grep | 81 | Yes | 76 | 5 | now with -z, -r, -a, -b, -H, -h, -f, -P | | Sed | 65 | Yes | 53 | 12 | hold space, change, regex ranges, -E | | JQ | 108 | Yes | 100 | 8 | reduce, walk, regex funcs, --arg/--argjson, combined flags | | Python | 58 | Yes | 50 | 8 | **Experimental.** VFS bridging, pathlib, env vars | -| **Total** | **1038** | **Yes** | **928** | **110** | | +| **Total** | **1042** | **Yes** | **944** | **98** | | ### Bash Spec Tests Breakdown @@ -152,7 +152,7 @@ Bashkit implements IEEE 1003.1-2024 Shell Command Language. See | printf.test.sh | 24 | format specifiers, array expansion | | procsub.test.sh | 6 | | | sleep.test.sh | 6 | | -| sortuniq.test.sh | 28 | sort and uniq (14 skipped) | +| sortuniq.test.sh | 32 | sort and uniq (2 skipped) | | source.test.sh | 21 | source/., function loading, PATH search, positional params | | test-operators.test.sh | 17 | file/string tests (2 skipped) | | time.test.sh | 11 | Wall-clock only (user/sys always 0) |