Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 22 additions & 10 deletions datafusion/optimizer/src/simplify_expressions/regex.rs
Original file line number Diff line number Diff line change
Expand Up @@ -283,20 +283,23 @@ fn partial_anchored_literal_to_like(v: &[Hir]) -> Option<String> {

/// Extracts a string literal expression assuming that [`is_anchored_literal`]
/// returned true.
fn anchored_literal_to_expr(v: &[Hir]) -> Option<Expr> {
fn anchored_literal_to_expr(v: &[Hir], string_scalar: &StringScalar) -> Option<Expr> {
match v.len() {
2 => Some(lit("")),
2 => Some(string_scalar.to_expr("")),
3 => {
let HirKind::Literal(l) = v[1].kind() else {
return None;
};
like_str_from_literal(l).map(lit)
like_str_from_literal(l).map(|s| string_scalar.to_expr(s))
}
_ => None,
}
}

fn anchored_alternation_to_exprs(v: &[Hir]) -> Option<Vec<Expr>> {
fn anchored_alternation_to_exprs(
v: &[Hir],
string_scalar: &StringScalar,
) -> Option<Vec<Expr>> {
if 3 != v.len() {
return None;
}
Expand All @@ -308,7 +311,8 @@ fn anchored_alternation_to_exprs(v: &[Hir]) -> Option<Vec<Expr>> {
for hir in alters {
let mut is_safe = false;
if let HirKind::Literal(l) = hir.kind()
&& let Some(safe_literal) = str_from_literal(l).map(lit)
&& let Some(safe_literal) =
str_from_literal(l).map(|s| string_scalar.to_expr(s))
{
literals.push(safe_literal);
is_safe = true;
Expand All @@ -321,7 +325,9 @@ fn anchored_alternation_to_exprs(v: &[Hir]) -> Option<Vec<Expr>> {

return Some(literals);
} else if let HirKind::Literal(l) = sub.kind() {
if let Some(safe_literal) = str_from_literal(l).map(lit) {
if let Some(safe_literal) =
str_from_literal(l).map(|s| string_scalar.to_expr(s))
{
return Some(vec![safe_literal]);
}
return None;
Expand Down Expand Up @@ -351,12 +357,18 @@ fn lower_simple(
));
}
HirKind::Concat(inner) if is_anchored_literal(inner) => {
return anchored_literal_to_expr(inner).map(|right| {
mode.expr_matches_literal(Box::new(left.clone()), Box::new(right))
return anchored_literal_to_expr(inner, string_scalar).map(|right| {
if mode.i {
// Case-insensitive: use ILIKE for exact match (no wildcards)
mode.expr(Box::new(left.clone()), Box::new(right))
} else {
// Case-sensitive: use Eq / NotEq
mode.expr_matches_literal(Box::new(left.clone()), Box::new(right))
}
});
}
HirKind::Concat(inner) if is_anchored_capture(inner) => {
return anchored_alternation_to_exprs(inner)
HirKind::Concat(inner) if !mode.i && is_anchored_capture(inner) => {
return anchored_alternation_to_exprs(inner, string_scalar)
.map(|right| left.clone().in_list(right, mode.not));
}
HirKind::Concat(inner) => {
Expand Down
159 changes: 159 additions & 0 deletions datafusion/sqllogictest/test_files/predicates.slt
Original file line number Diff line number Diff line change
Expand Up @@ -204,12 +204,171 @@ SELECT * FROM test WHERE column1 ~ 'z'
----
Bazzz

query T
SELECT * FROM test WHERE column1 ~ '^Bazzz$'
----
Bazzz

query T
SELECT * FROM test WHERE column1 ~ '^(foo|Bazzz)$'
----
foo
Bazzz

statement ok
CREATE TABLE test_regex_utf8view(s VARCHAR) AS VALUES ('foo'), ('Bazzz');
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Question to educate myself: how are the values here Utf8View?
I'd expect some casting to be needed to achieve that.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think it's because the config option map_string_types_to_utf8view defaults to true, so a VARCHAR column is planned as Utf8View in the slt tests:

/// If true, string types (VARCHAR, CHAR, Text, and String) are mapped to `Utf8View` during SQL planning.
/// If false, they are mapped to `Utf8`.
/// Default is true.
pub map_string_types_to_utf8view: bool, default = true


statement ok
set datafusion.explain.logical_plan_only = true

# `~` anchored literal -> `= Utf8View(..)`
query TT
EXPLAIN SELECT * FROM test_regex_utf8view WHERE s ~ '^Bazzz$'
----
logical_plan
01)Filter: test_regex_utf8view.s = Utf8View("Bazzz")
02)--TableScan: test_regex_utf8view projection=[s]

# `~*` anchored literal -> `ILIKE Utf8View(..)`
query TT
EXPLAIN SELECT * FROM test_regex_utf8view WHERE s ~* '^bazzz$'
----
logical_plan
01)Filter: test_regex_utf8view.s ILIKE Utf8View("bazzz")
02)--TableScan: test_regex_utf8view projection=[s]

# `~` anchored alternation -> OR of `= Utf8View(..)` comparisons.
query TT
EXPLAIN SELECT * FROM test_regex_utf8view WHERE s ~ '^(foo|Bazzz)$'
----
logical_plan
01)Filter: test_regex_utf8view.s = Utf8View("foo") OR test_regex_utf8view.s = Utf8View("Bazzz")
02)--TableScan: test_regex_utf8view projection=[s]

# `~*` anchored alternation -> NOT simplified: it falls back to a regex match,
# because `IN`/`=` cannot express case-insensitive matching.
query TT
EXPLAIN SELECT * FROM test_regex_utf8view WHERE s ~* '^(foo|bazzz)$'
----
logical_plan
01)Filter: test_regex_utf8view.s ~* Utf8View("^(foo|bazzz)$")
02)--TableScan: test_regex_utf8view projection=[s]

# `!~` -> `!= Utf8View(..)`
query TT
EXPLAIN SELECT * FROM test_regex_utf8view WHERE s !~ '^Bazzz$'
----
logical_plan
01)Filter: test_regex_utf8view.s != Utf8View("Bazzz")
02)--TableScan: test_regex_utf8view projection=[s]

# `!~*` -> `NOT ILIKE Utf8View(..)`
query TT
EXPLAIN SELECT * FROM test_regex_utf8view WHERE s !~* '^bazzz$'
----
logical_plan
01)Filter: test_regex_utf8view.s NOT ILIKE Utf8View("bazzz")
02)--TableScan: test_regex_utf8view projection=[s]

# `!~` anchored alternation -> AND of `!= Utf8View(..)` comparisons.
query TT
EXPLAIN SELECT * FROM test_regex_utf8view WHERE s !~ '^(foo|Bazzz)$'
----
logical_plan
01)Filter: test_regex_utf8view.s != Utf8View("foo") AND test_regex_utf8view.s != Utf8View("Bazzz")
02)--TableScan: test_regex_utf8view projection=[s]

# `!~*` anchored alternation -> NOT simplified: it falls back to a regex match,
# same reason as the `~*` alternation above.
query TT
EXPLAIN SELECT * FROM test_regex_utf8view WHERE s !~* '^(foo|bazzz)$'
----
logical_plan
01)Filter: test_regex_utf8view.s !~* Utf8View("^(foo|bazzz)$")
02)--TableScan: test_regex_utf8view projection=[s]

statement ok
set datafusion.explain.logical_plan_only = false

# Result assertions
query T
SELECT * FROM test_regex_utf8view WHERE s ~ '^Bazzz$'
----
Bazzz

query T
SELECT * FROM test_regex_utf8view WHERE s ~ '^(foo|Bazzz)$'
----
foo
Bazzz

# Case-insensitive anchored match over Utf8View: must be simplified to ILIKE
# (not a case-sensitive Eq) and must keep operand types as Utf8View.
query T
SELECT * FROM test_regex_utf8view WHERE s ~* '^bazzz$'
----
Bazzz
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

How does this assert the expected result?
Neither the optimization nor the type is asserted.
Maybe use EXPLAIN ... and assert its output instead?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sounds good to me; it makes the intent clearer. I've already added EXPLAIN assertions for all the anchored cases.


# Case-insensitive anchored alternation over Utf8View
query T rowsort
SELECT * FROM test_regex_utf8view WHERE s ~* '^(foo|bazzz)$'
----
Bazzz
foo

query T rowsort
SELECT * FROM test_regex_utf8view WHERE s !~ '^Bazzz$'
----
foo

query T rowsort
SELECT * FROM test_regex_utf8view WHERE s !~* '^bazzz$'
----
foo

# Both rows match the alternation, so the negated forms return nothing.
query T rowsort
SELECT * FROM test_regex_utf8view WHERE s !~ '^(foo|Bazzz)$'
----

query T rowsort
SELECT * FROM test_regex_utf8view WHERE s !~* '^(foo|bazzz)$'
----

statement ok
DROP TABLE test_regex_utf8view;

query T
SELECT * FROM test WHERE column1 ~* 'z'
----
Bazzz
ZZZZZ

query T
SELECT * FROM test WHERE column1 ~* '^barrr$'
----
Barrr

query T
SELECT * FROM test WHERE column1 ~* '^(barrr|bazzz)$'
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Tests for the negated regex operators (!~ and !~*) are missing.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Added !~ and !~* coverage.

----
Barrr
Bazzz

query T rowsort
SELECT * FROM test WHERE column1 !~ '^Bazzz$'
----
Barrr
ZZZZZ
foo

query T rowsort
SELECT * FROM test WHERE column1 !~* '^barrr$'
----
Bazzz
ZZZZZ
foo

query T
SELECT * FROM test WHERE column1 !~ 'z'
----
Expand Down
Loading