From d52ef2ac564f0164f6c0128595e787cb517aff78 Mon Sep 17 00:00:00 2001 From: Andrew Adams Date: Wed, 22 Apr 2026 10:15:48 -0700 Subject: [PATCH 1/2] Fix bounds inference for implicit pure def with RVar args (#9102) When a Func's first definition is an update whose LHS uses an RVar directly (e.g. `h(r.x) += ...`), define_base_case synthesized an implicit pure definition but reused the RVar's name for the pure dimension. The resulting name collision caused bounds inference to resolve the update's RVar loop bounds to the pure dimension's output-buffer bounds instead of the RDom's, which broke scheduling directives like vectorize/unroll on the RVar. Treat Variables with a defined reduction_domain the same way we treat Variables with a defined param: generate a fresh pure-arg Var instead of reusing the name. Co-Authored-By: Claude Opus 4.7 (1M context) --- src/Func.cpp | 2 +- test/correctness/CMakeLists.txt | 1 + .../implicit_pure_def_with_rvar_args.cpp | 95 +++++++++++++++++++ 3 files changed, 97 insertions(+), 1 deletion(-) create mode 100644 test/correctness/implicit_pure_def_with_rvar_args.cpp diff --git a/src/Func.cpp b/src/Func.cpp index a081116762df..23d03447ba28 100644 --- a/src/Func.cpp +++ b/src/Func.cpp @@ -3190,7 +3190,7 @@ Func define_base_case(const Internal::Function &func, const vector &a, con // Reuse names of existing pure args for (size_t i = 0; i < a.size(); i++) { if (const Variable *v = a[i].as()) { - if (!v->param.defined()) { + if (!v->param.defined() && !v->reduction_domain.defined()) { pure_args[i] = Var(v->name); } } else { diff --git a/test/correctness/CMakeLists.txt b/test/correctness/CMakeLists.txt index a25b077a1abb..c6c90f833db0 100644 --- a/test/correctness/CMakeLists.txt +++ b/test/correctness/CMakeLists.txt @@ -180,6 +180,7 @@ tests(GROUPS correctness image_of_lists.cpp implicit_args.cpp implicit_args_tests.cpp + implicit_pure_def_with_rvar_args.cpp in_place.cpp indexing_access_undef.cpp infer_arguments.cpp diff --git a/test/correctness/implicit_pure_def_with_rvar_args.cpp b/test/correctness/implicit_pure_def_with_rvar_args.cpp new file mode 100644 index 000000000000..c324fd6b8486 --- /dev/null +++ b/test/correctness/implicit_pure_def_with_rvar_args.cpp @@ -0,0 +1,95 @@ +#include "Halide.h" +#include + +using namespace Halide; + +// Regression test for https://github.com/halide/Halide/issues/9102 +// +// When a Func's first definition is an update that uses RVars directly +// as LHS args (e.g. h(r.x) += ...), Halide auto-generates an implicit +// pure definition. The pure dimension must not share a name with the +// RVar, or bounds inference resolves the update's RVar loop bounds to +// the pure dimension's (buffer-driven) bounds instead of the RDom's. + +int main(int argc, char **argv) { + Var x; + + // Case 1: the original reproducer. Schedule references r.x / r.y on + // the update. Previously the r.x loop incorrectly inherited bounds + // from h's output buffer, which broke vectorization / unrolling. + { + RDom r(0, 15, 0, 8); + Func f{"f"}, g{"g"}, h{"h"}; + f(x) = x + 1; + g(x) = 2 * x + 3; + + h(r.x) += f(r.x + r.y) * g(r.y); + + f.compute_root(); + g.compute_root(); + h.update().atomic().vectorize(r.x).unroll(r.y); + + Buffer out = h.realize({15}); + for (int i = 0; i < 15; i++) { + int expected = 0; + for (int j = 0; j < 8; j++) { + expected += (i + j + 1) * (2 * j + 3); + } + if (out(i) != expected) { + printf("Case 1: out(%d) = %d, expected %d\n", i, out(i), expected); + return 1; + } + } + } + + // Case 2: same computation, but with an explicit pure definition. + // This was the user's workaround; it must still give the same answer. + { + RDom r(0, 15, 0, 8); + Func f{"f2"}, g{"g2"}, h{"h2"}; + f(x) = x + 1; + g(x) = 2 * x + 3; + + h(x) = 0; + h(r.x) += f(r.x + r.y) * g(r.y); + + f.compute_root(); + g.compute_root(); + h.update().atomic().vectorize(r.x).unroll(r.y); + + Buffer out = h.realize({15}); + for (int i = 0; i < 15; i++) { + int expected = 0; + for (int j = 0; j < 8; j++) { + expected += (i + j + 1) * (2 * j + 3); + } + if (out(i) != expected) { + printf("Case 2: out(%d) = %d, expected %d\n", i, out(i), expected); + return 1; + } + } + } + + // Case 3: RDom bounds narrower than the realized output. Without a + // correct loop bound from the RDom, the update would either write + // out-of-bounds or leave tail entries uninitialized. + { + RDom r(2, 5); + Func h{"h3"}; + h(r) += cast(r) * 10; + + h.update().vectorize(r, 4, TailStrategy::GuardWithIf); + + Buffer out = h.realize({10}); + for (int i = 0; i < 10; i++) { + int expected = (i >= 2 && i < 7) ? i * 10 : 0; + if (out(i) != expected) { + printf("Case 3: out(%d) = %d, expected %d\n", i, out(i), expected); + return 1; + } + } + } + + printf("Success!\n"); + return 0; +} From f7fa54af83c1ef5ec6ccaf7727bbfb9f295294c4 Mon Sep 17 00:00:00 2001 From: Andrew Adams Date: Wed, 22 Apr 2026 12:53:52 -0700 Subject: [PATCH 2/2] Use round RDom extent in implicit_pure_def_with_rvar_args test LLVM's ARM64 backend fails to widen the 15-wide int32 vector store produced by the original reproducer's vectorize(r.x) schedule. The bug under test is about bounds inference, not vector widths, so round the RDom extent up to 16 so vectorize lowers to clean NEON stores on every supported target. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../implicit_pure_def_with_rvar_args.cpp | 26 ++++++++++--------- 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/test/correctness/implicit_pure_def_with_rvar_args.cpp b/test/correctness/implicit_pure_def_with_rvar_args.cpp index c324fd6b8486..3449e0adadca 100644 --- a/test/correctness/implicit_pure_def_with_rvar_args.cpp +++ b/test/correctness/implicit_pure_def_with_rvar_args.cpp @@ -14,11 +14,12 @@ using namespace Halide; int main(int argc, char **argv) { Var x; - // Case 1: the original reproducer. Schedule references r.x / r.y on - // the update. Previously the r.x loop incorrectly inherited bounds - // from h's output buffer, which broke vectorization / unrolling. + // Case 1: the original reproducer. vectorize(r.x) requires the RVar + // loop to have a constant extent; before the fix, bounds inference + // produced a symbolic extent from the output buffer and this + // schedule failed to compile. { - RDom r(0, 15, 0, 8); + RDom r(0, 16, 0, 8); Func f{"f"}, g{"g"}, h{"h"}; f(x) = x + 1; g(x) = 2 * x + 3; @@ -29,8 +30,8 @@ int main(int argc, char **argv) { g.compute_root(); h.update().atomic().vectorize(r.x).unroll(r.y); - Buffer out = h.realize({15}); - for (int i = 0; i < 15; i++) { + Buffer out = h.realize({16}); + for (int i = 0; i < 16; i++) { int expected = 0; for (int j = 0; j < 8; j++) { expected += (i + j + 1) * (2 * j + 3); @@ -45,7 +46,7 @@ int main(int argc, char **argv) { // Case 2: same computation, but with an explicit pure definition. // This was the user's workaround; it must still give the same answer. { - RDom r(0, 15, 0, 8); + RDom r(0, 16, 0, 8); Func f{"f2"}, g{"g2"}, h{"h2"}; f(x) = x + 1; g(x) = 2 * x + 3; @@ -57,8 +58,8 @@ int main(int argc, char **argv) { g.compute_root(); h.update().atomic().vectorize(r.x).unroll(r.y); - Buffer out = h.realize({15}); - for (int i = 0; i < 15; i++) { + Buffer out = h.realize({16}); + for (int i = 0; i < 16; i++) { int expected = 0; for (int j = 0; j < 8; j++) { expected += (i + j + 1) * (2 * j + 3); @@ -70,9 +71,10 @@ int main(int argc, char **argv) { } } - // Case 3: RDom bounds narrower than the realized output. Without a - // correct loop bound from the RDom, the update would either write - // out-of-bounds or leave tail entries uninitialized. + // Case 3: RDom bounds narrower than the realized output. Exercises + // the underlying bounds bug directly (no vectorize needed): without + // a correct loop bound from the RDom, the update would write to + // indices outside the RDom, producing wrong values at the ends. { RDom r(2, 5); Func h{"h3"};