From d52ef2ac564f0164f6c0128595e787cb517aff78 Mon Sep 17 00:00:00 2001
From: Andrew Adams <andrew.b.adams@gmail.com>
Date: Wed, 22 Apr 2026 10:15:48 -0700
Subject: [PATCH 1/2] Fix bounds inference for implicit pure def with RVar args
 (#9102)

When a Func's first definition is an update whose LHS uses an RVar
directly (e.g. `h(r.x) += ...`), define_base_case synthesized an
implicit pure definition but reused the RVar's name for the pure
dimension. The resulting name collision caused bounds inference to
resolve the update's RVar loop bounds to the pure dimension's
output-buffer bounds instead of the RDom's, which broke scheduling
directives like vectorize/unroll on the RVar.

Treat Variables with a defined reduction_domain the same way we treat
Variables with a defined param: generate a fresh pure-arg Var instead
of reusing the name.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 src/Func.cpp                                  |  2 +-
 test/correctness/CMakeLists.txt               |  1 +
 .../implicit_pure_def_with_rvar_args.cpp      | 95 +++++++++++++++++++
 3 files changed, 97 insertions(+), 1 deletion(-)
 create mode 100644 test/correctness/implicit_pure_def_with_rvar_args.cpp
diff --git a/src/Func.cpp b/src/Func.cpp
index a081116762df..23d03447ba28 100644
--- a/src/Func.cpp
+++ b/src/Func.cpp
@@ -3190,7 +3190,7 @@ Func define_base_case(const Internal::Function &func, const vector<Expr> &a, con
     // Reuse names of existing pure args
     for (size_t i = 0; i < a.size(); i++) {
         if (const Variable *v = a[i].as<Variable>()) {
-            if (!v->param.defined()) {
+            if (!v->param.defined() && !v->reduction_domain.defined()) {
                 pure_args[i] = Var(v->name);
             }
         } else {
diff --git a/test/correctness/CMakeLists.txt b/test/correctness/CMakeLists.txt
index a25b077a1abb..c6c90f833db0 100644
--- a/test/correctness/CMakeLists.txt
+++ b/test/correctness/CMakeLists.txt
@@ -180,6 +180,7 @@ tests(GROUPS correctness
       image_of_lists.cpp
       implicit_args.cpp
       implicit_args_tests.cpp
+      implicit_pure_def_with_rvar_args.cpp
       in_place.cpp
       indexing_access_undef.cpp
       infer_arguments.cpp
diff --git a/test/correctness/implicit_pure_def_with_rvar_args.cpp b/test/correctness/implicit_pure_def_with_rvar_args.cpp
new file mode 100644
index 000000000000..c324fd6b8486
--- /dev/null
+++ b/test/correctness/implicit_pure_def_with_rvar_args.cpp
@@ -0,0 +1,95 @@
+#include "Halide.h"
+#include <cstdio>
+
+using namespace Halide;
+
+// Regression test for https://github.com/halide/Halide/issues/9102
+//
+// When a Func's first definition is an update that uses RVars directly
+// as LHS args (e.g. h(r.x) += ...), Halide auto-generates an implicit
+// pure definition. The pure dimension must not share a name with the
+// RVar, or bounds inference resolves the update's RVar loop bounds to
+// the pure dimension's (buffer-driven) bounds instead of the RDom's.
+
+int main(int argc, char **argv) {
+    Var x;
+
+    // Case 1: the original reproducer. Schedule references r.x / r.y on
+    // the update. Previously the r.x loop incorrectly inherited bounds
+    // from h's output buffer, which broke vectorization / unrolling.
+    {
+        RDom r(0, 15, 0, 8);
+        Func f{"f"}, g{"g"}, h{"h"};
+        f(x) = x + 1;
+        g(x) = 2 * x + 3;
+
+        h(r.x) += f(r.x + r.y) * g(r.y);
+
+        f.compute_root();
+        g.compute_root();
+        h.update().atomic().vectorize(r.x).unroll(r.y);
+
+        Buffer<int> out = h.realize({15});
+        for (int i = 0; i < 15; i++) {
+            int expected = 0;
+            for (int j = 0; j < 8; j++) {
+                expected += (i + j + 1) * (2 * j + 3);
+            }
+            if (out(i) != expected) {
+                printf("Case 1: out(%d) = %d, expected %d\n", i, out(i), expected);
+                return 1;
+            }
+        }
+    }
+
+    // Case 2: same computation, but with an explicit pure definition.
+    // This was the user's workaround; it must still give the same answer.
+    {
+        RDom r(0, 15, 0, 8);
+        Func f{"f2"}, g{"g2"}, h{"h2"};
+        f(x) = x + 1;
+        g(x) = 2 * x + 3;
+
+        h(x) = 0;
+        h(r.x) += f(r.x + r.y) * g(r.y);
+
+        f.compute_root();
+        g.compute_root();
+        h.update().atomic().vectorize(r.x).unroll(r.y);
+
+        Buffer<int> out = h.realize({15});
+        for (int i = 0; i < 15; i++) {
+            int expected = 0;
+            for (int j = 0; j < 8; j++) {
+                expected += (i + j + 1) * (2 * j + 3);
+            }
+            if (out(i) != expected) {
+                printf("Case 2: out(%d) = %d, expected %d\n", i, out(i), expected);
+                return 1;
+            }
+        }
+    }
+
+    // Case 3: RDom bounds narrower than the realized output. Without a
+    // correct loop bound from the RDom, the update would either write
+    // out-of-bounds or leave tail entries uninitialized.
+    {
+        RDom r(2, 5);
+        Func h{"h3"};
+        h(r) += cast<int>(r) * 10;
+
+        h.update().vectorize(r, 4, TailStrategy::GuardWithIf);
+
+        Buffer<int> out = h.realize({10});
+        for (int i = 0; i < 10; i++) {
+            int expected = (i >= 2 && i < 7) ? i * 10 : 0;
+            if (out(i) != expected) {
+                printf("Case 3: out(%d) = %d, expected %d\n", i, out(i), expected);
+                return 1;
+            }
+        }
+    }
+
+    printf("Success!\n");
+    return 0;
+}

From f7fa54af83c1ef5ec6ccaf7727bbfb9f295294c4 Mon Sep 17 00:00:00 2001
From: Andrew Adams <andrew.b.adams@gmail.com>
Date: Wed, 22 Apr 2026 12:53:52 -0700
Subject: [PATCH 2/2] Use round RDom extent in implicit_pure_def_with_rvar_args
 test

LLVM's ARM64 backend fails to widen the 15-wide int32 vector store
produced by the original reproducer's vectorize(r.x) schedule. The
bug under test is about bounds inference, not vector widths, so round
the RDom extent up to 16, a power of two, so that vectorize(r.x)
lowers to clean vector stores (NEON on ARM64) on every supported
target.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .../implicit_pure_def_with_rvar_args.cpp      | 26 ++++++++++---------
 1 file changed, 14 insertions(+), 12 deletions(-)

diff --git a/test/correctness/implicit_pure_def_with_rvar_args.cpp b/test/correctness/implicit_pure_def_with_rvar_args.cpp
index c324fd6b8486..3449e0adadca 100644
--- a/test/correctness/implicit_pure_def_with_rvar_args.cpp
+++ b/test/correctness/implicit_pure_def_with_rvar_args.cpp
@@ -14,11 +14,12 @@ using namespace Halide;
 int main(int argc, char **argv) {
     Var x;
 
-    // Case 1: the original reproducer. Schedule references r.x / r.y on
-    // the update. Previously the r.x loop incorrectly inherited bounds
-    // from h's output buffer, which broke vectorization / unrolling.
+    // Case 1: the original reproducer (x extent rounded from 15 to 16).
+    // vectorize(r.x) requires the RVar loop to have a constant extent;
+    // before the fix, bounds inference produced a symbolic extent from
+    // the output buffer and this schedule failed to compile.
     {
-        RDom r(0, 15, 0, 8);
+        RDom r(0, 16, 0, 8);
         Func f{"f"}, g{"g"}, h{"h"};
         f(x) = x + 1;
         g(x) = 2 * x + 3;
@@ -29,8 +30,8 @@ int main(int argc, char **argv) {
         g.compute_root();
         h.update().atomic().vectorize(r.x).unroll(r.y);
 
-        Buffer<int> out = h.realize({15});
-        for (int i = 0; i < 15; i++) {
+        Buffer<int> out = h.realize({16});
+        for (int i = 0; i < 16; i++) {
             int expected = 0;
             for (int j = 0; j < 8; j++) {
                 expected += (i + j + 1) * (2 * j + 3);
@@ -45,7 +46,7 @@ int main(int argc, char **argv) {
     // Case 2: same computation, but with an explicit pure definition.
     // This was the user's workaround; it must still give the same answer.
     {
-        RDom r(0, 15, 0, 8);
+        RDom r(0, 16, 0, 8);
         Func f{"f2"}, g{"g2"}, h{"h2"};
         f(x) = x + 1;
         g(x) = 2 * x + 3;
@@ -57,8 +58,8 @@ int main(int argc, char **argv) {
         g.compute_root();
         h.update().atomic().vectorize(r.x).unroll(r.y);
 
-        Buffer<int> out = h.realize({15});
-        for (int i = 0; i < 15; i++) {
+        Buffer<int> out = h.realize({16});
+        for (int i = 0; i < 16; i++) {
             int expected = 0;
             for (int j = 0; j < 8; j++) {
                 expected += (i + j + 1) * (2 * j + 3);
@@ -70,9 +71,10 @@ int main(int argc, char **argv) {
         }
     }
 
-    // Case 3: RDom bounds narrower than the realized output. Without a
-    // correct loop bound from the RDom, the update would either write
-    // out-of-bounds or leave tail entries uninitialized.
+    // Case 3: RDom bounds narrower than the realized output. Exercises
+    // the underlying bounds bug directly (no vectorize needed): without
+    // a correct loop bound from the RDom, the update would write to
+    // indices outside the RDom, producing wrong values at the ends.
     {
         RDom r(2, 5);
         Func h{"h3"};