From 1f3fed4ac4161f56c88a9e310c4611d8c29c2f50 Mon Sep 17 00:00:00 2001
From: devshgraphicsprogramming <devsh@devsh.eu>
Date: Thu, 16 Apr 2026 15:09:27 +0200
Subject: [PATCH 01/12] factorize out a bunch of stuff, make the Spherical
 Rectangle Solid Angle Sampler global like its Triangle counterpart, we need
 to keep the basis around

---
 examples_tests                                |   2 +-
 .../hlsl/sampling/spherical_rectangle.hlsl    | 116 ++++++++++++------
 .../hlsl/shapes/spherical_rectangle.hlsl      |   5 +-
 3 files changed, 86 insertions(+), 37 deletions(-)
diff --git a/examples_tests b/examples_tests
index 2fd21fd891..5eeb47351e 160000
--- a/examples_tests
+++ b/examples_tests
@@ -1 +1 @@
-Subproject commit 2fd21fd8917660d2a559b97d80afa95dd6b591d5
+Subproject commit 5eeb47351e29cb6ee02f8d8319f131a2c012b5a2
diff --git a/include/nbl/builtin/hlsl/sampling/spherical_rectangle.hlsl b/include/nbl/builtin/hlsl/sampling/spherical_rectangle.hlsl
index 131cc92d70..8029205384 100644
--- a/include/nbl/builtin/hlsl/sampling/spherical_rectangle.hlsl
+++ b/include/nbl/builtin/hlsl/sampling/spherical_rectangle.hlsl
@@ -26,6 +26,7 @@ struct SphericalRectangle
     using vector2_type = vector<T, 2>;
     using vector3_type = vector<T, 3>;
     using vector4_type = vector<T, 4>;
+    using matrix3x3_type = matrix<T, 3, 3>;
 
     // BackwardTractableSampler concept types
     using domain_type = vector2_type;
@@ -35,16 +36,15 @@ struct SphericalRectangle
 
     struct cache_type {};
 
-    NBL_CONSTEXPR_STATIC_INLINE scalar_type ClampEps = 1e-5;
-
     static SphericalRectangle<T> create(NBL_CONST_REF_ARG(shapes::SphericalRectangle<T>) rect, const vector3_type observer)
     {
-        return create(rect.solidAngle(observer), rect.extents);
+        return create(rect.basis,rect.solidAngle(observer),rect.extents);
     }
 
-    static SphericalRectangle<T> create(NBL_CONST_REF_ARG(typename shapes::SphericalRectangle<T>::solid_angle_type) sa, const vector2_type _extents)
+    static SphericalRectangle<T> create(const matrix3x3_type _basis, NBL_CONST_REF_ARG(typename shapes::SphericalRectangle<T>::solid_angle_type) sa, const vector2_type _extents)
     {
         SphericalRectangle<T> retval;
+        retval.basis = _basis;
         retval.r0 = sa.r0;
         retval.extents = _extents;
 
@@ -62,7 +62,7 @@ struct SphericalRectangle
     // Create directly from a local-frame corner position and rectangle extents.
     // Use when you already know r0 (e.g. from a gnomonic projection) and don't
     // need the shapes::SphericalRectangle + solidAngle(observer) roundtrip.
-    static SphericalRectangle<T> create(const vector3_type _r0, const vector2_type _extents)
+    static SphericalRectangle<T> create(const matrix3x3_type _basis, const vector3_type _r0, const vector2_type _extents)
     {
         // Same math as shapes::SphericalRectangle::solidAngle() but without
         // the mul(basis, origin - observer) step since we already have r0.
@@ -82,13 +82,22 @@ struct SphericalRectangle
         acc.addCosine(sa.cosGamma[3]);
         sa.value = acc.getSumOfArccos() - scalar_type(2.0) * numbers::pi<scalar_type>;
 
-        return create(sa, _extents);
+        return create(_basis, sa, _extents);
     }
 
     // shared core of generate and generateSurfaceOffset
     // returns (xu, hv, d) packed into a vector3; caller derives either 2D offset or 3D direction
-    vector3_type __generate(const domain_type u) NBL_CONST_MEMBER_FUNC
+    struct SCommonGen
+    {
+        scalar_type xu;
+        scalar_type d2;
+        scalar_type hv;
+        scalar_type cosElevation2;
+    };
+    SCommonGen __generate(const domain_type u) NBL_CONST_MEMBER_FUNC
     {
+        SCommonGen retval;
+
         // algorithm needs r0.z < 0; use -abs(r0.z) without storing the flip
         const scalar_type negAbsR0z = -hlsl::abs(r0.z);
         const scalar_type r0zSq = r0.z * r0.z;
@@ -103,44 +112,80 @@ struct SphericalRectangle
         const scalar_type rcpCu_2 = hlsl::max<scalar_type>(absNegFu * absNegFu + b0 * b0, scalar_type(1.0));
         // sign(negFu) = sign(numerator) * sign(sin(au)); sin(au) < 0 iff au > PI
         const scalar_type negFuSign = hlsl::select((au > numbers::pi<scalar_type>) != (numerator < scalar_type(0.0)), scalar_type(-1.0), scalar_type(1.0));
-        scalar_type xu = negAbsR0z * negFuSign * hlsl::rsqrt<scalar_type>(rcpCu_2 - scalar_type(1.0));
-        xu = hlsl::clamp<scalar_type>(xu, r0.x, r1.x); // avoid Infs
-        const scalar_type d_2 = xu * xu + r0zSq;
-        const scalar_type d = hlsl::sqrt<scalar_type>(d_2);
+        retval.xu = negAbsR0z * negFuSign * hlsl::rsqrt<scalar_type>(rcpCu_2 - scalar_type(1.0));
+        retval.xu = hlsl::clamp<scalar_type>(retval.xu, r0.x, r1.x); // avoid Infs
+        // TODO: see if we can compute this direct from definition of `d_2` ignoring the clamp on `xu`  and `cosElevation2` and reclamping this result in a different way
+        // with `d2 = r0zSq*(rcpCu_2*rcpCu_2-rcpCu_2)` or `d2 = r0zSq*rcpCu_2*(rcpCu_2-1)`
+        retval.d2 = retval.xu * retval.xu + r0zSq;
 
-        const scalar_type h0 = r0.y * hlsl::rsqrt<scalar_type>(d_2 + r0.y * r0.y);
-        const scalar_type h1 = r1.y * hlsl::rsqrt<scalar_type>(d_2 + r1.y * r1.y);
-        const scalar_type hv = h0 + u.y * (h1 - h0);
+        const scalar_type h0 = r0.y * hlsl::rsqrt<scalar_type>(retval.d2 + r0.y * r0.y);
+        const scalar_type h1 = r1.y * hlsl::rsqrt<scalar_type>(retval.d2 + r1.y * r1.y);
+        retval.hv = h0 + u.y * (h1 - h0);
+        retval.cosElevation2 = scalar_type(1.0) - hlsl::min<scalar_type>(retval.hv * retval.hv,1);
 
-        return vector3_type(xu, hv, d);
+        return retval;
     }
 
     // returns a normalized 3D direction in the local frame with correct r0.z sign
+    vector3_type generateNormalizedLocal(const domain_type u, NBL_REF_ARG(cache_type) cache, NBL_REF_ARG(scalar_type) hitDist) NBL_CONST_MEMBER_FUNC
+    {
+        const SCommonGen core = __generate(u);
+        scalar_type cosElevationOverD = hlsl::rsqrt(core.d2/core.cosElevation2);
+        // TODO: or shall we do some other more sophisticated clamp or correction? Is this even the right one to use?
+        cosElevationOverD = hlsl::select(hlsl::isnan(cosElevationOverD),1.f,cosElevationOverD);
+
+        // TODO: investigate if due to precision we need to compute this as a `sqrt` then the quantity being computed is `core.cosElevation2/core.d2`
+        // which what alss `generateLocalBasisXY` needs, in which case `__generate` can already compute it and return it as `hitDist2`
+        hitDist = 1.f/cosElevationOverD;
+
+        const vector3_type retval = vector3_type(core.xu / hitDist, core.hv, r0.z / hitDist);
+        assert(!hlsl::isnan(computeHitT(retval)));
+        return retval;
+    }
+
     codomain_type generate(const domain_type u, NBL_REF_ARG(cache_type) cache) NBL_CONST_MEMBER_FUNC
     {
-        const vector3_type core = __generate(u);
-        const scalar_type xu = core.x;
-        const scalar_type hv = core.y;
-        const scalar_type d = core.z;
-        const scalar_type hv2 = hv * hv;
-        const scalar_type cosElevation = hlsl::sqrt<scalar_type>(hlsl::max<scalar_type>(scalar_type(1.0) - hv2, scalar_type(0.0)));
-        const scalar_type rcpD = scalar_type(1.0) / d;
-
-        return vector3_type(xu * cosElevation * rcpD, hv, r0.z * cosElevation * rcpD);
+        scalar_type dummy;
+        const vector3_type localL = generateNormalizedLocal(u,cache,dummy);
+        // could return `hlsl::mul(hlsl::tranpose(basis),localL)` or just this
+        return basis[0]*localL[0]+basis[1]*localL[1]+basis[2]*localL[2];
     }
 
-    // returns a 2D offset on the rectangle surface from the r0 corner
-    vector2_type generateSurfaceOffset(const domain_type u, NBL_REF_ARG(cache_type) cache) NBL_CONST_MEMBER_FUNC
+    // utility to determine maxT for a ray from L shot from the origin which we're sure intersects the rectangle
+    scalar_type computeHitT(const vector3_type L)
     {
-        const vector3_type core = __generate(u);
-        const scalar_type xu = core.x;
-        const scalar_type hv = core.y;
-        const scalar_type d = core.z;
+        const scalar_type retval = hlsl::abs(r0.z/hlsl::dot(L,basis[2]));
+        {
+            const vector3_type hitPointRelative = L*retval;
+            const vector2_type uv = mul(basis,L).xy-r0.xy;
+            assert(uv[0]>=0.0 && uv[0]<=rectExtents[0]);
+            assert(uv[1]>=0.0 && uv[1]<=rectExtents[1]);
+        }
+        return retval;
+    }
+
+    // returns a 2D offset on the rectangle surface including the r0 corner -useful for generating unnormalized worldspace L
+    vector2_type generateLocalBasisXY(const domain_type u, NBL_REF_ARG(cache_type) cache) NBL_CONST_MEMBER_FUNC
+    {
+        const SCommonGen core = __generate(u);
         const scalar_type r1y = r0.y + extents.y;
-        const scalar_type hv2 = hv * hv;
-        const scalar_type yv = hlsl::mix(r1y, (hv * d) / hlsl::sqrt<scalar_type>(scalar_type(1.0) - hv2), hv2 < scalar_type(1.0) - ClampEps);
+        // TODO: see if we can compute this direct from definition of `d_2` ignoring the clamp on `xu`  and `cosElevation2`
+        const scalar_type yv = core.hv * hlsl::rsqrt(core.cosElevation2/core.d2);
+
+        // fun fact, when one of the operands to min or max is NaN, the SPIR-V builtin will select the other one
+        // TODO: maybe try just `min(yv,ry1)`
+        const vector2_type retval = vector2_type(core.xu,hlsl::clamp(yv,r0.y,r1y));
+        assert(retval[0] >= r0.x && retval[1] >= r0.y);
+        assert(retval[0] <= r0.x+rectExtents[0] && retval[1] <= r0.y+rectExtents[1]);
+        return retval;
+    }
 
-        return vector2_type((xu - r0.x), (yv - r0.y));
+    // its basically the hitpoint minus the observer origin
+    codomain_type generateUnnormalized(const domain_type u, NBL_REF_ARG(cache_type) cache) NBL_CONST_MEMBER_FUNC
+    {
+        const vector2_type localXY = generateLocalBasisXY(u,cache);
+        // the `localXY` already contains r0.xy
+        return basis[0]*localXY[0]+basis[1]*localXY[1]+basis[2]*r0.z;
     }
 
     density_type forwardPdf(const domain_type u, const cache_type cache) NBL_CONST_MEMBER_FUNC
@@ -163,12 +208,13 @@ struct SphericalRectangle
         return backwardPdf(L);
     }
 
+    matrix3x3_type basis;
+    vector3_type r0;
+    vector2_type extents;
     scalar_type solidAngle;
     scalar_type k;
     scalar_type b0;
     scalar_type b1;
-    vector3_type r0;
-    vector2_type extents;
 };
 
 } // namespace sampling
diff --git a/include/nbl/builtin/hlsl/shapes/spherical_rectangle.hlsl b/include/nbl/builtin/hlsl/shapes/spherical_rectangle.hlsl
index 5895e4bc80..bc864f7a19 100644
--- a/include/nbl/builtin/hlsl/shapes/spherical_rectangle.hlsl
+++ b/include/nbl/builtin/hlsl/shapes/spherical_rectangle.hlsl
@@ -102,7 +102,8 @@ struct SphericalRectangle
         retval.basis[0] = compressed.right / retval.extents[0];
         retval.basis[1] = compressed.up / retval.extents[1];
         assert(hlsl::abs(hlsl::dot(retval.basis[0], retval.basis[1])) < scalar_type(1e-5));
-        retval.basis[2] = hlsl::normalize(hlsl::cross(retval.basis[0], retval.basis[1]));
+        // don't normalize, the `right` and `up` vectors are orthogonal and the two bases are normalized already!
+        retval.basis[2] = hlsl::cross(retval.basis[0], retval.basis[1]);
         return retval;
     }
 
@@ -184,6 +185,8 @@ struct SphericalRectangle
         // rcpLen[i]*rcpLen[j] for adjacent pairs: (0,1), (1,2), (2,3), (3,0)
         const vector4_type cos_sides = unnormDots * rcpLen * rcpLen.yzwx;
 
+        // TODO: there's the same opportunity for optimization of this as the Spherical Triangle
+        // https://www.linkedin.com/posts/matt-kielan-9b054a165_untitled-graph-activity-7442910005671923712-jHz6?utm_source=share&utm_medium=member_desktop&rcm=ACoAACdp2RQBqq2bJfC2zxpsme-vRv2zh9oP-8E
         const vector4_type pyramidAngles = hlsl::acos<vector4_type>(cos_sides);
 
         return hlsl::dot(pyramidAngles, externalProducts) * scalar_type(0.5);

From 1df4116e33dfb26f6a8fc81b109b8f3c27a97146 Mon Sep 17 00:00:00 2001
From: devshgraphicsprogramming <devsh@devsh.eu>
Date: Thu, 16 Apr 2026 23:14:57 +0200
Subject: [PATCH 02/12] epic optimization!

---
 .../hlsl/shapes/spherical_triangle.hlsl       | 52 ++++++++++++-------
 1 file changed, 34 insertions(+), 18 deletions(-)

diff --git a/include/nbl/builtin/hlsl/shapes/spherical_triangle.hlsl b/include/nbl/builtin/hlsl/shapes/spherical_triangle.hlsl
index 1a5681b39e..4ebe52d312 100644
--- a/include/nbl/builtin/hlsl/shapes/spherical_triangle.hlsl
+++ b/include/nbl/builtin/hlsl/shapes/spherical_triangle.hlsl
@@ -20,6 +20,26 @@ namespace hlsl
 namespace shapes
 {
 
+// TODO: move to where fast_acos lives
+template<typename T, int order=2>
+T acos_csc_approx(const T arg)
+{
+    const T u = hlsl::log2(_static_cast<T>(1)+arg);
+    // The curve fit "revealed in a dream" to me is `exp2(F(log2(x+1)))` where `F(u)` is a polynomial
+    // I have a feeling that a polynomial of ((Au+B)u+C)u+D could be sufficient if it has following properties:
+    // `F(0) = 0` and
+    // `F(u) <= log2(\frac{\cos^{-1}\left(2^{x}-1\right)}{\sqrt{1-\left(2^{x}-1\right)^{2}}})` because you want to consistently under-estimate the Projected Solid Angle to avoid creating energy
+    // See https://www.desmos.com/calculator/sdptomhbju
+    // Furthermore we could clip the polynomial calc to `Cu+D or `(Bu+C)u+D` for small arguments
+    T poly;
+    // TODO: actually optimize these constants in real world scenarios (renders)
+    if (order==1)
+        poly = (_static_cast<T>(1)-u)*_static_cast<T>(0.6);
+    else if (order==2)
+        poly = (_static_cast<T>(1)-u)*_static_cast<T>(0.637)+(_static_cast<T>(1) - u * u) * _static_cast<T>(0.0115);
+    return hlsl::exp2<T>(poly);
+}
+
 template<typename T>
 struct SphericalTriangle
 {
@@ -87,34 +107,30 @@ struct SphericalTriangle
             return 0;
 
         matrix<scalar_type, 3, 3> awayFromEdgePlane;
-        awayFromEdgePlane[0] = hlsl::cross(vertices[1], vertices[2]) * csc_sides[0];
-        awayFromEdgePlane[1] = hlsl::cross(vertices[2], vertices[0]) * csc_sides[1];
-        awayFromEdgePlane[2] = hlsl::cross(vertices[0], vertices[1]) * csc_sides[2];
+        // `cross(A,B)*acos(dot(A,B))/sin(1-dot^2)` can be done with `cross(A,B)*acos_csc_approx(dot(A,B))`
+#define ACOS_CSC(I) acos_csc_approx(cos_sides[I])
+//#define ACOS_CSC(I) hlsl::acos(cos_sides[I])*csc_sides[I]
+        awayFromEdgePlane[0] = hlsl::cross(vertices[1], vertices[2]) * ACOS_CSC(0);
+        awayFromEdgePlane[1] = hlsl::cross(vertices[2], vertices[0]) * ACOS_CSC(1);
+        awayFromEdgePlane[2] = hlsl::cross(vertices[0], vertices[1]) * ACOS_CSC(2);
+#undef ACOS_CSC
+        const vector3_type externalProductsWeightedByPyramidAngles = hlsl::mul(/* transposed already */awayFromEdgePlane, receiverNormal);
+
         // The ABS makes it so that the computation is correct for an `abs(cos(theta))` factor which is the projected solid angle used for a BSDF.
+        // It also makes the computation insensitive to the CW or CCW winding of the vertices in the triangle.
         // Proof: Kelvin-Stokes theorem, if you split the set into two along the horizon with constant CCW winding, the `cross` along the shared edge
         // goes in different directions and cancels out, while `acos` of the clipped great arcs corresponding to polygon edges add up to the original sides again.
-        const vector3_type externalProducts = hlsl::abs(hlsl::mul(/* transposed already */awayFromEdgePlane, receiverNormal));
-
-        // Far TODO: `cross(A,B)*acos(dot(A,B))/sin(1-dot^2)` can be done with `cross*acos_csc_approx(dot(A,B))`
-        // We could skip the `csc_sides` factor, and computing `pyramidAngles` and replace them with this approximation weighting before the dot product with the receiver notmal
-        // The curve fit "revealed in a dream" to me is `exp2(F(log2(x+1)))` where `F(u)` is a polynomial, so far I've calculated `F = (1-u)0.635+(1-u^2)0.0118` which gives <5% error until 165 degrees
-        // I have a feeling that a polynomial of ((Au+B)u+C)u+D could be sufficient if it has following properties:
-        // `F(0) = 0` and
-        // `F(u) <= log2(\frac{\cos^{-1}\left(2^{x}-1\right)}{\sqrt{1-\left(2^{x}-1\right)^{2}}})` because you want to consistently under-estimate the Projected Solid Angle to avoid creating energy
-        // See https://www.desmos.com/calculator/sdptomhbju
-        // Furthermore we could clip the polynomial calc to `Cu+D or `(Bu+C)u+D` for small arguments
-        const vector3_type pyramidAngles = hlsl::acos<vector3_type>(cos_sides);
-        // So that triangle covering almost whole hemisphere sums to PI
-        return hlsl::dot(pyramidAngles, externalProducts) * scalar_type(0.5);
+        // The 0.5 is so that triangle covering almost whole hemisphere sums to PI
+        return hlsl::abs(externalProductsWeightedByPyramidAngles[0]+externalProductsWeightedByPyramidAngles[1]+externalProductsWeightedByPyramidAngles[2]) * scalar_type(0.5);
     }
 
     vector3_type vertices[3];
     // angles of vertices with origin, so the sides are INSIDE the sphere
     vector3_type cos_sides;
-    vector3_type csc_sides;
+    vector3_type csc_sides; // TODO: spherical triangle sampling only needs `csc_sides[1]` and possibly `csc_sides[2]`
     // angles between arcs on the sphere, so angles in the TANGENT plane at each vertex
     vector3_type cos_vertices;
-    vector3_type sin_vertices;
+    vector3_type sin_vertices; // TODO: spherical triangle sampling only needs `sin_vertices[0]` a.k.a `sinA`
     scalar_type solid_angle;
 };
 

From a3ed91a1b8354162027adfb2370820299bdbcf2e Mon Sep 17 00:00:00 2001
From: devshgraphicsprogramming <devsh@devsh.eu>
Date: Fri, 17 Apr 2026 02:00:44 +0200
Subject: [PATCH 03/12] `UsePdfAsWeight==false` implementation was 100% wrong

---
 .../projected_spherical_triangle.hlsl         | 59 +++++++++----------
 1 file changed, 27 insertions(+), 32 deletions(-)

diff --git a/include/nbl/builtin/hlsl/sampling/projected_spherical_triangle.hlsl b/include/nbl/builtin/hlsl/sampling/projected_spherical_triangle.hlsl
index c4bc5fcea8..923fc6b537 100644
--- a/include/nbl/builtin/hlsl/sampling/projected_spherical_triangle.hlsl
+++ b/include/nbl/builtin/hlsl/sampling/projected_spherical_triangle.hlsl
@@ -47,73 +47,68 @@ struct ProjectedSphericalTriangle
 
     struct cache_type
     {
-        scalar_type abs_cos_theta;
-        vector2_type warped;
         typename Bilinear<scalar_type>::cache_type bilinearCache;
+        vector3_type L; // TODO: erase when UsePdfAsWeight==false
     };
 
-    // NOTE: produces a degenerate (all-zero) bilinear patch when the receiver normal faces away
-    // from all three triangle vertices, resulting in NaN PDFs (0 * inf). Callers must ensure
-    // at least one vertex has positive projection onto the receiver normal.
+    // Shouldn't produce NAN if all corners have 0 proj solid angle due to min density adds/clamps in the linear sampler
     static ProjectedSphericalTriangle<T,UsePdfAsWeight> create(NBL_REF_ARG(shapes::SphericalTriangle<T>) shape, const vector3_type _receiverNormal, const bool _receiverWasBSDF)
     {
         ProjectedSphericalTriangle<T,UsePdfAsWeight> retval;
-       retval.sphtri = SphericalTriangle<T, UsePdfAsWeight>::create(shape);
-        retval.receiverNormal = _receiverNormal;
-        retval.receiverWasBSDF = _receiverWasBSDF;
+        retval.sphtri = SphericalTriangle<T, UsePdfAsWeight>::create(shape);
 
         const scalar_type minimumProjSolidAngle = 0.0;
         matrix<T, 3, 3> m = matrix<T, 3, 3>(shape.vertices[0], shape.vertices[1], shape.vertices[2]);
         const vector3_type bxdfPdfAtVertex = math::conditionalAbsOrMax(_receiverWasBSDF, hlsl::mul(m, _receiverNormal), hlsl::promote<vector3_type>(minimumProjSolidAngle));
         retval.bilinearPatch = Bilinear<scalar_type>::create(bxdfPdfAtVertex.yyxz);
 
-        const scalar_type projSA = shape.projectedSolidAngle(_receiverNormal);
-        retval.rcpProjSolidAngle = projSA > scalar_type(0.0) ? scalar_type(1.0) / projSA : scalar_type(0.0);
+        NBL_IF_CONSTEXPR(!UsePdfAsWeight)
+        {
+            retval.receiverNormal = _receiverNormal;
+            // prevent division of 0 cosine by 0
+            retval.projSolidAngle = max<scalar_type>(shape.projectedSolidAngle(_receiverNormal),numeric_limits<scalar_type>::min);
+        }
         return retval;
     }
 
     codomain_type generate(const domain_type u, NBL_REF_ARG(cache_type) cache) NBL_CONST_MEMBER_FUNC
     {
-        Bilinear<scalar_type> bilinear = bilinearPatch;
-        cache.warped = bilinear.generate(u, cache.bilinearCache);
-        typename SphericalTriangle<scalar_type>::cache_type sphtriCache;
-        const vector3_type L = sphtri.generate(cache.warped, sphtriCache);
-        cache.abs_cos_theta = bilinear.forwardWeight(u, cache.bilinearCache);
+        const vector2_type warped = bilinearPatch.generate(u, cache.bilinearCache);
+        typename SphericalTriangle<scalar_type>::cache_type sphtriCache; // PDF is constant caches nothing, its empty
+        const codomain_type L = sphtri.generate(warped, sphtriCache);
+        NBL_IF_CONSTEXPR(!UsePdfAsWeight)
+            cache.L = L;
         return L;
     }
 
     density_type forwardPdf(const domain_type u, const cache_type cache) NBL_CONST_MEMBER_FUNC
     {
-        return sphtri.rcpSolidAngle * bilinearPatch.forwardPdf(u, cache.bilinearCache);
+        return sphtri.rcpSolidAngle * bilinearPatch.forwardPdf(u,cache.bilinearCache);
     }
 
     weight_type forwardWeight(const domain_type u, const cache_type cache) NBL_CONST_MEMBER_FUNC
     {
-        if (UsePdfAsWeight)
-            return forwardPdf(u, cache);
-        return cache.abs_cos_theta * rcpProjSolidAngle;
-    }
-
-    density_type backwardPdf(const codomain_type L) NBL_CONST_MEMBER_FUNC
-    {
-        const vector2_type u = sphtri.generateInverse(L);
-        return sphtri.rcpSolidAngle * bilinearPatch.backwardPdf(u);
+        NBL_IF_CONSTEXPR (UsePdfAsWeight)
+            return forwardPdf(u,cache);
+        return backwardWeight(cache.L);
     }
 
     weight_type backwardWeight(const codomain_type L) NBL_CONST_MEMBER_FUNC
     {
-        NBL_IF_CONSTEXPR(UsePdfAsWeight)
-            return backwardPdf(L);
-        const scalar_type minimumProjSolidAngle = 0.0;
-        const scalar_type abs_cos_theta = math::conditionalAbsOrMax(receiverWasBSDF, hlsl::dot(receiverNormal, L), minimumProjSolidAngle);
-        return abs_cos_theta * rcpProjSolidAngle;
+        NBL_IF_CONSTEXPR (UsePdfAsWeight)
+        {
+            const vector2_type u = sphtri.generateInverse(L);
+            return sphtri.rcpSolidAngle * bilinearPatch.backwardPdf(u);
+        }
+        // make the MIS weight always abs because even when receiver is a BRDF, the samples in lower hemisphere will get killed and MIS weight never used
+        return hlsl::abs(hlsl::dot(L,receiverNormal))/projSolidAngle;
     }
 
     sampling::SphericalTriangle<T, UsePdfAsWeight> sphtri;
     Bilinear<scalar_type> bilinearPatch;
-    scalar_type rcpProjSolidAngle;
+    // TODO: erase when UsePdfAsWeight==false
     vector3_type receiverNormal;
-    bool receiverWasBSDF;
+    vector3_type projSolidAngle;
 };
 
 } // namespace sampling

From 99e9495506616dda793df3fd63b22ba1dbcdde39 Mon Sep 17 00:00:00 2001
From: devshgraphicsprogramming <devsh@devsh.eu>
Date: Fri, 17 Apr 2026 03:32:20 +0200
Subject: [PATCH 04/12] why its always to use accessing methods than members
 directly in other code :P

---
 .../sampling/projected_spherical_triangle.hlsl     |  4 ++--
 .../builtin/hlsl/sampling/spherical_triangle.hlsl  | 14 +++++++-------
 2 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/include/nbl/builtin/hlsl/sampling/projected_spherical_triangle.hlsl b/include/nbl/builtin/hlsl/sampling/projected_spherical_triangle.hlsl
index 923fc6b537..a542357ead 100644
--- a/include/nbl/builtin/hlsl/sampling/projected_spherical_triangle.hlsl
+++ b/include/nbl/builtin/hlsl/sampling/projected_spherical_triangle.hlsl
@@ -83,7 +83,7 @@ struct ProjectedSphericalTriangle
 
     density_type forwardPdf(const domain_type u, const cache_type cache) NBL_CONST_MEMBER_FUNC
     {
-        return sphtri.rcpSolidAngle * bilinearPatch.forwardPdf(u,cache.bilinearCache);
+        return sphtri.getRcpSolidAngle() * bilinearPatch.forwardPdf(u,cache.bilinearCache);
     }
 
     weight_type forwardWeight(const domain_type u, const cache_type cache) NBL_CONST_MEMBER_FUNC
@@ -98,7 +98,7 @@ struct ProjectedSphericalTriangle
         NBL_IF_CONSTEXPR (UsePdfAsWeight)
         {
             const vector2_type u = sphtri.generateInverse(L);
-            return sphtri.rcpSolidAngle * bilinearPatch.backwardPdf(u);
+            return sphtri.getRcpSolidAngle() * bilinearPatch.backwardPdf(u);
         }
         // make the MIS weight always abs because even when receiver is a BRDF, the samples in lower hemisphere will get killed and MIS weight never used
         return hlsl::abs(hlsl::dot(L,receiverNormal))/projSolidAngle;
diff --git a/include/nbl/builtin/hlsl/sampling/spherical_triangle.hlsl b/include/nbl/builtin/hlsl/sampling/spherical_triangle.hlsl
index 6f29582e04..eb714a2505 100644
--- a/include/nbl/builtin/hlsl/sampling/spherical_triangle.hlsl
+++ b/include/nbl/builtin/hlsl/sampling/spherical_triangle.hlsl
@@ -172,6 +172,8 @@ struct SphericalTriangle<T, false, Algorithm>
       return backwardPdf(L);
    }
 
+   scalar_type getRcpSolidAngle() NBL_CONST_MEMBER_FUNC {return rcpSolidAngle;}
+
    scalar_type rcpSolidAngle;
    scalar_type cosA;
    scalar_type sinA;
@@ -203,7 +205,6 @@ struct SphericalTriangle<T, true, Algorithm>
    {
       SphericalTriangle retval;
       retval.base = base_type::create(tri);
-      retval.rcpSolidAngle = retval.base.rcpSolidAngle;
       retval.vertexC = tri.vertices[2];
       // precompute great circle normal of arc AC (needed for generateInverse)
       const scalar_type cscB = tri.csc_sides[1];
@@ -277,7 +278,7 @@ struct SphericalTriangle<T, true, Algorithm>
       const scalar_type cosCpB = nbl::hlsl::dot(base.tri_vertices[1], cp);
       const scalar_type den = scalar_type(1.0) + base.triCosC + cosCpB + cosBp_inv;
       const scalar_type subSolidAngle = scalar_type(2.0) * nbl::hlsl::atan2(nbl::hlsl::abs(num), den);
-      const scalar_type u = nbl::hlsl::clamp(subSolidAngle * rcpSolidAngle, scalar_type(0.0), scalar_type(1.0));
+      const scalar_type u = nbl::hlsl::clamp(subSolidAngle * base.rcpSolidAngle, scalar_type(0.0), scalar_type(1.0));
 
       // Step 3: u.y = |L-B|^2 / |C'-B|^2
       // Squared Euclidean distance avoids catastrophic cancellation vs (1-dot)/(1-dot)
@@ -295,7 +296,7 @@ struct SphericalTriangle<T, true, Algorithm>
 
    density_type forwardPdf(const domain_type u, const cache_type cache) NBL_CONST_MEMBER_FUNC
    {
-      return rcpSolidAngle;
+      return base.rcpSolidAngle;
    }
 
    weight_type forwardWeight(const domain_type u, const cache_type cache) NBL_CONST_MEMBER_FUNC
@@ -305,16 +306,15 @@ struct SphericalTriangle<T, true, Algorithm>
 
    density_type backwardPdf(const codomain_type L) NBL_CONST_MEMBER_FUNC
    {
-      return rcpSolidAngle;
+      return base.rcpSolidAngle;
    }
 
    weight_type backwardWeight(const codomain_type L) NBL_CONST_MEMBER_FUNC
    {
       return backwardPdf(L);
    }
-
-   // mirrored from base for uniform access across both specializations
-   scalar_type rcpSolidAngle;
+   
+   scalar_type getRcpSolidAngle() NBL_CONST_MEMBER_FUNC {return base.rcpSolidAngle;}
 
    base_type base;
    vector3_type vertexC;

From 6da021b89138b4d5cfd058b1ca62cfaf980c1fc0 Mon Sep 17 00:00:00 2001
From: devshgraphicsprogramming <devsh@devsh.eu>
Date: Fri, 17 Apr 2026 03:34:42 +0200
Subject: [PATCH 05/12] The Spherical Rectangle Shape needs to be created with
 the observer (local coordinate system) already given and fixed.

This way we can share computation between Solid Angle and Projected Solid Angle calculations

Also apply the `abs` fix from the triangle
---
 .../hlsl/shapes/spherical_rectangle.hlsl      | 24 ++++++++++++-------
 1 file changed, 16 insertions(+), 8 deletions(-)

diff --git a/include/nbl/builtin/hlsl/shapes/spherical_rectangle.hlsl b/include/nbl/builtin/hlsl/shapes/spherical_rectangle.hlsl
index bc864f7a19..7ffe2ef407 100644
--- a/include/nbl/builtin/hlsl/shapes/spherical_rectangle.hlsl
+++ b/include/nbl/builtin/hlsl/shapes/spherical_rectangle.hlsl
@@ -94,6 +94,8 @@ struct SphericalRectangle
         vector4_type cosGamma;
     };
 
+    // TODO: take an observer already, this way we can precompute and store the `r0`, `denorm_n_z` and `rcpLen_denorm_n_z`
+    // we need all of the above for solid angle and projected solid angle computation
     static SphericalRectangle<Scalar> create(NBL_CONST_REF_ARG(CompressedSphericalRectangle<Scalar>) compressed)
     {
         SphericalRectangle<scalar_type> retval;
@@ -113,7 +115,8 @@ struct SphericalRectangle
         result.r0 = hlsl::mul(basis, origin - observer);
 
         const vector4_type denorm_n_z = vector4_type(-result.r0.y, result.r0.x + extents.x, result.r0.y + extents.y, -result.r0.x);
-        result.n_z = denorm_n_z * hlsl::rsqrt<vector4_type>(hlsl::promote<vector4_type>(result.r0.z * result.r0.z) + denorm_n_z * denorm_n_z);
+        const vector4_type rcpLen_denorm_n_z = hlsl::rsqrt<vector4_type>(hlsl::promote<vector4_type>(result.r0.z * result.r0.z) + denorm_n_z * denorm_n_z);
+        result.n_z = denorm_n_z * rcpLen_denorm_n_z;
         result.cosGamma = vector4_type(
             -result.n_z[0] * result.n_z[1],
             -result.n_z[1] * result.n_z[2],
@@ -129,17 +132,17 @@ struct SphericalRectangle
     }
 
     // Kelvin-Stokes theorem: signed projected solid angle = integral_{rect} (n . omega) d_omega
+    // TODO: don't take the observer, observer should be taken at creation
     scalar_type projectedSolidAngle(const vector3_type observer, const vector3_type receiverNormal) NBL_CONST_MEMBER_FUNC
     {
         return projectedSolidAngleFromLocal(hlsl::mul(basis, origin - observer), hlsl::mul(basis, receiverNormal));
     }
 
-    // Overload for when r0 and localNormal are already computed (avoids redundant mul(basis, ...)).
-    // Exploits rectangle structure: all 4 corners share the same z, so cross products
-    // have only 2 nonzero components each, and externalProducts can be computed without
-    // normalizing the corner directions.
+    // TODO: only take a `localN`
     scalar_type projectedSolidAngleFromLocal(const vector3_type r0, const vector3_type n) NBL_CONST_MEMBER_FUNC
     {
+        // FUN FACT: `n_z` already holds Z coordinate the NORMALIZED `awayFromEdgePlane`, the non-zero coordinate absolute value is equal to `r0.z * rcpLen_denorm_n_z`
+// TODO: skip all this code until  just call `acos` on the `unnormDots`
         const scalar_type x0 = r0.x, y0 = r0.y, z = r0.z;
         const scalar_type x1 = x0 + extents.x;
         const scalar_type y1 = y0 + extents.y;
@@ -155,8 +158,9 @@ struct SphericalRectangle
             zSq + x1 * x1,
             zSq + y1 * y1,
             zSq + x0 * x0
-        );
+        ); // TODO: this is already computed as `rcpLen_denorm_n_z`
 
+// TODO: this can be computed from `denorm_n_z`, `z` and `rcpLen_denorm_n_z` instead
         // dot(cross(ri,rj), n) / |cross(ri,rj)| the ex/ey scale factors cancel
         const vector4_type crossDotN = vector4_type(
             z * n.y - y0 * n.z,
@@ -165,8 +169,12 @@ struct SphericalRectangle
             z * n.x - x0 * n.z
         );
         // The ABS makes the computation correct for abs(cos(theta)) (BSDF projected solid angle).
-        const vector4_type externalProducts = hlsl::abs(crossDotN) * hlsl::rsqrt<vector4_type>(crossLenSq);
+        const vector4_type externalProducts = crossDotN * hlsl::rsqrt<vector4_type>(crossLenSq);
 
+// TODO: isn't `rcpLen_denorm_n_z` related to the sin^-1() of arclengths ? Wouldn't `ACOS_CSC` apply instead  of `acos*rsqrt(1-cos^2)`
+// wouldn't then `hlsl::promote<vector4_type>(result.r0.z * result.r0.z) + denorm_n_z * denorm_n_z` be the sin^2 ?
+// it would probably have to be a different, here's the `acos(sqrt(1-x*x))/x` curve fit again revealed to me in a dream
+//  https://www.desmos.com/calculator/sbdrulot5a = exp2(-1.6*sqrt(1-x*x))*A+B
         // cos(arc length) between adjacent corners: dot(ri,rj) / (|ri|*|rj|)
         const vector4_type lenSq = vector4_type(
             x0 * x0 + y0 * y0,
@@ -189,7 +197,7 @@ struct SphericalRectangle
         // https://www.linkedin.com/posts/matt-kielan-9b054a165_untitled-graph-activity-7442910005671923712-jHz6?utm_source=share&utm_medium=member_desktop&rcm=ACoAACdp2RQBqq2bJfC2zxpsme-vRv2zh9oP-8E
         const vector4_type pyramidAngles = hlsl::acos<vector4_type>(cos_sides);
 
-        return hlsl::dot(pyramidAngles, externalProducts) * scalar_type(0.5);
+        return hlsl::abs(hlsl::dot(pyramidAngles, externalProducts)) * scalar_type(0.5);
     }
 
     vector3_type origin;

From 404592196897a57ba991cbd24cea0ecc7ec797a6 Mon Sep 17 00:00:00 2001
From: devshgraphicsprogramming <devsh@devsh.eu>
Date: Fri, 17 Apr 2026 03:39:43 +0200
Subject: [PATCH 06/12] Spherical Rectangle needs `generateInverse`
 implemented, otherwise PRactical Warps are broken wrong weight was getting
 returned and MIS weights were inconsistent.

They're broken now, NaN returned as the backwardWeight
---
 examples_tests                                |   2 +-
 .../projected_spherical_rectangle.hlsl        | 120 +++++++++---------
 2 files changed, 60 insertions(+), 62 deletions(-)

diff --git a/examples_tests b/examples_tests
index 5eeb47351e..89ecce1444 160000
--- a/examples_tests
+++ b/examples_tests
@@ -1 +1 @@
-Subproject commit 5eeb47351e29cb6ee02f8d8319f131a2c012b5a2
+Subproject commit 89ecce14443c216b30ff84b837b899045bb5513f
diff --git a/include/nbl/builtin/hlsl/sampling/projected_spherical_rectangle.hlsl b/include/nbl/builtin/hlsl/sampling/projected_spherical_rectangle.hlsl
index 5bf652cb4c..618f401ba9 100644
--- a/include/nbl/builtin/hlsl/sampling/projected_spherical_rectangle.hlsl
+++ b/include/nbl/builtin/hlsl/sampling/projected_spherical_rectangle.hlsl
@@ -25,12 +25,6 @@ namespace sampling
 //   2. Warp uniform [0,1]^2 through the bilinear to importance-sample NdotL
 //   3. Feed the warped UV into the solid angle sampler to get a rect offset
 //   4. PDF = (1/SolidAngle) * bilinearPdf
-//
-// Template parameter `UsePdfAsWeight`: when true (default), forwardWeight/backwardWeight
-// return the PDF instead of the projected-solid-angle MIS weight.
-// TODO: the projected-solid-angle MIS weight (UsePdfAsWeight=false) has been shown to be
-// poor in practice. Once confirmed by testing, remove the false path and stop storing
-// receiverNormal, receiverWasBSDF, and rcpProjSolidAngle as members.
 template<typename T, bool UsePdfAsWeight = true>
 struct ProjectedSphericalRectangle
 {
@@ -47,24 +41,32 @@ struct ProjectedSphericalRectangle
 
 	struct cache_type
 	{
-		scalar_type abs_cos_theta;
-		vector2_type warped;
 		typename Bilinear<scalar_type>::cache_type bilinearCache;
+		vector3_type L; // TODO: same as projected triangle w.r.t. UsePdfAsWeight==false
 	};
 
-	// NOTE: produces a degenerate (all-zero) bilinear patch when the receiver normal faces away
-	// from all four rectangle vertices, resulting in NaN PDFs (0 * inf). Callers must ensure
-	// at least one vertex has positive projection onto the receiver normal.
-	static ProjectedSphericalRectangle<T,UsePdfAsWeight> create(NBL_CONST_REF_ARG(shapes::SphericalRectangle<T>) shape, const vector3_type observer, const vector3_type _receiverNormal, const bool _receiverWasBSDF)
+	// Shouldn't produce NAN if all corners have 0 proj solid angle due to min density adds/clamps in the linear sampler
+	static ProjectedSphericalRectangle<T> create(NBL_CONST_REF_ARG(shapes::CompressedSphericalRectangle<T>) shape, const vector3_type observer, const vector3_type _receiverNormal, const bool _receiverWasBSDF)
 	{
-		ProjectedSphericalRectangle<T,UsePdfAsWeight> retval;
+		ProjectedSphericalRectangle<T> retval;
+// TODO:https://github.com/Devsh-Graphics-Programming/Nabla/pull/1001#discussion_r3088570145
+		return retval;
+	}
+
+	// Shouldn't produce NAN if all corners have 0 proj solid angle due to min density adds/clamps in the linear sampler
+	static ProjectedSphericalRectangle<T> create(NBL_CONST_REF_ARG(shapes::SphericalRectangle<T>) shape, const vector3_type observer, const vector3_type _receiverNormal, const bool _receiverWasBSDF)
+	{
+		ProjectedSphericalRectangle<T> retval;
 		const vector3_type n = hlsl::mul(shape.basis, _receiverNormal);
-		retval.localReceiverNormal = n;
-		retval.receiverWasBSDF = _receiverWasBSDF;
 
 		// Compute solid angle and get r0 in local frame (before z-flip)
 		const typename shapes::SphericalRectangle<T>::solid_angle_type sa = shape.solidAngle(observer);
 		const vector3_type r0 = sa.r0;
+		NBL_IF_CONSTEXPR(!UsePdfAsWeight)
+		{
+			retval.receiverNormal = _receiverNormal;
+			retval.projSolidAngle = shape.projectedSolidAngleFromLocal(r0,n);
+		}
 
 		// All 4 corners share r0.z; x is r0.x or r0.x+ex, y is r0.y or r0.y+ey
 		const scalar_type r1x = r0.x + shape.extents.x;
@@ -94,78 +96,74 @@ struct ProjectedSphericalRectangle
 		retval.bilinearPatch = Bilinear<scalar_type>::create(bxdfPdfAtVertex);
 
 		// Reuse the already-computed solid_angle_type to avoid recomputing mul(basis, origin - observer)
-		retval.sphrect = SphericalRectangle<T>::create(sa, shape.extents);
-		retval.rcpSolidAngle = retval.sphrect.solidAngle > scalar_type(0.0) ? scalar_type(1.0) / retval.sphrect.solidAngle : scalar_type(0.0);
-
-		NBL_IF_CONSTEXPR(!UsePdfAsWeight)
-		{
-			const scalar_type projSA = shape.projectedSolidAngleFromLocal(r0, n);
-			retval.rcpProjSolidAngle = projSA > scalar_type(0.0) ? scalar_type(1.0) / projSA : scalar_type(0.0);
-		}
+		retval.sphrect = SphericalRectangle<T>::create(shape.basis, sa, shape.extents);
 		return retval;
 	}
 
 	// returns a normalized 3D direction in the local frame
-	codomain_type generate(const domain_type u, NBL_REF_ARG(cache_type) cache) NBL_CONST_MEMBER_FUNC
+	codomain_type generateNormalizedLocal(const domain_type u, NBL_REF_ARG(cache_type) cache, NBL_REF_ARG(scalar_type) hitDist) NBL_CONST_MEMBER_FUNC
 	{
-		Bilinear<scalar_type> bilinear = bilinearPatch;
-		cache.warped = bilinear.generate(u, cache.bilinearCache);
-		typename SphericalRectangle<scalar_type>::cache_type sphrectCache;
-		const vector3_type dir = sphrect.generate(cache.warped, sphrectCache);
-		cache.abs_cos_theta = bilinear.forwardWeight(u, cache.bilinearCache);
+		const vector2_type warped = bilinearPatch.generate(u, cache.bilinearCache);
+		typename SphericalRectangle<scalar_type>::cache_type sphrectCache; // there's nothing in the cache
+		const vector3_type dir = sphrect.generateNormalizedLocal(warped,sphrectCache,hitDist);
+		NBL_IF_CONSTEXPR(!UsePdfAsWeight)
+			cache.L = hlsl::mul(hlsl::transpose(sphrect.basis),dir);
 		return dir;
 	}
 
-	// returns a 2D offset on the rectangle surface from the r0 corner
-	vector2_type generateSurfaceOffset(const domain_type u, NBL_REF_ARG(cache_type) cache) NBL_CONST_MEMBER_FUNC
+	// returns a unnormalized 3D direction in the global frame
+	codomain_type generateUnnormalized(const domain_type u, NBL_REF_ARG(cache_type) cache) NBL_CONST_MEMBER_FUNC
 	{
-		Bilinear<scalar_type> bilinear = bilinearPatch;
-		cache.warped = bilinear.generate(u, cache.bilinearCache);
-		typename SphericalRectangle<scalar_type>::cache_type sphrectCache;
-		const vector2_type sampleOffset = sphrect.generateSurfaceOffset(cache.warped, sphrectCache);
-		cache.abs_cos_theta = bilinear.forwardWeight(u, cache.bilinearCache);
-		return sampleOffset;
+		const vector2_type warped = bilinearPatch.generate(u, cache.bilinearCache);
+		typename SphericalRectangle<scalar_type>::cache_type sphrectCache; // there's nothing in the cache
+		const vector3_type dir = sphrect.generateUnnormalized(warped,sphrectCache);
+		NBL_IF_CONSTEXPR(!UsePdfAsWeight)
+			cache.L = dir * hlsl::rsqrt(hlsl::dot(dir,dir));
+		return dir;
 	}
 
-	density_type forwardPdf(const domain_type u, const cache_type cache) NBL_CONST_MEMBER_FUNC
+	// returns a normalized 3D direction in the global frame
+	codomain_type generate(const domain_type u, NBL_REF_ARG(cache_type) cache) NBL_CONST_MEMBER_FUNC
 	{
-		return rcpSolidAngle * bilinearPatch.forwardPdf(u, cache.bilinearCache);
+		const vector2_type warped = bilinearPatch.generate(u, cache.bilinearCache);
+		typename SphericalRectangle<scalar_type>::cache_type sphrectCache; // there's nothing in the cache
+		const vector3_type dir = sphrect.generate(warped, sphrectCache);
+		NBL_IF_CONSTEXPR(!UsePdfAsWeight)
+			cache.L = dir;
+		return dir;
 	}
 
-	weight_type forwardWeight(const domain_type u, const cache_type cache) NBL_CONST_MEMBER_FUNC
+	density_type forwardPdf(const domain_type u, const cache_type cache) NBL_CONST_MEMBER_FUNC
 	{
-		if (UsePdfAsWeight)
-			return forwardPdf(u, cache);
-		return cache.abs_cos_theta * rcpProjSolidAngle;
+		return bilinearPatch.forwardPdf(u,cache.bilinearCache) / sphrect.solidAngle;
 	}
 
-	// `p` is the normalized [0,1]^2 position on the rectangle
-	density_type backwardPdf(const vector2_type p) NBL_CONST_MEMBER_FUNC
+	weight_type forwardWeight(const domain_type u, const cache_type cache) NBL_CONST_MEMBER_FUNC
 	{
-		return rcpSolidAngle * bilinearPatch.backwardPdf(p);
+        NBL_IF_CONSTEXPR (UsePdfAsWeight)
+            return forwardPdf(u,cache);
+        return backwardWeight(cache.L);
 	}
 
-	weight_type backwardWeight(const vector2_type p) NBL_CONST_MEMBER_FUNC
+	weight_type backwardWeight(const codomain_type L) NBL_CONST_MEMBER_FUNC
 	{
 		NBL_IF_CONSTEXPR(UsePdfAsWeight)
-			return backwardPdf(p);
-		const scalar_type minimumProjSolidAngle = 0.0;
-		// Reconstruct local direction from normalized rect position
-		const vector3_type localDir = hlsl::normalize(sphrect.r0 + vector3_type(
-			p.x * sphrect.extents.x,
-			p.y * sphrect.extents.y,
-			scalar_type(0)
-		));
-		const scalar_type abs_cos_theta = math::conditionalAbsOrMax(receiverWasBSDF, hlsl::dot(localReceiverNormal, localDir), minimumProjSolidAngle);
-		return abs_cos_theta * rcpProjSolidAngle;
+		{
+#if 0
+			const vector2_type warped = sphrect.generateInvese(L); // TODO: implement `generateInverse`
+			return bilinearPatch.backwardPdf(warped) / sphrect.solidAngle;
+#endif
+			return 0.f/0.f;
+		}
+        // make the MIS weight always abs because even when receiver is a BRDF, the samples in lower hemisphere will get killed and MIS weight never used
+        return hlsl::abs(hlsl::dot(L,receiverNormal))/projSolidAngle;
 	}
 
 	sampling::SphericalRectangle<T> sphrect;
 	Bilinear<scalar_type> bilinearPatch;
-	scalar_type rcpSolidAngle;
-	scalar_type rcpProjSolidAngle;
-	vector3_type localReceiverNormal;
-	bool receiverWasBSDF;
+	// TODO: same as projected triangle w.r.t. UsePdfAsWeight==false
+	vector3_type receiverNormal;
+	vector3_type projSolidAngle;
 };
 
 } // namespace sampling

From 45a68a625475e3bd49245ac71da1a8d9c787a00d Mon Sep 17 00:00:00 2001
From: Mateusz Kielan <devsh@devsh.eu>
Date: Fri, 17 Apr 2026 23:40:00 +0200
Subject: [PATCH 07/12] Update spherical_triangle.hlsl

always move the scalar multiplication of a vector in a dot product outside the dot product
---
 .../nbl/builtin/hlsl/shapes/spherical_triangle.hlsl   | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/include/nbl/builtin/hlsl/shapes/spherical_triangle.hlsl b/include/nbl/builtin/hlsl/shapes/spherical_triangle.hlsl
index 4ebe52d312..b8a2c7229e 100644
--- a/include/nbl/builtin/hlsl/shapes/spherical_triangle.hlsl
+++ b/include/nbl/builtin/hlsl/shapes/spherical_triangle.hlsl
@@ -74,6 +74,7 @@ struct SphericalTriangle
         // degenerate triangle: any side has near-zero sin, so csc blows up
         if (hlsl::any<vector<bool, 3> >(retval.csc_sides >= hlsl::promote<vector3_type>(numeric_limits<scalar_type>::max)))
         {
+            // TODO: can't do this, still need to be able to sample thin triangle like a line light, so need to know all the angles which are still valid
             retval.cos_vertices = hlsl::promote<vector3_type>(0.0);
             retval.sin_vertices = hlsl::promote<vector3_type>(0.0);
             retval.solid_angle = 0;
@@ -106,22 +107,20 @@ struct SphericalTriangle
         if (solid_angle <= numeric_limits<scalar_type>::epsilon)
             return 0;
 
-        matrix<scalar_type, 3, 3> awayFromEdgePlane;
         // `cross(A,B)*acos(dot(A,B))/sin(1-dot^2)` can be done with `cross(A,B)*acos_csc_approx(dot(A,B))`
 #define ACOS_CSC(I) acos_csc_approx(cos_sides[I])
 //#define ACOS_CSC(I) hlsl::acos(cos_sides[I])*csc_sides[I]
-        awayFromEdgePlane[0] = hlsl::cross(vertices[1], vertices[2]) * ACOS_CSC(0);
-        awayFromEdgePlane[1] = hlsl::cross(vertices[2], vertices[0]) * ACOS_CSC(1);
-        awayFromEdgePlane[2] = hlsl::cross(vertices[0], vertices[1]) * ACOS_CSC(2);
+        scalar_type externalProductsWeightedByPyramidAngles = hlsl::dot(hlsl::cross(vertices[1], vertices[2]),receiverNormal) * ACOS_CSC(0);
+        externalProductsWeightedByPyramidAngles += hlsl::dot(hlsl::cross(vertices[2], vertices[0]),receiverNormal) * ACOS_CSC(1);
+        externalProductsWeightedByPyramidAngles += hlsl::dot(hlsl::cross(vertices[0], vertices[1]),receiverNormal) * ACOS_CSC(2);
 #undef ACOS_CSC
-        const vector3_type externalProductsWeightedByPyramidAngles = hlsl::mul(/* transposed already */awayFromEdgePlane, receiverNormal);
 
         // The ABS makes it so that the computation is correct for an `abs(cos(theta))` factor which is the projected solid angle used for a BSDF.
         // It also makes the computation insensitive to the CW or CCW winding of the vertices in the triangle.
         // Proof: Kelvin-Stokes theorem, if you split the set into two along the horizon with constant CCW winding, the `cross` along the shared edge
         // goes in different directions and cancels out, while `acos` of the clipped great arcs corresponding to polygon edges add up to the original sides again.
         // The 0.5 is so that triangle covering almost whole hemisphere sums to PI
-        return hlsl::abs(externalProductsWeightedByPyramidAngles[0]+externalProductsWeightedByPyramidAngles[1]+externalProductsWeightedByPyramidAngles[2]) * scalar_type(0.5);
+        return externalProductsWeightedByPyramidAngles * scalar_type(0.5);
     }
 
     vector3_type vertices[3];

From 2291b7d4a1ebf775034dfed8ec0fafcaab9d163b Mon Sep 17 00:00:00 2001
From: Karim Mohamed <karimsayedre@gmail.com>
Date: Wed, 22 Apr 2026 01:24:28 +0300
Subject: [PATCH 08/12] fixed compartor float vs bool warning, no more
 spherical triangle boilerplate, addressed comments

---
 examples_tests                                |   2 +-
 include/nbl/builtin/hlsl/algorithm.hlsl       |   4 +-
 include/nbl/builtin/hlsl/functional.hlsl      |  24 +-
 include/nbl/builtin/hlsl/ies/sampler.hlsl     |   2 +-
 include/nbl/builtin/hlsl/math/functions.hlsl  |  19 +-
 .../nbl/builtin/hlsl/sampling/bilinear.hlsl   |   2 +-
 .../hlsl/sampling/cos_weighted_spheres.hlsl   |   2 +-
 include/nbl/builtin/hlsl/sampling/linear.hlsl |  18 +-
 .../projected_spherical_rectangle.hlsl        | 285 +++++++------
 .../projected_spherical_triangle.hlsl         |  10 +-
 .../hlsl/sampling/spherical_rectangle.hlsl    | 398 +++++++++---------
 .../hlsl/sampling/spherical_triangle.hlsl     | 178 +++-----
 .../hlsl/sampling/uniform_spheres.hlsl        |   9 +-
 13 files changed, 475 insertions(+), 478 deletions(-)

diff --git a/examples_tests b/examples_tests
index 89ecce1444..fb5cfa2bca 160000
--- a/examples_tests
+++ b/examples_tests
@@ -1 +1 @@
-Subproject commit 89ecce14443c216b30ff84b837b899045bb5513f
+Subproject commit fb5cfa2bcaa0a92aafb429f3d390658d28d1ca02
diff --git a/include/nbl/builtin/hlsl/algorithm.hlsl b/include/nbl/builtin/hlsl/algorithm.hlsl
index 631001686e..d80ebdf443 100644
--- a/include/nbl/builtin/hlsl/algorithm.hlsl
+++ b/include/nbl/builtin/hlsl/algorithm.hlsl
@@ -142,7 +142,9 @@ struct bound_t
 
     void comp_step(NBL_REF_ARG(Accessor) accessor, const uint32_t testPoint, const uint32_t rightBegin)
     {
-        if (compare(accessor[testPoint],value))
+        typename Accessor::value_type val;
+        accessor.get(testPoint, val);
+        if (compare(val,value))
             it = rightBegin;
     }
     void comp_step(NBL_REF_ARG(Accessor) accessor, const uint32_t testPoint)
diff --git a/include/nbl/builtin/hlsl/functional.hlsl b/include/nbl/builtin/hlsl/functional.hlsl
index 118fe07c63..6a155f2b92 100644
--- a/include/nbl/builtin/hlsl/functional.hlsl
+++ b/include/nbl/builtin/hlsl/functional.hlsl
@@ -89,12 +89,23 @@ struct reference_wrapper : enable_if_t<
         return lhs OP rhs; \
     }
 
+#define ALIAS_STD_CMP(NAME,OP) template<typename T NBL_STRUCT_CONSTRAINABLE > struct NAME { \
+    using type_t = T; \
+    \
+    bool operator()(NBL_CONST_REF_ARG(T) lhs, NBL_CONST_REF_ARG(T) rhs) \
+    { \
+        return lhs OP rhs; \
+    }
+
 
 #else // CPP
 
 #define ALIAS_STD(NAME,OP) template<typename T> struct NAME : std::NAME<T> { \
     using type_t = T;
 
+#define ALIAS_STD_CMP(NAME,OP) template<typename T> struct NAME : std::NAME<T> { \
+    using type_t = T;
+
 #endif
 
 ALIAS_STD(bit_and,&)
@@ -136,14 +147,15 @@ ALIAS_STD(divides,/)
 };
 
 
-ALIAS_STD(equal_to, ==) };
-ALIAS_STD(not_equal_to, !=) };
-ALIAS_STD(greater, >) };
-ALIAS_STD(less, <) };
-ALIAS_STD(greater_equal, >=) };
-ALIAS_STD(less_equal, <=) };
+ALIAS_STD_CMP(equal_to, ==) };
+ALIAS_STD_CMP(not_equal_to, !=) };
+ALIAS_STD_CMP(greater, >) };
+ALIAS_STD_CMP(less, <) };
+ALIAS_STD_CMP(greater_equal, >=) };
+ALIAS_STD_CMP(less_equal, <=) };
 
 #undef ALIAS_STD
+#undef ALIAS_STD_CMP
 
 // The above comparison operators return bool on STD, but in HLSL they're supposed to yield bool vectors, so here's a specialization so that they return `vector<bool, N>` for vectorial types
 
diff --git a/include/nbl/builtin/hlsl/ies/sampler.hlsl b/include/nbl/builtin/hlsl/ies/sampler.hlsl
index ab4046477c..a6309fd128 100644
--- a/include/nbl/builtin/hlsl/ies/sampler.hlsl
+++ b/include/nbl/builtin/hlsl/ies/sampler.hlsl
@@ -85,7 +85,7 @@ struct CandelaSampler
         const angle_t vAngle = degrees(polar.theta);
         const angle_t hAngle = degrees(__wrapPhi(polar.phi, symmetry));
 
-#define NBL_IES_DEF_ANGLE_ACC(T, EXPR) struct T { using value_type = angle_t; accessor_t acc; value_type operator[](uint32_t idx) NBL_CONST_MEMBER_FUNC { return EXPR; } };
+#define NBL_IES_DEF_ANGLE_ACC(T, EXPR) struct T { using value_type = angle_t; accessor_t acc; value_type operator[](uint32_t idx) NBL_CONST_MEMBER_FUNC { return EXPR; } void get(uint32_t idx, NBL_REF_ARG(value_type) val) NBL_CONST_MEMBER_FUNC { val = EXPR; } };
 
         NBL_IES_DEF_ANGLE_ACC(VAcc, acc.vAngle(idx))
         NBL_IES_DEF_ANGLE_ACC(HAcc, acc.hAngle(idx))
diff --git a/include/nbl/builtin/hlsl/math/functions.hlsl b/include/nbl/builtin/hlsl/math/functions.hlsl
index b5b6f8feea..a80bdda904 100644
--- a/include/nbl/builtin/hlsl/math/functions.hlsl
+++ b/include/nbl/builtin/hlsl/math/functions.hlsl
@@ -93,13 +93,22 @@ scalar_type_t<T> lpNorm(NBL_CONST_REF_ARG(T) v)
 
 
 // valid only for `theta` in [-PI,PI]
-template <typename T NBL_FUNC_REQUIRES(concepts::FloatingPointLikeScalar<T>)
+// UseRealSinCos=true  -> back-to-back sin + cos. Saturates the special-function pipeline, enables vendor sincos fusion, full precision near multiples of pi.
+// UseRealSinCos=false -> cos + sqrt(1-c*c) with sign recovered from theta. Saves one special-function op when cos alone is cheaper than sin+cos, but suffers catastrophic cancellation as |c| -> 1.
+template <typename T, bool UseRealSinCos = true NBL_FUNC_REQUIRES(concepts::FloatingPointLikeScalar<T>)
 void sincos(T theta, NBL_REF_ARG(T) s, NBL_REF_ARG(T) c)
 {
-    s = sin<T>(theta);
-    c = cos<T>(theta);
-    // s = sqrt<T>(T(NBL_FP64_LITERAL(1.0))-c*c);
-    // s = ieee754::flipSign(s, theta < T(NBL_FP64_LITERAL(0.0)));
+    if (UseRealSinCos)
+    {
+        s = sin<T>(theta);
+        c = cos<T>(theta);
+    }
+    else
+    {
+        c = cos<T>(theta);
+        s = sqrt<T>(T(NBL_FP64_LITERAL(1.0))-c*c);
+        s = ieee754::flipSign(s, theta < T(NBL_FP64_LITERAL(0.0)));
+    }
 }
 
 template <typename T NBL_FUNC_REQUIRES(vector_traits<T>::Dimension == 3)
diff --git a/include/nbl/builtin/hlsl/sampling/bilinear.hlsl b/include/nbl/builtin/hlsl/sampling/bilinear.hlsl
index 35f1391930..baf1b5427e 100644
--- a/include/nbl/builtin/hlsl/sampling/bilinear.hlsl
+++ b/include/nbl/builtin/hlsl/sampling/bilinear.hlsl
@@ -65,7 +65,7 @@ struct Bilinear
 
         // bilinear PDF = marginal_y_pdf * conditional_x_pdf; reuse both linear caches
         const scalar_type yPdf = lineary.forwardPdf(u.y, linearYCache);
-        cache.normalizedStart = yPdf * linearx.linearCoeffStart;
+        cache.normalizedStart = yPdf * linearx.normalizedCoeffStart;
         cache.linearXCache.diffTimesX *= yPdf;
         return p;
     }
diff --git a/include/nbl/builtin/hlsl/sampling/cos_weighted_spheres.hlsl b/include/nbl/builtin/hlsl/sampling/cos_weighted_spheres.hlsl
index 3a191f142f..05aff5d3b8 100644
--- a/include/nbl/builtin/hlsl/sampling/cos_weighted_spheres.hlsl
+++ b/include/nbl/builtin/hlsl/sampling/cos_weighted_spheres.hlsl
@@ -93,7 +93,7 @@ struct ProjectedSphere
 	static codomain_type __generate(NBL_REF_ARG(domain_type) u)
 	{
 		vector_t3 retval = hemisphere_t::__generate(u.xy);
-		const bool chooseLower = u.z > T(0.5);
+		const bool chooseLower = u.z > scalar_type(0.5);
 		retval.z = chooseLower ? (-retval.z) : retval.z;
 		if (chooseLower)
 			u.z -= T(0.5);
diff --git a/include/nbl/builtin/hlsl/sampling/linear.hlsl b/include/nbl/builtin/hlsl/sampling/linear.hlsl
index 12602b4a79..8ebf2e38a0 100644
--- a/include/nbl/builtin/hlsl/sampling/linear.hlsl
+++ b/include/nbl/builtin/hlsl/sampling/linear.hlsl
@@ -39,11 +39,11 @@ struct Linear
         // add min to both coefficients so (0,0) input produces a valid uniform sampler
         // instead of inf normalization (2/0) leading to NaN; negligible for normal inputs
         const vector2_type safeCoeffs = linearCoeffs + vector2_type(hlsl::numeric_limits<scalar_type>::min, hlsl::numeric_limits<scalar_type>::min);
-        // normalize coefficients so that the PDF is simply linearCoeffStart + linearCoeffDiff * x
+        // normalize coefficients so that the PDF is simply normalizedCoeffStart + linearCoeffDiff * x
         const scalar_type normFactor = scalar_type(2.0) / (safeCoeffs[0] + safeCoeffs[1]);
         const vector2_type normalized = safeCoeffs * normFactor;
-        retval.linearCoeffStart = normalized[0];
-        retval.linearCoeffEnd = normalized[1];
+        retval.normalizedCoeffStart = normalized[0];
+        retval.normalizedCoeffEnd = normalized[1];
         // precompute for the stable quadratic in generate()
         retval.squaredCoeffStart = normalized[0] * normalized[0];
         retval.twoTimesDiff = scalar_type(2.0) * (normalized[1] - normalized[0]);
@@ -57,18 +57,18 @@ struct Linear
         // Quadratic (1-start)*x^2 + start*x - u = 0; since start >= 0 the stable root is
         // x = 2u / (start + sqrt(start^2 + 2*diff*u)), which never cancels.
         const scalar_type sqrtTerm = sqrt(squaredCoeffStart + twoTimesDiff * u);
-        const scalar_type denom = linearCoeffStart + sqrtTerm;
+        const scalar_type denom = normalizedCoeffStart + sqrtTerm;
         // NOTE: floating point can make x slightly > 1 when u~1 and diff < 0; callers needing
         // non-negative PDF at the boundary should clamp with min(x, 1).
         const codomain_type x = (u + u) / denom;
         // diff*x == sqrtTerm - start algebraically (conjugate identity), saves 1 mul
-        cache.diffTimesX = sqrtTerm - linearCoeffStart;
+        cache.diffTimesX = sqrtTerm - normalizedCoeffStart;
         return x;
     }
 
     density_type forwardPdf(const domain_type u, const cache_type cache) NBL_CONST_MEMBER_FUNC
     {
-        return linearCoeffStart + cache.diffTimesX;
+        return normalizedCoeffStart + cache.diffTimesX;
     }
 
     weight_type forwardWeight(const domain_type u, const cache_type cache) NBL_CONST_MEMBER_FUNC
@@ -83,7 +83,7 @@ struct Linear
     density_type backwardPdf(const codomain_type x) NBL_CONST_MEMBER_FUNC
     {
         assert(x >= scalar_type(0.0) && x <= scalar_type(1.0));
-        return hlsl::mix(linearCoeffStart, linearCoeffEnd, x);
+        return hlsl::mix(normalizedCoeffStart, normalizedCoeffEnd, x);
     }
 
     weight_type backwardWeight(const codomain_type x) NBL_CONST_MEMBER_FUNC
@@ -91,8 +91,8 @@ struct Linear
         return backwardPdf(x);
     }
 
-    scalar_type linearCoeffStart;
-    scalar_type linearCoeffEnd;
+    scalar_type normalizedCoeffStart;
+    scalar_type normalizedCoeffEnd;
     scalar_type squaredCoeffStart;
     scalar_type twoTimesDiff;
 };
diff --git a/include/nbl/builtin/hlsl/sampling/projected_spherical_rectangle.hlsl b/include/nbl/builtin/hlsl/sampling/projected_spherical_rectangle.hlsl
index 618f401ba9..d40e58c5db 100644
--- a/include/nbl/builtin/hlsl/sampling/projected_spherical_rectangle.hlsl
+++ b/include/nbl/builtin/hlsl/sampling/projected_spherical_rectangle.hlsl
@@ -28,142 +28,163 @@ namespace sampling
 template<typename T, bool UsePdfAsWeight = true>
 struct ProjectedSphericalRectangle
 {
-	using scalar_type = T;
-	using vector2_type = vector<T, 2>;
-	using vector3_type = vector<T, 3>;
-	using vector4_type = vector<T, 4>;
-
-	// BackwardTractableSampler concept types
-	using domain_type = vector2_type;
-	using codomain_type = vector3_type;
-	using density_type = scalar_type;
-	using weight_type = density_type;
-
-	struct cache_type
-	{
-		typename Bilinear<scalar_type>::cache_type bilinearCache;
-		vector3_type L; // TODO: same as projected triangle w.r.t. UsePdfAsWeight==false
-	};
-
-	// Shouldn't produce NAN if all corners have 0 proj solid angle due to min density adds/clamps in the linear sampler
-	static ProjectedSphericalRectangle<T> create(NBL_CONST_REF_ARG(shapes::CompressedSphericalRectangle<T>) shape, const vector3_type observer, const vector3_type _receiverNormal, const bool _receiverWasBSDF)
-	{
-		ProjectedSphericalRectangle<T> retval;
-// TODO:https://github.com/Devsh-Graphics-Programming/Nabla/pull/1001#discussion_r3088570145
-		return retval;
-	}
-
-	// Shouldn't produce NAN if all corners have 0 proj solid angle due to min density adds/clamps in the linear sampler
-	static ProjectedSphericalRectangle<T> create(NBL_CONST_REF_ARG(shapes::SphericalRectangle<T>) shape, const vector3_type observer, const vector3_type _receiverNormal, const bool _receiverWasBSDF)
-	{
-		ProjectedSphericalRectangle<T> retval;
-		const vector3_type n = hlsl::mul(shape.basis, _receiverNormal);
-
-		// Compute solid angle and get r0 in local frame (before z-flip)
-		const typename shapes::SphericalRectangle<T>::solid_angle_type sa = shape.solidAngle(observer);
-		const vector3_type r0 = sa.r0;
-		NBL_IF_CONSTEXPR(!UsePdfAsWeight)
-		{
-			retval.receiverNormal = _receiverNormal;
-			retval.projSolidAngle = shape.projectedSolidAngleFromLocal(r0,n);
-		}
-
-		// All 4 corners share r0.z; x is r0.x or r0.x+ex, y is r0.y or r0.y+ey
-		const scalar_type r1x = r0.x + shape.extents.x;
-		const scalar_type r1y = r0.y + shape.extents.y;
-
-		// Unnormalized dots: dot(corner_i, n)
-		const scalar_type base_dot = hlsl::dot(r0, n);
-		const scalar_type dx = shape.extents.x * n.x;
-		const scalar_type dy = shape.extents.y * n.y;
-		const vector4_type dots = vector4_type(base_dot, base_dot + dx, base_dot + dy, base_dot + dx + dy);
-
-		// Squared lengths of each corner
-		const scalar_type r0zSq = r0.z * r0.z;
-		const vector4_type lenSq = vector4_type(
-			r0.x * r0.x + r0.y * r0.y,
-			r1x * r1x + r0.y * r0.y,
-			r0.x * r0.x + r1y * r1y,
-			r1x * r1x + r1y * r1y
-		) + hlsl::promote<vector4_type>(r0zSq);
-
-		// dot(normalize(corner), n) = dot(corner, n) * rsqrt(lenSq)
-		// Bilinear corners: [0]=v00 [1]=v10 [2]=v01 [3]=v11
-		const scalar_type minimumProjSolidAngle = 0.0;
-		const vector4_type bxdfPdfAtVertex = math::conditionalAbsOrMax(_receiverWasBSDF,
-			dots * hlsl::rsqrt<vector4_type>(lenSq),
-			hlsl::promote<vector4_type>(minimumProjSolidAngle));
-		retval.bilinearPatch = Bilinear<scalar_type>::create(bxdfPdfAtVertex);
-
-		// Reuse the already-computed solid_angle_type to avoid recomputing mul(basis, origin - observer)
-		retval.sphrect = SphericalRectangle<T>::create(shape.basis, sa, shape.extents);
-		return retval;
-	}
-
-	// returns a normalized 3D direction in the local frame
-	codomain_type generateNormalizedLocal(const domain_type u, NBL_REF_ARG(cache_type) cache, NBL_REF_ARG(scalar_type) hitDist) NBL_CONST_MEMBER_FUNC
-	{
-		const vector2_type warped = bilinearPatch.generate(u, cache.bilinearCache);
-		typename SphericalRectangle<scalar_type>::cache_type sphrectCache; // there's nothing in the cache
-		const vector3_type dir = sphrect.generateNormalizedLocal(warped,sphrectCache,hitDist);
-		NBL_IF_CONSTEXPR(!UsePdfAsWeight)
-			cache.L = hlsl::mul(hlsl::transpose(sphrect.basis),dir);
-		return dir;
-	}
-
-	// returns a unnormalized 3D direction in the global frame
-	codomain_type generateUnnormalized(const domain_type u, NBL_REF_ARG(cache_type) cache) NBL_CONST_MEMBER_FUNC
-	{
-		const vector2_type warped = bilinearPatch.generate(u, cache.bilinearCache);
-		typename SphericalRectangle<scalar_type>::cache_type sphrectCache; // there's nothing in the cache
-		const vector3_type dir = sphrect.generateUnnormalized(warped,sphrectCache);
-		NBL_IF_CONSTEXPR(!UsePdfAsWeight)
-			cache.L = dir * hlsl::rsqrt(hlsl::dot(dir,dir));
-		return dir;
-	}
-
-	// returns a normalized 3D direction in the global frame
-	codomain_type generate(const domain_type u, NBL_REF_ARG(cache_type) cache) NBL_CONST_MEMBER_FUNC
-	{
-		const vector2_type warped = bilinearPatch.generate(u, cache.bilinearCache);
-		typename SphericalRectangle<scalar_type>::cache_type sphrectCache; // there's nothing in the cache
-		const vector3_type dir = sphrect.generate(warped, sphrectCache);
-		NBL_IF_CONSTEXPR(!UsePdfAsWeight)
-			cache.L = dir;
-		return dir;
-	}
-
-	density_type forwardPdf(const domain_type u, const cache_type cache) NBL_CONST_MEMBER_FUNC
-	{
-		return bilinearPatch.forwardPdf(u,cache.bilinearCache) / sphrect.solidAngle;
-	}
-
-	weight_type forwardWeight(const domain_type u, const cache_type cache) NBL_CONST_MEMBER_FUNC
-	{
-        NBL_IF_CONSTEXPR (UsePdfAsWeight)
-            return forwardPdf(u,cache);
-        return backwardWeight(cache.L);
-	}
-
-	weight_type backwardWeight(const codomain_type L) NBL_CONST_MEMBER_FUNC
-	{
-		NBL_IF_CONSTEXPR(UsePdfAsWeight)
-		{
+   using scalar_type  = T;
+   using vector2_type = vector<T, 2>;
+   using vector3_type = vector<T, 3>;
+   using vector4_type = vector<T, 4>;
+
+   // BackwardTractableSampler concept types
+   using domain_type   = vector2_type;
+   using codomain_type = vector3_type;
+   using density_type  = scalar_type;
+   using weight_type   = density_type;
+
+   struct cache_type
+   {
+      typename Bilinear<scalar_type>::cache_type bilinearCache;
+      vector3_type L; // TODO: same as projected triangle w.r.t. UsePdfAsWeight==false
+   };
+
+   // Shared finalization for both create() overloads: builds the bilinear patch, the inner sphrect
+   // sampler, and the UsePdfAsWeight=false extras. The two overloads differ only in how they
+   // compute bxdfPdfAtVertex (worldspace corner normalizations vs local-frame rsqrt(lenSq)).
+   static ProjectedSphericalRectangle<T, UsePdfAsWeight> create(NBL_CONST_REF_ARG(shapes::SphericalRectangle<T>) shape, NBL_CONST_REF_ARG(typename shapes::SphericalRectangle<T>::solid_angle_type) sa,
+      const vector4_type bxdfPdfAtVertex, const vector3_type _receiverNormal)
+   {
+      ProjectedSphericalRectangle<T, UsePdfAsWeight> retval;
+      retval.bilinearPatch = Bilinear<scalar_type>::create(bxdfPdfAtVertex);
+      // Reuse solid_angle_type to avoid recomputing mul(basis, origin - observer)
+      retval.sphrect = SphericalRectangle<T>::create(shape.basis, sa, shape.extents);
+      NBL_IF_CONSTEXPR(!UsePdfAsWeight)
+      {
+         retval.receiverNormal     = _receiverNormal;
+         const vector3_type nLocal = hlsl::mul(shape.basis, _receiverNormal);
+         retval.projSolidAngle     = shape.projectedSolidAngleFromLocal(sa.r0, nLocal);
+      }
+      return retval;
+   }
+
+   // Shouldn't produce NAN if all corners have 0 proj solid angle due to min density adds/clamps in the linear sampler
+   static ProjectedSphericalRectangle<T, UsePdfAsWeight> create(NBL_CONST_REF_ARG(shapes::CompressedSphericalRectangle<T>) compressed, const vector3_type observer, const vector3_type _receiverNormal, const bool _receiverWasBSDF)
+   {
+      // 4 normalized worldspace corners dotted with the worldspace receiver normal. Avoids the
+      // mul(basis, receiverNormal) data dependency of the uncompressed overload so these 4
+      // normalize+dot chains can pipeline alongside the basis/solid-angle work below.
+      const vector3_type c0   = compressed.origin - observer;
+      const vector3_type c1   = c0 + compressed.right;
+      const vector3_type c2   = c0 + compressed.up;
+      const vector3_type c3   = c1 + compressed.up;
+      const vector4_type dots = vector4_type(
+         hlsl::dot(hlsl::normalize(c0), _receiverNormal),
+         hlsl::dot(hlsl::normalize(c1), _receiverNormal),
+         hlsl::dot(hlsl::normalize(c2), _receiverNormal),
+         hlsl::dot(hlsl::normalize(c3), _receiverNormal));
+      const scalar_type minimumProjSolidAngle = scalar_type(0.0);
+      const vector4_type bxdfPdfAtVertex      = math::conditionalAbsOrMax(_receiverWasBSDF, dots, hlsl::promote<vector4_type>(minimumProjSolidAngle));
+
+      const shapes::SphericalRectangle<T> shape                         = shapes::SphericalRectangle<T>::create(compressed);
+      const typename shapes::SphericalRectangle<T>::solid_angle_type sa = shape.solidAngle(observer);
+      return create(shape, sa, bxdfPdfAtVertex, _receiverNormal);
+   }
+
+   // Shouldn't produce NAN if all corners have 0 proj solid angle due to min density adds/clamps in the linear sampler
+   static ProjectedSphericalRectangle<T, UsePdfAsWeight> create(NBL_CONST_REF_ARG(shapes::SphericalRectangle<T>) shape, const vector3_type observer, const vector3_type _receiverNormal, const bool _receiverWasBSDF)
+   {
+      // Local-frame path: unnormalized dot(corner_i, n) with n = basis * receiverNormal, then
+      // a single rsqrt<vec4>(lenSq) for all 4 corner normalizations at once.
+      const vector3_type n                                              = hlsl::mul(shape.basis, _receiverNormal);
+      const typename shapes::SphericalRectangle<T>::solid_angle_type sa = shape.solidAngle(observer);
+      const vector3_type r0                                             = sa.r0;
+
+      // All 4 corners share r0.z; x is r0.x or r0.x+ex, y is r0.y or r0.y+ey
+      const scalar_type r1x      = r0.x + shape.extents.x;
+      const scalar_type r1y      = r0.y + shape.extents.y;
+      const scalar_type base_dot = hlsl::dot(r0, n);
+      const scalar_type dx       = shape.extents.x * n.x;
+      const scalar_type dy       = shape.extents.y * n.y;
+      const vector4_type dots    = vector4_type(base_dot, base_dot + dx, base_dot + dy, base_dot + dx + dy);
+
+      const scalar_type r0zSq  = r0.z * r0.z;
+      const vector4_type lenSq = vector4_type(
+                                    r0.x * r0.x + r0.y * r0.y,
+                                    r1x * r1x + r0.y * r0.y,
+                                    r0.x * r0.x + r1y * r1y,
+                                    r1x * r1x + r1y * r1y) +
+         hlsl::promote<vector4_type>(r0zSq);
+
+      // dot(normalize(corner), n) = dot(corner, n) * rsqrt(lenSq). Bilinear corners: [0]=v00 [1]=v10 [2]=v01 [3]=v11
+      const scalar_type minimumProjSolidAngle = scalar_type(0.0);
+      const vector4_type bxdfPdfAtVertex      = math::conditionalAbsOrMax(_receiverWasBSDF,
+         dots * hlsl::rsqrt<vector4_type>(lenSq),
+         hlsl::promote<vector4_type>(minimumProjSolidAngle));
+
+      return create(shape, sa, bxdfPdfAtVertex, _receiverNormal);
+   }
+
+   // returns a normalized 3D direction in the local frame
+   codomain_type generateNormalizedLocal(const domain_type u, NBL_REF_ARG(cache_type) cache, NBL_REF_ARG(scalar_type) hitDist) NBL_CONST_MEMBER_FUNC
+   {
+      const vector2_type warped = bilinearPatch.generate(u, cache.bilinearCache);
+      typename SphericalRectangle<scalar_type>::cache_type sphrectCache; // there's nothing in the cache
+      const vector3_type dir = sphrect.generateNormalizedLocal(warped, sphrectCache, hitDist);
+      NBL_IF_CONSTEXPR(!UsePdfAsWeight)
+      cache.L = hlsl::mul(hlsl::transpose(sphrect.basis), dir);
+      return dir;
+   }
+
+   // returns a unnormalized 3D direction in the global frame
+   codomain_type generateUnnormalized(const domain_type u, NBL_REF_ARG(cache_type) cache) NBL_CONST_MEMBER_FUNC
+   {
+      const vector2_type warped = bilinearPatch.generate(u, cache.bilinearCache);
+      typename SphericalRectangle<scalar_type>::cache_type sphrectCache; // there's nothing in the cache
+      const vector3_type dir = sphrect.generateUnnormalized(warped, sphrectCache);
+      NBL_IF_CONSTEXPR(!UsePdfAsWeight)
+      cache.L = dir * hlsl::rsqrt(hlsl::dot(dir, dir));
+      return dir;
+   }
+
+   // returns a normalized 3D direction in the global frame
+   codomain_type generate(const domain_type u, NBL_REF_ARG(cache_type) cache) NBL_CONST_MEMBER_FUNC
+   {
+      const vector2_type warped = bilinearPatch.generate(u, cache.bilinearCache);
+      typename SphericalRectangle<scalar_type>::cache_type sphrectCache; // there's nothing in the cache
+      const vector3_type dir = sphrect.generate(warped, sphrectCache);
+      NBL_IF_CONSTEXPR(!UsePdfAsWeight)
+      cache.L = dir;
+      return dir;
+   }
+
+   density_type forwardPdf(const domain_type u, const cache_type cache) NBL_CONST_MEMBER_FUNC
+   {
+      return bilinearPatch.forwardPdf(u, cache.bilinearCache) / sphrect.solidAngle;
+   }
+
+   weight_type forwardWeight(const domain_type u, const cache_type cache) NBL_CONST_MEMBER_FUNC
+   {
+      NBL_IF_CONSTEXPR(UsePdfAsWeight)
+      return forwardPdf(u, cache);
+      return backwardWeight(cache.L);
+   }
+
+   weight_type backwardWeight(const codomain_type L) NBL_CONST_MEMBER_FUNC
+   {
+      NBL_IF_CONSTEXPR(UsePdfAsWeight)
+      {
 #if 0
 			const vector2_type warped = sphrect.generateInvese(L); // TODO: implement `generateInverse`
 			return bilinearPatch.backwardPdf(warped) / sphrect.solidAngle;
 #endif
-			return 0.f/0.f;
-		}
-        // make the MIS weight always abs because even when receiver is a BRDF, the samples in lower hemisphere will get killed and MIS weight never used
-        return hlsl::abs(hlsl::dot(L,receiverNormal))/projSolidAngle;
-	}
-
-	sampling::SphericalRectangle<T> sphrect;
-	Bilinear<scalar_type> bilinearPatch;
-	// TODO: same as projected triangle w.r.t. UsePdfAsWeight==false
-	vector3_type receiverNormal;
-	vector3_type projSolidAngle;
+         return hlsl::numeric_limits<weight_type>::quiet_NaN;
+      }
+      // make the MIS weight always abs because even when receiver is a BRDF, the samples in lower hemisphere will get killed and MIS weight never used
+      return hlsl::abs(hlsl::dot(L, receiverNormal)) / projSolidAngle;
+   }
+
+   sampling::SphericalRectangle<T> sphrect;
+   Bilinear<scalar_type> bilinearPatch;
+   // TODO: same as projected triangle w.r.t. UsePdfAsWeight==false
+   vector3_type receiverNormal;
+   scalar_type projSolidAngle;
 };
 
 } // namespace sampling
diff --git a/include/nbl/builtin/hlsl/sampling/projected_spherical_triangle.hlsl b/include/nbl/builtin/hlsl/sampling/projected_spherical_triangle.hlsl
index a542357ead..6eddd03e8a 100644
--- a/include/nbl/builtin/hlsl/sampling/projected_spherical_triangle.hlsl
+++ b/include/nbl/builtin/hlsl/sampling/projected_spherical_triangle.hlsl
@@ -55,7 +55,7 @@ struct ProjectedSphericalTriangle
     static ProjectedSphericalTriangle<T,UsePdfAsWeight> create(NBL_REF_ARG(shapes::SphericalTriangle<T>) shape, const vector3_type _receiverNormal, const bool _receiverWasBSDF)
     {
         ProjectedSphericalTriangle<T,UsePdfAsWeight> retval;
-        retval.sphtri = SphericalTriangle<T, UsePdfAsWeight>::create(shape);
+        retval.sphtri = SphericalTriangle<T>::create(shape);
 
         const scalar_type minimumProjSolidAngle = 0.0;
         matrix<T, 3, 3> m = matrix<T, 3, 3>(shape.vertices[0], shape.vertices[1], shape.vertices[2]);
@@ -83,7 +83,7 @@ struct ProjectedSphericalTriangle
 
     density_type forwardPdf(const domain_type u, const cache_type cache) NBL_CONST_MEMBER_FUNC
     {
-        return sphtri.getRcpSolidAngle() * bilinearPatch.forwardPdf(u,cache.bilinearCache);
+        return sphtri.rcpSolidAngle * bilinearPatch.forwardPdf(u,cache.bilinearCache);
     }
 
     weight_type forwardWeight(const domain_type u, const cache_type cache) NBL_CONST_MEMBER_FUNC
@@ -98,17 +98,17 @@ struct ProjectedSphericalTriangle
         NBL_IF_CONSTEXPR (UsePdfAsWeight)
         {
             const vector2_type u = sphtri.generateInverse(L);
-            return sphtri.getRcpSolidAngle() * bilinearPatch.backwardPdf(u);
+            return sphtri.rcpSolidAngle * bilinearPatch.backwardPdf(u);
         }
         // make the MIS weight always abs because even when receiver is a BRDF, the samples in lower hemisphere will get killed and MIS weight never used
         return hlsl::abs(hlsl::dot(L,receiverNormal))/projSolidAngle;
     }
 
-    sampling::SphericalTriangle<T, UsePdfAsWeight> sphtri;
+    sampling::SphericalTriangle<T> sphtri;
     Bilinear<scalar_type> bilinearPatch;
     // TODO: erase when UsePdfAsWeight==false
     vector3_type receiverNormal;
-    vector3_type projSolidAngle;
+    scalar_type projSolidAngle;
 };
 
 } // namespace sampling
diff --git a/include/nbl/builtin/hlsl/sampling/spherical_rectangle.hlsl b/include/nbl/builtin/hlsl/sampling/spherical_rectangle.hlsl
index 8029205384..6873362b1a 100644
--- a/include/nbl/builtin/hlsl/sampling/spherical_rectangle.hlsl
+++ b/include/nbl/builtin/hlsl/sampling/spherical_rectangle.hlsl
@@ -19,202 +19,212 @@ namespace hlsl
 namespace sampling
 {
 
-template<typename T>
+// UseRealSinCos=true  (default): math::sincos uses real sin+cos; full precision near au=n*pi, Jacobian test passes.
+// UseRealSinCos=false          : math::sincos uses cos + sqrt(1-c*c); saves one special-function op but loses mantissa as |cos_au| -> 1.
+template<typename T, bool UseRealSinCos = true>
 struct SphericalRectangle
 {
-    using scalar_type = T;
-    using vector2_type = vector<T, 2>;
-    using vector3_type = vector<T, 3>;
-    using vector4_type = vector<T, 4>;
-    using matrix3x3_type = matrix<T, 3, 3>;
-
-    // BackwardTractableSampler concept types
-    using domain_type = vector2_type;
-    using codomain_type = vector3_type;
-    using density_type = scalar_type;
-    using weight_type = density_type;
-
-    struct cache_type {};
-
-    static SphericalRectangle<T> create(NBL_CONST_REF_ARG(shapes::SphericalRectangle<T>) rect, const vector3_type observer)
-    {
-        return create(rect.basis,rect.solidAngle(observer),rect.extents);
-    }
-
-    static SphericalRectangle<T> create(const matrix3x3_type _basis, NBL_CONST_REF_ARG(typename shapes::SphericalRectangle<T>::solid_angle_type) sa, const vector2_type _extents)
-    {
-        SphericalRectangle<T> retval;
-        retval.basis = _basis;
-        retval.r0 = sa.r0;
-        retval.extents = _extents;
-
-        retval.solidAngle = sa.value;
-        retval.b0 = sa.n_z[0];
-        retval.b1 = sa.n_z[2];
-
-        math::sincos_accumulator<scalar_type> angle_adder = math::sincos_accumulator<scalar_type>::create(sa.cosGamma[2]);
-        angle_adder.addCosine(sa.cosGamma[3]);
-        retval.k = scalar_type(2.0) * numbers::pi<scalar_type> - angle_adder.getSumOfArccos();
-
-        return retval;
-    }
-
-    // Create directly from a local-frame corner position and rectangle extents.
-    // Use when you already know r0 (e.g. from a gnomonic projection) and don't
-    // need the shapes::SphericalRectangle + solidAngle(observer) roundtrip.
-    static SphericalRectangle<T> create(const matrix3x3_type _basis, const vector3_type _r0, const vector2_type _extents)
-    {
-        // Same math as shapes::SphericalRectangle::solidAngle() but without
-        // the mul(basis, origin - observer) step since we already have r0.
-        typename shapes::SphericalRectangle<T>::solid_angle_type sa;
-        sa.r0 = _r0;
-
-        const scalar_type zSq = _r0.z * _r0.z;
-        const vector4_type denorm_n_z = vector4_type(-_r0.y, _r0.x + _extents.x, _r0.y + _extents.y, -_r0.x);
-        sa.n_z = denorm_n_z * hlsl::rsqrt<vector4_type>(hlsl::promote<vector4_type>(zSq) + denorm_n_z * denorm_n_z);
-        sa.cosGamma = vector4_type(
-            -sa.n_z[0] * sa.n_z[1], -sa.n_z[1] * sa.n_z[2],
-            -sa.n_z[2] * sa.n_z[3], -sa.n_z[3] * sa.n_z[0]);
-
-        math::sincos_accumulator<scalar_type> acc = math::sincos_accumulator<scalar_type>::create(sa.cosGamma[0]);
-        acc.addCosine(sa.cosGamma[1]);
-        acc.addCosine(sa.cosGamma[2]);
-        acc.addCosine(sa.cosGamma[3]);
-        sa.value = acc.getSumOfArccos() - scalar_type(2.0) * numbers::pi<scalar_type>;
-
-        return create(_basis, sa, _extents);
-    }
-
-    // shared core of generate and generateSurfaceOffset
-    // returns (xu, hv, d) packed into a vector3; caller derives either 2D offset or 3D direction
-    struct SCommonGen
-    {
-        scalar_type xu;
-        scalar_type d2;
-        scalar_type hv;
-        scalar_type cosElevation2;
-    };
-    SCommonGen __generate(const domain_type u) NBL_CONST_MEMBER_FUNC
-    {
-        SCommonGen retval;
-
-        // algorithm needs r0.z < 0; use -abs(r0.z) without storing the flip
-        const scalar_type negAbsR0z = -hlsl::abs(r0.z);
-        const scalar_type r0zSq = r0.z * r0.z;
-        const vector2_type r1 = vector2_type(r0.x + extents.x, r0.y + extents.y);
-
-        const scalar_type au = u.x * solidAngle + k;
-        const scalar_type cos_au = hlsl::cos<scalar_type>(au);
-        const scalar_type numerator = b1 - cos_au * b0;
-        // (1-cos)*(1+cos) avoids catastrophic cancellation of 1-cos^2 when cos_au is near +/-1
-        const scalar_type sin_au_sq = (scalar_type(1.0) - cos_au) * (scalar_type(1.0) + cos_au);
-        const scalar_type absNegFu = hlsl::abs(numerator) * hlsl::rsqrt<scalar_type>(sin_au_sq);
-        const scalar_type rcpCu_2 = hlsl::max<scalar_type>(absNegFu * absNegFu + b0 * b0, scalar_type(1.0));
-        // sign(negFu) = sign(numerator) * sign(sin(au)); sin(au) < 0 iff au > PI
-        const scalar_type negFuSign = hlsl::select((au > numbers::pi<scalar_type>) != (numerator < scalar_type(0.0)), scalar_type(-1.0), scalar_type(1.0));
-        retval.xu = negAbsR0z * negFuSign * hlsl::rsqrt<scalar_type>(rcpCu_2 - scalar_type(1.0));
-        retval.xu = hlsl::clamp<scalar_type>(retval.xu, r0.x, r1.x); // avoid Infs
-        // TODO: see if we can compute this direct from definition of `d_2` ignoring the clamp on `xu`  and `cosElevation2` and reclamping this result in a different way
-        // with `d2 = r0zSq*(rcpCu_2*rcpCu_2-rcpCu_2)` or `d2 = r0zSq*rcpCu_2*(rcpCu_2-1)`
-        retval.d2 = retval.xu * retval.xu + r0zSq;
-
-        const scalar_type h0 = r0.y * hlsl::rsqrt<scalar_type>(retval.d2 + r0.y * r0.y);
-        const scalar_type h1 = r1.y * hlsl::rsqrt<scalar_type>(retval.d2 + r1.y * r1.y);
-        retval.hv = h0 + u.y * (h1 - h0);
-        retval.cosElevation2 = scalar_type(1.0) - hlsl::min<scalar_type>(retval.hv * retval.hv,1);
-
-        return retval;
-    }
-
-    // returns a normalized 3D direction in the local frame with correct r0.z sign
-    vector3_type generateNormalizedLocal(const domain_type u, NBL_REF_ARG(cache_type) cache, NBL_REF_ARG(scalar_type) hitDist) NBL_CONST_MEMBER_FUNC
-    {
-        const SCommonGen core = __generate(u);
-        scalar_type cosElevationOverD = hlsl::rsqrt(core.d2/core.cosElevation2);
-        // TODO: or shall we do some other more sophisticated clamp or correction? Is this even the right one to use?
-        cosElevationOverD = hlsl::select(hlsl::isnan(cosElevationOverD),1.f,cosElevationOverD);
-
-        // TODO: investigate if due to precision we need to compute this as a `sqrt` then the quantity being computed is `core.cosElevation2/core.d2`
-        // which what alss `generateLocalBasisXY` needs, in which case `__generate` can already compute it and return it as `hitDist2`
-        hitDist = 1.f/cosElevationOverD;
-
-        const vector3_type retval = vector3_type(core.xu / hitDist, core.hv, r0.z / hitDist);
-        assert(!hlsl::isnan(computeHitT(retval)));
-        return retval;
-    }
-
-    codomain_type generate(const domain_type u, NBL_REF_ARG(cache_type) cache) NBL_CONST_MEMBER_FUNC
-    {
-        scalar_type dummy;
-        const vector3_type localL = generateNormalizedLocal(u,cache,dummy);
-        // could return `hlsl::mul(hlsl::tranpose(basis),localL)` or just this
-        return basis[0]*localL[0]+basis[1]*localL[1]+basis[2]*localL[2];
-    }
-
-    // utility to determine maxT for a ray from L shot from the origin which we're sure intersects the rectangle
-    scalar_type computeHitT(const vector3_type L)
-    {
-        const scalar_type retval = hlsl::abs(r0.z/hlsl::dot(L,basis[2]));
-        {
-            const vector3_type hitPointRelative = L*retval;
-            const vector2_type uv = mul(basis,L).xy-r0.xy;
-            assert(uv[0]>=0.0 && uv[0]<=rectExtents[0]);
-            assert(uv[1]>=0.0 && uv[1]<=rectExtents[1]);
-        }
-        return retval;
-    }
-
-    // returns a 2D offset on the rectangle surface including the r0 corner -useful for generating unnormalized worldspace L
-    vector2_type generateLocalBasisXY(const domain_type u, NBL_REF_ARG(cache_type) cache) NBL_CONST_MEMBER_FUNC
-    {
-        const SCommonGen core = __generate(u);
-        const scalar_type r1y = r0.y + extents.y;
-        // TODO: see if we can compute this direct from definition of `d_2` ignoring the clamp on `xu`  and `cosElevation2`
-        const scalar_type yv = core.hv * hlsl::rsqrt(core.cosElevation2/core.d2);
-
-        // fun fact, when one of the operands to min or max is NaN, the SPIR-V builtin will select the other one
-        // TODO: maybe try just `min(yv,ry1)`
-        const vector2_type retval = vector2_type(core.xu,hlsl::clamp(yv,r0.y,r1y));
-        assert(retval[0] >= r0.x && retval[1] >= r0.y);
-        assert(retval[0] <= r0.x+rectExtents[0] && retval[1] <= r0.y+rectExtents[1]);
-        return retval;
-    }
-
-    // its basically the hitpoint minus the observer origin
-    codomain_type generateUnnormalized(const domain_type u, NBL_REF_ARG(cache_type) cache) NBL_CONST_MEMBER_FUNC
-    {
-        const vector2_type localXY = generateLocalBasisXY(u,cache);
-        // the `localXY` already contains r0.xy
-        return basis[0]*localXY[0]+basis[1]*localXY[1]+basis[2]*r0.z;
-    }
-
-    density_type forwardPdf(const domain_type u, const cache_type cache) NBL_CONST_MEMBER_FUNC
-    {
-        return scalar_type(1.0) / solidAngle;
-    }
-
-    weight_type forwardWeight(const domain_type u, const cache_type cache) NBL_CONST_MEMBER_FUNC
-    {
-        return forwardPdf(u, cache);
-    }
-
-    density_type backwardPdf(const codomain_type L) NBL_CONST_MEMBER_FUNC
-    {
-        return scalar_type(1.0) / solidAngle;
-    }
-
-    weight_type backwardWeight(const codomain_type L) NBL_CONST_MEMBER_FUNC
-    {
-        return backwardPdf(L);
-    }
-
-    matrix3x3_type basis;
-    vector3_type r0;
-    vector2_type extents;
-    scalar_type solidAngle;
-    scalar_type k;
-    scalar_type b0;
-    scalar_type b1;
+   using scalar_type    = T;
+   using vector2_type   = vector<T, 2>;
+   using vector3_type   = vector<T, 3>;
+   using vector4_type   = vector<T, 4>;
+   using matrix3x3_type = matrix<T, 3, 3>;
+
+   // BackwardTractableSampler concept types
+   using domain_type   = vector2_type;
+   using codomain_type = vector3_type;
+   using density_type  = scalar_type;
+   using weight_type   = density_type;
+
+   struct cache_type
+   {
+   };
+
+   static SphericalRectangle<T, UseRealSinCos> create(NBL_CONST_REF_ARG(shapes::SphericalRectangle<T>) rect, const vector3_type observer)
+   {
+      return create(rect.basis, rect.solidAngle(observer), rect.extents);
+   }
+
+   static SphericalRectangle<T, UseRealSinCos> create(const matrix3x3_type _basis, NBL_CONST_REF_ARG(typename shapes::SphericalRectangle<T>::solid_angle_type) sa, const vector2_type _extents)
+   {
+      SphericalRectangle<T, UseRealSinCos> retval;
+      retval.basis   = _basis;
+      retval.r0      = sa.r0;
+      retval.extents = _extents;
+
+      retval.solidAngle = sa.value;
+      retval.b0         = sa.n_z[0];
+      retval.b1         = sa.n_z[2];
+
+      math::sincos_accumulator<scalar_type> angle_adder = math::sincos_accumulator<scalar_type>::create(sa.cosGamma[2]);
+      angle_adder.addCosine(sa.cosGamma[3]);
+      retval.k = scalar_type(2.0) * numbers::pi<scalar_type> - angle_adder.getSumOfArccos();
+
+      return retval;
+   }
+
+   // Create directly from a local-frame corner position and rectangle extents.
+   // Use when you already know r0 (e.g. from a gnomonic projection) and don't
+   // need the shapes::SphericalRectangle + solidAngle(observer) roundtrip.
+   static SphericalRectangle<T, UseRealSinCos> create(const matrix3x3_type _basis, const vector3_type _r0, const vector2_type _extents)
+   {
+      // Same math as shapes::SphericalRectangle::solidAngle() but without
+      // the mul(basis, origin - observer) step since we already have r0.
+      typename shapes::SphericalRectangle<T>::solid_angle_type sa;
+      sa.r0 = _r0;
+
+      const scalar_type zSq         = _r0.z * _r0.z;
+      const vector4_type denorm_n_z = vector4_type(-_r0.y, _r0.x + _extents.x, _r0.y + _extents.y, -_r0.x);
+      sa.n_z                        = denorm_n_z * hlsl::rsqrt<vector4_type>(hlsl::promote<vector4_type>(zSq) + denorm_n_z * denorm_n_z);
+      sa.cosGamma                   = vector4_type(
+         -sa.n_z[0] * sa.n_z[1], -sa.n_z[1] * sa.n_z[2],
+         -sa.n_z[2] * sa.n_z[3], -sa.n_z[3] * sa.n_z[0]);
+
+      math::sincos_accumulator<scalar_type> acc = math::sincos_accumulator<scalar_type>::create(sa.cosGamma[0]);
+      acc.addCosine(sa.cosGamma[1]);
+      acc.addCosine(sa.cosGamma[2]);
+      acc.addCosine(sa.cosGamma[3]);
+      sa.value = acc.getSumOfArccos() - scalar_type(2.0) * numbers::pi<scalar_type>;
+
+      return create(_basis, sa, _extents);
+   }
+
+   // shared core of generate and generateSurfaceOffset
+   // returns (xu, hv, d) packed into a vector3; caller derives either 2D offset or 3D direction
+   struct SCommonGen
+   {
+      scalar_type xu;
+      scalar_type d2;
+      scalar_type hv;
+      scalar_type cosElevation2;
+   };
+   SCommonGen __generate(const domain_type u) NBL_CONST_MEMBER_FUNC
+   {
+      SCommonGen retval;
+
+      // algorithm needs r0.z < 0; use -abs(r0.z) without storing the flip
+      const scalar_type negAbsR0z = -hlsl::abs(r0.z);
+      const scalar_type r0zSq     = r0.z * r0.z;
+      const vector2_type r1       = vector2_type(r0.x + extents.x, r0.y + extents.y);
+
+      // au in [0, 4*pi] since u.x in [0,1], solidAngle <= 2*pi, k <= 2*pi.
+      // The sqrt path in math::sincos recovers sin sign via sign(theta), which only holds for theta in [-pi,pi].
+      // Real sin/cos handle any range, so only wrap on the sqrt path (compile-time branch folds away).
+      scalar_type au = u.x * solidAngle + k;
+      NBL_IF_CONSTEXPR(!UseRealSinCos)
+      {
+         // au in [0, 4*pi] -> peel at most two 2*pi periods to land in (-pi, pi].
+         au = hlsl::select(au > numbers::pi<scalar_type>, au - scalar_type(2.0) * numbers::pi<scalar_type>, au);
+         au = hlsl::select(au > numbers::pi<scalar_type>, au - scalar_type(2.0) * numbers::pi<scalar_type>, au);
+      }
+      scalar_type sin_au, cos_au;
+      math::sincos<scalar_type, UseRealSinCos>(au, sin_au, cos_au);
+      const scalar_type numerator = b1 - cos_au * b0;
+      // negFu carries the sign directly (numerator and sin_au both signed), so xu's sign drops
+      // out of a single multiply + hlsl::sign.
+      const scalar_type negFu   = numerator / sin_au;
+      const scalar_type rcpCu_2 = hlsl::max<scalar_type>(negFu * negFu + b0 * b0, scalar_type(1.0));
+      retval.xu                 = negAbsR0z * hlsl::sign(negFu) * hlsl::rsqrt<scalar_type>(rcpCu_2 - scalar_type(1.0));
+      retval.xu = hlsl::clamp<scalar_type>(retval.xu, r0.x, r1.x); // avoid Infs
+      retval.d2 = retval.xu * retval.xu + r0zSq;
+
+      const scalar_type h0 = r0.y * hlsl::rsqrt<scalar_type>(retval.d2 + r0.y * r0.y);
+      const scalar_type h1 = r1.y * hlsl::rsqrt<scalar_type>(retval.d2 + r1.y * r1.y);
+      retval.hv            = h0 + u.y * (h1 - h0);
+      retval.cosElevation2 = scalar_type(1.0) - hlsl::min<scalar_type>(retval.hv * retval.hv, 1);
+
+      return retval;
+   }
+
+   // returns a normalized 3D direction in the local frame with correct r0.z sign
+   vector3_type generateNormalizedLocal(const domain_type u, NBL_REF_ARG(cache_type) cache, NBL_REF_ARG(scalar_type) hitDist) NBL_CONST_MEMBER_FUNC
+   {
+      const SCommonGen core         = __generate(u);
+      scalar_type cosElevationOverD = hlsl::rsqrt(core.d2 / core.cosElevation2);
+      // TODO: or shall we do some other more sophisticated clamp or correction? Is this even the right one to use?
+      cosElevationOverD = hlsl::select(hlsl::isnan(cosElevationOverD), 1.f, cosElevationOverD);
+
+      // TODO: investigate if due to precision we need to compute this as a `sqrt` then the quantity being computed is `core.cosElevation2/core.d2`
+      // which what alss `generateLocalBasisXY` needs, in which case `__generate` can already compute it and return it as `hitDist2`
+      hitDist = 1.f / cosElevationOverD;
+
+      const vector3_type retval = vector3_type(core.xu / hitDist, core.hv, r0.z / hitDist);
+      assert(!hlsl::isnan(computeHitT(retval)));
+      return retval;
+   }
+
+   codomain_type generate(const domain_type u, NBL_REF_ARG(cache_type) cache) NBL_CONST_MEMBER_FUNC
+   {
+      scalar_type dummy;
+      const vector3_type localL = generateNormalizedLocal(u, cache, dummy);
+      // could return `hlsl::mul(hlsl::tranpose(basis),localL)` or just this
+      return basis[0] * localL[0] + basis[1] * localL[1] + basis[2] * localL[2];
+   }
+
+   // utility to determine maxT for a ray from L shot from the origin which we're sure intersects the rectangle
+   scalar_type computeHitT(const vector3_type L)
+   {
+      const scalar_type retval = hlsl::abs(r0.z / hlsl::dot(L, basis[2]));
+      {
+         const vector3_type hitPointRelative = L * retval;
+         const vector2_type uv               = mul(basis, L).xy - r0.xy;
+         assert(uv[0] >= 0.0 && uv[0] <= rectExtents[0]);
+         assert(uv[1] >= 0.0 && uv[1] <= rectExtents[1]);
+      }
+      return retval;
+   }
+
+   // returns a 2D offset on the rectangle surface including the r0 corner -useful for generating unnormalized worldspace L
+   vector2_type generateLocalBasisXY(const domain_type u, NBL_REF_ARG(cache_type) cache) NBL_CONST_MEMBER_FUNC
+   {
+      const SCommonGen core = __generate(u);
+      const scalar_type r1y = r0.y + extents.y;
+      // TODO: see if we can compute this direct from definition of `d_2` ignoring the clamp on `xu`  and `cosElevation2`
+      const scalar_type yv = core.hv * hlsl::rsqrt(core.cosElevation2 / core.d2);
+
+      // fun fact, when one of the operands to min or max is NaN, the SPIR-V builtin will select the other one
+      // TODO: maybe try just `min(yv,ry1)`
+      const vector2_type retval = vector2_type(core.xu, hlsl::clamp(yv, r0.y, r1y));
+      assert(retval[0] >= r0.x && retval[1] >= r0.y);
+      assert(retval[0] <= r0.x + rectExtents[0] && retval[1] <= r0.y + rectExtents[1]);
+      return retval;
+   }
+
+   // its basically the hitpoint minus the observer origin
+   codomain_type generateUnnormalized(const domain_type u, NBL_REF_ARG(cache_type) cache) NBL_CONST_MEMBER_FUNC
+   {
+      const vector2_type localXY = generateLocalBasisXY(u, cache);
+      // the `localXY` already contains r0.xy
+      return basis[0] * localXY[0] + basis[1] * localXY[1] + basis[2] * r0.z;
+   }
+
+   density_type forwardPdf(const domain_type u, const cache_type cache) NBL_CONST_MEMBER_FUNC
+   {
+      return scalar_type(1.0) / solidAngle;
+   }
+
+   weight_type forwardWeight(const domain_type u, const cache_type cache) NBL_CONST_MEMBER_FUNC
+   {
+      return forwardPdf(u, cache);
+   }
+
+   density_type backwardPdf(const codomain_type L) NBL_CONST_MEMBER_FUNC
+   {
+      return scalar_type(1.0) / solidAngle;
+   }
+
+   weight_type backwardWeight(const codomain_type L) NBL_CONST_MEMBER_FUNC
+   {
+      return backwardPdf(L);
+   }
+
+   matrix3x3_type basis;
+   vector3_type r0;
+   vector2_type extents;
+   scalar_type solidAngle;
+   scalar_type k;
+   scalar_type b0;
+   scalar_type b1;
 };
 
 } // namespace sampling
diff --git a/include/nbl/builtin/hlsl/sampling/spherical_triangle.hlsl b/include/nbl/builtin/hlsl/sampling/spherical_triangle.hlsl
index eb714a2505..59dea3b6ea 100644
--- a/include/nbl/builtin/hlsl/sampling/spherical_triangle.hlsl
+++ b/include/nbl/builtin/hlsl/sampling/spherical_triangle.hlsl
@@ -11,6 +11,7 @@
 #include <nbl/builtin/hlsl/math/quaternions.hlsl>
 #include <nbl/builtin/hlsl/math/fast_acos.hlsl>
 #include <nbl/builtin/hlsl/shapes/spherical_triangle.hlsl>
+#include <nbl/builtin/hlsl/ieee754.hlsl>
 
 namespace nbl
 {
@@ -25,9 +26,6 @@ enum SphericalTriangleAlgorithm : uint16_t
    STA_PBRT = 1
 };
 
-template<typename T, bool Bijective = false, SphericalTriangleAlgorithm Algorithm = STA_ARVO>
-struct SphericalTriangle;
-
 namespace impl
 {
 
@@ -53,9 +51,8 @@ T sumOfProducts(T a, T b, T c, T d)
 
 } // namespace impl
 
-// Non-bijective: Resamplable & Tractable (generate + pdf/weight, no inverse)
-template<typename T, SphericalTriangleAlgorithm Algorithm>
-struct SphericalTriangle<T, false, Algorithm>
+template<typename T, SphericalTriangleAlgorithm Algorithm = STA_ARVO>
+struct SphericalTriangle
 {
    using scalar_type = T;
    using vector2_type = vector<T, 2>;
@@ -76,19 +73,20 @@ struct SphericalTriangle<T, false, Algorithm>
       retval.rcpSolidAngle = scalar_type(1.0) / tri.solid_angle;
       retval.tri_vertices[0] = tri.vertices[0];
       retval.tri_vertices[1] = tri.vertices[1];
-      retval.triCosC = tri.cos_sides[2];
+      retval.triCosc = tri.cos_sides[2];
       // precompute great circle normal of arc AC: cross(A,C) has magnitude sin(b),
       // so multiplying by csc(b) normalizes it; zero when side AC is degenerate
-      const scalar_type cscB = tri.csc_sides[1];
-      const vector3_type arcACPlaneNormal = hlsl::cross(tri.vertices[0], tri.vertices[2]) * hlsl::select(cscB < numeric_limits<scalar_type>::max, cscB, scalar_type(0));
+      const scalar_type cscb = tri.csc_sides[1];
+      const vector3_type arcACPlaneNormal = hlsl::cross(tri.vertices[0], tri.vertices[2]) * hlsl::select(cscb < numeric_limits<scalar_type>::max, cscb, scalar_type(0));
       retval.e_C = hlsl::cross(arcACPlaneNormal, tri.vertices[0]);
       retval.cosA = tri.cos_vertices[0];
       retval.sinA = tri.sin_vertices[0];
       if (Algorithm == STA_ARVO)
       {
-         retval.sinA_triCosC = retval.sinA * retval.triCosC;
+         retval.sinA_triCosc = retval.sinA * retval.triCosc;
          retval.eCdotB = hlsl::dot(retval.e_C, tri.vertices[1]);
       }
+      retval.APlusC = tri.vertices[0] + tri.vertices[2];
       return retval;
    }
 
@@ -106,21 +104,22 @@ struct SphericalTriangle<T, false, Algorithm>
       if (Algorithm == STA_ARVO) // faster than PBRT
       {
          const scalar_type u_ = t - cosA;
-         const scalar_type v_ = s + sinA_triCosC;
+         const scalar_type v_ = s + sinA_triCosc;
          const scalar_type num = (v_ * t - u_ * s) * cosA - v_;
          const scalar_type denum = (v_ * s + u_ * t) * sinA;
 
+#define ACCURATE 1
 #if ACCURATE
          // sqrt(1 - cosBp^2) loses precision when cosBp ~ 1 (small u.x).
          // Use stable factorization: sinBp = sqrt((denum-num)(denum+num)) / |denum|
-         // where denum-num = sinA*(1+triCosC)*(1-cosA_hat).
+         // where denum-num = sinA*(1+triCosc)*(1-cosA_hat).
 
          // For large triangles with high u.x, cosA_hat can approach -1,
          // making (1 + cosA_hat) near zero and the division unstable.
          // Use the algebraic identity only when cosA_hat > 0 (safe denominator).
          const scalar_type rcpDenum = scalar_type(1) / denum;
          const scalar_type oneMinusCosAhat = hlsl::select(cosA_hat > scalar_type(0), sinA_hat * sinA_hat / (scalar_type(1) + cosA_hat), scalar_type(1) - cosA_hat);
-         const scalar_type DminusN = sinA * (scalar_type(1) + triCosC) * oneMinusCosAhat;
+         const scalar_type DminusN = sinA * (scalar_type(1) + triCosc) * oneMinusCosAhat;
          sinBp = sqrt<scalar_type>(max<scalar_type>(scalar_type(0), DminusN * (denum + num))) * nbl::hlsl::abs(rcpDenum);
          cosBp = scalar_type(1) - DminusN * rcpDenum;
 #else // 17% faster, less accurate
@@ -132,10 +131,10 @@ struct SphericalTriangle<T, false, Algorithm>
       {
          // PBRT uses cosPhi = -t, sinPhi = -s (pi offset from Arvo's A_hat)
          const scalar_type k1 = -t + cosA;
-         const scalar_type k2 = -s - sinA * triCosC;
+         const scalar_type k2 = -s - sinA * triCosc;
          cosBp = (k2 + impl::differenceOfProducts(k2, -t, k1, -s) * cosA) / (impl::sumOfProducts(k2, -s, k1, -t) * sinA);
          cosBp = nbl::hlsl::clamp(cosBp, scalar_type(-1), scalar_type(1));
-         sinBp = sqrt<scalar_type>(max<scalar_type>(scalar_type(0), scalar_type(1) - cosBp * cosBp));
+         sinBp = sqrt<scalar_type>(scalar_type(1) - cosBp * cosBp);
       }
 
       // Step 3: construct C' on the great circle through A toward C
@@ -143,82 +142,24 @@ struct SphericalTriangle<T, false, Algorithm>
 
       // Step 4: uniformly sample the great circle arc from B to C'
       scalar_type cosCpB;
-      if (Algorithm == STA_ARVO)
-         cosCpB = cosBp * triCosC + sinBp * eCdotB;
+      NBL_IF_CONSTEXPR(Algorithm == STA_ARVO)
+         cosCpB = cosBp * triCosc + sinBp * eCdotB;
       else
          cosCpB = nbl::hlsl::dot(cp, tri_vertices[1]);
+      // TODO: degeneracy at u.y = 0. z = 1 - u.y*(1-cosCpB) makes sinZ = sqrt(1-z^2) behave like
+      // sqrt(u.y) near zero, so dL/du.y diverges as u.y^(-1/2) and every higher derivative diverges
+      // faster. The forward Jacobian test in 37_HLSLSamplingTests reports ~2-8% error at u.y < 0.003
+      // even with the O(h^2) one-sided stencil because the third-derivative term dominates. At
+      // u.y = 0 exactly, L collapses to vertex B for all u.x (|det J| = 0), so it's an intrinsic
+      // property of the Arvo parameterization, not a bug. Fix: rework the arc interpolation to use
+      // a u.y -> angle mapping whose derivatives stay bounded near u.y = 0 (e.g. acos(z) = angle
+      // from B, then sample arc-length linearly), so the Jacobian is smooth and the skip band in
+      // the tester can be removed.
       const scalar_type z = scalar_type(1) - u.y * (scalar_type(1) - cosCpB);
       const scalar_type sinZ = sqrt<scalar_type>(max<scalar_type>(scalar_type(0), scalar_type(1) - z * z));
       return z * tri_vertices[1] + sinZ * hlsl::normalize(cp - cosCpB * tri_vertices[1]);
    }
 
-   density_type forwardPdf(const domain_type u, const cache_type cache) NBL_CONST_MEMBER_FUNC
-   {
-      return rcpSolidAngle;
-   }
-
-   weight_type forwardWeight(const domain_type u, const cache_type cache) NBL_CONST_MEMBER_FUNC
-   {
-      return forwardPdf(u, cache);
-   }
-
-   density_type backwardPdf(const codomain_type L) NBL_CONST_MEMBER_FUNC
-   {
-      return rcpSolidAngle;
-   }
-
-   weight_type backwardWeight(const codomain_type L) NBL_CONST_MEMBER_FUNC
-   {
-      return backwardPdf(L);
-   }
-
-   scalar_type getRcpSolidAngle() NBL_CONST_MEMBER_FUNC {return rcpSolidAngle;}
-
-   scalar_type rcpSolidAngle;
-   scalar_type cosA;
-   scalar_type sinA;
-   scalar_type sinA_triCosC; // precomputed sinA * triCosC
-   scalar_type eCdotB; // precomputed dot(e_C, tri_vertices[1]), Arvo only
-
-   vector3_type tri_vertices[2]; // A and B only
-   scalar_type triCosC;
-   vector3_type e_C; // precomputed cross(arcACPlaneNormal, A), unit vector perp to A in A-C plane
-};
-
-// Bijective: adds generateInverse, stores extra members for the inverse mapping
-template<typename T, SphericalTriangleAlgorithm Algorithm>
-struct SphericalTriangle<T, true, Algorithm>
-{
-   using scalar_type = T;
-   using vector2_type = vector<T, 2>;
-   using vector3_type = vector<T, 3>;
-
-   using base_type = SphericalTriangle<T, false, Algorithm>;
-   using domain_type = vector2_type;
-   using codomain_type = vector3_type;
-   using density_type = scalar_type;
-   using weight_type = density_type;
-
-   using cache_type = typename base_type::cache_type;
-
-   static SphericalTriangle create(NBL_CONST_REF_ARG(shapes::SphericalTriangle<T>) tri)
-   {
-      SphericalTriangle retval;
-      retval.base = base_type::create(tri);
-      retval.vertexC = tri.vertices[2];
-      // precompute great circle normal of arc AC (needed for generateInverse)
-      const scalar_type cscB = tri.csc_sides[1];
-      retval.arcACPlaneNormal = hlsl::cross(tri.vertices[0], tri.vertices[2]) * hlsl::select(cscB < numeric_limits<scalar_type>::max, cscB, scalar_type(0));
-      retval.triCscC = tri.csc_sides[2];
-      return retval;
-   }
-
-   codomain_type generate(const domain_type u, NBL_REF_ARG(cache_type) cache) NBL_CONST_MEMBER_FUNC
-   {
-      return base.generate(u, cache);
-   }
-
-
    // generate() works in two steps:
    //   u.x -> pick C' on arc AC (choosing a sub-area fraction)
    //   u.y -> pick L on arc B->C' (linear interpolation)
@@ -230,15 +171,22 @@ struct SphericalTriangle<T, true, Algorithm>
    domain_type generateInverse(const codomain_type L) NBL_CONST_MEMBER_FUNC
    {
       // Step 1: find C' = intersection of great circles (B,L) and (A,C)
-      const vector3_type BxL = nbl::hlsl::cross(base.tri_vertices[1], L);
+      const vector3_type BxL = nbl::hlsl::cross(tri_vertices[1], L);
       const scalar_type sinBL_sq = nbl::hlsl::dot(BxL, BxL);
-      if (sinBL_sq < numeric_limits<scalar_type>::epsilon)
+
+      // C' lies on arc AC, so C' = A*cos(t) + e_C*sin(t).
+      // C' also lies on the B-L plane, so dot(BxL, C') = 0.
+      // Solving: (cos(t), sin(t)) = (tripleE, -tripleA) / R
+      const scalar_type tripleA = nbl::hlsl::dot(BxL, tri_vertices[0]);
+      const scalar_type tripleE = nbl::hlsl::dot(BxL, e_C);
+      const scalar_type R_sq = tripleA * tripleA + tripleE * tripleE;
+
+      if (sinBL_sq < numeric_limits<scalar_type>::epsilon || R_sq < numeric_limits<scalar_type>::epsilon)
       {
-         // L ~ B: u.y ~ 0, u.x is indeterminate (all u.x map to B when u.y=0).
          // Recover u.y from |L-B|^2 / |A-B|^2 (using C'=A; the (1-cosCpB) ratio
          // cancels so any C' gives the same result).
-         const vector3_type LminusB = L - base.tri_vertices[1];
-         const vector3_type AminusB = base.tri_vertices[0] - base.tri_vertices[1];
+         const vector3_type LminusB = L - tri_vertices[1];
+         const vector3_type AminusB = tri_vertices[0] - tri_vertices[1];
          const scalar_type v_num = nbl::hlsl::dot(LminusB, LminusB);
          const scalar_type v_denom = nbl::hlsl::dot(AminusB, AminusB);
          const scalar_type v = hlsl::select(v_denom > numeric_limits<scalar_type>::epsilon,
@@ -247,20 +195,10 @@ struct SphericalTriangle<T, true, Algorithm>
          return vector2_type(scalar_type(0.0), v);
       }
 
-      // C' lies on arc AC, so C' = A*cos(t) + e_C*sin(t).
-      // C' also lies on the B-L plane, so dot(BxL, C') = 0.
-      // Solving: (cos(t), sin(t)) = (tripleE, -tripleA) / R
-      const scalar_type tripleA = nbl::hlsl::dot(BxL, base.tri_vertices[0]);
-      const scalar_type tripleE = nbl::hlsl::dot(BxL, base.e_C);
-      const scalar_type R_sq = tripleA * tripleA + tripleE * tripleE;
-      if (R_sq < numeric_limits<scalar_type>::epsilon)
-         return vector2_type(scalar_type(0.0), scalar_type(0.0));
-
       const scalar_type rcpR = scalar_type(1.0) / nbl::hlsl::sqrt(R_sq);
-      vector3_type cp = base.tri_vertices[0] * (tripleE * rcpR) + base.e_C * (-tripleA * rcpR);
-      // two intersections exist; pick the one on the minor arc A->C
-      if (nbl::hlsl::dot(cp, base.tri_vertices[0] + vertexC) < scalar_type(0.0))
-         cp = -cp;
+      vector3_type cp = tri_vertices[0] * (tripleE * rcpR) + e_C * (-tripleA * rcpR);
+      // two intersections exist; pick the one on the minor arc A->C (branchless sign flip)
+      cp = ieee754::flipSignIfRHSNegative(cp, nbl::hlsl::dot(cp, APlusC));
 
       // Step 2: u.x = solidAngle(A,B,C') / solidAngle(A,B,C)
       // Van Oosterom-Strackee: tan(Omega/2) = |A.(BxC')| / (1 + A.B + B.C' + C'.A)
@@ -271,19 +209,19 @@ struct SphericalTriangle<T, true, Algorithm>
       // Expanding C' = cosBp*A + sinBp*e_C into the triple product:
       //   A.(BxC') = cosBp * A.(BxA) + sinBp * A.(Bxe_C) = sinBp * A.(Bxe_C)
       // since A.(BxA) = 0 identically. This avoids the cancellation.
-      const scalar_type cosBp_inv = nbl::hlsl::dot(cp, base.tri_vertices[0]);
-      const scalar_type sinBp_inv = nbl::hlsl::dot(cp, base.e_C);
-      const scalar_type AxBdotE = nbl::hlsl::dot(base.tri_vertices[0], nbl::hlsl::cross(base.tri_vertices[1], base.e_C));
+      const scalar_type cosBp_inv = nbl::hlsl::dot(cp, tri_vertices[0]);
+      const scalar_type sinBp_inv = nbl::hlsl::dot(cp, e_C);
+      const scalar_type AxBdotE = nbl::hlsl::dot(tri_vertices[0], nbl::hlsl::cross(tri_vertices[1], e_C));
       const scalar_type num = sinBp_inv * AxBdotE;
-      const scalar_type cosCpB = nbl::hlsl::dot(base.tri_vertices[1], cp);
-      const scalar_type den = scalar_type(1.0) + base.triCosC + cosCpB + cosBp_inv;
+      const scalar_type cosCpB = nbl::hlsl::dot(tri_vertices[1], cp);
+      const scalar_type den = scalar_type(1.0) + triCosc + cosCpB + cosBp_inv;
       const scalar_type subSolidAngle = scalar_type(2.0) * nbl::hlsl::atan2(nbl::hlsl::abs(num), den);
-      const scalar_type u = nbl::hlsl::clamp(subSolidAngle * base.rcpSolidAngle, scalar_type(0.0), scalar_type(1.0));
+      const scalar_type u = nbl::hlsl::clamp(subSolidAngle * rcpSolidAngle, scalar_type(0.0), scalar_type(1.0));
 
       // Step 3: u.y = |L-B|^2 / |C'-B|^2
       // Squared Euclidean distance avoids catastrophic cancellation vs (1-dot)/(1-dot)
-      const vector3_type LminusB = L - base.tri_vertices[1];
-      const vector3_type cpMinusB = cp - base.tri_vertices[1];
+      const vector3_type LminusB = L - tri_vertices[1];
+      const vector3_type cpMinusB = cp - tri_vertices[1];
       const scalar_type v_num = nbl::hlsl::dot(LminusB, LminusB);
       const scalar_type v_denom = nbl::hlsl::dot(cpMinusB, cpMinusB);
       const scalar_type v = hlsl::select(v_denom > numeric_limits<scalar_type>::epsilon,
@@ -296,7 +234,7 @@ struct SphericalTriangle<T, true, Algorithm>
 
    density_type forwardPdf(const domain_type u, const cache_type cache) NBL_CONST_MEMBER_FUNC
    {
-      return base.rcpSolidAngle;
+      return rcpSolidAngle;
    }
 
    weight_type forwardWeight(const domain_type u, const cache_type cache) NBL_CONST_MEMBER_FUNC
@@ -306,20 +244,24 @@ struct SphericalTriangle<T, true, Algorithm>
 
    density_type backwardPdf(const codomain_type L) NBL_CONST_MEMBER_FUNC
    {
-      return base.rcpSolidAngle;
+      return rcpSolidAngle;
    }
 
    weight_type backwardWeight(const codomain_type L) NBL_CONST_MEMBER_FUNC
    {
       return backwardPdf(L);
    }
-   
-   scalar_type getRcpSolidAngle() NBL_CONST_MEMBER_FUNC {return base.rcpSolidAngle;}
 
-   base_type base;
-   vector3_type vertexC;
-   vector3_type arcACPlaneNormal; // precomputed normalize(cross(A, C)), great circle normal of arc AC
-   scalar_type triCscC;
+   scalar_type rcpSolidAngle;
+   scalar_type cosA;
+   scalar_type sinA;
+   scalar_type sinA_triCosc; // precomputed sinA * triCosc
+   scalar_type eCdotB; // precomputed dot(e_C, tri_vertices[1]), Arvo only
+
+   vector3_type tri_vertices[2]; // A and B only
+   scalar_type triCosc;
+   vector3_type e_C; // precomputed cross(arcACPlaneNormal, A), unit vector perp to A in A-C plane
+   vector3_type APlusC; // precomputed A + C, used to pick the minor-arc intersection in generateInverse
 };
 
 } // namespace sampling
diff --git a/include/nbl/builtin/hlsl/sampling/uniform_spheres.hlsl b/include/nbl/builtin/hlsl/sampling/uniform_spheres.hlsl
index 21901a9628..a5a14660f4 100644
--- a/include/nbl/builtin/hlsl/sampling/uniform_spheres.hlsl
+++ b/include/nbl/builtin/hlsl/sampling/uniform_spheres.hlsl
@@ -35,10 +35,11 @@ struct UniformHemisphere
 
 	static codomain_type generate(const domain_type u)
 	{
-		typename ConcentricMapping<T>::cache_type cmCache;
+		typename ConcentricMapping<scalar_type>::cache_type cmCache;
 		const vector_t2 p = ConcentricMapping<T>::generate(u, cmCache);
+		assert(cmCache.r2 <= scalar_type(1.0));
 		const T z = T(1.0) - cmCache.r2;
-		const T xyScale = hlsl::sqrt<T>(hlsl::max<T>(T(0.0), T(2.0) - cmCache.r2));
+		const T xyScale = hlsl::sqrt<scalar_type>(scalar_type(2.0) - cmCache.r2);
 		return vector_t3(p.x * xyScale, p.y * xyScale, z);
 	}
 
@@ -50,13 +51,13 @@ struct UniformHemisphere
 	static domain_type generateInverse(const codomain_type v)
 	{
 		// r_disk / r_xy = sqrt(1-z) / sqrt(1-z^2) = 1/sqrt(1+z)
-		const T scale = T(1.0) / hlsl::sqrt<T>(T(1.0) + v.z);
+		const scalar_type scale = hlsl::rsqrt<T>(T(1.0) + v.z);
 		return ConcentricMapping<T>::generateInverse(vector_t2(v.x * scale, v.y * scale));
 	}
 
 	static density_type forwardPdf(const domain_type u, const cache_type cache)
 	{
-		return T(0.5) * numbers::inv_pi<T>;
+		return scalar_type(0.5) * numbers::inv_pi<T>;
 	}
 
 	static weight_type forwardWeight(const domain_type u, const cache_type cache)

From 37cb8173e30ad23d184c2ec9724012bf1beb03ef Mon Sep 17 00:00:00 2001
From: Karim Mohamed <karimsayedre@gmail.com>
Date: Fri, 24 Apr 2026 21:27:52 +0300
Subject: [PATCH 09/12] Eytzinger CDF, alias table is packed, 2 versions, CDF
 sampler now has 3 enumed types (tracking, YOLO, Eytzinger)

---
 examples_tests                                |   2 +-
 .../builtin/hlsl/sampling/alias_table.hlsl    | 201 +++++++++++++-----
 .../hlsl/sampling/alias_table_builder.h       | 198 +++++++++++------
 .../sampling/cumulative_probability_builder.h |  55 +++++
 4 files changed, 342 insertions(+), 114 deletions(-)

diff --git a/examples_tests b/examples_tests
index fb5cfa2bca..a4559b941a 160000
--- a/examples_tests
+++ b/examples_tests
@@ -1 +1 @@
-Subproject commit fb5cfa2bcaa0a92aafb429f3d390658d28d1ca02
+Subproject commit a4559b941a9d0f465ccc8687630077e045829403
diff --git a/include/nbl/builtin/hlsl/sampling/alias_table.hlsl b/include/nbl/builtin/hlsl/sampling/alias_table.hlsl
index 15742e10f3..8e7b3249a0 100644
--- a/include/nbl/builtin/hlsl/sampling/alias_table.hlsl
+++ b/include/nbl/builtin/hlsl/sampling/alias_table.hlsl
@@ -7,6 +7,7 @@
 
 #include <nbl/builtin/hlsl/cpp_compat.hlsl>
 #include <nbl/builtin/hlsl/bit.hlsl>
+#include <nbl/builtin/hlsl/cpp_compat/intrinsics.hlsl>
 #include <nbl/builtin/hlsl/concepts/core.hlsl>
 #include <nbl/builtin/hlsl/concepts/accessors/generic_shared_data.hlsl>
 
@@ -17,84 +18,187 @@ namespace hlsl
 namespace sampling
 {
 
-// Alias Method (Vose/Walker) discrete sampler.
-//
-// Samples a discrete index in [0, N) with probability proportional to
-// precomputed weights in O(1) time per sample, using a prebuilt alias table.
-//
-// Accessor template parameters must satisfy GenericReadAccessor:
-//   accessor.template get<V, I>(index, outVal)  // void, writes to outVal
-//
-// - ProbabilityAccessor: reads scalar_type threshold in [0, 1] for bin i
-// - AliasIndexAccessor:  reads uint32_t redirect index for bin i
-// - PdfAccessor:         reads scalar_type weight[i] / totalWeight
-//
-// Satisfies TractableSampler (not BackwardTractableSampler: the mapping is discrete).
-// The cache stores the PDF value looked up during generate, avoiding redundant
-// storage of the codomain (sampled index) which is already the return value.
-template<typename T, typename Domain, typename Codomain, typename ProbabilityAccessor, typename AliasIndexAccessor, typename PdfAccessor
+// Packed alias-entry bit layout shared by every packed variant. One 32-bit
+// word holds the redirect index in the low Log2N bits and the stay-
+// probability quantized as an unorm in the high (32 - Log2N) bits.
+//   u * N  = scaled;  bin = floor(scaled);  remainder = scaled - bin
+//   if (remainder < getStayProb(word))  -> result = bin
+//   else                                -> result = getTarget(word)
+// Quantizing the threshold to (32 - Log2N) bits is precision-neutral: `u`
+// already consumed Log2N bits of randomness producing `bin`, so `remainder`
+// carries exactly that many bits of discriminatory power.
+namespace impl
+{
+template<uint32_t Log2N>
+struct AliasBitDecoder
+{
+	static uint32_t getTarget(uint32_t word)
+	{
+		return word & ((1u << Log2N) - 1u);
+	}
+	template<typename T>
+	static T getStayProb(uint32_t word)
+	{
+		const uint32_t unormMax = (~0u) >> Log2N;
+		return T(word >> Log2N) / T(unormMax);
+	}
+};
+} // namespace impl
+
+// 8 B entry used by the NBig == true variant. Embeds the bin's own pdf
+// alongside the packed word so the common stay-case needs no extra tap.
+template<typename T>
+struct PackedAliasEntryB
+{
+	uint32_t packedWord;	// low Log2N: redirect target; high 32-Log2N: stayProb unorm
+	T ownPdf;				// pdf of this bin
+};
+
+
+// NBig == false: 4 B packed word per bin + separate pdf[] array. Per sample
+// = one 4 B word load + one unconditional 4 B pdf[] tap indexed by the
+// selected bin (either the current bin or its redirect). Total 8 B whether
+// the sample stays or aliases. Favours small N.
+template<typename T, typename Domain, typename Codomain, typename PackedWordAccessor, typename PdfAccessor, uint32_t Log2N
 	NBL_PRIMARY_REQUIRES(
 		concepts::UnsignedIntegralScalar<Codomain> &&
-		concepts::accessors::GenericReadAccessor<ProbabilityAccessor, T, Codomain> &&
-		concepts::accessors::GenericReadAccessor<AliasIndexAccessor, Codomain, Codomain> &&
+		concepts::accessors::GenericReadAccessor<PackedWordAccessor, uint32_t, Codomain> &&
 		concepts::accessors::GenericReadAccessor<PdfAccessor, T, Codomain>)
-struct AliasTable
+struct PackedAliasTableA
 {
 	using scalar_type = T;
-
 	using domain_type = Domain;
 	using codomain_type = Codomain;
 	using density_type = scalar_type;
 	using weight_type = density_type;
+	using decoder = impl::AliasBitDecoder<Log2N>;
+	NBL_CONSTEXPR_STATIC_INLINE bool NBig = false;
 
 	struct cache_type
 	{
 		density_type pdf;
 	};
 
-	static AliasTable create(NBL_CONST_REF_ARG(ProbabilityAccessor) _probAccessor, NBL_CONST_REF_ARG(AliasIndexAccessor) _aliasAccessor, NBL_CONST_REF_ARG(PdfAccessor) _pdfAccessor, codomain_type _size)
+	static PackedAliasTableA create(NBL_CONST_REF_ARG(PackedWordAccessor) _entryAcc, NBL_CONST_REF_ARG(PdfAccessor) _pdfAcc, codomain_type _size)
 	{
-		AliasTable retval;
-		retval.probAccessor = _probAccessor;
-		retval.aliasAccessor = _aliasAccessor;
-		retval.pdfAccessor = _pdfAccessor;
-		// Precompute tableSize as float minus 1 ULP so that u=1.0 maps to bin N-1
+		PackedAliasTableA retval;
+		retval.entryAcc = _entryAcc;
+		retval.pdfAcc = _pdfAcc;
 		const scalar_type exact = scalar_type(_size);
 		retval.tableSizeMinusUlp = nbl::hlsl::bit_cast<scalar_type>(nbl::hlsl::bit_cast<uint32_t>(exact) - 1u);
 		return retval;
 	}
 
-	// BasicSampler interface
 	codomain_type generate(const domain_type u) NBL_CONST_MEMBER_FUNC
 	{
 		const scalar_type scaled = u * tableSizeMinusUlp;
 		const codomain_type bin = _static_cast<codomain_type>(scaled);
 		const scalar_type remainder = scaled - scalar_type(bin);
 
-		scalar_type prob;
-		probAccessor.template get<scalar_type, codomain_type>(bin, prob);
-
-		// Use if-statement to avoid select: aliasIndex is a dependent read
-		codomain_type result;
-		if (remainder < prob)
-		{
-			result = bin;
-		}
-		else
-		{
-			codomain_type alias;
-			aliasAccessor.template get<codomain_type, codomain_type>(bin, alias);
-			result = alias;
-		}
+		uint32_t packedWord;
+		entryAcc.template get<uint32_t, codomain_type>(bin, packedWord);
+		return hlsl::select(remainder < decoder::template getStayProb<scalar_type>(packedWord), bin, codomain_type(decoder::getTarget(packedWord)));
+	}
 
+	codomain_type generate(const domain_type u, NBL_REF_ARG(cache_type) cache) NBL_CONST_MEMBER_FUNC
+	{
+		const codomain_type result = generate(u);
+		pdfAcc.template get<scalar_type, codomain_type>(result, cache.pdf);
 		return result;
 	}
 
-	// TractableSampler interface
+	density_type forwardPdf(const domain_type u, NBL_CONST_REF_ARG(cache_type) cache) NBL_CONST_MEMBER_FUNC
+	{
+		return cache.pdf;
+	}
+
+	weight_type forwardWeight(const domain_type u, NBL_CONST_REF_ARG(cache_type) cache) NBL_CONST_MEMBER_FUNC
+	{
+		return cache.pdf;
+	}
+
+	density_type backwardPdf(const codomain_type v) NBL_CONST_MEMBER_FUNC
+	{
+		scalar_type pdf;
+		pdfAcc.template get<scalar_type, codomain_type>(v, pdf);
+		return pdf;
+	}
+
+	weight_type backwardWeight(const codomain_type v) NBL_CONST_MEMBER_FUNC
+	{
+		return backwardPdf(v);
+	}
+
+	PackedWordAccessor entryAcc;
+	PdfAccessor pdfAcc;
+	scalar_type tableSizeMinusUlp;
+};
+
+// NBig == true: 8 B entry {packedWord, ownPdf} + separate pdf[] array. Per
+// sample = one 8 B entry load (covers the common stay case where cache
+// already has ownPdf). If the sample aliases, a conditional 4 B pdf[target]
+// tap fills the cache. Total 8 B stay, 12 B aliased. Favours large N.
+template<typename T, typename Domain, typename Codomain, typename EntryAccessor, typename PdfAccessor, uint32_t Log2N
+	NBL_PRIMARY_REQUIRES(
+		concepts::UnsignedIntegralScalar<Codomain> &&
+		concepts::accessors::GenericReadAccessor<EntryAccessor, PackedAliasEntryB<T>, Codomain> &&
+		concepts::accessors::GenericReadAccessor<PdfAccessor, T, Codomain>)
+struct PackedAliasTableB
+{
+	using scalar_type = T;
+	using domain_type = Domain;
+	using codomain_type = Codomain;
+	using density_type = scalar_type;
+	using weight_type = density_type;
+	using entry_type = PackedAliasEntryB<scalar_type>;
+	using decoder = impl::AliasBitDecoder<Log2N>;
+	NBL_CONSTEXPR_STATIC_INLINE bool NBig = true;
+
+	struct cache_type
+	{
+		density_type pdf;
+	};
+
+	static PackedAliasTableB create(NBL_CONST_REF_ARG(EntryAccessor) _entryAcc, NBL_CONST_REF_ARG(PdfAccessor) _pdfAcc, codomain_type _size)
+	{
+		PackedAliasTableB retval;
+		retval.entryAcc = _entryAcc;
+		retval.pdfAcc = _pdfAcc;
+		const scalar_type exact = scalar_type(_size);
+		retval.tableSizeMinusUlp = nbl::hlsl::bit_cast<scalar_type>(nbl::hlsl::bit_cast<uint32_t>(exact) - 1u);
+		return retval;
+	}
+
+	codomain_type generate(const domain_type u) NBL_CONST_MEMBER_FUNC
+	{
+		const scalar_type scaled = u * tableSizeMinusUlp;
+		const codomain_type bin = _static_cast<codomain_type>(scaled);
+		const scalar_type remainder = scaled - scalar_type(bin);
+
+		entry_type entry;
+		entryAcc.template get<entry_type, codomain_type>(bin, entry);
+		return hlsl::select(remainder < decoder::template getStayProb<scalar_type>(entry.packedWord), bin, codomain_type(decoder::getTarget(entry.packedWord)));
+	}
+
 	codomain_type generate(const domain_type u, NBL_REF_ARG(cache_type) cache) NBL_CONST_MEMBER_FUNC
 	{
-		const codomain_type result = generate(u);
-		pdfAccessor.template get<scalar_type, codomain_type>(result, cache.pdf);
+		const scalar_type scaled = u * tableSizeMinusUlp;
+		const codomain_type bin = _static_cast<codomain_type>(scaled);
+		const scalar_type remainder = scaled - scalar_type(bin);
+
+		entry_type entry;
+		entryAcc.template get<entry_type, codomain_type>(bin, entry);
+
+		const bool stay = remainder < decoder::template getStayProb<scalar_type>(entry.packedWord);
+		
+		cache.pdf = entry.ownPdf;
+		codomain_type result = bin;
+		if (!stay)
+		{
+			const codomain_type target = codomain_type(decoder::getTarget(entry.packedWord));
+			pdfAcc.template get<scalar_type, codomain_type>(target, cache.pdf);
+			result = target;
+		}
 		return result;
 	}
 
@@ -111,7 +215,7 @@ struct AliasTable
 	density_type backwardPdf(const codomain_type v) NBL_CONST_MEMBER_FUNC
 	{
 		scalar_type pdf;
-		pdfAccessor.template get<scalar_type, codomain_type>(v, pdf);
+		pdfAcc.template get<scalar_type, codomain_type>(v, pdf);
 		return pdf;
 	}
 
@@ -120,9 +224,8 @@ struct AliasTable
 		return backwardPdf(v);
 	}
 
-	ProbabilityAccessor probAccessor;
-	AliasIndexAccessor aliasAccessor;
-	PdfAccessor pdfAccessor;
+	EntryAccessor entryAcc;
+	PdfAccessor pdfAcc;
 	scalar_type tableSizeMinusUlp;
 };
 
diff --git a/include/nbl/builtin/hlsl/sampling/alias_table_builder.h b/include/nbl/builtin/hlsl/sampling/alias_table_builder.h
index d02d21488c..2c7c53fd5f 100644
--- a/include/nbl/builtin/hlsl/sampling/alias_table_builder.h
+++ b/include/nbl/builtin/hlsl/sampling/alias_table_builder.h
@@ -5,7 +5,12 @@
 #ifndef _NBL_BUILTIN_HLSL_SAMPLING_ALIAS_TABLE_BUILDER_H_INCLUDED_
 #define _NBL_BUILTIN_HLSL_SAMPLING_ALIAS_TABLE_BUILDER_H_INCLUDED_
 
+#include <cmath>
 #include <cstdint>
+#include <span>
+#include <vector>
+
+#include <nbl/builtin/hlsl/sampling/alias_table.hlsl>
 
 namespace nbl
 {
@@ -15,74 +20,139 @@ namespace sampling
 {
 
 // Builds the alias table from an array of non-negative weights.
-// All output arrays must be pre-allocated to N entries.
 //
-// Parameters:
-//   weights         - input weights (non-negative, at least one must be > 0)
-//   N               - number of entries
-//   outProbability  - [out] alias table probability threshold per bin, in [0, 1]
-//   outAlias        - [out] alias redirect index per bin
-//   outPdf          - [out] normalized PDF per entry: weight[i] / sum(weights)
-//   workspace       - scratch buffer of N uint32_t entries
+// When `weights.size()` is a power of two, the builder transparently appends
+// one zero-weight dummy bucket so the GPU-facing table size is N+1 (odd),
+// which breaks PoT-periodic address patterns that alias memory channels /
+// cache sets on most GPUs. The sampled distribution is unchanged, the dummy
+// has stayProb = 0 and always redirects to a real donor.
+//
+// Output vectors are `resize`d by the builder to the final table size, so
+// the caller just passes (possibly empty) vectors and reads back the
+// returned size. That returned value is what to pass to the sampler's
+// `_size` argument and to use when packing / uploading.
 template<typename T>
 struct AliasTableBuilder
 {
-	static void build(std::span<const T> weights, T* outProbability, uint32_t* outAlias, T* outPdf, uint32_t* workspace)
-	{
-		T totalWeight = T(0);
-      for (uint32_t i = 0; i < weights.size(); i++)
-			totalWeight += weights[i];
-
-		const T rcpTotalWeight = T(1) / totalWeight;
-
-		// Compute PDFs, scaled probabilities, and partition into small/large in one pass
-		uint32_t smallEnd = 0;
-      uint32_t largeBegin = weights.size();
-      for (uint32_t i = 0; i < weights.size(); i++)
-		{
-			outPdf[i] = weights[i] * rcpTotalWeight;
-         outProbability[i] = outPdf[i] * T(weights.size());
-
-			if (outProbability[i] < T(1))
-				workspace[smallEnd++] = i;
-			else
-				workspace[--largeBegin] = i;
-		}
-
-		// Pair small and large entries
-      while (smallEnd > 0 && largeBegin < weights.size())
-		{
-			const uint32_t s = workspace[--smallEnd];
-			const uint32_t l = workspace[largeBegin];
-
-			outAlias[s] = l;
-			// outProbability[s] already holds the correct probability for bin s
-
-			outProbability[l] -= (T(1) - outProbability[s]);
-
-			if (outProbability[l] < T(1))
-			{
-				// l became small: pop from large, push to small
-				largeBegin++;
-				workspace[smallEnd++] = l;
-			}
-			// else l stays in large (don't pop, reuse next iteration)
-		}
-
-		// Remaining entries (floating point rounding artifacts)
-		while (smallEnd > 0)
-		{
-			const uint32_t s = workspace[--smallEnd];
-			outProbability[s] = T(1);
-			outAlias[s] = s;
-		}
-      while (largeBegin < weights.size())
-		{
-			const uint32_t l = workspace[largeBegin++];
-			outProbability[l] = T(1);
-			outAlias[l] = l;
-		}
-	}
+   // Ugly but much faster: we better ensure the table size is not a power of
+   // two, so we pad with +1 zero-weight dummy bucket when needed. PoT-sized
+   // alias tables hit GPU memory channel / cache set aliasing that can be
+   // wildly (sometimes 2x+) slower than a nearby non-PoT size. Builder owns
+   // all the sizing (resizes the output vectors, allocates its own scratch),
+   // so the caller can't get it wrong.
+   static uint32_t build(std::span<const T> weights, std::vector<T>& outProbability, std::vector<uint32_t>& outAlias, std::vector<T>& outPdf)
+   {
+      const uint32_t userN  = static_cast<uint32_t>(weights.size());
+      const uint32_t tableN = (userN > 1u && (userN & (userN - 1u)) == 0u) ? (userN + 1u) : userN;
+
+      outProbability.resize(tableN);
+      outAlias.resize(tableN);
+      outPdf.resize(tableN);
+      std::vector<uint32_t> workspace(tableN);
+
+      T totalWeight = T(0);
+      for (uint32_t i = 0; i < userN; i++)
+         totalWeight += weights[i];
+
+      const T rcpTotalWeight = T(1) / totalWeight;
+
+      // Compute PDFs, scaled probabilities, and partition into small/large in one pass
+      uint32_t smallEnd   = 0u;
+      uint32_t largeBegin = tableN;
+      for (uint32_t i = 0; i < userN; i++)
+      {
+         outPdf[i]         = weights[i] * rcpTotalWeight;
+         outProbability[i] = outPdf[i] * T(tableN);
+
+         if (outProbability[i] < T(1))
+            workspace[smallEnd++] = i;
+         else
+            workspace[--largeBegin] = i;
+      }
+      // PoT dodge tail: one zero-weight dummy at index userN, always in the small list.
+      if (tableN != userN)
+      {
+         outPdf[userN]         = T(0);
+         outProbability[userN] = T(0);
+         workspace[smallEnd++] = userN;
+      }
+
+      // Pair small and large entries
+      while (smallEnd > 0u && largeBegin < tableN)
+      {
+         const uint32_t s = workspace[--smallEnd];
+         const uint32_t l = workspace[largeBegin];
+
+         outAlias[s] = l;
+         // outProbability[s] already holds the correct probability for bin s
+
+         outProbability[l] -= (T(1) - outProbability[s]);
+
+         if (outProbability[l] < T(1))
+         {
+            // l became small: pop from large, push to small
+            largeBegin++;
+            workspace[smallEnd++] = l;
+         }
+         // else l stays in large (don't pop, reuse next iteration)
+      }
+
+      // Remaining entries (floating point rounding artifacts)
+      while (smallEnd > 0u)
+      {
+         const uint32_t s  = workspace[--smallEnd];
+         outProbability[s] = T(1);
+         outAlias[s]       = s;
+      }
+      while (largeBegin < tableN)
+      {
+         const uint32_t l  = workspace[largeBegin++];
+         outProbability[l] = T(1);
+         outAlias[l]       = l;
+      }
+
+      return tableN;
+   }
+
+   // Pack (target, stayProb) into a single 32-bit word with Log2N bits for
+   // target and (32 - Log2N) bits for the unorm-quantized threshold. Used by
+   // every packed variant; each packX() below calls this on a per-entry basis.
+   template<uint32_t Log2N>
+   static uint32_t packWord(uint32_t target, T stayProb)
+   {
+      const uint32_t targetMask = (Log2N == 32u) ? ~0u : ((1u << Log2N) - 1u);
+      const T        clamped    = stayProb < T(0) ? T(0) : (stayProb > T(1) ? T(1) : stayProb);
+      const uint32_t unormMax   = (Log2N == 0u) ? ~0u : ((~0u) >> Log2N);
+      const uint32_t probUnorm  = static_cast<uint32_t>(std::round(static_cast<double>(clamped) * static_cast<double>(unormMax)));
+      return (target & targetMask) | (probUnorm << Log2N);
+   }
+
+   // Variant A, pack SoA outputs into an array of 4 B packed words. The
+   // pdf[] array is consumed directly by the sampler as a second accessor.
+   // outWords must be pre-allocated to N uint32_t entries.
+   template<uint32_t Log2N>
+   static void packA(std::span<const T> probability, std::span<const uint32_t> alias, uint32_t* outWords)
+   {
+      const uint32_t N = static_cast<uint32_t>(probability.size());
+      for (uint32_t i = 0; i < N; i++)
+         outWords[i] = packWord<Log2N>(alias[i], probability[i]);
+   }
+
+   // Variant B, pack SoA outputs into 8 B entries { packedWord, ownPdf }.
+   // The pdf[] array is *also* passed to the sampler (same contents as ownPdf
+   // column, but tapped independently with a 4 B fetch when the sample aliases).
+   // outEntries must be pre-allocated to N entries.
+   template<uint32_t Log2N>
+   static void packB(std::span<const T> probability, std::span<const uint32_t> alias, std::span<const T> pdf,
+      PackedAliasEntryB<T>* outEntries)
+   {
+      const uint32_t N = static_cast<uint32_t>(probability.size());
+      for (uint32_t i = 0; i < N; i++)
+      {
+         outEntries[i].packedWord = packWord<Log2N>(alias[i], probability[i]);
+         outEntries[i].ownPdf     = pdf[i];
+      }
+   }
 };
 
 } // namespace sampling
diff --git a/include/nbl/builtin/hlsl/sampling/cumulative_probability_builder.h b/include/nbl/builtin/hlsl/sampling/cumulative_probability_builder.h
index a511fc2d8c..bf98d5ec93 100644
--- a/include/nbl/builtin/hlsl/sampling/cumulative_probability_builder.h
+++ b/include/nbl/builtin/hlsl/sampling/cumulative_probability_builder.h
@@ -30,6 +30,61 @@ void computeNormalizedCumulativeHistogram(std::span<const T> weights, T* outCumP
 	std::for_each(outCumProb, outCumProb + N - 1, [normalizationFactor](T& v) { v *= normalizationFactor; });
 }
 
+// Returns the next power of two >= x (and 1 for x <= 1). Matches the leaf-count
+// the Eytzinger builder pads to.
+inline uint32_t eytzingerLeafCount(uint32_t N)
+{
+	uint32_t P = 1u;
+	while (P < N) P <<= 1u;
+	return P;
+}
+
+// Builds an Eytzinger-layout CDF for cache-friendly binary search on the GPU.
+//
+// Layout (1-indexed, size 2*P where P = eytzingerLeafCount(N)):
+//   tree[0]           unused (keeps parent/child arithmetic branch-free)
+//   tree[1 .. P-1]    interior split keys; tree[v] == rightmost leaf of v's left subtree
+//   tree[P .. P+N-1]  leaves, tree[P + i] = normalized inclusive scan of weights up to i
+//   tree[P+N .. 2P-1] padded leaves, all 1.0 (any u < 1.0 routes away from these)
+//
+// The sampler walks the tree as index = (index << 1) | goRight for ceil(log2(N))
+// iterations. Successive taps within one search land on adjacent memory, so every
+// cache line pulled is fully used and the first log2(subgroupSize) iterations are
+// served by a single memory transaction per subgroup.
+template<typename T>
+void buildEytzinger(std::span<const T> weights, T* outTree)
+{
+	const uint32_t N = static_cast<uint32_t>(weights.size());
+	if (N == 0)
+		return;
+
+	const uint32_t P = eytzingerLeafCount(N);
+
+	T total = T(0);
+	for (uint32_t i = 0; i < N; ++i)
+		total += weights[i];
+	const T rcpTotal = T(1) / total;
+
+	T acc = T(0);
+	for (uint32_t i = 0; i < N; ++i)
+	{
+		acc += weights[i];
+		outTree[P + i] = acc * rcpTotal;
+	}
+	for (uint32_t i = N; i < P; ++i)
+		outTree[P + i] = T(1);
+
+	// Bottom-up: each interior node copies the rightmost leaf of its left subtree,
+	// found by descending left-then-always-right from v.
+	for (uint32_t v = P; v-- > 1u;)
+	{
+		uint32_t r = v << 1u;
+		while (r < P)
+			r = (r << 1u) | 1u;
+		outTree[v] = outTree[r];
+	}
+}
+
 } // namespace sampling
 } // namespace hlsl
 } // namespace nbl

From c12a724ef5c0dfd2c0968404d21a7acb8a584f7e Mon Sep 17 00:00:00 2001
From: Karim Mohamed <karimsayedre@gmail.com>
Date: Fri, 24 Apr 2026 21:28:50 +0300
Subject: [PATCH 10/12] forgotten to commit file

---
 .../hlsl/sampling/cumulative_probability.hlsl | 189 +++++++++++++-----
 1 file changed, 143 insertions(+), 46 deletions(-)

diff --git a/include/nbl/builtin/hlsl/sampling/cumulative_probability.hlsl b/include/nbl/builtin/hlsl/sampling/cumulative_probability.hlsl
index 1f176f9713..f9acba367a 100644
--- a/include/nbl/builtin/hlsl/sampling/cumulative_probability.hlsl
+++ b/include/nbl/builtin/hlsl/sampling/cumulative_probability.hlsl
@@ -16,18 +16,38 @@ namespace hlsl
 namespace sampling
 {
 
-// Discrete sampler using cumulative probability lookup via upper_bound.
+// Discrete sampler using cumulative probability lookup.
 //
 // Samples a discrete index in [0, N) with probability proportional to
 // precomputed weights in O(log N) time per sample.
 //
-// The cumulative probability array stores N-1 entries (the last bucket
-// is always 1.0 and need not be stored). Entry i holds the sum of
-// probabilities for indices [0, i].
+// Three layouts / cache-population strategies, selected by the Mode parameter:
+//
+//   TRACKING (default):  N-1 CDF entries, last bucket implicit at 1.0.
+//                        A stateful comparator records the straddling CDF
+//                        values during upper_bound itself.
+//   YOLO:                Same storage. Plain upper_bound followed by two
+//                        re-reads of the adjacent CDF entries (warm cache).
+//                        Lower register footprint, two extra array reads.
+//   EYTZINGER:           Level-order implicit binary tree in 2*P entries
+//                        where P = roundUpPot(N). Leaves at [P, P+N) hold
+//                        the CDF; interior nodes at [1, P) hold split keys.
+//                        Descent reads adjacent memory at each step, so
+//                        every cache line pulled is fully utilised and the
+//                        first log2(subgroupSize) iterations are served by a
+//                        single transaction per subgroup. Build with
+//                        sampling::buildEytzinger<T>().
 //
 // Satisfies TractableSampler and ResamplableSampler (not BackwardTractableSampler:
 // the mapping is discrete).
-template<typename T, typename Domain, typename Codomain, typename CumProbAccessor
+enum CumulativeProbabilityMode : uint32_t
+{
+	TRACKING  = 0u,
+	YOLO      = 1u,
+	EYTZINGER = 2u
+};
+
+template<typename T, typename Domain, typename Codomain, typename CumProbAccessor, CumulativeProbabilityMode Mode = CumulativeProbabilityMode::TRACKING
 	NBL_PRIMARY_REQUIRES(concepts::accessors::GenericReadAccessor<CumProbAccessor, T, Codomain>)
 struct CumulativeProbabilitySampler
 {
@@ -44,58 +64,116 @@ struct CumulativeProbabilitySampler
 		density_type upperBound;
 	};
 
+	// `_size` is the user-facing bucket count N for every mode. TRACKING / YOLO
+	// expect the accessor to hold N-1 CDF entries; EYTZINGER expects 2*P entries
+	// in the level-order layout produced by buildEytzinger.
 	static CumulativeProbabilitySampler create(NBL_CONST_REF_ARG(CumProbAccessor) _cumProbAccessor, uint32_t _size)
 	{
 		CumulativeProbabilitySampler retval;
 		retval.cumProbAccessor = _cumProbAccessor;
 		retval.storedCount = _size - 1u;
+		retval.depth = 0u;
+		NBL_IF_CONSTEXPR(Mode == CumulativeProbabilityMode::EYTZINGER)
+		{
+			uint32_t P = 1u;
+			uint32_t d = 0u;
+			while (P < _size) { P <<= 1u; ++d; }
+			retval.depth = d;
+		}
 		return retval;
 	}
 
 	// BasicSampler interface
 	codomain_type generate(const domain_type u) NBL_CONST_MEMBER_FUNC
 	{
-		// upper_bound returns first index where cumProb > u
-		return hlsl::upper_bound(cumProbAccessor, 0u, storedCount, u);
+		NBL_IF_CONSTEXPR(Mode == CumulativeProbabilityMode::EYTZINGER)
+		{
+			const uint32_t leafBase = 1u << depth;
+			uint32_t index = 1u;
+			for (uint32_t iter = 0u; iter < depth; ++iter)
+			{
+				density_type key;
+				cumProbAccessor.template get<density_type, uint32_t>(index, key);
+				index = (index << 1u) | uint32_t(!(u < key));
+			}
+			const codomain_type result = codomain_type(index - leafBase);
+			return result < codomain_type(storedCount) ? result : codomain_type(storedCount);
+		}
+		else
+		{
+			return hlsl::upper_bound(cumProbAccessor, 0u, storedCount, u);
+		}
 	}
 
 	// TractableSampler interface
 	codomain_type generate(const domain_type u, NBL_REF_ARG(cache_type) cache) NBL_CONST_MEMBER_FUNC
 	{
-		// #define NBL_CUMPROB_YOLO_READS
-#ifdef NBL_CUMPROB_YOLO_READS
-		// YOLO approach: re-read the array after binary search.
-		// The accessed elements are adjacent to the found index so the cache is warm.
-		const codomain_type result = hlsl::upper_bound(cumProbAccessor, 0u, storedCount, u);
-		cache.oneBefore = density_type(0.0);
-		if (result)
-			cumProbAccessor.template get<density_type, codomain_type>(result - 1u, cache.oneBefore);
-		cache.upperBound = density_type(1.0);
-		if (result < storedCount)
-			cumProbAccessor.template get<density_type, codomain_type>(result, cache.upperBound);
-#else
-		// Tracking reads approach: stateful comparator captures CDF values during binary search.
-		struct CdfComparator
+		codomain_type result;
+		NBL_IF_CONSTEXPR(Mode == CumulativeProbabilityMode::EYTZINGER)
 		{
-			bool operator()(const density_type value, const density_type rhs)
+			// Descent visits one interior node per level. Going left tightens
+			// the upper bound to the current key; going right tightens the
+			// lower bound. Final index, leafBase is the bucket.
+			cache.oneBefore = density_type(0.0);
+			cache.upperBound = density_type(1.0);
+			const uint32_t leafBase = 1u << depth;
+			uint32_t index = 1u;
+			for (uint32_t iter = 0u; iter < depth; ++iter)
 			{
-				const bool retval = value < rhs;
-				if (retval)
-					upperBound = rhs;
+				density_type key;
+				cumProbAccessor.template get<density_type, uint32_t>(index, key);
+				const bool goRight = !(u < key);
+				if (goRight)
+				{
+					cache.oneBefore = key;
+					index = (index << 1u) | 1u;
+				}
 				else
-					oneBefore = rhs;
-				return retval;
+				{
+					cache.upperBound = key;
+					index = (index << 1u);
+				}
 			}
-
-			density_type oneBefore;
-			density_type upperBound;
-		} comp;
-		comp.oneBefore = density_type(0.0);
-		comp.upperBound = density_type(1.0);
-		const codomain_type result = hlsl::upper_bound(cumProbAccessor, 0u, storedCount, u, comp);
-		cache.oneBefore = comp.oneBefore;
-		cache.upperBound = comp.upperBound;
-#endif
+			const codomain_type raw = codomain_type(index - leafBase);
+			result = raw < codomain_type(storedCount) ? raw : codomain_type(storedCount);
+		}
+		else NBL_IF_CONSTEXPR(Mode == CumulativeProbabilityMode::YOLO)
+		{
+			// Re-read the two adjacent CDF entries after the binary search.
+			// Both sit on the cache lines the search just touched, so they are warm.
+			result = hlsl::upper_bound(cumProbAccessor, 0u, storedCount, u);
+			cache.oneBefore = density_type(0.0);
+			if (result)
+				cumProbAccessor.template get<density_type, codomain_type>(result - 1u, cache.oneBefore);
+			cache.upperBound = density_type(1.0);
+			if (result < storedCount)
+				cumProbAccessor.template get<density_type, codomain_type>(result, cache.upperBound);
+		}
+		else
+		{
+			// TRACKING: stateful comparator captures the CDF values straddling the
+			// found index during the binary search itself, avoiding the two extra reads.
+			struct CdfComparator
+			{
+				bool operator()(const density_type value, const density_type rhs)
+				{
+					const bool retval = value < rhs;
+					if (retval)
+						upperBound = rhs;
+					else
+						oneBefore = rhs;
+					return retval;
+				}
+
+				density_type oneBefore;
+				density_type upperBound;
+			} comp;
+			comp.oneBefore = density_type(0.0);
+			comp.upperBound = density_type(1.0);
+			result = hlsl::upper_bound(cumProbAccessor, 0u, storedCount, u, comp);
+			cache.oneBefore = comp.oneBefore;
+			cache.upperBound = comp.upperBound;
+		}
 		return result;
 	}
 
@@ -111,16 +189,34 @@ struct CumulativeProbabilitySampler
 
 	density_type backwardPdf(const codomain_type v) NBL_CONST_MEMBER_FUNC
 	{
-		density_type retval = density_type(1.0);
-		if (v < storedCount)
-			cumProbAccessor.template get<density_type, codomain_type>(v, retval);
-		if (v)
+		NBL_IF_CONSTEXPR(Mode == CumulativeProbabilityMode::EYTZINGER)
 		{
-			density_type prev;
-			cumProbAccessor.template get<density_type, codomain_type>(v - 1u, prev);
-			retval -= prev;
+			// Leaves store the CDF directly; the last real leaf is normalized
+			// to 1.0 and padded leaves (if any) also hold 1.0.
+			const uint32_t leafBase = 1u << depth;
+			density_type retval;
+			cumProbAccessor.template get<density_type, uint32_t>(leafBase + uint32_t(v), retval);
+			if (v)
+			{
+				density_type prev;
+				cumProbAccessor.template get<density_type, uint32_t>(leafBase + uint32_t(v) - 1u, prev);
+				retval -= prev;
+			}
+			return retval;
+		}
+		else
+		{
+			density_type retval = density_type(1.0);
+			if (v < storedCount)
+				cumProbAccessor.template get<density_type, codomain_type>(v, retval);
+			if (v)
+			{
+				density_type prev;
+				cumProbAccessor.template get<density_type, codomain_type>(v - 1u, prev);
+				retval -= prev;
+			}
+			return retval;
 		}
-		return retval;
 	}
 
 	weight_type backwardWeight(const codomain_type v) NBL_CONST_MEMBER_FUNC
@@ -129,7 +225,8 @@ struct CumulativeProbabilitySampler
 	}
 
 	CumProbAccessor cumProbAccessor;
-	uint32_t storedCount;
+	uint32_t storedCount;    // N - 1 (last real bucket index)
+	uint32_t depth;          // EYTZINGER only: ceil(log2(N)), iteration count; leafBase = 1 << depth
 };
 
 } // namespace sampling

From e135db266d9687099e80a102d1cc007b8c5f5a36 Mon Sep 17 00:00:00 2001
From: Karim Mohamed <karimsayedre@gmail.com>
Date: Sat, 25 Apr 2026 12:54:05 +0300
Subject: [PATCH 11/12] Added Sobol matrices as builtins

---
 examples_tests                               |  2 +-
 include/nbl/builtin/hlsl/sampling/sobol.hlsl | 66 ++++++++++++++++++++
 src/nbl/builtin/CMakeLists.txt               |  1 +
 3 files changed, 68 insertions(+), 1 deletion(-)
 create mode 100644 include/nbl/builtin/hlsl/sampling/sobol.hlsl

diff --git a/examples_tests b/examples_tests
index 74103b58ca..157bc5ec1f 160000
--- a/examples_tests
+++ b/examples_tests
@@ -1 +1 @@
-Subproject commit 74103b58ca9c077f3e4702051f676bdb14258fd0
+Subproject commit 157bc5ec1f5492f359e519c2075b114a35711f04
diff --git a/include/nbl/builtin/hlsl/sampling/sobol.hlsl b/include/nbl/builtin/hlsl/sampling/sobol.hlsl
new file mode 100644
index 0000000000..ea92aaded2
--- /dev/null
+++ b/include/nbl/builtin/hlsl/sampling/sobol.hlsl
@@ -0,0 +1,66 @@
+// Copyright (C) 2018-2026 - DevSH Graphics Programming Sp. z O.O.
+// This file is part of the "Nabla Engine".
+// For conditions of distribution and use, see copyright notice in nabla.h
+#ifndef _NBL_BUILTIN_HLSL_SAMPLING_SOBOL_INCLUDED_
+#define _NBL_BUILTIN_HLSL_SAMPLING_SOBOL_INCLUDED_
+
+#include <nbl/builtin/hlsl/cpp_compat.hlsl>
+#include <nbl/builtin/hlsl/macros.h>
+
+namespace nbl
+{
+namespace hlsl
+{
+namespace sampling
+{
+
+// without acceleration
+struct RowMajorSobolMatrix
+{
+   inline uint16_t operator()(const uint16_t sampleIx)
+   {
+      // max number bits set is 16, if want to mask then need to be 5 apart
+      const uint16_t mask = 0b10000100001u;
+      uint16_t       val  = _static_cast<uint16_t>(spirv::bitCount<uint32_t>(rows[0] & sampleIx));
+      val |= _static_cast<uint16_t>(spirv::bitCount<uint32_t>(rows[5] & sampleIx)) << 5;
+      val |= _static_cast<uint16_t>(spirv::bitCount<uint32_t>(rows[10] & sampleIx)) << 10;
+      val |= _static_cast<uint16_t>(spirv::bitCount<uint32_t>(rows[15] & sampleIx)) << 15;
+      val &= mask;
+      NBL_UNROLL
+      for (uint16_t i = 1; i < 5; i++)
+      {
+         uint16_t tmp = _static_cast<uint16_t>(spirv::bitCount<uint32_t>(rows[i] & sampleIx));
+         tmp |= _static_cast<uint16_t>(spirv::bitCount<uint32_t>(rows[5 + i] & sampleIx)) << 5;
+         tmp |= _static_cast<uint16_t>(spirv::bitCount<uint32_t>(rows[10 + i] & sampleIx)) << 10;
+         val |= (tmp & mask) << i;
+      }
+      return val;
+   }
+
+   uint16_t rows[16];
+};
+
+// bitcount is expensive, its full 32 bit and needs shifting and masking anyway
+struct ColMajorSobolMatrix
+{
+   inline uint16_t operator()(const uint16_t sampleIx)
+   {
+      // abuse 2s completement to get a 0xffffu mask when `sampleIx` bit is on
+      uint16_t val = cols[0] & ((sampleIx & _static_cast<uint16_t>(0x1u)) - _static_cast<uint16_t>(2));
+      NBL_UNROLL
+      for (uint16_t i = 1; i < 16; i++)
+      {
+         const uint16_t mask = _static_cast<uint16_t>(0x1u) << i;
+         val |= cols[i] & ((sampleIx & mask) - _static_cast<uint16_t>(mask + 1));
+      }
+      return val;
+   }
+
+   uint16_t cols[16];
+};
+
+} // namespace sampling
+} // namespace hlsl
+} // namespace nbl
+
+#endif
diff --git a/src/nbl/builtin/CMakeLists.txt b/src/nbl/builtin/CMakeLists.txt
index 12b4af1bef..e13b803db1 100644
--- a/src/nbl/builtin/CMakeLists.txt
+++ b/src/nbl/builtin/CMakeLists.txt
@@ -290,6 +290,7 @@ LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/sampling/concepts.hlsl")
 LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/sampling/uniform_spheres.hlsl")
 LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/sampling/alias_table.hlsl")
 LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/sampling/cumulative_probability.hlsl")
+LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/sampling/sobol.hlsl")
 #
 LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/ndarray_addressing.hlsl")
 #

From 4f17bc1b99dcf5d9645debee21a7a41f307197f3 Mon Sep 17 00:00:00 2001
From: Karim Mohamed <karimsayedre@gmail.com>
Date: Sat, 2 May 2026 18:39:31 +0300
Subject: [PATCH 12/12] update examples submodule pointer

---
 examples_tests | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples_tests b/examples_tests
index 157bc5ec1f..c512cd850c 160000
--- a/examples_tests
+++ b/examples_tests
@@ -1 +1 @@
-Subproject commit 157bc5ec1f5492f359e519c2075b114a35711f04
+Subproject commit c512cd850cfe3c49d84e9dff5f3021e1af5f59e9