From 14636d30b1f08c0f87a25dea12e2c900ad726981 Mon Sep 17 00:00:00 2001
From: Karim Mohamed <karimsayedre@gmail.com>
Date: Wed, 3 Dec 2025 23:29:35 +0300
Subject: [PATCH 01/26] update examples submodule

---
 examples_tests | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples_tests b/examples_tests
index 1508702f27..f18160276e 160000
--- a/examples_tests
+++ b/examples_tests
@@ -1 +1 @@
-Subproject commit 1508702f27dbd4c7fa9642e26b1047b0cd8889c9
+Subproject commit f18160276e78f860f64c45111c874e3351b44ffb

From 6887419b3d5a6b95851235fdbb2a1bae9c1f335f Mon Sep 17 00:00:00 2001
From: Karim Mohamed <karimsayedre@gmail.com>
Date: Sat, 6 Dec 2025 21:03:27 +0300
Subject: [PATCH 02/26] updated examples submodule

---
 examples_tests | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples_tests b/examples_tests
index f18160276e..93861bd59f 160000
--- a/examples_tests
+++ b/examples_tests
@@ -1 +1 @@
-Subproject commit f18160276e78f860f64c45111c874e3351b44ffb
+Subproject commit 93861bd59f85721993472e3de67f23bec6170363

From f32ddd2c45088bd715fe411b9d0ee3f5e93654fe Mon Sep 17 00:00:00 2001
From: Karim Mohamed <karimsayedre@gmail.com>
Date: Sun, 7 Dec 2025 00:53:04 +0300
Subject: [PATCH 03/26] Update examples submodule

---
 examples_tests | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples_tests b/examples_tests
index 93861bd59f..008e2ee154 160000
--- a/examples_tests
+++ b/examples_tests
@@ -1 +1 @@
-Subproject commit 93861bd59f85721993472e3de67f23bec6170363
+Subproject commit 008e2ee154b6cf5ba725752a3f1b4dac5d37ff42

From 183205914eaed37bacb146ee7c0d987ac09265c1 Mon Sep 17 00:00:00 2001
From: Karim Mohamed <karimsayedre@gmail.com>
Date: Tue, 9 Dec 2025 00:21:01 +0300
Subject: [PATCH 04/26] update examples submodule

---
 examples_tests | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples_tests b/examples_tests
index 008e2ee154..91ae8657de 160000
--- a/examples_tests
+++ b/examples_tests
@@ -1 +1 @@
-Subproject commit 008e2ee154b6cf5ba725752a3f1b4dac5d37ff42
+Subproject commit 91ae8657dee9b4de82c81b97b23b83d3824a6011

From b79bf8f7f44b913766a4fedaf2b887912d766e7a Mon Sep 17 00:00:00 2001
From: Karim Mohamed <karimsayedre@gmail.com>
Date: Tue, 9 Dec 2025 00:30:59 +0300
Subject: [PATCH 05/26] update examples submodule

---
 examples_tests | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples_tests b/examples_tests
index 91ae8657de..0124cc9c0a 160000
--- a/examples_tests
+++ b/examples_tests
@@ -1 +1 @@
-Subproject commit 91ae8657dee9b4de82c81b97b23b83d3824a6011
+Subproject commit 0124cc9c0ad83d4a38f1e8ac3ddcdf56125740ac

From 49a017afca6718faac8b4bc08e55fe2d473f2d43 Mon Sep 17 00:00:00 2001
From: Karim Mohamed <karimsayedre@gmail.com>
Date: Tue, 9 Dec 2025 00:45:05 +0300
Subject: [PATCH 06/26] update examples submodule

---
 examples_tests | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples_tests b/examples_tests
index 0124cc9c0a..a35eddd1bd 160000
--- a/examples_tests
+++ b/examples_tests
@@ -1 +1 @@
-Subproject commit 0124cc9c0ad83d4a38f1e8ac3ddcdf56125740ac
+Subproject commit a35eddd1bd83fbf636e820b59c6eef939ed09668

From e714c2469357633bd17a26b693e9157c94116dd8 Mon Sep 17 00:00:00 2001
From: Karim Mohamed <karimsayedre@gmail.com>
Date: Wed, 17 Dec 2025 22:25:41 +0300
Subject: [PATCH 07/26] RandomSampler can give floats now, ranged and [0, 1),
 also update examples submodule

---
 examples_tests                            |  2 +-
 include/nbl/core/sampling/RandomSampler.h | 21 ++++++++++++++++++---
 2 files changed, 19 insertions(+), 4 deletions(-)

diff --git a/examples_tests b/examples_tests
index a35eddd1bd..1c6458d81b 160000
--- a/examples_tests
+++ b/examples_tests
@@ -1 +1 @@
-Subproject commit a35eddd1bd83fbf636e820b59c6eef939ed09668
+Subproject commit 1c6458d81b83aea176ac7ebda7450a9b395a85bd
diff --git a/include/nbl/core/sampling/RandomSampler.h b/include/nbl/core/sampling/RandomSampler.h
index 39832dc8f1..b692ef5e08 100644
--- a/include/nbl/core/sampling/RandomSampler.h
+++ b/include/nbl/core/sampling/RandomSampler.h
@@ -11,8 +11,8 @@
 namespace nbl::core
 {
 
-class RandomSampler
-{
+	class RandomSampler
+	{
 	public:
 		RandomSampler(uint32_t _seed)
 		{
@@ -25,9 +25,24 @@ class RandomSampler
 			return mersenneTwister();
 		}
 
+		// Returns a float in [0, 1)
+		inline float nextFloat()
+		{
+			// 1 / 2^32
+			constexpr float norm = 1.0f / 4294967296.0f;
+			return mersenneTwister() * norm;
+		}
+
+		// Returns a float in [min, max)
+		inline float nextFloat(float min, float max)
+		{
+			constexpr float norm = 1.0f / 4294967296.0f;
+			return min + (mersenneTwister() * norm) * (max - min);
+		}
+
 	protected:
 		std::mt19937 mersenneTwister;
-};
+	};
 
 
 }

From 6741c756172abb1e8095e9c153cecc3207622313 Mon Sep 17 00:00:00 2001
From: Karim Mohamed <karimsayedre@gmail.com>
Date: Thu, 18 Dec 2025 01:11:29 +0300
Subject: [PATCH 08/26] update examples submodules

---
 examples_tests | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples_tests b/examples_tests
index 1c6458d81b..2e306fc96b 160000
--- a/examples_tests
+++ b/examples_tests
@@ -1 +1 @@
-Subproject commit 1c6458d81b83aea176ac7ebda7450a9b395a85bd
+Subproject commit 2e306fc96bfae85a9669ad552751cece33d1b383

From 92545a557f6231d8a84275e75228f235ea7b4e41 Mon Sep 17 00:00:00 2001
From: Karim Mohamed <karimsayedre@gmail.com>
Date: Thu, 18 Dec 2025 02:25:03 +0300
Subject: [PATCH 09/26] update examples submodule

---
 examples_tests | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples_tests b/examples_tests
index 2e306fc96b..12486d4670 160000
--- a/examples_tests
+++ b/examples_tests
@@ -1 +1 @@
-Subproject commit 2e306fc96bfae85a9669ad552751cece33d1b383
+Subproject commit 12486d4670f0453722351814996d91f198a16749

From 993032c01e2890934021af8a0525eda310cd984e Mon Sep 17 00:00:00 2001
From: Karim Mohamed <karimsayedre@gmail.com>
Date: Sat, 20 Dec 2025 10:19:17 +0300
Subject: [PATCH 10/26] update examples submodule

---
 examples_tests | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples_tests b/examples_tests
index 12486d4670..1961a898fd 160000
--- a/examples_tests
+++ b/examples_tests
@@ -1 +1 @@
-Subproject commit 12486d4670f0453722351814996d91f198a16749
+Subproject commit 1961a898fd0a91c8e4d5c1a3fcb02df9142e8388

From af7574a1154d2c78b9b0814dd37b2a52ba442361 Mon Sep 17 00:00:00 2001
From: Karim Mohamed <karimsayedre@gmail.com>
Date: Wed, 31 Dec 2025 14:06:29 +0300
Subject: [PATCH 11/26] include `tgmath.hlsl` in `functions.hlsl`, update
 examples_tests

---
 examples_tests                               | 2 +-
 include/nbl/builtin/hlsl/math/functions.hlsl | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/examples_tests b/examples_tests
index b5d8abc0e5..086af9e659 160000
--- a/examples_tests
+++ b/examples_tests
@@ -1 +1 @@
-Subproject commit b5d8abc0e5c4761a3714b2c4a074cb10aaa90573
+Subproject commit 086af9e6590119bd394f2622db80ab0054445502
diff --git a/include/nbl/builtin/hlsl/math/functions.hlsl b/include/nbl/builtin/hlsl/math/functions.hlsl
index a52eb21c23..a1c51d4e51 100644
--- a/include/nbl/builtin/hlsl/math/functions.hlsl
+++ b/include/nbl/builtin/hlsl/math/functions.hlsl
@@ -5,6 +5,7 @@
 #define _NBL_BUILTIN_HLSL_MATH_FUNCTIONS_INCLUDED_
 
 #include "nbl/builtin/hlsl/cpp_compat.hlsl"
+#include "nbl/builtin/hlsl/tgmath.hlsl"
 #include "nbl/builtin/hlsl/numbers.hlsl"
 #include "nbl/builtin/hlsl/vector_utils/vector_traits.hlsl"
 #include "nbl/builtin/hlsl/concepts/vector.hlsl"

From 27aad5c4d57aa316203120a50874e0f7ea493224 Mon Sep 17 00:00:00 2001
From: Karim Mohamed <karimsayedre@gmail.com>
Date: Wed, 7 Jan 2026 00:37:01 +0300
Subject: [PATCH 12/26] Added `SRasterizationParams` to the full screen
 triangle pipeline creation arguments, also used promote instead of
 _static_cast(not sure if this is fine), also updated examples submodule

---
 examples_tests                                          | 2 +-
 include/nbl/builtin/hlsl/math/functions.hlsl            | 2 +-
 include/nbl/ext/FullScreenTriangle/FullScreenTriangle.h | 3 ++-
 3 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/examples_tests b/examples_tests
index 086af9e659..15e4d5d044 160000
--- a/examples_tests
+++ b/examples_tests
@@ -1 +1 @@
-Subproject commit 086af9e6590119bd394f2622db80ab0054445502
+Subproject commit 15e4d5d044d0b682279fcce5486a841e1f3d3541
diff --git a/include/nbl/builtin/hlsl/math/functions.hlsl b/include/nbl/builtin/hlsl/math/functions.hlsl
index a1c51d4e51..f7db44b9fb 100644
--- a/include/nbl/builtin/hlsl/math/functions.hlsl
+++ b/include/nbl/builtin/hlsl/math/functions.hlsl
@@ -152,7 +152,7 @@ struct conditionalAbsOrMax_helper<T NBL_PARTIAL_REQ_BOT(concepts::FloatingPointL
 
         Uint32VectorWithDimensionOfT xAsUintVec = bit_cast<Uint32VectorWithDimensionOfT, T>(x);
 
-        const Uint32VectorWithDimensionOfT mask = cond ? _static_cast<Uint32VectorWithDimensionOfT>(numeric_limits<UintOfTSize>::max >> 1) : _static_cast<Uint32VectorWithDimensionOfT>(numeric_limits<UintOfTSize>::max);
+        const Uint32VectorWithDimensionOfT mask = cond ? promote<Uint32VectorWithDimensionOfT>(numeric_limits<UintOfTSize>::max >> 1) : promote<Uint32VectorWithDimensionOfT>(numeric_limits<UintOfTSize>::max);
         const Uint32VectorWithDimensionOfT condAbsAsUint = xAsUintVec & mask;
         T condAbs = bit_cast<T, Uint32VectorWithDimensionOfT>(condAbsAsUint);
 
diff --git a/include/nbl/ext/FullScreenTriangle/FullScreenTriangle.h b/include/nbl/ext/FullScreenTriangle/FullScreenTriangle.h
index 1abebf23ea..f537994450 100644
--- a/include/nbl/ext/FullScreenTriangle/FullScreenTriangle.h
+++ b/include/nbl/ext/FullScreenTriangle/FullScreenTriangle.h
@@ -45,6 +45,7 @@ struct ProtoPipeline final
 			video::IGPURenderpass* renderpass,
 			const uint32_t subpassIx=0,
 			asset::SBlendParams blendParams = {},
+			asset::SRasterizationParams rasterizationParams = DefaultRasterParams,
 			const hlsl::SurfaceTransform::FLAG_BITS swapchainTransform=hlsl::SurfaceTransform::FLAG_BITS::IDENTITY_BIT
 		)
 		{
@@ -68,7 +69,7 @@ struct ProtoPipeline final
 				params[0].cached = {
 					.vertexInput = {}, // The Full Screen Triangle doesn't use any HW vertex input state
 					.primitiveAssembly = {},
-					.rasterization = DefaultRasterParams,
+					.rasterization = rasterizationParams,
 					.blend = blendParams,
 					.subpassIx = subpassIx
 				};

From e3adac0d28dda099617327d1a8f01e5af9d72c62 Mon Sep 17 00:00:00 2001
From: Karim Mohamed <karimsayedre@gmail.com>
Date: Wed, 21 Jan 2026 04:56:01 +0300
Subject: [PATCH 13/26] update examples_tests

---
 examples_tests | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples_tests b/examples_tests
index 15e4d5d044..3e39f036cd 160000
--- a/examples_tests
+++ b/examples_tests
@@ -1 +1 @@
-Subproject commit 15e4d5d044d0b682279fcce5486a841e1f3d3541
+Subproject commit 3e39f036cda70bc7a8e4dccdfe99d59a60b0a263

From 68f5a4937ad62652f39b3486fa2f2e8b17a00c85 Mon Sep 17 00:00:00 2001
From: Karim Mohamed <karimsayedre@gmail.com>
Date: Wed, 18 Feb 2026 02:42:28 +0300
Subject: [PATCH 14/26] fixes after merge, update examples submodule

---
 examples_tests                                          | 2 +-
 include/nbl/ext/FullScreenTriangle/FullScreenTriangle.h | 6 ++++++
 src/nbl/ext/FullScreenTriangle/CFullScreenTriangle.cpp  | 8 ++------
 3 files changed, 9 insertions(+), 7 deletions(-)

diff --git a/examples_tests b/examples_tests
index 3e39f036cd..2b034eb4a7 160000
--- a/examples_tests
+++ b/examples_tests
@@ -1 +1 @@
-Subproject commit 3e39f036cda70bc7a8e4dccdfe99d59a60b0a263
+Subproject commit 2b034eb4a796e043d882e9e6335070466e7a871f
diff --git a/include/nbl/ext/FullScreenTriangle/FullScreenTriangle.h b/include/nbl/ext/FullScreenTriangle/FullScreenTriangle.h
index 149873d9c1..2046e5b592 100644
--- a/include/nbl/ext/FullScreenTriangle/FullScreenTriangle.h
+++ b/include/nbl/ext/FullScreenTriangle/FullScreenTriangle.h
@@ -29,6 +29,12 @@ struct ProtoPipeline final
 		);
 
 		core::smart_refctd_ptr<asset::IShader> m_vxShader;
+
+		constexpr static inline asset::SRasterizationParams DefaultRasterParams = {
+			.faceCullingMode = asset::EFCM_NONE,
+			.depthWriteEnable = false,
+			.depthCompareOp = asset::ECO_ALWAYS
+		};
 };
 
 bool recordDrawCall(video::IGPUCommandBuffer* commandBuffer);
diff --git a/src/nbl/ext/FullScreenTriangle/CFullScreenTriangle.cpp b/src/nbl/ext/FullScreenTriangle/CFullScreenTriangle.cpp
index fd5411c2ab..a825c580a9 100644
--- a/src/nbl/ext/FullScreenTriangle/CFullScreenTriangle.cpp
+++ b/src/nbl/ext/FullScreenTriangle/CFullScreenTriangle.cpp
@@ -84,6 +84,7 @@ smart_refctd_ptr<IGPUGraphicsPipeline> ProtoPipeline::createPipeline(
 	const IGPURenderpass* renderpass,
 	const uint32_t subpassIx,
 	SBlendParams blendParams,
+	asset::SRasterizationParams rasterizationParams,
 	const hlsl::SurfaceTransform::FLAG_BITS swapchainTransform)
 {
 	if (!renderpass || !bool(*this) || hlsl::bitCount(swapchainTransform) != 1)
@@ -93,11 +94,6 @@ smart_refctd_ptr<IGPUGraphicsPipeline> ProtoPipeline::createPipeline(
 
 	smart_refctd_ptr<IGPUGraphicsPipeline> m_retval;
 	{
-		constexpr SRasterizationParams defaultRasterParams = {
-			.faceCullingMode = EFCM_NONE,
-			.depthWriteEnable = false,
-			.depthCompareOp = ECO_ALWAYS
-		};
 		const auto orientationAsUint32 = static_cast<uint32_t>(swapchainTransform);
 
 		IGPUPipelineBase::SShaderEntryMap specConstants;
@@ -110,7 +106,7 @@ smart_refctd_ptr<IGPUGraphicsPipeline> ProtoPipeline::createPipeline(
 		params[0].cached = {
 			.vertexInput = {}, // The Full Screen Triangle doesn't use any HW vertex input state
 			.primitiveAssembly = {},
-			.rasterization = defaultRasterParams,
+			.rasterization = rasterizationParams,
 			.blend = blendParams,
 			.subpassIx = subpassIx
 		};

From 1f3fed4ac4161f56c88a9e310c4611d8c29c2f50 Mon Sep 17 00:00:00 2001
From: devshgraphicsprogramming <devsh@devsh.eu>
Date: Thu, 16 Apr 2026 15:09:27 +0200
Subject: [PATCH 15/26] factorize out a bunch of stuff, make the Spherical
 Rectangle Solid Angle Sampler global like its Triangle counterpart, we need
 to keep the basis around

---
 examples_tests                                |   2 +-
 .../hlsl/sampling/spherical_rectangle.hlsl    | 116 ++++++++++++------
 .../hlsl/shapes/spherical_rectangle.hlsl      |   5 +-
 3 files changed, 86 insertions(+), 37 deletions(-)

diff --git a/examples_tests b/examples_tests
index 2fd21fd891..5eeb47351e 160000
--- a/examples_tests
+++ b/examples_tests
@@ -1 +1 @@
-Subproject commit 2fd21fd8917660d2a559b97d80afa95dd6b591d5
+Subproject commit 5eeb47351e29cb6ee02f8d8319f131a2c012b5a2
diff --git a/include/nbl/builtin/hlsl/sampling/spherical_rectangle.hlsl b/include/nbl/builtin/hlsl/sampling/spherical_rectangle.hlsl
index 131cc92d70..8029205384 100644
--- a/include/nbl/builtin/hlsl/sampling/spherical_rectangle.hlsl
+++ b/include/nbl/builtin/hlsl/sampling/spherical_rectangle.hlsl
@@ -26,6 +26,7 @@ struct SphericalRectangle
     using vector2_type = vector<T, 2>;
     using vector3_type = vector<T, 3>;
     using vector4_type = vector<T, 4>;
+    using matrix3x3_type = matrix<T, 3, 3>;
 
     // BackwardTractableSampler concept types
     using domain_type = vector2_type;
@@ -35,16 +36,15 @@ struct SphericalRectangle
 
     struct cache_type {};
 
-    NBL_CONSTEXPR_STATIC_INLINE scalar_type ClampEps = 1e-5;
-
     static SphericalRectangle<T> create(NBL_CONST_REF_ARG(shapes::SphericalRectangle<T>) rect, const vector3_type observer)
     {
-        return create(rect.solidAngle(observer), rect.extents);
+        return create(rect.basis,rect.solidAngle(observer),rect.extents);
     }
 
-    static SphericalRectangle<T> create(NBL_CONST_REF_ARG(typename shapes::SphericalRectangle<T>::solid_angle_type) sa, const vector2_type _extents)
+    static SphericalRectangle<T> create(const matrix3x3_type _basis, NBL_CONST_REF_ARG(typename shapes::SphericalRectangle<T>::solid_angle_type) sa, const vector2_type _extents)
     {
         SphericalRectangle<T> retval;
+        retval.basis = _basis;
         retval.r0 = sa.r0;
         retval.extents = _extents;
 
@@ -62,7 +62,7 @@ struct SphericalRectangle
     // Create directly from a local-frame corner position and rectangle extents.
     // Use when you already know r0 (e.g. from a gnomonic projection) and don't
     // need the shapes::SphericalRectangle + solidAngle(observer) roundtrip.
-    static SphericalRectangle<T> create(const vector3_type _r0, const vector2_type _extents)
+    static SphericalRectangle<T> create(const matrix3x3_type _basis, const vector3_type _r0, const vector2_type _extents)
     {
         // Same math as shapes::SphericalRectangle::solidAngle() but without
         // the mul(basis, origin - observer) step since we already have r0.
@@ -82,13 +82,22 @@ struct SphericalRectangle
         acc.addCosine(sa.cosGamma[3]);
         sa.value = acc.getSumOfArccos() - scalar_type(2.0) * numbers::pi<scalar_type>;
 
-        return create(sa, _extents);
+        return create(_basis, sa, _extents);
     }
 
     // shared core of generate and generateSurfaceOffset
     // returns (xu, hv, d) packed into a vector3; caller derives either 2D offset or 3D direction
-    vector3_type __generate(const domain_type u) NBL_CONST_MEMBER_FUNC
+    struct SCommonGen
+    {
+        scalar_type xu;
+        scalar_type d2;
+        scalar_type hv;
+        scalar_type cosElevation2;
+    };
+    SCommonGen __generate(const domain_type u) NBL_CONST_MEMBER_FUNC
     {
+        SCommonGen retval;
+
         // algorithm needs r0.z < 0; use -abs(r0.z) without storing the flip
         const scalar_type negAbsR0z = -hlsl::abs(r0.z);
         const scalar_type r0zSq = r0.z * r0.z;
@@ -103,44 +112,80 @@ struct SphericalRectangle
         const scalar_type rcpCu_2 = hlsl::max<scalar_type>(absNegFu * absNegFu + b0 * b0, scalar_type(1.0));
         // sign(negFu) = sign(numerator) * sign(sin(au)); sin(au) < 0 iff au > PI
         const scalar_type negFuSign = hlsl::select((au > numbers::pi<scalar_type>) != (numerator < scalar_type(0.0)), scalar_type(-1.0), scalar_type(1.0));
-        scalar_type xu = negAbsR0z * negFuSign * hlsl::rsqrt<scalar_type>(rcpCu_2 - scalar_type(1.0));
-        xu = hlsl::clamp<scalar_type>(xu, r0.x, r1.x); // avoid Infs
-        const scalar_type d_2 = xu * xu + r0zSq;
-        const scalar_type d = hlsl::sqrt<scalar_type>(d_2);
+        retval.xu = negAbsR0z * negFuSign * hlsl::rsqrt<scalar_type>(rcpCu_2 - scalar_type(1.0));
+        retval.xu = hlsl::clamp<scalar_type>(retval.xu, r0.x, r1.x); // avoid Infs
+        // TODO: see if we can compute this direct from definition of `d_2` ignoring the clamp on `xu`  and `cosElevation2` and reclamping this result in a different way
+        // with `d2 = r0zSq*(rcpCu_2*rcpCu_2-rcpCu_2)` or `d2 = r0zSq*rcpCu_2*(rcpCu_2-1)`
+        retval.d2 = retval.xu * retval.xu + r0zSq;
 
-        const scalar_type h0 = r0.y * hlsl::rsqrt<scalar_type>(d_2 + r0.y * r0.y);
-        const scalar_type h1 = r1.y * hlsl::rsqrt<scalar_type>(d_2 + r1.y * r1.y);
-        const scalar_type hv = h0 + u.y * (h1 - h0);
+        const scalar_type h0 = r0.y * hlsl::rsqrt<scalar_type>(retval.d2 + r0.y * r0.y);
+        const scalar_type h1 = r1.y * hlsl::rsqrt<scalar_type>(retval.d2 + r1.y * r1.y);
+        retval.hv = h0 + u.y * (h1 - h0);
+        retval.cosElevation2 = scalar_type(1.0) - hlsl::min<scalar_type>(retval.hv * retval.hv,1);
 
-        return vector3_type(xu, hv, d);
+        return retval;
     }
 
     // returns a normalized 3D direction in the local frame with correct r0.z sign
+    vector3_type generateNormalizedLocal(const domain_type u, NBL_REF_ARG(cache_type) cache, NBL_REF_ARG(scalar_type) hitDist) NBL_CONST_MEMBER_FUNC
+    {
+        const SCommonGen core = __generate(u);
+        scalar_type cosElevationOverD = hlsl::rsqrt(core.d2/core.cosElevation2);
+        // TODO: or shall we do some other more sophisticated clamp or correction? Is this even the right one to use?
+        cosElevationOverD = hlsl::select(hlsl::isnan(cosElevationOverD),1.f,cosElevationOverD);
+
+        // TODO: investigate if due to precision we need to compute this as a `sqrt` then the quantity being computed is `core.cosElevation2/core.d2`
+        // which what alss `generateLocalBasisXY` needs, in which case `__generate` can already compute it and return it as `hitDist2`
+        hitDist = 1.f/cosElevationOverD;
+
+        const vector3_type retval = vector3_type(core.xu / hitDist, core.hv, r0.z / hitDist);
+        assert(!hlsl::isnan(computeHitT(retval)));
+        return retval;
+    }
+
     codomain_type generate(const domain_type u, NBL_REF_ARG(cache_type) cache) NBL_CONST_MEMBER_FUNC
     {
-        const vector3_type core = __generate(u);
-        const scalar_type xu = core.x;
-        const scalar_type hv = core.y;
-        const scalar_type d = core.z;
-        const scalar_type hv2 = hv * hv;
-        const scalar_type cosElevation = hlsl::sqrt<scalar_type>(hlsl::max<scalar_type>(scalar_type(1.0) - hv2, scalar_type(0.0)));
-        const scalar_type rcpD = scalar_type(1.0) / d;
-
-        return vector3_type(xu * cosElevation * rcpD, hv, r0.z * cosElevation * rcpD);
+        scalar_type dummy;
+        const vector3_type localL = generateNormalizedLocal(u,cache,dummy);
+        // could return `hlsl::mul(hlsl::tranpose(basis),localL)` or just this
+        return basis[0]*localL[0]+basis[1]*localL[1]+basis[2]*localL[2];
     }
 
-    // returns a 2D offset on the rectangle surface from the r0 corner
-    vector2_type generateSurfaceOffset(const domain_type u, NBL_REF_ARG(cache_type) cache) NBL_CONST_MEMBER_FUNC
+    // utility to determine maxT for a ray from L shot from the origin which we're sure intersects the rectangle
+    scalar_type computeHitT(const vector3_type L)
     {
-        const vector3_type core = __generate(u);
-        const scalar_type xu = core.x;
-        const scalar_type hv = core.y;
-        const scalar_type d = core.z;
+        const scalar_type retval = hlsl::abs(r0.z/hlsl::dot(L,basis[2]));
+        {
+            const vector3_type hitPointRelative = L*retval;
+            const vector2_type uv = mul(basis,L).xy-r0.xy;
+            assert(uv[0]>=0.0 && uv[0]<=rectExtents[0]);
+            assert(uv[1]>=0.0 && uv[1]<=rectExtents[1]);
+        }
+        return retval;
+    }
+
+    // returns a 2D offset on the rectangle surface including the r0 corner -useful for generating unnormalized worldspace L
+    vector2_type generateLocalBasisXY(const domain_type u, NBL_REF_ARG(cache_type) cache) NBL_CONST_MEMBER_FUNC
+    {
+        const SCommonGen core = __generate(u);
         const scalar_type r1y = r0.y + extents.y;
-        const scalar_type hv2 = hv * hv;
-        const scalar_type yv = hlsl::mix(r1y, (hv * d) / hlsl::sqrt<scalar_type>(scalar_type(1.0) - hv2), hv2 < scalar_type(1.0) - ClampEps);
+        // TODO: see if we can compute this direct from definition of `d_2` ignoring the clamp on `xu`  and `cosElevation2`
+        const scalar_type yv = core.hv * hlsl::rsqrt(core.cosElevation2/core.d2);
+
+        // fun fact, when one of the operands to min or max is NaN, the SPIR-V builtin will select the other one
+        // TODO: maybe try just `min(yv,ry1)`
+        const vector2_type retval = vector2_type(core.xu,hlsl::clamp(yv,r0.y,r1y));
+        assert(retval[0] >= r0.x && retval[1] >= r0.y);
+        assert(retval[0] <= r0.x+rectExtents[0] && retval[1] <= r0.y+rectExtents[1]);
+        return retval;
+    }
 
-        return vector2_type((xu - r0.x), (yv - r0.y));
+    // its basically the hitpoint minus the observer origin
+    codomain_type generateUnnormalized(const domain_type u, NBL_REF_ARG(cache_type) cache) NBL_CONST_MEMBER_FUNC
+    {
+        const vector2_type localXY = generateLocalBasisXY(u,cache);
+        // the `localXY` already contains r0.xy
+        return basis[0]*localXY[0]+basis[1]*localXY[1]+basis[2]*r0.z;
     }
 
     density_type forwardPdf(const domain_type u, const cache_type cache) NBL_CONST_MEMBER_FUNC
@@ -163,12 +208,13 @@ struct SphericalRectangle
         return backwardPdf(L);
     }
 
+    matrix3x3_type basis;
+    vector3_type r0;
+    vector2_type extents;
     scalar_type solidAngle;
     scalar_type k;
     scalar_type b0;
     scalar_type b1;
-    vector3_type r0;
-    vector2_type extents;
 };
 
 } // namespace sampling
diff --git a/include/nbl/builtin/hlsl/shapes/spherical_rectangle.hlsl b/include/nbl/builtin/hlsl/shapes/spherical_rectangle.hlsl
index 5895e4bc80..bc864f7a19 100644
--- a/include/nbl/builtin/hlsl/shapes/spherical_rectangle.hlsl
+++ b/include/nbl/builtin/hlsl/shapes/spherical_rectangle.hlsl
@@ -102,7 +102,8 @@ struct SphericalRectangle
         retval.basis[0] = compressed.right / retval.extents[0];
         retval.basis[1] = compressed.up / retval.extents[1];
         assert(hlsl::abs(hlsl::dot(retval.basis[0], retval.basis[1])) < scalar_type(1e-5));
-        retval.basis[2] = hlsl::normalize(hlsl::cross(retval.basis[0], retval.basis[1]));
+        // don't normalize, the `right` and `up` vectors are orthogonal and the two bases are normalized already!
+        retval.basis[2] = hlsl::cross(retval.basis[0], retval.basis[1]);
         return retval;
     }
 
@@ -184,6 +185,8 @@ struct SphericalRectangle
         // rcpLen[i]*rcpLen[j] for adjacent pairs: (0,1), (1,2), (2,3), (3,0)
         const vector4_type cos_sides = unnormDots * rcpLen * rcpLen.yzwx;
 
+        // TODO: there's the same opportunity for optimization of this as the Spherical Triangle
+        // https://www.linkedin.com/posts/matt-kielan-9b054a165_untitled-graph-activity-7442910005671923712-jHz6?utm_source=share&utm_medium=member_desktop&rcm=ACoAACdp2RQBqq2bJfC2zxpsme-vRv2zh9oP-8E
         const vector4_type pyramidAngles = hlsl::acos<vector4_type>(cos_sides);
 
         return hlsl::dot(pyramidAngles, externalProducts) * scalar_type(0.5);

From 1df4116e33dfb26f6a8fc81b109b8f3c27a97146 Mon Sep 17 00:00:00 2001
From: devshgraphicsprogramming <devsh@devsh.eu>
Date: Thu, 16 Apr 2026 23:14:57 +0200
Subject: [PATCH 16/26] epic optimization!

---
 .../hlsl/shapes/spherical_triangle.hlsl       | 52 ++++++++++++-------
 1 file changed, 34 insertions(+), 18 deletions(-)

diff --git a/include/nbl/builtin/hlsl/shapes/spherical_triangle.hlsl b/include/nbl/builtin/hlsl/shapes/spherical_triangle.hlsl
index 1a5681b39e..4ebe52d312 100644
--- a/include/nbl/builtin/hlsl/shapes/spherical_triangle.hlsl
+++ b/include/nbl/builtin/hlsl/shapes/spherical_triangle.hlsl
@@ -20,6 +20,26 @@ namespace hlsl
 namespace shapes
 {
 
+// TODO: move to where fast_acos lives
+template<typename T, int order=2>
+T acos_csc_approx(const T arg)
+{
+    const T u = hlsl::log2(_static_cast<T>(1)+arg);
+    // The curve fit "revealed in a dream" to me is `exp2(F(log2(x+1)))` where `F(u)` is a polynomial
+    // I have a feeling that a polynomial of ((Au+B)u+C)u+D could be sufficient if it has following properties:
+    // `F(0) = 0` and
+    // `F(u) <= log2(\frac{\cos^{-1}\left(2^{x}-1\right)}{\sqrt{1-\left(2^{x}-1\right)^{2}}})` because you want to consistently under-estimate the Projected Solid Angle to avoid creating energy
+    // See https://www.desmos.com/calculator/sdptomhbju
+    // Furthermore we could clip the polynomial calc to `Cu+D or `(Bu+C)u+D` for small arguments
+    T poly;
+    // TODO: actually optimize these constants in real world scenarios (renders)
+    if (order==1)
+        poly = (_static_cast<T>(1)-u)*_static_cast<T>(0.6);
+    else if (order==2)
+        poly = (_static_cast<T>(1)-u)*_static_cast<T>(0.637)+(_static_cast<T>(1) - u * u) * _static_cast<T>(0.0115);
+    return hlsl::exp2<T>(poly);
+}
+
 template<typename T>
 struct SphericalTriangle
 {
@@ -87,34 +107,30 @@ struct SphericalTriangle
             return 0;
 
         matrix<scalar_type, 3, 3> awayFromEdgePlane;
-        awayFromEdgePlane[0] = hlsl::cross(vertices[1], vertices[2]) * csc_sides[0];
-        awayFromEdgePlane[1] = hlsl::cross(vertices[2], vertices[0]) * csc_sides[1];
-        awayFromEdgePlane[2] = hlsl::cross(vertices[0], vertices[1]) * csc_sides[2];
+        // `cross(A,B)*acos(dot(A,B))/sin(1-dot^2)` can be done with `cross(A,B)*acos_csc_approx(dot(A,B))`
+#define ACOS_CSC(I) acos_csc_approx(cos_sides[I])
+//#define ACOS_CSC(I) hlsl::acos(cos_sides[I])*csc_sides[I]
+        awayFromEdgePlane[0] = hlsl::cross(vertices[1], vertices[2]) * ACOS_CSC(0);
+        awayFromEdgePlane[1] = hlsl::cross(vertices[2], vertices[0]) * ACOS_CSC(1);
+        awayFromEdgePlane[2] = hlsl::cross(vertices[0], vertices[1]) * ACOS_CSC(2);
+#undef ACOS_CSC
+        const vector3_type externalProductsWeightedByPyramidAngles = hlsl::mul(/* transposed already */awayFromEdgePlane, receiverNormal);
+
         // The ABS makes it so that the computation is correct for an `abs(cos(theta))` factor which is the projected solid angle used for a BSDF.
+        // It also makes the computation insensitive to the CW or CCW winding of the vertices in the triangle.
         // Proof: Kelvin-Stokes theorem, if you split the set into two along the horizon with constant CCW winding, the `cross` along the shared edge
         // goes in different directions and cancels out, while `acos` of the clipped great arcs corresponding to polygon edges add up to the original sides again.
-        const vector3_type externalProducts = hlsl::abs(hlsl::mul(/* transposed already */awayFromEdgePlane, receiverNormal));
-
-        // Far TODO: `cross(A,B)*acos(dot(A,B))/sin(1-dot^2)` can be done with `cross*acos_csc_approx(dot(A,B))`
-        // We could skip the `csc_sides` factor, and computing `pyramidAngles` and replace them with this approximation weighting before the dot product with the receiver notmal
-        // The curve fit "revealed in a dream" to me is `exp2(F(log2(x+1)))` where `F(u)` is a polynomial, so far I've calculated `F = (1-u)0.635+(1-u^2)0.0118` which gives <5% error until 165 degrees
-        // I have a feeling that a polynomial of ((Au+B)u+C)u+D could be sufficient if it has following properties:
-        // `F(0) = 0` and
-        // `F(u) <= log2(\frac{\cos^{-1}\left(2^{x}-1\right)}{\sqrt{1-\left(2^{x}-1\right)^{2}}})` because you want to consistently under-estimate the Projected Solid Angle to avoid creating energy
-        // See https://www.desmos.com/calculator/sdptomhbju
-        // Furthermore we could clip the polynomial calc to `Cu+D or `(Bu+C)u+D` for small arguments
-        const vector3_type pyramidAngles = hlsl::acos<vector3_type>(cos_sides);
-        // So that triangle covering almost whole hemisphere sums to PI
-        return hlsl::dot(pyramidAngles, externalProducts) * scalar_type(0.5);
+        // The 0.5 is so that triangle covering almost whole hemisphere sums to PI
+        return hlsl::abs(externalProductsWeightedByPyramidAngles[0]+externalProductsWeightedByPyramidAngles[1]+externalProductsWeightedByPyramidAngles[2]) * scalar_type(0.5);
     }
 
     vector3_type vertices[3];
     // angles of vertices with origin, so the sides are INSIDE the sphere
     vector3_type cos_sides;
-    vector3_type csc_sides;
+    vector3_type csc_sides; // TODO: spherical triangle sampling only needs `csc_sides[1]` and possibly `csc_sides[2]`
     // angles between arcs on the sphere, so angles in the TANGENT plane at each vertex
     vector3_type cos_vertices;
-    vector3_type sin_vertices;
+    vector3_type sin_vertices; // TODO: spherical triangle sampling only needs `sin_vertices[0]` a.k.a `sinA`
     scalar_type solid_angle;
 };
 

From a3ed91a1b8354162027adfb2370820299bdbcf2e Mon Sep 17 00:00:00 2001
From: devshgraphicsprogramming <devsh@devsh.eu>
Date: Fri, 17 Apr 2026 02:00:44 +0200
Subject: [PATCH 17/26] `UsePdfAsWeight==false` implementation was 100% wrong

---
 .../projected_spherical_triangle.hlsl         | 59 +++++++++----------
 1 file changed, 27 insertions(+), 32 deletions(-)

diff --git a/include/nbl/builtin/hlsl/sampling/projected_spherical_triangle.hlsl b/include/nbl/builtin/hlsl/sampling/projected_spherical_triangle.hlsl
index c4bc5fcea8..923fc6b537 100644
--- a/include/nbl/builtin/hlsl/sampling/projected_spherical_triangle.hlsl
+++ b/include/nbl/builtin/hlsl/sampling/projected_spherical_triangle.hlsl
@@ -47,73 +47,68 @@ struct ProjectedSphericalTriangle
 
     struct cache_type
     {
-        scalar_type abs_cos_theta;
-        vector2_type warped;
         typename Bilinear<scalar_type>::cache_type bilinearCache;
+        vector3_type L; // TODO: erase when UsePdfAsWeight==false
     };
 
-    // NOTE: produces a degenerate (all-zero) bilinear patch when the receiver normal faces away
-    // from all three triangle vertices, resulting in NaN PDFs (0 * inf). Callers must ensure
-    // at least one vertex has positive projection onto the receiver normal.
+    // Shouldn't produce NAN if all corners have 0 proj solid angle due to min density adds/clamps in the linear sampler
     static ProjectedSphericalTriangle<T,UsePdfAsWeight> create(NBL_REF_ARG(shapes::SphericalTriangle<T>) shape, const vector3_type _receiverNormal, const bool _receiverWasBSDF)
     {
         ProjectedSphericalTriangle<T,UsePdfAsWeight> retval;
-       retval.sphtri = SphericalTriangle<T, UsePdfAsWeight>::create(shape);
-        retval.receiverNormal = _receiverNormal;
-        retval.receiverWasBSDF = _receiverWasBSDF;
+        retval.sphtri = SphericalTriangle<T, UsePdfAsWeight>::create(shape);
 
         const scalar_type minimumProjSolidAngle = 0.0;
         matrix<T, 3, 3> m = matrix<T, 3, 3>(shape.vertices[0], shape.vertices[1], shape.vertices[2]);
         const vector3_type bxdfPdfAtVertex = math::conditionalAbsOrMax(_receiverWasBSDF, hlsl::mul(m, _receiverNormal), hlsl::promote<vector3_type>(minimumProjSolidAngle));
         retval.bilinearPatch = Bilinear<scalar_type>::create(bxdfPdfAtVertex.yyxz);
 
-        const scalar_type projSA = shape.projectedSolidAngle(_receiverNormal);
-        retval.rcpProjSolidAngle = projSA > scalar_type(0.0) ? scalar_type(1.0) / projSA : scalar_type(0.0);
+        NBL_IF_CONSTEXPR(!UsePdfAsWeight)
+        {
+            retval.receiverNormal = _receiverNormal;
+            // prevent division of 0 cosine by 0
+            retval.projSolidAngle = max<scalar_type>(shape.projectedSolidAngle(_receiverNormal),numeric_limits<scalar_type>::min);
+        }
         return retval;
     }
 
     codomain_type generate(const domain_type u, NBL_REF_ARG(cache_type) cache) NBL_CONST_MEMBER_FUNC
     {
-        Bilinear<scalar_type> bilinear = bilinearPatch;
-        cache.warped = bilinear.generate(u, cache.bilinearCache);
-        typename SphericalTriangle<scalar_type>::cache_type sphtriCache;
-        const vector3_type L = sphtri.generate(cache.warped, sphtriCache);
-        cache.abs_cos_theta = bilinear.forwardWeight(u, cache.bilinearCache);
+        const vector2_type warped = bilinearPatch.generate(u, cache.bilinearCache);
+        typename SphericalTriangle<scalar_type>::cache_type sphtriCache; // PDF is constant caches nothing, its empty
+        const codomain_type L = sphtri.generate(warped, sphtriCache);
+        NBL_IF_CONSTEXPR(!UsePdfAsWeight)
+            cache.L = L;
         return L;
     }
 
     density_type forwardPdf(const domain_type u, const cache_type cache) NBL_CONST_MEMBER_FUNC
     {
-        return sphtri.rcpSolidAngle * bilinearPatch.forwardPdf(u, cache.bilinearCache);
+        return sphtri.rcpSolidAngle * bilinearPatch.forwardPdf(u,cache.bilinearCache);
     }
 
     weight_type forwardWeight(const domain_type u, const cache_type cache) NBL_CONST_MEMBER_FUNC
     {
-        if (UsePdfAsWeight)
-            return forwardPdf(u, cache);
-        return cache.abs_cos_theta * rcpProjSolidAngle;
-    }
-
-    density_type backwardPdf(const codomain_type L) NBL_CONST_MEMBER_FUNC
-    {
-        const vector2_type u = sphtri.generateInverse(L);
-        return sphtri.rcpSolidAngle * bilinearPatch.backwardPdf(u);
+        NBL_IF_CONSTEXPR (UsePdfAsWeight)
+            return forwardPdf(u,cache);
+        return backwardWeight(cache.L);
     }
 
     weight_type backwardWeight(const codomain_type L) NBL_CONST_MEMBER_FUNC
     {
-        NBL_IF_CONSTEXPR(UsePdfAsWeight)
-            return backwardPdf(L);
-        const scalar_type minimumProjSolidAngle = 0.0;
-        const scalar_type abs_cos_theta = math::conditionalAbsOrMax(receiverWasBSDF, hlsl::dot(receiverNormal, L), minimumProjSolidAngle);
-        return abs_cos_theta * rcpProjSolidAngle;
+        NBL_IF_CONSTEXPR (UsePdfAsWeight)
+        {
+            const vector2_type u = sphtri.generateInverse(L);
+            return sphtri.rcpSolidAngle * bilinearPatch.backwardPdf(u);
+        }
+        // make the MIS weight always abs because even when receiver is a BRDF, the samples in lower hemisphere will get killed and MIS weight never used
+        return hlsl::abs(hlsl::dot(L,receiverNormal))/projSolidAngle;
     }
 
     sampling::SphericalTriangle<T, UsePdfAsWeight> sphtri;
     Bilinear<scalar_type> bilinearPatch;
-    scalar_type rcpProjSolidAngle;
+    // TODO: erase when UsePdfAsWeight==false
     vector3_type receiverNormal;
-    bool receiverWasBSDF;
+    vector3_type projSolidAngle;
 };
 
 } // namespace sampling

From 99e9495506616dda793df3fd63b22ba1dbcdde39 Mon Sep 17 00:00:00 2001
From: devshgraphicsprogramming <devsh@devsh.eu>
Date: Fri, 17 Apr 2026 03:32:20 +0200
Subject: [PATCH 18/26] why its always to use accessing methods than members
 directly in other code :P

---
 .../sampling/projected_spherical_triangle.hlsl     |  4 ++--
 .../builtin/hlsl/sampling/spherical_triangle.hlsl  | 14 +++++++-------
 2 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/include/nbl/builtin/hlsl/sampling/projected_spherical_triangle.hlsl b/include/nbl/builtin/hlsl/sampling/projected_spherical_triangle.hlsl
index 923fc6b537..a542357ead 100644
--- a/include/nbl/builtin/hlsl/sampling/projected_spherical_triangle.hlsl
+++ b/include/nbl/builtin/hlsl/sampling/projected_spherical_triangle.hlsl
@@ -83,7 +83,7 @@ struct ProjectedSphericalTriangle
 
     density_type forwardPdf(const domain_type u, const cache_type cache) NBL_CONST_MEMBER_FUNC
     {
-        return sphtri.rcpSolidAngle * bilinearPatch.forwardPdf(u,cache.bilinearCache);
+        return sphtri.getRcpSolidAngle() * bilinearPatch.forwardPdf(u,cache.bilinearCache);
     }
 
     weight_type forwardWeight(const domain_type u, const cache_type cache) NBL_CONST_MEMBER_FUNC
@@ -98,7 +98,7 @@ struct ProjectedSphericalTriangle
         NBL_IF_CONSTEXPR (UsePdfAsWeight)
         {
             const vector2_type u = sphtri.generateInverse(L);
-            return sphtri.rcpSolidAngle * bilinearPatch.backwardPdf(u);
+            return sphtri.getRcpSolidAngle() * bilinearPatch.backwardPdf(u);
         }
         // make the MIS weight always abs because even when receiver is a BRDF, the samples in lower hemisphere will get killed and MIS weight never used
         return hlsl::abs(hlsl::dot(L,receiverNormal))/projSolidAngle;
diff --git a/include/nbl/builtin/hlsl/sampling/spherical_triangle.hlsl b/include/nbl/builtin/hlsl/sampling/spherical_triangle.hlsl
index 6f29582e04..eb714a2505 100644
--- a/include/nbl/builtin/hlsl/sampling/spherical_triangle.hlsl
+++ b/include/nbl/builtin/hlsl/sampling/spherical_triangle.hlsl
@@ -172,6 +172,8 @@ struct SphericalTriangle<T, false, Algorithm>
       return backwardPdf(L);
    }
 
+   scalar_type getRcpSolidAngle() NBL_CONST_MEMBER_FUNC {return rcpSolidAngle;}
+
    scalar_type rcpSolidAngle;
    scalar_type cosA;
    scalar_type sinA;
@@ -203,7 +205,6 @@ struct SphericalTriangle<T, true, Algorithm>
    {
       SphericalTriangle retval;
       retval.base = base_type::create(tri);
-      retval.rcpSolidAngle = retval.base.rcpSolidAngle;
       retval.vertexC = tri.vertices[2];
       // precompute great circle normal of arc AC (needed for generateInverse)
       const scalar_type cscB = tri.csc_sides[1];
@@ -277,7 +278,7 @@ struct SphericalTriangle<T, true, Algorithm>
       const scalar_type cosCpB = nbl::hlsl::dot(base.tri_vertices[1], cp);
       const scalar_type den = scalar_type(1.0) + base.triCosC + cosCpB + cosBp_inv;
       const scalar_type subSolidAngle = scalar_type(2.0) * nbl::hlsl::atan2(nbl::hlsl::abs(num), den);
-      const scalar_type u = nbl::hlsl::clamp(subSolidAngle * rcpSolidAngle, scalar_type(0.0), scalar_type(1.0));
+      const scalar_type u = nbl::hlsl::clamp(subSolidAngle * base.rcpSolidAngle, scalar_type(0.0), scalar_type(1.0));
 
       // Step 3: u.y = |L-B|^2 / |C'-B|^2
       // Squared Euclidean distance avoids catastrophic cancellation vs (1-dot)/(1-dot)
@@ -295,7 +296,7 @@ struct SphericalTriangle<T, true, Algorithm>
 
    density_type forwardPdf(const domain_type u, const cache_type cache) NBL_CONST_MEMBER_FUNC
    {
-      return rcpSolidAngle;
+      return base.rcpSolidAngle;
    }
 
    weight_type forwardWeight(const domain_type u, const cache_type cache) NBL_CONST_MEMBER_FUNC
@@ -305,16 +306,15 @@ struct SphericalTriangle<T, true, Algorithm>
 
    density_type backwardPdf(const codomain_type L) NBL_CONST_MEMBER_FUNC
    {
-      return rcpSolidAngle;
+      return base.rcpSolidAngle;
    }
 
    weight_type backwardWeight(const codomain_type L) NBL_CONST_MEMBER_FUNC
    {
       return backwardPdf(L);
    }
-
-   // mirrored from base for uniform access across both specializations
-   scalar_type rcpSolidAngle;
+   
+   scalar_type getRcpSolidAngle() NBL_CONST_MEMBER_FUNC {return base.rcpSolidAngle;}
 
    base_type base;
    vector3_type vertexC;

From 6da021b89138b4d5cfd058b1ca62cfaf980c1fc0 Mon Sep 17 00:00:00 2001
From: devshgraphicsprogramming <devsh@devsh.eu>
Date: Fri, 17 Apr 2026 03:34:42 +0200
Subject: [PATCH 19/26] The Spherical Rectangle Shape needs to be created with
 the observer (local coordinate system) already given and fixed.

This way we can share computation between Solid Angle and Projected Solid Angle calculations

Also apply the `abs` fix from the triangle
---
 .../hlsl/shapes/spherical_rectangle.hlsl      | 24 ++++++++++++-------
 1 file changed, 16 insertions(+), 8 deletions(-)

diff --git a/include/nbl/builtin/hlsl/shapes/spherical_rectangle.hlsl b/include/nbl/builtin/hlsl/shapes/spherical_rectangle.hlsl
index bc864f7a19..7ffe2ef407 100644
--- a/include/nbl/builtin/hlsl/shapes/spherical_rectangle.hlsl
+++ b/include/nbl/builtin/hlsl/shapes/spherical_rectangle.hlsl
@@ -94,6 +94,8 @@ struct SphericalRectangle
         vector4_type cosGamma;
     };
 
+    // TODO: take an observer already, this way we can precompute and store the `r0`, `denorm_n_z` and `rcpLen_denorm_n_z`
+    // we need all of the above for solid angle and projected solid angle computation
     static SphericalRectangle<Scalar> create(NBL_CONST_REF_ARG(CompressedSphericalRectangle<Scalar>) compressed)
     {
         SphericalRectangle<scalar_type> retval;
@@ -113,7 +115,8 @@ struct SphericalRectangle
         result.r0 = hlsl::mul(basis, origin - observer);
 
         const vector4_type denorm_n_z = vector4_type(-result.r0.y, result.r0.x + extents.x, result.r0.y + extents.y, -result.r0.x);
-        result.n_z = denorm_n_z * hlsl::rsqrt<vector4_type>(hlsl::promote<vector4_type>(result.r0.z * result.r0.z) + denorm_n_z * denorm_n_z);
+        const vector4_type rcpLen_denorm_n_z = hlsl::rsqrt<vector4_type>(hlsl::promote<vector4_type>(result.r0.z * result.r0.z) + denorm_n_z * denorm_n_z);
+        result.n_z = denorm_n_z * rcpLen_denorm_n_z;
         result.cosGamma = vector4_type(
             -result.n_z[0] * result.n_z[1],
             -result.n_z[1] * result.n_z[2],
@@ -129,17 +132,17 @@ struct SphericalRectangle
     }
 
     // Kelvin-Stokes theorem: signed projected solid angle = integral_{rect} (n . omega) d_omega
+    // TODO: don't take the observer, observer should be taken at creation
     scalar_type projectedSolidAngle(const vector3_type observer, const vector3_type receiverNormal) NBL_CONST_MEMBER_FUNC
     {
         return projectedSolidAngleFromLocal(hlsl::mul(basis, origin - observer), hlsl::mul(basis, receiverNormal));
     }
 
-    // Overload for when r0 and localNormal are already computed (avoids redundant mul(basis, ...)).
-    // Exploits rectangle structure: all 4 corners share the same z, so cross products
-    // have only 2 nonzero components each, and externalProducts can be computed without
-    // normalizing the corner directions.
+    // TODO: only take a `localN`
     scalar_type projectedSolidAngleFromLocal(const vector3_type r0, const vector3_type n) NBL_CONST_MEMBER_FUNC
     {
+        // FUN FACT: `n_z` already holds Z coordinate the NORMALIZED `awayFromEdgePlane`, the non-zero coordinate absolute value is equal to `r0.z * rcpLen_denorm_n_z`
+// TODO: skip all this code until  just call `acos` on the `unnormDots`
         const scalar_type x0 = r0.x, y0 = r0.y, z = r0.z;
         const scalar_type x1 = x0 + extents.x;
         const scalar_type y1 = y0 + extents.y;
@@ -155,8 +158,9 @@ struct SphericalRectangle
             zSq + x1 * x1,
             zSq + y1 * y1,
             zSq + x0 * x0
-        );
+        ); // TODO: this is already computed as `rcpLen_denorm_n_z`
 
+// TODO: this can be computed from `denorm_n_z`, `z` and `rcpLen_denorm_n_z` instead
         // dot(cross(ri,rj), n) / |cross(ri,rj)| the ex/ey scale factors cancel
         const vector4_type crossDotN = vector4_type(
             z * n.y - y0 * n.z,
@@ -165,8 +169,12 @@ struct SphericalRectangle
             z * n.x - x0 * n.z
         );
         // The ABS makes the computation correct for abs(cos(theta)) (BSDF projected solid angle).
-        const vector4_type externalProducts = hlsl::abs(crossDotN) * hlsl::rsqrt<vector4_type>(crossLenSq);
+        const vector4_type externalProducts = crossDotN * hlsl::rsqrt<vector4_type>(crossLenSq);
 
+// TODO: isn't `rcpLen_denorm_n_z` related to the sin^-1() of arclengths ? Wouldn't `ACOS_CSC` apply instead  of `acos*rsqrt(1-cos^2)`
+// wouldn't then `hlsl::promote<vector4_type>(result.r0.z * result.r0.z) + denorm_n_z * denorm_n_z` be the sin^2 ?
+// it would probably have to be a different, here's the `acos(sqrt(1-x*x))/x` curve fit again revealed to me in a dream
+//  https://www.desmos.com/calculator/sbdrulot5a = exp2(-1.6*sqrt(1-x*x))*A+B
         // cos(arc length) between adjacent corners: dot(ri,rj) / (|ri|*|rj|)
         const vector4_type lenSq = vector4_type(
             x0 * x0 + y0 * y0,
@@ -189,7 +197,7 @@ struct SphericalRectangle
         // https://www.linkedin.com/posts/matt-kielan-9b054a165_untitled-graph-activity-7442910005671923712-jHz6?utm_source=share&utm_medium=member_desktop&rcm=ACoAACdp2RQBqq2bJfC2zxpsme-vRv2zh9oP-8E
         const vector4_type pyramidAngles = hlsl::acos<vector4_type>(cos_sides);
 
-        return hlsl::dot(pyramidAngles, externalProducts) * scalar_type(0.5);
+        return hlsl::abs(hlsl::dot(pyramidAngles, externalProducts)) * scalar_type(0.5);
     }
 
     vector3_type origin;

From 404592196897a57ba991cbd24cea0ecc7ec797a6 Mon Sep 17 00:00:00 2001
From: devshgraphicsprogramming <devsh@devsh.eu>
Date: Fri, 17 Apr 2026 03:39:43 +0200
Subject: [PATCH 20/26] Spherical Rectangle needs `generateInverse`
 implemented, otherwise PRactical Warps are broken wrong weight was getting
 returned and MIS weights were inconsistent.

They're broken now, NaN returned as the backwardWeight
---
 examples_tests                                |   2 +-
 .../projected_spherical_rectangle.hlsl        | 120 +++++++++---------
 2 files changed, 60 insertions(+), 62 deletions(-)

diff --git a/examples_tests b/examples_tests
index 5eeb47351e..89ecce1444 160000
--- a/examples_tests
+++ b/examples_tests
@@ -1 +1 @@
-Subproject commit 5eeb47351e29cb6ee02f8d8319f131a2c012b5a2
+Subproject commit 89ecce14443c216b30ff84b837b899045bb5513f
diff --git a/include/nbl/builtin/hlsl/sampling/projected_spherical_rectangle.hlsl b/include/nbl/builtin/hlsl/sampling/projected_spherical_rectangle.hlsl
index 5bf652cb4c..618f401ba9 100644
--- a/include/nbl/builtin/hlsl/sampling/projected_spherical_rectangle.hlsl
+++ b/include/nbl/builtin/hlsl/sampling/projected_spherical_rectangle.hlsl
@@ -25,12 +25,6 @@ namespace sampling
 //   2. Warp uniform [0,1]^2 through the bilinear to importance-sample NdotL
 //   3. Feed the warped UV into the solid angle sampler to get a rect offset
 //   4. PDF = (1/SolidAngle) * bilinearPdf
-//
-// Template parameter `UsePdfAsWeight`: when true (default), forwardWeight/backwardWeight
-// return the PDF instead of the projected-solid-angle MIS weight.
-// TODO: the projected-solid-angle MIS weight (UsePdfAsWeight=false) has been shown to be
-// poor in practice. Once confirmed by testing, remove the false path and stop storing
-// receiverNormal, receiverWasBSDF, and rcpProjSolidAngle as members.
 template<typename T, bool UsePdfAsWeight = true>
 struct ProjectedSphericalRectangle
 {
@@ -47,24 +41,32 @@ struct ProjectedSphericalRectangle
 
 	struct cache_type
 	{
-		scalar_type abs_cos_theta;
-		vector2_type warped;
 		typename Bilinear<scalar_type>::cache_type bilinearCache;
+		vector3_type L; // TODO: same as projected triangle w.r.t. UsePdfAsWeight==false
 	};
 
-	// NOTE: produces a degenerate (all-zero) bilinear patch when the receiver normal faces away
-	// from all four rectangle vertices, resulting in NaN PDFs (0 * inf). Callers must ensure
-	// at least one vertex has positive projection onto the receiver normal.
-	static ProjectedSphericalRectangle<T,UsePdfAsWeight> create(NBL_CONST_REF_ARG(shapes::SphericalRectangle<T>) shape, const vector3_type observer, const vector3_type _receiverNormal, const bool _receiverWasBSDF)
+	// Shouldn't produce NAN if all corners have 0 proj solid angle due to min density adds/clamps in the linear sampler
+	static ProjectedSphericalRectangle<T> create(NBL_CONST_REF_ARG(shapes::CompressedSphericalRectangle<T>) shape, const vector3_type observer, const vector3_type _receiverNormal, const bool _receiverWasBSDF)
 	{
-		ProjectedSphericalRectangle<T,UsePdfAsWeight> retval;
+		ProjectedSphericalRectangle<T> retval;
+// TODO:https://github.com/Devsh-Graphics-Programming/Nabla/pull/1001#discussion_r3088570145
+		return retval;
+	}
+
+	// Shouldn't produce NAN if all corners have 0 proj solid angle due to min density adds/clamps in the linear sampler
+	static ProjectedSphericalRectangle<T> create(NBL_CONST_REF_ARG(shapes::SphericalRectangle<T>) shape, const vector3_type observer, const vector3_type _receiverNormal, const bool _receiverWasBSDF)
+	{
+		ProjectedSphericalRectangle<T> retval;
 		const vector3_type n = hlsl::mul(shape.basis, _receiverNormal);
-		retval.localReceiverNormal = n;
-		retval.receiverWasBSDF = _receiverWasBSDF;
 
 		// Compute solid angle and get r0 in local frame (before z-flip)
 		const typename shapes::SphericalRectangle<T>::solid_angle_type sa = shape.solidAngle(observer);
 		const vector3_type r0 = sa.r0;
+		NBL_IF_CONSTEXPR(!UsePdfAsWeight)
+		{
+			retval.receiverNormal = _receiverNormal;
+			retval.projSolidAngle = shape.projectedSolidAngleFromLocal(r0,n);
+		}
 
 		// All 4 corners share r0.z; x is r0.x or r0.x+ex, y is r0.y or r0.y+ey
 		const scalar_type r1x = r0.x + shape.extents.x;
@@ -94,78 +96,74 @@ struct ProjectedSphericalRectangle
 		retval.bilinearPatch = Bilinear<scalar_type>::create(bxdfPdfAtVertex);
 
 		// Reuse the already-computed solid_angle_type to avoid recomputing mul(basis, origin - observer)
-		retval.sphrect = SphericalRectangle<T>::create(sa, shape.extents);
-		retval.rcpSolidAngle = retval.sphrect.solidAngle > scalar_type(0.0) ? scalar_type(1.0) / retval.sphrect.solidAngle : scalar_type(0.0);
-
-		NBL_IF_CONSTEXPR(!UsePdfAsWeight)
-		{
-			const scalar_type projSA = shape.projectedSolidAngleFromLocal(r0, n);
-			retval.rcpProjSolidAngle = projSA > scalar_type(0.0) ? scalar_type(1.0) / projSA : scalar_type(0.0);
-		}
+		retval.sphrect = SphericalRectangle<T>::create(shape.basis, sa, shape.extents);
 		return retval;
 	}
 
 	// returns a normalized 3D direction in the local frame
-	codomain_type generate(const domain_type u, NBL_REF_ARG(cache_type) cache) NBL_CONST_MEMBER_FUNC
+	codomain_type generateNormalizedLocal(const domain_type u, NBL_REF_ARG(cache_type) cache, NBL_REF_ARG(scalar_type) hitDist) NBL_CONST_MEMBER_FUNC
 	{
-		Bilinear<scalar_type> bilinear = bilinearPatch;
-		cache.warped = bilinear.generate(u, cache.bilinearCache);
-		typename SphericalRectangle<scalar_type>::cache_type sphrectCache;
-		const vector3_type dir = sphrect.generate(cache.warped, sphrectCache);
-		cache.abs_cos_theta = bilinear.forwardWeight(u, cache.bilinearCache);
+		const vector2_type warped = bilinearPatch.generate(u, cache.bilinearCache);
+		typename SphericalRectangle<scalar_type>::cache_type sphrectCache; // there's nothing in the cache
+		const vector3_type dir = sphrect.generateNormalizedLocal(warped,sphrectCache,hitDist);
+		NBL_IF_CONSTEXPR(!UsePdfAsWeight)
+			cache.L = hlsl::mul(hlsl::transpose(sphrect.basis),dir);
 		return dir;
 	}
 
-	// returns a 2D offset on the rectangle surface from the r0 corner
-	vector2_type generateSurfaceOffset(const domain_type u, NBL_REF_ARG(cache_type) cache) NBL_CONST_MEMBER_FUNC
+	// returns a unnormalized 3D direction in the global frame
+	codomain_type generateUnnormalized(const domain_type u, NBL_REF_ARG(cache_type) cache) NBL_CONST_MEMBER_FUNC
 	{
-		Bilinear<scalar_type> bilinear = bilinearPatch;
-		cache.warped = bilinear.generate(u, cache.bilinearCache);
-		typename SphericalRectangle<scalar_type>::cache_type sphrectCache;
-		const vector2_type sampleOffset = sphrect.generateSurfaceOffset(cache.warped, sphrectCache);
-		cache.abs_cos_theta = bilinear.forwardWeight(u, cache.bilinearCache);
-		return sampleOffset;
+		const vector2_type warped = bilinearPatch.generate(u, cache.bilinearCache);
+		typename SphericalRectangle<scalar_type>::cache_type sphrectCache; // there's nothing in the cache
+		const vector3_type dir = sphrect.generateUnnormalized(warped,sphrectCache);
+		NBL_IF_CONSTEXPR(!UsePdfAsWeight)
+			cache.L = dir * hlsl::rsqrt(hlsl::dot(dir,dir));
+		return dir;
 	}
 
-	density_type forwardPdf(const domain_type u, const cache_type cache) NBL_CONST_MEMBER_FUNC
+	// returns a normalized 3D direction in the global frame
+	codomain_type generate(const domain_type u, NBL_REF_ARG(cache_type) cache) NBL_CONST_MEMBER_FUNC
 	{
-		return rcpSolidAngle * bilinearPatch.forwardPdf(u, cache.bilinearCache);
+		const vector2_type warped = bilinearPatch.generate(u, cache.bilinearCache);
+		typename SphericalRectangle<scalar_type>::cache_type sphrectCache; // there's nothing in the cache
+		const vector3_type dir = sphrect.generate(warped, sphrectCache);
+		NBL_IF_CONSTEXPR(!UsePdfAsWeight)
+			cache.L = dir;
+		return dir;
 	}
 
-	weight_type forwardWeight(const domain_type u, const cache_type cache) NBL_CONST_MEMBER_FUNC
+	density_type forwardPdf(const domain_type u, const cache_type cache) NBL_CONST_MEMBER_FUNC
 	{
-		if (UsePdfAsWeight)
-			return forwardPdf(u, cache);
-		return cache.abs_cos_theta * rcpProjSolidAngle;
+		return bilinearPatch.forwardPdf(u,cache.bilinearCache) / sphrect.solidAngle;
 	}
 
-	// `p` is the normalized [0,1]^2 position on the rectangle
-	density_type backwardPdf(const vector2_type p) NBL_CONST_MEMBER_FUNC
+	weight_type forwardWeight(const domain_type u, const cache_type cache) NBL_CONST_MEMBER_FUNC
 	{
-		return rcpSolidAngle * bilinearPatch.backwardPdf(p);
+        NBL_IF_CONSTEXPR (UsePdfAsWeight)
+            return forwardPdf(u,cache);
+        return backwardWeight(cache.L);
 	}
 
-	weight_type backwardWeight(const vector2_type p) NBL_CONST_MEMBER_FUNC
+	weight_type backwardWeight(const codomain_type L) NBL_CONST_MEMBER_FUNC
 	{
 		NBL_IF_CONSTEXPR(UsePdfAsWeight)
-			return backwardPdf(p);
-		const scalar_type minimumProjSolidAngle = 0.0;
-		// Reconstruct local direction from normalized rect position
-		const vector3_type localDir = hlsl::normalize(sphrect.r0 + vector3_type(
-			p.x * sphrect.extents.x,
-			p.y * sphrect.extents.y,
-			scalar_type(0)
-		));
-		const scalar_type abs_cos_theta = math::conditionalAbsOrMax(receiverWasBSDF, hlsl::dot(localReceiverNormal, localDir), minimumProjSolidAngle);
-		return abs_cos_theta * rcpProjSolidAngle;
+		{
+#if 0
+			const vector2_type warped = sphrect.generateInvese(L); // TODO: implement `generateInverse`
+			return bilinearPatch.backwardPdf(warped) / sphrect.solidAngle;
+#endif
+			return 0.f/0.f;
+		}
+        // make the MIS weight always abs because even when receiver is a BRDF, the samples in lower hemisphere will get killed and MIS weight never used
+        return hlsl::abs(hlsl::dot(L,receiverNormal))/projSolidAngle;
 	}
 
 	sampling::SphericalRectangle<T> sphrect;
 	Bilinear<scalar_type> bilinearPatch;
-	scalar_type rcpSolidAngle;
-	scalar_type rcpProjSolidAngle;
-	vector3_type localReceiverNormal;
-	bool receiverWasBSDF;
+	// TODO: same as projected triangle w.r.t. UsePdfAsWeight==false
+	vector3_type receiverNormal;
+	vector3_type projSolidAngle;
 };
 
 } // namespace sampling

From 45a68a625475e3bd49245ac71da1a8d9c787a00d Mon Sep 17 00:00:00 2001
From: Mateusz Kielan <devsh@devsh.eu>
Date: Fri, 17 Apr 2026 23:40:00 +0200
Subject: [PATCH 21/26] Update spherical_triangle.hlsl

always move the scalar multiplication of a vector in a dot product outside the dot product
---
 .../nbl/builtin/hlsl/shapes/spherical_triangle.hlsl   | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/include/nbl/builtin/hlsl/shapes/spherical_triangle.hlsl b/include/nbl/builtin/hlsl/shapes/spherical_triangle.hlsl
index 4ebe52d312..b8a2c7229e 100644
--- a/include/nbl/builtin/hlsl/shapes/spherical_triangle.hlsl
+++ b/include/nbl/builtin/hlsl/shapes/spherical_triangle.hlsl
@@ -74,6 +74,7 @@ struct SphericalTriangle
         // degenerate triangle: any side has near-zero sin, so csc blows up
         if (hlsl::any<vector<bool, 3> >(retval.csc_sides >= hlsl::promote<vector3_type>(numeric_limits<scalar_type>::max)))
         {
+            // TODO: can't do this, still need to be able to sample thin triangle like a line light, so need to know all the angles which are still valid
             retval.cos_vertices = hlsl::promote<vector3_type>(0.0);
             retval.sin_vertices = hlsl::promote<vector3_type>(0.0);
             retval.solid_angle = 0;
@@ -106,22 +107,20 @@ struct SphericalTriangle
         if (solid_angle <= numeric_limits<scalar_type>::epsilon)
             return 0;
 
-        matrix<scalar_type, 3, 3> awayFromEdgePlane;
         // `cross(A,B)*acos(dot(A,B))/sin(1-dot^2)` can be done with `cross(A,B)*acos_csc_approx(dot(A,B))`
 #define ACOS_CSC(I) acos_csc_approx(cos_sides[I])
 //#define ACOS_CSC(I) hlsl::acos(cos_sides[I])*csc_sides[I]
-        awayFromEdgePlane[0] = hlsl::cross(vertices[1], vertices[2]) * ACOS_CSC(0);
-        awayFromEdgePlane[1] = hlsl::cross(vertices[2], vertices[0]) * ACOS_CSC(1);
-        awayFromEdgePlane[2] = hlsl::cross(vertices[0], vertices[1]) * ACOS_CSC(2);
+        scalar_type externalProductsWeightedByPyramidAngles = hlsl::dot(hlsl::cross(vertices[1], vertices[2]),receiverNormal) * ACOS_CSC(0);
+        externalProductsWeightedByPyramidAngles += hlsl::dot(hlsl::cross(vertices[2], vertices[0]),receiverNormal) * ACOS_CSC(1);
+        externalProductsWeightedByPyramidAngles += hlsl::dot(hlsl::cross(vertices[0], vertices[1]),receiverNormal) * ACOS_CSC(2);
 #undef ACOS_CSC
-        const vector3_type externalProductsWeightedByPyramidAngles = hlsl::mul(/* transposed already */awayFromEdgePlane, receiverNormal);
 
         // The ABS makes it so that the computation is correct for an `abs(cos(theta))` factor which is the projected solid angle used for a BSDF.
         // It also makes the computation insensitive to the CW or CCW winding of the vertices in the triangle.
         // Proof: Kelvin-Stokes theorem, if you split the set into two along the horizon with constant CCW winding, the `cross` along the shared edge
         // goes in different directions and cancels out, while `acos` of the clipped great arcs corresponding to polygon edges add up to the original sides again.
         // The 0.5 is so that triangle covering almost whole hemisphere sums to PI
-        return hlsl::abs(externalProductsWeightedByPyramidAngles[0]+externalProductsWeightedByPyramidAngles[1]+externalProductsWeightedByPyramidAngles[2]) * scalar_type(0.5);
+        return externalProductsWeightedByPyramidAngles * scalar_type(0.5);
     }
 
     vector3_type vertices[3];

From 2291b7d4a1ebf775034dfed8ec0fafcaab9d163b Mon Sep 17 00:00:00 2001
From: Karim Mohamed <karimsayedre@gmail.com>
Date: Wed, 22 Apr 2026 01:24:28 +0300
Subject: [PATCH 22/26] fixed compartor float vs bool warning, no more
 spherical triangle boilerplate, addressed comments

---
 examples_tests                                |   2 +-
 include/nbl/builtin/hlsl/algorithm.hlsl       |   4 +-
 include/nbl/builtin/hlsl/functional.hlsl      |  24 +-
 include/nbl/builtin/hlsl/ies/sampler.hlsl     |   2 +-
 include/nbl/builtin/hlsl/math/functions.hlsl  |  19 +-
 .../nbl/builtin/hlsl/sampling/bilinear.hlsl   |   2 +-
 .../hlsl/sampling/cos_weighted_spheres.hlsl   |   2 +-
 include/nbl/builtin/hlsl/sampling/linear.hlsl |  18 +-
 .../projected_spherical_rectangle.hlsl        | 285 +++++++------
 .../projected_spherical_triangle.hlsl         |  10 +-
 .../hlsl/sampling/spherical_rectangle.hlsl    | 398 +++++++++---------
 .../hlsl/sampling/spherical_triangle.hlsl     | 178 +++-----
 .../hlsl/sampling/uniform_spheres.hlsl        |   9 +-
 13 files changed, 475 insertions(+), 478 deletions(-)

diff --git a/examples_tests b/examples_tests
index 89ecce1444..fb5cfa2bca 160000
--- a/examples_tests
+++ b/examples_tests
@@ -1 +1 @@
-Subproject commit 89ecce14443c216b30ff84b837b899045bb5513f
+Subproject commit fb5cfa2bcaa0a92aafb429f3d390658d28d1ca02
diff --git a/include/nbl/builtin/hlsl/algorithm.hlsl b/include/nbl/builtin/hlsl/algorithm.hlsl
index 631001686e..d80ebdf443 100644
--- a/include/nbl/builtin/hlsl/algorithm.hlsl
+++ b/include/nbl/builtin/hlsl/algorithm.hlsl
@@ -142,7 +142,9 @@ struct bound_t
 
     void comp_step(NBL_REF_ARG(Accessor) accessor, const uint32_t testPoint, const uint32_t rightBegin)
     {
-        if (compare(accessor[testPoint],value))
+        typename Accessor::value_type val;
+        accessor.get(testPoint, val);
+        if (compare(val,value))
             it = rightBegin;
     }
     void comp_step(NBL_REF_ARG(Accessor) accessor, const uint32_t testPoint)
diff --git a/include/nbl/builtin/hlsl/functional.hlsl b/include/nbl/builtin/hlsl/functional.hlsl
index 118fe07c63..6a155f2b92 100644
--- a/include/nbl/builtin/hlsl/functional.hlsl
+++ b/include/nbl/builtin/hlsl/functional.hlsl
@@ -89,12 +89,23 @@ struct reference_wrapper : enable_if_t<
         return lhs OP rhs; \
     }
 
+#define ALIAS_STD_CMP(NAME,OP) template<typename T NBL_STRUCT_CONSTRAINABLE > struct NAME { \
+    using type_t = T; \
+    \
+    bool operator()(NBL_CONST_REF_ARG(T) lhs, NBL_CONST_REF_ARG(T) rhs) \
+    { \
+        return lhs OP rhs; \
+    }
+
 
 #else // CPP
 
 #define ALIAS_STD(NAME,OP) template<typename T> struct NAME : std::NAME<T> { \
     using type_t = T;
 
+#define ALIAS_STD_CMP(NAME,OP) template<typename T> struct NAME : std::NAME<T> { \
+    using type_t = T;
+
 #endif
 
 ALIAS_STD(bit_and,&)
@@ -136,14 +147,15 @@ ALIAS_STD(divides,/)
 };
 
 
-ALIAS_STD(equal_to, ==) };
-ALIAS_STD(not_equal_to, !=) };
-ALIAS_STD(greater, >) };
-ALIAS_STD(less, <) };
-ALIAS_STD(greater_equal, >=) };
-ALIAS_STD(less_equal, <=) };
+ALIAS_STD_CMP(equal_to, ==) };
+ALIAS_STD_CMP(not_equal_to, !=) };
+ALIAS_STD_CMP(greater, >) };
+ALIAS_STD_CMP(less, <) };
+ALIAS_STD_CMP(greater_equal, >=) };
+ALIAS_STD_CMP(less_equal, <=) };
 
 #undef ALIAS_STD
+#undef ALIAS_STD_CMP
 
 // The above comparison operators return bool on STD, but in HLSL they're supposed to yield bool vectors, so here's a specialization so that they return `vector<bool, N>` for vectorial types
 
diff --git a/include/nbl/builtin/hlsl/ies/sampler.hlsl b/include/nbl/builtin/hlsl/ies/sampler.hlsl
index ab4046477c..a6309fd128 100644
--- a/include/nbl/builtin/hlsl/ies/sampler.hlsl
+++ b/include/nbl/builtin/hlsl/ies/sampler.hlsl
@@ -85,7 +85,7 @@ struct CandelaSampler
         const angle_t vAngle = degrees(polar.theta);
         const angle_t hAngle = degrees(__wrapPhi(polar.phi, symmetry));
 
-#define NBL_IES_DEF_ANGLE_ACC(T, EXPR) struct T { using value_type = angle_t; accessor_t acc; value_type operator[](uint32_t idx) NBL_CONST_MEMBER_FUNC { return EXPR; } };
+#define NBL_IES_DEF_ANGLE_ACC(T, EXPR) struct T { using value_type = angle_t; accessor_t acc; value_type operator[](uint32_t idx) NBL_CONST_MEMBER_FUNC { return EXPR; } void get(uint32_t idx, NBL_REF_ARG(value_type) val) NBL_CONST_MEMBER_FUNC { val = EXPR; } };
 
         NBL_IES_DEF_ANGLE_ACC(VAcc, acc.vAngle(idx))
         NBL_IES_DEF_ANGLE_ACC(HAcc, acc.hAngle(idx))
diff --git a/include/nbl/builtin/hlsl/math/functions.hlsl b/include/nbl/builtin/hlsl/math/functions.hlsl
index b5b6f8feea..a80bdda904 100644
--- a/include/nbl/builtin/hlsl/math/functions.hlsl
+++ b/include/nbl/builtin/hlsl/math/functions.hlsl
@@ -93,13 +93,22 @@ scalar_type_t<T> lpNorm(NBL_CONST_REF_ARG(T) v)
 
 
 // valid only for `theta` in [-PI,PI]
-template <typename T NBL_FUNC_REQUIRES(concepts::FloatingPointLikeScalar<T>)
+// UseRealSinCos=true  -> back-to-back sin + cos. Saturates the special-function pipeline, enables vendor sincos fusion, full precision near multiples of pi.
+// UseRealSinCos=false -> cos + sqrt(1-c*c) with sign recovered from theta. Saves one special-function op when cos alone is cheaper than sin+cos, but suffers catastrophic cancellation as |c| -> 1.
+template <typename T, bool UseRealSinCos = true NBL_FUNC_REQUIRES(concepts::FloatingPointLikeScalar<T>)
 void sincos(T theta, NBL_REF_ARG(T) s, NBL_REF_ARG(T) c)
 {
-    s = sin<T>(theta);
-    c = cos<T>(theta);
-    // s = sqrt<T>(T(NBL_FP64_LITERAL(1.0))-c*c);
-    // s = ieee754::flipSign(s, theta < T(NBL_FP64_LITERAL(0.0)));
+    if (UseRealSinCos)
+    {
+        s = sin<T>(theta);
+        c = cos<T>(theta);
+    }
+    else
+    {
+        c = cos<T>(theta);
+        s = sqrt<T>(T(NBL_FP64_LITERAL(1.0))-c*c);
+        s = ieee754::flipSign(s, theta < T(NBL_FP64_LITERAL(0.0)));
+    }
 }
 
 template <typename T NBL_FUNC_REQUIRES(vector_traits<T>::Dimension == 3)
diff --git a/include/nbl/builtin/hlsl/sampling/bilinear.hlsl b/include/nbl/builtin/hlsl/sampling/bilinear.hlsl
index 35f1391930..baf1b5427e 100644
--- a/include/nbl/builtin/hlsl/sampling/bilinear.hlsl
+++ b/include/nbl/builtin/hlsl/sampling/bilinear.hlsl
@@ -65,7 +65,7 @@ struct Bilinear
 
         // bilinear PDF = marginal_y_pdf * conditional_x_pdf; reuse both linear caches
         const scalar_type yPdf = lineary.forwardPdf(u.y, linearYCache);
-        cache.normalizedStart = yPdf * linearx.linearCoeffStart;
+        cache.normalizedStart = yPdf * linearx.normalizedCoeffStart;
         cache.linearXCache.diffTimesX *= yPdf;
         return p;
     }
diff --git a/include/nbl/builtin/hlsl/sampling/cos_weighted_spheres.hlsl b/include/nbl/builtin/hlsl/sampling/cos_weighted_spheres.hlsl
index 3a191f142f..05aff5d3b8 100644
--- a/include/nbl/builtin/hlsl/sampling/cos_weighted_spheres.hlsl
+++ b/include/nbl/builtin/hlsl/sampling/cos_weighted_spheres.hlsl
@@ -93,7 +93,7 @@ struct ProjectedSphere
 	static codomain_type __generate(NBL_REF_ARG(domain_type) u)
 	{
 		vector_t3 retval = hemisphere_t::__generate(u.xy);
-		const bool chooseLower = u.z > T(0.5);
+		const bool chooseLower = u.z > scalar_type(0.5);
 		retval.z = chooseLower ? (-retval.z) : retval.z;
 		if (chooseLower)
 			u.z -= T(0.5);
diff --git a/include/nbl/builtin/hlsl/sampling/linear.hlsl b/include/nbl/builtin/hlsl/sampling/linear.hlsl
index 12602b4a79..8ebf2e38a0 100644
--- a/include/nbl/builtin/hlsl/sampling/linear.hlsl
+++ b/include/nbl/builtin/hlsl/sampling/linear.hlsl
@@ -39,11 +39,11 @@ struct Linear
         // add min to both coefficients so (0,0) input produces a valid uniform sampler
         // instead of inf normalization (2/0) leading to NaN; negligible for normal inputs
         const vector2_type safeCoeffs = linearCoeffs + vector2_type(hlsl::numeric_limits<scalar_type>::min, hlsl::numeric_limits<scalar_type>::min);
-        // normalize coefficients so that the PDF is simply linearCoeffStart + linearCoeffDiff * x
+        // normalize coefficients so that the PDF is simply normalizedCoeffStart + linearCoeffDiff * x
         const scalar_type normFactor = scalar_type(2.0) / (safeCoeffs[0] + safeCoeffs[1]);
         const vector2_type normalized = safeCoeffs * normFactor;
-        retval.linearCoeffStart = normalized[0];
-        retval.linearCoeffEnd = normalized[1];
+        retval.normalizedCoeffStart = normalized[0];
+        retval.normalizedCoeffEnd = normalized[1];
         // precompute for the stable quadratic in generate()
         retval.squaredCoeffStart = normalized[0] * normalized[0];
         retval.twoTimesDiff = scalar_type(2.0) * (normalized[1] - normalized[0]);
@@ -57,18 +57,18 @@ struct Linear
         // Quadratic (1-start)*x^2 + start*x - u = 0; since start >= 0 the stable root is
         // x = 2u / (start + sqrt(start^2 + 2*diff*u)), which never cancels.
         const scalar_type sqrtTerm = sqrt(squaredCoeffStart + twoTimesDiff * u);
-        const scalar_type denom = linearCoeffStart + sqrtTerm;
+        const scalar_type denom = normalizedCoeffStart + sqrtTerm;
         // NOTE: floating point can make x slightly > 1 when u~1 and diff < 0; callers needing
         // non-negative PDF at the boundary should clamp with min(x, 1).
         const codomain_type x = (u + u) / denom;
         // diff*x == sqrtTerm - start algebraically (conjugate identity), saves 1 mul
-        cache.diffTimesX = sqrtTerm - linearCoeffStart;
+        cache.diffTimesX = sqrtTerm - normalizedCoeffStart;
         return x;
     }
 
     density_type forwardPdf(const domain_type u, const cache_type cache) NBL_CONST_MEMBER_FUNC
     {
-        return linearCoeffStart + cache.diffTimesX;
+        return normalizedCoeffStart + cache.diffTimesX;
     }
 
     weight_type forwardWeight(const domain_type u, const cache_type cache) NBL_CONST_MEMBER_FUNC
@@ -83,7 +83,7 @@ struct Linear
     density_type backwardPdf(const codomain_type x) NBL_CONST_MEMBER_FUNC
     {
         assert(x >= scalar_type(0.0) && x <= scalar_type(1.0));
-        return hlsl::mix(linearCoeffStart, linearCoeffEnd, x);
+        return hlsl::mix(normalizedCoeffStart, normalizedCoeffEnd, x);
     }
 
     weight_type backwardWeight(const codomain_type x) NBL_CONST_MEMBER_FUNC
@@ -91,8 +91,8 @@ struct Linear
         return backwardPdf(x);
     }
 
-    scalar_type linearCoeffStart;
-    scalar_type linearCoeffEnd;
+    scalar_type normalizedCoeffStart;
+    scalar_type normalizedCoeffEnd;
     scalar_type squaredCoeffStart;
     scalar_type twoTimesDiff;
 };
diff --git a/include/nbl/builtin/hlsl/sampling/projected_spherical_rectangle.hlsl b/include/nbl/builtin/hlsl/sampling/projected_spherical_rectangle.hlsl
index 618f401ba9..d40e58c5db 100644
--- a/include/nbl/builtin/hlsl/sampling/projected_spherical_rectangle.hlsl
+++ b/include/nbl/builtin/hlsl/sampling/projected_spherical_rectangle.hlsl
@@ -28,142 +28,163 @@ namespace sampling
 template<typename T, bool UsePdfAsWeight = true>
 struct ProjectedSphericalRectangle
 {
-	using scalar_type = T;
-	using vector2_type = vector<T, 2>;
-	using vector3_type = vector<T, 3>;
-	using vector4_type = vector<T, 4>;
-
-	// BackwardTractableSampler concept types
-	using domain_type = vector2_type;
-	using codomain_type = vector3_type;
-	using density_type = scalar_type;
-	using weight_type = density_type;
-
-	struct cache_type
-	{
-		typename Bilinear<scalar_type>::cache_type bilinearCache;
-		vector3_type L; // TODO: same as projected triangle w.r.t. UsePdfAsWeight==false
-	};
-
-	// Shouldn't produce NAN if all corners have 0 proj solid angle due to min density adds/clamps in the linear sampler
-	static ProjectedSphericalRectangle<T> create(NBL_CONST_REF_ARG(shapes::CompressedSphericalRectangle<T>) shape, const vector3_type observer, const vector3_type _receiverNormal, const bool _receiverWasBSDF)
-	{
-		ProjectedSphericalRectangle<T> retval;
-// TODO:https://github.com/Devsh-Graphics-Programming/Nabla/pull/1001#discussion_r3088570145
-		return retval;
-	}
-
-	// Shouldn't produce NAN if all corners have 0 proj solid angle due to min density adds/clamps in the linear sampler
-	static ProjectedSphericalRectangle<T> create(NBL_CONST_REF_ARG(shapes::SphericalRectangle<T>) shape, const vector3_type observer, const vector3_type _receiverNormal, const bool _receiverWasBSDF)
-	{
-		ProjectedSphericalRectangle<T> retval;
-		const vector3_type n = hlsl::mul(shape.basis, _receiverNormal);
-
-		// Compute solid angle and get r0 in local frame (before z-flip)
-		const typename shapes::SphericalRectangle<T>::solid_angle_type sa = shape.solidAngle(observer);
-		const vector3_type r0 = sa.r0;
-		NBL_IF_CONSTEXPR(!UsePdfAsWeight)
-		{
-			retval.receiverNormal = _receiverNormal;
-			retval.projSolidAngle = shape.projectedSolidAngleFromLocal(r0,n);
-		}
-
-		// All 4 corners share r0.z; x is r0.x or r0.x+ex, y is r0.y or r0.y+ey
-		const scalar_type r1x = r0.x + shape.extents.x;
-		const scalar_type r1y = r0.y + shape.extents.y;
-
-		// Unnormalized dots: dot(corner_i, n)
-		const scalar_type base_dot = hlsl::dot(r0, n);
-		const scalar_type dx = shape.extents.x * n.x;
-		const scalar_type dy = shape.extents.y * n.y;
-		const vector4_type dots = vector4_type(base_dot, base_dot + dx, base_dot + dy, base_dot + dx + dy);
-
-		// Squared lengths of each corner
-		const scalar_type r0zSq = r0.z * r0.z;
-		const vector4_type lenSq = vector4_type(
-			r0.x * r0.x + r0.y * r0.y,
-			r1x * r1x + r0.y * r0.y,
-			r0.x * r0.x + r1y * r1y,
-			r1x * r1x + r1y * r1y
-		) + hlsl::promote<vector4_type>(r0zSq);
-
-		// dot(normalize(corner), n) = dot(corner, n) * rsqrt(lenSq)
-		// Bilinear corners: [0]=v00 [1]=v10 [2]=v01 [3]=v11
-		const scalar_type minimumProjSolidAngle = 0.0;
-		const vector4_type bxdfPdfAtVertex = math::conditionalAbsOrMax(_receiverWasBSDF,
-			dots * hlsl::rsqrt<vector4_type>(lenSq),
-			hlsl::promote<vector4_type>(minimumProjSolidAngle));
-		retval.bilinearPatch = Bilinear<scalar_type>::create(bxdfPdfAtVertex);
-
-		// Reuse the already-computed solid_angle_type to avoid recomputing mul(basis, origin - observer)
-		retval.sphrect = SphericalRectangle<T>::create(shape.basis, sa, shape.extents);
-		return retval;
-	}
-
-	// returns a normalized 3D direction in the local frame
-	codomain_type generateNormalizedLocal(const domain_type u, NBL_REF_ARG(cache_type) cache, NBL_REF_ARG(scalar_type) hitDist) NBL_CONST_MEMBER_FUNC
-	{
-		const vector2_type warped = bilinearPatch.generate(u, cache.bilinearCache);
-		typename SphericalRectangle<scalar_type>::cache_type sphrectCache; // there's nothing in the cache
-		const vector3_type dir = sphrect.generateNormalizedLocal(warped,sphrectCache,hitDist);
-		NBL_IF_CONSTEXPR(!UsePdfAsWeight)
-			cache.L = hlsl::mul(hlsl::transpose(sphrect.basis),dir);
-		return dir;
-	}
-
-	// returns a unnormalized 3D direction in the global frame
-	codomain_type generateUnnormalized(const domain_type u, NBL_REF_ARG(cache_type) cache) NBL_CONST_MEMBER_FUNC
-	{
-		const vector2_type warped = bilinearPatch.generate(u, cache.bilinearCache);
-		typename SphericalRectangle<scalar_type>::cache_type sphrectCache; // there's nothing in the cache
-		const vector3_type dir = sphrect.generateUnnormalized(warped,sphrectCache);
-		NBL_IF_CONSTEXPR(!UsePdfAsWeight)
-			cache.L = dir * hlsl::rsqrt(hlsl::dot(dir,dir));
-		return dir;
-	}
-
-	// returns a normalized 3D direction in the global frame
-	codomain_type generate(const domain_type u, NBL_REF_ARG(cache_type) cache) NBL_CONST_MEMBER_FUNC
-	{
-		const vector2_type warped = bilinearPatch.generate(u, cache.bilinearCache);
-		typename SphericalRectangle<scalar_type>::cache_type sphrectCache; // there's nothing in the cache
-		const vector3_type dir = sphrect.generate(warped, sphrectCache);
-		NBL_IF_CONSTEXPR(!UsePdfAsWeight)
-			cache.L = dir;
-		return dir;
-	}
-
-	density_type forwardPdf(const domain_type u, const cache_type cache) NBL_CONST_MEMBER_FUNC
-	{
-		return bilinearPatch.forwardPdf(u,cache.bilinearCache) / sphrect.solidAngle;
-	}
-
-	weight_type forwardWeight(const domain_type u, const cache_type cache) NBL_CONST_MEMBER_FUNC
-	{
-        NBL_IF_CONSTEXPR (UsePdfAsWeight)
-            return forwardPdf(u,cache);
-        return backwardWeight(cache.L);
-	}
-
-	weight_type backwardWeight(const codomain_type L) NBL_CONST_MEMBER_FUNC
-	{
-		NBL_IF_CONSTEXPR(UsePdfAsWeight)
-		{
+   using scalar_type  = T;
+   using vector2_type = vector<T, 2>;
+   using vector3_type = vector<T, 3>;
+   using vector4_type = vector<T, 4>;
+
+   // BackwardTractableSampler concept types
+   using domain_type   = vector2_type;
+   using codomain_type = vector3_type;
+   using density_type  = scalar_type;
+   using weight_type   = density_type;
+
+   struct cache_type
+   {
+      typename Bilinear<scalar_type>::cache_type bilinearCache;
+      vector3_type L; // TODO: same as projected triangle w.r.t. UsePdfAsWeight==false
+   };
+
+   // Shared finalization for both create() overloads: builds the bilinear patch, the inner sphrect
+   // sampler, and the UsePdfAsWeight=false extras. The two overloads differ only in how they
+   // compute bxdfPdfAtVertex (worldspace corner normalizations vs local-frame rsqrt(lenSq)).
+   static ProjectedSphericalRectangle<T, UsePdfAsWeight> create(NBL_CONST_REF_ARG(shapes::SphericalRectangle<T>) shape, NBL_CONST_REF_ARG(typename shapes::SphericalRectangle<T>::solid_angle_type) sa,
+      const vector4_type bxdfPdfAtVertex, const vector3_type _receiverNormal)
+   {
+      ProjectedSphericalRectangle<T, UsePdfAsWeight> retval;
+      retval.bilinearPatch = Bilinear<scalar_type>::create(bxdfPdfAtVertex);
+      // Reuse solid_angle_type to avoid recomputing mul(basis, origin - observer)
+      retval.sphrect = SphericalRectangle<T>::create(shape.basis, sa, shape.extents);
+      NBL_IF_CONSTEXPR(!UsePdfAsWeight)
+      {
+         retval.receiverNormal     = _receiverNormal;
+         const vector3_type nLocal = hlsl::mul(shape.basis, _receiverNormal);
+         retval.projSolidAngle     = shape.projectedSolidAngleFromLocal(sa.r0, nLocal);
+      }
+      return retval;
+   }
+
+   // Shouldn't produce NAN if all corners have 0 proj solid angle due to min density adds/clamps in the linear sampler
+   static ProjectedSphericalRectangle<T, UsePdfAsWeight> create(NBL_CONST_REF_ARG(shapes::CompressedSphericalRectangle<T>) compressed, const vector3_type observer, const vector3_type _receiverNormal, const bool _receiverWasBSDF)
+   {
+      // 4 normalized worldspace corners dotted with the worldspace receiver normal. Avoids the
+      // mul(basis, receiverNormal) data dependency of the uncompressed overload so these 4
+      // normalize+dot chains can pipeline alongside the basis/solid-angle work below.
+      const vector3_type c0   = compressed.origin - observer;
+      const vector3_type c1   = c0 + compressed.right;
+      const vector3_type c2   = c0 + compressed.up;
+      const vector3_type c3   = c1 + compressed.up;
+      const vector4_type dots = vector4_type(
+         hlsl::dot(hlsl::normalize(c0), _receiverNormal),
+         hlsl::dot(hlsl::normalize(c1), _receiverNormal),
+         hlsl::dot(hlsl::normalize(c2), _receiverNormal),
+         hlsl::dot(hlsl::normalize(c3), _receiverNormal));
+      const scalar_type minimumProjSolidAngle = scalar_type(0.0);
+      const vector4_type bxdfPdfAtVertex      = math::conditionalAbsOrMax(_receiverWasBSDF, dots, hlsl::promote<vector4_type>(minimumProjSolidAngle));
+
+      const shapes::SphericalRectangle<T> shape                         = shapes::SphericalRectangle<T>::create(compressed);
+      const typename shapes::SphericalRectangle<T>::solid_angle_type sa = shape.solidAngle(observer);
+      return create(shape, sa, bxdfPdfAtVertex, _receiverNormal);
+   }
+
+   // Shouldn't produce NAN if all corners have 0 proj solid angle due to min density adds/clamps in the linear sampler
+   static ProjectedSphericalRectangle<T, UsePdfAsWeight> create(NBL_CONST_REF_ARG(shapes::SphericalRectangle<T>) shape, const vector3_type observer, const vector3_type _receiverNormal, const bool _receiverWasBSDF)
+   {
+      // Local-frame path: unnormalized dot(corner_i, n) with n = basis * receiverNormal, then
+      // a single rsqrt<vec4>(lenSq) for all 4 corner normalizations at once.
+      const vector3_type n                                              = hlsl::mul(shape.basis, _receiverNormal);
+      const typename shapes::SphericalRectangle<T>::solid_angle_type sa = shape.solidAngle(observer);
+      const vector3_type r0                                             = sa.r0;
+
+      // All 4 corners share r0.z; x is r0.x or r0.x+ex, y is r0.y or r0.y+ey
+      const scalar_type r1x      = r0.x + shape.extents.x;
+      const scalar_type r1y      = r0.y + shape.extents.y;
+      const scalar_type base_dot = hlsl::dot(r0, n);
+      const scalar_type dx       = shape.extents.x * n.x;
+      const scalar_type dy       = shape.extents.y * n.y;
+      const vector4_type dots    = vector4_type(base_dot, base_dot + dx, base_dot + dy, base_dot + dx + dy);
+
+      const scalar_type r0zSq  = r0.z * r0.z;
+      const vector4_type lenSq = vector4_type(
+                                    r0.x * r0.x + r0.y * r0.y,
+                                    r1x * r1x + r0.y * r0.y,
+                                    r0.x * r0.x + r1y * r1y,
+                                    r1x * r1x + r1y * r1y) +
+         hlsl::promote<vector4_type>(r0zSq);
+
+      // dot(normalize(corner), n) = dot(corner, n) * rsqrt(lenSq). Bilinear corners: [0]=v00 [1]=v10 [2]=v01 [3]=v11
+      const scalar_type minimumProjSolidAngle = scalar_type(0.0);
+      const vector4_type bxdfPdfAtVertex      = math::conditionalAbsOrMax(_receiverWasBSDF,
+         dots * hlsl::rsqrt<vector4_type>(lenSq),
+         hlsl::promote<vector4_type>(minimumProjSolidAngle));
+
+      return create(shape, sa, bxdfPdfAtVertex, _receiverNormal);
+   }
+
+   // returns a normalized 3D direction in the local frame
+   codomain_type generateNormalizedLocal(const domain_type u, NBL_REF_ARG(cache_type) cache, NBL_REF_ARG(scalar_type) hitDist) NBL_CONST_MEMBER_FUNC
+   {
+      const vector2_type warped = bilinearPatch.generate(u, cache.bilinearCache);
+      typename SphericalRectangle<scalar_type>::cache_type sphrectCache; // there's nothing in the cache
+      const vector3_type dir = sphrect.generateNormalizedLocal(warped, sphrectCache, hitDist);
+      NBL_IF_CONSTEXPR(!UsePdfAsWeight)
+      cache.L = hlsl::mul(hlsl::transpose(sphrect.basis), dir);
+      return dir;
+   }
+
+   // returns a unnormalized 3D direction in the global frame
+   codomain_type generateUnnormalized(const domain_type u, NBL_REF_ARG(cache_type) cache) NBL_CONST_MEMBER_FUNC
+   {
+      const vector2_type warped = bilinearPatch.generate(u, cache.bilinearCache);
+      typename SphericalRectangle<scalar_type>::cache_type sphrectCache; // there's nothing in the cache
+      const vector3_type dir = sphrect.generateUnnormalized(warped, sphrectCache);
+      NBL_IF_CONSTEXPR(!UsePdfAsWeight)
+      cache.L = dir * hlsl::rsqrt(hlsl::dot(dir, dir));
+      return dir;
+   }
+
+   // returns a normalized 3D direction in the global frame
+   codomain_type generate(const domain_type u, NBL_REF_ARG(cache_type) cache) NBL_CONST_MEMBER_FUNC
+   {
+      const vector2_type warped = bilinearPatch.generate(u, cache.bilinearCache);
+      typename SphericalRectangle<scalar_type>::cache_type sphrectCache; // there's nothing in the cache
+      const vector3_type dir = sphrect.generate(warped, sphrectCache);
+      NBL_IF_CONSTEXPR(!UsePdfAsWeight)
+      cache.L = dir;
+      return dir;
+   }
+
+   density_type forwardPdf(const domain_type u, const cache_type cache) NBL_CONST_MEMBER_FUNC
+   {
+      return bilinearPatch.forwardPdf(u, cache.bilinearCache) / sphrect.solidAngle;
+   }
+
+   weight_type forwardWeight(const domain_type u, const cache_type cache) NBL_CONST_MEMBER_FUNC
+   {
+      NBL_IF_CONSTEXPR(UsePdfAsWeight)
+      return forwardPdf(u, cache);
+      return backwardWeight(cache.L);
+   }
+
+   weight_type backwardWeight(const codomain_type L) NBL_CONST_MEMBER_FUNC
+   {
+      NBL_IF_CONSTEXPR(UsePdfAsWeight)
+      {
 #if 0
 			const vector2_type warped = sphrect.generateInvese(L); // TODO: implement `generateInverse`
 			return bilinearPatch.backwardPdf(warped) / sphrect.solidAngle;
 #endif
-			return 0.f/0.f;
-		}
-        // make the MIS weight always abs because even when receiver is a BRDF, the samples in lower hemisphere will get killed and MIS weight never used
-        return hlsl::abs(hlsl::dot(L,receiverNormal))/projSolidAngle;
-	}
-
-	sampling::SphericalRectangle<T> sphrect;
-	Bilinear<scalar_type> bilinearPatch;
-	// TODO: same as projected triangle w.r.t. UsePdfAsWeight==false
-	vector3_type receiverNormal;
-	vector3_type projSolidAngle;
+         return hlsl::numeric_limits<weight_type>::quiet_NaN;
+      }
+      // make the MIS weight always abs because even when receiver is a BRDF, the samples in lower hemisphere will get killed and MIS weight never used
+      return hlsl::abs(hlsl::dot(L, receiverNormal)) / projSolidAngle;
+   }
+
+   sampling::SphericalRectangle<T> sphrect;
+   Bilinear<scalar_type> bilinearPatch;
+   // TODO: same as projected triangle w.r.t. UsePdfAsWeight==false
+   vector3_type receiverNormal;
+   scalar_type projSolidAngle;
 };
 
 } // namespace sampling
diff --git a/include/nbl/builtin/hlsl/sampling/projected_spherical_triangle.hlsl b/include/nbl/builtin/hlsl/sampling/projected_spherical_triangle.hlsl
index a542357ead..6eddd03e8a 100644
--- a/include/nbl/builtin/hlsl/sampling/projected_spherical_triangle.hlsl
+++ b/include/nbl/builtin/hlsl/sampling/projected_spherical_triangle.hlsl
@@ -55,7 +55,7 @@ struct ProjectedSphericalTriangle
     static ProjectedSphericalTriangle<T,UsePdfAsWeight> create(NBL_REF_ARG(shapes::SphericalTriangle<T>) shape, const vector3_type _receiverNormal, const bool _receiverWasBSDF)
     {
         ProjectedSphericalTriangle<T,UsePdfAsWeight> retval;
-        retval.sphtri = SphericalTriangle<T, UsePdfAsWeight>::create(shape);
+        retval.sphtri = SphericalTriangle<T>::create(shape);
 
         const scalar_type minimumProjSolidAngle = 0.0;
         matrix<T, 3, 3> m = matrix<T, 3, 3>(shape.vertices[0], shape.vertices[1], shape.vertices[2]);
@@ -83,7 +83,7 @@ struct ProjectedSphericalTriangle
 
     density_type forwardPdf(const domain_type u, const cache_type cache) NBL_CONST_MEMBER_FUNC
     {
-        return sphtri.getRcpSolidAngle() * bilinearPatch.forwardPdf(u,cache.bilinearCache);
+        return sphtri.rcpSolidAngle * bilinearPatch.forwardPdf(u,cache.bilinearCache);
     }
 
     weight_type forwardWeight(const domain_type u, const cache_type cache) NBL_CONST_MEMBER_FUNC
@@ -98,17 +98,17 @@ struct ProjectedSphericalTriangle
         NBL_IF_CONSTEXPR (UsePdfAsWeight)
         {
             const vector2_type u = sphtri.generateInverse(L);
-            return sphtri.getRcpSolidAngle() * bilinearPatch.backwardPdf(u);
+            return sphtri.rcpSolidAngle * bilinearPatch.backwardPdf(u);
         }
         // make the MIS weight always abs because even when receiver is a BRDF, the samples in lower hemisphere will get killed and MIS weight never used
         return hlsl::abs(hlsl::dot(L,receiverNormal))/projSolidAngle;
     }
 
-    sampling::SphericalTriangle<T, UsePdfAsWeight> sphtri;
+    sampling::SphericalTriangle<T> sphtri;
     Bilinear<scalar_type> bilinearPatch;
     // TODO: erase when UsePdfAsWeight==false
     vector3_type receiverNormal;
-    vector3_type projSolidAngle;
+    scalar_type projSolidAngle;
 };
 
 } // namespace sampling
diff --git a/include/nbl/builtin/hlsl/sampling/spherical_rectangle.hlsl b/include/nbl/builtin/hlsl/sampling/spherical_rectangle.hlsl
index 8029205384..6873362b1a 100644
--- a/include/nbl/builtin/hlsl/sampling/spherical_rectangle.hlsl
+++ b/include/nbl/builtin/hlsl/sampling/spherical_rectangle.hlsl
@@ -19,202 +19,212 @@ namespace hlsl
 namespace sampling
 {
 
-template<typename T>
+// UseRealSinCos=true  (default): math::sincos uses real sin+cos; full precision near au=n*pi, Jacobian test passes.
+// UseRealSinCos=false          : math::sincos uses cos + sqrt(1-c*c); saves one special-function op but loses mantissa as |cos_au| -> 1.
+template<typename T, bool UseRealSinCos = true>
 struct SphericalRectangle
 {
-    using scalar_type = T;
-    using vector2_type = vector<T, 2>;
-    using vector3_type = vector<T, 3>;
-    using vector4_type = vector<T, 4>;
-    using matrix3x3_type = matrix<T, 3, 3>;
-
-    // BackwardTractableSampler concept types
-    using domain_type = vector2_type;
-    using codomain_type = vector3_type;
-    using density_type = scalar_type;
-    using weight_type = density_type;
-
-    struct cache_type {};
-
-    static SphericalRectangle<T> create(NBL_CONST_REF_ARG(shapes::SphericalRectangle<T>) rect, const vector3_type observer)
-    {
-        return create(rect.basis,rect.solidAngle(observer),rect.extents);
-    }
-
-    static SphericalRectangle<T> create(const matrix3x3_type _basis, NBL_CONST_REF_ARG(typename shapes::SphericalRectangle<T>::solid_angle_type) sa, const vector2_type _extents)
-    {
-        SphericalRectangle<T> retval;
-        retval.basis = _basis;
-        retval.r0 = sa.r0;
-        retval.extents = _extents;
-
-        retval.solidAngle = sa.value;
-        retval.b0 = sa.n_z[0];
-        retval.b1 = sa.n_z[2];
-
-        math::sincos_accumulator<scalar_type> angle_adder = math::sincos_accumulator<scalar_type>::create(sa.cosGamma[2]);
-        angle_adder.addCosine(sa.cosGamma[3]);
-        retval.k = scalar_type(2.0) * numbers::pi<scalar_type> - angle_adder.getSumOfArccos();
-
-        return retval;
-    }
-
-    // Create directly from a local-frame corner position and rectangle extents.
-    // Use when you already know r0 (e.g. from a gnomonic projection) and don't
-    // need the shapes::SphericalRectangle + solidAngle(observer) roundtrip.
-    static SphericalRectangle<T> create(const matrix3x3_type _basis, const vector3_type _r0, const vector2_type _extents)
-    {
-        // Same math as shapes::SphericalRectangle::solidAngle() but without
-        // the mul(basis, origin - observer) step since we already have r0.
-        typename shapes::SphericalRectangle<T>::solid_angle_type sa;
-        sa.r0 = _r0;
-
-        const scalar_type zSq = _r0.z * _r0.z;
-        const vector4_type denorm_n_z = vector4_type(-_r0.y, _r0.x + _extents.x, _r0.y + _extents.y, -_r0.x);
-        sa.n_z = denorm_n_z * hlsl::rsqrt<vector4_type>(hlsl::promote<vector4_type>(zSq) + denorm_n_z * denorm_n_z);
-        sa.cosGamma = vector4_type(
-            -sa.n_z[0] * sa.n_z[1], -sa.n_z[1] * sa.n_z[2],
-            -sa.n_z[2] * sa.n_z[3], -sa.n_z[3] * sa.n_z[0]);
-
-        math::sincos_accumulator<scalar_type> acc = math::sincos_accumulator<scalar_type>::create(sa.cosGamma[0]);
-        acc.addCosine(sa.cosGamma[1]);
-        acc.addCosine(sa.cosGamma[2]);
-        acc.addCosine(sa.cosGamma[3]);
-        sa.value = acc.getSumOfArccos() - scalar_type(2.0) * numbers::pi<scalar_type>;
-
-        return create(_basis, sa, _extents);
-    }
-
-    // shared core of generate and generateSurfaceOffset
-    // returns (xu, hv, d) packed into a vector3; caller derives either 2D offset or 3D direction
-    struct SCommonGen
-    {
-        scalar_type xu;
-        scalar_type d2;
-        scalar_type hv;
-        scalar_type cosElevation2;
-    };
-    SCommonGen __generate(const domain_type u) NBL_CONST_MEMBER_FUNC
-    {
-        SCommonGen retval;
-
-        // algorithm needs r0.z < 0; use -abs(r0.z) without storing the flip
-        const scalar_type negAbsR0z = -hlsl::abs(r0.z);
-        const scalar_type r0zSq = r0.z * r0.z;
-        const vector2_type r1 = vector2_type(r0.x + extents.x, r0.y + extents.y);
-
-        const scalar_type au = u.x * solidAngle + k;
-        const scalar_type cos_au = hlsl::cos<scalar_type>(au);
-        const scalar_type numerator = b1 - cos_au * b0;
-        // (1-cos)*(1+cos) avoids catastrophic cancellation of 1-cos^2 when cos_au is near +/-1
-        const scalar_type sin_au_sq = (scalar_type(1.0) - cos_au) * (scalar_type(1.0) + cos_au);
-        const scalar_type absNegFu = hlsl::abs(numerator) * hlsl::rsqrt<scalar_type>(sin_au_sq);
-        const scalar_type rcpCu_2 = hlsl::max<scalar_type>(absNegFu * absNegFu + b0 * b0, scalar_type(1.0));
-        // sign(negFu) = sign(numerator) * sign(sin(au)); sin(au) < 0 iff au > PI
-        const scalar_type negFuSign = hlsl::select((au > numbers::pi<scalar_type>) != (numerator < scalar_type(0.0)), scalar_type(-1.0), scalar_type(1.0));
-        retval.xu = negAbsR0z * negFuSign * hlsl::rsqrt<scalar_type>(rcpCu_2 - scalar_type(1.0));
-        retval.xu = hlsl::clamp<scalar_type>(retval.xu, r0.x, r1.x); // avoid Infs
-        // TODO: see if we can compute this direct from definition of `d_2` ignoring the clamp on `xu`  and `cosElevation2` and reclamping this result in a different way
-        // with `d2 = r0zSq*(rcpCu_2*rcpCu_2-rcpCu_2)` or `d2 = r0zSq*rcpCu_2*(rcpCu_2-1)`
-        retval.d2 = retval.xu * retval.xu + r0zSq;
-
-        const scalar_type h0 = r0.y * hlsl::rsqrt<scalar_type>(retval.d2 + r0.y * r0.y);
-        const scalar_type h1 = r1.y * hlsl::rsqrt<scalar_type>(retval.d2 + r1.y * r1.y);
-        retval.hv = h0 + u.y * (h1 - h0);
-        retval.cosElevation2 = scalar_type(1.0) - hlsl::min<scalar_type>(retval.hv * retval.hv,1);
-
-        return retval;
-    }
-
-    // returns a normalized 3D direction in the local frame with correct r0.z sign
-    vector3_type generateNormalizedLocal(const domain_type u, NBL_REF_ARG(cache_type) cache, NBL_REF_ARG(scalar_type) hitDist) NBL_CONST_MEMBER_FUNC
-    {
-        const SCommonGen core = __generate(u);
-        scalar_type cosElevationOverD = hlsl::rsqrt(core.d2/core.cosElevation2);
-        // TODO: or shall we do some other more sophisticated clamp or correction? Is this even the right one to use?
-        cosElevationOverD = hlsl::select(hlsl::isnan(cosElevationOverD),1.f,cosElevationOverD);
-
-        // TODO: investigate if due to precision we need to compute this as a `sqrt` then the quantity being computed is `core.cosElevation2/core.d2`
-        // which what alss `generateLocalBasisXY` needs, in which case `__generate` can already compute it and return it as `hitDist2`
-        hitDist = 1.f/cosElevationOverD;
-
-        const vector3_type retval = vector3_type(core.xu / hitDist, core.hv, r0.z / hitDist);
-        assert(!hlsl::isnan(computeHitT(retval)));
-        return retval;
-    }
-
-    codomain_type generate(const domain_type u, NBL_REF_ARG(cache_type) cache) NBL_CONST_MEMBER_FUNC
-    {
-        scalar_type dummy;
-        const vector3_type localL = generateNormalizedLocal(u,cache,dummy);
-        // could return `hlsl::mul(hlsl::tranpose(basis),localL)` or just this
-        return basis[0]*localL[0]+basis[1]*localL[1]+basis[2]*localL[2];
-    }
-
-    // utility to determine maxT for a ray from L shot from the origin which we're sure intersects the rectangle
-    scalar_type computeHitT(const vector3_type L)
-    {
-        const scalar_type retval = hlsl::abs(r0.z/hlsl::dot(L,basis[2]));
-        {
-            const vector3_type hitPointRelative = L*retval;
-            const vector2_type uv = mul(basis,L).xy-r0.xy;
-            assert(uv[0]>=0.0 && uv[0]<=rectExtents[0]);
-            assert(uv[1]>=0.0 && uv[1]<=rectExtents[1]);
-        }
-        return retval;
-    }
-
-    // returns a 2D offset on the rectangle surface including the r0 corner -useful for generating unnormalized worldspace L
-    vector2_type generateLocalBasisXY(const domain_type u, NBL_REF_ARG(cache_type) cache) NBL_CONST_MEMBER_FUNC
-    {
-        const SCommonGen core = __generate(u);
-        const scalar_type r1y = r0.y + extents.y;
-        // TODO: see if we can compute this direct from definition of `d_2` ignoring the clamp on `xu`  and `cosElevation2`
-        const scalar_type yv = core.hv * hlsl::rsqrt(core.cosElevation2/core.d2);
-
-        // fun fact, when one of the operands to min or max is NaN, the SPIR-V builtin will select the other one
-        // TODO: maybe try just `min(yv,ry1)`
-        const vector2_type retval = vector2_type(core.xu,hlsl::clamp(yv,r0.y,r1y));
-        assert(retval[0] >= r0.x && retval[1] >= r0.y);
-        assert(retval[0] <= r0.x+rectExtents[0] && retval[1] <= r0.y+rectExtents[1]);
-        return retval;
-    }
-
-    // its basically the hitpoint minus the observer origin
-    codomain_type generateUnnormalized(const domain_type u, NBL_REF_ARG(cache_type) cache) NBL_CONST_MEMBER_FUNC
-    {
-        const vector2_type localXY = generateLocalBasisXY(u,cache);
-        // the `localXY` already contains r0.xy
-        return basis[0]*localXY[0]+basis[1]*localXY[1]+basis[2]*r0.z;
-    }
-
-    density_type forwardPdf(const domain_type u, const cache_type cache) NBL_CONST_MEMBER_FUNC
-    {
-        return scalar_type(1.0) / solidAngle;
-    }
-
-    weight_type forwardWeight(const domain_type u, const cache_type cache) NBL_CONST_MEMBER_FUNC
-    {
-        return forwardPdf(u, cache);
-    }
-
-    density_type backwardPdf(const codomain_type L) NBL_CONST_MEMBER_FUNC
-    {
-        return scalar_type(1.0) / solidAngle;
-    }
-
-    weight_type backwardWeight(const codomain_type L) NBL_CONST_MEMBER_FUNC
-    {
-        return backwardPdf(L);
-    }
-
-    matrix3x3_type basis;
-    vector3_type r0;
-    vector2_type extents;
-    scalar_type solidAngle;
-    scalar_type k;
-    scalar_type b0;
-    scalar_type b1;
+   using scalar_type    = T;
+   using vector2_type   = vector<T, 2>;
+   using vector3_type   = vector<T, 3>;
+   using vector4_type   = vector<T, 4>;
+   using matrix3x3_type = matrix<T, 3, 3>;
+
+   // BackwardTractableSampler concept types
+   using domain_type   = vector2_type;
+   using codomain_type = vector3_type;
+   using density_type  = scalar_type;
+   using weight_type   = density_type;
+
+   struct cache_type
+   {
+   };
+
+   static SphericalRectangle<T, UseRealSinCos> create(NBL_CONST_REF_ARG(shapes::SphericalRectangle<T>) rect, const vector3_type observer)
+   {
+      return create(rect.basis, rect.solidAngle(observer), rect.extents);
+   }
+
+   static SphericalRectangle<T, UseRealSinCos> create(const matrix3x3_type _basis, NBL_CONST_REF_ARG(typename shapes::SphericalRectangle<T>::solid_angle_type) sa, const vector2_type _extents)
+   {
+      SphericalRectangle<T, UseRealSinCos> retval;
+      retval.basis   = _basis;
+      retval.r0      = sa.r0;
+      retval.extents = _extents;
+
+      retval.solidAngle = sa.value;
+      retval.b0         = sa.n_z[0];
+      retval.b1         = sa.n_z[2];
+
+      math::sincos_accumulator<scalar_type> angle_adder = math::sincos_accumulator<scalar_type>::create(sa.cosGamma[2]);
+      angle_adder.addCosine(sa.cosGamma[3]);
+      retval.k = scalar_type(2.0) * numbers::pi<scalar_type> - angle_adder.getSumOfArccos();
+
+      return retval;
+   }
+
+   // Create directly from a local-frame corner position and rectangle extents.
+   // Use when you already know r0 (e.g. from a gnomonic projection) and don't
+   // need the shapes::SphericalRectangle + solidAngle(observer) roundtrip.
+   static SphericalRectangle<T, UseRealSinCos> create(const matrix3x3_type _basis, const vector3_type _r0, const vector2_type _extents)
+   {
+      // Same math as shapes::SphericalRectangle::solidAngle() but without
+      // the mul(basis, origin - observer) step since we already have r0.
+      typename shapes::SphericalRectangle<T>::solid_angle_type sa;
+      sa.r0 = _r0;
+
+      const scalar_type zSq         = _r0.z * _r0.z;
+      const vector4_type denorm_n_z = vector4_type(-_r0.y, _r0.x + _extents.x, _r0.y + _extents.y, -_r0.x);
+      sa.n_z                        = denorm_n_z * hlsl::rsqrt<vector4_type>(hlsl::promote<vector4_type>(zSq) + denorm_n_z * denorm_n_z);
+      sa.cosGamma                   = vector4_type(
+         -sa.n_z[0] * sa.n_z[1], -sa.n_z[1] * sa.n_z[2],
+         -sa.n_z[2] * sa.n_z[3], -sa.n_z[3] * sa.n_z[0]);
+
+      math::sincos_accumulator<scalar_type> acc = math::sincos_accumulator<scalar_type>::create(sa.cosGamma[0]);
+      acc.addCosine(sa.cosGamma[1]);
+      acc.addCosine(sa.cosGamma[2]);
+      acc.addCosine(sa.cosGamma[3]);
+      sa.value = acc.getSumOfArccos() - scalar_type(2.0) * numbers::pi<scalar_type>;
+
+      return create(_basis, sa, _extents);
+   }
+
+   // shared core of generate and generateSurfaceOffset
+   // returns (xu, hv, d) packed into a vector3; caller derives either 2D offset or 3D direction
+   struct SCommonGen
+   {
+      scalar_type xu;
+      scalar_type d2;
+      scalar_type hv;
+      scalar_type cosElevation2;
+   };
+   SCommonGen __generate(const domain_type u) NBL_CONST_MEMBER_FUNC
+   {
+      SCommonGen retval;
+
+      // algorithm needs r0.z < 0; use -abs(r0.z) without storing the flip
+      const scalar_type negAbsR0z = -hlsl::abs(r0.z);
+      const scalar_type r0zSq     = r0.z * r0.z;
+      const vector2_type r1       = vector2_type(r0.x + extents.x, r0.y + extents.y);
+
+      // au in [0, 4*pi] since u.x in [0,1], solidAngle <= 2*pi, k <= 2*pi.
+      // The sqrt path in math::sincos recovers sin sign via sign(theta), which only holds for theta in [-pi,pi].
+      // Real sin/cos handle any range, so only wrap on the sqrt path (compile-time branch folds away).
+      scalar_type au = u.x * solidAngle + k;
+      NBL_IF_CONSTEXPR(!UseRealSinCos)
+      {
+         // au in [0, 4*pi] -> peel at most two 2*pi periods to land in (-pi, pi].
+         au = hlsl::select(au > numbers::pi<scalar_type>, au - scalar_type(2.0) * numbers::pi<scalar_type>, au);
+         au = hlsl::select(au > numbers::pi<scalar_type>, au - scalar_type(2.0) * numbers::pi<scalar_type>, au);
+      }
+      scalar_type sin_au, cos_au;
+      math::sincos<scalar_type, UseRealSinCos>(au, sin_au, cos_au);
+      const scalar_type numerator = b1 - cos_au * b0;
+      // negFu carries the sign directly (numerator and sin_au both signed), so xu's sign drops
+      // out of a single multiply + hlsl::sign.
+      const scalar_type negFu   = numerator / sin_au;
+      const scalar_type rcpCu_2 = hlsl::max<scalar_type>(negFu * negFu + b0 * b0, scalar_type(1.0));
+      retval.xu                 = negAbsR0z * hlsl::sign(negFu) * hlsl::rsqrt<scalar_type>(rcpCu_2 - scalar_type(1.0));
+      retval.xu = hlsl::clamp<scalar_type>(retval.xu, r0.x, r1.x); // avoid Infs
+      retval.d2 = retval.xu * retval.xu + r0zSq;
+
+      const scalar_type h0 = r0.y * hlsl::rsqrt<scalar_type>(retval.d2 + r0.y * r0.y);
+      const scalar_type h1 = r1.y * hlsl::rsqrt<scalar_type>(retval.d2 + r1.y * r1.y);
+      retval.hv            = h0 + u.y * (h1 - h0);
+      retval.cosElevation2 = scalar_type(1.0) - hlsl::min<scalar_type>(retval.hv * retval.hv, 1);
+
+      return retval;
+   }
+
+   // returns a normalized 3D direction in the local frame with correct r0.z sign
+   vector3_type generateNormalizedLocal(const domain_type u, NBL_REF_ARG(cache_type) cache, NBL_REF_ARG(scalar_type) hitDist) NBL_CONST_MEMBER_FUNC
+   {
+      const SCommonGen core         = __generate(u);
+      scalar_type cosElevationOverD = hlsl::rsqrt(core.d2 / core.cosElevation2);
+      // TODO: or shall we do some other more sophisticated clamp or correction? Is this even the right one to use?
+      cosElevationOverD = hlsl::select(hlsl::isnan(cosElevationOverD), 1.f, cosElevationOverD);
+
+      // TODO: investigate if due to precision we need to compute this as a `sqrt` then the quantity being computed is `core.cosElevation2/core.d2`
+      // which what alss `generateLocalBasisXY` needs, in which case `__generate` can already compute it and return it as `hitDist2`
+      hitDist = 1.f / cosElevationOverD;
+
+      const vector3_type retval = vector3_type(core.xu / hitDist, core.hv, r0.z / hitDist);
+      assert(!hlsl::isnan(computeHitT(retval)));
+      return retval;
+   }
+
+   codomain_type generate(const domain_type u, NBL_REF_ARG(cache_type) cache) NBL_CONST_MEMBER_FUNC
+   {
+      scalar_type dummy;
+      const vector3_type localL = generateNormalizedLocal(u, cache, dummy);
+      // could return `hlsl::mul(hlsl::tranpose(basis),localL)` or just this
+      return basis[0] * localL[0] + basis[1] * localL[1] + basis[2] * localL[2];
+   }
+
+   // utility to determine maxT for a ray from L shot from the origin which we're sure intersects the rectangle
+   scalar_type computeHitT(const vector3_type L)
+   {
+      const scalar_type retval = hlsl::abs(r0.z / hlsl::dot(L, basis[2]));
+      {
+         const vector3_type hitPointRelative = L * retval;
+         const vector2_type uv               = mul(basis, L).xy - r0.xy;
+         assert(uv[0] >= 0.0 && uv[0] <= rectExtents[0]);
+         assert(uv[1] >= 0.0 && uv[1] <= rectExtents[1]);
+      }
+      return retval;
+   }
+
+   // returns a 2D offset on the rectangle surface including the r0 corner -useful for generating unnormalized worldspace L
+   vector2_type generateLocalBasisXY(const domain_type u, NBL_REF_ARG(cache_type) cache) NBL_CONST_MEMBER_FUNC
+   {
+      const SCommonGen core = __generate(u);
+      const scalar_type r1y = r0.y + extents.y;
+      // TODO: see if we can compute this direct from definition of `d_2` ignoring the clamp on `xu`  and `cosElevation2`
+      const scalar_type yv = core.hv * hlsl::rsqrt(core.cosElevation2 / core.d2);
+
+      // fun fact, when one of the operands to min or max is NaN, the SPIR-V builtin will select the other one
+      // TODO: maybe try just `min(yv,ry1)`
+      const vector2_type retval = vector2_type(core.xu, hlsl::clamp(yv, r0.y, r1y));
+      assert(retval[0] >= r0.x && retval[1] >= r0.y);
+      assert(retval[0] <= r0.x + rectExtents[0] && retval[1] <= r0.y + rectExtents[1]);
+      return retval;
+   }
+
+   // its basically the hitpoint minus the observer origin
+   codomain_type generateUnnormalized(const domain_type u, NBL_REF_ARG(cache_type) cache) NBL_CONST_MEMBER_FUNC
+   {
+      const vector2_type localXY = generateLocalBasisXY(u, cache);
+      // the `localXY` already contains r0.xy
+      return basis[0] * localXY[0] + basis[1] * localXY[1] + basis[2] * r0.z;
+   }
+
+   density_type forwardPdf(const domain_type u, const cache_type cache) NBL_CONST_MEMBER_FUNC
+   {
+      return scalar_type(1.0) / solidAngle;
+   }
+
+   weight_type forwardWeight(const domain_type u, const cache_type cache) NBL_CONST_MEMBER_FUNC
+   {
+      return forwardPdf(u, cache);
+   }
+
+   density_type backwardPdf(const codomain_type L) NBL_CONST_MEMBER_FUNC
+   {
+      return scalar_type(1.0) / solidAngle;
+   }
+
+   weight_type backwardWeight(const codomain_type L) NBL_CONST_MEMBER_FUNC
+   {
+      return backwardPdf(L);
+   }
+
+   matrix3x3_type basis;
+   vector3_type r0;
+   vector2_type extents;
+   scalar_type solidAngle;
+   scalar_type k;
+   scalar_type b0;
+   scalar_type b1;
 };
 
 } // namespace sampling
diff --git a/include/nbl/builtin/hlsl/sampling/spherical_triangle.hlsl b/include/nbl/builtin/hlsl/sampling/spherical_triangle.hlsl
index eb714a2505..59dea3b6ea 100644
--- a/include/nbl/builtin/hlsl/sampling/spherical_triangle.hlsl
+++ b/include/nbl/builtin/hlsl/sampling/spherical_triangle.hlsl
@@ -11,6 +11,7 @@
 #include <nbl/builtin/hlsl/math/quaternions.hlsl>
 #include <nbl/builtin/hlsl/math/fast_acos.hlsl>
 #include <nbl/builtin/hlsl/shapes/spherical_triangle.hlsl>
+#include <nbl/builtin/hlsl/ieee754.hlsl>
 
 namespace nbl
 {
@@ -25,9 +26,6 @@ enum SphericalTriangleAlgorithm : uint16_t
    STA_PBRT = 1
 };
 
-template<typename T, bool Bijective = false, SphericalTriangleAlgorithm Algorithm = STA_ARVO>
-struct SphericalTriangle;
-
 namespace impl
 {
 
@@ -53,9 +51,8 @@ T sumOfProducts(T a, T b, T c, T d)
 
 } // namespace impl
 
-// Non-bijective: Resamplable & Tractable (generate + pdf/weight, no inverse)
-template<typename T, SphericalTriangleAlgorithm Algorithm>
-struct SphericalTriangle<T, false, Algorithm>
+template<typename T, SphericalTriangleAlgorithm Algorithm = STA_ARVO>
+struct SphericalTriangle
 {
    using scalar_type = T;
    using vector2_type = vector<T, 2>;
@@ -76,19 +73,20 @@ struct SphericalTriangle<T, false, Algorithm>
       retval.rcpSolidAngle = scalar_type(1.0) / tri.solid_angle;
       retval.tri_vertices[0] = tri.vertices[0];
       retval.tri_vertices[1] = tri.vertices[1];
-      retval.triCosC = tri.cos_sides[2];
+      retval.triCosc = tri.cos_sides[2];
       // precompute great circle normal of arc AC: cross(A,C) has magnitude sin(b),
       // so multiplying by csc(b) normalizes it; zero when side AC is degenerate
-      const scalar_type cscB = tri.csc_sides[1];
-      const vector3_type arcACPlaneNormal = hlsl::cross(tri.vertices[0], tri.vertices[2]) * hlsl::select(cscB < numeric_limits<scalar_type>::max, cscB, scalar_type(0));
+      const scalar_type cscb = tri.csc_sides[1];
+      const vector3_type arcACPlaneNormal = hlsl::cross(tri.vertices[0], tri.vertices[2]) * hlsl::select(cscb < numeric_limits<scalar_type>::max, cscb, scalar_type(0));
       retval.e_C = hlsl::cross(arcACPlaneNormal, tri.vertices[0]);
       retval.cosA = tri.cos_vertices[0];
       retval.sinA = tri.sin_vertices[0];
       if (Algorithm == STA_ARVO)
       {
-         retval.sinA_triCosC = retval.sinA * retval.triCosC;
+         retval.sinA_triCosc = retval.sinA * retval.triCosc;
          retval.eCdotB = hlsl::dot(retval.e_C, tri.vertices[1]);
       }
+      retval.APlusC = tri.vertices[0] + tri.vertices[2];
       return retval;
    }
 
@@ -106,21 +104,22 @@ struct SphericalTriangle<T, false, Algorithm>
       if (Algorithm == STA_ARVO) // faster than PBRT
       {
          const scalar_type u_ = t - cosA;
-         const scalar_type v_ = s + sinA_triCosC;
+         const scalar_type v_ = s + sinA_triCosc;
          const scalar_type num = (v_ * t - u_ * s) * cosA - v_;
          const scalar_type denum = (v_ * s + u_ * t) * sinA;
 
+#define ACCURATE 1
 #if ACCURATE
          // sqrt(1 - cosBp^2) loses precision when cosBp ~ 1 (small u.x).
          // Use stable factorization: sinBp = sqrt((denum-num)(denum+num)) / |denum|
-         // where denum-num = sinA*(1+triCosC)*(1-cosA_hat).
+         // where denum-num = sinA*(1+triCosc)*(1-cosA_hat).
 
          // For large triangles with high u.x, cosA_hat can approach -1,
          // making (1 + cosA_hat) near zero and the division unstable.
          // Use the algebraic identity only when cosA_hat > 0 (safe denominator).
          const scalar_type rcpDenum = scalar_type(1) / denum;
          const scalar_type oneMinusCosAhat = hlsl::select(cosA_hat > scalar_type(0), sinA_hat * sinA_hat / (scalar_type(1) + cosA_hat), scalar_type(1) - cosA_hat);
-         const scalar_type DminusN = sinA * (scalar_type(1) + triCosC) * oneMinusCosAhat;
+         const scalar_type DminusN = sinA * (scalar_type(1) + triCosc) * oneMinusCosAhat;
          sinBp = sqrt<scalar_type>(max<scalar_type>(scalar_type(0), DminusN * (denum + num))) * nbl::hlsl::abs(rcpDenum);
          cosBp = scalar_type(1) - DminusN * rcpDenum;
 #else // 17% faster, less accurate
@@ -132,10 +131,10 @@ struct SphericalTriangle<T, false, Algorithm>
       {
          // PBRT uses cosPhi = -t, sinPhi = -s (pi offset from Arvo's A_hat)
          const scalar_type k1 = -t + cosA;
-         const scalar_type k2 = -s - sinA * triCosC;
+         const scalar_type k2 = -s - sinA * triCosc;
          cosBp = (k2 + impl::differenceOfProducts(k2, -t, k1, -s) * cosA) / (impl::sumOfProducts(k2, -s, k1, -t) * sinA);
          cosBp = nbl::hlsl::clamp(cosBp, scalar_type(-1), scalar_type(1));
-         sinBp = sqrt<scalar_type>(max<scalar_type>(scalar_type(0), scalar_type(1) - cosBp * cosBp));
+         sinBp = sqrt<scalar_type>(scalar_type(1) - cosBp * cosBp);
       }
 
       // Step 3: construct C' on the great circle through A toward C
@@ -143,82 +142,24 @@ struct SphericalTriangle<T, false, Algorithm>
 
       // Step 4: uniformly sample the great circle arc from B to C'
       scalar_type cosCpB;
-      if (Algorithm == STA_ARVO)
-         cosCpB = cosBp * triCosC + sinBp * eCdotB;
+      NBL_IF_CONSTEXPR(Algorithm == STA_ARVO)
+         cosCpB = cosBp * triCosc + sinBp * eCdotB;
       else
          cosCpB = nbl::hlsl::dot(cp, tri_vertices[1]);
+      // TODO: degeneracy at u.y = 0. z = 1 - u.y*(1-cosCpB) makes sinZ = sqrt(1-z^2) behave like
+      // sqrt(u.y) near zero, so dL/du.y diverges as u.y^(-1/2) and every higher derivative diverges
+      // faster. The forward Jacobian test in 37_HLSLSamplingTests reports ~2-8% error at u.y < 0.003
+      // even with the O(h^2) one-sided stencil because the third-derivative term dominates. At
+      // u.y = 0 exactly, L collapses to vertex B for all u.x (|det J| = 0), so it's an intrinsic
+      // property of the Arvo parameterization, not a bug. Fix: rework the arc interpolation to use
+      // a u.y -> angle mapping whose derivatives stay bounded near u.y = 0 (e.g. acos(z) = angle
+      // from B, then sample arc-length linearly), so the Jacobian is smooth and the skip band in
+      // the tester can be removed.
       const scalar_type z = scalar_type(1) - u.y * (scalar_type(1) - cosCpB);
       const scalar_type sinZ = sqrt<scalar_type>(max<scalar_type>(scalar_type(0), scalar_type(1) - z * z));
       return z * tri_vertices[1] + sinZ * hlsl::normalize(cp - cosCpB * tri_vertices[1]);
    }
 
-   density_type forwardPdf(const domain_type u, const cache_type cache) NBL_CONST_MEMBER_FUNC
-   {
-      return rcpSolidAngle;
-   }
-
-   weight_type forwardWeight(const domain_type u, const cache_type cache) NBL_CONST_MEMBER_FUNC
-   {
-      return forwardPdf(u, cache);
-   }
-
-   density_type backwardPdf(const codomain_type L) NBL_CONST_MEMBER_FUNC
-   {
-      return rcpSolidAngle;
-   }
-
-   weight_type backwardWeight(const codomain_type L) NBL_CONST_MEMBER_FUNC
-   {
-      return backwardPdf(L);
-   }
-
-   scalar_type getRcpSolidAngle() NBL_CONST_MEMBER_FUNC {return rcpSolidAngle;}
-
-   scalar_type rcpSolidAngle;
-   scalar_type cosA;
-   scalar_type sinA;
-   scalar_type sinA_triCosC; // precomputed sinA * triCosC
-   scalar_type eCdotB; // precomputed dot(e_C, tri_vertices[1]), Arvo only
-
-   vector3_type tri_vertices[2]; // A and B only
-   scalar_type triCosC;
-   vector3_type e_C; // precomputed cross(arcACPlaneNormal, A), unit vector perp to A in A-C plane
-};
-
-// Bijective: adds generateInverse, stores extra members for the inverse mapping
-template<typename T, SphericalTriangleAlgorithm Algorithm>
-struct SphericalTriangle<T, true, Algorithm>
-{
-   using scalar_type = T;
-   using vector2_type = vector<T, 2>;
-   using vector3_type = vector<T, 3>;
-
-   using base_type = SphericalTriangle<T, false, Algorithm>;
-   using domain_type = vector2_type;
-   using codomain_type = vector3_type;
-   using density_type = scalar_type;
-   using weight_type = density_type;
-
-   using cache_type = typename base_type::cache_type;
-
-   static SphericalTriangle create(NBL_CONST_REF_ARG(shapes::SphericalTriangle<T>) tri)
-   {
-      SphericalTriangle retval;
-      retval.base = base_type::create(tri);
-      retval.vertexC = tri.vertices[2];
-      // precompute great circle normal of arc AC (needed for generateInverse)
-      const scalar_type cscB = tri.csc_sides[1];
-      retval.arcACPlaneNormal = hlsl::cross(tri.vertices[0], tri.vertices[2]) * hlsl::select(cscB < numeric_limits<scalar_type>::max, cscB, scalar_type(0));
-      retval.triCscC = tri.csc_sides[2];
-      return retval;
-   }
-
-   codomain_type generate(const domain_type u, NBL_REF_ARG(cache_type) cache) NBL_CONST_MEMBER_FUNC
-   {
-      return base.generate(u, cache);
-   }
-
-
    // generate() works in two steps:
    //   u.x -> pick C' on arc AC (choosing a sub-area fraction)
    //   u.y -> pick L on arc B->C' (linear interpolation)
@@ -230,15 +171,22 @@ struct SphericalTriangle<T, true, Algorithm>
    domain_type generateInverse(const codomain_type L) NBL_CONST_MEMBER_FUNC
    {
       // Step 1: find C' = intersection of great circles (B,L) and (A,C)
-      const vector3_type BxL = nbl::hlsl::cross(base.tri_vertices[1], L);
+      const vector3_type BxL = nbl::hlsl::cross(tri_vertices[1], L);
       const scalar_type sinBL_sq = nbl::hlsl::dot(BxL, BxL);
-      if (sinBL_sq < numeric_limits<scalar_type>::epsilon)
+
+      // C' lies on arc AC, so C' = A*cos(t) + e_C*sin(t).
+      // C' also lies on the B-L plane, so dot(BxL, C') = 0.
+      // Solving: (cos(t), sin(t)) = (tripleE, -tripleA) / R
+      const scalar_type tripleA = nbl::hlsl::dot(BxL, tri_vertices[0]);
+      const scalar_type tripleE = nbl::hlsl::dot(BxL, e_C);
+      const scalar_type R_sq = tripleA * tripleA + tripleE * tripleE;
+
+      if (sinBL_sq < numeric_limits<scalar_type>::epsilon || R_sq < numeric_limits<scalar_type>::epsilon)
       {
-         // L ~ B: u.y ~ 0, u.x is indeterminate (all u.x map to B when u.y=0).
          // Recover u.y from |L-B|^2 / |A-B|^2 (using C'=A; the (1-cosCpB) ratio
          // cancels so any C' gives the same result).
-         const vector3_type LminusB = L - base.tri_vertices[1];
-         const vector3_type AminusB = base.tri_vertices[0] - base.tri_vertices[1];
+         const vector3_type LminusB = L - tri_vertices[1];
+         const vector3_type AminusB = tri_vertices[0] - tri_vertices[1];
          const scalar_type v_num = nbl::hlsl::dot(LminusB, LminusB);
          const scalar_type v_denom = nbl::hlsl::dot(AminusB, AminusB);
          const scalar_type v = hlsl::select(v_denom > numeric_limits<scalar_type>::epsilon,
@@ -247,20 +195,10 @@ struct SphericalTriangle<T, true, Algorithm>
          return vector2_type(scalar_type(0.0), v);
       }
 
-      // C' lies on arc AC, so C' = A*cos(t) + e_C*sin(t).
-      // C' also lies on the B-L plane, so dot(BxL, C') = 0.
-      // Solving: (cos(t), sin(t)) = (tripleE, -tripleA) / R
-      const scalar_type tripleA = nbl::hlsl::dot(BxL, base.tri_vertices[0]);
-      const scalar_type tripleE = nbl::hlsl::dot(BxL, base.e_C);
-      const scalar_type R_sq = tripleA * tripleA + tripleE * tripleE;
-      if (R_sq < numeric_limits<scalar_type>::epsilon)
-         return vector2_type(scalar_type(0.0), scalar_type(0.0));
-
       const scalar_type rcpR = scalar_type(1.0) / nbl::hlsl::sqrt(R_sq);
-      vector3_type cp = base.tri_vertices[0] * (tripleE * rcpR) + base.e_C * (-tripleA * rcpR);
-      // two intersections exist; pick the one on the minor arc A->C
-      if (nbl::hlsl::dot(cp, base.tri_vertices[0] + vertexC) < scalar_type(0.0))
-         cp = -cp;
+      vector3_type cp = tri_vertices[0] * (tripleE * rcpR) + e_C * (-tripleA * rcpR);
+      // two intersections exist; pick the one on the minor arc A->C (branchless sign flip)
+      cp = ieee754::flipSignIfRHSNegative(cp, nbl::hlsl::dot(cp, APlusC));
 
       // Step 2: u.x = solidAngle(A,B,C') / solidAngle(A,B,C)
       // Van Oosterom-Strackee: tan(Omega/2) = |A.(BxC')| / (1 + A.B + B.C' + C'.A)
@@ -271,19 +209,19 @@ struct SphericalTriangle<T, true, Algorithm>
       // Expanding C' = cosBp*A + sinBp*e_C into the triple product:
       //   A.(BxC') = cosBp * A.(BxA) + sinBp * A.(Bxe_C) = sinBp * A.(Bxe_C)
       // since A.(BxA) = 0 identically. This avoids the cancellation.
-      const scalar_type cosBp_inv = nbl::hlsl::dot(cp, base.tri_vertices[0]);
-      const scalar_type sinBp_inv = nbl::hlsl::dot(cp, base.e_C);
-      const scalar_type AxBdotE = nbl::hlsl::dot(base.tri_vertices[0], nbl::hlsl::cross(base.tri_vertices[1], base.e_C));
+      const scalar_type cosBp_inv = nbl::hlsl::dot(cp, tri_vertices[0]);
+      const scalar_type sinBp_inv = nbl::hlsl::dot(cp, e_C);
+      const scalar_type AxBdotE = nbl::hlsl::dot(tri_vertices[0], nbl::hlsl::cross(tri_vertices[1], e_C));
       const scalar_type num = sinBp_inv * AxBdotE;
-      const scalar_type cosCpB = nbl::hlsl::dot(base.tri_vertices[1], cp);
-      const scalar_type den = scalar_type(1.0) + base.triCosC + cosCpB + cosBp_inv;
+      const scalar_type cosCpB = nbl::hlsl::dot(tri_vertices[1], cp);
+      const scalar_type den = scalar_type(1.0) + triCosc + cosCpB + cosBp_inv;
       const scalar_type subSolidAngle = scalar_type(2.0) * nbl::hlsl::atan2(nbl::hlsl::abs(num), den);
-      const scalar_type u = nbl::hlsl::clamp(subSolidAngle * base.rcpSolidAngle, scalar_type(0.0), scalar_type(1.0));
+      const scalar_type u = nbl::hlsl::clamp(subSolidAngle * rcpSolidAngle, scalar_type(0.0), scalar_type(1.0));
 
       // Step 3: u.y = |L-B|^2 / |C'-B|^2
       // Squared Euclidean distance avoids catastrophic cancellation vs (1-dot)/(1-dot)
-      const vector3_type LminusB = L - base.tri_vertices[1];
-      const vector3_type cpMinusB = cp - base.tri_vertices[1];
+      const vector3_type LminusB = L - tri_vertices[1];
+      const vector3_type cpMinusB = cp - tri_vertices[1];
       const scalar_type v_num = nbl::hlsl::dot(LminusB, LminusB);
       const scalar_type v_denom = nbl::hlsl::dot(cpMinusB, cpMinusB);
       const scalar_type v = hlsl::select(v_denom > numeric_limits<scalar_type>::epsilon,
@@ -296,7 +234,7 @@ struct SphericalTriangle<T, true, Algorithm>
 
    density_type forwardPdf(const domain_type u, const cache_type cache) NBL_CONST_MEMBER_FUNC
    {
-      return base.rcpSolidAngle;
+      return rcpSolidAngle;
    }
 
    weight_type forwardWeight(const domain_type u, const cache_type cache) NBL_CONST_MEMBER_FUNC
@@ -306,20 +244,24 @@ struct SphericalTriangle<T, true, Algorithm>
 
    density_type backwardPdf(const codomain_type L) NBL_CONST_MEMBER_FUNC
    {
-      return base.rcpSolidAngle;
+      return rcpSolidAngle;
    }
 
    weight_type backwardWeight(const codomain_type L) NBL_CONST_MEMBER_FUNC
    {
       return backwardPdf(L);
    }
-   
-   scalar_type getRcpSolidAngle() NBL_CONST_MEMBER_FUNC {return base.rcpSolidAngle;}
 
-   base_type base;
-   vector3_type vertexC;
-   vector3_type arcACPlaneNormal; // precomputed normalize(cross(A, C)), great circle normal of arc AC
-   scalar_type triCscC;
+   scalar_type rcpSolidAngle;
+   scalar_type cosA;
+   scalar_type sinA;
+   scalar_type sinA_triCosc; // precomputed sinA * triCosc
+   scalar_type eCdotB; // precomputed dot(e_C, tri_vertices[1]), Arvo only
+
+   vector3_type tri_vertices[2]; // A and B only
+   scalar_type triCosc;
+   vector3_type e_C; // precomputed cross(arcACPlaneNormal, A), unit vector perp to A in A-C plane
+   vector3_type APlusC; // precomputed A + C, used to pick the minor-arc intersection in generateInverse
 };
 
 } // namespace sampling
diff --git a/include/nbl/builtin/hlsl/sampling/uniform_spheres.hlsl b/include/nbl/builtin/hlsl/sampling/uniform_spheres.hlsl
index 21901a9628..a5a14660f4 100644
--- a/include/nbl/builtin/hlsl/sampling/uniform_spheres.hlsl
+++ b/include/nbl/builtin/hlsl/sampling/uniform_spheres.hlsl
@@ -35,10 +35,11 @@ struct UniformHemisphere
 
 	static codomain_type generate(const domain_type u)
 	{
-		typename ConcentricMapping<T>::cache_type cmCache;
+		typename ConcentricMapping<scalar_type>::cache_type cmCache;
 		const vector_t2 p = ConcentricMapping<T>::generate(u, cmCache);
+		assert(cmCache.r2 <= scalar_type(1.0));
 		const T z = T(1.0) - cmCache.r2;
-		const T xyScale = hlsl::sqrt<T>(hlsl::max<T>(T(0.0), T(2.0) - cmCache.r2));
+		const T xyScale = hlsl::sqrt<scalar_type>(scalar_type(2.0) - cmCache.r2);
 		return vector_t3(p.x * xyScale, p.y * xyScale, z);
 	}
 
@@ -50,13 +51,13 @@ struct UniformHemisphere
 	static domain_type generateInverse(const codomain_type v)
 	{
 		// r_disk / r_xy = sqrt(1-z) / sqrt(1-z^2) = 1/sqrt(1+z)
-		const T scale = T(1.0) / hlsl::sqrt<T>(T(1.0) + v.z);
+		const scalar_type scale = hlsl::rsqrt<T>(T(1.0) + v.z);
 		return ConcentricMapping<T>::generateInverse(vector_t2(v.x * scale, v.y * scale));
 	}
 
 	static density_type forwardPdf(const domain_type u, const cache_type cache)
 	{
-		return T(0.5) * numbers::inv_pi<T>;
+		return scalar_type(0.5) * numbers::inv_pi<T>;
 	}
 
 	static weight_type forwardWeight(const domain_type u, const cache_type cache)

From 37cb8173e30ad23d184c2ec9724012bf1beb03ef Mon Sep 17 00:00:00 2001
From: Karim Mohamed <karimsayedre@gmail.com>
Date: Fri, 24 Apr 2026 21:27:52 +0300
Subject: [PATCH 23/26] Eytzinger CDF, alias table is packed, 2 versions, CDF
 sampler now has 3 enumed types (tracking, YOLO, Eytzinger)

---
 examples_tests                                |   2 +-
 .../builtin/hlsl/sampling/alias_table.hlsl    | 201 +++++++++++++-----
 .../hlsl/sampling/alias_table_builder.h       | 198 +++++++++++------
 .../sampling/cumulative_probability_builder.h |  55 +++++
 4 files changed, 342 insertions(+), 114 deletions(-)

diff --git a/examples_tests b/examples_tests
index fb5cfa2bca..a4559b941a 160000
--- a/examples_tests
+++ b/examples_tests
@@ -1 +1 @@
-Subproject commit fb5cfa2bcaa0a92aafb429f3d390658d28d1ca02
+Subproject commit a4559b941a9d0f465ccc8687630077e045829403
diff --git a/include/nbl/builtin/hlsl/sampling/alias_table.hlsl b/include/nbl/builtin/hlsl/sampling/alias_table.hlsl
index 15742e10f3..8e7b3249a0 100644
--- a/include/nbl/builtin/hlsl/sampling/alias_table.hlsl
+++ b/include/nbl/builtin/hlsl/sampling/alias_table.hlsl
@@ -7,6 +7,7 @@
 
 #include <nbl/builtin/hlsl/cpp_compat.hlsl>
 #include <nbl/builtin/hlsl/bit.hlsl>
+#include <nbl/builtin/hlsl/cpp_compat/intrinsics.hlsl>
 #include <nbl/builtin/hlsl/concepts/core.hlsl>
 #include <nbl/builtin/hlsl/concepts/accessors/generic_shared_data.hlsl>
 
@@ -17,84 +18,187 @@ namespace hlsl
 namespace sampling
 {
 
-// Alias Method (Vose/Walker) discrete sampler.
-//
-// Samples a discrete index in [0, N) with probability proportional to
-// precomputed weights in O(1) time per sample, using a prebuilt alias table.
-//
-// Accessor template parameters must satisfy GenericReadAccessor:
-//   accessor.template get<V, I>(index, outVal)  // void, writes to outVal
-//
-// - ProbabilityAccessor: reads scalar_type threshold in [0, 1] for bin i
-// - AliasIndexAccessor:  reads uint32_t redirect index for bin i
-// - PdfAccessor:         reads scalar_type weight[i] / totalWeight
-//
-// Satisfies TractableSampler (not BackwardTractableSampler: the mapping is discrete).
-// The cache stores the PDF value looked up during generate, avoiding redundant
-// storage of the codomain (sampled index) which is already the return value.
-template<typename T, typename Domain, typename Codomain, typename ProbabilityAccessor, typename AliasIndexAccessor, typename PdfAccessor
+// Packed alias-entry bit layout shared by every packed variant. One 32-bit
+// word holds the redirect index in the low Log2N bits and the stay-
+// probability quantized as an unorm in the high (32 - Log2N) bits.
+//   u * N  = scaled;  bin = floor(scaled);  remainder = scaled - bin
+//   if (remainder < getStayProb(word))  -> result = bin
+//   else                                -> result = getTarget(word)
+// Quantizing the threshold to (32 - Log2N) bits is precision-neutral: `u`
+// already consumed Log2N bits of randomness producing `bin`, so `remainder`
+// carries exactly that many bits of discriminatory power.
+namespace impl
+{
+template<uint32_t Log2N>
+struct AliasBitDecoder
+{
+	static uint32_t getTarget(uint32_t word)
+	{
+		return word & ((1u << Log2N) - 1u);
+	}
+	template<typename T>
+	static T getStayProb(uint32_t word)
+	{
+		const uint32_t unormMax = (~0u) >> Log2N;
+		return T(word >> Log2N) / T(unormMax);
+	}
+};
+} // namespace impl
+
+// 8 B entry used by the NBig == true variant. Embeds the bin's own pdf
+// alongside the packed word so the common stay-case needs no extra tap.
+template<typename T>
+struct PackedAliasEntryB
+{
+	uint32_t packedWord;	// low Log2N: redirect target; high 32-Log2N: stayProb unorm
+	T ownPdf;				// pdf of this bin
+};
+
+
+// NBig == false: 4 B packed word per bin + separate pdf[] array. Per sample
+// = one 4 B word load + one unconditional 4 B pdf[] tap indexed by the
+// selected bin (either the current bin or its redirect). Total 8 B whether
+// the sample stays or aliases. Favours small N.
+template<typename T, typename Domain, typename Codomain, typename PackedWordAccessor, typename PdfAccessor, uint32_t Log2N
 	NBL_PRIMARY_REQUIRES(
 		concepts::UnsignedIntegralScalar<Codomain> &&
-		concepts::accessors::GenericReadAccessor<ProbabilityAccessor, T, Codomain> &&
-		concepts::accessors::GenericReadAccessor<AliasIndexAccessor, Codomain, Codomain> &&
+		concepts::accessors::GenericReadAccessor<PackedWordAccessor, uint32_t, Codomain> &&
 		concepts::accessors::GenericReadAccessor<PdfAccessor, T, Codomain>)
-struct AliasTable
+struct PackedAliasTableA
 {
 	using scalar_type = T;
-
 	using domain_type = Domain;
 	using codomain_type = Codomain;
 	using density_type = scalar_type;
 	using weight_type = density_type;
+	using decoder = impl::AliasBitDecoder<Log2N>;
+	NBL_CONSTEXPR_STATIC_INLINE bool NBig = false;
 
 	struct cache_type
 	{
 		density_type pdf;
 	};
 
-	static AliasTable create(NBL_CONST_REF_ARG(ProbabilityAccessor) _probAccessor, NBL_CONST_REF_ARG(AliasIndexAccessor) _aliasAccessor, NBL_CONST_REF_ARG(PdfAccessor) _pdfAccessor, codomain_type _size)
+	static PackedAliasTableA create(NBL_CONST_REF_ARG(PackedWordAccessor) _entryAcc, NBL_CONST_REF_ARG(PdfAccessor) _pdfAcc, codomain_type _size)
 	{
-		AliasTable retval;
-		retval.probAccessor = _probAccessor;
-		retval.aliasAccessor = _aliasAccessor;
-		retval.pdfAccessor = _pdfAccessor;
-		// Precompute tableSize as float minus 1 ULP so that u=1.0 maps to bin N-1
+		PackedAliasTableA retval;
+		retval.entryAcc = _entryAcc;
+		retval.pdfAcc = _pdfAcc;
 		const scalar_type exact = scalar_type(_size);
 		retval.tableSizeMinusUlp = nbl::hlsl::bit_cast<scalar_type>(nbl::hlsl::bit_cast<uint32_t>(exact) - 1u);
 		return retval;
 	}
 
-	// BasicSampler interface
 	codomain_type generate(const domain_type u) NBL_CONST_MEMBER_FUNC
 	{
 		const scalar_type scaled = u * tableSizeMinusUlp;
 		const codomain_type bin = _static_cast<codomain_type>(scaled);
 		const scalar_type remainder = scaled - scalar_type(bin);
 
-		scalar_type prob;
-		probAccessor.template get<scalar_type, codomain_type>(bin, prob);
-
-		// Use if-statement to avoid select: aliasIndex is a dependent read
-		codomain_type result;
-		if (remainder < prob)
-		{
-			result = bin;
-		}
-		else
-		{
-			codomain_type alias;
-			aliasAccessor.template get<codomain_type, codomain_type>(bin, alias);
-			result = alias;
-		}
+		uint32_t packedWord;
+		entryAcc.template get<uint32_t, codomain_type>(bin, packedWord);
+		return hlsl::select(remainder < decoder::template getStayProb<scalar_type>(packedWord), bin, codomain_type(decoder::getTarget(packedWord)));
+	}
 
+	codomain_type generate(const domain_type u, NBL_REF_ARG(cache_type) cache) NBL_CONST_MEMBER_FUNC
+	{
+		const codomain_type result = generate(u);
+		pdfAcc.template get<scalar_type, codomain_type>(result, cache.pdf);
 		return result;
 	}
 
-	// TractableSampler interface
+	density_type forwardPdf(const domain_type u, NBL_CONST_REF_ARG(cache_type) cache) NBL_CONST_MEMBER_FUNC
+	{
+		return cache.pdf;
+	}
+
+	weight_type forwardWeight(const domain_type u, NBL_CONST_REF_ARG(cache_type) cache) NBL_CONST_MEMBER_FUNC
+	{
+		return cache.pdf;
+	}
+
+	density_type backwardPdf(const codomain_type v) NBL_CONST_MEMBER_FUNC
+	{
+		scalar_type pdf;
+		pdfAcc.template get<scalar_type, codomain_type>(v, pdf);
+		return pdf;
+	}
+
+	weight_type backwardWeight(const codomain_type v) NBL_CONST_MEMBER_FUNC
+	{
+		return backwardPdf(v);
+	}
+
+	PackedWordAccessor entryAcc;
+	PdfAccessor pdfAcc;
+	scalar_type tableSizeMinusUlp;
+};
+
+// NBig == true: 8 B entry {packedWord, ownPdf} + separate pdf[] array. Per
+// sample = one 8 B entry load (covers the common stay case where cache
+// already has ownPdf). If the sample aliases, a conditional 4 B pdf[target]
+// tap fills the cache. Total 8 B stay, 12 B aliased. Favours large N.
+template<typename T, typename Domain, typename Codomain, typename EntryAccessor, typename PdfAccessor, uint32_t Log2N
+	NBL_PRIMARY_REQUIRES(
+		concepts::UnsignedIntegralScalar<Codomain> &&
+		concepts::accessors::GenericReadAccessor<EntryAccessor, PackedAliasEntryB<T>, Codomain> &&
+		concepts::accessors::GenericReadAccessor<PdfAccessor, T, Codomain>)
+struct PackedAliasTableB
+{
+	using scalar_type = T;
+	using domain_type = Domain;
+	using codomain_type = Codomain;
+	using density_type = scalar_type;
+	using weight_type = density_type;
+	using entry_type = PackedAliasEntryB<scalar_type>;
+	using decoder = impl::AliasBitDecoder<Log2N>;
+	NBL_CONSTEXPR_STATIC_INLINE bool NBig = true;
+
+	struct cache_type
+	{
+		density_type pdf;
+	};
+
+	static PackedAliasTableB create(NBL_CONST_REF_ARG(EntryAccessor) _entryAcc, NBL_CONST_REF_ARG(PdfAccessor) _pdfAcc, codomain_type _size)
+	{
+		PackedAliasTableB retval;
+		retval.entryAcc = _entryAcc;
+		retval.pdfAcc = _pdfAcc;
+		const scalar_type exact = scalar_type(_size);
+		retval.tableSizeMinusUlp = nbl::hlsl::bit_cast<scalar_type>(nbl::hlsl::bit_cast<uint32_t>(exact) - 1u);
+		return retval;
+	}
+
+	codomain_type generate(const domain_type u) NBL_CONST_MEMBER_FUNC
+	{
+		const scalar_type scaled = u * tableSizeMinusUlp;
+		const codomain_type bin = _static_cast<codomain_type>(scaled);
+		const scalar_type remainder = scaled - scalar_type(bin);
+
+		entry_type entry;
+		entryAcc.template get<entry_type, codomain_type>(bin, entry);
+		return hlsl::select(remainder < decoder::template getStayProb<scalar_type>(entry.packedWord), bin, codomain_type(decoder::getTarget(entry.packedWord)));
+	}
+
 	codomain_type generate(const domain_type u, NBL_REF_ARG(cache_type) cache) NBL_CONST_MEMBER_FUNC
 	{
-		const codomain_type result = generate(u);
-		pdfAccessor.template get<scalar_type, codomain_type>(result, cache.pdf);
+		const scalar_type scaled = u * tableSizeMinusUlp;
+		const codomain_type bin = _static_cast<codomain_type>(scaled);
+		const scalar_type remainder = scaled - scalar_type(bin);
+
+		entry_type entry;
+		entryAcc.template get<entry_type, codomain_type>(bin, entry);
+
+		const bool stay = remainder < decoder::template getStayProb<scalar_type>(entry.packedWord);
+		
+		cache.pdf = entry.ownPdf;
+		codomain_type result = bin;
+		if (!stay)
+		{
+			const codomain_type target = codomain_type(decoder::getTarget(entry.packedWord));
+			pdfAcc.template get<scalar_type, codomain_type>(target, cache.pdf);
+			result = target;
+		}
 		return result;
 	}
 
@@ -111,7 +215,7 @@ struct AliasTable
 	density_type backwardPdf(const codomain_type v) NBL_CONST_MEMBER_FUNC
 	{
 		scalar_type pdf;
-		pdfAccessor.template get<scalar_type, codomain_type>(v, pdf);
+		pdfAcc.template get<scalar_type, codomain_type>(v, pdf);
 		return pdf;
 	}
 
@@ -120,9 +224,8 @@ struct AliasTable
 		return backwardPdf(v);
 	}
 
-	ProbabilityAccessor probAccessor;
-	AliasIndexAccessor aliasAccessor;
-	PdfAccessor pdfAccessor;
+	EntryAccessor entryAcc;
+	PdfAccessor pdfAcc;
 	scalar_type tableSizeMinusUlp;
 };
 
diff --git a/include/nbl/builtin/hlsl/sampling/alias_table_builder.h b/include/nbl/builtin/hlsl/sampling/alias_table_builder.h
index d02d21488c..2c7c53fd5f 100644
--- a/include/nbl/builtin/hlsl/sampling/alias_table_builder.h
+++ b/include/nbl/builtin/hlsl/sampling/alias_table_builder.h
@@ -5,7 +5,12 @@
 #ifndef _NBL_BUILTIN_HLSL_SAMPLING_ALIAS_TABLE_BUILDER_H_INCLUDED_
 #define _NBL_BUILTIN_HLSL_SAMPLING_ALIAS_TABLE_BUILDER_H_INCLUDED_
 
+#include <cmath>
 #include <cstdint>
+#include <span>
+#include <vector>
+
+#include <nbl/builtin/hlsl/sampling/alias_table.hlsl>
 
 namespace nbl
 {
@@ -15,74 +20,139 @@ namespace sampling
 {
 
 // Builds the alias table from an array of non-negative weights.
-// All output arrays must be pre-allocated to N entries.
 //
-// Parameters:
-//   weights         - input weights (non-negative, at least one must be > 0)
-//   N               - number of entries
-//   outProbability  - [out] alias table probability threshold per bin, in [0, 1]
-//   outAlias        - [out] alias redirect index per bin
-//   outPdf          - [out] normalized PDF per entry: weight[i] / sum(weights)
-//   workspace       - scratch buffer of N uint32_t entries
+// When `weights.size()` is a power of two, the builder transparently appends
+// one zero-weight dummy bucket so the GPU-facing table size is N+1 (odd),
+// which breaks PoT-periodic address patterns that alias memory channels /
+// cache sets on most GPUs. The sampled distribution is unchanged, the dummy
+// has stayProb = 0 and always redirects to a real donor.
+//
+// Output vectors are `resize`d by the builder to the final table size, so
+// the caller just passes (possibly empty) vectors and reads back the
+// returned size. That returned value is what to pass to the sampler's
+// `_size` argument and to use when packing / uploading.
 template<typename T>
 struct AliasTableBuilder
 {
-	static void build(std::span<const T> weights, T* outProbability, uint32_t* outAlias, T* outPdf, uint32_t* workspace)
-	{
-		T totalWeight = T(0);
-      for (uint32_t i = 0; i < weights.size(); i++)
-			totalWeight += weights[i];
-
-		const T rcpTotalWeight = T(1) / totalWeight;
-
-		// Compute PDFs, scaled probabilities, and partition into small/large in one pass
-		uint32_t smallEnd = 0;
-      uint32_t largeBegin = weights.size();
-      for (uint32_t i = 0; i < weights.size(); i++)
-		{
-			outPdf[i] = weights[i] * rcpTotalWeight;
-         outProbability[i] = outPdf[i] * T(weights.size());
-
-			if (outProbability[i] < T(1))
-				workspace[smallEnd++] = i;
-			else
-				workspace[--largeBegin] = i;
-		}
-
-		// Pair small and large entries
-      while (smallEnd > 0 && largeBegin < weights.size())
-		{
-			const uint32_t s = workspace[--smallEnd];
-			const uint32_t l = workspace[largeBegin];
-
-			outAlias[s] = l;
-			// outProbability[s] already holds the correct probability for bin s
-
-			outProbability[l] -= (T(1) - outProbability[s]);
-
-			if (outProbability[l] < T(1))
-			{
-				// l became small: pop from large, push to small
-				largeBegin++;
-				workspace[smallEnd++] = l;
-			}
-			// else l stays in large (don't pop, reuse next iteration)
-		}
-
-		// Remaining entries (floating point rounding artifacts)
-		while (smallEnd > 0)
-		{
-			const uint32_t s = workspace[--smallEnd];
-			outProbability[s] = T(1);
-			outAlias[s] = s;
-		}
-      while (largeBegin < weights.size())
-		{
-			const uint32_t l = workspace[largeBegin++];
-			outProbability[l] = T(1);
-			outAlias[l] = l;
-		}
-	}
+   // Ugly but much faster: we better ensure the table size is not a power of
+   // two, so we pad with +1 zero-weight dummy bucket when needed. PoT-sized
+   // alias tables hit GPU memory channel / cache set aliasing that can be
+   // wildly (sometimes 2x+) slower than a nearby non-PoT size. Builder owns
+   // all the sizing (resizes the output vectors, allocates its own scratch),
+   // so the caller can't get it wrong.
+   static uint32_t build(std::span<const T> weights, std::vector<T>& outProbability, std::vector<uint32_t>& outAlias, std::vector<T>& outPdf)
+   {
+      const uint32_t userN  = static_cast<uint32_t>(weights.size());
+      const uint32_t tableN = (userN > 1u && (userN & (userN - 1u)) == 0u) ? (userN + 1u) : userN;
+
+      outProbability.resize(tableN);
+      outAlias.resize(tableN);
+      outPdf.resize(tableN);
+      std::vector<uint32_t> workspace(tableN);
+
+      T totalWeight = T(0);
+      for (uint32_t i = 0; i < userN; i++)
+         totalWeight += weights[i];
+
+      const T rcpTotalWeight = T(1) / totalWeight;
+
+      // Compute PDFs, scaled probabilities, and partition into small/large in one pass
+      uint32_t smallEnd   = 0u;
+      uint32_t largeBegin = tableN;
+      for (uint32_t i = 0; i < userN; i++)
+      {
+         outPdf[i]         = weights[i] * rcpTotalWeight;
+         outProbability[i] = outPdf[i] * T(tableN);
+
+         if (outProbability[i] < T(1))
+            workspace[smallEnd++] = i;
+         else
+            workspace[--largeBegin] = i;
+      }
+      // PoT dodge tail: one zero-weight dummy at index userN, always in the small list.
+      if (tableN != userN)
+      {
+         outPdf[userN]         = T(0);
+         outProbability[userN] = T(0);
+         workspace[smallEnd++] = userN;
+      }
+
+      // Pair small and large entries
+      while (smallEnd > 0u && largeBegin < tableN)
+      {
+         const uint32_t s = workspace[--smallEnd];
+         const uint32_t l = workspace[largeBegin];
+
+         outAlias[s] = l;
+         // outProbability[s] already holds the correct probability for bin s
+
+         outProbability[l] -= (T(1) - outProbability[s]);
+
+         if (outProbability[l] < T(1))
+         {
+            // l became small: pop from large, push to small
+            largeBegin++;
+            workspace[smallEnd++] = l;
+         }
+         // else l stays in large (don't pop, reuse next iteration)
+      }
+
+      // Remaining entries (floating point rounding artifacts)
+      while (smallEnd > 0u)
+      {
+         const uint32_t s  = workspace[--smallEnd];
+         outProbability[s] = T(1);
+         outAlias[s]       = s;
+      }
+      while (largeBegin < tableN)
+      {
+         const uint32_t l  = workspace[largeBegin++];
+         outProbability[l] = T(1);
+         outAlias[l]       = l;
+      }
+
+      return tableN;
+   }
+
+   // Pack (target, stayProb) into a single 32-bit word with Log2N bits for
+   // target and (32 - Log2N) bits for the unorm-quantized threshold. Used by
+   // every packed variant; each packX() below calls this on a per-entry basis.
+   template<uint32_t Log2N>
+   static uint32_t packWord(uint32_t target, T stayProb)
+   {
+      const uint32_t targetMask = (Log2N == 32u) ? ~0u : ((1u << Log2N) - 1u);
+      const T        clamped    = stayProb < T(0) ? T(0) : (stayProb > T(1) ? T(1) : stayProb);
+      const uint32_t unormMax   = (Log2N == 0u) ? ~0u : ((~0u) >> Log2N);
+      const uint32_t probUnorm  = static_cast<uint32_t>(std::round(static_cast<double>(clamped) * static_cast<double>(unormMax)));
+      return (target & targetMask) | (probUnorm << Log2N);
+   }
+
+   // Variant A, pack SoA outputs into an array of 4 B packed words. The
+   // pdf[] array is consumed directly by the sampler as a second accessor.
+   // outWords must be pre-allocated to N uint32_t entries.
+   template<uint32_t Log2N>
+   static void packA(std::span<const T> probability, std::span<const uint32_t> alias, uint32_t* outWords)
+   {
+      const uint32_t N = static_cast<uint32_t>(probability.size());
+      for (uint32_t i = 0; i < N; i++)
+         outWords[i] = packWord<Log2N>(alias[i], probability[i]);
+   }
+
+   // Variant B, pack SoA outputs into 8 B entries { packedWord, ownPdf }.
+   // The pdf[] array is *also* passed to the sampler (same contents as ownPdf
+   // column, but tapped independently with a 4 B fetch when the sample aliases).
+   // outEntries must be pre-allocated to N entries.
+   template<uint32_t Log2N>
+   static void packB(std::span<const T> probability, std::span<const uint32_t> alias, std::span<const T> pdf,
+      PackedAliasEntryB<T>* outEntries)
+   {
+      const uint32_t N = static_cast<uint32_t>(probability.size());
+      for (uint32_t i = 0; i < N; i++)
+      {
+         outEntries[i].packedWord = packWord<Log2N>(alias[i], probability[i]);
+         outEntries[i].ownPdf     = pdf[i];
+      }
+   }
 };
 
 } // namespace sampling
diff --git a/include/nbl/builtin/hlsl/sampling/cumulative_probability_builder.h b/include/nbl/builtin/hlsl/sampling/cumulative_probability_builder.h
index a511fc2d8c..bf98d5ec93 100644
--- a/include/nbl/builtin/hlsl/sampling/cumulative_probability_builder.h
+++ b/include/nbl/builtin/hlsl/sampling/cumulative_probability_builder.h
@@ -30,6 +30,61 @@ void computeNormalizedCumulativeHistogram(std::span<const T> weights, T* outCumP
 	std::for_each(outCumProb, outCumProb + N - 1, [normalizationFactor](T& v) { v *= normalizationFactor; });
 }
 
+// Returns the next power of two >= x (and 1 for x <= 1). Matches the leaf-count
+// the Eytzinger builder pads to.
+inline uint32_t eytzingerLeafCount(uint32_t N)
+{
+	uint32_t P = 1u;
+	while (P < N) P <<= 1u;
+	return P;
+}
+
+// Builds an Eytzinger-layout CDF for cache-friendly binary search on the GPU.
+//
+// Layout (1-indexed, size 2*P where P = eytzingerLeafCount(N)):
+//   tree[0]           unused (keeps parent/child arithmetic branch-free)
+//   tree[1 .. P-1]    interior split keys; tree[v] == rightmost leaf of v's left subtree
+//   tree[P .. P+N-1]  leaves, tree[P + i] = normalized inclusive scan of weights up to i
+//   tree[P+N .. 2P-1] padded leaves, all 1.0 (any u < 1.0 routes away from these)
+//
+// The sampler walks the tree as index = (index << 1) | goRight for ceil(log2(N))
+// iterations. Successive taps within one search land on adjacent memory, so every
+// cache line pulled is fully used and the first log2(subgroupSize) iterations are
+// served by a single memory transaction per subgroup.
+template<typename T>
+void buildEytzinger(std::span<const T> weights, T* outTree)
+{
+	const uint32_t N = static_cast<uint32_t>(weights.size());
+	if (N == 0)
+		return;
+
+	const uint32_t P = eytzingerLeafCount(N);
+
+	T total = T(0);
+	for (uint32_t i = 0; i < N; ++i)
+		total += weights[i];
+	const T rcpTotal = T(1) / total;
+
+	T acc = T(0);
+	for (uint32_t i = 0; i < N; ++i)
+	{
+		acc += weights[i];
+		outTree[P + i] = acc * rcpTotal;
+	}
+	for (uint32_t i = N; i < P; ++i)
+		outTree[P + i] = T(1);
+
+	// Bottom-up: each interior node copies the rightmost leaf of its left subtree,
+	// found by descending left-then-always-right from v.
+	for (uint32_t v = P; v-- > 1u;)
+	{
+		uint32_t r = v << 1u;
+		while (r < P)
+			r = (r << 1u) | 1u;
+		outTree[v] = outTree[r];
+	}
+}
+
 } // namespace sampling
 } // namespace hlsl
 } // namespace nbl

From c12a724ef5c0dfd2c0968404d21a7acb8a584f7e Mon Sep 17 00:00:00 2001
From: Karim Mohamed <karimsayedre@gmail.com>
Date: Fri, 24 Apr 2026 21:28:50 +0300
Subject: [PATCH 24/26] forgotten to commit file

---
 .../hlsl/sampling/cumulative_probability.hlsl | 189 +++++++++++++-----
 1 file changed, 143 insertions(+), 46 deletions(-)

diff --git a/include/nbl/builtin/hlsl/sampling/cumulative_probability.hlsl b/include/nbl/builtin/hlsl/sampling/cumulative_probability.hlsl
index 1f176f9713..f9acba367a 100644
--- a/include/nbl/builtin/hlsl/sampling/cumulative_probability.hlsl
+++ b/include/nbl/builtin/hlsl/sampling/cumulative_probability.hlsl
@@ -16,18 +16,38 @@ namespace hlsl
 namespace sampling
 {
 
-// Discrete sampler using cumulative probability lookup via upper_bound.
+// Discrete sampler using cumulative probability lookup.
 //
 // Samples a discrete index in [0, N) with probability proportional to
 // precomputed weights in O(log N) time per sample.
 //
-// The cumulative probability array stores N-1 entries (the last bucket
-// is always 1.0 and need not be stored). Entry i holds the sum of
-// probabilities for indices [0, i].
+// Three layouts / cache-population strategies, selected by the Mode parameter:
+//
+//   TRACKING (default):  N-1 CDF entries, last bucket implicit at 1.0.
+//                        A stateful comparator records the straddling CDF
+//                        values during upper_bound itself.
+//   YOLO:                Same storage. Plain upper_bound followed by two
+//                        re-reads of the adjacent CDF entries (warm cache).
+//                        Lower register footprint, two extra array reads.
+//   EYTZINGER:           Level-order implicit binary tree in 2*P entries
+//                        where P = roundUpPot(N). Leaves at [P, P+N) hold
+//                        the CDF; interior nodes at [1, P) hold split keys.
+//                        Descent reads adjacent memory at each step, so
+//                        every cache line pulled is fully utilised and the
+//                        first log2(subgroupSize) iterations are served by a
+//                        single transaction per subgroup. Build with
+//                        sampling::buildEytzinger<T>().
 //
 // Satisfies TractableSampler and ResamplableSampler (not BackwardTractableSampler:
 // the mapping is discrete).
-template<typename T, typename Domain, typename Codomain, typename CumProbAccessor
+enum CumulativeProbabilityMode : uint32_t
+{
+	TRACKING  = 0u,
+	YOLO      = 1u,
+	EYTZINGER = 2u
+};
+
+template<typename T, typename Domain, typename Codomain, typename CumProbAccessor, CumulativeProbabilityMode Mode = CumulativeProbabilityMode::TRACKING
 	NBL_PRIMARY_REQUIRES(concepts::accessors::GenericReadAccessor<CumProbAccessor, T, Codomain>)
 struct CumulativeProbabilitySampler
 {
@@ -44,58 +64,116 @@ struct CumulativeProbabilitySampler
 		density_type upperBound;
 	};
 
+	// `_size` is the user-facing bucket count N for every mode. TRACKING / YOLO
+	// expect the accessor to hold N-1 CDF entries; EYTZINGER expects 2*P entries
+	// in the level-order layout produced by buildEytzinger.
 	static CumulativeProbabilitySampler create(NBL_CONST_REF_ARG(CumProbAccessor) _cumProbAccessor, uint32_t _size)
 	{
 		CumulativeProbabilitySampler retval;
 		retval.cumProbAccessor = _cumProbAccessor;
 		retval.storedCount = _size - 1u;
+		retval.depth = 0u;
+		NBL_IF_CONSTEXPR(Mode == CumulativeProbabilityMode::EYTZINGER)
+		{
+			uint32_t P = 1u;
+			uint32_t d = 0u;
+			while (P < _size) { P <<= 1u; ++d; }
+			retval.depth = d;
+		}
 		return retval;
 	}
 
 	// BasicSampler interface
 	codomain_type generate(const domain_type u) NBL_CONST_MEMBER_FUNC
 	{
-		// upper_bound returns first index where cumProb > u
-		return hlsl::upper_bound(cumProbAccessor, 0u, storedCount, u);
+		NBL_IF_CONSTEXPR(Mode == CumulativeProbabilityMode::EYTZINGER)
+		{
+			const uint32_t leafBase = 1u << depth;
+			uint32_t index = 1u;
+			for (uint32_t iter = 0u; iter < depth; ++iter)
+			{
+				density_type key;
+				cumProbAccessor.template get<density_type, uint32_t>(index, key);
+				index = (index << 1u) | uint32_t(!(u < key));
+			}
+			const codomain_type result = codomain_type(index - leafBase);
+			return result < codomain_type(storedCount) ? result : codomain_type(storedCount);
+		}
+		else
+		{
+			return hlsl::upper_bound(cumProbAccessor, 0u, storedCount, u);
+		}
 	}
 
 	// TractableSampler interface
 	codomain_type generate(const domain_type u, NBL_REF_ARG(cache_type) cache) NBL_CONST_MEMBER_FUNC
 	{
-		// #define NBL_CUMPROB_YOLO_READS
-#ifdef NBL_CUMPROB_YOLO_READS
-		// YOLO approach: re-read the array after binary search.
-		// The accessed elements are adjacent to the found index so the cache is warm.
-		const codomain_type result = hlsl::upper_bound(cumProbAccessor, 0u, storedCount, u);
-		cache.oneBefore = density_type(0.0);
-		if (result)
-			cumProbAccessor.template get<density_type, codomain_type>(result - 1u, cache.oneBefore);
-		cache.upperBound = density_type(1.0);
-		if (result < storedCount)
-			cumProbAccessor.template get<density_type, codomain_type>(result, cache.upperBound);
-#else
-		// Tracking reads approach: stateful comparator captures CDF values during binary search.
-		struct CdfComparator
+		codomain_type result;
+		NBL_IF_CONSTEXPR(Mode == CumulativeProbabilityMode::EYTZINGER)
 		{
-			bool operator()(const density_type value, const density_type rhs)
+			// Descent visits one interior node per level. Going left tightens
+			// the upper bound to the current key; going right tightens the
+			// lower bound. Final index, leafBase is the bucket.
+			cache.oneBefore = density_type(0.0);
+			cache.upperBound = density_type(1.0);
+			const uint32_t leafBase = 1u << depth;
+			uint32_t index = 1u;
+			for (uint32_t iter = 0u; iter < depth; ++iter)
 			{
-				const bool retval = value < rhs;
-				if (retval)
-					upperBound = rhs;
+				density_type key;
+				cumProbAccessor.template get<density_type, uint32_t>(index, key);
+				const bool goRight = !(u < key);
+				if (goRight)
+				{
+					cache.oneBefore = key;
+					index = (index << 1u) | 1u;
+				}
 				else
-					oneBefore = rhs;
-				return retval;
+				{
+					cache.upperBound = key;
+					index = (index << 1u);
+				}
 			}
-
-			density_type oneBefore;
-			density_type upperBound;
-		} comp;
-		comp.oneBefore = density_type(0.0);
-		comp.upperBound = density_type(1.0);
-		const codomain_type result = hlsl::upper_bound(cumProbAccessor, 0u, storedCount, u, comp);
-		cache.oneBefore = comp.oneBefore;
-		cache.upperBound = comp.upperBound;
-#endif
+			const codomain_type raw = codomain_type(index - leafBase);
+			result = raw < codomain_type(storedCount) ? raw : codomain_type(storedCount);
+		}
+		else NBL_IF_CONSTEXPR(Mode == CumulativeProbabilityMode::YOLO)
+		{
+			// Re-read the two adjacent CDF entries after the binary search.
+			// Both sit on the cache lines the search just touched, so they are warm.
+			result = hlsl::upper_bound(cumProbAccessor, 0u, storedCount, u);
+			cache.oneBefore = density_type(0.0);
+			if (result)
+				cumProbAccessor.template get<density_type, codomain_type>(result - 1u, cache.oneBefore);
+			cache.upperBound = density_type(1.0);
+			if (result < storedCount)
+				cumProbAccessor.template get<density_type, codomain_type>(result, cache.upperBound);
+		}
+		else
+		{
+			// TRACKING: stateful comparator captures the CDF values straddling the
+			// found index during the binary search itself, avoiding the two extra reads.
+			struct CdfComparator
+			{
+				bool operator()(const density_type value, const density_type rhs)
+				{
+					const bool retval = value < rhs;
+					if (retval)
+						upperBound = rhs;
+					else
+						oneBefore = rhs;
+					return retval;
+				}
+
+				density_type oneBefore;
+				density_type upperBound;
+			} comp;
+			comp.oneBefore = density_type(0.0);
+			comp.upperBound = density_type(1.0);
+			result = hlsl::upper_bound(cumProbAccessor, 0u, storedCount, u, comp);
+			cache.oneBefore = comp.oneBefore;
+			cache.upperBound = comp.upperBound;
+		}
 		return result;
 	}
 
@@ -111,16 +189,34 @@ struct CumulativeProbabilitySampler
 
 	density_type backwardPdf(const codomain_type v) NBL_CONST_MEMBER_FUNC
 	{
-		density_type retval = density_type(1.0);
-		if (v < storedCount)
-			cumProbAccessor.template get<density_type, codomain_type>(v, retval);
-		if (v)
+		NBL_IF_CONSTEXPR(Mode == CumulativeProbabilityMode::EYTZINGER)
 		{
-			density_type prev;
-			cumProbAccessor.template get<density_type, codomain_type>(v - 1u, prev);
-			retval -= prev;
+			// Leaves store the CDF directly; the last real leaf is normalized
+			// to 1.0 and padded leaves (if any) also hold 1.0.
+			const uint32_t leafBase = 1u << depth;
+			density_type retval;
+			cumProbAccessor.template get<density_type, uint32_t>(leafBase + uint32_t(v), retval);
+			if (v)
+			{
+				density_type prev;
+				cumProbAccessor.template get<density_type, uint32_t>(leafBase + uint32_t(v) - 1u, prev);
+				retval -= prev;
+			}
+			return retval;
+		}
+		else
+		{
+			density_type retval = density_type(1.0);
+			if (v < storedCount)
+				cumProbAccessor.template get<density_type, codomain_type>(v, retval);
+			if (v)
+			{
+				density_type prev;
+				cumProbAccessor.template get<density_type, codomain_type>(v - 1u, prev);
+				retval -= prev;
+			}
+			return retval;
 		}
-		return retval;
 	}
 
 	weight_type backwardWeight(const codomain_type v) NBL_CONST_MEMBER_FUNC
@@ -129,7 +225,8 @@ struct CumulativeProbabilitySampler
 	}
 
 	CumProbAccessor cumProbAccessor;
-	uint32_t storedCount;
+	uint32_t storedCount;    // N - 1 (last real bucket index)
+	uint32_t depth;          // EYTZINGER only: ceil(log2(N)), iteration count; leafBase = 1 << depth
 };
 
 } // namespace sampling

From fc3cec4ba1a978b9522dea296498abd4e754a3ba Mon Sep 17 00:00:00 2001
From: Karim Mohamed <karimsayedre@gmail.com>
Date: Mon, 27 Apr 2026 16:34:17 +0300
Subject: [PATCH 25/26] replaced all sampler literal floats casting with
 `_static_cast<>()`

---
 .../nbl/builtin/hlsl/sampling/bilinear.hlsl   |  6 +--
 .../hlsl/sampling/box_muller_transform.hlsl   |  6 +--
 .../hlsl/sampling/concentric_mapping.hlsl     | 10 ++---
 .../hlsl/sampling/cos_weighted_spheres.hlsl   | 14 +++----
 .../hlsl/sampling/cumulative_probability.hlsl | 14 +++----
 include/nbl/builtin/hlsl/sampling/linear.hlsl |  6 +--
 .../builtin/hlsl/sampling/polar_mapping.hlsl  |  6 +--
 .../projected_spherical_rectangle.hlsl        |  4 +-
 .../hlsl/sampling/spherical_rectangle.hlsl    | 18 ++++----
 .../hlsl/sampling/spherical_triangle.hlsl     | 42 +++++++++----------
 .../hlsl/sampling/uniform_spheres.hlsl        | 30 ++++++-------
 11 files changed, 78 insertions(+), 78 deletions(-)

diff --git a/include/nbl/builtin/hlsl/sampling/bilinear.hlsl b/include/nbl/builtin/hlsl/sampling/bilinear.hlsl
index baf1b5427e..b56073074b 100644
--- a/include/nbl/builtin/hlsl/sampling/bilinear.hlsl
+++ b/include/nbl/builtin/hlsl/sampling/bilinear.hlsl
@@ -45,10 +45,10 @@ struct Bilinear
         vector2_type twiceAreasUnderXCurve = vector2_type(bilinearCoeffs[0] + bilinearCoeffs[1], bilinearCoeffs[2] + bilinearCoeffs[3]);
         // Linear::create adds FLT_MIN internally, replicate here so both divisions share
         // the same denominator (sum + 2*min), enabling CSE to merge them into one division
-        const scalar_type safeSum = twiceAreasUnderXCurve[0] + twiceAreasUnderXCurve[1] + scalar_type(2.0) * hlsl::numeric_limits<scalar_type>::min;
-        const scalar_type yNormFactor = scalar_type(2.0) / safeSum;
+        const scalar_type safeSum = twiceAreasUnderXCurve[0] + twiceAreasUnderXCurve[1] + _static_cast<scalar_type>(2.0) * hlsl::numeric_limits<scalar_type>::min;
+        const scalar_type yNormFactor = _static_cast<scalar_type>(2.0) / safeSum;
         retval.lineary = Linear<scalar_type>::create(twiceAreasUnderXCurve);
-        retval.normFactor = yNormFactor * scalar_type(2.0);
+        retval.normFactor = yNormFactor * _static_cast<scalar_type>(2.0);
         return retval;
     }
 
diff --git a/include/nbl/builtin/hlsl/sampling/box_muller_transform.hlsl b/include/nbl/builtin/hlsl/sampling/box_muller_transform.hlsl
index b9efd1ec35..9ae0c83c05 100644
--- a/include/nbl/builtin/hlsl/sampling/box_muller_transform.hlsl
+++ b/include/nbl/builtin/hlsl/sampling/box_muller_transform.hlsl
@@ -37,16 +37,16 @@ struct BoxMullerTransform
     {
         BoxMullerTransform<T> retval;
         retval.stddev = _stddev;
-        retval.halfRcpStddev2 = scalar_type(0.5) / (_stddev * _stddev);
+        retval.halfRcpStddev2 = _static_cast<scalar_type>(0.5) / (_stddev * _stddev);
         return retval;
     }
 
     codomain_type generate(const domain_type u, NBL_REF_ARG(cache_type) cache) NBL_CONST_MEMBER_FUNC
     {
         scalar_type sinPhi, cosPhi;
-        math::sincos<scalar_type>(scalar_type(2.0) * numbers::pi<scalar_type> * u.y - numbers::pi<scalar_type>, sinPhi, cosPhi);
+        math::sincos<scalar_type>(_static_cast<scalar_type>(2.0) * numbers::pi<scalar_type> * u.y - numbers::pi<scalar_type>, sinPhi, cosPhi);
         cache.direction = vector2_type(cosPhi, sinPhi);
-        return cache.direction * nbl::hlsl::sqrt(scalar_type(-2.0) * nbl::hlsl::log(u.x)) * stddev;
+        return cache.direction * nbl::hlsl::sqrt(_static_cast<scalar_type>(-2.0) * nbl::hlsl::log(u.x)) * stddev;
     }
 
     density_type forwardPdf(const domain_type u, const cache_type cache) NBL_CONST_MEMBER_FUNC
diff --git a/include/nbl/builtin/hlsl/sampling/concentric_mapping.hlsl b/include/nbl/builtin/hlsl/sampling/concentric_mapping.hlsl
index c9e5cac5d6..d31c2d994a 100644
--- a/include/nbl/builtin/hlsl/sampling/concentric_mapping.hlsl
+++ b/include/nbl/builtin/hlsl/sampling/concentric_mapping.hlsl
@@ -41,7 +41,7 @@ struct ConcentricMapping
    static codomain_type generate(const domain_type u, NBL_REF_ARG(cache_type) cache)
    {
       // map [0,1]^2 to [-1,1]^2
-      const vector2_type centered = scalar_type(2) * u - hlsl::promote<vector2_type>(scalar_type(1));
+      const vector2_type centered = _static_cast<scalar_type>(2) * u - hlsl::promote<vector2_type>(_static_cast<scalar_type>(1));
 
       const scalar_type a = centered.x;
       const scalar_type b = centered.y;
@@ -51,10 +51,10 @@ struct ConcentricMapping
       const scalar_type dominant = hlsl::select(cond, a, b);
       const scalar_type minor = hlsl::select(cond, b, a);
 
-      const scalar_type safe_dominant = dominant != scalar_type(0) ? dominant : scalar_type(0);
+      const scalar_type safe_dominant = dominant != _static_cast<scalar_type>(0) ? dominant : _static_cast<scalar_type>(0);
       const scalar_type ratio = minor / safe_dominant;
 
-      const scalar_type angle = scalar_type(0.25) * numbers::pi<scalar_type> * ratio;
+      const scalar_type angle = _static_cast<scalar_type>(0.25) * numbers::pi<scalar_type> * ratio;
       const scalar_type c = hlsl::cos<scalar_type>(angle);
       const scalar_type s = hlsl::sin<scalar_type>(angle);
 
@@ -90,7 +90,7 @@ struct ConcentricMapping
       // angle in [0, pi/4]
       const scalar_type phi = hlsl::atan2(num, denom);
 
-      const scalar_type minor_val = r * phi / (scalar_type(0.25) * numbers::pi<scalar_type>);
+      const scalar_type minor_val = r * phi / (_static_cast<scalar_type>(0.25) * numbers::pi<scalar_type>);
 
       // reconstruct a,b using select instead of branching
       const scalar_type a_base = hlsl::select(swapped, minor_val, r);
@@ -99,7 +99,7 @@ struct ConcentricMapping
       const scalar_type a = ieee754::copySign(a_base, p.x);
       const scalar_type b = ieee754::copySign(b_base, p.y);
 
-      return (vector2_type(a, b) + hlsl::promote<vector2_type>(scalar_type(1))) * scalar_type(0.5);
+      return (vector2_type(a, b) + hlsl::promote<vector2_type>(_static_cast<scalar_type>(1))) * _static_cast<scalar_type>(0.5);
    }
 
    // The PDF of Shirley mapping is constant (1/PI on the unit disk)
diff --git a/include/nbl/builtin/hlsl/sampling/cos_weighted_spheres.hlsl b/include/nbl/builtin/hlsl/sampling/cos_weighted_spheres.hlsl
index 2305cceec1..52b063f448 100644
--- a/include/nbl/builtin/hlsl/sampling/cos_weighted_spheres.hlsl
+++ b/include/nbl/builtin/hlsl/sampling/cos_weighted_spheres.hlsl
@@ -35,8 +35,8 @@ struct ProjectedHemisphere
 
 	static codomain_type __generate(const domain_type u)
 	{
-		vector_t2 p = ConcentricMapping<T>::generate(u * T(0.99999) + T(0.000005));
-		T z = hlsl::sqrt<T>(hlsl::max<T>(T(0.0), T(1.0) - p.x * p.x - p.y * p.y));
+		vector_t2 p = ConcentricMapping<T>::generate(u * _static_cast<T>(0.99999) + _static_cast<T>(0.000005));
+		T z = hlsl::sqrt<T>(hlsl::max<T>(_static_cast<T>(0.0), _static_cast<T>(1.0) - p.x * p.x - p.y * p.y));
 		return vector_t3(p.x, p.y, z);
 	}
 
@@ -93,11 +93,11 @@ struct ProjectedSphere
 	static codomain_type __generate(NBL_REF_ARG(domain_type) u)
 	{
 		vector_t3 retval = hemisphere_t::__generate(u.xy);
-		const bool chooseLower = u.z > scalar_type(0.5);
+		const bool chooseLower = u.z > _static_cast<scalar_type>(0.5);
 		retval.z = chooseLower ? (-retval.z) : retval.z;
 		if (chooseLower)
-			u.z -= T(0.5);
-		u.z *= T(2.0);
+			u.z -= _static_cast<T>(0.5);
+		u.z *= _static_cast<T>(2.0);
 		return retval;
 	}
 
@@ -110,7 +110,7 @@ struct ProjectedSphere
 
 	static density_type forwardPdf(const domain_type u, const cache_type cache)
 	{
-		return T(0.5) * cache.z * numbers::inv_pi<T>;
+		return _static_cast<T>(0.5) * cache.z * numbers::inv_pi<T>;
 	}
 
 	static weight_type forwardWeight(const domain_type u, const cache_type cache)
@@ -120,7 +120,7 @@ struct ProjectedSphere
 
 	static density_type backwardPdf(const codomain_type L)
 	{
-		return T(0.5) * hlsl::abs(L.z) * numbers::inv_pi<T>;
+		return _static_cast<T>(0.5) * hlsl::abs(L.z) * numbers::inv_pi<T>;
 	}
 
 	static weight_type backwardWeight(const codomain_type L)
diff --git a/include/nbl/builtin/hlsl/sampling/cumulative_probability.hlsl b/include/nbl/builtin/hlsl/sampling/cumulative_probability.hlsl
index f9acba367a..d3657babb1 100644
--- a/include/nbl/builtin/hlsl/sampling/cumulative_probability.hlsl
+++ b/include/nbl/builtin/hlsl/sampling/cumulative_probability.hlsl
@@ -114,8 +114,8 @@ struct CumulativeProbabilitySampler
 			// Descent visits one interior node per level. Going left tightens
 			// the upper bound to the current key; going right tightens the
 			// lower bound. Final index, leafBase is the bucket.
-			cache.oneBefore = density_type(0.0);
-			cache.upperBound = density_type(1.0);
+			cache.oneBefore = _static_cast<density_type>(0.0);
+			cache.upperBound = _static_cast<density_type>(1.0);
 			const uint32_t leafBase = 1u << depth;
 			uint32_t index = 1u;
 			for (uint32_t iter = 0u; iter < depth; ++iter)
@@ -142,10 +142,10 @@ struct CumulativeProbabilitySampler
 			// Re-read the two adjacent CDF entries after the binary search.
 			// Both sit on the cache lines the search just touched, so they are warm.
 			result = hlsl::upper_bound(cumProbAccessor, 0u, storedCount, u);
-			cache.oneBefore = density_type(0.0);
+			cache.oneBefore = _static_cast<density_type>(0.0);
 			if (result)
 				cumProbAccessor.template get<density_type, codomain_type>(result - 1u, cache.oneBefore);
-			cache.upperBound = density_type(1.0);
+			cache.upperBound = _static_cast<density_type>(1.0);
 			if (result < storedCount)
 				cumProbAccessor.template get<density_type, codomain_type>(result, cache.upperBound);
 		}
@@ -168,8 +168,8 @@ struct CumulativeProbabilitySampler
 				density_type oneBefore;
 				density_type upperBound;
 			} comp;
-			comp.oneBefore = density_type(0.0);
-			comp.upperBound = density_type(1.0);
+			comp.oneBefore = _static_cast<density_type>(0.0);
+			comp.upperBound = _static_cast<density_type>(1.0);
 			result = hlsl::upper_bound(cumProbAccessor, 0u, storedCount, u, comp);
 			cache.oneBefore = comp.oneBefore;
 			cache.upperBound = comp.upperBound;
@@ -206,7 +206,7 @@ struct CumulativeProbabilitySampler
 		}
 		else
 		{
-			density_type retval = density_type(1.0);
+			density_type retval = _static_cast<density_type>(1.0);
 			if (v < storedCount)
 				cumProbAccessor.template get<density_type, codomain_type>(v, retval);
 			if (v)
diff --git a/include/nbl/builtin/hlsl/sampling/linear.hlsl b/include/nbl/builtin/hlsl/sampling/linear.hlsl
index 8ebf2e38a0..cd260cea3a 100644
--- a/include/nbl/builtin/hlsl/sampling/linear.hlsl
+++ b/include/nbl/builtin/hlsl/sampling/linear.hlsl
@@ -40,13 +40,13 @@ struct Linear
         // instead of inf normalization (2/0) leading to NaN; negligible for normal inputs
         const vector2_type safeCoeffs = linearCoeffs + vector2_type(hlsl::numeric_limits<scalar_type>::min, hlsl::numeric_limits<scalar_type>::min);
         // normalize coefficients so that the PDF is simply normalizedCoeffStart + linearCoeffDiff * x
-        const scalar_type normFactor = scalar_type(2.0) / (safeCoeffs[0] + safeCoeffs[1]);
+        const scalar_type normFactor = _static_cast<scalar_type>(2.0) / (safeCoeffs[0] + safeCoeffs[1]);
         const vector2_type normalized = safeCoeffs * normFactor;
         retval.normalizedCoeffStart = normalized[0];
         retval.normalizedCoeffEnd = normalized[1];
         // precompute for the stable quadratic in generate()
         retval.squaredCoeffStart = normalized[0] * normalized[0];
-        retval.twoTimesDiff = scalar_type(2.0) * (normalized[1] - normalized[0]);
+        retval.twoTimesDiff = _static_cast<scalar_type>(2.0) * (normalized[1] - normalized[0]);
         return retval;
     }
 
@@ -82,7 +82,7 @@ struct Linear
     // Not used because we already store start for generate().
     density_type backwardPdf(const codomain_type x) NBL_CONST_MEMBER_FUNC
     {
-        assert(x >= scalar_type(0.0) && x <= scalar_type(1.0));
+        assert(x >= _static_cast<scalar_type>(0.0) && x <= _static_cast<scalar_type>(1.0));
         return hlsl::mix(normalizedCoeffStart, normalizedCoeffEnd, x);
     }
 
diff --git a/include/nbl/builtin/hlsl/sampling/polar_mapping.hlsl b/include/nbl/builtin/hlsl/sampling/polar_mapping.hlsl
index 719eaf504f..64d5e69b96 100644
--- a/include/nbl/builtin/hlsl/sampling/polar_mapping.hlsl
+++ b/include/nbl/builtin/hlsl/sampling/polar_mapping.hlsl
@@ -32,7 +32,7 @@ struct PolarMapping
 	static codomain_type generate(const domain_type u, NBL_REF_ARG(cache_type) cache)
 	{
 		const scalar_type r = hlsl::sqrt<scalar_type>(u.x);
-		const scalar_type phi = scalar_type(2) * numbers::pi<scalar_type> * u.y;
+		const scalar_type phi = _static_cast<scalar_type>(2) * numbers::pi<scalar_type> * u.y;
 		return vector2_type(r * hlsl::cos<scalar_type>(phi), r * hlsl::sin<scalar_type>(phi));
 	}
 
@@ -46,8 +46,8 @@ struct PolarMapping
 	{
 		const scalar_type r2 = p.x * p.x + p.y * p.y;
 		scalar_type phi = hlsl::atan2(p.y, p.x);
-		phi += hlsl::mix(scalar_type(0), scalar_type(2) * numbers::pi<scalar_type>, phi < scalar_type(0));
-		return vector2_type(r2, phi * (scalar_type(0.5) * numbers::inv_pi<scalar_type>));
+		phi += hlsl::mix(_static_cast<scalar_type>(0), _static_cast<scalar_type>(2) * numbers::pi<scalar_type>, phi < _static_cast<scalar_type>(0));
+		return vector2_type(r2, phi * (_static_cast<scalar_type>(0.5) * numbers::inv_pi<scalar_type>));
 	}
 
 	static density_type forwardPdf(const domain_type u, cache_type cache) { return numbers::inv_pi<scalar_type>; }
diff --git a/include/nbl/builtin/hlsl/sampling/projected_spherical_rectangle.hlsl b/include/nbl/builtin/hlsl/sampling/projected_spherical_rectangle.hlsl
index d40e58c5db..13cc13d30c 100644
--- a/include/nbl/builtin/hlsl/sampling/projected_spherical_rectangle.hlsl
+++ b/include/nbl/builtin/hlsl/sampling/projected_spherical_rectangle.hlsl
@@ -79,7 +79,7 @@ struct ProjectedSphericalRectangle
          hlsl::dot(hlsl::normalize(c1), _receiverNormal),
          hlsl::dot(hlsl::normalize(c2), _receiverNormal),
          hlsl::dot(hlsl::normalize(c3), _receiverNormal));
-      const scalar_type minimumProjSolidAngle = scalar_type(0.0);
+      const scalar_type minimumProjSolidAngle = _static_cast<scalar_type>(0.0);
       const vector4_type bxdfPdfAtVertex      = math::conditionalAbsOrMax(_receiverWasBSDF, dots, hlsl::promote<vector4_type>(minimumProjSolidAngle));
 
       const shapes::SphericalRectangle<T> shape                         = shapes::SphericalRectangle<T>::create(compressed);
@@ -113,7 +113,7 @@ struct ProjectedSphericalRectangle
          hlsl::promote<vector4_type>(r0zSq);
 
       // dot(normalize(corner), n) = dot(corner, n) * rsqrt(lenSq). Bilinear corners: [0]=v00 [1]=v10 [2]=v01 [3]=v11
-      const scalar_type minimumProjSolidAngle = scalar_type(0.0);
+      const scalar_type minimumProjSolidAngle = _static_cast<scalar_type>(0.0);
       const vector4_type bxdfPdfAtVertex      = math::conditionalAbsOrMax(_receiverWasBSDF,
          dots * hlsl::rsqrt<vector4_type>(lenSq),
          hlsl::promote<vector4_type>(minimumProjSolidAngle));
diff --git a/include/nbl/builtin/hlsl/sampling/spherical_rectangle.hlsl b/include/nbl/builtin/hlsl/sampling/spherical_rectangle.hlsl
index 6873362b1a..ba088a0e29 100644
--- a/include/nbl/builtin/hlsl/sampling/spherical_rectangle.hlsl
+++ b/include/nbl/builtin/hlsl/sampling/spherical_rectangle.hlsl
@@ -58,7 +58,7 @@ struct SphericalRectangle
 
       math::sincos_accumulator<scalar_type> angle_adder = math::sincos_accumulator<scalar_type>::create(sa.cosGamma[2]);
       angle_adder.addCosine(sa.cosGamma[3]);
-      retval.k = scalar_type(2.0) * numbers::pi<scalar_type> - angle_adder.getSumOfArccos();
+      retval.k = _static_cast<scalar_type>(2.0) * numbers::pi<scalar_type> - angle_adder.getSumOfArccos();
 
       return retval;
    }
@@ -84,7 +84,7 @@ struct SphericalRectangle
       acc.addCosine(sa.cosGamma[1]);
       acc.addCosine(sa.cosGamma[2]);
       acc.addCosine(sa.cosGamma[3]);
-      sa.value = acc.getSumOfArccos() - scalar_type(2.0) * numbers::pi<scalar_type>;
+      sa.value = acc.getSumOfArccos() - _static_cast<scalar_type>(2.0) * numbers::pi<scalar_type>;
 
       return create(_basis, sa, _extents);
    }
@@ -114,8 +114,8 @@ struct SphericalRectangle
       NBL_IF_CONSTEXPR(!UseRealSinCos)
       {
          // au in [0, 4*pi] -> peel at most two 2*pi periods to land in (-pi, pi].
-         au = hlsl::select(au > numbers::pi<scalar_type>, au - scalar_type(2.0) * numbers::pi<scalar_type>, au);
-         au = hlsl::select(au > numbers::pi<scalar_type>, au - scalar_type(2.0) * numbers::pi<scalar_type>, au);
+         au = hlsl::select(au > numbers::pi<scalar_type>, au - _static_cast<scalar_type>(2.0) * numbers::pi<scalar_type>, au);
+         au = hlsl::select(au > numbers::pi<scalar_type>, au - _static_cast<scalar_type>(2.0) * numbers::pi<scalar_type>, au);
       }
       scalar_type sin_au, cos_au;
       math::sincos<scalar_type, UseRealSinCos>(au, sin_au, cos_au);
@@ -123,15 +123,15 @@ struct SphericalRectangle
       // negFu carries the sign directly (numerator and sin_au both signed), so xu's sign drops
       // out of a single multiply + hlsl::sign.
       const scalar_type negFu   = numerator / sin_au;
-      const scalar_type rcpCu_2 = hlsl::max<scalar_type>(negFu * negFu + b0 * b0, scalar_type(1.0));
-      retval.xu                 = negAbsR0z * hlsl::sign(negFu) * hlsl::rsqrt<scalar_type>(rcpCu_2 - scalar_type(1.0));
+      const scalar_type rcpCu_2 = hlsl::max<scalar_type>(negFu * negFu + b0 * b0, _static_cast<scalar_type>(1.0));
+      retval.xu                 = negAbsR0z * hlsl::sign(negFu) * hlsl::rsqrt<scalar_type>(rcpCu_2 - _static_cast<scalar_type>(1.0));
       retval.xu = hlsl::clamp<scalar_type>(retval.xu, r0.x, r1.x); // avoid Infs
       retval.d2 = retval.xu * retval.xu + r0zSq;
 
       const scalar_type h0 = r0.y * hlsl::rsqrt<scalar_type>(retval.d2 + r0.y * r0.y);
       const scalar_type h1 = r1.y * hlsl::rsqrt<scalar_type>(retval.d2 + r1.y * r1.y);
       retval.hv            = h0 + u.y * (h1 - h0);
-      retval.cosElevation2 = scalar_type(1.0) - hlsl::min<scalar_type>(retval.hv * retval.hv, 1);
+      retval.cosElevation2 = _static_cast<scalar_type>(1.0) - hlsl::min<scalar_type>(retval.hv * retval.hv, 1);
 
       return retval;
    }
@@ -200,7 +200,7 @@ struct SphericalRectangle
 
    density_type forwardPdf(const domain_type u, const cache_type cache) NBL_CONST_MEMBER_FUNC
    {
-      return scalar_type(1.0) / solidAngle;
+      return _static_cast<scalar_type>(1.0) / solidAngle;
    }
 
    weight_type forwardWeight(const domain_type u, const cache_type cache) NBL_CONST_MEMBER_FUNC
@@ -210,7 +210,7 @@ struct SphericalRectangle
 
    density_type backwardPdf(const codomain_type L) NBL_CONST_MEMBER_FUNC
    {
-      return scalar_type(1.0) / solidAngle;
+      return _static_cast<scalar_type>(1.0) / solidAngle;
    }
 
    weight_type backwardWeight(const codomain_type L) NBL_CONST_MEMBER_FUNC
diff --git a/include/nbl/builtin/hlsl/sampling/spherical_triangle.hlsl b/include/nbl/builtin/hlsl/sampling/spherical_triangle.hlsl
index 59dea3b6ea..b915a55de7 100644
--- a/include/nbl/builtin/hlsl/sampling/spherical_triangle.hlsl
+++ b/include/nbl/builtin/hlsl/sampling/spherical_triangle.hlsl
@@ -70,14 +70,14 @@ struct SphericalTriangle
    static SphericalTriangle create(NBL_CONST_REF_ARG(shapes::SphericalTriangle<T>) tri)
    {
       SphericalTriangle retval;
-      retval.rcpSolidAngle = scalar_type(1.0) / tri.solid_angle;
+      retval.rcpSolidAngle = _static_cast<scalar_type>(1.0) / tri.solid_angle;
       retval.tri_vertices[0] = tri.vertices[0];
       retval.tri_vertices[1] = tri.vertices[1];
       retval.triCosc = tri.cos_sides[2];
       // precompute great circle normal of arc AC: cross(A,C) has magnitude sin(b),
       // so multiplying by csc(b) normalizes it; zero when side AC is degenerate
       const scalar_type cscb = tri.csc_sides[1];
-      const vector3_type arcACPlaneNormal = hlsl::cross(tri.vertices[0], tri.vertices[2]) * hlsl::select(cscb < numeric_limits<scalar_type>::max, cscb, scalar_type(0));
+      const vector3_type arcACPlaneNormal = hlsl::cross(tri.vertices[0], tri.vertices[2]) * hlsl::select(cscb < numeric_limits<scalar_type>::max, cscb, _static_cast<scalar_type>(0));
       retval.e_C = hlsl::cross(arcACPlaneNormal, tri.vertices[0]);
       retval.cosA = tri.cos_vertices[0];
       retval.sinA = tri.sin_vertices[0];
@@ -117,14 +117,14 @@ struct SphericalTriangle
          // For large triangles with high u.x, cosA_hat can approach -1,
          // making (1 + cosA_hat) near zero and the division unstable.
          // Use the algebraic identity only when cosA_hat > 0 (safe denominator).
-         const scalar_type rcpDenum = scalar_type(1) / denum;
-         const scalar_type oneMinusCosAhat = hlsl::select(cosA_hat > scalar_type(0), sinA_hat * sinA_hat / (scalar_type(1) + cosA_hat), scalar_type(1) - cosA_hat);
-         const scalar_type DminusN = sinA * (scalar_type(1) + triCosc) * oneMinusCosAhat;
-         sinBp = sqrt<scalar_type>(max<scalar_type>(scalar_type(0), DminusN * (denum + num))) * nbl::hlsl::abs(rcpDenum);
-         cosBp = scalar_type(1) - DminusN * rcpDenum;
+         const scalar_type rcpDenum = _static_cast<scalar_type>(1) / denum;
+         const scalar_type oneMinusCosAhat = hlsl::select(cosA_hat > _static_cast<scalar_type>(0), sinA_hat * sinA_hat / (_static_cast<scalar_type>(1) + cosA_hat), _static_cast<scalar_type>(1) - cosA_hat);
+         const scalar_type DminusN = sinA * (_static_cast<scalar_type>(1) + triCosc) * oneMinusCosAhat;
+         sinBp = sqrt<scalar_type>(max<scalar_type>(_static_cast<scalar_type>(0), DminusN * (denum + num))) * nbl::hlsl::abs(rcpDenum);
+         cosBp = _static_cast<scalar_type>(1) - DminusN * rcpDenum;
 #else // 17% faster, less accurate
          cosBp = num / denum;
-         sinBp = sqrt<scalar_type>(max<scalar_type>(scalar_type(0), scalar_type(1) - cosBp * cosBp));
+         sinBp = sqrt<scalar_type>(max<scalar_type>(_static_cast<scalar_type>(0), _static_cast<scalar_type>(1) - cosBp * cosBp));
 #endif
       }
       else // STA_PBRT, accurate, slowest
@@ -133,8 +133,8 @@ struct SphericalTriangle
          const scalar_type k1 = -t + cosA;
          const scalar_type k2 = -s - sinA * triCosc;
          cosBp = (k2 + impl::differenceOfProducts(k2, -t, k1, -s) * cosA) / (impl::sumOfProducts(k2, -s, k1, -t) * sinA);
-         cosBp = nbl::hlsl::clamp(cosBp, scalar_type(-1), scalar_type(1));
-         sinBp = sqrt<scalar_type>(scalar_type(1) - cosBp * cosBp);
+         cosBp = nbl::hlsl::clamp(cosBp, _static_cast<scalar_type>(-1), _static_cast<scalar_type>(1));
+         sinBp = sqrt<scalar_type>(_static_cast<scalar_type>(1) - cosBp * cosBp);
       }
 
       // Step 3: construct C' on the great circle through A toward C
@@ -155,8 +155,8 @@ struct SphericalTriangle
       // a u.y -> angle mapping whose derivatives stay bounded near u.y = 0 (e.g. acos(z) = angle
       // from B, then sample arc-length linearly), so the Jacobian is smooth and the skip band in
       // the tester can be removed.
-      const scalar_type z = scalar_type(1) - u.y * (scalar_type(1) - cosCpB);
-      const scalar_type sinZ = sqrt<scalar_type>(max<scalar_type>(scalar_type(0), scalar_type(1) - z * z));
+      const scalar_type z = _static_cast<scalar_type>(1) - u.y * (_static_cast<scalar_type>(1) - cosCpB);
+      const scalar_type sinZ = sqrt<scalar_type>(max<scalar_type>(_static_cast<scalar_type>(0), _static_cast<scalar_type>(1) - z * z));
       return z * tri_vertices[1] + sinZ * hlsl::normalize(cp - cosCpB * tri_vertices[1]);
    }
 
@@ -190,12 +190,12 @@ struct SphericalTriangle
          const scalar_type v_num = nbl::hlsl::dot(LminusB, LminusB);
          const scalar_type v_denom = nbl::hlsl::dot(AminusB, AminusB);
          const scalar_type v = hlsl::select(v_denom > numeric_limits<scalar_type>::epsilon,
-            nbl::hlsl::clamp(v_num / v_denom, scalar_type(0.0), scalar_type(1.0)),
-            scalar_type(0.0));
-         return vector2_type(scalar_type(0.0), v);
+            nbl::hlsl::clamp(v_num / v_denom, _static_cast<scalar_type>(0.0), _static_cast<scalar_type>(1.0)),
+            _static_cast<scalar_type>(0.0));
+         return vector2_type(_static_cast<scalar_type>(0.0), v);
       }
 
-      const scalar_type rcpR = scalar_type(1.0) / nbl::hlsl::sqrt(R_sq);
+      const scalar_type rcpR = _static_cast<scalar_type>(1.0) / nbl::hlsl::sqrt(R_sq);
       vector3_type cp = tri_vertices[0] * (tripleE * rcpR) + e_C * (-tripleA * rcpR);
       // two intersections exist; pick the one on the minor arc A->C (branchless sign flip)
       cp = ieee754::flipSignIfRHSNegative(cp, nbl::hlsl::dot(cp, APlusC));
@@ -214,9 +214,9 @@ struct SphericalTriangle
       const scalar_type AxBdotE = nbl::hlsl::dot(tri_vertices[0], nbl::hlsl::cross(tri_vertices[1], e_C));
       const scalar_type num = sinBp_inv * AxBdotE;
       const scalar_type cosCpB = nbl::hlsl::dot(tri_vertices[1], cp);
-      const scalar_type den = scalar_type(1.0) + triCosc + cosCpB + cosBp_inv;
-      const scalar_type subSolidAngle = scalar_type(2.0) * nbl::hlsl::atan2(nbl::hlsl::abs(num), den);
-      const scalar_type u = nbl::hlsl::clamp(subSolidAngle * rcpSolidAngle, scalar_type(0.0), scalar_type(1.0));
+      const scalar_type den = _static_cast<scalar_type>(1.0) + triCosc + cosCpB + cosBp_inv;
+      const scalar_type subSolidAngle = _static_cast<scalar_type>(2.0) * nbl::hlsl::atan2(nbl::hlsl::abs(num), den);
+      const scalar_type u = nbl::hlsl::clamp(subSolidAngle * rcpSolidAngle, _static_cast<scalar_type>(0.0), _static_cast<scalar_type>(1.0));
 
       // Step 3: u.y = |L-B|^2 / |C'-B|^2
       // Squared Euclidean distance avoids catastrophic cancellation vs (1-dot)/(1-dot)
@@ -226,8 +226,8 @@ struct SphericalTriangle
       const scalar_type v_denom = nbl::hlsl::dot(cpMinusB, cpMinusB);
       const scalar_type v = hlsl::select(v_denom > numeric_limits<scalar_type>::epsilon,
          nbl::hlsl::clamp(v_num / nbl::hlsl::max(v_denom, numeric_limits<scalar_type>::min),
-            scalar_type(0.0), scalar_type(1.0)),
-         scalar_type(0.0));
+            _static_cast<scalar_type>(0.0), _static_cast<scalar_type>(1.0)),
+         _static_cast<scalar_type>(0.0));
 
       return vector2_type(u, v);
    }
diff --git a/include/nbl/builtin/hlsl/sampling/uniform_spheres.hlsl b/include/nbl/builtin/hlsl/sampling/uniform_spheres.hlsl
index a5a14660f4..0d22323921 100644
--- a/include/nbl/builtin/hlsl/sampling/uniform_spheres.hlsl
+++ b/include/nbl/builtin/hlsl/sampling/uniform_spheres.hlsl
@@ -37,9 +37,9 @@ struct UniformHemisphere
 	{
 		typename ConcentricMapping<scalar_type>::cache_type cmCache;
 		const vector_t2 p = ConcentricMapping<T>::generate(u, cmCache);
-		assert(cmCache.r2 <= scalar_type(1.0));
-		const T z = T(1.0) - cmCache.r2;
-		const T xyScale = hlsl::sqrt<scalar_type>(scalar_type(2.0) - cmCache.r2);
+		assert(cmCache.r2 <= _static_cast<scalar_type>(1.0));
+		const T z = _static_cast<T>(1.0) - cmCache.r2;
+		const T xyScale = hlsl::sqrt<scalar_type>(_static_cast<scalar_type>(2.0) - cmCache.r2);
 		return vector_t3(p.x * xyScale, p.y * xyScale, z);
 	}
 
@@ -51,30 +51,30 @@ struct UniformHemisphere
 	static domain_type generateInverse(const codomain_type v)
 	{
 		// r_disk / r_xy = sqrt(1-z) / sqrt(1-z^2) = 1/sqrt(1+z)
-		const scalar_type scale = hlsl::rsqrt<T>(T(1.0) + v.z);
+		const scalar_type scale = hlsl::rsqrt<T>(_static_cast<T>(1.0) + v.z);
 		return ConcentricMapping<T>::generateInverse(vector_t2(v.x * scale, v.y * scale));
 	}
 
 	static density_type forwardPdf(const domain_type u, const cache_type cache)
 	{
-		return scalar_type(0.5) * numbers::inv_pi<T>;
+		return _static_cast<scalar_type>(0.5) * numbers::inv_pi<T>;
 	}
 
 	static weight_type forwardWeight(const domain_type u, const cache_type cache)
 	{
-		return T(0.5) * numbers::inv_pi<T>;
+		return _static_cast<T>(0.5) * numbers::inv_pi<T>;
 	}
 
 	static density_type backwardPdf(const codomain_type v)
 	{
 		assert(v.z > 0);
-		return T(0.5) * numbers::inv_pi<T>;
+		return _static_cast<T>(0.5) * numbers::inv_pi<T>;
 	}
 
 	static weight_type backwardWeight(const codomain_type v)
 	{
 		assert(v.z > 0);
-		return T(0.5) * numbers::inv_pi<T>;
+		return _static_cast<T>(0.5) * numbers::inv_pi<T>;
 	}
 
 };
@@ -97,7 +97,7 @@ struct UniformSphere
 
 	static codomain_type generate(const domain_type u)
 	{
-		const T tmp = u.x * T(2.0) - T(1.0);
+		const T tmp = u.x * _static_cast<T>(2.0) - _static_cast<T>(1.0);
 		const codomain_type L = hemisphere_t::generate(vector_t2(hlsl::abs<T>(tmp), u.y));
 		return vector_t3(L.x, L.y, L.z * hlsl::sign(tmp));
 	}
@@ -109,29 +109,29 @@ struct UniformSphere
 
 	static domain_type generateInverse(const codomain_type v)
 	{
-		const T dir = hlsl::sign(v.z) * T(0.5);
+		const T dir = hlsl::sign(v.z) * _static_cast<T>(0.5);
 		const domain_type hemiU = hemisphere_t::generateInverse(vector_t3(v.x, v.y, hlsl::abs<T>(v.z)));
-		return vector_t2(hemiU.x * dir + T(0.5), hemiU.y);
+		return vector_t2(hemiU.x * dir + _static_cast<T>(0.5), hemiU.y);
 	}
 
 	static density_type forwardPdf(const domain_type u, const cache_type cache)
 	{
-		return T(0.5) * hemisphere_t::forwardPdf(u, cache);
+		return _static_cast<T>(0.5) * hemisphere_t::forwardPdf(u, cache);
 	}
 
 	static weight_type forwardWeight(const domain_type u, const cache_type cache)
 	{
-		return T(0.5) * hemisphere_t::forwardWeight(u, cache);
+		return _static_cast<T>(0.5) * hemisphere_t::forwardWeight(u, cache);
 	}
 
 	static density_type backwardPdf(const codomain_type v)
 	{
-		return T(0.25) * numbers::inv_pi<T>;
+		return _static_cast<T>(0.25) * numbers::inv_pi<T>;
 	}
 
 	static weight_type backwardWeight(const codomain_type v)
 	{
-		return T(0.25) * numbers::inv_pi<T>;
+		return _static_cast<T>(0.25) * numbers::inv_pi<T>;
 	}
 
 };

From 581e185db9d7aaecafe33cec361e1a1914f77db4 Mon Sep 17 00:00:00 2001
From: Karim Mohamed <karimsayedre@gmail.com>
Date: Mon, 27 Apr 2026 22:22:32 +0300
Subject: [PATCH 26/26] OBB view, width specific rotl and rotr

---
 examples_tests                                |   2 +-
 include/nbl/builtin/hlsl/bit.hlsl             |  30 +++++-
 .../projected_spherical_rectangle.hlsl        |   6 +-
 include/nbl/builtin/hlsl/shapes/obb.hlsl      | 102 +++++++++++++++++-
 4 files changed, 134 insertions(+), 6 deletions(-)

diff --git a/examples_tests b/examples_tests
index b5d1d5bcba..07e76965a7 160000
--- a/examples_tests
+++ b/examples_tests
@@ -1 +1 @@
-Subproject commit b5d1d5bcba745e0b70539f2d2de68460b39af60e
+Subproject commit 07e76965a740d8a420779860de133dd88a3081f6
diff --git a/include/nbl/builtin/hlsl/bit.hlsl b/include/nbl/builtin/hlsl/bit.hlsl
index 71075d5491..f712c5aad5 100644
--- a/include/nbl/builtin/hlsl/bit.hlsl
+++ b/include/nbl/builtin/hlsl/bit.hlsl
@@ -3,6 +3,7 @@
 
 
 #include <nbl/builtin/hlsl/macros.h>
+#include <nbl/builtin/hlsl/cpp_compat/basic.h>
 
 
 #ifndef __HLSL_VERSION
@@ -123,5 +124,32 @@ uint16_t countl_zero(T n)
 }
 }
 #endif
- 
+
+namespace nbl
+{
+namespace hlsl
+{
+
+// Variable-width sub-word bit rotation
+template<typename T>
+NBL_CONSTEXPR_FUNC T rotl(T value, uint32_t bits, uint32_t width)
+{
+    const T mask = (width >= sizeof(T) * 8) ? ~T(0) : ((T(1) << width) - T(1));
+    value &= mask;
+    bits &= -(bits < width);
+    return ((value << bits) | (value >> (width - bits))) & mask;
+}
+
+template<typename T>
+NBL_CONSTEXPR_FUNC T rotr(T value, uint32_t bits, uint32_t width)
+{
+    const T mask = (width >= sizeof(T) * 8) ? ~T(0) : ((T(1) << width) - T(1));
+    value &= mask;
+    bits &= -(bits < width);
+    return ((value >> bits) | (value << (width - bits))) & mask;
+}
+
+}
+}
+
 #endif
diff --git a/include/nbl/builtin/hlsl/sampling/projected_spherical_rectangle.hlsl b/include/nbl/builtin/hlsl/sampling/projected_spherical_rectangle.hlsl
index 13cc13d30c..1ad8b5462e 100644
--- a/include/nbl/builtin/hlsl/sampling/projected_spherical_rectangle.hlsl
+++ b/include/nbl/builtin/hlsl/sampling/projected_spherical_rectangle.hlsl
@@ -128,7 +128,7 @@ struct ProjectedSphericalRectangle
       typename SphericalRectangle<scalar_type>::cache_type sphrectCache; // there's nothing in the cache
       const vector3_type dir = sphrect.generateNormalizedLocal(warped, sphrectCache, hitDist);
       NBL_IF_CONSTEXPR(!UsePdfAsWeight)
-      cache.L = hlsl::mul(hlsl::transpose(sphrect.basis), dir);
+         cache.L = hlsl::mul(hlsl::transpose(sphrect.basis), dir);
       return dir;
    }
 
@@ -139,7 +139,7 @@ struct ProjectedSphericalRectangle
       typename SphericalRectangle<scalar_type>::cache_type sphrectCache; // there's nothing in the cache
       const vector3_type dir = sphrect.generateUnnormalized(warped, sphrectCache);
       NBL_IF_CONSTEXPR(!UsePdfAsWeight)
-      cache.L = dir * hlsl::rsqrt(hlsl::dot(dir, dir));
+         cache.L = dir * hlsl::rsqrt(hlsl::dot(dir, dir));
       return dir;
    }
 
@@ -162,7 +162,7 @@ struct ProjectedSphericalRectangle
    weight_type forwardWeight(const domain_type u, const cache_type cache) NBL_CONST_MEMBER_FUNC
    {
       NBL_IF_CONSTEXPR(UsePdfAsWeight)
-      return forwardPdf(u, cache);
+         return forwardPdf(u, cache);
       return backwardWeight(cache.L);
    }
 
diff --git a/include/nbl/builtin/hlsl/shapes/obb.hlsl b/include/nbl/builtin/hlsl/shapes/obb.hlsl
index bdddc48ebf..3fa5eebc15 100644
--- a/include/nbl/builtin/hlsl/shapes/obb.hlsl
+++ b/include/nbl/builtin/hlsl/shapes/obb.hlsl
@@ -49,7 +49,7 @@ struct OBB
 	NBL_CONSTEXPR_STATIC_INLINE OBB createAxisAligned(point_t mid, point_t len)
 	{
 		point_t axes[D];
-		for (auto dim_i = 0; dim_i < D; dim_i++)
+		for (int dim_i = 0; dim_i < D; dim_i++)
 		{
 			axes[dim_i] = point_t(0);
 			axes[dim_i][dim_i] = 1;
@@ -60,6 +60,106 @@ struct OBB
 	matrix<scalar_t, D, D + 1> transform;
 };
 
+// Decomposed OBB view: caches columns and minCorner for fast vertex queries.
+// Same 12-float footprint as float32_t3x4, but laid out for branchless
+// corner computation.
+// Decomposed OBB view: caches columns and minCorner for fast vertex queries.
+// columns[i] are the full OBB edge vectors; minCorner is the world position
+// of corner 0b000 = center - 0.5*(col0+col1+col2).
+template<typename Scalar=float32_t>
+struct OBBView
+{
+	using scalar_t = Scalar;
+	using vec3_t = vector<Scalar, 3>;
+	using mat3_t = matrix<Scalar, 3, 3>;
+
+	mat3_t columns;
+	vec3_t minCorner;
+
+	static OBBView create(matrix<scalar_t, 3, 4> modelMatrix)
+	{
+		matrix<scalar_t, 4, 3> m = transpose(modelMatrix);
+		OBBView v;
+		v.columns = mat3_t(m[0].xyz, m[1].xyz, m[2].xyz);
+		v.minCorner = m[3].xyz - scalar_t(0.5) * (m[0].xyz + m[1].xyz + m[2].xyz);
+		return v;
+	}
+
+	vec3_t getCenter() NBL_CONST_MEMBER_FUNC
+	{
+		return minCorner + scalar_t(0.5) * (columns[0] + columns[1] + columns[2]);
+	}
+
+	vec3_t getVertex(uint32_t i) NBL_CONST_MEMBER_FUNC
+	{
+		vec3_t p = minCorner;
+		if (i & 1u) p += columns[0];
+		if (i & 2u) p += columns[1];
+		if (i & 4u) p += columns[2];
+		return p;
+	}
+
+	// Scalar-z specialization: only computes the z component of a corner.
+	// Used for clip-mask build where we only need the sign of z.
+	scalar_t getVertexZ(uint32_t i) NBL_CONST_MEMBER_FUNC
+	{
+		scalar_t pz = minCorner.z;
+		if (i & 1u) pz += columns[0].z;
+		if (i & 2u) pz += columns[1].z;
+		if (i & 4u) pz += columns[2].z;
+		return pz;
+	}
+
+	// Ray-OBB intersection via slab test in OBB-local space.
+	// Returns (tMin, tMax, hit). tMin is clamped to 0 if ray starts inside.
+	// TODO: not optimized -- precompute inverse columns, handle f~=0 edge case
+	struct Intersection
+	{
+		scalar_t tMin;
+		scalar_t tMax;
+		bool hit;
+	};
+
+	Intersection rayIntersection(vec3_t rayOrigin, vec3_t rayDir) NBL_CONST_MEMBER_FUNC
+	{
+		// Vector from ray origin to minCorner
+		vec3_t delta = rayOrigin - minCorner;
+
+		scalar_t tMin = scalar_t(-1e30);
+		scalar_t tMax = scalar_t(1e30);
+
+		// Slab test against each OBB axis
+		for (uint32_t i = 0; i < 3; i++)
+		{
+			vec3_t axis = columns[i];
+			scalar_t axisLenSq = dot(axis, axis);
+			scalar_t e = dot(axis, delta);
+			scalar_t f = dot(axis, rayDir);
+
+			// Project ray onto axis: entry at t where dot = 0, exit where dot = axisLenSq
+			scalar_t invF = scalar_t(1.0) / f;
+			scalar_t t1 = (-e) * invF;
+			scalar_t t2 = (axisLenSq - e) * invF;
+
+			if (t1 > t2)
+			{
+				scalar_t tmp = t1;
+				t1 = t2;
+				t2 = tmp;
+			}
+
+			tMin = max(tMin, t1);
+			tMax = min(tMax, t2);
+		}
+
+		Intersection result;
+		result.hit = (tMax >= tMin) && (tMax > scalar_t(0));
+		result.tMin = max(tMin, scalar_t(0));
+		result.tMax = tMax;
+		return result;
+	}
+};
+
 }
 }
 }