diff --git a/examples_tests b/examples_tests
index eebde787c2..07e76965a7 160000
--- a/examples_tests
+++ b/examples_tests
@@ -1 +1 @@
-Subproject commit eebde787c233367ade8eb0580bc79c0d562e97aa
+Subproject commit 07e76965a740d8a420779860de133dd88a3081f6
diff --git a/include/nbl/builtin/hlsl/algorithm.hlsl b/include/nbl/builtin/hlsl/algorithm.hlsl
index 631001686e..d80ebdf443 100644
--- a/include/nbl/builtin/hlsl/algorithm.hlsl
+++ b/include/nbl/builtin/hlsl/algorithm.hlsl
@@ -142,7 +142,9 @@ struct bound_t
 
     void comp_step(NBL_REF_ARG(Accessor) accessor, const uint32_t testPoint, const uint32_t rightBegin)
     {
-        if (compare(accessor[testPoint],value))
+        typename Accessor::value_type val;
+        accessor.get(testPoint, val);
+        if (compare(val,value))
             it = rightBegin;
     }
     void comp_step(NBL_REF_ARG(Accessor) accessor, const uint32_t testPoint)
diff --git a/include/nbl/builtin/hlsl/bit.hlsl b/include/nbl/builtin/hlsl/bit.hlsl
index 71075d5491..f712c5aad5 100644
--- a/include/nbl/builtin/hlsl/bit.hlsl
+++ b/include/nbl/builtin/hlsl/bit.hlsl
@@ -3,6 +3,7 @@
 
 
 #include <nbl/builtin/hlsl/macros.h>
+#include <nbl/builtin/hlsl/cpp_compat/basic.h>
 
 
 #ifndef __HLSL_VERSION
@@ -123,5 +124,32 @@ uint16_t countl_zero(T n)
 }
 }
 #endif
- 
+
+namespace nbl
+{
+namespace hlsl
+{
+
+// Variable-width sub-word bit rotation
+template<typename T>
+NBL_CONSTEXPR_FUNC T rotl(T value, uint32_t bits, uint32_t width)
+{
+    const T mask = (width >= sizeof(T) * 8) ? ~T(0) : ((T(1) << width) - T(1));
+    value &= mask;
+    bits &= -(bits < width);
+    return ((value << bits) | (value >> (width - bits))) & mask;
+}
+
+template<typename T>
+NBL_CONSTEXPR_FUNC T rotr(T value, uint32_t bits, uint32_t width)
+{
+    const T mask = (width >= sizeof(T) * 8) ? ~T(0) : ((T(1) << width) - T(1));
+    value &= mask;
+    bits &= -(bits < width);
+    return ((value >> bits) | (value << (width - bits))) & mask;
+}
+
+}
+}
+
 #endif
diff --git a/include/nbl/builtin/hlsl/functional.hlsl b/include/nbl/builtin/hlsl/functional.hlsl
index 118fe07c63..6a155f2b92 100644
--- a/include/nbl/builtin/hlsl/functional.hlsl
+++ b/include/nbl/builtin/hlsl/functional.hlsl
@@ -89,12 +89,23 @@ struct reference_wrapper : enable_if_t<
         return lhs OP rhs; \
     }
 
+#define ALIAS_STD_CMP(NAME,OP) template<typename T NBL_STRUCT_CONSTRAINABLE > struct NAME { \
+    using type_t = T; \
+    \
+    bool operator()(NBL_CONST_REF_ARG(T) lhs, NBL_CONST_REF_ARG(T) rhs) \
+    { \
+        return lhs OP rhs; \
+    }
+
 
 #else // CPP
 
 #define ALIAS_STD(NAME,OP) template<typename T> struct NAME : std::NAME<T> { \
     using type_t = T;
 
+#define ALIAS_STD_CMP(NAME,OP) template<typename T> struct NAME : std::NAME<T> { \
+    using type_t = T;
+
 #endif
 
 ALIAS_STD(bit_and,&)
@@ -136,14 +147,15 @@ ALIAS_STD(divides,/)
 };
 
 
-ALIAS_STD(equal_to, ==) };
-ALIAS_STD(not_equal_to, !=) };
-ALIAS_STD(greater, >) };
-ALIAS_STD(less, <) };
-ALIAS_STD(greater_equal, >=) };
-ALIAS_STD(less_equal, <=) };
+ALIAS_STD_CMP(equal_to, ==) };
+ALIAS_STD_CMP(not_equal_to, !=) };
+ALIAS_STD_CMP(greater, >) };
+ALIAS_STD_CMP(less, <) };
+ALIAS_STD_CMP(greater_equal, >=) };
+ALIAS_STD_CMP(less_equal, <=) };
 
 #undef ALIAS_STD
+#undef ALIAS_STD_CMP
 
 // The above comparison operators return bool on STD, but in HLSL they're supposed to yield bool vectors, so here's a specialization so that they return `vector<bool, N>` for vectorial types
 
diff --git a/include/nbl/builtin/hlsl/ies/sampler.hlsl b/include/nbl/builtin/hlsl/ies/sampler.hlsl
index ab4046477c..a6309fd128 100644
--- a/include/nbl/builtin/hlsl/ies/sampler.hlsl
+++ b/include/nbl/builtin/hlsl/ies/sampler.hlsl
@@ -85,7 +85,7 @@ struct CandelaSampler
         const angle_t vAngle = degrees(polar.theta);
         const angle_t hAngle = degrees(__wrapPhi(polar.phi, symmetry));
 
-#define NBL_IES_DEF_ANGLE_ACC(T, EXPR) struct T { using value_type = angle_t; accessor_t acc; value_type operator[](uint32_t idx) NBL_CONST_MEMBER_FUNC { return EXPR; } };
+#define NBL_IES_DEF_ANGLE_ACC(T, EXPR) struct T { using value_type = angle_t; accessor_t acc; value_type operator[](uint32_t idx) NBL_CONST_MEMBER_FUNC { return EXPR; } void get(uint32_t idx, NBL_REF_ARG(value_type) val) NBL_CONST_MEMBER_FUNC { val = EXPR; } };
 
         NBL_IES_DEF_ANGLE_ACC(VAcc, acc.vAngle(idx))
         NBL_IES_DEF_ANGLE_ACC(HAcc, acc.hAngle(idx))
diff --git a/include/nbl/builtin/hlsl/math/functions.hlsl b/include/nbl/builtin/hlsl/math/functions.hlsl
index b5b6f8feea..a80bdda904 100644
--- a/include/nbl/builtin/hlsl/math/functions.hlsl
+++ b/include/nbl/builtin/hlsl/math/functions.hlsl
@@ -93,13 +93,22 @@ scalar_type_t<T> lpNorm(NBL_CONST_REF_ARG(T) v)
 
 
 // valid only for `theta` in [-PI,PI]
-template <typename T NBL_FUNC_REQUIRES(concepts::FloatingPointLikeScalar<T>)
+// UseRealSinCos=true  -> back-to-back sin + cos. Saturates the special-function pipeline, enables vendor sincos fusion, full precision near multiples of pi.
+// UseRealSinCos=false -> cos + sqrt(1-c*c) with sign recovered from theta. Saves one special-function op when cos alone is cheaper than sin+cos, but suffers catastrophic cancellation as |c| -> 1.
+template <typename T, bool UseRealSinCos = true NBL_FUNC_REQUIRES(concepts::FloatingPointLikeScalar<T>)
 void sincos(T theta, NBL_REF_ARG(T) s, NBL_REF_ARG(T) c)
 {
-    s = sin<T>(theta);
-    c = cos<T>(theta);
-    // s = sqrt<T>(T(NBL_FP64_LITERAL(1.0))-c*c);
-    // s = ieee754::flipSign(s, theta < T(NBL_FP64_LITERAL(0.0)));
+    if (UseRealSinCos)
+    {
+        s = sin<T>(theta);
+        c = cos<T>(theta);
+    }
+    else
+    {
+        c = cos<T>(theta);
+        s = sqrt<T>(T(NBL_FP64_LITERAL(1.0))-c*c);
+        s = ieee754::flipSign(s, theta < T(NBL_FP64_LITERAL(0.0)));
+    }
 }
 
 template <typename T NBL_FUNC_REQUIRES(vector_traits<T>::Dimension == 3)
diff --git a/include/nbl/builtin/hlsl/sampling/alias_table.hlsl b/include/nbl/builtin/hlsl/sampling/alias_table.hlsl
index 15742e10f3..8e7b3249a0 100644
--- a/include/nbl/builtin/hlsl/sampling/alias_table.hlsl
+++ b/include/nbl/builtin/hlsl/sampling/alias_table.hlsl
@@ -7,6 +7,7 @@
 
 #include <nbl/builtin/hlsl/cpp_compat.hlsl>
 #include <nbl/builtin/hlsl/bit.hlsl>
+#include <nbl/builtin/hlsl/cpp_compat/intrinsics.hlsl>
 #include <nbl/builtin/hlsl/concepts/core.hlsl>
 #include <nbl/builtin/hlsl/concepts/accessors/generic_shared_data.hlsl>
 
@@ -17,84 +18,187 @@ namespace hlsl
 namespace sampling
 {
 
-// Alias Method (Vose/Walker) discrete sampler.
-//
-// Samples a discrete index in [0, N) with probability proportional to
-// precomputed weights in O(1) time per sample, using a prebuilt alias table.
-//
-// Accessor template parameters must satisfy GenericReadAccessor:
-//   accessor.template get<V, I>(index, outVal)  // void, writes to outVal
-//
-// - ProbabilityAccessor: reads scalar_type threshold in [0, 1] for bin i
-// - AliasIndexAccessor:  reads uint32_t redirect index for bin i
-// - PdfAccessor:         reads scalar_type weight[i] / totalWeight
-//
-// Satisfies TractableSampler (not BackwardTractableSampler: the mapping is discrete).
-// The cache stores the PDF value looked up during generate, avoiding redundant
-// storage of the codomain (sampled index) which is already the return value.
-template<typename T, typename Domain, typename Codomain, typename ProbabilityAccessor, typename AliasIndexAccessor, typename PdfAccessor
+// Packed alias-entry bit layout shared by every packed variant. One 32-bit
+// word holds the redirect index in the low Log2N bits and the stay-
+// probability quantized as an unorm in the high (32 - Log2N) bits.
+//   u * N  = scaled;  bin = floor(scaled);  remainder = scaled - bin
+//   if (remainder < getStayProb(word))  -> result = bin
+//   else                                -> result = getTarget(word)
+// Quantizing the threshold to (32 - Log2N) bits is precision-neutral: `u`
+// already consumed Log2N bits of randomness producing `bin`, so `remainder`
+// carries exactly that many bits of discriminatory power.
+namespace impl
+{
+template<uint32_t Log2N>
+struct AliasBitDecoder
+{
+	static uint32_t getTarget(uint32_t word)
+	{
+		return word & ((1u << Log2N) - 1u);
+	}
+	template<typename T>
+	static T getStayProb(uint32_t word)
+	{
+		const uint32_t unormMax = (~0u) >> Log2N;
+		return T(word >> Log2N) / T(unormMax);
+	}
+};
+} // namespace impl
+
+// 8 B entry used by the NBig == true variant. Embeds the bin's own pdf
+// alongside the packed word so the common stay-case needs no extra tap.
+template<typename T>
+struct PackedAliasEntryB
+{
+	uint32_t packedWord;	// low Log2N: redirect target; high 32-Log2N: stayProb unorm
+	T ownPdf;				// pdf of this bin
+};
+
+
+// NBig == false: 4 B packed word per bin + separate pdf[] array. Per sample
+// = one 4 B word load + one unconditional 4 B pdf[] tap indexed by the
+// selected bin (either the current bin or its redirect). Total 8 B whether
+// the sample stays or aliases. Favours small N.
+template<typename T, typename Domain, typename Codomain, typename PackedWordAccessor, typename PdfAccessor, uint32_t Log2N
 	NBL_PRIMARY_REQUIRES(
 		concepts::UnsignedIntegralScalar<Codomain> &&
-		concepts::accessors::GenericReadAccessor<ProbabilityAccessor, T, Codomain> &&
-		concepts::accessors::GenericReadAccessor<AliasIndexAccessor, Codomain, Codomain> &&
+		concepts::accessors::GenericReadAccessor<PackedWordAccessor, uint32_t, Codomain> &&
 		concepts::accessors::GenericReadAccessor<PdfAccessor, T, Codomain>)
-struct AliasTable
+struct PackedAliasTableA
 {
 	using scalar_type = T;
-
 	using domain_type = Domain;
 	using codomain_type = Codomain;
 	using density_type = scalar_type;
 	using weight_type = density_type;
+	using decoder = impl::AliasBitDecoder<Log2N>;
+	NBL_CONSTEXPR_STATIC_INLINE bool NBig = false;
 
 	struct cache_type
 	{
 		density_type pdf;
 	};
 
-	static AliasTable create(NBL_CONST_REF_ARG(ProbabilityAccessor) _probAccessor, NBL_CONST_REF_ARG(AliasIndexAccessor) _aliasAccessor, NBL_CONST_REF_ARG(PdfAccessor) _pdfAccessor, codomain_type _size)
+	static PackedAliasTableA create(NBL_CONST_REF_ARG(PackedWordAccessor) _entryAcc, NBL_CONST_REF_ARG(PdfAccessor) _pdfAcc, codomain_type _size)
 	{
-		AliasTable retval;
-		retval.probAccessor = _probAccessor;
-		retval.aliasAccessor = _aliasAccessor;
-		retval.pdfAccessor = _pdfAccessor;
-		// Precompute tableSize as float minus 1 ULP so that u=1.0 maps to bin N-1
+		PackedAliasTableA retval;
+		retval.entryAcc = _entryAcc;
+		retval.pdfAcc = _pdfAcc;
 		const scalar_type exact = scalar_type(_size);
 		retval.tableSizeMinusUlp = nbl::hlsl::bit_cast<scalar_type>(nbl::hlsl::bit_cast<uint32_t>(exact) - 1u);
 		return retval;
 	}
 
-	// BasicSampler interface
 	codomain_type generate(const domain_type u) NBL_CONST_MEMBER_FUNC
 	{
 		const scalar_type scaled = u * tableSizeMinusUlp;
 		const codomain_type bin = _static_cast<codomain_type>(scaled);
 		const scalar_type remainder = scaled - scalar_type(bin);
 
-		scalar_type prob;
-		probAccessor.template get<scalar_type, codomain_type>(bin, prob);
-
-		// Use if-statement to avoid select: aliasIndex is a dependent read
-		codomain_type result;
-		if (remainder < prob)
-		{
-			result = bin;
-		}
-		else
-		{
-			codomain_type alias;
-			aliasAccessor.template get<codomain_type, codomain_type>(bin, alias);
-			result = alias;
-		}
+		uint32_t packedWord;
+		entryAcc.template get<uint32_t, codomain_type>(bin, packedWord);
+		return hlsl::select(remainder < decoder::template getStayProb<scalar_type>(packedWord), bin, codomain_type(decoder::getTarget(packedWord)));
+	}
 
+	codomain_type generate(const domain_type u, NBL_REF_ARG(cache_type) cache) NBL_CONST_MEMBER_FUNC
+	{
+		const codomain_type result = generate(u);
+		pdfAcc.template get<scalar_type, codomain_type>(result, cache.pdf);
 		return result;
 	}
 
-	// TractableSampler interface
+	density_type forwardPdf(const domain_type u, NBL_CONST_REF_ARG(cache_type) cache) NBL_CONST_MEMBER_FUNC
+	{
+		return cache.pdf;
+	}
+
+	weight_type forwardWeight(const domain_type u, NBL_CONST_REF_ARG(cache_type) cache) NBL_CONST_MEMBER_FUNC
+	{
+		return cache.pdf;
+	}
+
+	density_type backwardPdf(const codomain_type v) NBL_CONST_MEMBER_FUNC
+	{
+		scalar_type pdf;
+		pdfAcc.template get<scalar_type, codomain_type>(v, pdf);
+		return pdf;
+	}
+
+	weight_type backwardWeight(const codomain_type v) NBL_CONST_MEMBER_FUNC
+	{
+		return backwardPdf(v);
+	}
+
+	PackedWordAccessor entryAcc;
+	PdfAccessor pdfAcc;
+	scalar_type tableSizeMinusUlp;
+};
+
+// NBig == true: 8 B entry {packedWord, ownPdf} + separate pdf[] array. Per
+// sample = one 8 B entry load (covers the common stay case where cache
+// already has ownPdf). If the sample aliases, a conditional 4 B pdf[target]
+// tap fills the cache. Total 8 B stay, 12 B aliased. Favours large N.
+template<typename T, typename Domain, typename Codomain, typename EntryAccessor, typename PdfAccessor, uint32_t Log2N
+	NBL_PRIMARY_REQUIRES(
+		concepts::UnsignedIntegralScalar<Codomain> &&
+		concepts::accessors::GenericReadAccessor<EntryAccessor, PackedAliasEntryB<T>, Codomain> &&
+		concepts::accessors::GenericReadAccessor<PdfAccessor, T, Codomain>)
+struct PackedAliasTableB
+{
+	using scalar_type = T;
+	using domain_type = Domain;
+	using codomain_type = Codomain;
+	using density_type = scalar_type;
+	using weight_type = density_type;
+	using entry_type = PackedAliasEntryB<scalar_type>;
+	using decoder = impl::AliasBitDecoder<Log2N>;
+	NBL_CONSTEXPR_STATIC_INLINE bool NBig = true;
+
+	struct cache_type
+	{
+		density_type pdf;
+	};
+
+	static PackedAliasTableB create(NBL_CONST_REF_ARG(EntryAccessor) _entryAcc, NBL_CONST_REF_ARG(PdfAccessor) _pdfAcc, codomain_type _size)
+	{
+		PackedAliasTableB retval;
+		retval.entryAcc = _entryAcc;
+		retval.pdfAcc = _pdfAcc;
+		const scalar_type exact = scalar_type(_size);
+		retval.tableSizeMinusUlp = nbl::hlsl::bit_cast<scalar_type>(nbl::hlsl::bit_cast<uint32_t>(exact) - 1u);
+		return retval;
+	}
+
+	codomain_type generate(const domain_type u) NBL_CONST_MEMBER_FUNC
+	{
+		const scalar_type scaled = u * tableSizeMinusUlp;
+		const codomain_type bin = _static_cast<codomain_type>(scaled);
+		const scalar_type remainder = scaled - scalar_type(bin);
+
+		entry_type entry;
+		entryAcc.template get<entry_type, codomain_type>(bin, entry);
+		return hlsl::select(remainder < decoder::template getStayProb<scalar_type>(entry.packedWord), bin, codomain_type(decoder::getTarget(entry.packedWord)));
+	}
+
 	codomain_type generate(const domain_type u, NBL_REF_ARG(cache_type) cache) NBL_CONST_MEMBER_FUNC
 	{
-		const codomain_type result = generate(u);
-		pdfAccessor.template get<scalar_type, codomain_type>(result, cache.pdf);
+		const scalar_type scaled = u * tableSizeMinusUlp;
+		const codomain_type bin = _static_cast<codomain_type>(scaled);
+		const scalar_type remainder = scaled - scalar_type(bin);
+
+		entry_type entry;
+		entryAcc.template get<entry_type, codomain_type>(bin, entry);
+
+		const bool stay = remainder < decoder::template getStayProb<scalar_type>(entry.packedWord);
+		
+		cache.pdf = entry.ownPdf;
+		codomain_type result = bin;
+		if (!stay)
+		{
+			const codomain_type target = codomain_type(decoder::getTarget(entry.packedWord));
+			pdfAcc.template get<scalar_type, codomain_type>(target, cache.pdf);
+			result = target;
+		}
 		return result;
 	}
 
@@ -111,7 +215,7 @@ struct AliasTable
 	density_type backwardPdf(const codomain_type v) NBL_CONST_MEMBER_FUNC
 	{
 		scalar_type pdf;
-		pdfAccessor.template get<scalar_type, codomain_type>(v, pdf);
+		pdfAcc.template get<scalar_type, codomain_type>(v, pdf);
 		return pdf;
 	}
 
@@ -120,9 +224,8 @@ struct AliasTable
 		return backwardPdf(v);
 	}
 
-	ProbabilityAccessor probAccessor;
-	AliasIndexAccessor aliasAccessor;
-	PdfAccessor pdfAccessor;
+	EntryAccessor entryAcc;
+	PdfAccessor pdfAcc;
 	scalar_type tableSizeMinusUlp;
 };
 
diff --git a/include/nbl/builtin/hlsl/sampling/alias_table_builder.h b/include/nbl/builtin/hlsl/sampling/alias_table_builder.h
index d02d21488c..2c7c53fd5f 100644
--- a/include/nbl/builtin/hlsl/sampling/alias_table_builder.h
+++ b/include/nbl/builtin/hlsl/sampling/alias_table_builder.h
@@ -5,7 +5,12 @@
 #ifndef _NBL_BUILTIN_HLSL_SAMPLING_ALIAS_TABLE_BUILDER_H_INCLUDED_
 #define _NBL_BUILTIN_HLSL_SAMPLING_ALIAS_TABLE_BUILDER_H_INCLUDED_
 
+#include <cmath>
 #include <cstdint>
+#include <span>
+#include <vector>
+
+#include <nbl/builtin/hlsl/sampling/alias_table.hlsl>
 
 namespace nbl
 {
@@ -15,74 +20,139 @@ namespace sampling
 {
 
 // Builds the alias table from an array of non-negative weights.
-// All output arrays must be pre-allocated to N entries.
 //
-// Parameters:
-//   weights         - input weights (non-negative, at least one must be > 0)
-//   N               - number of entries
-//   outProbability  - [out] alias table probability threshold per bin, in [0, 1]
-//   outAlias        - [out] alias redirect index per bin
-//   outPdf          - [out] normalized PDF per entry: weight[i] / sum(weights)
-//   workspace       - scratch buffer of N uint32_t entries
+// When `weights.size()` is a power of two, the builder transparently appends
+// one zero-weight dummy bucket so the GPU-facing table size is N+1 (odd),
+// which breaks PoT-periodic address patterns that alias memory channels /
+// cache sets on most GPUs. The sampled distribution is unchanged, the dummy
+// has stayProb = 0 and always redirects to a real donor.
+//
+// Output vectors are `resize`d by the builder to the final table size, so
+// the caller just passes (possibly empty) vectors and reads back the
+// returned size. That returned value is what to pass to the sampler's
+// `_size` argument and to use when packing / uploading.
 template<typename T>
 struct AliasTableBuilder
 {
-	static void build(std::span<const T> weights, T* outProbability, uint32_t* outAlias, T* outPdf, uint32_t* workspace)
-	{
-		T totalWeight = T(0);
-      for (uint32_t i = 0; i < weights.size(); i++)
-			totalWeight += weights[i];
-
-		const T rcpTotalWeight = T(1) / totalWeight;
-
-		// Compute PDFs, scaled probabilities, and partition into small/large in one pass
-		uint32_t smallEnd = 0;
-      uint32_t largeBegin = weights.size();
-      for (uint32_t i = 0; i < weights.size(); i++)
-		{
-			outPdf[i] = weights[i] * rcpTotalWeight;
-         outProbability[i] = outPdf[i] * T(weights.size());
-
-			if (outProbability[i] < T(1))
-				workspace[smallEnd++] = i;
-			else
-				workspace[--largeBegin] = i;
-		}
-
-		// Pair small and large entries
-      while (smallEnd > 0 && largeBegin < weights.size())
-		{
-			const uint32_t s = workspace[--smallEnd];
-			const uint32_t l = workspace[largeBegin];
-
-			outAlias[s] = l;
-			// outProbability[s] already holds the correct probability for bin s
-
-			outProbability[l] -= (T(1) - outProbability[s]);
-
-			if (outProbability[l] < T(1))
-			{
-				// l became small: pop from large, push to small
-				largeBegin++;
-				workspace[smallEnd++] = l;
-			}
-			// else l stays in large (don't pop, reuse next iteration)
-		}
-
-		// Remaining entries (floating point rounding artifacts)
-		while (smallEnd > 0)
-		{
-			const uint32_t s = workspace[--smallEnd];
-			outProbability[s] = T(1);
-			outAlias[s] = s;
-		}
-      while (largeBegin < weights.size())
-		{
-			const uint32_t l = workspace[largeBegin++];
-			outProbability[l] = T(1);
-			outAlias[l] = l;
-		}
-	}
+   // Ugly but much faster: we better ensure the table size is not a power of
+   // two, so we pad with +1 zero-weight dummy bucket when needed. PoT-sized
+   // alias tables hit GPU memory channel / cache set aliasing that can be
+   // wildly (sometimes 2x+) slower than a nearby non-PoT size. Builder owns
+   // all the sizing (resizes the output vectors, allocates its own scratch),
+   // so the caller can't get it wrong.
+   static uint32_t build(std::span<const T> weights, std::vector<T>& outProbability, std::vector<uint32_t>& outAlias, std::vector<T>& outPdf)
+   {
+      const uint32_t userN  = static_cast<uint32_t>(weights.size());
+      const uint32_t tableN = (userN > 1u && (userN & (userN - 1u)) == 0u) ? (userN + 1u) : userN;
+
+      outProbability.resize(tableN);
+      outAlias.resize(tableN);
+      outPdf.resize(tableN);
+      std::vector<uint32_t> workspace(tableN);
+
+      T totalWeight = T(0);
+      for (uint32_t i = 0; i < userN; i++)
+         totalWeight += weights[i];
+
+      const T rcpTotalWeight = T(1) / totalWeight;
+
+      // Compute PDFs, scaled probabilities, and partition into small/large in one pass
+      uint32_t smallEnd   = 0u;
+      uint32_t largeBegin = tableN;
+      for (uint32_t i = 0; i < userN; i++)
+      {
+         outPdf[i]         = weights[i] * rcpTotalWeight;
+         outProbability[i] = outPdf[i] * T(tableN);
+
+         if (outProbability[i] < T(1))
+            workspace[smallEnd++] = i;
+         else
+            workspace[--largeBegin] = i;
+      }
+      // PoT dodge tail: one zero-weight dummy at index userN, always in the small list.
+      if (tableN != userN)
+      {
+         outPdf[userN]         = T(0);
+         outProbability[userN] = T(0);
+         workspace[smallEnd++] = userN;
+      }
+
+      // Pair small and large entries
+      while (smallEnd > 0u && largeBegin < tableN)
+      {
+         const uint32_t s = workspace[--smallEnd];
+         const uint32_t l = workspace[largeBegin];
+
+         outAlias[s] = l;
+         // outProbability[s] already holds the correct probability for bin s
+
+         outProbability[l] -= (T(1) - outProbability[s]);
+
+         if (outProbability[l] < T(1))
+         {
+            // l became small: pop from large, push to small
+            largeBegin++;
+            workspace[smallEnd++] = l;
+         }
+         // else l stays in large (don't pop, reuse next iteration)
+      }
+
+      // Remaining entries (floating point rounding artifacts)
+      while (smallEnd > 0u)
+      {
+         const uint32_t s  = workspace[--smallEnd];
+         outProbability[s] = T(1);
+         outAlias[s]       = s;
+      }
+      while (largeBegin < tableN)
+      {
+         const uint32_t l  = workspace[largeBegin++];
+         outProbability[l] = T(1);
+         outAlias[l]       = l;
+      }
+
+      return tableN;
+   }
+
+   // Pack (target, stayProb) into a single 32-bit word with Log2N bits for
+   // target and (32 - Log2N) bits for the unorm-quantized threshold. Used by
+   // every packed variant; each packX() below calls this on a per-entry basis.
+   template<uint32_t Log2N>
+   static uint32_t packWord(uint32_t target, T stayProb)
+   {
+      const uint32_t targetMask = (Log2N == 32u) ? ~0u : ((1u << Log2N) - 1u);
+      const T        clamped    = stayProb < T(0) ? T(0) : (stayProb > T(1) ? T(1) : stayProb);
+      const uint32_t unormMax   = (Log2N == 0u) ? ~0u : ((~0u) >> Log2N);
+      const uint32_t probUnorm  = static_cast<uint32_t>(std::round(static_cast<double>(clamped) * static_cast<double>(unormMax)));
+      return (target & targetMask) | (probUnorm << Log2N);
+   }
+
+   // Variant A, pack SoA outputs into an array of 4 B packed words. The
+   // pdf[] array is consumed directly by the sampler as a second accessor.
+   // outWords must be pre-allocated to N uint32_t entries.
+   template<uint32_t Log2N>
+   static void packA(std::span<const T> probability, std::span<const uint32_t> alias, uint32_t* outWords)
+   {
+      const uint32_t N = static_cast<uint32_t>(probability.size());
+      for (uint32_t i = 0; i < N; i++)
+         outWords[i] = packWord<Log2N>(alias[i], probability[i]);
+   }
+
+   // Variant B, pack SoA outputs into 8 B entries { packedWord, ownPdf }.
+   // The pdf[] array is *also* passed to the sampler (same contents as ownPdf
+   // column, but tapped independently with a 4 B fetch when the sample aliases).
+   // outEntries must be pre-allocated to N entries.
+   template<uint32_t Log2N>
+   static void packB(std::span<const T> probability, std::span<const uint32_t> alias, std::span<const T> pdf,
+      PackedAliasEntryB<T>* outEntries)
+   {
+      const uint32_t N = static_cast<uint32_t>(probability.size());
+      for (uint32_t i = 0; i < N; i++)
+      {
+         outEntries[i].packedWord = packWord<Log2N>(alias[i], probability[i]);
+         outEntries[i].ownPdf     = pdf[i];
+      }
+   }
 };
 
 } // namespace sampling
diff --git a/include/nbl/builtin/hlsl/sampling/bilinear.hlsl b/include/nbl/builtin/hlsl/sampling/bilinear.hlsl
index 35f1391930..b56073074b 100644
--- a/include/nbl/builtin/hlsl/sampling/bilinear.hlsl
+++ b/include/nbl/builtin/hlsl/sampling/bilinear.hlsl
@@ -45,10 +45,10 @@ struct Bilinear
         vector2_type twiceAreasUnderXCurve = vector2_type(bilinearCoeffs[0] + bilinearCoeffs[1], bilinearCoeffs[2] + bilinearCoeffs[3]);
         // Linear::create adds FLT_MIN internally, replicate here so both divisions share
         // the same denominator (sum + 2*min), enabling CSE to merge them into one division
-        const scalar_type safeSum = twiceAreasUnderXCurve[0] + twiceAreasUnderXCurve[1] + scalar_type(2.0) * hlsl::numeric_limits<scalar_type>::min;
-        const scalar_type yNormFactor = scalar_type(2.0) / safeSum;
+        const scalar_type safeSum = twiceAreasUnderXCurve[0] + twiceAreasUnderXCurve[1] + _static_cast<scalar_type>(2.0) * hlsl::numeric_limits<scalar_type>::min;
+        const scalar_type yNormFactor = _static_cast<scalar_type>(2.0) / safeSum;
         retval.lineary = Linear<scalar_type>::create(twiceAreasUnderXCurve);
-        retval.normFactor = yNormFactor * scalar_type(2.0);
+        retval.normFactor = yNormFactor * _static_cast<scalar_type>(2.0);
         return retval;
     }
 
@@ -65,7 +65,7 @@ struct Bilinear
 
         // bilinear PDF = marginal_y_pdf * conditional_x_pdf; reuse both linear caches
         const scalar_type yPdf = lineary.forwardPdf(u.y, linearYCache);
-        cache.normalizedStart = yPdf * linearx.linearCoeffStart;
+        cache.normalizedStart = yPdf * linearx.normalizedCoeffStart;
         cache.linearXCache.diffTimesX *= yPdf;
         return p;
     }
diff --git a/include/nbl/builtin/hlsl/sampling/box_muller_transform.hlsl b/include/nbl/builtin/hlsl/sampling/box_muller_transform.hlsl
index b9efd1ec35..9ae0c83c05 100644
--- a/include/nbl/builtin/hlsl/sampling/box_muller_transform.hlsl
+++ b/include/nbl/builtin/hlsl/sampling/box_muller_transform.hlsl
@@ -37,16 +37,16 @@ struct BoxMullerTransform
     {
         BoxMullerTransform<T> retval;
         retval.stddev = _stddev;
-        retval.halfRcpStddev2 = scalar_type(0.5) / (_stddev * _stddev);
+        retval.halfRcpStddev2 = _static_cast<scalar_type>(0.5) / (_stddev * _stddev);
         return retval;
     }
 
     codomain_type generate(const domain_type u, NBL_REF_ARG(cache_type) cache) NBL_CONST_MEMBER_FUNC
     {
         scalar_type sinPhi, cosPhi;
-        math::sincos<scalar_type>(scalar_type(2.0) * numbers::pi<scalar_type> * u.y - numbers::pi<scalar_type>, sinPhi, cosPhi);
+        math::sincos<scalar_type>(_static_cast<scalar_type>(2.0) * numbers::pi<scalar_type> * u.y - numbers::pi<scalar_type>, sinPhi, cosPhi);
         cache.direction = vector2_type(cosPhi, sinPhi);
-        return cache.direction * nbl::hlsl::sqrt(scalar_type(-2.0) * nbl::hlsl::log(u.x)) * stddev;
+        return cache.direction * nbl::hlsl::sqrt(_static_cast<scalar_type>(-2.0) * nbl::hlsl::log(u.x)) * stddev;
     }
 
     density_type forwardPdf(const domain_type u, const cache_type cache) NBL_CONST_MEMBER_FUNC
diff --git a/include/nbl/builtin/hlsl/sampling/concentric_mapping.hlsl b/include/nbl/builtin/hlsl/sampling/concentric_mapping.hlsl
index c9e5cac5d6..d31c2d994a 100644
--- a/include/nbl/builtin/hlsl/sampling/concentric_mapping.hlsl
+++ b/include/nbl/builtin/hlsl/sampling/concentric_mapping.hlsl
@@ -41,7 +41,7 @@ struct ConcentricMapping
    static codomain_type generate(const domain_type u, NBL_REF_ARG(cache_type) cache)
    {
       // map [0,1]^2 to [-1,1]^2
-      const vector2_type centered = scalar_type(2) * u - hlsl::promote<vector2_type>(scalar_type(1));
+      const vector2_type centered = _static_cast<scalar_type>(2) * u - hlsl::promote<vector2_type>(_static_cast<scalar_type>(1));
 
       const scalar_type a = centered.x;
       const scalar_type b = centered.y;
@@ -51,10 +51,10 @@ struct ConcentricMapping
       const scalar_type dominant = hlsl::select(cond, a, b);
       const scalar_type minor = hlsl::select(cond, b, a);
 
-      const scalar_type safe_dominant = dominant != scalar_type(0) ? dominant : scalar_type(0);
+      const scalar_type safe_dominant = dominant != _static_cast<scalar_type>(0) ? dominant : _static_cast<scalar_type>(0);
       const scalar_type ratio = minor / safe_dominant;
 
-      const scalar_type angle = scalar_type(0.25) * numbers::pi<scalar_type> * ratio;
+      const scalar_type angle = _static_cast<scalar_type>(0.25) * numbers::pi<scalar_type> * ratio;
       const scalar_type c = hlsl::cos<scalar_type>(angle);
       const scalar_type s = hlsl::sin<scalar_type>(angle);
 
@@ -90,7 +90,7 @@ struct ConcentricMapping
       // angle in [0, pi/4]
       const scalar_type phi = hlsl::atan2(num, denom);
 
-      const scalar_type minor_val = r * phi / (scalar_type(0.25) * numbers::pi<scalar_type>);
+      const scalar_type minor_val = r * phi / (_static_cast<scalar_type>(0.25) * numbers::pi<scalar_type>);
 
       // reconstruct a,b using select instead of branching
       const scalar_type a_base = hlsl::select(swapped, minor_val, r);
@@ -99,7 +99,7 @@ struct ConcentricMapping
       const scalar_type a = ieee754::copySign(a_base, p.x);
       const scalar_type b = ieee754::copySign(b_base, p.y);
 
-      return (vector2_type(a, b) + hlsl::promote<vector2_type>(scalar_type(1))) * scalar_type(0.5);
+      return (vector2_type(a, b) + hlsl::promote<vector2_type>(_static_cast<scalar_type>(1))) * _static_cast<scalar_type>(0.5);
    }
 
    // The PDF of Shirley mapping is constant (1/PI on the unit disk)
diff --git a/include/nbl/builtin/hlsl/sampling/cos_weighted_spheres.hlsl b/include/nbl/builtin/hlsl/sampling/cos_weighted_spheres.hlsl
index 23e35e2f7d..52b063f448 100644
--- a/include/nbl/builtin/hlsl/sampling/cos_weighted_spheres.hlsl
+++ b/include/nbl/builtin/hlsl/sampling/cos_weighted_spheres.hlsl
@@ -35,8 +35,8 @@ struct ProjectedHemisphere
 
 	static codomain_type __generate(const domain_type u)
 	{
-		vector_t2 p = ConcentricMapping<T>::generate(u * T(0.99999) + T(0.000005));
-		T z = hlsl::sqrt<T>(hlsl::max<T>(T(0.0), T(1.0) - p.x * p.x - p.y * p.y));
+		vector_t2 p = ConcentricMapping<T>::generate(u * _static_cast<T>(0.99999) + _static_cast<T>(0.000005));
+		T z = hlsl::sqrt<T>(hlsl::max<T>(_static_cast<T>(0.0), _static_cast<T>(1.0) - p.x * p.x - p.y * p.y));
 		return vector_t3(p.x, p.y, z);
 	}
 
@@ -93,11 +93,11 @@ struct ProjectedSphere
 	static codomain_type __generate(NBL_REF_ARG(domain_type) u)
 	{
 		vector_t3 retval = hemisphere_t::__generate(u.xy);
-		const bool chooseLower = u.z > T(0.5);
+		const bool chooseLower = u.z > _static_cast<scalar_type>(0.5);
 		retval.z = chooseLower ? (-retval.z) : retval.z;
 		if (chooseLower)
-			u.z -= T(0.5);
-		u.z *= T(2.0);
+			u.z -= _static_cast<T>(0.5);
+		u.z *= _static_cast<T>(2.0);
 		return retval;
 	}
 
@@ -110,7 +110,7 @@ struct ProjectedSphere
 
 	static density_type forwardPdf(const domain_type u, const cache_type cache)
 	{
-		return T(0.5) * cache.z * numbers::inv_pi<T>;
+		return _static_cast<T>(0.5) * cache.z * numbers::inv_pi<T>;
 	}
 
 	static weight_type forwardWeight(const domain_type u, const cache_type cache)
@@ -120,7 +120,7 @@ struct ProjectedSphere
 
 	static density_type backwardPdf(const codomain_type L)
 	{
-		return T(0.5) * hlsl::abs(L.z) * numbers::inv_pi<T>;
+		return _static_cast<T>(0.5) * hlsl::abs(L.z) * numbers::inv_pi<T>;
 	}
 
 	static weight_type backwardWeight(const codomain_type L)
diff --git a/include/nbl/builtin/hlsl/sampling/cumulative_probability.hlsl b/include/nbl/builtin/hlsl/sampling/cumulative_probability.hlsl
index 1f176f9713..d3657babb1 100644
--- a/include/nbl/builtin/hlsl/sampling/cumulative_probability.hlsl
+++ b/include/nbl/builtin/hlsl/sampling/cumulative_probability.hlsl
@@ -16,18 +16,38 @@ namespace hlsl
 namespace sampling
 {
 
-// Discrete sampler using cumulative probability lookup via upper_bound.
+// Discrete sampler using cumulative probability lookup.
 //
 // Samples a discrete index in [0, N) with probability proportional to
 // precomputed weights in O(log N) time per sample.
 //
-// The cumulative probability array stores N-1 entries (the last bucket
-// is always 1.0 and need not be stored). Entry i holds the sum of
-// probabilities for indices [0, i].
+// Three layouts / cache-population strategies, selected by the Mode parameter:
+//
+//   TRACKING (default):  N-1 CDF entries, last bucket implicit at 1.0.
+//                        A stateful comparator records the straddling CDF
+//                        values during upper_bound itself.
+//   YOLO:                Same storage. Plain upper_bound followed by two
+//                        re-reads of the adjacent CDF entries (warm cache).
+//                        Lower register footprint, two extra array reads.
+//   EYTZINGER:           Level-order implicit binary tree in 2*P entries
+//                        where P = roundUpPot(N). Leaves at [P, P+N) hold
+//                        the CDF; interior nodes at [1, P) hold split keys.
+//                        Descent reads adjacent memory at each step, so
+//                        every cache line pulled is fully utilised and the
+//                        first log2(subgroupSize) iterations are served by a
+//                        single transaction per subgroup. Build with
+//                        sampling::buildEytzinger<T>().
 //
 // Satisfies TractableSampler and ResamplableSampler (not BackwardTractableSampler:
 // the mapping is discrete).
-template<typename T, typename Domain, typename Codomain, typename CumProbAccessor
+enum CumulativeProbabilityMode : uint32_t
+{
+	TRACKING  = 0u,
+	YOLO      = 1u,
+	EYTZINGER = 2u
+};
+
+template<typename T, typename Domain, typename Codomain, typename CumProbAccessor, CumulativeProbabilityMode Mode = CumulativeProbabilityMode::TRACKING
 	NBL_PRIMARY_REQUIRES(concepts::accessors::GenericReadAccessor<CumProbAccessor, T, Codomain>)
 struct CumulativeProbabilitySampler
 {
@@ -44,58 +64,116 @@ struct CumulativeProbabilitySampler
 		density_type upperBound;
 	};
 
+	// `_size` is the user-facing bucket count N for every mode. TRACKING / YOLO
+	// expect the accessor to hold N-1 CDF entries; EYTZINGER expects 2*P entries
+	// in the level-order layout produced by buildEytzinger.
 	static CumulativeProbabilitySampler create(NBL_CONST_REF_ARG(CumProbAccessor) _cumProbAccessor, uint32_t _size)
 	{
 		CumulativeProbabilitySampler retval;
 		retval.cumProbAccessor = _cumProbAccessor;
 		retval.storedCount = _size - 1u;
+		retval.depth = 0u;
+		NBL_IF_CONSTEXPR(Mode == CumulativeProbabilityMode::EYTZINGER)
+		{
+			uint32_t P = 1u;
+			uint32_t d = 0u;
+			while (P < _size) { P <<= 1u; ++d; }
+			retval.depth = d;
+		}
 		return retval;
 	}
 
 	// BasicSampler interface
 	codomain_type generate(const domain_type u) NBL_CONST_MEMBER_FUNC
 	{
-		// upper_bound returns first index where cumProb > u
-		return hlsl::upper_bound(cumProbAccessor, 0u, storedCount, u);
+		NBL_IF_CONSTEXPR(Mode == CumulativeProbabilityMode::EYTZINGER)
+		{
+			const uint32_t leafBase = 1u << depth;
+			uint32_t index = 1u;
+			for (uint32_t iter = 0u; iter < depth; ++iter)
+			{
+				density_type key;
+				cumProbAccessor.template get<density_type, uint32_t>(index, key);
+				index = (index << 1u) | uint32_t(!(u < key));
+			}
+			const codomain_type result = codomain_type(index - leafBase);
+			return result < codomain_type(storedCount) ? result : codomain_type(storedCount);
+		}
+		else
+		{
+			return hlsl::upper_bound(cumProbAccessor, 0u, storedCount, u);
+		}
 	}
 
 	// TractableSampler interface
 	codomain_type generate(const domain_type u, NBL_REF_ARG(cache_type) cache) NBL_CONST_MEMBER_FUNC
 	{
-		// #define NBL_CUMPROB_YOLO_READS
-#ifdef NBL_CUMPROB_YOLO_READS
-		// YOLO approach: re-read the array after binary search.
-		// The accessed elements are adjacent to the found index so the cache is warm.
-		const codomain_type result = hlsl::upper_bound(cumProbAccessor, 0u, storedCount, u);
-		cache.oneBefore = density_type(0.0);
-		if (result)
-			cumProbAccessor.template get<density_type, codomain_type>(result - 1u, cache.oneBefore);
-		cache.upperBound = density_type(1.0);
-		if (result < storedCount)
-			cumProbAccessor.template get<density_type, codomain_type>(result, cache.upperBound);
-#else
-		// Tracking reads approach: stateful comparator captures CDF values during binary search.
-		struct CdfComparator
+		codomain_type result;
+		NBL_IF_CONSTEXPR(Mode == CumulativeProbabilityMode::EYTZINGER)
 		{
-			bool operator()(const density_type value, const density_type rhs)
+			// Descent visits one interior node per level. Going left tightens
+			// the upper bound to the current key; going right tightens the
+			// lower bound. Final index, leafBase is the bucket.
+			cache.oneBefore = _static_cast<density_type>(0.0);
+			cache.upperBound = _static_cast<density_type>(1.0);
+			const uint32_t leafBase = 1u << depth;
+			uint32_t index = 1u;
+			for (uint32_t iter = 0u; iter < depth; ++iter)
 			{
-				const bool retval = value < rhs;
-				if (retval)
-					upperBound = rhs;
+				density_type key;
+				cumProbAccessor.template get<density_type, uint32_t>(index, key);
+				const bool goRight = !(u < key);
+				if (goRight)
+				{
+					cache.oneBefore = key;
+					index = (index << 1u) | 1u;
+				}
 				else
-					oneBefore = rhs;
-				return retval;
+				{
+					cache.upperBound = key;
+					index = (index << 1u);
+				}
 			}
-
-			density_type oneBefore;
-			density_type upperBound;
-		} comp;
-		comp.oneBefore = density_type(0.0);
-		comp.upperBound = density_type(1.0);
-		const codomain_type result = hlsl::upper_bound(cumProbAccessor, 0u, storedCount, u, comp);
-		cache.oneBefore = comp.oneBefore;
-		cache.upperBound = comp.upperBound;
-#endif
+			const codomain_type raw = codomain_type(index - leafBase);
+			result = raw < codomain_type(storedCount) ? raw : codomain_type(storedCount);
+		}
+		else NBL_IF_CONSTEXPR(Mode == CumulativeProbabilityMode::YOLO)
+		{
+			// Re-read the two adjacent CDF entries after the binary search.
+			// Both sit on the cache lines the search just touched, so they are warm.
+			result = hlsl::upper_bound(cumProbAccessor, 0u, storedCount, u);
+			cache.oneBefore = _static_cast<density_type>(0.0);
+			if (result)
+				cumProbAccessor.template get<density_type, codomain_type>(result - 1u, cache.oneBefore);
+			cache.upperBound = _static_cast<density_type>(1.0);
+			if (result < storedCount)
+				cumProbAccessor.template get<density_type, codomain_type>(result, cache.upperBound);
+		}
+		else
+		{
+			// TRACKING: stateful comparator captures the CDF values straddling the
+			// found index during the binary search itself, avoiding the two extra reads.
+			struct CdfComparator
+			{
+				bool operator()(const density_type value, const density_type rhs)
+				{
+					const bool retval = value < rhs;
+					if (retval)
+						upperBound = rhs;
+					else
+						oneBefore = rhs;
+					return retval;
+				}
+
+				density_type oneBefore;
+				density_type upperBound;
+			} comp;
+			comp.oneBefore = _static_cast<density_type>(0.0);
+			comp.upperBound = _static_cast<density_type>(1.0);
+			result = hlsl::upper_bound(cumProbAccessor, 0u, storedCount, u, comp);
+			cache.oneBefore = comp.oneBefore;
+			cache.upperBound = comp.upperBound;
+		}
 		return result;
 	}
 
@@ -111,16 +189,34 @@ struct CumulativeProbabilitySampler
 
 	density_type backwardPdf(const codomain_type v) NBL_CONST_MEMBER_FUNC
 	{
-		density_type retval = density_type(1.0);
-		if (v < storedCount)
-			cumProbAccessor.template get<density_type, codomain_type>(v, retval);
-		if (v)
+		NBL_IF_CONSTEXPR(Mode == CumulativeProbabilityMode::EYTZINGER)
 		{
-			density_type prev;
-			cumProbAccessor.template get<density_type, codomain_type>(v - 1u, prev);
-			retval -= prev;
+			// Leaves store the CDF directly; the last real leaf is normalized
+			// to 1.0 and padded leaves (if any) also hold 1.0.
+			const uint32_t leafBase = 1u << depth;
+			density_type retval;
+			cumProbAccessor.template get<density_type, uint32_t>(leafBase + uint32_t(v), retval);
+			if (v)
+			{
+				density_type prev;
+				cumProbAccessor.template get<density_type, uint32_t>(leafBase + uint32_t(v) - 1u, prev);
+				retval -= prev;
+			}
+			return retval;
+		}
+		else
+		{
+			density_type retval = _static_cast<density_type>(1.0);
+			if (v < storedCount)
+				cumProbAccessor.template get<density_type, codomain_type>(v, retval);
+			if (v)
+			{
+				density_type prev;
+				cumProbAccessor.template get<density_type, codomain_type>(v - 1u, prev);
+				retval -= prev;
+			}
+			return retval;
 		}
-		return retval;
 	}
 
 	weight_type backwardWeight(const codomain_type v) NBL_CONST_MEMBER_FUNC
@@ -129,7 +225,8 @@ struct CumulativeProbabilitySampler
 	}
 
 	CumProbAccessor cumProbAccessor;
-	uint32_t storedCount;
+	uint32_t storedCount;    // N - 1 (last real bucket index)
+	uint32_t depth;          // EYTZINGER only: ceil(log2(N)), iteration count; leafBase = 1 << depth
 };
 
 } // namespace sampling
diff --git a/include/nbl/builtin/hlsl/sampling/cumulative_probability_builder.h b/include/nbl/builtin/hlsl/sampling/cumulative_probability_builder.h
index a511fc2d8c..bf98d5ec93 100644
--- a/include/nbl/builtin/hlsl/sampling/cumulative_probability_builder.h
+++ b/include/nbl/builtin/hlsl/sampling/cumulative_probability_builder.h
@@ -30,6 +30,61 @@ void computeNormalizedCumulativeHistogram(std::span<const T> weights, T* outCumP
 	std::for_each(outCumProb, outCumProb + N - 1, [normalizationFactor](T& v) { v *= normalizationFactor; });
 }
 
+// Returns the next power of two >= x (and 1 for x <= 1). Matches the leaf-count
+// the Eytzinger builder pads to.
+inline uint32_t eytzingerLeafCount(uint32_t N)
+{
+	uint32_t P = 1u;
+	while (P < N) P <<= 1u;
+	return P;
+}
+
+// Builds an Eytzinger-layout CDF for cache-friendly binary search on the GPU.
+//
+// Layout (1-indexed, size 2*P where P = eytzingerLeafCount(N)):
+//   tree[0]           unused (keeps parent/child arithmetic branch-free)
+//   tree[1 .. P-1]    interior split keys; tree[v] == rightmost leaf of v's left subtree
+//   tree[P .. P+N-1]  leaves, tree[P + i] = normalized inclusive scan of weights up to i
+//   tree[P+N .. 2P-1] padded leaves, all 1.0 (any u < 1.0 routes away from these)
+//
+// The sampler walks the tree as index = (index << 1) | goRight for ceil(log2(N))
+// iterations. Successive taps within one search land on adjacent memory, so every
+// cache line pulled is fully used and the first log2(subgroupSize) iterations are
+// served by a single memory transaction per subgroup.
+template<typename T>
+void buildEytzinger(std::span<const T> weights, T* outTree)
+{
+	const uint32_t N = static_cast<uint32_t>(weights.size());
+	if (N == 0)
+		return;
+
+	const uint32_t P = eytzingerLeafCount(N);
+
+	T total = T(0);
+	for (uint32_t i = 0; i < N; ++i)
+		total += weights[i];
+	const T rcpTotal = T(1) / total;
+
+	T acc = T(0);
+	for (uint32_t i = 0; i < N; ++i)
+	{
+		acc += weights[i];
+		outTree[P + i] = acc * rcpTotal;
+	}
+	for (uint32_t i = N; i < P; ++i)
+		outTree[P + i] = T(1);
+
+	// Bottom-up: each interior node copies the rightmost leaf of its left subtree,
+	// found by descending left-then-always-right from v.
+	for (uint32_t v = P; v-- > 1u;)
+	{
+		uint32_t r = v << 1u;
+		while (r < P)
+			r = (r << 1u) | 1u;
+		outTree[v] = outTree[r];
+	}
+}
+
 } // namespace sampling
 } // namespace hlsl
 } // namespace nbl
diff --git a/include/nbl/builtin/hlsl/sampling/linear.hlsl b/include/nbl/builtin/hlsl/sampling/linear.hlsl
index 12602b4a79..cd260cea3a 100644
--- a/include/nbl/builtin/hlsl/sampling/linear.hlsl
+++ b/include/nbl/builtin/hlsl/sampling/linear.hlsl
@@ -39,14 +39,14 @@ struct Linear
         // add min to both coefficients so (0,0) input produces a valid uniform sampler
         // instead of inf normalization (2/0) leading to NaN; negligible for normal inputs
         const vector2_type safeCoeffs = linearCoeffs + vector2_type(hlsl::numeric_limits<scalar_type>::min, hlsl::numeric_limits<scalar_type>::min);
-        // normalize coefficients so that the PDF is simply linearCoeffStart + linearCoeffDiff * x
-        const scalar_type normFactor = scalar_type(2.0) / (safeCoeffs[0] + safeCoeffs[1]);
+        // normalize coefficients so that the PDF is simply normalizedCoeffStart + linearCoeffDiff * x
+        const scalar_type normFactor = _static_cast<scalar_type>(2.0) / (safeCoeffs[0] + safeCoeffs[1]);
         const vector2_type normalized = safeCoeffs * normFactor;
-        retval.linearCoeffStart = normalized[0];
-        retval.linearCoeffEnd = normalized[1];
+        retval.normalizedCoeffStart = normalized[0];
+        retval.normalizedCoeffEnd = normalized[1];
         // precompute for the stable quadratic in generate()
         retval.squaredCoeffStart = normalized[0] * normalized[0];
-        retval.twoTimesDiff = scalar_type(2.0) * (normalized[1] - normalized[0]);
+        retval.twoTimesDiff = _static_cast<scalar_type>(2.0) * (normalized[1] - normalized[0]);
         return retval;
     }
 
@@ -57,18 +57,18 @@ struct Linear
         // Quadratic (1-start)*x^2 + start*x - u = 0; since start >= 0 the stable root is
         // x = 2u / (start + sqrt(start^2 + 2*diff*u)), which never cancels.
         const scalar_type sqrtTerm = sqrt(squaredCoeffStart + twoTimesDiff * u);
-        const scalar_type denom = linearCoeffStart + sqrtTerm;
+        const scalar_type denom = normalizedCoeffStart + sqrtTerm;
         // NOTE: floating point can make x slightly > 1 when u~1 and diff < 0; callers needing
         // non-negative PDF at the boundary should clamp with min(x, 1).
         const codomain_type x = (u + u) / denom;
         // diff*x == sqrtTerm - start algebraically (conjugate identity), saves 1 mul
-        cache.diffTimesX = sqrtTerm - linearCoeffStart;
+        cache.diffTimesX = sqrtTerm - normalizedCoeffStart;
         return x;
     }
 
     density_type forwardPdf(const domain_type u, const cache_type cache) NBL_CONST_MEMBER_FUNC
     {
-        return linearCoeffStart + cache.diffTimesX;
+        return normalizedCoeffStart + cache.diffTimesX;
     }
 
     weight_type forwardWeight(const domain_type u, const cache_type cache) NBL_CONST_MEMBER_FUNC
@@ -82,8 +82,8 @@ struct Linear
     // Not used because we already store start for generate().
     density_type backwardPdf(const codomain_type x) NBL_CONST_MEMBER_FUNC
     {
-        assert(x >= scalar_type(0.0) && x <= scalar_type(1.0));
-        return hlsl::mix(linearCoeffStart, linearCoeffEnd, x);
+        assert(x >= _static_cast<scalar_type>(0.0) && x <= _static_cast<scalar_type>(1.0));
+        return hlsl::mix(normalizedCoeffStart, normalizedCoeffEnd, x);
     }
 
     weight_type backwardWeight(const codomain_type x) NBL_CONST_MEMBER_FUNC
@@ -91,8 +91,8 @@ struct Linear
         return backwardPdf(x);
     }
 
-    scalar_type linearCoeffStart;
-    scalar_type linearCoeffEnd;
+    scalar_type normalizedCoeffStart;
+    scalar_type normalizedCoeffEnd;
     scalar_type squaredCoeffStart;
     scalar_type twoTimesDiff;
 };
diff --git a/include/nbl/builtin/hlsl/sampling/polar_mapping.hlsl b/include/nbl/builtin/hlsl/sampling/polar_mapping.hlsl
index 719eaf504f..64d5e69b96 100644
--- a/include/nbl/builtin/hlsl/sampling/polar_mapping.hlsl
+++ b/include/nbl/builtin/hlsl/sampling/polar_mapping.hlsl
@@ -32,7 +32,7 @@ struct PolarMapping
 	static codomain_type generate(const domain_type u, NBL_REF_ARG(cache_type) cache)
 	{
 		const scalar_type r = hlsl::sqrt<scalar_type>(u.x);
-		const scalar_type phi = scalar_type(2) * numbers::pi<scalar_type> * u.y;
+		const scalar_type phi = _static_cast<scalar_type>(2) * numbers::pi<scalar_type> * u.y;
 		return vector2_type(r * hlsl::cos<scalar_type>(phi), r * hlsl::sin<scalar_type>(phi));
 	}
 
@@ -46,8 +46,8 @@ struct PolarMapping
 	{
 		const scalar_type r2 = p.x * p.x + p.y * p.y;
 		scalar_type phi = hlsl::atan2(p.y, p.x);
-		phi += hlsl::mix(scalar_type(0), scalar_type(2) * numbers::pi<scalar_type>, phi < scalar_type(0));
-		return vector2_type(r2, phi * (scalar_type(0.5) * numbers::inv_pi<scalar_type>));
+		phi += hlsl::mix(_static_cast<scalar_type>(0), _static_cast<scalar_type>(2) * numbers::pi<scalar_type>, phi < _static_cast<scalar_type>(0));
+		return vector2_type(r2, phi * (_static_cast<scalar_type>(0.5) * numbers::inv_pi<scalar_type>));
 	}
 
 	static density_type forwardPdf(const domain_type u, cache_type cache) { return numbers::inv_pi<scalar_type>; }
diff --git a/include/nbl/builtin/hlsl/sampling/projected_spherical_rectangle.hlsl b/include/nbl/builtin/hlsl/sampling/projected_spherical_rectangle.hlsl
index 5bf652cb4c..1ad8b5462e 100644
--- a/include/nbl/builtin/hlsl/sampling/projected_spherical_rectangle.hlsl
+++ b/include/nbl/builtin/hlsl/sampling/projected_spherical_rectangle.hlsl
@@ -25,147 +25,166 @@ namespace sampling
 //   2. Warp uniform [0,1]^2 through the bilinear to importance-sample NdotL
 //   3. Feed the warped UV into the solid angle sampler to get a rect offset
 //   4. PDF = (1/SolidAngle) * bilinearPdf
-//
-// Template parameter `UsePdfAsWeight`: when true (default), forwardWeight/backwardWeight
-// return the PDF instead of the projected-solid-angle MIS weight.
-// TODO: the projected-solid-angle MIS weight (UsePdfAsWeight=false) has been shown to be
-// poor in practice. Once confirmed by testing, remove the false path and stop storing
-// receiverNormal, receiverWasBSDF, and rcpProjSolidAngle as members.
 template<typename T, bool UsePdfAsWeight = true>
 struct ProjectedSphericalRectangle
 {
-	using scalar_type = T;
-	using vector2_type = vector<T, 2>;
-	using vector3_type = vector<T, 3>;
-	using vector4_type = vector<T, 4>;
-
-	// BackwardTractableSampler concept types
-	using domain_type = vector2_type;
-	using codomain_type = vector3_type;
-	using density_type = scalar_type;
-	using weight_type = density_type;
-
-	struct cache_type
-	{
-		scalar_type abs_cos_theta;
-		vector2_type warped;
-		typename Bilinear<scalar_type>::cache_type bilinearCache;
-	};
-
-	// NOTE: produces a degenerate (all-zero) bilinear patch when the receiver normal faces away
-	// from all four rectangle vertices, resulting in NaN PDFs (0 * inf). Callers must ensure
-	// at least one vertex has positive projection onto the receiver normal.
-	static ProjectedSphericalRectangle<T,UsePdfAsWeight> create(NBL_CONST_REF_ARG(shapes::SphericalRectangle<T>) shape, const vector3_type observer, const vector3_type _receiverNormal, const bool _receiverWasBSDF)
-	{
-		ProjectedSphericalRectangle<T,UsePdfAsWeight> retval;
-		const vector3_type n = hlsl::mul(shape.basis, _receiverNormal);
-		retval.localReceiverNormal = n;
-		retval.receiverWasBSDF = _receiverWasBSDF;
-
-		// Compute solid angle and get r0 in local frame (before z-flip)
-		const typename shapes::SphericalRectangle<T>::solid_angle_type sa = shape.solidAngle(observer);
-		const vector3_type r0 = sa.r0;
-
-		// All 4 corners share r0.z; x is r0.x or r0.x+ex, y is r0.y or r0.y+ey
-		const scalar_type r1x = r0.x + shape.extents.x;
-		const scalar_type r1y = r0.y + shape.extents.y;
-
-		// Unnormalized dots: dot(corner_i, n)
-		const scalar_type base_dot = hlsl::dot(r0, n);
-		const scalar_type dx = shape.extents.x * n.x;
-		const scalar_type dy = shape.extents.y * n.y;
-		const vector4_type dots = vector4_type(base_dot, base_dot + dx, base_dot + dy, base_dot + dx + dy);
-
-		// Squared lengths of each corner
-		const scalar_type r0zSq = r0.z * r0.z;
-		const vector4_type lenSq = vector4_type(
-			r0.x * r0.x + r0.y * r0.y,
-			r1x * r1x + r0.y * r0.y,
-			r0.x * r0.x + r1y * r1y,
-			r1x * r1x + r1y * r1y
-		) + hlsl::promote<vector4_type>(r0zSq);
-
-		// dot(normalize(corner), n) = dot(corner, n) * rsqrt(lenSq)
-		// Bilinear corners: [0]=v00 [1]=v10 [2]=v01 [3]=v11
-		const scalar_type minimumProjSolidAngle = 0.0;
-		const vector4_type bxdfPdfAtVertex = math::conditionalAbsOrMax(_receiverWasBSDF,
-			dots * hlsl::rsqrt<vector4_type>(lenSq),
-			hlsl::promote<vector4_type>(minimumProjSolidAngle));
-		retval.bilinearPatch = Bilinear<scalar_type>::create(bxdfPdfAtVertex);
-
-		// Reuse the already-computed solid_angle_type to avoid recomputing mul(basis, origin - observer)
-		retval.sphrect = SphericalRectangle<T>::create(sa, shape.extents);
-		retval.rcpSolidAngle = retval.sphrect.solidAngle > scalar_type(0.0) ? scalar_type(1.0) / retval.sphrect.solidAngle : scalar_type(0.0);
-
-		NBL_IF_CONSTEXPR(!UsePdfAsWeight)
-		{
-			const scalar_type projSA = shape.projectedSolidAngleFromLocal(r0, n);
-			retval.rcpProjSolidAngle = projSA > scalar_type(0.0) ? scalar_type(1.0) / projSA : scalar_type(0.0);
-		}
-		return retval;
-	}
-
-	// returns a normalized 3D direction in the local frame
-	codomain_type generate(const domain_type u, NBL_REF_ARG(cache_type) cache) NBL_CONST_MEMBER_FUNC
-	{
-		Bilinear<scalar_type> bilinear = bilinearPatch;
-		cache.warped = bilinear.generate(u, cache.bilinearCache);
-		typename SphericalRectangle<scalar_type>::cache_type sphrectCache;
-		const vector3_type dir = sphrect.generate(cache.warped, sphrectCache);
-		cache.abs_cos_theta = bilinear.forwardWeight(u, cache.bilinearCache);
-		return dir;
-	}
-
-	// returns a 2D offset on the rectangle surface from the r0 corner
-	vector2_type generateSurfaceOffset(const domain_type u, NBL_REF_ARG(cache_type) cache) NBL_CONST_MEMBER_FUNC
-	{
-		Bilinear<scalar_type> bilinear = bilinearPatch;
-		cache.warped = bilinear.generate(u, cache.bilinearCache);
-		typename SphericalRectangle<scalar_type>::cache_type sphrectCache;
-		const vector2_type sampleOffset = sphrect.generateSurfaceOffset(cache.warped, sphrectCache);
-		cache.abs_cos_theta = bilinear.forwardWeight(u, cache.bilinearCache);
-		return sampleOffset;
-	}
-
-	density_type forwardPdf(const domain_type u, const cache_type cache) NBL_CONST_MEMBER_FUNC
-	{
-		return rcpSolidAngle * bilinearPatch.forwardPdf(u, cache.bilinearCache);
-	}
-
-	weight_type forwardWeight(const domain_type u, const cache_type cache) NBL_CONST_MEMBER_FUNC
-	{
-		if (UsePdfAsWeight)
-			return forwardPdf(u, cache);
-		return cache.abs_cos_theta * rcpProjSolidAngle;
-	}
-
-	// `p` is the normalized [0,1]^2 position on the rectangle
-	density_type backwardPdf(const vector2_type p) NBL_CONST_MEMBER_FUNC
-	{
-		return rcpSolidAngle * bilinearPatch.backwardPdf(p);
-	}
-
-	weight_type backwardWeight(const vector2_type p) NBL_CONST_MEMBER_FUNC
-	{
-		NBL_IF_CONSTEXPR(UsePdfAsWeight)
-			return backwardPdf(p);
-		const scalar_type minimumProjSolidAngle = 0.0;
-		// Reconstruct local direction from normalized rect position
-		const vector3_type localDir = hlsl::normalize(sphrect.r0 + vector3_type(
-			p.x * sphrect.extents.x,
-			p.y * sphrect.extents.y,
-			scalar_type(0)
-		));
-		const scalar_type abs_cos_theta = math::conditionalAbsOrMax(receiverWasBSDF, hlsl::dot(localReceiverNormal, localDir), minimumProjSolidAngle);
-		return abs_cos_theta * rcpProjSolidAngle;
-	}
-
-	sampling::SphericalRectangle<T> sphrect;
-	Bilinear<scalar_type> bilinearPatch;
-	scalar_type rcpSolidAngle;
-	scalar_type rcpProjSolidAngle;
-	vector3_type localReceiverNormal;
-	bool receiverWasBSDF;
+   using scalar_type  = T;
+   using vector2_type = vector<T, 2>;
+   using vector3_type = vector<T, 3>;
+   using vector4_type = vector<T, 4>;
+
+   // BackwardTractableSampler concept types
+   using domain_type   = vector2_type;
+   using codomain_type = vector3_type;
+   using density_type  = scalar_type;
+   using weight_type   = density_type;
+
+   struct cache_type
+   {
+      typename Bilinear<scalar_type>::cache_type bilinearCache;
+      vector3_type L; // TODO: same as projected triangle w.r.t. UsePdfAsWeight==false
+   };
+
+   // Shared finalization for both create() overloads: builds the bilinear patch, the inner sphrect
+   // sampler, and the UsePdfAsWeight=false extras. The two overloads differ only in how they
+   // compute bxdfPdfAtVertex (worldspace corner normalizations vs local-frame rsqrt(lenSq)).
+   static ProjectedSphericalRectangle<T, UsePdfAsWeight> create(NBL_CONST_REF_ARG(shapes::SphericalRectangle<T>) shape, NBL_CONST_REF_ARG(typename shapes::SphericalRectangle<T>::solid_angle_type) sa,
+      const vector4_type bxdfPdfAtVertex, const vector3_type _receiverNormal)
+   {
+      ProjectedSphericalRectangle<T, UsePdfAsWeight> retval;
+      retval.bilinearPatch = Bilinear<scalar_type>::create(bxdfPdfAtVertex);
+      // Reuse solid_angle_type to avoid recomputing mul(basis, origin - observer)
+      retval.sphrect = SphericalRectangle<T>::create(shape.basis, sa, shape.extents);
+      NBL_IF_CONSTEXPR(!UsePdfAsWeight)
+      {
+         retval.receiverNormal     = _receiverNormal;
+         const vector3_type nLocal = hlsl::mul(shape.basis, _receiverNormal);
+         retval.projSolidAngle     = shape.projectedSolidAngleFromLocal(sa.r0, nLocal);
+      }
+      return retval;
+   }
+
+   // Shouldn't produce NAN if all corners have 0 proj solid angle due to min density adds/clamps in the linear sampler
+   static ProjectedSphericalRectangle<T, UsePdfAsWeight> create(NBL_CONST_REF_ARG(shapes::CompressedSphericalRectangle<T>) compressed, const vector3_type observer, const vector3_type _receiverNormal, const bool _receiverWasBSDF)
+   {
+      // 4 normalized worldspace corners dotted with the worldspace receiver normal. Avoids the
+      // mul(basis, receiverNormal) data dependency of the uncompressed overload so these 4
+      // normalize+dot chains can pipeline alongside the basis/solid-angle work below.
+      const vector3_type c0   = compressed.origin - observer;
+      const vector3_type c1   = c0 + compressed.right;
+      const vector3_type c2   = c0 + compressed.up;
+      const vector3_type c3   = c1 + compressed.up;
+      const vector4_type dots = vector4_type(
+         hlsl::dot(hlsl::normalize(c0), _receiverNormal),
+         hlsl::dot(hlsl::normalize(c1), _receiverNormal),
+         hlsl::dot(hlsl::normalize(c2), _receiverNormal),
+         hlsl::dot(hlsl::normalize(c3), _receiverNormal));
+      const scalar_type minimumProjSolidAngle = _static_cast<scalar_type>(0.0);
+      const vector4_type bxdfPdfAtVertex      = math::conditionalAbsOrMax(_receiverWasBSDF, dots, hlsl::promote<vector4_type>(minimumProjSolidAngle));
+
+      const shapes::SphericalRectangle<T> shape                         = shapes::SphericalRectangle<T>::create(compressed);
+      const typename shapes::SphericalRectangle<T>::solid_angle_type sa = shape.solidAngle(observer);
+      return create(shape, sa, bxdfPdfAtVertex, _receiverNormal);
+   }
+
+   // Shouldn't produce NAN if all corners have 0 proj solid angle due to min density adds/clamps in the linear sampler
+   static ProjectedSphericalRectangle<T, UsePdfAsWeight> create(NBL_CONST_REF_ARG(shapes::SphericalRectangle<T>) shape, const vector3_type observer, const vector3_type _receiverNormal, const bool _receiverWasBSDF)
+   {
+      // Local-frame path: unnormalized dot(corner_i, n) with n = basis * receiverNormal, then
+      // a single rsqrt<vec4>(lenSq) for all 4 corner normalizations at once.
+      const vector3_type n                                              = hlsl::mul(shape.basis, _receiverNormal);
+      const typename shapes::SphericalRectangle<T>::solid_angle_type sa = shape.solidAngle(observer);
+      const vector3_type r0                                             = sa.r0;
+
+      // All 4 corners share r0.z; x is r0.x or r0.x+ex, y is r0.y or r0.y+ey
+      const scalar_type r1x      = r0.x + shape.extents.x;
+      const scalar_type r1y      = r0.y + shape.extents.y;
+      const scalar_type base_dot = hlsl::dot(r0, n);
+      const scalar_type dx       = shape.extents.x * n.x;
+      const scalar_type dy       = shape.extents.y * n.y;
+      const vector4_type dots    = vector4_type(base_dot, base_dot + dx, base_dot + dy, base_dot + dx + dy);
+
+      const scalar_type r0zSq  = r0.z * r0.z;
+      const vector4_type lenSq = vector4_type(
+                                    r0.x * r0.x + r0.y * r0.y,
+                                    r1x * r1x + r0.y * r0.y,
+                                    r0.x * r0.x + r1y * r1y,
+                                    r1x * r1x + r1y * r1y) +
+         hlsl::promote<vector4_type>(r0zSq);
+
+      // dot(normalize(corner), n) = dot(corner, n) * rsqrt(lenSq). Bilinear corners: [0]=v00 [1]=v10 [2]=v01 [3]=v11
+      const scalar_type minimumProjSolidAngle = _static_cast<scalar_type>(0.0);
+      const vector4_type bxdfPdfAtVertex      = math::conditionalAbsOrMax(_receiverWasBSDF,
+         dots * hlsl::rsqrt<vector4_type>(lenSq),
+         hlsl::promote<vector4_type>(minimumProjSolidAngle));
+
+      return create(shape, sa, bxdfPdfAtVertex, _receiverNormal);
+   }
+
+   // returns a normalized 3D direction in the local frame
+   codomain_type generateNormalizedLocal(const domain_type u, NBL_REF_ARG(cache_type) cache, NBL_REF_ARG(scalar_type) hitDist) NBL_CONST_MEMBER_FUNC
+   {
+      const vector2_type warped = bilinearPatch.generate(u, cache.bilinearCache);
+      typename SphericalRectangle<scalar_type>::cache_type sphrectCache; // there's nothing in the cache
+      const vector3_type dir = sphrect.generateNormalizedLocal(warped, sphrectCache, hitDist);
+      NBL_IF_CONSTEXPR(!UsePdfAsWeight)
+         cache.L = hlsl::mul(hlsl::transpose(sphrect.basis), dir);
+      return dir;
+   }
+
+   // returns a unnormalized 3D direction in the global frame
+   codomain_type generateUnnormalized(const domain_type u, NBL_REF_ARG(cache_type) cache) NBL_CONST_MEMBER_FUNC
+   {
+      const vector2_type warped = bilinearPatch.generate(u, cache.bilinearCache);
+      typename SphericalRectangle<scalar_type>::cache_type sphrectCache; // there's nothing in the cache
+      const vector3_type dir = sphrect.generateUnnormalized(warped, sphrectCache);
+      NBL_IF_CONSTEXPR(!UsePdfAsWeight)
+         cache.L = dir * hlsl::rsqrt(hlsl::dot(dir, dir));
+      return dir;
+   }
+
+   // returns a normalized 3D direction in the global frame
+   codomain_type generate(const domain_type u, NBL_REF_ARG(cache_type) cache) NBL_CONST_MEMBER_FUNC
+   {
+      const vector2_type warped = bilinearPatch.generate(u, cache.bilinearCache);
+      typename SphericalRectangle<scalar_type>::cache_type sphrectCache; // there's nothing in the cache
+      const vector3_type dir = sphrect.generate(warped, sphrectCache);
+      NBL_IF_CONSTEXPR(!UsePdfAsWeight)
+      cache.L = dir;
+      return dir;
+   }
+
+   density_type forwardPdf(const domain_type u, const cache_type cache) NBL_CONST_MEMBER_FUNC
+   {
+      return bilinearPatch.forwardPdf(u, cache.bilinearCache) / sphrect.solidAngle;
+   }
+
+   weight_type forwardWeight(const domain_type u, const cache_type cache) NBL_CONST_MEMBER_FUNC
+   {
+      NBL_IF_CONSTEXPR(UsePdfAsWeight)
+         return forwardPdf(u, cache);
+      return backwardWeight(cache.L);
+   }
+
+   weight_type backwardWeight(const codomain_type L) NBL_CONST_MEMBER_FUNC
+   {
+      NBL_IF_CONSTEXPR(UsePdfAsWeight)
+      {
+#if 0
+			const vector2_type warped = sphrect.generateInvese(L); // TODO: implement `generateInverse`
+			return bilinearPatch.backwardPdf(warped) / sphrect.solidAngle;
+#endif
+         return hlsl::numeric_limits<weight_type>::quiet_NaN;
+      }
+      // make the MIS weight always abs because even when receiver is a BRDF, the samples in lower hemisphere will get killed and MIS weight never used
+      return hlsl::abs(hlsl::dot(L, receiverNormal)) / projSolidAngle;
+   }
+
+   sampling::SphericalRectangle<T> sphrect;
+   Bilinear<scalar_type> bilinearPatch;
+   // TODO: same as projected triangle w.r.t. UsePdfAsWeight==false
+   vector3_type receiverNormal;
+   scalar_type projSolidAngle;
 };
 
 } // namespace sampling
diff --git a/include/nbl/builtin/hlsl/sampling/projected_spherical_triangle.hlsl b/include/nbl/builtin/hlsl/sampling/projected_spherical_triangle.hlsl
index c4bc5fcea8..6eddd03e8a 100644
--- a/include/nbl/builtin/hlsl/sampling/projected_spherical_triangle.hlsl
+++ b/include/nbl/builtin/hlsl/sampling/projected_spherical_triangle.hlsl
@@ -47,73 +47,68 @@ struct ProjectedSphericalTriangle
 
     struct cache_type
     {
-        scalar_type abs_cos_theta;
-        vector2_type warped;
         typename Bilinear<scalar_type>::cache_type bilinearCache;
+        vector3_type L; // TODO: erase when UsePdfAsWeight==false
     };
 
-    // NOTE: produces a degenerate (all-zero) bilinear patch when the receiver normal faces away
-    // from all three triangle vertices, resulting in NaN PDFs (0 * inf). Callers must ensure
-    // at least one vertex has positive projection onto the receiver normal.
+    // Shouldn't produce NAN if all corners have 0 proj solid angle due to min density adds/clamps in the linear sampler
     static ProjectedSphericalTriangle<T,UsePdfAsWeight> create(NBL_REF_ARG(shapes::SphericalTriangle<T>) shape, const vector3_type _receiverNormal, const bool _receiverWasBSDF)
     {
         ProjectedSphericalTriangle<T,UsePdfAsWeight> retval;
-       retval.sphtri = SphericalTriangle<T, UsePdfAsWeight>::create(shape);
-        retval.receiverNormal = _receiverNormal;
-        retval.receiverWasBSDF = _receiverWasBSDF;
+        retval.sphtri = SphericalTriangle<T>::create(shape);
 
         const scalar_type minimumProjSolidAngle = 0.0;
         matrix<T, 3, 3> m = matrix<T, 3, 3>(shape.vertices[0], shape.vertices[1], shape.vertices[2]);
         const vector3_type bxdfPdfAtVertex = math::conditionalAbsOrMax(_receiverWasBSDF, hlsl::mul(m, _receiverNormal), hlsl::promote<vector3_type>(minimumProjSolidAngle));
         retval.bilinearPatch = Bilinear<scalar_type>::create(bxdfPdfAtVertex.yyxz);
 
-        const scalar_type projSA = shape.projectedSolidAngle(_receiverNormal);
-        retval.rcpProjSolidAngle = projSA > scalar_type(0.0) ? scalar_type(1.0) / projSA : scalar_type(0.0);
+        NBL_IF_CONSTEXPR(!UsePdfAsWeight)
+        {
+            retval.receiverNormal = _receiverNormal;
+            // prevent division of 0 cosine by 0
+            retval.projSolidAngle = max<scalar_type>(shape.projectedSolidAngle(_receiverNormal),numeric_limits<scalar_type>::min);
+        }
         return retval;
     }
 
     codomain_type generate(const domain_type u, NBL_REF_ARG(cache_type) cache) NBL_CONST_MEMBER_FUNC
     {
-        Bilinear<scalar_type> bilinear = bilinearPatch;
-        cache.warped = bilinear.generate(u, cache.bilinearCache);
-        typename SphericalTriangle<scalar_type>::cache_type sphtriCache;
-        const vector3_type L = sphtri.generate(cache.warped, sphtriCache);
-        cache.abs_cos_theta = bilinear.forwardWeight(u, cache.bilinearCache);
+        const vector2_type warped = bilinearPatch.generate(u, cache.bilinearCache);
+        typename SphericalTriangle<scalar_type>::cache_type sphtriCache; // PDF is constant caches nothing, its empty
+        const codomain_type L = sphtri.generate(warped, sphtriCache);
+        NBL_IF_CONSTEXPR(!UsePdfAsWeight)
+            cache.L = L;
         return L;
     }
 
     density_type forwardPdf(const domain_type u, const cache_type cache) NBL_CONST_MEMBER_FUNC
     {
-        return sphtri.rcpSolidAngle * bilinearPatch.forwardPdf(u, cache.bilinearCache);
+        return sphtri.rcpSolidAngle * bilinearPatch.forwardPdf(u,cache.bilinearCache);
     }
 
     weight_type forwardWeight(const domain_type u, const cache_type cache) NBL_CONST_MEMBER_FUNC
     {
-        if (UsePdfAsWeight)
-            return forwardPdf(u, cache);
-        return cache.abs_cos_theta * rcpProjSolidAngle;
-    }
-
-    density_type backwardPdf(const codomain_type L) NBL_CONST_MEMBER_FUNC
-    {
-        const vector2_type u = sphtri.generateInverse(L);
-        return sphtri.rcpSolidAngle * bilinearPatch.backwardPdf(u);
+        NBL_IF_CONSTEXPR (UsePdfAsWeight)
+            return forwardPdf(u,cache);
+        return backwardWeight(cache.L);
     }
 
     weight_type backwardWeight(const codomain_type L) NBL_CONST_MEMBER_FUNC
     {
-        NBL_IF_CONSTEXPR(UsePdfAsWeight)
-            return backwardPdf(L);
-        const scalar_type minimumProjSolidAngle = 0.0;
-        const scalar_type abs_cos_theta = math::conditionalAbsOrMax(receiverWasBSDF, hlsl::dot(receiverNormal, L), minimumProjSolidAngle);
-        return abs_cos_theta * rcpProjSolidAngle;
+        NBL_IF_CONSTEXPR (UsePdfAsWeight)
+        {
+            const vector2_type u = sphtri.generateInverse(L);
+            return sphtri.rcpSolidAngle * bilinearPatch.backwardPdf(u);
+        }
+        // make the MIS weight always abs because even when receiver is a BRDF, the samples in lower hemisphere will get killed and MIS weight never used
+        return hlsl::abs(hlsl::dot(L,receiverNormal))/projSolidAngle;
     }
 
-    sampling::SphericalTriangle<T, UsePdfAsWeight> sphtri;
+    sampling::SphericalTriangle<T> sphtri;
     Bilinear<scalar_type> bilinearPatch;
-    scalar_type rcpProjSolidAngle;
+    // TODO: erase when UsePdfAsWeight==false
     vector3_type receiverNormal;
-    bool receiverWasBSDF;
+    scalar_type projSolidAngle;
 };
 
 } // namespace sampling
diff --git a/include/nbl/builtin/hlsl/sampling/spherical_rectangle.hlsl b/include/nbl/builtin/hlsl/sampling/spherical_rectangle.hlsl
index 131cc92d70..ba088a0e29 100644
--- a/include/nbl/builtin/hlsl/sampling/spherical_rectangle.hlsl
+++ b/include/nbl/builtin/hlsl/sampling/spherical_rectangle.hlsl
@@ -19,156 +19,212 @@ namespace hlsl
 namespace sampling
 {
 
-template<typename T>
+// UseRealSinCos=true  (default): math::sincos uses real sin+cos; full precision near au=n*pi, Jacobian test passes.
+// UseRealSinCos=false          : math::sincos uses cos + sqrt(1-c*c); saves one special-function op but loses mantissa as |cos_au| -> 1.
+template<typename T, bool UseRealSinCos = true>
 struct SphericalRectangle
 {
-    using scalar_type = T;
-    using vector2_type = vector<T, 2>;
-    using vector3_type = vector<T, 3>;
-    using vector4_type = vector<T, 4>;
-
-    // BackwardTractableSampler concept types
-    using domain_type = vector2_type;
-    using codomain_type = vector3_type;
-    using density_type = scalar_type;
-    using weight_type = density_type;
-
-    struct cache_type {};
-
-    NBL_CONSTEXPR_STATIC_INLINE scalar_type ClampEps = 1e-5;
-
-    static SphericalRectangle<T> create(NBL_CONST_REF_ARG(shapes::SphericalRectangle<T>) rect, const vector3_type observer)
-    {
-        return create(rect.solidAngle(observer), rect.extents);
-    }
-
-    static SphericalRectangle<T> create(NBL_CONST_REF_ARG(typename shapes::SphericalRectangle<T>::solid_angle_type) sa, const vector2_type _extents)
-    {
-        SphericalRectangle<T> retval;
-        retval.r0 = sa.r0;
-        retval.extents = _extents;
-
-        retval.solidAngle = sa.value;
-        retval.b0 = sa.n_z[0];
-        retval.b1 = sa.n_z[2];
-
-        math::sincos_accumulator<scalar_type> angle_adder = math::sincos_accumulator<scalar_type>::create(sa.cosGamma[2]);
-        angle_adder.addCosine(sa.cosGamma[3]);
-        retval.k = scalar_type(2.0) * numbers::pi<scalar_type> - angle_adder.getSumOfArccos();
-
-        return retval;
-    }
-
-    // Create directly from a local-frame corner position and rectangle extents.
-    // Use when you already know r0 (e.g. from a gnomonic projection) and don't
-    // need the shapes::SphericalRectangle + solidAngle(observer) roundtrip.
-    static SphericalRectangle<T> create(const vector3_type _r0, const vector2_type _extents)
-    {
-        // Same math as shapes::SphericalRectangle::solidAngle() but without
-        // the mul(basis, origin - observer) step since we already have r0.
-        typename shapes::SphericalRectangle<T>::solid_angle_type sa;
-        sa.r0 = _r0;
-
-        const scalar_type zSq = _r0.z * _r0.z;
-        const vector4_type denorm_n_z = vector4_type(-_r0.y, _r0.x + _extents.x, _r0.y + _extents.y, -_r0.x);
-        sa.n_z = denorm_n_z * hlsl::rsqrt<vector4_type>(hlsl::promote<vector4_type>(zSq) + denorm_n_z * denorm_n_z);
-        sa.cosGamma = vector4_type(
-            -sa.n_z[0] * sa.n_z[1], -sa.n_z[1] * sa.n_z[2],
-            -sa.n_z[2] * sa.n_z[3], -sa.n_z[3] * sa.n_z[0]);
-
-        math::sincos_accumulator<scalar_type> acc = math::sincos_accumulator<scalar_type>::create(sa.cosGamma[0]);
-        acc.addCosine(sa.cosGamma[1]);
-        acc.addCosine(sa.cosGamma[2]);
-        acc.addCosine(sa.cosGamma[3]);
-        sa.value = acc.getSumOfArccos() - scalar_type(2.0) * numbers::pi<scalar_type>;
-
-        return create(sa, _extents);
-    }
-
-    // shared core of generate and generateSurfaceOffset
-    // returns (xu, hv, d) packed into a vector3; caller derives either 2D offset or 3D direction
-    vector3_type __generate(const domain_type u) NBL_CONST_MEMBER_FUNC
-    {
-        // algorithm needs r0.z < 0; use -abs(r0.z) without storing the flip
-        const scalar_type negAbsR0z = -hlsl::abs(r0.z);
-        const scalar_type r0zSq = r0.z * r0.z;
-        const vector2_type r1 = vector2_type(r0.x + extents.x, r0.y + extents.y);
-
-        const scalar_type au = u.x * solidAngle + k;
-        const scalar_type cos_au = hlsl::cos<scalar_type>(au);
-        const scalar_type numerator = b1 - cos_au * b0;
-        // (1-cos)*(1+cos) avoids catastrophic cancellation of 1-cos^2 when cos_au is near +/-1
-        const scalar_type sin_au_sq = (scalar_type(1.0) - cos_au) * (scalar_type(1.0) + cos_au);
-        const scalar_type absNegFu = hlsl::abs(numerator) * hlsl::rsqrt<scalar_type>(sin_au_sq);
-        const scalar_type rcpCu_2 = hlsl::max<scalar_type>(absNegFu * absNegFu + b0 * b0, scalar_type(1.0));
-        // sign(negFu) = sign(numerator) * sign(sin(au)); sin(au) < 0 iff au > PI
-        const scalar_type negFuSign = hlsl::select((au > numbers::pi<scalar_type>) != (numerator < scalar_type(0.0)), scalar_type(-1.0), scalar_type(1.0));
-        scalar_type xu = negAbsR0z * negFuSign * hlsl::rsqrt<scalar_type>(rcpCu_2 - scalar_type(1.0));
-        xu = hlsl::clamp<scalar_type>(xu, r0.x, r1.x); // avoid Infs
-        const scalar_type d_2 = xu * xu + r0zSq;
-        const scalar_type d = hlsl::sqrt<scalar_type>(d_2);
-
-        const scalar_type h0 = r0.y * hlsl::rsqrt<scalar_type>(d_2 + r0.y * r0.y);
-        const scalar_type h1 = r1.y * hlsl::rsqrt<scalar_type>(d_2 + r1.y * r1.y);
-        const scalar_type hv = h0 + u.y * (h1 - h0);
-
-        return vector3_type(xu, hv, d);
-    }
-
-    // returns a normalized 3D direction in the local frame with correct r0.z sign
-    codomain_type generate(const domain_type u, NBL_REF_ARG(cache_type) cache) NBL_CONST_MEMBER_FUNC
-    {
-        const vector3_type core = __generate(u);
-        const scalar_type xu = core.x;
-        const scalar_type hv = core.y;
-        const scalar_type d = core.z;
-        const scalar_type hv2 = hv * hv;
-        const scalar_type cosElevation = hlsl::sqrt<scalar_type>(hlsl::max<scalar_type>(scalar_type(1.0) - hv2, scalar_type(0.0)));
-        const scalar_type rcpD = scalar_type(1.0) / d;
-
-        return vector3_type(xu * cosElevation * rcpD, hv, r0.z * cosElevation * rcpD);
-    }
-
-    // returns a 2D offset on the rectangle surface from the r0 corner
-    vector2_type generateSurfaceOffset(const domain_type u, NBL_REF_ARG(cache_type) cache) NBL_CONST_MEMBER_FUNC
-    {
-        const vector3_type core = __generate(u);
-        const scalar_type xu = core.x;
-        const scalar_type hv = core.y;
-        const scalar_type d = core.z;
-        const scalar_type r1y = r0.y + extents.y;
-        const scalar_type hv2 = hv * hv;
-        const scalar_type yv = hlsl::mix(r1y, (hv * d) / hlsl::sqrt<scalar_type>(scalar_type(1.0) - hv2), hv2 < scalar_type(1.0) - ClampEps);
-
-        return vector2_type((xu - r0.x), (yv - r0.y));
-    }
-
-    density_type forwardPdf(const domain_type u, const cache_type cache) NBL_CONST_MEMBER_FUNC
-    {
-        return scalar_type(1.0) / solidAngle;
-    }
-
-    weight_type forwardWeight(const domain_type u, const cache_type cache) NBL_CONST_MEMBER_FUNC
-    {
-        return forwardPdf(u, cache);
-    }
-
-    density_type backwardPdf(const codomain_type L) NBL_CONST_MEMBER_FUNC
-    {
-        return scalar_type(1.0) / solidAngle;
-    }
-
-    weight_type backwardWeight(const codomain_type L) NBL_CONST_MEMBER_FUNC
-    {
-        return backwardPdf(L);
-    }
-
-    scalar_type solidAngle;
-    scalar_type k;
-    scalar_type b0;
-    scalar_type b1;
-    vector3_type r0;
-    vector2_type extents;
+   using scalar_type    = T;
+   using vector2_type   = vector<T, 2>;
+   using vector3_type   = vector<T, 3>;
+   using vector4_type   = vector<T, 4>;
+   using matrix3x3_type = matrix<T, 3, 3>;
+
+   // BackwardTractableSampler concept types
+   using domain_type   = vector2_type;
+   using codomain_type = vector3_type;
+   using density_type  = scalar_type;
+   using weight_type   = density_type;
+
+   struct cache_type
+   {
+   };
+
+   static SphericalRectangle<T, UseRealSinCos> create(NBL_CONST_REF_ARG(shapes::SphericalRectangle<T>) rect, const vector3_type observer)
+   {
+      return create(rect.basis, rect.solidAngle(observer), rect.extents);
+   }
+
+   static SphericalRectangle<T, UseRealSinCos> create(const matrix3x3_type _basis, NBL_CONST_REF_ARG(typename shapes::SphericalRectangle<T>::solid_angle_type) sa, const vector2_type _extents)
+   {
+      SphericalRectangle<T, UseRealSinCos> retval;
+      retval.basis   = _basis;
+      retval.r0      = sa.r0;
+      retval.extents = _extents;
+
+      retval.solidAngle = sa.value;
+      retval.b0         = sa.n_z[0];
+      retval.b1         = sa.n_z[2];
+
+      math::sincos_accumulator<scalar_type> angle_adder = math::sincos_accumulator<scalar_type>::create(sa.cosGamma[2]);
+      angle_adder.addCosine(sa.cosGamma[3]);
+      retval.k = _static_cast<scalar_type>(2.0) * numbers::pi<scalar_type> - angle_adder.getSumOfArccos();
+
+      return retval;
+   }
+
+   // Create directly from a local-frame corner position and rectangle extents.
+   // Use when you already know r0 (e.g. from a gnomonic projection) and don't
+   // need the shapes::SphericalRectangle + solidAngle(observer) roundtrip.
+   static SphericalRectangle<T, UseRealSinCos> create(const matrix3x3_type _basis, const vector3_type _r0, const vector2_type _extents)
+   {
+      // Same math as shapes::SphericalRectangle::solidAngle() but without
+      // the mul(basis, origin - observer) step since we already have r0.
+      typename shapes::SphericalRectangle<T>::solid_angle_type sa;
+      sa.r0 = _r0;
+
+      const scalar_type zSq         = _r0.z * _r0.z;
+      const vector4_type denorm_n_z = vector4_type(-_r0.y, _r0.x + _extents.x, _r0.y + _extents.y, -_r0.x);
+      sa.n_z                        = denorm_n_z * hlsl::rsqrt<vector4_type>(hlsl::promote<vector4_type>(zSq) + denorm_n_z * denorm_n_z);
+      sa.cosGamma                   = vector4_type(
+         -sa.n_z[0] * sa.n_z[1], -sa.n_z[1] * sa.n_z[2],
+         -sa.n_z[2] * sa.n_z[3], -sa.n_z[3] * sa.n_z[0]);
+
+      math::sincos_accumulator<scalar_type> acc = math::sincos_accumulator<scalar_type>::create(sa.cosGamma[0]);
+      acc.addCosine(sa.cosGamma[1]);
+      acc.addCosine(sa.cosGamma[2]);
+      acc.addCosine(sa.cosGamma[3]);
+      sa.value = acc.getSumOfArccos() - _static_cast<scalar_type>(2.0) * numbers::pi<scalar_type>;
+
+      return create(_basis, sa, _extents);
+   }
+
+   // shared core of generate and generateSurfaceOffset
+   // returns (xu, hv, d) packed into a vector3; caller derives either 2D offset or 3D direction
+   struct SCommonGen
+   {
+      scalar_type xu;
+      scalar_type d2;
+      scalar_type hv;
+      scalar_type cosElevation2;
+   };
+   SCommonGen __generate(const domain_type u) NBL_CONST_MEMBER_FUNC
+   {
+      SCommonGen retval;
+
+      // algorithm needs r0.z < 0; use -abs(r0.z) without storing the flip
+      const scalar_type negAbsR0z = -hlsl::abs(r0.z);
+      const scalar_type r0zSq     = r0.z * r0.z;
+      const vector2_type r1       = vector2_type(r0.x + extents.x, r0.y + extents.y);
+
+      // au in [0, 4*pi] since u.x in [0,1], solidAngle <= 2*pi, k <= 2*pi.
+      // The sqrt path in math::sincos recovers sin sign via sign(theta), which only holds for theta in [-pi,pi].
+      // Real sin/cos handle any range, so only wrap on the sqrt path (compile-time branch folds away).
+      scalar_type au = u.x * solidAngle + k;
+      NBL_IF_CONSTEXPR(!UseRealSinCos)
+      {
+         // au in [0, 4*pi] -> peel at most two 2*pi periods to land in (-pi, pi].
+         au = hlsl::select(au > numbers::pi<scalar_type>, au - _static_cast<scalar_type>(2.0) * numbers::pi<scalar_type>, au);
+         au = hlsl::select(au > numbers::pi<scalar_type>, au - _static_cast<scalar_type>(2.0) * numbers::pi<scalar_type>, au);
+      }
+      scalar_type sin_au, cos_au;
+      math::sincos<scalar_type, UseRealSinCos>(au, sin_au, cos_au);
+      const scalar_type numerator = b1 - cos_au * b0;
+      // negFu carries the sign directly (numerator and sin_au both signed), so xu's sign drops
+      // out of a single multiply + hlsl::sign.
+      const scalar_type negFu   = numerator / sin_au;
+      const scalar_type rcpCu_2 = hlsl::max<scalar_type>(negFu * negFu + b0 * b0, _static_cast<scalar_type>(1.0));
+      retval.xu                 = negAbsR0z * hlsl::sign(negFu) * hlsl::rsqrt<scalar_type>(rcpCu_2 - _static_cast<scalar_type>(1.0));
+      retval.xu = hlsl::clamp<scalar_type>(retval.xu, r0.x, r1.x); // avoid Infs
+      retval.d2 = retval.xu * retval.xu + r0zSq;
+
+      const scalar_type h0 = r0.y * hlsl::rsqrt<scalar_type>(retval.d2 + r0.y * r0.y);
+      const scalar_type h1 = r1.y * hlsl::rsqrt<scalar_type>(retval.d2 + r1.y * r1.y);
+      retval.hv            = h0 + u.y * (h1 - h0);
+      retval.cosElevation2 = _static_cast<scalar_type>(1.0) - hlsl::min<scalar_type>(retval.hv * retval.hv, 1);
+
+      return retval;
+   }
+
+   // returns a normalized 3D direction in the local frame with correct r0.z sign
+   vector3_type generateNormalizedLocal(const domain_type u, NBL_REF_ARG(cache_type) cache, NBL_REF_ARG(scalar_type) hitDist) NBL_CONST_MEMBER_FUNC
+   {
+      const SCommonGen core         = __generate(u);
+      scalar_type cosElevationOverD = hlsl::rsqrt(core.d2 / core.cosElevation2);
+      // TODO: or shall we do some other more sophisticated clamp or correction? Is this even the right one to use?
+      cosElevationOverD = hlsl::select(hlsl::isnan(cosElevationOverD), 1.f, cosElevationOverD);
+
+      // TODO: investigate if due to precision we need to compute this as a `sqrt` then the quantity being computed is `core.cosElevation2/core.d2`
+      // which what alss `generateLocalBasisXY` needs, in which case `__generate` can already compute it and return it as `hitDist2`
+      hitDist = 1.f / cosElevationOverD;
+
+      const vector3_type retval = vector3_type(core.xu / hitDist, core.hv, r0.z / hitDist);
+      assert(!hlsl::isnan(computeHitT(retval)));
+      return retval;
+   }
+
+   codomain_type generate(const domain_type u, NBL_REF_ARG(cache_type) cache) NBL_CONST_MEMBER_FUNC
+   {
+      scalar_type dummy;
+      const vector3_type localL = generateNormalizedLocal(u, cache, dummy);
+      // could return `hlsl::mul(hlsl::tranpose(basis),localL)` or just this
+      return basis[0] * localL[0] + basis[1] * localL[1] + basis[2] * localL[2];
+   }
+
+   // utility to determine maxT for a ray from L shot from the origin which we're sure intersects the rectangle
+   scalar_type computeHitT(const vector3_type L)
+   {
+      const scalar_type retval = hlsl::abs(r0.z / hlsl::dot(L, basis[2]));
+      {
+         const vector3_type hitPointRelative = L * retval;
+         const vector2_type uv               = mul(basis, L).xy - r0.xy;
+         assert(uv[0] >= 0.0 && uv[0] <= rectExtents[0]);
+         assert(uv[1] >= 0.0 && uv[1] <= rectExtents[1]);
+      }
+      return retval;
+   }
+
+   // returns a 2D offset on the rectangle surface including the r0 corner -useful for generating unnormalized worldspace L
+   vector2_type generateLocalBasisXY(const domain_type u, NBL_REF_ARG(cache_type) cache) NBL_CONST_MEMBER_FUNC
+   {
+      const SCommonGen core = __generate(u);
+      const scalar_type r1y = r0.y + extents.y;
+      // TODO: see if we can compute this direct from definition of `d_2` ignoring the clamp on `xu`  and `cosElevation2`
+      const scalar_type yv = core.hv * hlsl::rsqrt(core.cosElevation2 / core.d2);
+
+      // fun fact, when one of the operands to min or max is NaN, the SPIR-V builtin will select the other one
+      // TODO: maybe try just `min(yv,ry1)`
+      const vector2_type retval = vector2_type(core.xu, hlsl::clamp(yv, r0.y, r1y));
+      assert(retval[0] >= r0.x && retval[1] >= r0.y);
+      assert(retval[0] <= r0.x + rectExtents[0] && retval[1] <= r0.y + rectExtents[1]);
+      return retval;
+   }
+
+   // its basically the hitpoint minus the observer origin
+   codomain_type generateUnnormalized(const domain_type u, NBL_REF_ARG(cache_type) cache) NBL_CONST_MEMBER_FUNC
+   {
+      const vector2_type localXY = generateLocalBasisXY(u, cache);
+      // the `localXY` already contains r0.xy
+      return basis[0] * localXY[0] + basis[1] * localXY[1] + basis[2] * r0.z;
+   }
+
+   density_type forwardPdf(const domain_type u, const cache_type cache) NBL_CONST_MEMBER_FUNC
+   {
+      return _static_cast<scalar_type>(1.0) / solidAngle;
+   }
+
+   weight_type forwardWeight(const domain_type u, const cache_type cache) NBL_CONST_MEMBER_FUNC
+   {
+      return forwardPdf(u, cache);
+   }
+
+   density_type backwardPdf(const codomain_type L) NBL_CONST_MEMBER_FUNC
+   {
+      return _static_cast<scalar_type>(1.0) / solidAngle;
+   }
+
+   weight_type backwardWeight(const codomain_type L) NBL_CONST_MEMBER_FUNC
+   {
+      return backwardPdf(L);
+   }
+
+   matrix3x3_type basis;
+   vector3_type r0;
+   vector2_type extents;
+   scalar_type solidAngle;
+   scalar_type k;
+   scalar_type b0;
+   scalar_type b1;
 };
 
 } // namespace sampling
diff --git a/include/nbl/builtin/hlsl/sampling/spherical_triangle.hlsl b/include/nbl/builtin/hlsl/sampling/spherical_triangle.hlsl
index 6f29582e04..b915a55de7 100644
--- a/include/nbl/builtin/hlsl/sampling/spherical_triangle.hlsl
+++ b/include/nbl/builtin/hlsl/sampling/spherical_triangle.hlsl
@@ -11,6 +11,7 @@
 #include <nbl/builtin/hlsl/math/quaternions.hlsl>
 #include <nbl/builtin/hlsl/math/fast_acos.hlsl>
 #include <nbl/builtin/hlsl/shapes/spherical_triangle.hlsl>
+#include <nbl/builtin/hlsl/ieee754.hlsl>
 
 namespace nbl
 {
@@ -25,9 +26,6 @@ enum SphericalTriangleAlgorithm : uint16_t
    STA_PBRT = 1
 };
 
-template<typename T, bool Bijective = false, SphericalTriangleAlgorithm Algorithm = STA_ARVO>
-struct SphericalTriangle;
-
 namespace impl
 {
 
@@ -53,9 +51,8 @@ T sumOfProducts(T a, T b, T c, T d)
 
 } // namespace impl
 
-// Non-bijective: Resamplable & Tractable (generate + pdf/weight, no inverse)
-template<typename T, SphericalTriangleAlgorithm Algorithm>
-struct SphericalTriangle<T, false, Algorithm>
+template<typename T, SphericalTriangleAlgorithm Algorithm = STA_ARVO>
+struct SphericalTriangle
 {
    using scalar_type = T;
    using vector2_type = vector<T, 2>;
@@ -73,22 +70,23 @@ struct SphericalTriangle<T, false, Algorithm>
    static SphericalTriangle create(NBL_CONST_REF_ARG(shapes::SphericalTriangle<T>) tri)
    {
       SphericalTriangle retval;
-      retval.rcpSolidAngle = scalar_type(1.0) / tri.solid_angle;
+      retval.rcpSolidAngle = _static_cast<scalar_type>(1.0) / tri.solid_angle;
       retval.tri_vertices[0] = tri.vertices[0];
       retval.tri_vertices[1] = tri.vertices[1];
-      retval.triCosC = tri.cos_sides[2];
+      retval.triCosc = tri.cos_sides[2];
       // precompute great circle normal of arc AC: cross(A,C) has magnitude sin(b),
       // so multiplying by csc(b) normalizes it; zero when side AC is degenerate
-      const scalar_type cscB = tri.csc_sides[1];
-      const vector3_type arcACPlaneNormal = hlsl::cross(tri.vertices[0], tri.vertices[2]) * hlsl::select(cscB < numeric_limits<scalar_type>::max, cscB, scalar_type(0));
+      const scalar_type cscb = tri.csc_sides[1];
+      const vector3_type arcACPlaneNormal = hlsl::cross(tri.vertices[0], tri.vertices[2]) * hlsl::select(cscb < numeric_limits<scalar_type>::max, cscb, _static_cast<scalar_type>(0));
       retval.e_C = hlsl::cross(arcACPlaneNormal, tri.vertices[0]);
       retval.cosA = tri.cos_vertices[0];
       retval.sinA = tri.sin_vertices[0];
       if (Algorithm == STA_ARVO)
       {
-         retval.sinA_triCosC = retval.sinA * retval.triCosC;
+         retval.sinA_triCosc = retval.sinA * retval.triCosc;
          retval.eCdotB = hlsl::dot(retval.e_C, tri.vertices[1]);
       }
+      retval.APlusC = tri.vertices[0] + tri.vertices[2];
       return retval;
    }
 
@@ -106,36 +104,37 @@ struct SphericalTriangle<T, false, Algorithm>
       if (Algorithm == STA_ARVO) // faster than PBRT
       {
          const scalar_type u_ = t - cosA;
-         const scalar_type v_ = s + sinA_triCosC;
+         const scalar_type v_ = s + sinA_triCosc;
          const scalar_type num = (v_ * t - u_ * s) * cosA - v_;
          const scalar_type denum = (v_ * s + u_ * t) * sinA;
 
+#define ACCURATE 1
 #if ACCURATE
          // sqrt(1 - cosBp^2) loses precision when cosBp ~ 1 (small u.x).
          // Use stable factorization: sinBp = sqrt((denum-num)(denum+num)) / |denum|
-         // where denum-num = sinA*(1+triCosC)*(1-cosA_hat).
+         // where denum-num = sinA*(1+triCosc)*(1-cosA_hat).
 
          // For large triangles with high u.x, cosA_hat can approach -1,
          // making (1 + cosA_hat) near zero and the division unstable.
          // Use the algebraic identity only when cosA_hat > 0 (safe denominator).
-         const scalar_type rcpDenum = scalar_type(1) / denum;
-         const scalar_type oneMinusCosAhat = hlsl::select(cosA_hat > scalar_type(0), sinA_hat * sinA_hat / (scalar_type(1) + cosA_hat), scalar_type(1) - cosA_hat);
-         const scalar_type DminusN = sinA * (scalar_type(1) + triCosC) * oneMinusCosAhat;
-         sinBp = sqrt<scalar_type>(max<scalar_type>(scalar_type(0), DminusN * (denum + num))) * nbl::hlsl::abs(rcpDenum);
-         cosBp = scalar_type(1) - DminusN * rcpDenum;
+         const scalar_type rcpDenum = _static_cast<scalar_type>(1) / denum;
+         const scalar_type oneMinusCosAhat = hlsl::select(cosA_hat > _static_cast<scalar_type>(0), sinA_hat * sinA_hat / (_static_cast<scalar_type>(1) + cosA_hat), _static_cast<scalar_type>(1) - cosA_hat);
+         const scalar_type DminusN = sinA * (_static_cast<scalar_type>(1) + triCosc) * oneMinusCosAhat;
+         sinBp = sqrt<scalar_type>(max<scalar_type>(_static_cast<scalar_type>(0), DminusN * (denum + num))) * nbl::hlsl::abs(rcpDenum);
+         cosBp = _static_cast<scalar_type>(1) - DminusN * rcpDenum;
 #else // 17% faster, less accurate
          cosBp = num / denum;
-         sinBp = sqrt<scalar_type>(max<scalar_type>(scalar_type(0), scalar_type(1) - cosBp * cosBp));
+         sinBp = sqrt<scalar_type>(max<scalar_type>(_static_cast<scalar_type>(0), _static_cast<scalar_type>(1) - cosBp * cosBp));
 #endif
       }
       else // STA_PBRT, accurate, slowest
       {
          // PBRT uses cosPhi = -t, sinPhi = -s (pi offset from Arvo's A_hat)
          const scalar_type k1 = -t + cosA;
-         const scalar_type k2 = -s - sinA * triCosC;
+         const scalar_type k2 = -s - sinA * triCosc;
          cosBp = (k2 + impl::differenceOfProducts(k2, -t, k1, -s) * cosA) / (impl::sumOfProducts(k2, -s, k1, -t) * sinA);
-         cosBp = nbl::hlsl::clamp(cosBp, scalar_type(-1), scalar_type(1));
-         sinBp = sqrt<scalar_type>(max<scalar_type>(scalar_type(0), scalar_type(1) - cosBp * cosBp));
+         cosBp = nbl::hlsl::clamp(cosBp, _static_cast<scalar_type>(-1), _static_cast<scalar_type>(1));
+         sinBp = sqrt<scalar_type>(_static_cast<scalar_type>(1) - cosBp * cosBp);
       }
 
       // Step 3: construct C' on the great circle through A toward C
@@ -143,81 +142,24 @@ struct SphericalTriangle<T, false, Algorithm>
 
       // Step 4: uniformly sample the great circle arc from B to C'
       scalar_type cosCpB;
-      if (Algorithm == STA_ARVO)
-         cosCpB = cosBp * triCosC + sinBp * eCdotB;
+      NBL_IF_CONSTEXPR(Algorithm == STA_ARVO)
+         cosCpB = cosBp * triCosc + sinBp * eCdotB;
       else
          cosCpB = nbl::hlsl::dot(cp, tri_vertices[1]);
-      const scalar_type z = scalar_type(1) - u.y * (scalar_type(1) - cosCpB);
-      const scalar_type sinZ = sqrt<scalar_type>(max<scalar_type>(scalar_type(0), scalar_type(1) - z * z));
+      // TODO: degeneracy at u.y = 0. z = 1 - u.y*(1-cosCpB) makes sinZ = sqrt(1-z^2) behave like
+      // sqrt(u.y) near zero, so dL/du.y diverges as u.y^(-1/2) and every higher derivative diverges
+      // faster. The forward Jacobian test in 37_HLSLSamplingTests reports ~2-8% error at u.y < 0.003
+      // even with the O(h^2) one-sided stencil because the third-derivative term dominates. At
+      // u.y = 0 exactly, L collapses to vertex B for all u.x (|det J| = 0), so it's an intrinsic
+      // property of the Arvo parameterization, not a bug. Fix: rework the arc interpolation to use
+      // a u.y -> angle mapping whose derivatives stay bounded near u.y = 0 (e.g. acos(z) = angle
+      // from B, then sample arc-length linearly), so the Jacobian is smooth and the skip band in
+      // the tester can be removed.
+      const scalar_type z = _static_cast<scalar_type>(1) - u.y * (_static_cast<scalar_type>(1) - cosCpB);
+      const scalar_type sinZ = sqrt<scalar_type>(max<scalar_type>(_static_cast<scalar_type>(0), _static_cast<scalar_type>(1) - z * z));
       return z * tri_vertices[1] + sinZ * hlsl::normalize(cp - cosCpB * tri_vertices[1]);
    }
 
-   density_type forwardPdf(const domain_type u, const cache_type cache) NBL_CONST_MEMBER_FUNC
-   {
-      return rcpSolidAngle;
-   }
-
-   weight_type forwardWeight(const domain_type u, const cache_type cache) NBL_CONST_MEMBER_FUNC
-   {
-      return forwardPdf(u, cache);
-   }
-
-   density_type backwardPdf(const codomain_type L) NBL_CONST_MEMBER_FUNC
-   {
-      return rcpSolidAngle;
-   }
-
-   weight_type backwardWeight(const codomain_type L) NBL_CONST_MEMBER_FUNC
-   {
-      return backwardPdf(L);
-   }
-
-   scalar_type rcpSolidAngle;
-   scalar_type cosA;
-   scalar_type sinA;
-   scalar_type sinA_triCosC; // precomputed sinA * triCosC
-   scalar_type eCdotB; // precomputed dot(e_C, tri_vertices[1]), Arvo only
-
-   vector3_type tri_vertices[2]; // A and B only
-   scalar_type triCosC;
-   vector3_type e_C; // precomputed cross(arcACPlaneNormal, A), unit vector perp to A in A-C plane
-};
-
-// Bijective: adds generateInverse, stores extra members for the inverse mapping
-template<typename T, SphericalTriangleAlgorithm Algorithm>
-struct SphericalTriangle<T, true, Algorithm>
-{
-   using scalar_type = T;
-   using vector2_type = vector<T, 2>;
-   using vector3_type = vector<T, 3>;
-
-   using base_type = SphericalTriangle<T, false, Algorithm>;
-   using domain_type = vector2_type;
-   using codomain_type = vector3_type;
-   using density_type = scalar_type;
-   using weight_type = density_type;
-
-   using cache_type = typename base_type::cache_type;
-
-   static SphericalTriangle create(NBL_CONST_REF_ARG(shapes::SphericalTriangle<T>) tri)
-   {
-      SphericalTriangle retval;
-      retval.base = base_type::create(tri);
-      retval.rcpSolidAngle = retval.base.rcpSolidAngle;
-      retval.vertexC = tri.vertices[2];
-      // precompute great circle normal of arc AC (needed for generateInverse)
-      const scalar_type cscB = tri.csc_sides[1];
-      retval.arcACPlaneNormal = hlsl::cross(tri.vertices[0], tri.vertices[2]) * hlsl::select(cscB < numeric_limits<scalar_type>::max, cscB, scalar_type(0));
-      retval.triCscC = tri.csc_sides[2];
-      return retval;
-   }
-
-   codomain_type generate(const domain_type u, NBL_REF_ARG(cache_type) cache) NBL_CONST_MEMBER_FUNC
-   {
-      return base.generate(u, cache);
-   }
-
-
    // generate() works in two steps:
    //   u.x -> pick C' on arc AC (choosing a sub-area fraction)
    //   u.y -> pick L on arc B->C' (linear interpolation)
@@ -229,37 +171,34 @@ struct SphericalTriangle<T, true, Algorithm>
    domain_type generateInverse(const codomain_type L) NBL_CONST_MEMBER_FUNC
    {
       // Step 1: find C' = intersection of great circles (B,L) and (A,C)
-      const vector3_type BxL = nbl::hlsl::cross(base.tri_vertices[1], L);
+      const vector3_type BxL = nbl::hlsl::cross(tri_vertices[1], L);
       const scalar_type sinBL_sq = nbl::hlsl::dot(BxL, BxL);
-      if (sinBL_sq < numeric_limits<scalar_type>::epsilon)
+
+      // C' lies on arc AC, so C' = A*cos(t) + e_C*sin(t).
+      // C' also lies on the B-L plane, so dot(BxL, C') = 0.
+      // Solving: (cos(t), sin(t)) = (tripleE, -tripleA) / R
+      const scalar_type tripleA = nbl::hlsl::dot(BxL, tri_vertices[0]);
+      const scalar_type tripleE = nbl::hlsl::dot(BxL, e_C);
+      const scalar_type R_sq = tripleA * tripleA + tripleE * tripleE;
+
+      if (sinBL_sq < numeric_limits<scalar_type>::epsilon || R_sq < numeric_limits<scalar_type>::epsilon)
       {
-         // L ~ B: u.y ~ 0, u.x is indeterminate (all u.x map to B when u.y=0).
          // Recover u.y from |L-B|^2 / |A-B|^2 (using C'=A; the (1-cosCpB) ratio
          // cancels so any C' gives the same result).
-         const vector3_type LminusB = L - base.tri_vertices[1];
-         const vector3_type AminusB = base.tri_vertices[0] - base.tri_vertices[1];
+         const vector3_type LminusB = L - tri_vertices[1];
+         const vector3_type AminusB = tri_vertices[0] - tri_vertices[1];
          const scalar_type v_num = nbl::hlsl::dot(LminusB, LminusB);
          const scalar_type v_denom = nbl::hlsl::dot(AminusB, AminusB);
          const scalar_type v = hlsl::select(v_denom > numeric_limits<scalar_type>::epsilon,
-            nbl::hlsl::clamp(v_num / v_denom, scalar_type(0.0), scalar_type(1.0)),
-            scalar_type(0.0));
-         return vector2_type(scalar_type(0.0), v);
+            nbl::hlsl::clamp(v_num / v_denom, _static_cast<scalar_type>(0.0), _static_cast<scalar_type>(1.0)),
+            _static_cast<scalar_type>(0.0));
+         return vector2_type(_static_cast<scalar_type>(0.0), v);
       }
 
-      // C' lies on arc AC, so C' = A*cos(t) + e_C*sin(t).
-      // C' also lies on the B-L plane, so dot(BxL, C') = 0.
-      // Solving: (cos(t), sin(t)) = (tripleE, -tripleA) / R
-      const scalar_type tripleA = nbl::hlsl::dot(BxL, base.tri_vertices[0]);
-      const scalar_type tripleE = nbl::hlsl::dot(BxL, base.e_C);
-      const scalar_type R_sq = tripleA * tripleA + tripleE * tripleE;
-      if (R_sq < numeric_limits<scalar_type>::epsilon)
-         return vector2_type(scalar_type(0.0), scalar_type(0.0));
-
-      const scalar_type rcpR = scalar_type(1.0) / nbl::hlsl::sqrt(R_sq);
-      vector3_type cp = base.tri_vertices[0] * (tripleE * rcpR) + base.e_C * (-tripleA * rcpR);
-      // two intersections exist; pick the one on the minor arc A->C
-      if (nbl::hlsl::dot(cp, base.tri_vertices[0] + vertexC) < scalar_type(0.0))
-         cp = -cp;
+      const scalar_type rcpR = _static_cast<scalar_type>(1.0) / nbl::hlsl::sqrt(R_sq);
+      vector3_type cp = tri_vertices[0] * (tripleE * rcpR) + e_C * (-tripleA * rcpR);
+      // two intersections exist; pick the one on the minor arc A->C (branchless sign flip)
+      cp = ieee754::flipSignIfRHSNegative(cp, nbl::hlsl::dot(cp, APlusC));
 
       // Step 2: u.x = solidAngle(A,B,C') / solidAngle(A,B,C)
       // Van Oosterom-Strackee: tan(Omega/2) = |A.(BxC')| / (1 + A.B + B.C' + C'.A)
@@ -270,25 +209,25 @@ struct SphericalTriangle<T, true, Algorithm>
       // Expanding C' = cosBp*A + sinBp*e_C into the triple product:
       //   A.(BxC') = cosBp * A.(BxA) + sinBp * A.(Bxe_C) = sinBp * A.(Bxe_C)
       // since A.(BxA) = 0 identically. This avoids the cancellation.
-      const scalar_type cosBp_inv = nbl::hlsl::dot(cp, base.tri_vertices[0]);
-      const scalar_type sinBp_inv = nbl::hlsl::dot(cp, base.e_C);
-      const scalar_type AxBdotE = nbl::hlsl::dot(base.tri_vertices[0], nbl::hlsl::cross(base.tri_vertices[1], base.e_C));
+      const scalar_type cosBp_inv = nbl::hlsl::dot(cp, tri_vertices[0]);
+      const scalar_type sinBp_inv = nbl::hlsl::dot(cp, e_C);
+      const scalar_type AxBdotE = nbl::hlsl::dot(tri_vertices[0], nbl::hlsl::cross(tri_vertices[1], e_C));
       const scalar_type num = sinBp_inv * AxBdotE;
-      const scalar_type cosCpB = nbl::hlsl::dot(base.tri_vertices[1], cp);
-      const scalar_type den = scalar_type(1.0) + base.triCosC + cosCpB + cosBp_inv;
-      const scalar_type subSolidAngle = scalar_type(2.0) * nbl::hlsl::atan2(nbl::hlsl::abs(num), den);
-      const scalar_type u = nbl::hlsl::clamp(subSolidAngle * rcpSolidAngle, scalar_type(0.0), scalar_type(1.0));
+      const scalar_type cosCpB = nbl::hlsl::dot(tri_vertices[1], cp);
+      const scalar_type den = _static_cast<scalar_type>(1.0) + triCosc + cosCpB + cosBp_inv;
+      const scalar_type subSolidAngle = _static_cast<scalar_type>(2.0) * nbl::hlsl::atan2(nbl::hlsl::abs(num), den);
+      const scalar_type u = nbl::hlsl::clamp(subSolidAngle * rcpSolidAngle, _static_cast<scalar_type>(0.0), _static_cast<scalar_type>(1.0));
 
       // Step 3: u.y = |L-B|^2 / |C'-B|^2
       // Squared Euclidean distance avoids catastrophic cancellation vs (1-dot)/(1-dot)
-      const vector3_type LminusB = L - base.tri_vertices[1];
-      const vector3_type cpMinusB = cp - base.tri_vertices[1];
+      const vector3_type LminusB = L - tri_vertices[1];
+      const vector3_type cpMinusB = cp - tri_vertices[1];
       const scalar_type v_num = nbl::hlsl::dot(LminusB, LminusB);
       const scalar_type v_denom = nbl::hlsl::dot(cpMinusB, cpMinusB);
       const scalar_type v = hlsl::select(v_denom > numeric_limits<scalar_type>::epsilon,
          nbl::hlsl::clamp(v_num / nbl::hlsl::max(v_denom, numeric_limits<scalar_type>::min),
-            scalar_type(0.0), scalar_type(1.0)),
-         scalar_type(0.0));
+            _static_cast<scalar_type>(0.0), _static_cast<scalar_type>(1.0)),
+         _static_cast<scalar_type>(0.0));
 
       return vector2_type(u, v);
    }
@@ -313,13 +252,16 @@ struct SphericalTriangle<T, true, Algorithm>
       return backwardPdf(L);
    }
 
-   // mirrored from base for uniform access across both specializations
    scalar_type rcpSolidAngle;
+   scalar_type cosA;
+   scalar_type sinA;
+   scalar_type sinA_triCosc; // precomputed sinA * triCosc
+   scalar_type eCdotB; // precomputed dot(e_C, tri_vertices[1]), Arvo only
 
-   base_type base;
-   vector3_type vertexC;
-   vector3_type arcACPlaneNormal; // precomputed normalize(cross(A, C)), great circle normal of arc AC
-   scalar_type triCscC;
+   vector3_type tri_vertices[2]; // A and B only
+   scalar_type triCosc;
+   vector3_type e_C; // precomputed cross(arcACPlaneNormal, A), unit vector perp to A in A-C plane
+   vector3_type APlusC; // precomputed A + C, used to pick the minor-arc intersection in generateInverse
 };
 
 } // namespace sampling
diff --git a/include/nbl/builtin/hlsl/sampling/uniform_spheres.hlsl b/include/nbl/builtin/hlsl/sampling/uniform_spheres.hlsl
index 21901a9628..0d22323921 100644
--- a/include/nbl/builtin/hlsl/sampling/uniform_spheres.hlsl
+++ b/include/nbl/builtin/hlsl/sampling/uniform_spheres.hlsl
@@ -35,10 +35,11 @@ struct UniformHemisphere
 
 	static codomain_type generate(const domain_type u)
 	{
-		typename ConcentricMapping<T>::cache_type cmCache;
+		typename ConcentricMapping<scalar_type>::cache_type cmCache;
 		const vector_t2 p = ConcentricMapping<T>::generate(u, cmCache);
-		const T z = T(1.0) - cmCache.r2;
-		const T xyScale = hlsl::sqrt<T>(hlsl::max<T>(T(0.0), T(2.0) - cmCache.r2));
+		assert(cmCache.r2 <= _static_cast<scalar_type>(1.0));
+		const T z = _static_cast<T>(1.0) - cmCache.r2;
+		const T xyScale = hlsl::sqrt<scalar_type>(_static_cast<scalar_type>(2.0) - cmCache.r2);
 		return vector_t3(p.x * xyScale, p.y * xyScale, z);
 	}
 
@@ -50,30 +51,30 @@ struct UniformHemisphere
 	static domain_type generateInverse(const codomain_type v)
 	{
 		// r_disk / r_xy = sqrt(1-z) / sqrt(1-z^2) = 1/sqrt(1+z)
-		const T scale = T(1.0) / hlsl::sqrt<T>(T(1.0) + v.z);
+		const scalar_type scale = hlsl::rsqrt<T>(_static_cast<T>(1.0) + v.z);
 		return ConcentricMapping<T>::generateInverse(vector_t2(v.x * scale, v.y * scale));
 	}
 
 	static density_type forwardPdf(const domain_type u, const cache_type cache)
 	{
-		return T(0.5) * numbers::inv_pi<T>;
+		return _static_cast<scalar_type>(0.5) * numbers::inv_pi<T>;
 	}
 
 	static weight_type forwardWeight(const domain_type u, const cache_type cache)
 	{
-		return T(0.5) * numbers::inv_pi<T>;
+		return _static_cast<T>(0.5) * numbers::inv_pi<T>;
 	}
 
 	static density_type backwardPdf(const codomain_type v)
 	{
 		assert(v.z > 0);
-		return T(0.5) * numbers::inv_pi<T>;
+		return _static_cast<T>(0.5) * numbers::inv_pi<T>;
 	}
 
 	static weight_type backwardWeight(const codomain_type v)
 	{
 		assert(v.z > 0);
-		return T(0.5) * numbers::inv_pi<T>;
+		return _static_cast<T>(0.5) * numbers::inv_pi<T>;
 	}
 
 };
@@ -96,7 +97,7 @@ struct UniformSphere
 
 	static codomain_type generate(const domain_type u)
 	{
-		const T tmp = u.x * T(2.0) - T(1.0);
+		const T tmp = u.x * _static_cast<T>(2.0) - _static_cast<T>(1.0);
 		const codomain_type L = hemisphere_t::generate(vector_t2(hlsl::abs<T>(tmp), u.y));
 		return vector_t3(L.x, L.y, L.z * hlsl::sign(tmp));
 	}
@@ -108,29 +109,29 @@ struct UniformSphere
 
 	static domain_type generateInverse(const codomain_type v)
 	{
-		const T dir = hlsl::sign(v.z) * T(0.5);
+		const T dir = hlsl::sign(v.z) * _static_cast<T>(0.5);
 		const domain_type hemiU = hemisphere_t::generateInverse(vector_t3(v.x, v.y, hlsl::abs<T>(v.z)));
-		return vector_t2(hemiU.x * dir + T(0.5), hemiU.y);
+		return vector_t2(hemiU.x * dir + _static_cast<T>(0.5), hemiU.y);
 	}
 
 	static density_type forwardPdf(const domain_type u, const cache_type cache)
 	{
-		return T(0.5) * hemisphere_t::forwardPdf(u, cache);
+		return _static_cast<T>(0.5) * hemisphere_t::forwardPdf(u, cache);
 	}
 
 	static weight_type forwardWeight(const domain_type u, const cache_type cache)
 	{
-		return T(0.5) * hemisphere_t::forwardWeight(u, cache);
+		return _static_cast<T>(0.5) * hemisphere_t::forwardWeight(u, cache);
 	}
 
 	static density_type backwardPdf(const codomain_type v)
 	{
-		return T(0.25) * numbers::inv_pi<T>;
+		return _static_cast<T>(0.25) * numbers::inv_pi<T>;
 	}
 
 	static weight_type backwardWeight(const codomain_type v)
 	{
-		return T(0.25) * numbers::inv_pi<T>;
+		return _static_cast<T>(0.25) * numbers::inv_pi<T>;
 	}
 
 };
diff --git a/include/nbl/builtin/hlsl/shapes/obb.hlsl b/include/nbl/builtin/hlsl/shapes/obb.hlsl
index bdddc48ebf..3fa5eebc15 100644
--- a/include/nbl/builtin/hlsl/shapes/obb.hlsl
+++ b/include/nbl/builtin/hlsl/shapes/obb.hlsl
@@ -49,7 +49,7 @@ struct OBB
 	NBL_CONSTEXPR_STATIC_INLINE OBB createAxisAligned(point_t mid, point_t len)
 	{
 		point_t axes[D];
-		for (auto dim_i = 0; dim_i < D; dim_i++)
+		for (int dim_i = 0; dim_i < D; dim_i++)
 		{
 			axes[dim_i] = point_t(0);
 			axes[dim_i][dim_i] = 1;
@@ -60,6 +60,106 @@ struct OBB
 	matrix<scalar_t, D, D + 1> transform;
 };
 
+// Decomposed OBB view: caches columns and minCorner for fast vertex queries.
+// Same 12-float footprint as float32_t3x4, but laid out for branchless
+// corner computation.
+// Decomposed OBB view: caches columns and minCorner for fast vertex queries.
+// columns[i] are the full OBB edge vectors; minCorner is the world position
+// of corner 0b000 = center - 0.5*(col0+col1+col2).
+template<typename Scalar=float32_t>
+struct OBBView
+{
+	using scalar_t = Scalar;
+	using vec3_t = vector<Scalar, 3>;
+	using mat3_t = matrix<Scalar, 3, 3>;
+
+	mat3_t columns;
+	vec3_t minCorner;
+
+	static OBBView create(matrix<scalar_t, 3, 4> modelMatrix)
+	{
+		matrix<scalar_t, 4, 3> m = transpose(modelMatrix);
+		OBBView v;
+		v.columns = mat3_t(m[0].xyz, m[1].xyz, m[2].xyz);
+		v.minCorner = m[3].xyz - scalar_t(0.5) * (m[0].xyz + m[1].xyz + m[2].xyz);
+		return v;
+	}
+
+	vec3_t getCenter() NBL_CONST_MEMBER_FUNC
+	{
+		return minCorner + scalar_t(0.5) * (columns[0] + columns[1] + columns[2]);
+	}
+
+	vec3_t getVertex(uint32_t i) NBL_CONST_MEMBER_FUNC
+	{
+		vec3_t p = minCorner;
+		if (i & 1u) p += columns[0];
+		if (i & 2u) p += columns[1];
+		if (i & 4u) p += columns[2];
+		return p;
+	}
+
+	// Scalar-z specialization: only computes the z component of a corner.
+	// Used for clip-mask build where we only need the sign of z.
+	scalar_t getVertexZ(uint32_t i) NBL_CONST_MEMBER_FUNC
+	{
+		scalar_t pz = minCorner.z;
+		if (i & 1u) pz += columns[0].z;
+		if (i & 2u) pz += columns[1].z;
+		if (i & 4u) pz += columns[2].z;
+		return pz;
+	}
+
+	// Ray-OBB intersection via slab test in OBB-local space.
+	// Returns (tMin, tMax, hit). tMin is clamped to 0 if ray starts inside.
+	// TODO: not optimized -- precompute inverse columns, handle f~=0 edge case
+	struct Intersection
+	{
+		scalar_t tMin;
+		scalar_t tMax;
+		bool hit;
+	};
+
+	Intersection rayIntersection(vec3_t rayOrigin, vec3_t rayDir) NBL_CONST_MEMBER_FUNC
+	{
+		// Vector from ray origin to minCorner
+		vec3_t delta = rayOrigin - minCorner;
+
+		scalar_t tMin = scalar_t(-1e30);
+		scalar_t tMax = scalar_t(1e30);
+
+		// Slab test against each OBB axis
+		for (uint32_t i = 0; i < 3; i++)
+		{
+			vec3_t axis = columns[i];
+			scalar_t axisLenSq = dot(axis, axis);
+			scalar_t e = dot(axis, delta);
+			scalar_t f = dot(axis, rayDir);
+
+			// Project ray onto axis: entry at t where dot = 0, exit where dot = axisLenSq
+			scalar_t invF = scalar_t(1.0) / f;
+			scalar_t t1 = (-e) * invF;
+			scalar_t t2 = (axisLenSq - e) * invF;
+
+			if (t1 > t2)
+			{
+				scalar_t tmp = t1;
+				t1 = t2;
+				t2 = tmp;
+			}
+
+			tMin = max(tMin, t1);
+			tMax = min(tMax, t2);
+		}
+
+		Intersection result;
+		result.hit = (tMax >= tMin) && (tMax > scalar_t(0));
+		result.tMin = max(tMin, scalar_t(0));
+		result.tMax = tMax;
+		return result;
+	}
+};
+
 }
 }
 }
diff --git a/include/nbl/builtin/hlsl/shapes/spherical_rectangle.hlsl b/include/nbl/builtin/hlsl/shapes/spherical_rectangle.hlsl
index 5895e4bc80..7ffe2ef407 100644
--- a/include/nbl/builtin/hlsl/shapes/spherical_rectangle.hlsl
+++ b/include/nbl/builtin/hlsl/shapes/spherical_rectangle.hlsl
@@ -94,6 +94,8 @@ struct SphericalRectangle
         vector4_type cosGamma;
     };
 
+    // TODO: take an observer already, this way we can precompute and store the `r0`, `denorm_n_z` and `rcpLen_denorm_n_z`
+    // we need all of the above for solid angle and projected solid angle computation
     static SphericalRectangle<Scalar> create(NBL_CONST_REF_ARG(CompressedSphericalRectangle<Scalar>) compressed)
     {
         SphericalRectangle<scalar_type> retval;
@@ -102,7 +104,8 @@ struct SphericalRectangle
         retval.basis[0] = compressed.right / retval.extents[0];
         retval.basis[1] = compressed.up / retval.extents[1];
         assert(hlsl::abs(hlsl::dot(retval.basis[0], retval.basis[1])) < scalar_type(1e-5));
-        retval.basis[2] = hlsl::normalize(hlsl::cross(retval.basis[0], retval.basis[1]));
+        // don't normalize, the `right` and `up` vectors are orthogonal and the two bases are normalized already!
+        retval.basis[2] = hlsl::cross(retval.basis[0], retval.basis[1]);
         return retval;
     }
 
@@ -112,7 +115,8 @@ struct SphericalRectangle
         result.r0 = hlsl::mul(basis, origin - observer);
 
         const vector4_type denorm_n_z = vector4_type(-result.r0.y, result.r0.x + extents.x, result.r0.y + extents.y, -result.r0.x);
-        result.n_z = denorm_n_z * hlsl::rsqrt<vector4_type>(hlsl::promote<vector4_type>(result.r0.z * result.r0.z) + denorm_n_z * denorm_n_z);
+        const vector4_type rcpLen_denorm_n_z = hlsl::rsqrt<vector4_type>(hlsl::promote<vector4_type>(result.r0.z * result.r0.z) + denorm_n_z * denorm_n_z);
+        result.n_z = denorm_n_z * rcpLen_denorm_n_z;
         result.cosGamma = vector4_type(
             -result.n_z[0] * result.n_z[1],
             -result.n_z[1] * result.n_z[2],
@@ -128,17 +132,17 @@ struct SphericalRectangle
     }
 
     // Kelvin-Stokes theorem: signed projected solid angle = integral_{rect} (n . omega) d_omega
+    // TODO: don't take the observer, observer should be taken at creation
     scalar_type projectedSolidAngle(const vector3_type observer, const vector3_type receiverNormal) NBL_CONST_MEMBER_FUNC
     {
         return projectedSolidAngleFromLocal(hlsl::mul(basis, origin - observer), hlsl::mul(basis, receiverNormal));
     }
 
-    // Overload for when r0 and localNormal are already computed (avoids redundant mul(basis, ...)).
-    // Exploits rectangle structure: all 4 corners share the same z, so cross products
-    // have only 2 nonzero components each, and externalProducts can be computed without
-    // normalizing the corner directions.
+    // TODO: only take a `localN`
     scalar_type projectedSolidAngleFromLocal(const vector3_type r0, const vector3_type n) NBL_CONST_MEMBER_FUNC
     {
+        // FUN FACT: `n_z` already holds Z coordinate the NORMALIZED `awayFromEdgePlane`, the non-zero coordinate absolute value is equal to `r0.z * rcpLen_denorm_n_z`
+// TODO: skip all this code until  just call `acos` on the `unnormDots`
         const scalar_type x0 = r0.x, y0 = r0.y, z = r0.z;
         const scalar_type x1 = x0 + extents.x;
         const scalar_type y1 = y0 + extents.y;
@@ -154,8 +158,9 @@ struct SphericalRectangle
             zSq + x1 * x1,
             zSq + y1 * y1,
             zSq + x0 * x0
-        );
+        ); // TODO: this is already computed as `rcpLen_denorm_n_z`
 
+// TODO: this can be computed from `denorm_n_z`, `z` and `rcpLen_denorm_n_z` instead
         // dot(cross(ri,rj), n) / |cross(ri,rj)| the ex/ey scale factors cancel
         const vector4_type crossDotN = vector4_type(
             z * n.y - y0 * n.z,
@@ -164,8 +169,12 @@ struct SphericalRectangle
             z * n.x - x0 * n.z
         );
         // The ABS makes the computation correct for abs(cos(theta)) (BSDF projected solid angle).
-        const vector4_type externalProducts = hlsl::abs(crossDotN) * hlsl::rsqrt<vector4_type>(crossLenSq);
+        const vector4_type externalProducts = crossDotN * hlsl::rsqrt<vector4_type>(crossLenSq);
 
+// TODO: isn't `rcpLen_denorm_n_z` related to the sin^-1() of arclengths ? Wouldn't `ACOS_CSC` apply instead  of `acos*rsqrt(1-cos^2)`
+// wouldn't then `hlsl::promote<vector4_type>(result.r0.z * result.r0.z) + denorm_n_z * denorm_n_z` be the sin^2 ?
+// it would probably have to be a different, here's the `acos(sqrt(1-x*x))/x` curve fit again revealed to me in a dream
+//  https://www.desmos.com/calculator/sbdrulot5a = exp2(-1.6*sqrt(1-x*x))*A+B
         // cos(arc length) between adjacent corners: dot(ri,rj) / (|ri|*|rj|)
         const vector4_type lenSq = vector4_type(
             x0 * x0 + y0 * y0,
@@ -184,9 +193,11 @@ struct SphericalRectangle
         // rcpLen[i]*rcpLen[j] for adjacent pairs: (0,1), (1,2), (2,3), (3,0)
         const vector4_type cos_sides = unnormDots * rcpLen * rcpLen.yzwx;
 
+        // TODO: there's the same opportunity for optimization of this as the Spherical Triangle
+        // https://www.linkedin.com/posts/matt-kielan-9b054a165_untitled-graph-activity-7442910005671923712-jHz6?utm_source=share&utm_medium=member_desktop&rcm=ACoAACdp2RQBqq2bJfC2zxpsme-vRv2zh9oP-8E
         const vector4_type pyramidAngles = hlsl::acos<vector4_type>(cos_sides);
 
-        return hlsl::dot(pyramidAngles, externalProducts) * scalar_type(0.5);
+        return hlsl::abs(hlsl::dot(pyramidAngles, externalProducts)) * scalar_type(0.5);
     }
 
     vector3_type origin;
diff --git a/include/nbl/builtin/hlsl/shapes/spherical_triangle.hlsl b/include/nbl/builtin/hlsl/shapes/spherical_triangle.hlsl
index 1a5681b39e..b8a2c7229e 100644
--- a/include/nbl/builtin/hlsl/shapes/spherical_triangle.hlsl
+++ b/include/nbl/builtin/hlsl/shapes/spherical_triangle.hlsl
@@ -20,6 +20,26 @@ namespace hlsl
 namespace shapes
 {
 
+// TODO: move to where fast_acos lives
+template<typename T, int order=2>
+T acos_csc_approx(const T arg)
+{
+    const T u = hlsl::log2(_static_cast<T>(1)+arg);
+    // The curve fit "revealed in a dream" to me is `exp2(F(log2(x+1)))` where `F(u)` is a polynomial
+    // I have a feeling that a polynomial of ((Au+B)u+C)u+D could be sufficient if it has following properties:
+    // `F(0) = 0` and
+    // `F(u) <= log2(\frac{\cos^{-1}\left(2^{x}-1\right)}{\sqrt{1-\left(2^{x}-1\right)^{2}}})` because you want to consistently under-estimate the Projected Solid Angle to avoid creating energy
+    // See https://www.desmos.com/calculator/sdptomhbju
+    // Furthermore we could clip the polynomial calc to `Cu+D or `(Bu+C)u+D` for small arguments
+    T poly;
+    // TODO: actually optimize these constants in real world scenarios (renders)
+    if (order==1)
+        poly = (_static_cast<T>(1)-u)*_static_cast<T>(0.6);
+    else if (order==2)
+        poly = (_static_cast<T>(1)-u)*_static_cast<T>(0.637)+(_static_cast<T>(1) - u * u) * _static_cast<T>(0.0115);
+    return hlsl::exp2<T>(poly);
+}
+
 template<typename T>
 struct SphericalTriangle
 {
@@ -54,6 +74,7 @@ struct SphericalTriangle
         // degenerate triangle: any side has near-zero sin, so csc blows up
         if (hlsl::any<vector<bool, 3> >(retval.csc_sides >= hlsl::promote<vector3_type>(numeric_limits<scalar_type>::max)))
         {
+            // TODO: can't do this, still need to be able to sample thin triangle like a line light, so need to know all the angles which are still valid
             retval.cos_vertices = hlsl::promote<vector3_type>(0.0);
             retval.sin_vertices = hlsl::promote<vector3_type>(0.0);
             retval.solid_angle = 0;
@@ -86,35 +107,29 @@ struct SphericalTriangle
         if (solid_angle <= numeric_limits<scalar_type>::epsilon)
             return 0;
 
-        matrix<scalar_type, 3, 3> awayFromEdgePlane;
-        awayFromEdgePlane[0] = hlsl::cross(vertices[1], vertices[2]) * csc_sides[0];
-        awayFromEdgePlane[1] = hlsl::cross(vertices[2], vertices[0]) * csc_sides[1];
-        awayFromEdgePlane[2] = hlsl::cross(vertices[0], vertices[1]) * csc_sides[2];
+        // `cross(A,B)*acos(dot(A,B))/sin(1-dot^2)` can be done with `cross(A,B)*acos_csc_approx(dot(A,B))`
+#define ACOS_CSC(I) acos_csc_approx(cos_sides[I])
+//#define ACOS_CSC(I) hlsl::acos(cos_sides[I])*csc_sides[I]
+        scalar_type externalProductsWeightedByPyramidAngles = hlsl::dot(hlsl::cross(vertices[1], vertices[2]),receiverNormal) * ACOS_CSC(0);
+        externalProductsWeightedByPyramidAngles += hlsl::dot(hlsl::cross(vertices[2], vertices[0]),receiverNormal) * ACOS_CSC(1);
+        externalProductsWeightedByPyramidAngles += hlsl::dot(hlsl::cross(vertices[0], vertices[1]),receiverNormal) * ACOS_CSC(2);
+#undef ACOS_CSC
+
         // The ABS makes it so that the computation is correct for an `abs(cos(theta))` factor which is the projected solid angle used for a BSDF.
+        // It also makes the computation insensitive to the CW or CCW winding of the vertices in the triangle.
         // Proof: Kelvin-Stokes theorem, if you split the set into two along the horizon with constant CCW winding, the `cross` along the shared edge
         // goes in different directions and cancels out, while `acos` of the clipped great arcs corresponding to polygon edges add up to the original sides again.
-        const vector3_type externalProducts = hlsl::abs(hlsl::mul(/* transposed already */awayFromEdgePlane, receiverNormal));
-
-        // Far TODO: `cross(A,B)*acos(dot(A,B))/sin(1-dot^2)` can be done with `cross*acos_csc_approx(dot(A,B))`
-        // We could skip the `csc_sides` factor, and computing `pyramidAngles` and replace them with this approximation weighting before the dot product with the receiver notmal
-        // The curve fit "revealed in a dream" to me is `exp2(F(log2(x+1)))` where `F(u)` is a polynomial, so far I've calculated `F = (1-u)0.635+(1-u^2)0.0118` which gives <5% error until 165 degrees
-        // I have a feeling that a polynomial of ((Au+B)u+C)u+D could be sufficient if it has following properties:
-        // `F(0) = 0` and
-        // `F(u) <= log2(\frac{\cos^{-1}\left(2^{x}-1\right)}{\sqrt{1-\left(2^{x}-1\right)^{2}}})` because you want to consistently under-estimate the Projected Solid Angle to avoid creating energy
-        // See https://www.desmos.com/calculator/sdptomhbju
-        // Furthermore we could clip the polynomial calc to `Cu+D or `(Bu+C)u+D` for small arguments
-        const vector3_type pyramidAngles = hlsl::acos<vector3_type>(cos_sides);
-        // So that triangle covering almost whole hemisphere sums to PI
-        return hlsl::dot(pyramidAngles, externalProducts) * scalar_type(0.5);
+        // The 0.5 is so that triangle covering almost whole hemisphere sums to PI
+        return externalProductsWeightedByPyramidAngles * scalar_type(0.5);
     }
 
     vector3_type vertices[3];
     // angles of vertices with origin, so the sides are INSIDE the sphere
     vector3_type cos_sides;
-    vector3_type csc_sides;
+    vector3_type csc_sides; // TODO: spherical triangle sampling only needs `csc_sides[1]` and possibly `csc_sides[2]`
     // angles between arcs on the sphere, so angles in the TANGENT plane at each vertex
     vector3_type cos_vertices;
-    vector3_type sin_vertices;
+    vector3_type sin_vertices; // TODO: spherical triangle sampling only needs `sin_vertices[0]` a.k.a `sinA`
     scalar_type solid_angle;
 };
 
diff --git a/include/nbl/core/sampling/RandomSampler.h b/include/nbl/core/sampling/RandomSampler.h
index 39832dc8f1..b692ef5e08 100644
--- a/include/nbl/core/sampling/RandomSampler.h
+++ b/include/nbl/core/sampling/RandomSampler.h
@@ -11,8 +11,8 @@
 namespace nbl::core
 {
 
-class RandomSampler
-{
+	class RandomSampler
+	{
 	public:
 		RandomSampler(uint32_t _seed)
 		{
@@ -25,9 +25,24 @@ class RandomSampler
 			return mersenneTwister();
 		}
 
+		// Returns a float in [0, 1)
+		inline float nextFloat()
+		{
+			// 1 / 2^32
+			constexpr float norm = 1.0f / 4294967296.0f;
+			return mersenneTwister() * norm;
+		}
+
+		// Returns a float in [min, max)
+		inline float nextFloat(float min, float max)
+		{
+			constexpr float norm = 1.0f / 4294967296.0f;
+			return min + (mersenneTwister() * norm) * (max - min);
+		}
+
 	protected:
 		std::mt19937 mersenneTwister;
-};
+	};
 
 
 }
diff --git a/include/nbl/ext/FullScreenTriangle/FullScreenTriangle.h b/include/nbl/ext/FullScreenTriangle/FullScreenTriangle.h
index 39013417dc..53cef98d71 100644
--- a/include/nbl/ext/FullScreenTriangle/FullScreenTriangle.h
+++ b/include/nbl/ext/FullScreenTriangle/FullScreenTriangle.h
@@ -24,11 +24,18 @@ struct ProtoPipeline final
 			const video::IGPURenderpass* renderpass,
 			const uint32_t subpassIx=0,
 			asset::SBlendParams blendParams = {},
+			asset::SRasterizationParams rasterizationParams = DefaultRasterParams,
 			const hlsl::SurfaceTransform::FLAG_BITS swapchainTransform=hlsl::SurfaceTransform::FLAG_BITS::IDENTITY_BIT,
 			video::IGPUPipelineCache* pipelineCache = nullptr
 		);
 
 		core::smart_refctd_ptr<asset::IShader> m_vxShader;
+
+		constexpr static inline asset::SRasterizationParams DefaultRasterParams = {
+			.faceCullingMode = asset::EFCM_NONE,
+			.depthWriteEnable = false,
+			.depthCompareOp = asset::ECO_ALWAYS
+		};
 };
 
 bool recordDrawCall(video::IGPUCommandBuffer* commandBuffer);
diff --git a/src/nbl/ext/FullScreenTriangle/CFullScreenTriangle.cpp b/src/nbl/ext/FullScreenTriangle/CFullScreenTriangle.cpp
index 58b1f2ea84..55dea2eb00 100644
--- a/src/nbl/ext/FullScreenTriangle/CFullScreenTriangle.cpp
+++ b/src/nbl/ext/FullScreenTriangle/CFullScreenTriangle.cpp
@@ -84,6 +84,7 @@ smart_refctd_ptr<IGPUGraphicsPipeline> ProtoPipeline::createPipeline(
 	const IGPURenderpass* renderpass,
 	const uint32_t subpassIx,
 	SBlendParams blendParams,
+	asset::SRasterizationParams rasterizationParams,
 	const hlsl::SurfaceTransform::FLAG_BITS swapchainTransform,
 	IGPUPipelineCache* pipelineCache)
 {
@@ -94,11 +95,6 @@ smart_refctd_ptr<IGPUGraphicsPipeline> ProtoPipeline::createPipeline(
 
 	smart_refctd_ptr<IGPUGraphicsPipeline> m_retval;
 	{
-		constexpr SRasterizationParams defaultRasterParams = {
-			.faceCullingMode = EFCM_NONE,
-			.depthWriteEnable = false,
-			.depthCompareOp = ECO_ALWAYS
-		};
 		const auto orientationAsUint32 = static_cast<uint32_t>(swapchainTransform);
 
 		IGPUPipelineBase::SShaderEntryMap specConstants;
@@ -111,7 +107,7 @@ smart_refctd_ptr<IGPUGraphicsPipeline> ProtoPipeline::createPipeline(
 		params[0].cached = {
 			.vertexInput = {}, // The Full Screen Triangle doesn't use any HW vertex input state
 			.primitiveAssembly = {},
-			.rasterization = defaultRasterParams,
+			.rasterization = rasterizationParams,
 			.blend = blendParams,
 			.subpassIx = subpassIx
 		};