diff --git a/examples_tests b/examples_tests index eebde787c2..07e76965a7 160000 --- a/examples_tests +++ b/examples_tests @@ -1 +1 @@ -Subproject commit eebde787c233367ade8eb0580bc79c0d562e97aa +Subproject commit 07e76965a740d8a420779860de133dd88a3081f6 diff --git a/include/nbl/builtin/hlsl/algorithm.hlsl b/include/nbl/builtin/hlsl/algorithm.hlsl index 631001686e..d80ebdf443 100644 --- a/include/nbl/builtin/hlsl/algorithm.hlsl +++ b/include/nbl/builtin/hlsl/algorithm.hlsl @@ -142,7 +142,9 @@ struct bound_t void comp_step(NBL_REF_ARG(Accessor) accessor, const uint32_t testPoint, const uint32_t rightBegin) { - if (compare(accessor[testPoint],value)) + typename Accessor::value_type val; + accessor.get(testPoint, val); + if (compare(val,value)) it = rightBegin; } void comp_step(NBL_REF_ARG(Accessor) accessor, const uint32_t testPoint) diff --git a/include/nbl/builtin/hlsl/bit.hlsl b/include/nbl/builtin/hlsl/bit.hlsl index 71075d5491..f712c5aad5 100644 --- a/include/nbl/builtin/hlsl/bit.hlsl +++ b/include/nbl/builtin/hlsl/bit.hlsl @@ -3,6 +3,7 @@ #include +#include #ifndef __HLSL_VERSION @@ -123,5 +124,32 @@ uint16_t countl_zero(T n) } } #endif - + +namespace nbl +{ +namespace hlsl +{ + +// Variable-width sub-word bit rotation +template +NBL_CONSTEXPR_FUNC T rotl(T value, uint32_t bits, uint32_t width) +{ + const T mask = (width >= sizeof(T) * 8) ? ~T(0) : ((T(1) << width) - T(1)); + value &= mask; + bits &= -(bits < width); + return ((value << bits) | (value >> (width - bits))) & mask; +} + +template +NBL_CONSTEXPR_FUNC T rotr(T value, uint32_t bits, uint32_t width) +{ + const T mask = (width >= sizeof(T) * 8) ? ~T(0) : ((T(1) << width) - T(1)); + value &= mask; + bits &= -(bits < width); + return ((value >> bits) | (value << (width - bits))) & mask; +} + +} +} + #endif diff --git a/include/nbl/builtin/hlsl/functional.hlsl b/include/nbl/builtin/hlsl/functional.hlsl index 118fe07c63..6a155f2b92 100644 --- a/include/nbl/builtin/hlsl/functional.hlsl +++ b/include/nbl/builtin/hlsl/functional.hlsl @@ -89,12 +89,23 @@ struct reference_wrapper : enable_if_t< return lhs OP rhs; \ } +#define ALIAS_STD_CMP(NAME,OP) template struct NAME { \ + using type_t = T; \ + \ + bool operator()(NBL_CONST_REF_ARG(T) lhs, NBL_CONST_REF_ARG(T) rhs) \ + { \ + return lhs OP rhs; \ + } + #else // CPP #define ALIAS_STD(NAME,OP) template struct NAME : std::NAME { \ using type_t = T; +#define ALIAS_STD_CMP(NAME,OP) template struct NAME : std::NAME { \ + using type_t = T; + #endif ALIAS_STD(bit_and,&) @@ -136,14 +147,15 @@ ALIAS_STD(divides,/) }; -ALIAS_STD(equal_to, ==) }; -ALIAS_STD(not_equal_to, !=) }; -ALIAS_STD(greater, >) }; -ALIAS_STD(less, <) }; -ALIAS_STD(greater_equal, >=) }; -ALIAS_STD(less_equal, <=) }; +ALIAS_STD_CMP(equal_to, ==) }; +ALIAS_STD_CMP(not_equal_to, !=) }; +ALIAS_STD_CMP(greater, >) }; +ALIAS_STD_CMP(less, <) }; +ALIAS_STD_CMP(greater_equal, >=) }; +ALIAS_STD_CMP(less_equal, <=) }; #undef ALIAS_STD +#undef ALIAS_STD_CMP // The above comparison operators return bool on STD, but in HLSL they're supposed to yield bool vectors, so here's a specialization so that they return `vector` for vectorial types diff --git a/include/nbl/builtin/hlsl/ies/sampler.hlsl b/include/nbl/builtin/hlsl/ies/sampler.hlsl index ab4046477c..a6309fd128 100644 --- a/include/nbl/builtin/hlsl/ies/sampler.hlsl +++ b/include/nbl/builtin/hlsl/ies/sampler.hlsl @@ -85,7 +85,7 @@ struct CandelaSampler const angle_t vAngle = degrees(polar.theta); const angle_t hAngle = degrees(__wrapPhi(polar.phi, symmetry)); -#define NBL_IES_DEF_ANGLE_ACC(T, EXPR) struct T { using value_type = angle_t; accessor_t acc; value_type operator[](uint32_t idx) NBL_CONST_MEMBER_FUNC { return EXPR; } }; +#define NBL_IES_DEF_ANGLE_ACC(T, EXPR) struct T { using value_type = angle_t; accessor_t acc; value_type operator[](uint32_t idx) NBL_CONST_MEMBER_FUNC { return EXPR; } void get(uint32_t idx, NBL_REF_ARG(value_type) val) NBL_CONST_MEMBER_FUNC { val = EXPR; } }; NBL_IES_DEF_ANGLE_ACC(VAcc, acc.vAngle(idx)) NBL_IES_DEF_ANGLE_ACC(HAcc, acc.hAngle(idx)) diff --git a/include/nbl/builtin/hlsl/math/functions.hlsl b/include/nbl/builtin/hlsl/math/functions.hlsl index b5b6f8feea..a80bdda904 100644 --- a/include/nbl/builtin/hlsl/math/functions.hlsl +++ b/include/nbl/builtin/hlsl/math/functions.hlsl @@ -93,13 +93,22 @@ scalar_type_t lpNorm(NBL_CONST_REF_ARG(T) v) // valid only for `theta` in [-PI,PI] -template ) +// UseRealSinCos=true -> back-to-back sin + cos. Saturates the special-function pipeline, enables vendor sincos fusion, full precision near multiples of pi. +// UseRealSinCos=false -> cos + sqrt(1-c*c) with sign recovered from theta. Saves one special-function op when cos alone is cheaper than sin+cos, but suffers catastrophic cancellation as |c| -> 1. +template ) void sincos(T theta, NBL_REF_ARG(T) s, NBL_REF_ARG(T) c) { - s = sin(theta); - c = cos(theta); - // s = sqrt(T(NBL_FP64_LITERAL(1.0))-c*c); - // s = ieee754::flipSign(s, theta < T(NBL_FP64_LITERAL(0.0))); + if (UseRealSinCos) + { + s = sin(theta); + c = cos(theta); + } + else + { + c = cos(theta); + s = sqrt(T(NBL_FP64_LITERAL(1.0))-c*c); + s = ieee754::flipSign(s, theta < T(NBL_FP64_LITERAL(0.0))); + } } template ::Dimension == 3) diff --git a/include/nbl/builtin/hlsl/sampling/alias_table.hlsl b/include/nbl/builtin/hlsl/sampling/alias_table.hlsl index 15742e10f3..8e7b3249a0 100644 --- a/include/nbl/builtin/hlsl/sampling/alias_table.hlsl +++ b/include/nbl/builtin/hlsl/sampling/alias_table.hlsl @@ -7,6 +7,7 @@ #include #include +#include #include #include @@ -17,84 +18,187 @@ namespace hlsl namespace sampling { -// Alias Method (Vose/Walker) discrete sampler. -// -// Samples a discrete index in [0, N) with probability proportional to -// precomputed weights in O(1) time per sample, using a prebuilt alias table. -// -// Accessor template parameters must satisfy GenericReadAccessor: -// accessor.template get(index, outVal) // void, writes to outVal -// -// - ProbabilityAccessor: reads scalar_type threshold in [0, 1] for bin i -// - AliasIndexAccessor: reads uint32_t redirect index for bin i -// - PdfAccessor: reads scalar_type weight[i] / totalWeight -// -// Satisfies TractableSampler (not BackwardTractableSampler: the mapping is discrete). -// The cache stores the PDF value looked up during generate, avoiding redundant -// storage of the codomain (sampled index) which is already the return value. -template result = bin +// else -> result = getTarget(word) +// Quantizing the threshold to (32 - Log2N) bits is precision-neutral: `u` +// already consumed Log2N bits of randomness producing `bin`, so `remainder` +// carries exactly that many bits of discriminatory power. +namespace impl +{ +template +struct AliasBitDecoder +{ + static uint32_t getTarget(uint32_t word) + { + return word & ((1u << Log2N) - 1u); + } + template + static T getStayProb(uint32_t word) + { + const uint32_t unormMax = (~0u) >> Log2N; + return T(word >> Log2N) / T(unormMax); + } +}; +} // namespace impl + +// 8 B entry used by the NBig == true variant. Embeds the bin's own pdf +// alongside the packed word so the common stay-case needs no extra tap. +template +struct PackedAliasEntryB +{ + uint32_t packedWord; // low Log2N: redirect target; high 32-Log2N: stayProb unorm + T ownPdf; // pdf of this bin +}; + + +// NBig == false: 4 B packed word per bin + separate pdf[] array. Per sample +// = one 4 B word load + one unconditional 4 B pdf[] tap indexed by the +// selected bin (either the current bin or its redirect). Total 8 B whether +// the sample stays or aliases. Favours small N. +template && - concepts::accessors::GenericReadAccessor && - concepts::accessors::GenericReadAccessor && + concepts::accessors::GenericReadAccessor && concepts::accessors::GenericReadAccessor) -struct AliasTable +struct PackedAliasTableA { using scalar_type = T; - using domain_type = Domain; using codomain_type = Codomain; using density_type = scalar_type; using weight_type = density_type; + using decoder = impl::AliasBitDecoder; + NBL_CONSTEXPR_STATIC_INLINE bool NBig = false; struct cache_type { density_type pdf; }; - static AliasTable create(NBL_CONST_REF_ARG(ProbabilityAccessor) _probAccessor, NBL_CONST_REF_ARG(AliasIndexAccessor) _aliasAccessor, NBL_CONST_REF_ARG(PdfAccessor) _pdfAccessor, codomain_type _size) + static PackedAliasTableA create(NBL_CONST_REF_ARG(PackedWordAccessor) _entryAcc, NBL_CONST_REF_ARG(PdfAccessor) _pdfAcc, codomain_type _size) { - AliasTable retval; - retval.probAccessor = _probAccessor; - retval.aliasAccessor = _aliasAccessor; - retval.pdfAccessor = _pdfAccessor; - // Precompute tableSize as float minus 1 ULP so that u=1.0 maps to bin N-1 + PackedAliasTableA retval; + retval.entryAcc = _entryAcc; + retval.pdfAcc = _pdfAcc; const scalar_type exact = scalar_type(_size); retval.tableSizeMinusUlp = nbl::hlsl::bit_cast(nbl::hlsl::bit_cast(exact) - 1u); return retval; } - // BasicSampler interface codomain_type generate(const domain_type u) NBL_CONST_MEMBER_FUNC { const scalar_type scaled = u * tableSizeMinusUlp; const codomain_type bin = _static_cast(scaled); const scalar_type remainder = scaled - scalar_type(bin); - scalar_type prob; - probAccessor.template get(bin, prob); - - // Use if-statement to avoid select: aliasIndex is a dependent read - codomain_type result; - if (remainder < prob) - { - result = bin; - } - else - { - codomain_type alias; - aliasAccessor.template get(bin, alias); - result = alias; - } + uint32_t packedWord; + entryAcc.template get(bin, packedWord); + return hlsl::select(remainder < decoder::template getStayProb(packedWord), bin, codomain_type(decoder::getTarget(packedWord))); + } + codomain_type generate(const domain_type u, NBL_REF_ARG(cache_type) cache) NBL_CONST_MEMBER_FUNC + { + const codomain_type result = generate(u); + pdfAcc.template get(result, cache.pdf); return result; } - // TractableSampler interface + density_type forwardPdf(const domain_type u, NBL_CONST_REF_ARG(cache_type) cache) NBL_CONST_MEMBER_FUNC + { + return cache.pdf; + } + + weight_type forwardWeight(const domain_type u, NBL_CONST_REF_ARG(cache_type) cache) NBL_CONST_MEMBER_FUNC + { + return cache.pdf; + } + + density_type backwardPdf(const codomain_type v) NBL_CONST_MEMBER_FUNC + { + scalar_type pdf; + pdfAcc.template get(v, pdf); + return pdf; + } + + weight_type backwardWeight(const codomain_type v) NBL_CONST_MEMBER_FUNC + { + return backwardPdf(v); + } + + PackedWordAccessor entryAcc; + PdfAccessor pdfAcc; + scalar_type tableSizeMinusUlp; +}; + +// NBig == true: 8 B entry {packedWord, ownPdf} + separate pdf[] array. Per +// sample = one 8 B entry load (covers the common stay case where cache +// already has ownPdf). If the sample aliases, a conditional 4 B pdf[target] +// tap fills the cache. Total 8 B stay, 12 B aliased. Favours large N. +template && + concepts::accessors::GenericReadAccessor, Codomain> && + concepts::accessors::GenericReadAccessor) +struct PackedAliasTableB +{ + using scalar_type = T; + using domain_type = Domain; + using codomain_type = Codomain; + using density_type = scalar_type; + using weight_type = density_type; + using entry_type = PackedAliasEntryB; + using decoder = impl::AliasBitDecoder; + NBL_CONSTEXPR_STATIC_INLINE bool NBig = true; + + struct cache_type + { + density_type pdf; + }; + + static PackedAliasTableB create(NBL_CONST_REF_ARG(EntryAccessor) _entryAcc, NBL_CONST_REF_ARG(PdfAccessor) _pdfAcc, codomain_type _size) + { + PackedAliasTableB retval; + retval.entryAcc = _entryAcc; + retval.pdfAcc = _pdfAcc; + const scalar_type exact = scalar_type(_size); + retval.tableSizeMinusUlp = nbl::hlsl::bit_cast(nbl::hlsl::bit_cast(exact) - 1u); + return retval; + } + + codomain_type generate(const domain_type u) NBL_CONST_MEMBER_FUNC + { + const scalar_type scaled = u * tableSizeMinusUlp; + const codomain_type bin = _static_cast(scaled); + const scalar_type remainder = scaled - scalar_type(bin); + + entry_type entry; + entryAcc.template get(bin, entry); + return hlsl::select(remainder < decoder::template getStayProb(entry.packedWord), bin, codomain_type(decoder::getTarget(entry.packedWord))); + } + codomain_type generate(const domain_type u, NBL_REF_ARG(cache_type) cache) NBL_CONST_MEMBER_FUNC { - const codomain_type result = generate(u); - pdfAccessor.template get(result, cache.pdf); + const scalar_type scaled = u * tableSizeMinusUlp; + const codomain_type bin = _static_cast(scaled); + const scalar_type remainder = scaled - scalar_type(bin); + + entry_type entry; + entryAcc.template get(bin, entry); + + const bool stay = remainder < decoder::template getStayProb(entry.packedWord); + + cache.pdf = entry.ownPdf; + codomain_type result = bin; + if (!stay) + { + const codomain_type target = codomain_type(decoder::getTarget(entry.packedWord)); + pdfAcc.template get(target, cache.pdf); + result = target; + } return result; } @@ -111,7 +215,7 @@ struct AliasTable density_type backwardPdf(const codomain_type v) NBL_CONST_MEMBER_FUNC { scalar_type pdf; - pdfAccessor.template get(v, pdf); + pdfAcc.template get(v, pdf); return pdf; } @@ -120,9 +224,8 @@ struct AliasTable return backwardPdf(v); } - ProbabilityAccessor probAccessor; - AliasIndexAccessor aliasAccessor; - PdfAccessor pdfAccessor; + EntryAccessor entryAcc; + PdfAccessor pdfAcc; scalar_type tableSizeMinusUlp; }; diff --git a/include/nbl/builtin/hlsl/sampling/alias_table_builder.h b/include/nbl/builtin/hlsl/sampling/alias_table_builder.h index d02d21488c..2c7c53fd5f 100644 --- a/include/nbl/builtin/hlsl/sampling/alias_table_builder.h +++ b/include/nbl/builtin/hlsl/sampling/alias_table_builder.h @@ -5,7 +5,12 @@ #ifndef _NBL_BUILTIN_HLSL_SAMPLING_ALIAS_TABLE_BUILDER_H_INCLUDED_ #define _NBL_BUILTIN_HLSL_SAMPLING_ALIAS_TABLE_BUILDER_H_INCLUDED_ +#include #include +#include +#include + +#include namespace nbl { @@ -15,74 +20,139 @@ namespace sampling { // Builds the alias table from an array of non-negative weights. -// All output arrays must be pre-allocated to N entries. // -// Parameters: -// weights - input weights (non-negative, at least one must be > 0) -// N - number of entries -// outProbability - [out] alias table probability threshold per bin, in [0, 1] -// outAlias - [out] alias redirect index per bin -// outPdf - [out] normalized PDF per entry: weight[i] / sum(weights) -// workspace - scratch buffer of N uint32_t entries +// When `weights.size()` is a power of two, the builder transparently appends +// one zero-weight dummy bucket so the GPU-facing table size is N+1 (odd), +// which breaks PoT-periodic address patterns that alias memory channels / +// cache sets on most GPUs. The sampled distribution is unchanged, the dummy +// has stayProb = 0 and always redirects to a real donor. +// +// Output vectors are `resize`d by the builder to the final table size, so +// the caller just passes (possibly empty) vectors and reads back the +// returned size. That returned value is what to pass to the sampler's +// `_size` argument and to use when packing / uploading. template struct AliasTableBuilder { - static void build(std::span weights, T* outProbability, uint32_t* outAlias, T* outPdf, uint32_t* workspace) - { - T totalWeight = T(0); - for (uint32_t i = 0; i < weights.size(); i++) - totalWeight += weights[i]; - - const T rcpTotalWeight = T(1) / totalWeight; - - // Compute PDFs, scaled probabilities, and partition into small/large in one pass - uint32_t smallEnd = 0; - uint32_t largeBegin = weights.size(); - for (uint32_t i = 0; i < weights.size(); i++) - { - outPdf[i] = weights[i] * rcpTotalWeight; - outProbability[i] = outPdf[i] * T(weights.size()); - - if (outProbability[i] < T(1)) - workspace[smallEnd++] = i; - else - workspace[--largeBegin] = i; - } - - // Pair small and large entries - while (smallEnd > 0 && largeBegin < weights.size()) - { - const uint32_t s = workspace[--smallEnd]; - const uint32_t l = workspace[largeBegin]; - - outAlias[s] = l; - // outProbability[s] already holds the correct probability for bin s - - outProbability[l] -= (T(1) - outProbability[s]); - - if (outProbability[l] < T(1)) - { - // l became small: pop from large, push to small - largeBegin++; - workspace[smallEnd++] = l; - } - // else l stays in large (don't pop, reuse next iteration) - } - - // Remaining entries (floating point rounding artifacts) - while (smallEnd > 0) - { - const uint32_t s = workspace[--smallEnd]; - outProbability[s] = T(1); - outAlias[s] = s; - } - while (largeBegin < weights.size()) - { - const uint32_t l = workspace[largeBegin++]; - outProbability[l] = T(1); - outAlias[l] = l; - } - } + // Ugly but much faster: we better ensure the table size is not a power of + // two, so we pad with +1 zero-weight dummy bucket when needed. PoT-sized + // alias tables hit GPU memory channel / cache set aliasing that can be + // wildly (sometimes 2x+) slower than a nearby non-PoT size. Builder owns + // all the sizing (resizes the output vectors, allocates its own scratch), + // so the caller can't get it wrong. + static uint32_t build(std::span weights, std::vector& outProbability, std::vector& outAlias, std::vector& outPdf) + { + const uint32_t userN = static_cast(weights.size()); + const uint32_t tableN = (userN > 1u && (userN & (userN - 1u)) == 0u) ? (userN + 1u) : userN; + + outProbability.resize(tableN); + outAlias.resize(tableN); + outPdf.resize(tableN); + std::vector workspace(tableN); + + T totalWeight = T(0); + for (uint32_t i = 0; i < userN; i++) + totalWeight += weights[i]; + + const T rcpTotalWeight = T(1) / totalWeight; + + // Compute PDFs, scaled probabilities, and partition into small/large in one pass + uint32_t smallEnd = 0u; + uint32_t largeBegin = tableN; + for (uint32_t i = 0; i < userN; i++) + { + outPdf[i] = weights[i] * rcpTotalWeight; + outProbability[i] = outPdf[i] * T(tableN); + + if (outProbability[i] < T(1)) + workspace[smallEnd++] = i; + else + workspace[--largeBegin] = i; + } + // PoT dodge tail: one zero-weight dummy at index userN, always in the small list. + if (tableN != userN) + { + outPdf[userN] = T(0); + outProbability[userN] = T(0); + workspace[smallEnd++] = userN; + } + + // Pair small and large entries + while (smallEnd > 0u && largeBegin < tableN) + { + const uint32_t s = workspace[--smallEnd]; + const uint32_t l = workspace[largeBegin]; + + outAlias[s] = l; + // outProbability[s] already holds the correct probability for bin s + + outProbability[l] -= (T(1) - outProbability[s]); + + if (outProbability[l] < T(1)) + { + // l became small: pop from large, push to small + largeBegin++; + workspace[smallEnd++] = l; + } + // else l stays in large (don't pop, reuse next iteration) + } + + // Remaining entries (floating point rounding artifacts) + while (smallEnd > 0u) + { + const uint32_t s = workspace[--smallEnd]; + outProbability[s] = T(1); + outAlias[s] = s; + } + while (largeBegin < tableN) + { + const uint32_t l = workspace[largeBegin++]; + outProbability[l] = T(1); + outAlias[l] = l; + } + + return tableN; + } + + // Pack (target, stayProb) into a single 32-bit word with Log2N bits for + // target and (32 - Log2N) bits for the unorm-quantized threshold. Used by + // every packed variant; each packX() below calls this on a per-entry basis. + template + static uint32_t packWord(uint32_t target, T stayProb) + { + const uint32_t targetMask = (Log2N == 32u) ? ~0u : ((1u << Log2N) - 1u); + const T clamped = stayProb < T(0) ? T(0) : (stayProb > T(1) ? T(1) : stayProb); + const uint32_t unormMax = (Log2N == 0u) ? ~0u : ((~0u) >> Log2N); + const uint32_t probUnorm = static_cast(std::round(static_cast(clamped) * static_cast(unormMax))); + return (target & targetMask) | (probUnorm << Log2N); + } + + // Variant A, pack SoA outputs into an array of 4 B packed words. The + // pdf[] array is consumed directly by the sampler as a second accessor. + // outWords must be pre-allocated to N uint32_t entries. + template + static void packA(std::span probability, std::span alias, uint32_t* outWords) + { + const uint32_t N = static_cast(probability.size()); + for (uint32_t i = 0; i < N; i++) + outWords[i] = packWord(alias[i], probability[i]); + } + + // Variant B, pack SoA outputs into 8 B entries { packedWord, ownPdf }. + // The pdf[] array is *also* passed to the sampler (same contents as ownPdf + // column, but tapped independently with a 4 B fetch when the sample aliases). + // outEntries must be pre-allocated to N entries. + template + static void packB(std::span probability, std::span alias, std::span pdf, + PackedAliasEntryB* outEntries) + { + const uint32_t N = static_cast(probability.size()); + for (uint32_t i = 0; i < N; i++) + { + outEntries[i].packedWord = packWord(alias[i], probability[i]); + outEntries[i].ownPdf = pdf[i]; + } + } }; } // namespace sampling diff --git a/include/nbl/builtin/hlsl/sampling/bilinear.hlsl b/include/nbl/builtin/hlsl/sampling/bilinear.hlsl index 35f1391930..b56073074b 100644 --- a/include/nbl/builtin/hlsl/sampling/bilinear.hlsl +++ b/include/nbl/builtin/hlsl/sampling/bilinear.hlsl @@ -45,10 +45,10 @@ struct Bilinear vector2_type twiceAreasUnderXCurve = vector2_type(bilinearCoeffs[0] + bilinearCoeffs[1], bilinearCoeffs[2] + bilinearCoeffs[3]); // Linear::create adds FLT_MIN internally, replicate here so both divisions share // the same denominator (sum + 2*min), enabling CSE to merge them into one division - const scalar_type safeSum = twiceAreasUnderXCurve[0] + twiceAreasUnderXCurve[1] + scalar_type(2.0) * hlsl::numeric_limits::min; - const scalar_type yNormFactor = scalar_type(2.0) / safeSum; + const scalar_type safeSum = twiceAreasUnderXCurve[0] + twiceAreasUnderXCurve[1] + _static_cast(2.0) * hlsl::numeric_limits::min; + const scalar_type yNormFactor = _static_cast(2.0) / safeSum; retval.lineary = Linear::create(twiceAreasUnderXCurve); - retval.normFactor = yNormFactor * scalar_type(2.0); + retval.normFactor = yNormFactor * _static_cast(2.0); return retval; } @@ -65,7 +65,7 @@ struct Bilinear // bilinear PDF = marginal_y_pdf * conditional_x_pdf; reuse both linear caches const scalar_type yPdf = lineary.forwardPdf(u.y, linearYCache); - cache.normalizedStart = yPdf * linearx.linearCoeffStart; + cache.normalizedStart = yPdf * linearx.normalizedCoeffStart; cache.linearXCache.diffTimesX *= yPdf; return p; } diff --git a/include/nbl/builtin/hlsl/sampling/box_muller_transform.hlsl b/include/nbl/builtin/hlsl/sampling/box_muller_transform.hlsl index b9efd1ec35..9ae0c83c05 100644 --- a/include/nbl/builtin/hlsl/sampling/box_muller_transform.hlsl +++ b/include/nbl/builtin/hlsl/sampling/box_muller_transform.hlsl @@ -37,16 +37,16 @@ struct BoxMullerTransform { BoxMullerTransform retval; retval.stddev = _stddev; - retval.halfRcpStddev2 = scalar_type(0.5) / (_stddev * _stddev); + retval.halfRcpStddev2 = _static_cast(0.5) / (_stddev * _stddev); return retval; } codomain_type generate(const domain_type u, NBL_REF_ARG(cache_type) cache) NBL_CONST_MEMBER_FUNC { scalar_type sinPhi, cosPhi; - math::sincos(scalar_type(2.0) * numbers::pi * u.y - numbers::pi, sinPhi, cosPhi); + math::sincos(_static_cast(2.0) * numbers::pi * u.y - numbers::pi, sinPhi, cosPhi); cache.direction = vector2_type(cosPhi, sinPhi); - return cache.direction * nbl::hlsl::sqrt(scalar_type(-2.0) * nbl::hlsl::log(u.x)) * stddev; + return cache.direction * nbl::hlsl::sqrt(_static_cast(-2.0) * nbl::hlsl::log(u.x)) * stddev; } density_type forwardPdf(const domain_type u, const cache_type cache) NBL_CONST_MEMBER_FUNC diff --git a/include/nbl/builtin/hlsl/sampling/concentric_mapping.hlsl b/include/nbl/builtin/hlsl/sampling/concentric_mapping.hlsl index c9e5cac5d6..d31c2d994a 100644 --- a/include/nbl/builtin/hlsl/sampling/concentric_mapping.hlsl +++ b/include/nbl/builtin/hlsl/sampling/concentric_mapping.hlsl @@ -41,7 +41,7 @@ struct ConcentricMapping static codomain_type generate(const domain_type u, NBL_REF_ARG(cache_type) cache) { // map [0,1]^2 to [-1,1]^2 - const vector2_type centered = scalar_type(2) * u - hlsl::promote(scalar_type(1)); + const vector2_type centered = _static_cast(2) * u - hlsl::promote(_static_cast(1)); const scalar_type a = centered.x; const scalar_type b = centered.y; @@ -51,10 +51,10 @@ struct ConcentricMapping const scalar_type dominant = hlsl::select(cond, a, b); const scalar_type minor = hlsl::select(cond, b, a); - const scalar_type safe_dominant = dominant != scalar_type(0) ? dominant : scalar_type(0); + const scalar_type safe_dominant = dominant != _static_cast(0) ? dominant : _static_cast(0); const scalar_type ratio = minor / safe_dominant; - const scalar_type angle = scalar_type(0.25) * numbers::pi * ratio; + const scalar_type angle = _static_cast(0.25) * numbers::pi * ratio; const scalar_type c = hlsl::cos(angle); const scalar_type s = hlsl::sin(angle); @@ -90,7 +90,7 @@ struct ConcentricMapping // angle in [0, pi/4] const scalar_type phi = hlsl::atan2(num, denom); - const scalar_type minor_val = r * phi / (scalar_type(0.25) * numbers::pi); + const scalar_type minor_val = r * phi / (_static_cast(0.25) * numbers::pi); // reconstruct a,b using select instead of branching const scalar_type a_base = hlsl::select(swapped, minor_val, r); @@ -99,7 +99,7 @@ struct ConcentricMapping const scalar_type a = ieee754::copySign(a_base, p.x); const scalar_type b = ieee754::copySign(b_base, p.y); - return (vector2_type(a, b) + hlsl::promote(scalar_type(1))) * scalar_type(0.5); + return (vector2_type(a, b) + hlsl::promote(_static_cast(1))) * _static_cast(0.5); } // The PDF of Shirley mapping is constant (1/PI on the unit disk) diff --git a/include/nbl/builtin/hlsl/sampling/cos_weighted_spheres.hlsl b/include/nbl/builtin/hlsl/sampling/cos_weighted_spheres.hlsl index 23e35e2f7d..52b063f448 100644 --- a/include/nbl/builtin/hlsl/sampling/cos_weighted_spheres.hlsl +++ b/include/nbl/builtin/hlsl/sampling/cos_weighted_spheres.hlsl @@ -35,8 +35,8 @@ struct ProjectedHemisphere static codomain_type __generate(const domain_type u) { - vector_t2 p = ConcentricMapping::generate(u * T(0.99999) + T(0.000005)); - T z = hlsl::sqrt(hlsl::max(T(0.0), T(1.0) - p.x * p.x - p.y * p.y)); + vector_t2 p = ConcentricMapping::generate(u * _static_cast(0.99999) + _static_cast(0.000005)); + T z = hlsl::sqrt(hlsl::max(_static_cast(0.0), _static_cast(1.0) - p.x * p.x - p.y * p.y)); return vector_t3(p.x, p.y, z); } @@ -93,11 +93,11 @@ struct ProjectedSphere static codomain_type __generate(NBL_REF_ARG(domain_type) u) { vector_t3 retval = hemisphere_t::__generate(u.xy); - const bool chooseLower = u.z > T(0.5); + const bool chooseLower = u.z > _static_cast(0.5); retval.z = chooseLower ? (-retval.z) : retval.z; if (chooseLower) - u.z -= T(0.5); - u.z *= T(2.0); + u.z -= _static_cast(0.5); + u.z *= _static_cast(2.0); return retval; } @@ -110,7 +110,7 @@ struct ProjectedSphere static density_type forwardPdf(const domain_type u, const cache_type cache) { - return T(0.5) * cache.z * numbers::inv_pi; + return _static_cast(0.5) * cache.z * numbers::inv_pi; } static weight_type forwardWeight(const domain_type u, const cache_type cache) @@ -120,7 +120,7 @@ struct ProjectedSphere static density_type backwardPdf(const codomain_type L) { - return T(0.5) * hlsl::abs(L.z) * numbers::inv_pi; + return _static_cast(0.5) * hlsl::abs(L.z) * numbers::inv_pi; } static weight_type backwardWeight(const codomain_type L) diff --git a/include/nbl/builtin/hlsl/sampling/cumulative_probability.hlsl b/include/nbl/builtin/hlsl/sampling/cumulative_probability.hlsl index 1f176f9713..d3657babb1 100644 --- a/include/nbl/builtin/hlsl/sampling/cumulative_probability.hlsl +++ b/include/nbl/builtin/hlsl/sampling/cumulative_probability.hlsl @@ -16,18 +16,38 @@ namespace hlsl namespace sampling { -// Discrete sampler using cumulative probability lookup via upper_bound. +// Discrete sampler using cumulative probability lookup. // // Samples a discrete index in [0, N) with probability proportional to // precomputed weights in O(log N) time per sample. // -// The cumulative probability array stores N-1 entries (the last bucket -// is always 1.0 and need not be stored). Entry i holds the sum of -// probabilities for indices [0, i]. +// Three layouts / cache-population strategies, selected by the Mode parameter: +// +// TRACKING (default): N-1 CDF entries, last bucket implicit at 1.0. +// A stateful comparator records the straddling CDF +// values during upper_bound itself. +// YOLO: Same storage. Plain upper_bound followed by two +// re-reads of the adjacent CDF entries (warm cache). +// Lower register footprint, two extra array reads. +// EYTZINGER: Level-order implicit binary tree in 2*P entries +// where P = roundUpPot(N). Leaves at [P, P+N) hold +// the CDF; interior nodes at [1, P) hold split keys. +// Descent reads adjacent memory at each step, so +// every cache line pulled is fully utilised and the +// first log2(subgroupSize) iterations are served by a +// single transaction per subgroup. Build with +// sampling::buildEytzinger(). // // Satisfies TractableSampler and ResamplableSampler (not BackwardTractableSampler: // the mapping is discrete). -template) struct CumulativeProbabilitySampler { @@ -44,58 +64,116 @@ struct CumulativeProbabilitySampler density_type upperBound; }; + // `_size` is the user-facing bucket count N for every mode. TRACKING / YOLO + // expect the accessor to hold N-1 CDF entries; EYTZINGER expects 2*P entries + // in the level-order layout produced by buildEytzinger. static CumulativeProbabilitySampler create(NBL_CONST_REF_ARG(CumProbAccessor) _cumProbAccessor, uint32_t _size) { CumulativeProbabilitySampler retval; retval.cumProbAccessor = _cumProbAccessor; retval.storedCount = _size - 1u; + retval.depth = 0u; + NBL_IF_CONSTEXPR(Mode == CumulativeProbabilityMode::EYTZINGER) + { + uint32_t P = 1u; + uint32_t d = 0u; + while (P < _size) { P <<= 1u; ++d; } + retval.depth = d; + } return retval; } // BasicSampler interface codomain_type generate(const domain_type u) NBL_CONST_MEMBER_FUNC { - // upper_bound returns first index where cumProb > u - return hlsl::upper_bound(cumProbAccessor, 0u, storedCount, u); + NBL_IF_CONSTEXPR(Mode == CumulativeProbabilityMode::EYTZINGER) + { + const uint32_t leafBase = 1u << depth; + uint32_t index = 1u; + for (uint32_t iter = 0u; iter < depth; ++iter) + { + density_type key; + cumProbAccessor.template get(index, key); + index = (index << 1u) | uint32_t(!(u < key)); + } + const codomain_type result = codomain_type(index - leafBase); + return result < codomain_type(storedCount) ? result : codomain_type(storedCount); + } + else + { + return hlsl::upper_bound(cumProbAccessor, 0u, storedCount, u); + } } // TractableSampler interface codomain_type generate(const domain_type u, NBL_REF_ARG(cache_type) cache) NBL_CONST_MEMBER_FUNC { - // #define NBL_CUMPROB_YOLO_READS -#ifdef NBL_CUMPROB_YOLO_READS - // YOLO approach: re-read the array after binary search. - // The accessed elements are adjacent to the found index so the cache is warm. - const codomain_type result = hlsl::upper_bound(cumProbAccessor, 0u, storedCount, u); - cache.oneBefore = density_type(0.0); - if (result) - cumProbAccessor.template get(result - 1u, cache.oneBefore); - cache.upperBound = density_type(1.0); - if (result < storedCount) - cumProbAccessor.template get(result, cache.upperBound); -#else - // Tracking reads approach: stateful comparator captures CDF values during binary search. - struct CdfComparator + codomain_type result; + NBL_IF_CONSTEXPR(Mode == CumulativeProbabilityMode::EYTZINGER) { - bool operator()(const density_type value, const density_type rhs) + // Descent visits one interior node per level. Going left tightens + // the upper bound to the current key; going right tightens the + // lower bound. Final index, leafBase is the bucket. + cache.oneBefore = _static_cast(0.0); + cache.upperBound = _static_cast(1.0); + const uint32_t leafBase = 1u << depth; + uint32_t index = 1u; + for (uint32_t iter = 0u; iter < depth; ++iter) { - const bool retval = value < rhs; - if (retval) - upperBound = rhs; + density_type key; + cumProbAccessor.template get(index, key); + const bool goRight = !(u < key); + if (goRight) + { + cache.oneBefore = key; + index = (index << 1u) | 1u; + } else - oneBefore = rhs; - return retval; + { + cache.upperBound = key; + index = (index << 1u); + } } - - density_type oneBefore; - density_type upperBound; - } comp; - comp.oneBefore = density_type(0.0); - comp.upperBound = density_type(1.0); - const codomain_type result = hlsl::upper_bound(cumProbAccessor, 0u, storedCount, u, comp); - cache.oneBefore = comp.oneBefore; - cache.upperBound = comp.upperBound; -#endif + const codomain_type raw = codomain_type(index - leafBase); + result = raw < codomain_type(storedCount) ? raw : codomain_type(storedCount); + } + else NBL_IF_CONSTEXPR(Mode == CumulativeProbabilityMode::YOLO) + { + // Re-read the two adjacent CDF entries after the binary search. + // Both sit on the cache lines the search just touched, so they are warm. + result = hlsl::upper_bound(cumProbAccessor, 0u, storedCount, u); + cache.oneBefore = _static_cast(0.0); + if (result) + cumProbAccessor.template get(result - 1u, cache.oneBefore); + cache.upperBound = _static_cast(1.0); + if (result < storedCount) + cumProbAccessor.template get(result, cache.upperBound); + } + else + { + // TRACKING: stateful comparator captures the CDF values straddling the + // found index during the binary search itself, avoiding the two extra reads. + struct CdfComparator + { + bool operator()(const density_type value, const density_type rhs) + { + const bool retval = value < rhs; + if (retval) + upperBound = rhs; + else + oneBefore = rhs; + return retval; + } + + density_type oneBefore; + density_type upperBound; + } comp; + comp.oneBefore = _static_cast(0.0); + comp.upperBound = _static_cast(1.0); + result = hlsl::upper_bound(cumProbAccessor, 0u, storedCount, u, comp); + cache.oneBefore = comp.oneBefore; + cache.upperBound = comp.upperBound; + } return result; } @@ -111,16 +189,34 @@ struct CumulativeProbabilitySampler density_type backwardPdf(const codomain_type v) NBL_CONST_MEMBER_FUNC { - density_type retval = density_type(1.0); - if (v < storedCount) - cumProbAccessor.template get(v, retval); - if (v) + NBL_IF_CONSTEXPR(Mode == CumulativeProbabilityMode::EYTZINGER) { - density_type prev; - cumProbAccessor.template get(v - 1u, prev); - retval -= prev; + // Leaves store the CDF directly; the last real leaf is normalized + // to 1.0 and padded leaves (if any) also hold 1.0. + const uint32_t leafBase = 1u << depth; + density_type retval; + cumProbAccessor.template get(leafBase + uint32_t(v), retval); + if (v) + { + density_type prev; + cumProbAccessor.template get(leafBase + uint32_t(v) - 1u, prev); + retval -= prev; + } + return retval; + } + else + { + density_type retval = _static_cast(1.0); + if (v < storedCount) + cumProbAccessor.template get(v, retval); + if (v) + { + density_type prev; + cumProbAccessor.template get(v - 1u, prev); + retval -= prev; + } + return retval; } - return retval; } weight_type backwardWeight(const codomain_type v) NBL_CONST_MEMBER_FUNC @@ -129,7 +225,8 @@ struct CumulativeProbabilitySampler } CumProbAccessor cumProbAccessor; - uint32_t storedCount; + uint32_t storedCount; // N - 1 (last real bucket index) + uint32_t depth; // EYTZINGER only: ceil(log2(N)), iteration count; leafBase = 1 << depth }; } // namespace sampling diff --git a/include/nbl/builtin/hlsl/sampling/cumulative_probability_builder.h b/include/nbl/builtin/hlsl/sampling/cumulative_probability_builder.h index a511fc2d8c..bf98d5ec93 100644 --- a/include/nbl/builtin/hlsl/sampling/cumulative_probability_builder.h +++ b/include/nbl/builtin/hlsl/sampling/cumulative_probability_builder.h @@ -30,6 +30,61 @@ void computeNormalizedCumulativeHistogram(std::span weights, T* outCumP std::for_each(outCumProb, outCumProb + N - 1, [normalizationFactor](T& v) { v *= normalizationFactor; }); } +// Returns the next power of two >= x (and 1 for x <= 1). Matches the leaf-count +// the Eytzinger builder pads to. +inline uint32_t eytzingerLeafCount(uint32_t N) +{ + uint32_t P = 1u; + while (P < N) P <<= 1u; + return P; +} + +// Builds an Eytzinger-layout CDF for cache-friendly binary search on the GPU. +// +// Layout (1-indexed, size 2*P where P = eytzingerLeafCount(N)): +// tree[0] unused (keeps parent/child arithmetic branch-free) +// tree[1 .. P-1] interior split keys; tree[v] == rightmost leaf of v's left subtree +// tree[P .. P+N-1] leaves, tree[P + i] = normalized inclusive scan of weights up to i +// tree[P+N .. 2P-1] padded leaves, all 1.0 (any u < 1.0 routes away from these) +// +// The sampler walks the tree as index = (index << 1) | goRight for ceil(log2(N)) +// iterations. Successive taps within one search land on adjacent memory, so every +// cache line pulled is fully used and the first log2(subgroupSize) iterations are +// served by a single memory transaction per subgroup. +template +void buildEytzinger(std::span weights, T* outTree) +{ + const uint32_t N = static_cast(weights.size()); + if (N == 0) + return; + + const uint32_t P = eytzingerLeafCount(N); + + T total = T(0); + for (uint32_t i = 0; i < N; ++i) + total += weights[i]; + const T rcpTotal = T(1) / total; + + T acc = T(0); + for (uint32_t i = 0; i < N; ++i) + { + acc += weights[i]; + outTree[P + i] = acc * rcpTotal; + } + for (uint32_t i = N; i < P; ++i) + outTree[P + i] = T(1); + + // Bottom-up: each interior node copies the rightmost leaf of its left subtree, + // found by descending left-then-always-right from v. + for (uint32_t v = P; v-- > 1u;) + { + uint32_t r = v << 1u; + while (r < P) + r = (r << 1u) | 1u; + outTree[v] = outTree[r]; + } +} + } // namespace sampling } // namespace hlsl } // namespace nbl diff --git a/include/nbl/builtin/hlsl/sampling/linear.hlsl b/include/nbl/builtin/hlsl/sampling/linear.hlsl index 12602b4a79..cd260cea3a 100644 --- a/include/nbl/builtin/hlsl/sampling/linear.hlsl +++ b/include/nbl/builtin/hlsl/sampling/linear.hlsl @@ -39,14 +39,14 @@ struct Linear // add min to both coefficients so (0,0) input produces a valid uniform sampler // instead of inf normalization (2/0) leading to NaN; negligible for normal inputs const vector2_type safeCoeffs = linearCoeffs + vector2_type(hlsl::numeric_limits::min, hlsl::numeric_limits::min); - // normalize coefficients so that the PDF is simply linearCoeffStart + linearCoeffDiff * x - const scalar_type normFactor = scalar_type(2.0) / (safeCoeffs[0] + safeCoeffs[1]); + // normalize coefficients so that the PDF is simply normalizedCoeffStart + linearCoeffDiff * x + const scalar_type normFactor = _static_cast(2.0) / (safeCoeffs[0] + safeCoeffs[1]); const vector2_type normalized = safeCoeffs * normFactor; - retval.linearCoeffStart = normalized[0]; - retval.linearCoeffEnd = normalized[1]; + retval.normalizedCoeffStart = normalized[0]; + retval.normalizedCoeffEnd = normalized[1]; // precompute for the stable quadratic in generate() retval.squaredCoeffStart = normalized[0] * normalized[0]; - retval.twoTimesDiff = scalar_type(2.0) * (normalized[1] - normalized[0]); + retval.twoTimesDiff = _static_cast(2.0) * (normalized[1] - normalized[0]); return retval; } @@ -57,18 +57,18 @@ struct Linear // Quadratic (1-start)*x^2 + start*x - u = 0; since start >= 0 the stable root is // x = 2u / (start + sqrt(start^2 + 2*diff*u)), which never cancels. const scalar_type sqrtTerm = sqrt(squaredCoeffStart + twoTimesDiff * u); - const scalar_type denom = linearCoeffStart + sqrtTerm; + const scalar_type denom = normalizedCoeffStart + sqrtTerm; // NOTE: floating point can make x slightly > 1 when u~1 and diff < 0; callers needing // non-negative PDF at the boundary should clamp with min(x, 1). const codomain_type x = (u + u) / denom; // diff*x == sqrtTerm - start algebraically (conjugate identity), saves 1 mul - cache.diffTimesX = sqrtTerm - linearCoeffStart; + cache.diffTimesX = sqrtTerm - normalizedCoeffStart; return x; } density_type forwardPdf(const domain_type u, const cache_type cache) NBL_CONST_MEMBER_FUNC { - return linearCoeffStart + cache.diffTimesX; + return normalizedCoeffStart + cache.diffTimesX; } weight_type forwardWeight(const domain_type u, const cache_type cache) NBL_CONST_MEMBER_FUNC @@ -82,8 +82,8 @@ struct Linear // Not used because we already store start for generate(). density_type backwardPdf(const codomain_type x) NBL_CONST_MEMBER_FUNC { - assert(x >= scalar_type(0.0) && x <= scalar_type(1.0)); - return hlsl::mix(linearCoeffStart, linearCoeffEnd, x); + assert(x >= _static_cast(0.0) && x <= _static_cast(1.0)); + return hlsl::mix(normalizedCoeffStart, normalizedCoeffEnd, x); } weight_type backwardWeight(const codomain_type x) NBL_CONST_MEMBER_FUNC @@ -91,8 +91,8 @@ struct Linear return backwardPdf(x); } - scalar_type linearCoeffStart; - scalar_type linearCoeffEnd; + scalar_type normalizedCoeffStart; + scalar_type normalizedCoeffEnd; scalar_type squaredCoeffStart; scalar_type twoTimesDiff; }; diff --git a/include/nbl/builtin/hlsl/sampling/polar_mapping.hlsl b/include/nbl/builtin/hlsl/sampling/polar_mapping.hlsl index 719eaf504f..64d5e69b96 100644 --- a/include/nbl/builtin/hlsl/sampling/polar_mapping.hlsl +++ b/include/nbl/builtin/hlsl/sampling/polar_mapping.hlsl @@ -32,7 +32,7 @@ struct PolarMapping static codomain_type generate(const domain_type u, NBL_REF_ARG(cache_type) cache) { const scalar_type r = hlsl::sqrt(u.x); - const scalar_type phi = scalar_type(2) * numbers::pi * u.y; + const scalar_type phi = _static_cast(2) * numbers::pi * u.y; return vector2_type(r * hlsl::cos(phi), r * hlsl::sin(phi)); } @@ -46,8 +46,8 @@ struct PolarMapping { const scalar_type r2 = p.x * p.x + p.y * p.y; scalar_type phi = hlsl::atan2(p.y, p.x); - phi += hlsl::mix(scalar_type(0), scalar_type(2) * numbers::pi, phi < scalar_type(0)); - return vector2_type(r2, phi * (scalar_type(0.5) * numbers::inv_pi)); + phi += hlsl::mix(_static_cast(0), _static_cast(2) * numbers::pi, phi < _static_cast(0)); + return vector2_type(r2, phi * (_static_cast(0.5) * numbers::inv_pi)); } static density_type forwardPdf(const domain_type u, cache_type cache) { return numbers::inv_pi; } diff --git a/include/nbl/builtin/hlsl/sampling/projected_spherical_rectangle.hlsl b/include/nbl/builtin/hlsl/sampling/projected_spherical_rectangle.hlsl index 5bf652cb4c..1ad8b5462e 100644 --- a/include/nbl/builtin/hlsl/sampling/projected_spherical_rectangle.hlsl +++ b/include/nbl/builtin/hlsl/sampling/projected_spherical_rectangle.hlsl @@ -25,147 +25,166 @@ namespace sampling // 2. Warp uniform [0,1]^2 through the bilinear to importance-sample NdotL // 3. Feed the warped UV into the solid angle sampler to get a rect offset // 4. PDF = (1/SolidAngle) * bilinearPdf -// -// Template parameter `UsePdfAsWeight`: when true (default), forwardWeight/backwardWeight -// return the PDF instead of the projected-solid-angle MIS weight. -// TODO: the projected-solid-angle MIS weight (UsePdfAsWeight=false) has been shown to be -// poor in practice. Once confirmed by testing, remove the false path and stop storing -// receiverNormal, receiverWasBSDF, and rcpProjSolidAngle as members. template struct ProjectedSphericalRectangle { - using scalar_type = T; - using vector2_type = vector; - using vector3_type = vector; - using vector4_type = vector; - - // BackwardTractableSampler concept types - using domain_type = vector2_type; - using codomain_type = vector3_type; - using density_type = scalar_type; - using weight_type = density_type; - - struct cache_type - { - scalar_type abs_cos_theta; - vector2_type warped; - typename Bilinear::cache_type bilinearCache; - }; - - // NOTE: produces a degenerate (all-zero) bilinear patch when the receiver normal faces away - // from all four rectangle vertices, resulting in NaN PDFs (0 * inf). Callers must ensure - // at least one vertex has positive projection onto the receiver normal. - static ProjectedSphericalRectangle create(NBL_CONST_REF_ARG(shapes::SphericalRectangle) shape, const vector3_type observer, const vector3_type _receiverNormal, const bool _receiverWasBSDF) - { - ProjectedSphericalRectangle retval; - const vector3_type n = hlsl::mul(shape.basis, _receiverNormal); - retval.localReceiverNormal = n; - retval.receiverWasBSDF = _receiverWasBSDF; - - // Compute solid angle and get r0 in local frame (before z-flip) - const typename shapes::SphericalRectangle::solid_angle_type sa = shape.solidAngle(observer); - const vector3_type r0 = sa.r0; - - // All 4 corners share r0.z; x is r0.x or r0.x+ex, y is r0.y or r0.y+ey - const scalar_type r1x = r0.x + shape.extents.x; - const scalar_type r1y = r0.y + shape.extents.y; - - // Unnormalized dots: dot(corner_i, n) - const scalar_type base_dot = hlsl::dot(r0, n); - const scalar_type dx = shape.extents.x * n.x; - const scalar_type dy = shape.extents.y * n.y; - const vector4_type dots = vector4_type(base_dot, base_dot + dx, base_dot + dy, base_dot + dx + dy); - - // Squared lengths of each corner - const scalar_type r0zSq = r0.z * r0.z; - const vector4_type lenSq = vector4_type( - r0.x * r0.x + r0.y * r0.y, - r1x * r1x + r0.y * r0.y, - r0.x * r0.x + r1y * r1y, - r1x * r1x + r1y * r1y - ) + hlsl::promote(r0zSq); - - // dot(normalize(corner), n) = dot(corner, n) * rsqrt(lenSq) - // Bilinear corners: [0]=v00 [1]=v10 [2]=v01 [3]=v11 - const scalar_type minimumProjSolidAngle = 0.0; - const vector4_type bxdfPdfAtVertex = math::conditionalAbsOrMax(_receiverWasBSDF, - dots * hlsl::rsqrt(lenSq), - hlsl::promote(minimumProjSolidAngle)); - retval.bilinearPatch = Bilinear::create(bxdfPdfAtVertex); - - // Reuse the already-computed solid_angle_type to avoid recomputing mul(basis, origin - observer) - retval.sphrect = SphericalRectangle::create(sa, shape.extents); - retval.rcpSolidAngle = retval.sphrect.solidAngle > scalar_type(0.0) ? scalar_type(1.0) / retval.sphrect.solidAngle : scalar_type(0.0); - - NBL_IF_CONSTEXPR(!UsePdfAsWeight) - { - const scalar_type projSA = shape.projectedSolidAngleFromLocal(r0, n); - retval.rcpProjSolidAngle = projSA > scalar_type(0.0) ? scalar_type(1.0) / projSA : scalar_type(0.0); - } - return retval; - } - - // returns a normalized 3D direction in the local frame - codomain_type generate(const domain_type u, NBL_REF_ARG(cache_type) cache) NBL_CONST_MEMBER_FUNC - { - Bilinear bilinear = bilinearPatch; - cache.warped = bilinear.generate(u, cache.bilinearCache); - typename SphericalRectangle::cache_type sphrectCache; - const vector3_type dir = sphrect.generate(cache.warped, sphrectCache); - cache.abs_cos_theta = bilinear.forwardWeight(u, cache.bilinearCache); - return dir; - } - - // returns a 2D offset on the rectangle surface from the r0 corner - vector2_type generateSurfaceOffset(const domain_type u, NBL_REF_ARG(cache_type) cache) NBL_CONST_MEMBER_FUNC - { - Bilinear bilinear = bilinearPatch; - cache.warped = bilinear.generate(u, cache.bilinearCache); - typename SphericalRectangle::cache_type sphrectCache; - const vector2_type sampleOffset = sphrect.generateSurfaceOffset(cache.warped, sphrectCache); - cache.abs_cos_theta = bilinear.forwardWeight(u, cache.bilinearCache); - return sampleOffset; - } - - density_type forwardPdf(const domain_type u, const cache_type cache) NBL_CONST_MEMBER_FUNC - { - return rcpSolidAngle * bilinearPatch.forwardPdf(u, cache.bilinearCache); - } - - weight_type forwardWeight(const domain_type u, const cache_type cache) NBL_CONST_MEMBER_FUNC - { - if (UsePdfAsWeight) - return forwardPdf(u, cache); - return cache.abs_cos_theta * rcpProjSolidAngle; - } - - // `p` is the normalized [0,1]^2 position on the rectangle - density_type backwardPdf(const vector2_type p) NBL_CONST_MEMBER_FUNC - { - return rcpSolidAngle * bilinearPatch.backwardPdf(p); - } - - weight_type backwardWeight(const vector2_type p) NBL_CONST_MEMBER_FUNC - { - NBL_IF_CONSTEXPR(UsePdfAsWeight) - return backwardPdf(p); - const scalar_type minimumProjSolidAngle = 0.0; - // Reconstruct local direction from normalized rect position - const vector3_type localDir = hlsl::normalize(sphrect.r0 + vector3_type( - p.x * sphrect.extents.x, - p.y * sphrect.extents.y, - scalar_type(0) - )); - const scalar_type abs_cos_theta = math::conditionalAbsOrMax(receiverWasBSDF, hlsl::dot(localReceiverNormal, localDir), minimumProjSolidAngle); - return abs_cos_theta * rcpProjSolidAngle; - } - - sampling::SphericalRectangle sphrect; - Bilinear bilinearPatch; - scalar_type rcpSolidAngle; - scalar_type rcpProjSolidAngle; - vector3_type localReceiverNormal; - bool receiverWasBSDF; + using scalar_type = T; + using vector2_type = vector; + using vector3_type = vector; + using vector4_type = vector; + + // BackwardTractableSampler concept types + using domain_type = vector2_type; + using codomain_type = vector3_type; + using density_type = scalar_type; + using weight_type = density_type; + + struct cache_type + { + typename Bilinear::cache_type bilinearCache; + vector3_type L; // TODO: same as projected triangle w.r.t. UsePdfAsWeight==false + }; + + // Shared finalization for both create() overloads: builds the bilinear patch, the inner sphrect + // sampler, and the UsePdfAsWeight=false extras. The two overloads differ only in how they + // compute bxdfPdfAtVertex (worldspace corner normalizations vs local-frame rsqrt(lenSq)). + static ProjectedSphericalRectangle create(NBL_CONST_REF_ARG(shapes::SphericalRectangle) shape, NBL_CONST_REF_ARG(typename shapes::SphericalRectangle::solid_angle_type) sa, + const vector4_type bxdfPdfAtVertex, const vector3_type _receiverNormal) + { + ProjectedSphericalRectangle retval; + retval.bilinearPatch = Bilinear::create(bxdfPdfAtVertex); + // Reuse solid_angle_type to avoid recomputing mul(basis, origin - observer) + retval.sphrect = SphericalRectangle::create(shape.basis, sa, shape.extents); + NBL_IF_CONSTEXPR(!UsePdfAsWeight) + { + retval.receiverNormal = _receiverNormal; + const vector3_type nLocal = hlsl::mul(shape.basis, _receiverNormal); + retval.projSolidAngle = shape.projectedSolidAngleFromLocal(sa.r0, nLocal); + } + return retval; + } + + // Shouldn't produce NAN if all corners have 0 proj solid angle due to min density adds/clamps in the linear sampler + static ProjectedSphericalRectangle create(NBL_CONST_REF_ARG(shapes::CompressedSphericalRectangle) compressed, const vector3_type observer, const vector3_type _receiverNormal, const bool _receiverWasBSDF) + { + // 4 normalized worldspace corners dotted with the worldspace receiver normal. Avoids the + // mul(basis, receiverNormal) data dependency of the uncompressed overload so these 4 + // normalize+dot chains can pipeline alongside the basis/solid-angle work below. + const vector3_type c0 = compressed.origin - observer; + const vector3_type c1 = c0 + compressed.right; + const vector3_type c2 = c0 + compressed.up; + const vector3_type c3 = c1 + compressed.up; + const vector4_type dots = vector4_type( + hlsl::dot(hlsl::normalize(c0), _receiverNormal), + hlsl::dot(hlsl::normalize(c1), _receiverNormal), + hlsl::dot(hlsl::normalize(c2), _receiverNormal), + hlsl::dot(hlsl::normalize(c3), _receiverNormal)); + const scalar_type minimumProjSolidAngle = _static_cast(0.0); + const vector4_type bxdfPdfAtVertex = math::conditionalAbsOrMax(_receiverWasBSDF, dots, hlsl::promote(minimumProjSolidAngle)); + + const shapes::SphericalRectangle shape = shapes::SphericalRectangle::create(compressed); + const typename shapes::SphericalRectangle::solid_angle_type sa = shape.solidAngle(observer); + return create(shape, sa, bxdfPdfAtVertex, _receiverNormal); + } + + // Shouldn't produce NAN if all corners have 0 proj solid angle due to min density adds/clamps in the linear sampler + static ProjectedSphericalRectangle create(NBL_CONST_REF_ARG(shapes::SphericalRectangle) shape, const vector3_type observer, const vector3_type _receiverNormal, const bool _receiverWasBSDF) + { + // Local-frame path: unnormalized dot(corner_i, n) with n = basis * receiverNormal, then + // a single rsqrt(lenSq) for all 4 corner normalizations at once. + const vector3_type n = hlsl::mul(shape.basis, _receiverNormal); + const typename shapes::SphericalRectangle::solid_angle_type sa = shape.solidAngle(observer); + const vector3_type r0 = sa.r0; + + // All 4 corners share r0.z; x is r0.x or r0.x+ex, y is r0.y or r0.y+ey + const scalar_type r1x = r0.x + shape.extents.x; + const scalar_type r1y = r0.y + shape.extents.y; + const scalar_type base_dot = hlsl::dot(r0, n); + const scalar_type dx = shape.extents.x * n.x; + const scalar_type dy = shape.extents.y * n.y; + const vector4_type dots = vector4_type(base_dot, base_dot + dx, base_dot + dy, base_dot + dx + dy); + + const scalar_type r0zSq = r0.z * r0.z; + const vector4_type lenSq = vector4_type( + r0.x * r0.x + r0.y * r0.y, + r1x * r1x + r0.y * r0.y, + r0.x * r0.x + r1y * r1y, + r1x * r1x + r1y * r1y) + + hlsl::promote(r0zSq); + + // dot(normalize(corner), n) = dot(corner, n) * rsqrt(lenSq). Bilinear corners: [0]=v00 [1]=v10 [2]=v01 [3]=v11 + const scalar_type minimumProjSolidAngle = _static_cast(0.0); + const vector4_type bxdfPdfAtVertex = math::conditionalAbsOrMax(_receiverWasBSDF, + dots * hlsl::rsqrt(lenSq), + hlsl::promote(minimumProjSolidAngle)); + + return create(shape, sa, bxdfPdfAtVertex, _receiverNormal); + } + + // returns a normalized 3D direction in the local frame + codomain_type generateNormalizedLocal(const domain_type u, NBL_REF_ARG(cache_type) cache, NBL_REF_ARG(scalar_type) hitDist) NBL_CONST_MEMBER_FUNC + { + const vector2_type warped = bilinearPatch.generate(u, cache.bilinearCache); + typename SphericalRectangle::cache_type sphrectCache; // there's nothing in the cache + const vector3_type dir = sphrect.generateNormalizedLocal(warped, sphrectCache, hitDist); + NBL_IF_CONSTEXPR(!UsePdfAsWeight) + cache.L = hlsl::mul(hlsl::transpose(sphrect.basis), dir); + return dir; + } + + // returns a unnormalized 3D direction in the global frame + codomain_type generateUnnormalized(const domain_type u, NBL_REF_ARG(cache_type) cache) NBL_CONST_MEMBER_FUNC + { + const vector2_type warped = bilinearPatch.generate(u, cache.bilinearCache); + typename SphericalRectangle::cache_type sphrectCache; // there's nothing in the cache + const vector3_type dir = sphrect.generateUnnormalized(warped, sphrectCache); + NBL_IF_CONSTEXPR(!UsePdfAsWeight) + cache.L = dir * hlsl::rsqrt(hlsl::dot(dir, dir)); + return dir; + } + + // returns a normalized 3D direction in the global frame + codomain_type generate(const domain_type u, NBL_REF_ARG(cache_type) cache) NBL_CONST_MEMBER_FUNC + { + const vector2_type warped = bilinearPatch.generate(u, cache.bilinearCache); + typename SphericalRectangle::cache_type sphrectCache; // there's nothing in the cache + const vector3_type dir = sphrect.generate(warped, sphrectCache); + NBL_IF_CONSTEXPR(!UsePdfAsWeight) + cache.L = dir; + return dir; + } + + density_type forwardPdf(const domain_type u, const cache_type cache) NBL_CONST_MEMBER_FUNC + { + return bilinearPatch.forwardPdf(u, cache.bilinearCache) / sphrect.solidAngle; + } + + weight_type forwardWeight(const domain_type u, const cache_type cache) NBL_CONST_MEMBER_FUNC + { + NBL_IF_CONSTEXPR(UsePdfAsWeight) + return forwardPdf(u, cache); + return backwardWeight(cache.L); + } + + weight_type backwardWeight(const codomain_type L) NBL_CONST_MEMBER_FUNC + { + NBL_IF_CONSTEXPR(UsePdfAsWeight) + { +#if 0 + const vector2_type warped = sphrect.generateInvese(L); // TODO: implement `generateInverse` + return bilinearPatch.backwardPdf(warped) / sphrect.solidAngle; +#endif + return hlsl::numeric_limits::quiet_NaN; + } + // make the MIS weight always abs because even when receiver is a BRDF, the samples in lower hemisphere will get killed and MIS weight never used + return hlsl::abs(hlsl::dot(L, receiverNormal)) / projSolidAngle; + } + + sampling::SphericalRectangle sphrect; + Bilinear bilinearPatch; + // TODO: same as projected triangle w.r.t. UsePdfAsWeight==false + vector3_type receiverNormal; + scalar_type projSolidAngle; }; } // namespace sampling diff --git a/include/nbl/builtin/hlsl/sampling/projected_spherical_triangle.hlsl b/include/nbl/builtin/hlsl/sampling/projected_spherical_triangle.hlsl index c4bc5fcea8..6eddd03e8a 100644 --- a/include/nbl/builtin/hlsl/sampling/projected_spherical_triangle.hlsl +++ b/include/nbl/builtin/hlsl/sampling/projected_spherical_triangle.hlsl @@ -47,73 +47,68 @@ struct ProjectedSphericalTriangle struct cache_type { - scalar_type abs_cos_theta; - vector2_type warped; typename Bilinear::cache_type bilinearCache; + vector3_type L; // TODO: erase when UsePdfAsWeight==false }; - // NOTE: produces a degenerate (all-zero) bilinear patch when the receiver normal faces away - // from all three triangle vertices, resulting in NaN PDFs (0 * inf). Callers must ensure - // at least one vertex has positive projection onto the receiver normal. + // Shouldn't produce NAN if all corners have 0 proj solid angle due to min density adds/clamps in the linear sampler static ProjectedSphericalTriangle create(NBL_REF_ARG(shapes::SphericalTriangle) shape, const vector3_type _receiverNormal, const bool _receiverWasBSDF) { ProjectedSphericalTriangle retval; - retval.sphtri = SphericalTriangle::create(shape); - retval.receiverNormal = _receiverNormal; - retval.receiverWasBSDF = _receiverWasBSDF; + retval.sphtri = SphericalTriangle::create(shape); const scalar_type minimumProjSolidAngle = 0.0; matrix m = matrix(shape.vertices[0], shape.vertices[1], shape.vertices[2]); const vector3_type bxdfPdfAtVertex = math::conditionalAbsOrMax(_receiverWasBSDF, hlsl::mul(m, _receiverNormal), hlsl::promote(minimumProjSolidAngle)); retval.bilinearPatch = Bilinear::create(bxdfPdfAtVertex.yyxz); - const scalar_type projSA = shape.projectedSolidAngle(_receiverNormal); - retval.rcpProjSolidAngle = projSA > scalar_type(0.0) ? scalar_type(1.0) / projSA : scalar_type(0.0); + NBL_IF_CONSTEXPR(!UsePdfAsWeight) + { + retval.receiverNormal = _receiverNormal; + // prevent division of 0 cosine by 0 + retval.projSolidAngle = max(shape.projectedSolidAngle(_receiverNormal),numeric_limits::min); + } return retval; } codomain_type generate(const domain_type u, NBL_REF_ARG(cache_type) cache) NBL_CONST_MEMBER_FUNC { - Bilinear bilinear = bilinearPatch; - cache.warped = bilinear.generate(u, cache.bilinearCache); - typename SphericalTriangle::cache_type sphtriCache; - const vector3_type L = sphtri.generate(cache.warped, sphtriCache); - cache.abs_cos_theta = bilinear.forwardWeight(u, cache.bilinearCache); + const vector2_type warped = bilinearPatch.generate(u, cache.bilinearCache); + typename SphericalTriangle::cache_type sphtriCache; // PDF is constant caches nothing, its empty + const codomain_type L = sphtri.generate(warped, sphtriCache); + NBL_IF_CONSTEXPR(!UsePdfAsWeight) + cache.L = L; return L; } density_type forwardPdf(const domain_type u, const cache_type cache) NBL_CONST_MEMBER_FUNC { - return sphtri.rcpSolidAngle * bilinearPatch.forwardPdf(u, cache.bilinearCache); + return sphtri.rcpSolidAngle * bilinearPatch.forwardPdf(u,cache.bilinearCache); } weight_type forwardWeight(const domain_type u, const cache_type cache) NBL_CONST_MEMBER_FUNC { - if (UsePdfAsWeight) - return forwardPdf(u, cache); - return cache.abs_cos_theta * rcpProjSolidAngle; - } - - density_type backwardPdf(const codomain_type L) NBL_CONST_MEMBER_FUNC - { - const vector2_type u = sphtri.generateInverse(L); - return sphtri.rcpSolidAngle * bilinearPatch.backwardPdf(u); + NBL_IF_CONSTEXPR (UsePdfAsWeight) + return forwardPdf(u,cache); + return backwardWeight(cache.L); } weight_type backwardWeight(const codomain_type L) NBL_CONST_MEMBER_FUNC { - NBL_IF_CONSTEXPR(UsePdfAsWeight) - return backwardPdf(L); - const scalar_type minimumProjSolidAngle = 0.0; - const scalar_type abs_cos_theta = math::conditionalAbsOrMax(receiverWasBSDF, hlsl::dot(receiverNormal, L), minimumProjSolidAngle); - return abs_cos_theta * rcpProjSolidAngle; + NBL_IF_CONSTEXPR (UsePdfAsWeight) + { + const vector2_type u = sphtri.generateInverse(L); + return sphtri.rcpSolidAngle * bilinearPatch.backwardPdf(u); + } + // make the MIS weight always abs because even when receiver is a BRDF, the samples in lower hemisphere will get killed and MIS weight never used + return hlsl::abs(hlsl::dot(L,receiverNormal))/projSolidAngle; } - sampling::SphericalTriangle sphtri; + sampling::SphericalTriangle sphtri; Bilinear bilinearPatch; - scalar_type rcpProjSolidAngle; + // TODO: erase when UsePdfAsWeight==false vector3_type receiverNormal; - bool receiverWasBSDF; + scalar_type projSolidAngle; }; } // namespace sampling diff --git a/include/nbl/builtin/hlsl/sampling/spherical_rectangle.hlsl b/include/nbl/builtin/hlsl/sampling/spherical_rectangle.hlsl index 131cc92d70..ba088a0e29 100644 --- a/include/nbl/builtin/hlsl/sampling/spherical_rectangle.hlsl +++ b/include/nbl/builtin/hlsl/sampling/spherical_rectangle.hlsl @@ -19,156 +19,212 @@ namespace hlsl namespace sampling { -template +// UseRealSinCos=true (default): math::sincos uses real sin+cos; full precision near au=n*pi, Jacobian test passes. +// UseRealSinCos=false : math::sincos uses cos + sqrt(1-c*c); saves one special-function op but loses mantissa as |cos_au| -> 1. +template struct SphericalRectangle { - using scalar_type = T; - using vector2_type = vector; - using vector3_type = vector; - using vector4_type = vector; - - // BackwardTractableSampler concept types - using domain_type = vector2_type; - using codomain_type = vector3_type; - using density_type = scalar_type; - using weight_type = density_type; - - struct cache_type {}; - - NBL_CONSTEXPR_STATIC_INLINE scalar_type ClampEps = 1e-5; - - static SphericalRectangle create(NBL_CONST_REF_ARG(shapes::SphericalRectangle) rect, const vector3_type observer) - { - return create(rect.solidAngle(observer), rect.extents); - } - - static SphericalRectangle create(NBL_CONST_REF_ARG(typename shapes::SphericalRectangle::solid_angle_type) sa, const vector2_type _extents) - { - SphericalRectangle retval; - retval.r0 = sa.r0; - retval.extents = _extents; - - retval.solidAngle = sa.value; - retval.b0 = sa.n_z[0]; - retval.b1 = sa.n_z[2]; - - math::sincos_accumulator angle_adder = math::sincos_accumulator::create(sa.cosGamma[2]); - angle_adder.addCosine(sa.cosGamma[3]); - retval.k = scalar_type(2.0) * numbers::pi - angle_adder.getSumOfArccos(); - - return retval; - } - - // Create directly from a local-frame corner position and rectangle extents. - // Use when you already know r0 (e.g. from a gnomonic projection) and don't - // need the shapes::SphericalRectangle + solidAngle(observer) roundtrip. - static SphericalRectangle create(const vector3_type _r0, const vector2_type _extents) - { - // Same math as shapes::SphericalRectangle::solidAngle() but without - // the mul(basis, origin - observer) step since we already have r0. - typename shapes::SphericalRectangle::solid_angle_type sa; - sa.r0 = _r0; - - const scalar_type zSq = _r0.z * _r0.z; - const vector4_type denorm_n_z = vector4_type(-_r0.y, _r0.x + _extents.x, _r0.y + _extents.y, -_r0.x); - sa.n_z = denorm_n_z * hlsl::rsqrt(hlsl::promote(zSq) + denorm_n_z * denorm_n_z); - sa.cosGamma = vector4_type( - -sa.n_z[0] * sa.n_z[1], -sa.n_z[1] * sa.n_z[2], - -sa.n_z[2] * sa.n_z[3], -sa.n_z[3] * sa.n_z[0]); - - math::sincos_accumulator acc = math::sincos_accumulator::create(sa.cosGamma[0]); - acc.addCosine(sa.cosGamma[1]); - acc.addCosine(sa.cosGamma[2]); - acc.addCosine(sa.cosGamma[3]); - sa.value = acc.getSumOfArccos() - scalar_type(2.0) * numbers::pi; - - return create(sa, _extents); - } - - // shared core of generate and generateSurfaceOffset - // returns (xu, hv, d) packed into a vector3; caller derives either 2D offset or 3D direction - vector3_type __generate(const domain_type u) NBL_CONST_MEMBER_FUNC - { - // algorithm needs r0.z < 0; use -abs(r0.z) without storing the flip - const scalar_type negAbsR0z = -hlsl::abs(r0.z); - const scalar_type r0zSq = r0.z * r0.z; - const vector2_type r1 = vector2_type(r0.x + extents.x, r0.y + extents.y); - - const scalar_type au = u.x * solidAngle + k; - const scalar_type cos_au = hlsl::cos(au); - const scalar_type numerator = b1 - cos_au * b0; - // (1-cos)*(1+cos) avoids catastrophic cancellation of 1-cos^2 when cos_au is near +/-1 - const scalar_type sin_au_sq = (scalar_type(1.0) - cos_au) * (scalar_type(1.0) + cos_au); - const scalar_type absNegFu = hlsl::abs(numerator) * hlsl::rsqrt(sin_au_sq); - const scalar_type rcpCu_2 = hlsl::max(absNegFu * absNegFu + b0 * b0, scalar_type(1.0)); - // sign(negFu) = sign(numerator) * sign(sin(au)); sin(au) < 0 iff au > PI - const scalar_type negFuSign = hlsl::select((au > numbers::pi) != (numerator < scalar_type(0.0)), scalar_type(-1.0), scalar_type(1.0)); - scalar_type xu = negAbsR0z * negFuSign * hlsl::rsqrt(rcpCu_2 - scalar_type(1.0)); - xu = hlsl::clamp(xu, r0.x, r1.x); // avoid Infs - const scalar_type d_2 = xu * xu + r0zSq; - const scalar_type d = hlsl::sqrt(d_2); - - const scalar_type h0 = r0.y * hlsl::rsqrt(d_2 + r0.y * r0.y); - const scalar_type h1 = r1.y * hlsl::rsqrt(d_2 + r1.y * r1.y); - const scalar_type hv = h0 + u.y * (h1 - h0); - - return vector3_type(xu, hv, d); - } - - // returns a normalized 3D direction in the local frame with correct r0.z sign - codomain_type generate(const domain_type u, NBL_REF_ARG(cache_type) cache) NBL_CONST_MEMBER_FUNC - { - const vector3_type core = __generate(u); - const scalar_type xu = core.x; - const scalar_type hv = core.y; - const scalar_type d = core.z; - const scalar_type hv2 = hv * hv; - const scalar_type cosElevation = hlsl::sqrt(hlsl::max(scalar_type(1.0) - hv2, scalar_type(0.0))); - const scalar_type rcpD = scalar_type(1.0) / d; - - return vector3_type(xu * cosElevation * rcpD, hv, r0.z * cosElevation * rcpD); - } - - // returns a 2D offset on the rectangle surface from the r0 corner - vector2_type generateSurfaceOffset(const domain_type u, NBL_REF_ARG(cache_type) cache) NBL_CONST_MEMBER_FUNC - { - const vector3_type core = __generate(u); - const scalar_type xu = core.x; - const scalar_type hv = core.y; - const scalar_type d = core.z; - const scalar_type r1y = r0.y + extents.y; - const scalar_type hv2 = hv * hv; - const scalar_type yv = hlsl::mix(r1y, (hv * d) / hlsl::sqrt(scalar_type(1.0) - hv2), hv2 < scalar_type(1.0) - ClampEps); - - return vector2_type((xu - r0.x), (yv - r0.y)); - } - - density_type forwardPdf(const domain_type u, const cache_type cache) NBL_CONST_MEMBER_FUNC - { - return scalar_type(1.0) / solidAngle; - } - - weight_type forwardWeight(const domain_type u, const cache_type cache) NBL_CONST_MEMBER_FUNC - { - return forwardPdf(u, cache); - } - - density_type backwardPdf(const codomain_type L) NBL_CONST_MEMBER_FUNC - { - return scalar_type(1.0) / solidAngle; - } - - weight_type backwardWeight(const codomain_type L) NBL_CONST_MEMBER_FUNC - { - return backwardPdf(L); - } - - scalar_type solidAngle; - scalar_type k; - scalar_type b0; - scalar_type b1; - vector3_type r0; - vector2_type extents; + using scalar_type = T; + using vector2_type = vector; + using vector3_type = vector; + using vector4_type = vector; + using matrix3x3_type = matrix; + + // BackwardTractableSampler concept types + using domain_type = vector2_type; + using codomain_type = vector3_type; + using density_type = scalar_type; + using weight_type = density_type; + + struct cache_type + { + }; + + static SphericalRectangle create(NBL_CONST_REF_ARG(shapes::SphericalRectangle) rect, const vector3_type observer) + { + return create(rect.basis, rect.solidAngle(observer), rect.extents); + } + + static SphericalRectangle create(const matrix3x3_type _basis, NBL_CONST_REF_ARG(typename shapes::SphericalRectangle::solid_angle_type) sa, const vector2_type _extents) + { + SphericalRectangle retval; + retval.basis = _basis; + retval.r0 = sa.r0; + retval.extents = _extents; + + retval.solidAngle = sa.value; + retval.b0 = sa.n_z[0]; + retval.b1 = sa.n_z[2]; + + math::sincos_accumulator angle_adder = math::sincos_accumulator::create(sa.cosGamma[2]); + angle_adder.addCosine(sa.cosGamma[3]); + retval.k = _static_cast(2.0) * numbers::pi - angle_adder.getSumOfArccos(); + + return retval; + } + + // Create directly from a local-frame corner position and rectangle extents. + // Use when you already know r0 (e.g. from a gnomonic projection) and don't + // need the shapes::SphericalRectangle + solidAngle(observer) roundtrip. + static SphericalRectangle create(const matrix3x3_type _basis, const vector3_type _r0, const vector2_type _extents) + { + // Same math as shapes::SphericalRectangle::solidAngle() but without + // the mul(basis, origin - observer) step since we already have r0. + typename shapes::SphericalRectangle::solid_angle_type sa; + sa.r0 = _r0; + + const scalar_type zSq = _r0.z * _r0.z; + const vector4_type denorm_n_z = vector4_type(-_r0.y, _r0.x + _extents.x, _r0.y + _extents.y, -_r0.x); + sa.n_z = denorm_n_z * hlsl::rsqrt(hlsl::promote(zSq) + denorm_n_z * denorm_n_z); + sa.cosGamma = vector4_type( + -sa.n_z[0] * sa.n_z[1], -sa.n_z[1] * sa.n_z[2], + -sa.n_z[2] * sa.n_z[3], -sa.n_z[3] * sa.n_z[0]); + + math::sincos_accumulator acc = math::sincos_accumulator::create(sa.cosGamma[0]); + acc.addCosine(sa.cosGamma[1]); + acc.addCosine(sa.cosGamma[2]); + acc.addCosine(sa.cosGamma[3]); + sa.value = acc.getSumOfArccos() - _static_cast(2.0) * numbers::pi; + + return create(_basis, sa, _extents); + } + + // shared core of generate and generateSurfaceOffset + // returns (xu, hv, d) packed into a vector3; caller derives either 2D offset or 3D direction + struct SCommonGen + { + scalar_type xu; + scalar_type d2; + scalar_type hv; + scalar_type cosElevation2; + }; + SCommonGen __generate(const domain_type u) NBL_CONST_MEMBER_FUNC + { + SCommonGen retval; + + // algorithm needs r0.z < 0; use -abs(r0.z) without storing the flip + const scalar_type negAbsR0z = -hlsl::abs(r0.z); + const scalar_type r0zSq = r0.z * r0.z; + const vector2_type r1 = vector2_type(r0.x + extents.x, r0.y + extents.y); + + // au in [0, 4*pi] since u.x in [0,1], solidAngle <= 2*pi, k <= 2*pi. + // The sqrt path in math::sincos recovers sin sign via sign(theta), which only holds for theta in [-pi,pi]. + // Real sin/cos handle any range, so only wrap on the sqrt path (compile-time branch folds away). + scalar_type au = u.x * solidAngle + k; + NBL_IF_CONSTEXPR(!UseRealSinCos) + { + // au in [0, 4*pi] -> peel at most two 2*pi periods to land in (-pi, pi]. + au = hlsl::select(au > numbers::pi, au - _static_cast(2.0) * numbers::pi, au); + au = hlsl::select(au > numbers::pi, au - _static_cast(2.0) * numbers::pi, au); + } + scalar_type sin_au, cos_au; + math::sincos(au, sin_au, cos_au); + const scalar_type numerator = b1 - cos_au * b0; + // negFu carries the sign directly (numerator and sin_au both signed), so xu's sign drops + // out of a single multiply + hlsl::sign. + const scalar_type negFu = numerator / sin_au; + const scalar_type rcpCu_2 = hlsl::max(negFu * negFu + b0 * b0, _static_cast(1.0)); + retval.xu = negAbsR0z * hlsl::sign(negFu) * hlsl::rsqrt(rcpCu_2 - _static_cast(1.0)); + retval.xu = hlsl::clamp(retval.xu, r0.x, r1.x); // avoid Infs + retval.d2 = retval.xu * retval.xu + r0zSq; + + const scalar_type h0 = r0.y * hlsl::rsqrt(retval.d2 + r0.y * r0.y); + const scalar_type h1 = r1.y * hlsl::rsqrt(retval.d2 + r1.y * r1.y); + retval.hv = h0 + u.y * (h1 - h0); + retval.cosElevation2 = _static_cast(1.0) - hlsl::min(retval.hv * retval.hv, 1); + + return retval; + } + + // returns a normalized 3D direction in the local frame with correct r0.z sign + vector3_type generateNormalizedLocal(const domain_type u, NBL_REF_ARG(cache_type) cache, NBL_REF_ARG(scalar_type) hitDist) NBL_CONST_MEMBER_FUNC + { + const SCommonGen core = __generate(u); + scalar_type cosElevationOverD = hlsl::rsqrt(core.d2 / core.cosElevation2); + // TODO: or shall we do some other more sophisticated clamp or correction? Is this even the right one to use? + cosElevationOverD = hlsl::select(hlsl::isnan(cosElevationOverD), 1.f, cosElevationOverD); + + // TODO: investigate if due to precision we need to compute this as a `sqrt` then the quantity being computed is `core.cosElevation2/core.d2` + // which what alss `generateLocalBasisXY` needs, in which case `__generate` can already compute it and return it as `hitDist2` + hitDist = 1.f / cosElevationOverD; + + const vector3_type retval = vector3_type(core.xu / hitDist, core.hv, r0.z / hitDist); + assert(!hlsl::isnan(computeHitT(retval))); + return retval; + } + + codomain_type generate(const domain_type u, NBL_REF_ARG(cache_type) cache) NBL_CONST_MEMBER_FUNC + { + scalar_type dummy; + const vector3_type localL = generateNormalizedLocal(u, cache, dummy); + // could return `hlsl::mul(hlsl::tranpose(basis),localL)` or just this + return basis[0] * localL[0] + basis[1] * localL[1] + basis[2] * localL[2]; + } + + // utility to determine maxT for a ray from L shot from the origin which we're sure intersects the rectangle + scalar_type computeHitT(const vector3_type L) + { + const scalar_type retval = hlsl::abs(r0.z / hlsl::dot(L, basis[2])); + { + const vector3_type hitPointRelative = L * retval; + const vector2_type uv = mul(basis, L).xy - r0.xy; + assert(uv[0] >= 0.0 && uv[0] <= rectExtents[0]); + assert(uv[1] >= 0.0 && uv[1] <= rectExtents[1]); + } + return retval; + } + + // returns a 2D offset on the rectangle surface including the r0 corner -useful for generating unnormalized worldspace L + vector2_type generateLocalBasisXY(const domain_type u, NBL_REF_ARG(cache_type) cache) NBL_CONST_MEMBER_FUNC + { + const SCommonGen core = __generate(u); + const scalar_type r1y = r0.y + extents.y; + // TODO: see if we can compute this direct from definition of `d_2` ignoring the clamp on `xu` and `cosElevation2` + const scalar_type yv = core.hv * hlsl::rsqrt(core.cosElevation2 / core.d2); + + // fun fact, when one of the operands to min or max is NaN, the SPIR-V builtin will select the other one + // TODO: maybe try just `min(yv,ry1)` + const vector2_type retval = vector2_type(core.xu, hlsl::clamp(yv, r0.y, r1y)); + assert(retval[0] >= r0.x && retval[1] >= r0.y); + assert(retval[0] <= r0.x + rectExtents[0] && retval[1] <= r0.y + rectExtents[1]); + return retval; + } + + // its basically the hitpoint minus the observer origin + codomain_type generateUnnormalized(const domain_type u, NBL_REF_ARG(cache_type) cache) NBL_CONST_MEMBER_FUNC + { + const vector2_type localXY = generateLocalBasisXY(u, cache); + // the `localXY` already contains r0.xy + return basis[0] * localXY[0] + basis[1] * localXY[1] + basis[2] * r0.z; + } + + density_type forwardPdf(const domain_type u, const cache_type cache) NBL_CONST_MEMBER_FUNC + { + return _static_cast(1.0) / solidAngle; + } + + weight_type forwardWeight(const domain_type u, const cache_type cache) NBL_CONST_MEMBER_FUNC + { + return forwardPdf(u, cache); + } + + density_type backwardPdf(const codomain_type L) NBL_CONST_MEMBER_FUNC + { + return _static_cast(1.0) / solidAngle; + } + + weight_type backwardWeight(const codomain_type L) NBL_CONST_MEMBER_FUNC + { + return backwardPdf(L); + } + + matrix3x3_type basis; + vector3_type r0; + vector2_type extents; + scalar_type solidAngle; + scalar_type k; + scalar_type b0; + scalar_type b1; }; } // namespace sampling diff --git a/include/nbl/builtin/hlsl/sampling/spherical_triangle.hlsl b/include/nbl/builtin/hlsl/sampling/spherical_triangle.hlsl index 6f29582e04..b915a55de7 100644 --- a/include/nbl/builtin/hlsl/sampling/spherical_triangle.hlsl +++ b/include/nbl/builtin/hlsl/sampling/spherical_triangle.hlsl @@ -11,6 +11,7 @@ #include #include #include +#include namespace nbl { @@ -25,9 +26,6 @@ enum SphericalTriangleAlgorithm : uint16_t STA_PBRT = 1 }; -template -struct SphericalTriangle; - namespace impl { @@ -53,9 +51,8 @@ T sumOfProducts(T a, T b, T c, T d) } // namespace impl -// Non-bijective: Resamplable & Tractable (generate + pdf/weight, no inverse) -template -struct SphericalTriangle +template +struct SphericalTriangle { using scalar_type = T; using vector2_type = vector; @@ -73,22 +70,23 @@ struct SphericalTriangle static SphericalTriangle create(NBL_CONST_REF_ARG(shapes::SphericalTriangle) tri) { SphericalTriangle retval; - retval.rcpSolidAngle = scalar_type(1.0) / tri.solid_angle; + retval.rcpSolidAngle = _static_cast(1.0) / tri.solid_angle; retval.tri_vertices[0] = tri.vertices[0]; retval.tri_vertices[1] = tri.vertices[1]; - retval.triCosC = tri.cos_sides[2]; + retval.triCosc = tri.cos_sides[2]; // precompute great circle normal of arc AC: cross(A,C) has magnitude sin(b), // so multiplying by csc(b) normalizes it; zero when side AC is degenerate - const scalar_type cscB = tri.csc_sides[1]; - const vector3_type arcACPlaneNormal = hlsl::cross(tri.vertices[0], tri.vertices[2]) * hlsl::select(cscB < numeric_limits::max, cscB, scalar_type(0)); + const scalar_type cscb = tri.csc_sides[1]; + const vector3_type arcACPlaneNormal = hlsl::cross(tri.vertices[0], tri.vertices[2]) * hlsl::select(cscb < numeric_limits::max, cscb, _static_cast(0)); retval.e_C = hlsl::cross(arcACPlaneNormal, tri.vertices[0]); retval.cosA = tri.cos_vertices[0]; retval.sinA = tri.sin_vertices[0]; if (Algorithm == STA_ARVO) { - retval.sinA_triCosC = retval.sinA * retval.triCosC; + retval.sinA_triCosc = retval.sinA * retval.triCosc; retval.eCdotB = hlsl::dot(retval.e_C, tri.vertices[1]); } + retval.APlusC = tri.vertices[0] + tri.vertices[2]; return retval; } @@ -106,36 +104,37 @@ struct SphericalTriangle if (Algorithm == STA_ARVO) // faster than PBRT { const scalar_type u_ = t - cosA; - const scalar_type v_ = s + sinA_triCosC; + const scalar_type v_ = s + sinA_triCosc; const scalar_type num = (v_ * t - u_ * s) * cosA - v_; const scalar_type denum = (v_ * s + u_ * t) * sinA; +#define ACCURATE 1 #if ACCURATE // sqrt(1 - cosBp^2) loses precision when cosBp ~ 1 (small u.x). // Use stable factorization: sinBp = sqrt((denum-num)(denum+num)) / |denum| - // where denum-num = sinA*(1+triCosC)*(1-cosA_hat). + // where denum-num = sinA*(1+triCosc)*(1-cosA_hat). // For large triangles with high u.x, cosA_hat can approach -1, // making (1 + cosA_hat) near zero and the division unstable. // Use the algebraic identity only when cosA_hat > 0 (safe denominator). - const scalar_type rcpDenum = scalar_type(1) / denum; - const scalar_type oneMinusCosAhat = hlsl::select(cosA_hat > scalar_type(0), sinA_hat * sinA_hat / (scalar_type(1) + cosA_hat), scalar_type(1) - cosA_hat); - const scalar_type DminusN = sinA * (scalar_type(1) + triCosC) * oneMinusCosAhat; - sinBp = sqrt(max(scalar_type(0), DminusN * (denum + num))) * nbl::hlsl::abs(rcpDenum); - cosBp = scalar_type(1) - DminusN * rcpDenum; + const scalar_type rcpDenum = _static_cast(1) / denum; + const scalar_type oneMinusCosAhat = hlsl::select(cosA_hat > _static_cast(0), sinA_hat * sinA_hat / (_static_cast(1) + cosA_hat), _static_cast(1) - cosA_hat); + const scalar_type DminusN = sinA * (_static_cast(1) + triCosc) * oneMinusCosAhat; + sinBp = sqrt(max(_static_cast(0), DminusN * (denum + num))) * nbl::hlsl::abs(rcpDenum); + cosBp = _static_cast(1) - DminusN * rcpDenum; #else // 17% faster, less accurate cosBp = num / denum; - sinBp = sqrt(max(scalar_type(0), scalar_type(1) - cosBp * cosBp)); + sinBp = sqrt(max(_static_cast(0), _static_cast(1) - cosBp * cosBp)); #endif } else // STA_PBRT, accurate, slowest { // PBRT uses cosPhi = -t, sinPhi = -s (pi offset from Arvo's A_hat) const scalar_type k1 = -t + cosA; - const scalar_type k2 = -s - sinA * triCosC; + const scalar_type k2 = -s - sinA * triCosc; cosBp = (k2 + impl::differenceOfProducts(k2, -t, k1, -s) * cosA) / (impl::sumOfProducts(k2, -s, k1, -t) * sinA); - cosBp = nbl::hlsl::clamp(cosBp, scalar_type(-1), scalar_type(1)); - sinBp = sqrt(max(scalar_type(0), scalar_type(1) - cosBp * cosBp)); + cosBp = nbl::hlsl::clamp(cosBp, _static_cast(-1), _static_cast(1)); + sinBp = sqrt(_static_cast(1) - cosBp * cosBp); } // Step 3: construct C' on the great circle through A toward C @@ -143,81 +142,24 @@ struct SphericalTriangle // Step 4: uniformly sample the great circle arc from B to C' scalar_type cosCpB; - if (Algorithm == STA_ARVO) - cosCpB = cosBp * triCosC + sinBp * eCdotB; + NBL_IF_CONSTEXPR(Algorithm == STA_ARVO) + cosCpB = cosBp * triCosc + sinBp * eCdotB; else cosCpB = nbl::hlsl::dot(cp, tri_vertices[1]); - const scalar_type z = scalar_type(1) - u.y * (scalar_type(1) - cosCpB); - const scalar_type sinZ = sqrt(max(scalar_type(0), scalar_type(1) - z * z)); + // TODO: degeneracy at u.y = 0. z = 1 - u.y*(1-cosCpB) makes sinZ = sqrt(1-z^2) behave like + // sqrt(u.y) near zero, so dL/du.y diverges as u.y^(-1/2) and every higher derivative diverges + // faster. The forward Jacobian test in 37_HLSLSamplingTests reports ~2-8% error at u.y < 0.003 + // even with the O(h^2) one-sided stencil because the third-derivative term dominates. At + // u.y = 0 exactly, L collapses to vertex B for all u.x (|det J| = 0), so it's an intrinsic + // property of the Arvo parameterization, not a bug. Fix: rework the arc interpolation to use + // a u.y -> angle mapping whose derivatives stay bounded near u.y = 0 (e.g. acos(z) = angle + // from B, then sample arc-length linearly), so the Jacobian is smooth and the skip band in + // the tester can be removed. + const scalar_type z = _static_cast(1) - u.y * (_static_cast(1) - cosCpB); + const scalar_type sinZ = sqrt(max(_static_cast(0), _static_cast(1) - z * z)); return z * tri_vertices[1] + sinZ * hlsl::normalize(cp - cosCpB * tri_vertices[1]); } - density_type forwardPdf(const domain_type u, const cache_type cache) NBL_CONST_MEMBER_FUNC - { - return rcpSolidAngle; - } - - weight_type forwardWeight(const domain_type u, const cache_type cache) NBL_CONST_MEMBER_FUNC - { - return forwardPdf(u, cache); - } - - density_type backwardPdf(const codomain_type L) NBL_CONST_MEMBER_FUNC - { - return rcpSolidAngle; - } - - weight_type backwardWeight(const codomain_type L) NBL_CONST_MEMBER_FUNC - { - return backwardPdf(L); - } - - scalar_type rcpSolidAngle; - scalar_type cosA; - scalar_type sinA; - scalar_type sinA_triCosC; // precomputed sinA * triCosC - scalar_type eCdotB; // precomputed dot(e_C, tri_vertices[1]), Arvo only - - vector3_type tri_vertices[2]; // A and B only - scalar_type triCosC; - vector3_type e_C; // precomputed cross(arcACPlaneNormal, A), unit vector perp to A in A-C plane -}; - -// Bijective: adds generateInverse, stores extra members for the inverse mapping -template -struct SphericalTriangle -{ - using scalar_type = T; - using vector2_type = vector; - using vector3_type = vector; - - using base_type = SphericalTriangle; - using domain_type = vector2_type; - using codomain_type = vector3_type; - using density_type = scalar_type; - using weight_type = density_type; - - using cache_type = typename base_type::cache_type; - - static SphericalTriangle create(NBL_CONST_REF_ARG(shapes::SphericalTriangle) tri) - { - SphericalTriangle retval; - retval.base = base_type::create(tri); - retval.rcpSolidAngle = retval.base.rcpSolidAngle; - retval.vertexC = tri.vertices[2]; - // precompute great circle normal of arc AC (needed for generateInverse) - const scalar_type cscB = tri.csc_sides[1]; - retval.arcACPlaneNormal = hlsl::cross(tri.vertices[0], tri.vertices[2]) * hlsl::select(cscB < numeric_limits::max, cscB, scalar_type(0)); - retval.triCscC = tri.csc_sides[2]; - return retval; - } - - codomain_type generate(const domain_type u, NBL_REF_ARG(cache_type) cache) NBL_CONST_MEMBER_FUNC - { - return base.generate(u, cache); - } - - // generate() works in two steps: // u.x -> pick C' on arc AC (choosing a sub-area fraction) // u.y -> pick L on arc B->C' (linear interpolation) @@ -229,37 +171,34 @@ struct SphericalTriangle domain_type generateInverse(const codomain_type L) NBL_CONST_MEMBER_FUNC { // Step 1: find C' = intersection of great circles (B,L) and (A,C) - const vector3_type BxL = nbl::hlsl::cross(base.tri_vertices[1], L); + const vector3_type BxL = nbl::hlsl::cross(tri_vertices[1], L); const scalar_type sinBL_sq = nbl::hlsl::dot(BxL, BxL); - if (sinBL_sq < numeric_limits::epsilon) + + // C' lies on arc AC, so C' = A*cos(t) + e_C*sin(t). + // C' also lies on the B-L plane, so dot(BxL, C') = 0. + // Solving: (cos(t), sin(t)) = (tripleE, -tripleA) / R + const scalar_type tripleA = nbl::hlsl::dot(BxL, tri_vertices[0]); + const scalar_type tripleE = nbl::hlsl::dot(BxL, e_C); + const scalar_type R_sq = tripleA * tripleA + tripleE * tripleE; + + if (sinBL_sq < numeric_limits::epsilon || R_sq < numeric_limits::epsilon) { - // L ~ B: u.y ~ 0, u.x is indeterminate (all u.x map to B when u.y=0). // Recover u.y from |L-B|^2 / |A-B|^2 (using C'=A; the (1-cosCpB) ratio // cancels so any C' gives the same result). - const vector3_type LminusB = L - base.tri_vertices[1]; - const vector3_type AminusB = base.tri_vertices[0] - base.tri_vertices[1]; + const vector3_type LminusB = L - tri_vertices[1]; + const vector3_type AminusB = tri_vertices[0] - tri_vertices[1]; const scalar_type v_num = nbl::hlsl::dot(LminusB, LminusB); const scalar_type v_denom = nbl::hlsl::dot(AminusB, AminusB); const scalar_type v = hlsl::select(v_denom > numeric_limits::epsilon, - nbl::hlsl::clamp(v_num / v_denom, scalar_type(0.0), scalar_type(1.0)), - scalar_type(0.0)); - return vector2_type(scalar_type(0.0), v); + nbl::hlsl::clamp(v_num / v_denom, _static_cast(0.0), _static_cast(1.0)), + _static_cast(0.0)); + return vector2_type(_static_cast(0.0), v); } - // C' lies on arc AC, so C' = A*cos(t) + e_C*sin(t). - // C' also lies on the B-L plane, so dot(BxL, C') = 0. - // Solving: (cos(t), sin(t)) = (tripleE, -tripleA) / R - const scalar_type tripleA = nbl::hlsl::dot(BxL, base.tri_vertices[0]); - const scalar_type tripleE = nbl::hlsl::dot(BxL, base.e_C); - const scalar_type R_sq = tripleA * tripleA + tripleE * tripleE; - if (R_sq < numeric_limits::epsilon) - return vector2_type(scalar_type(0.0), scalar_type(0.0)); - - const scalar_type rcpR = scalar_type(1.0) / nbl::hlsl::sqrt(R_sq); - vector3_type cp = base.tri_vertices[0] * (tripleE * rcpR) + base.e_C * (-tripleA * rcpR); - // two intersections exist; pick the one on the minor arc A->C - if (nbl::hlsl::dot(cp, base.tri_vertices[0] + vertexC) < scalar_type(0.0)) - cp = -cp; + const scalar_type rcpR = _static_cast(1.0) / nbl::hlsl::sqrt(R_sq); + vector3_type cp = tri_vertices[0] * (tripleE * rcpR) + e_C * (-tripleA * rcpR); + // two intersections exist; pick the one on the minor arc A->C (branchless sign flip) + cp = ieee754::flipSignIfRHSNegative(cp, nbl::hlsl::dot(cp, APlusC)); // Step 2: u.x = solidAngle(A,B,C') / solidAngle(A,B,C) // Van Oosterom-Strackee: tan(Omega/2) = |A.(BxC')| / (1 + A.B + B.C' + C'.A) @@ -270,25 +209,25 @@ struct SphericalTriangle // Expanding C' = cosBp*A + sinBp*e_C into the triple product: // A.(BxC') = cosBp * A.(BxA) + sinBp * A.(Bxe_C) = sinBp * A.(Bxe_C) // since A.(BxA) = 0 identically. This avoids the cancellation. - const scalar_type cosBp_inv = nbl::hlsl::dot(cp, base.tri_vertices[0]); - const scalar_type sinBp_inv = nbl::hlsl::dot(cp, base.e_C); - const scalar_type AxBdotE = nbl::hlsl::dot(base.tri_vertices[0], nbl::hlsl::cross(base.tri_vertices[1], base.e_C)); + const scalar_type cosBp_inv = nbl::hlsl::dot(cp, tri_vertices[0]); + const scalar_type sinBp_inv = nbl::hlsl::dot(cp, e_C); + const scalar_type AxBdotE = nbl::hlsl::dot(tri_vertices[0], nbl::hlsl::cross(tri_vertices[1], e_C)); const scalar_type num = sinBp_inv * AxBdotE; - const scalar_type cosCpB = nbl::hlsl::dot(base.tri_vertices[1], cp); - const scalar_type den = scalar_type(1.0) + base.triCosC + cosCpB + cosBp_inv; - const scalar_type subSolidAngle = scalar_type(2.0) * nbl::hlsl::atan2(nbl::hlsl::abs(num), den); - const scalar_type u = nbl::hlsl::clamp(subSolidAngle * rcpSolidAngle, scalar_type(0.0), scalar_type(1.0)); + const scalar_type cosCpB = nbl::hlsl::dot(tri_vertices[1], cp); + const scalar_type den = _static_cast(1.0) + triCosc + cosCpB + cosBp_inv; + const scalar_type subSolidAngle = _static_cast(2.0) * nbl::hlsl::atan2(nbl::hlsl::abs(num), den); + const scalar_type u = nbl::hlsl::clamp(subSolidAngle * rcpSolidAngle, _static_cast(0.0), _static_cast(1.0)); // Step 3: u.y = |L-B|^2 / |C'-B|^2 // Squared Euclidean distance avoids catastrophic cancellation vs (1-dot)/(1-dot) - const vector3_type LminusB = L - base.tri_vertices[1]; - const vector3_type cpMinusB = cp - base.tri_vertices[1]; + const vector3_type LminusB = L - tri_vertices[1]; + const vector3_type cpMinusB = cp - tri_vertices[1]; const scalar_type v_num = nbl::hlsl::dot(LminusB, LminusB); const scalar_type v_denom = nbl::hlsl::dot(cpMinusB, cpMinusB); const scalar_type v = hlsl::select(v_denom > numeric_limits::epsilon, nbl::hlsl::clamp(v_num / nbl::hlsl::max(v_denom, numeric_limits::min), - scalar_type(0.0), scalar_type(1.0)), - scalar_type(0.0)); + _static_cast(0.0), _static_cast(1.0)), + _static_cast(0.0)); return vector2_type(u, v); } @@ -313,13 +252,16 @@ struct SphericalTriangle return backwardPdf(L); } - // mirrored from base for uniform access across both specializations scalar_type rcpSolidAngle; + scalar_type cosA; + scalar_type sinA; + scalar_type sinA_triCosc; // precomputed sinA * triCosc + scalar_type eCdotB; // precomputed dot(e_C, tri_vertices[1]), Arvo only - base_type base; - vector3_type vertexC; - vector3_type arcACPlaneNormal; // precomputed normalize(cross(A, C)), great circle normal of arc AC - scalar_type triCscC; + vector3_type tri_vertices[2]; // A and B only + scalar_type triCosc; + vector3_type e_C; // precomputed cross(arcACPlaneNormal, A), unit vector perp to A in A-C plane + vector3_type APlusC; // precomputed A + C, used to pick the minor-arc intersection in generateInverse }; } // namespace sampling diff --git a/include/nbl/builtin/hlsl/sampling/uniform_spheres.hlsl b/include/nbl/builtin/hlsl/sampling/uniform_spheres.hlsl index 21901a9628..0d22323921 100644 --- a/include/nbl/builtin/hlsl/sampling/uniform_spheres.hlsl +++ b/include/nbl/builtin/hlsl/sampling/uniform_spheres.hlsl @@ -35,10 +35,11 @@ struct UniformHemisphere static codomain_type generate(const domain_type u) { - typename ConcentricMapping::cache_type cmCache; + typename ConcentricMapping::cache_type cmCache; const vector_t2 p = ConcentricMapping::generate(u, cmCache); - const T z = T(1.0) - cmCache.r2; - const T xyScale = hlsl::sqrt(hlsl::max(T(0.0), T(2.0) - cmCache.r2)); + assert(cmCache.r2 <= _static_cast(1.0)); + const T z = _static_cast(1.0) - cmCache.r2; + const T xyScale = hlsl::sqrt(_static_cast(2.0) - cmCache.r2); return vector_t3(p.x * xyScale, p.y * xyScale, z); } @@ -50,30 +51,30 @@ struct UniformHemisphere static domain_type generateInverse(const codomain_type v) { // r_disk / r_xy = sqrt(1-z) / sqrt(1-z^2) = 1/sqrt(1+z) - const T scale = T(1.0) / hlsl::sqrt(T(1.0) + v.z); + const scalar_type scale = hlsl::rsqrt(_static_cast(1.0) + v.z); return ConcentricMapping::generateInverse(vector_t2(v.x * scale, v.y * scale)); } static density_type forwardPdf(const domain_type u, const cache_type cache) { - return T(0.5) * numbers::inv_pi; + return _static_cast(0.5) * numbers::inv_pi; } static weight_type forwardWeight(const domain_type u, const cache_type cache) { - return T(0.5) * numbers::inv_pi; + return _static_cast(0.5) * numbers::inv_pi; } static density_type backwardPdf(const codomain_type v) { assert(v.z > 0); - return T(0.5) * numbers::inv_pi; + return _static_cast(0.5) * numbers::inv_pi; } static weight_type backwardWeight(const codomain_type v) { assert(v.z > 0); - return T(0.5) * numbers::inv_pi; + return _static_cast(0.5) * numbers::inv_pi; } }; @@ -96,7 +97,7 @@ struct UniformSphere static codomain_type generate(const domain_type u) { - const T tmp = u.x * T(2.0) - T(1.0); + const T tmp = u.x * _static_cast(2.0) - _static_cast(1.0); const codomain_type L = hemisphere_t::generate(vector_t2(hlsl::abs(tmp), u.y)); return vector_t3(L.x, L.y, L.z * hlsl::sign(tmp)); } @@ -108,29 +109,29 @@ struct UniformSphere static domain_type generateInverse(const codomain_type v) { - const T dir = hlsl::sign(v.z) * T(0.5); + const T dir = hlsl::sign(v.z) * _static_cast(0.5); const domain_type hemiU = hemisphere_t::generateInverse(vector_t3(v.x, v.y, hlsl::abs(v.z))); - return vector_t2(hemiU.x * dir + T(0.5), hemiU.y); + return vector_t2(hemiU.x * dir + _static_cast(0.5), hemiU.y); } static density_type forwardPdf(const domain_type u, const cache_type cache) { - return T(0.5) * hemisphere_t::forwardPdf(u, cache); + return _static_cast(0.5) * hemisphere_t::forwardPdf(u, cache); } static weight_type forwardWeight(const domain_type u, const cache_type cache) { - return T(0.5) * hemisphere_t::forwardWeight(u, cache); + return _static_cast(0.5) * hemisphere_t::forwardWeight(u, cache); } static density_type backwardPdf(const codomain_type v) { - return T(0.25) * numbers::inv_pi; + return _static_cast(0.25) * numbers::inv_pi; } static weight_type backwardWeight(const codomain_type v) { - return T(0.25) * numbers::inv_pi; + return _static_cast(0.25) * numbers::inv_pi; } }; diff --git a/include/nbl/builtin/hlsl/shapes/obb.hlsl b/include/nbl/builtin/hlsl/shapes/obb.hlsl index bdddc48ebf..3fa5eebc15 100644 --- a/include/nbl/builtin/hlsl/shapes/obb.hlsl +++ b/include/nbl/builtin/hlsl/shapes/obb.hlsl @@ -49,7 +49,7 @@ struct OBB NBL_CONSTEXPR_STATIC_INLINE OBB createAxisAligned(point_t mid, point_t len) { point_t axes[D]; - for (auto dim_i = 0; dim_i < D; dim_i++) + for (int dim_i = 0; dim_i < D; dim_i++) { axes[dim_i] = point_t(0); axes[dim_i][dim_i] = 1; @@ -60,6 +60,106 @@ struct OBB matrix transform; }; +// Decomposed OBB view: caches columns and minCorner for fast vertex queries. +// Same 12-float footprint as float32_t3x4, but laid out for branchless +// corner computation. +// Decomposed OBB view: caches columns and minCorner for fast vertex queries. +// columns[i] are the full OBB edge vectors; minCorner is the world position +// of corner 0b000 = center - 0.5*(col0+col1+col2). +template +struct OBBView +{ + using scalar_t = Scalar; + using vec3_t = vector; + using mat3_t = matrix; + + mat3_t columns; + vec3_t minCorner; + + static OBBView create(matrix modelMatrix) + { + matrix m = transpose(modelMatrix); + OBBView v; + v.columns = mat3_t(m[0].xyz, m[1].xyz, m[2].xyz); + v.minCorner = m[3].xyz - scalar_t(0.5) * (m[0].xyz + m[1].xyz + m[2].xyz); + return v; + } + + vec3_t getCenter() NBL_CONST_MEMBER_FUNC + { + return minCorner + scalar_t(0.5) * (columns[0] + columns[1] + columns[2]); + } + + vec3_t getVertex(uint32_t i) NBL_CONST_MEMBER_FUNC + { + vec3_t p = minCorner; + if (i & 1u) p += columns[0]; + if (i & 2u) p += columns[1]; + if (i & 4u) p += columns[2]; + return p; + } + + // Scalar-z specialization: only computes the z component of a corner. + // Used for clip-mask build where we only need the sign of z. + scalar_t getVertexZ(uint32_t i) NBL_CONST_MEMBER_FUNC + { + scalar_t pz = minCorner.z; + if (i & 1u) pz += columns[0].z; + if (i & 2u) pz += columns[1].z; + if (i & 4u) pz += columns[2].z; + return pz; + } + + // Ray-OBB intersection via slab test in OBB-local space. + // Returns (tMin, tMax, hit). tMin is clamped to 0 if ray starts inside. + // TODO: not optimized -- precompute inverse columns, handle f~=0 edge case + struct Intersection + { + scalar_t tMin; + scalar_t tMax; + bool hit; + }; + + Intersection rayIntersection(vec3_t rayOrigin, vec3_t rayDir) NBL_CONST_MEMBER_FUNC + { + // Vector from ray origin to minCorner + vec3_t delta = rayOrigin - minCorner; + + scalar_t tMin = scalar_t(-1e30); + scalar_t tMax = scalar_t(1e30); + + // Slab test against each OBB axis + for (uint32_t i = 0; i < 3; i++) + { + vec3_t axis = columns[i]; + scalar_t axisLenSq = dot(axis, axis); + scalar_t e = dot(axis, delta); + scalar_t f = dot(axis, rayDir); + + // Project ray onto axis: entry at t where dot = 0, exit where dot = axisLenSq + scalar_t invF = scalar_t(1.0) / f; + scalar_t t1 = (-e) * invF; + scalar_t t2 = (axisLenSq - e) * invF; + + if (t1 > t2) + { + scalar_t tmp = t1; + t1 = t2; + t2 = tmp; + } + + tMin = max(tMin, t1); + tMax = min(tMax, t2); + } + + Intersection result; + result.hit = (tMax >= tMin) && (tMax > scalar_t(0)); + result.tMin = max(tMin, scalar_t(0)); + result.tMax = tMax; + return result; + } +}; + } } } diff --git a/include/nbl/builtin/hlsl/shapes/spherical_rectangle.hlsl b/include/nbl/builtin/hlsl/shapes/spherical_rectangle.hlsl index 5895e4bc80..7ffe2ef407 100644 --- a/include/nbl/builtin/hlsl/shapes/spherical_rectangle.hlsl +++ b/include/nbl/builtin/hlsl/shapes/spherical_rectangle.hlsl @@ -94,6 +94,8 @@ struct SphericalRectangle vector4_type cosGamma; }; + // TODO: take an observer already, this way we can precompute and store the `r0`, `denorm_n_z` and `rcpLen_denorm_n_z` + // we need all of the above for solid angle and projected solid angle computation static SphericalRectangle create(NBL_CONST_REF_ARG(CompressedSphericalRectangle) compressed) { SphericalRectangle retval; @@ -102,7 +104,8 @@ struct SphericalRectangle retval.basis[0] = compressed.right / retval.extents[0]; retval.basis[1] = compressed.up / retval.extents[1]; assert(hlsl::abs(hlsl::dot(retval.basis[0], retval.basis[1])) < scalar_type(1e-5)); - retval.basis[2] = hlsl::normalize(hlsl::cross(retval.basis[0], retval.basis[1])); + // don't normalize, the `right` and `up` vectors are orthogonal and the two bases are normalized already! + retval.basis[2] = hlsl::cross(retval.basis[0], retval.basis[1]); return retval; } @@ -112,7 +115,8 @@ struct SphericalRectangle result.r0 = hlsl::mul(basis, origin - observer); const vector4_type denorm_n_z = vector4_type(-result.r0.y, result.r0.x + extents.x, result.r0.y + extents.y, -result.r0.x); - result.n_z = denorm_n_z * hlsl::rsqrt(hlsl::promote(result.r0.z * result.r0.z) + denorm_n_z * denorm_n_z); + const vector4_type rcpLen_denorm_n_z = hlsl::rsqrt(hlsl::promote(result.r0.z * result.r0.z) + denorm_n_z * denorm_n_z); + result.n_z = denorm_n_z * rcpLen_denorm_n_z; result.cosGamma = vector4_type( -result.n_z[0] * result.n_z[1], -result.n_z[1] * result.n_z[2], @@ -128,17 +132,17 @@ struct SphericalRectangle } // Kelvin-Stokes theorem: signed projected solid angle = integral_{rect} (n . omega) d_omega + // TODO: don't take the observer, observer should be taken at creation scalar_type projectedSolidAngle(const vector3_type observer, const vector3_type receiverNormal) NBL_CONST_MEMBER_FUNC { return projectedSolidAngleFromLocal(hlsl::mul(basis, origin - observer), hlsl::mul(basis, receiverNormal)); } - // Overload for when r0 and localNormal are already computed (avoids redundant mul(basis, ...)). - // Exploits rectangle structure: all 4 corners share the same z, so cross products - // have only 2 nonzero components each, and externalProducts can be computed without - // normalizing the corner directions. + // TODO: only take a `localN` scalar_type projectedSolidAngleFromLocal(const vector3_type r0, const vector3_type n) NBL_CONST_MEMBER_FUNC { + // FUN FACT: `n_z` already holds Z coordinate the NORMALIZED `awayFromEdgePlane`, the non-zero coordinate absolute value is equal to `r0.z * rcpLen_denorm_n_z` +// TODO: skip all this code until just call `acos` on the `unnormDots` const scalar_type x0 = r0.x, y0 = r0.y, z = r0.z; const scalar_type x1 = x0 + extents.x; const scalar_type y1 = y0 + extents.y; @@ -154,8 +158,9 @@ struct SphericalRectangle zSq + x1 * x1, zSq + y1 * y1, zSq + x0 * x0 - ); + ); // TODO: this is already computed as `rcpLen_denorm_n_z` +// TODO: this can be computed from `denorm_n_z`, `z` and `rcpLen_denorm_n_z` instead // dot(cross(ri,rj), n) / |cross(ri,rj)| the ex/ey scale factors cancel const vector4_type crossDotN = vector4_type( z * n.y - y0 * n.z, @@ -164,8 +169,12 @@ struct SphericalRectangle z * n.x - x0 * n.z ); // The ABS makes the computation correct for abs(cos(theta)) (BSDF projected solid angle). - const vector4_type externalProducts = hlsl::abs(crossDotN) * hlsl::rsqrt(crossLenSq); + const vector4_type externalProducts = crossDotN * hlsl::rsqrt(crossLenSq); +// TODO: isn't `rcpLen_denorm_n_z` related to the sin^-1() of arclengths ? Wouldn't `ACOS_CSC` apply instead of `acos*rsqrt(1-cos^2)` +// wouldn't then `hlsl::promote(result.r0.z * result.r0.z) + denorm_n_z * denorm_n_z` be the sin^2 ? +// it would probably have to be a different, here's the `acos(sqrt(1-x*x))/x` curve fit again revealed to me in a dream +// https://www.desmos.com/calculator/sbdrulot5a = exp2(-1.6*sqrt(1-x*x))*A+B // cos(arc length) between adjacent corners: dot(ri,rj) / (|ri|*|rj|) const vector4_type lenSq = vector4_type( x0 * x0 + y0 * y0, @@ -184,9 +193,11 @@ struct SphericalRectangle // rcpLen[i]*rcpLen[j] for adjacent pairs: (0,1), (1,2), (2,3), (3,0) const vector4_type cos_sides = unnormDots * rcpLen * rcpLen.yzwx; + // TODO: there's the same opportunity for optimization of this as the Spherical Triangle + // https://www.linkedin.com/posts/matt-kielan-9b054a165_untitled-graph-activity-7442910005671923712-jHz6?utm_source=share&utm_medium=member_desktop&rcm=ACoAACdp2RQBqq2bJfC2zxpsme-vRv2zh9oP-8E const vector4_type pyramidAngles = hlsl::acos(cos_sides); - return hlsl::dot(pyramidAngles, externalProducts) * scalar_type(0.5); + return hlsl::abs(hlsl::dot(pyramidAngles, externalProducts)) * scalar_type(0.5); } vector3_type origin; diff --git a/include/nbl/builtin/hlsl/shapes/spherical_triangle.hlsl b/include/nbl/builtin/hlsl/shapes/spherical_triangle.hlsl index 1a5681b39e..b8a2c7229e 100644 --- a/include/nbl/builtin/hlsl/shapes/spherical_triangle.hlsl +++ b/include/nbl/builtin/hlsl/shapes/spherical_triangle.hlsl @@ -20,6 +20,26 @@ namespace hlsl namespace shapes { +// TODO: move to where fast_acos lives +template +T acos_csc_approx(const T arg) +{ + const T u = hlsl::log2(_static_cast(1)+arg); + // The curve fit "revealed in a dream" to me is `exp2(F(log2(x+1)))` where `F(u)` is a polynomial + // I have a feeling that a polynomial of ((Au+B)u+C)u+D could be sufficient if it has following properties: + // `F(0) = 0` and + // `F(u) <= log2(\frac{\cos^{-1}\left(2^{x}-1\right)}{\sqrt{1-\left(2^{x}-1\right)^{2}}})` because you want to consistently under-estimate the Projected Solid Angle to avoid creating energy + // See https://www.desmos.com/calculator/sdptomhbju + // Furthermore we could clip the polynomial calc to `Cu+D or `(Bu+C)u+D` for small arguments + T poly; + // TODO: actually optimize these constants in real world scenarios (renders) + if (order==1) + poly = (_static_cast(1)-u)*_static_cast(0.6); + else if (order==2) + poly = (_static_cast(1)-u)*_static_cast(0.637)+(_static_cast(1) - u * u) * _static_cast(0.0115); + return hlsl::exp2(poly); +} + template struct SphericalTriangle { @@ -54,6 +74,7 @@ struct SphericalTriangle // degenerate triangle: any side has near-zero sin, so csc blows up if (hlsl::any >(retval.csc_sides >= hlsl::promote(numeric_limits::max))) { + // TODO: can't do this, still need to be able to sample thin triangle like a line light, so need to know all the angles which are still valid retval.cos_vertices = hlsl::promote(0.0); retval.sin_vertices = hlsl::promote(0.0); retval.solid_angle = 0; @@ -86,35 +107,29 @@ struct SphericalTriangle if (solid_angle <= numeric_limits::epsilon) return 0; - matrix awayFromEdgePlane; - awayFromEdgePlane[0] = hlsl::cross(vertices[1], vertices[2]) * csc_sides[0]; - awayFromEdgePlane[1] = hlsl::cross(vertices[2], vertices[0]) * csc_sides[1]; - awayFromEdgePlane[2] = hlsl::cross(vertices[0], vertices[1]) * csc_sides[2]; + // `cross(A,B)*acos(dot(A,B))/sin(1-dot^2)` can be done with `cross(A,B)*acos_csc_approx(dot(A,B))` +#define ACOS_CSC(I) acos_csc_approx(cos_sides[I]) +//#define ACOS_CSC(I) hlsl::acos(cos_sides[I])*csc_sides[I] + scalar_type externalProductsWeightedByPyramidAngles = hlsl::dot(hlsl::cross(vertices[1], vertices[2]),receiverNormal) * ACOS_CSC(0); + externalProductsWeightedByPyramidAngles += hlsl::dot(hlsl::cross(vertices[2], vertices[0]),receiverNormal) * ACOS_CSC(1); + externalProductsWeightedByPyramidAngles += hlsl::dot(hlsl::cross(vertices[0], vertices[1]),receiverNormal) * ACOS_CSC(2); +#undef ACOS_CSC + // The ABS makes it so that the computation is correct for an `abs(cos(theta))` factor which is the projected solid angle used for a BSDF. + // It also makes the computation insensitive to the CW or CCW winding of the vertices in the triangle. // Proof: Kelvin-Stokes theorem, if you split the set into two along the horizon with constant CCW winding, the `cross` along the shared edge // goes in different directions and cancels out, while `acos` of the clipped great arcs corresponding to polygon edges add up to the original sides again. - const vector3_type externalProducts = hlsl::abs(hlsl::mul(/* transposed already */awayFromEdgePlane, receiverNormal)); - - // Far TODO: `cross(A,B)*acos(dot(A,B))/sin(1-dot^2)` can be done with `cross*acos_csc_approx(dot(A,B))` - // We could skip the `csc_sides` factor, and computing `pyramidAngles` and replace them with this approximation weighting before the dot product with the receiver notmal - // The curve fit "revealed in a dream" to me is `exp2(F(log2(x+1)))` where `F(u)` is a polynomial, so far I've calculated `F = (1-u)0.635+(1-u^2)0.0118` which gives <5% error until 165 degrees - // I have a feeling that a polynomial of ((Au+B)u+C)u+D could be sufficient if it has following properties: - // `F(0) = 0` and - // `F(u) <= log2(\frac{\cos^{-1}\left(2^{x}-1\right)}{\sqrt{1-\left(2^{x}-1\right)^{2}}})` because you want to consistently under-estimate the Projected Solid Angle to avoid creating energy - // See https://www.desmos.com/calculator/sdptomhbju - // Furthermore we could clip the polynomial calc to `Cu+D or `(Bu+C)u+D` for small arguments - const vector3_type pyramidAngles = hlsl::acos(cos_sides); - // So that triangle covering almost whole hemisphere sums to PI - return hlsl::dot(pyramidAngles, externalProducts) * scalar_type(0.5); + // The 0.5 is so that triangle covering almost whole hemisphere sums to PI + return externalProductsWeightedByPyramidAngles * scalar_type(0.5); } vector3_type vertices[3]; // angles of vertices with origin, so the sides are INSIDE the sphere vector3_type cos_sides; - vector3_type csc_sides; + vector3_type csc_sides; // TODO: spherical triangle sampling only needs `csc_sides[1]` and possibly `csc_sides[2]` // angles between arcs on the sphere, so angles in the TANGENT plane at each vertex vector3_type cos_vertices; - vector3_type sin_vertices; + vector3_type sin_vertices; // TODO: spherical triangle sampling only needs `sin_vertices[0]` a.k.a `sinA` scalar_type solid_angle; }; diff --git a/include/nbl/core/sampling/RandomSampler.h b/include/nbl/core/sampling/RandomSampler.h index 39832dc8f1..b692ef5e08 100644 --- a/include/nbl/core/sampling/RandomSampler.h +++ b/include/nbl/core/sampling/RandomSampler.h @@ -11,8 +11,8 @@ namespace nbl::core { -class RandomSampler -{ + class RandomSampler + { public: RandomSampler(uint32_t _seed) { @@ -25,9 +25,24 @@ class RandomSampler return mersenneTwister(); } + // Returns a float in [0, 1) + inline float nextFloat() + { + // 1 / 2^32 + constexpr float norm = 1.0f / 4294967296.0f; + return mersenneTwister() * norm; + } + + // Returns a float in [min, max) + inline float nextFloat(float min, float max) + { + constexpr float norm = 1.0f / 4294967296.0f; + return min + (mersenneTwister() * norm) * (max - min); + } + protected: std::mt19937 mersenneTwister; -}; + }; } diff --git a/include/nbl/ext/FullScreenTriangle/FullScreenTriangle.h b/include/nbl/ext/FullScreenTriangle/FullScreenTriangle.h index 39013417dc..53cef98d71 100644 --- a/include/nbl/ext/FullScreenTriangle/FullScreenTriangle.h +++ b/include/nbl/ext/FullScreenTriangle/FullScreenTriangle.h @@ -24,11 +24,18 @@ struct ProtoPipeline final const video::IGPURenderpass* renderpass, const uint32_t subpassIx=0, asset::SBlendParams blendParams = {}, + asset::SRasterizationParams rasterizationParams = DefaultRasterParams, const hlsl::SurfaceTransform::FLAG_BITS swapchainTransform=hlsl::SurfaceTransform::FLAG_BITS::IDENTITY_BIT, video::IGPUPipelineCache* pipelineCache = nullptr ); core::smart_refctd_ptr m_vxShader; + + constexpr static inline asset::SRasterizationParams DefaultRasterParams = { + .faceCullingMode = asset::EFCM_NONE, + .depthWriteEnable = false, + .depthCompareOp = asset::ECO_ALWAYS + }; }; bool recordDrawCall(video::IGPUCommandBuffer* commandBuffer); diff --git a/src/nbl/ext/FullScreenTriangle/CFullScreenTriangle.cpp b/src/nbl/ext/FullScreenTriangle/CFullScreenTriangle.cpp index 58b1f2ea84..55dea2eb00 100644 --- a/src/nbl/ext/FullScreenTriangle/CFullScreenTriangle.cpp +++ b/src/nbl/ext/FullScreenTriangle/CFullScreenTriangle.cpp @@ -84,6 +84,7 @@ smart_refctd_ptr ProtoPipeline::createPipeline( const IGPURenderpass* renderpass, const uint32_t subpassIx, SBlendParams blendParams, + asset::SRasterizationParams rasterizationParams, const hlsl::SurfaceTransform::FLAG_BITS swapchainTransform, IGPUPipelineCache* pipelineCache) { @@ -94,11 +95,6 @@ smart_refctd_ptr ProtoPipeline::createPipeline( smart_refctd_ptr m_retval; { - constexpr SRasterizationParams defaultRasterParams = { - .faceCullingMode = EFCM_NONE, - .depthWriteEnable = false, - .depthCompareOp = ECO_ALWAYS - }; const auto orientationAsUint32 = static_cast(swapchainTransform); IGPUPipelineBase::SShaderEntryMap specConstants; @@ -111,7 +107,7 @@ smart_refctd_ptr ProtoPipeline::createPipeline( params[0].cached = { .vertexInput = {}, // The Full Screen Triangle doesn't use any HW vertex input state .primitiveAssembly = {}, - .rasterization = defaultRasterParams, + .rasterization = rasterizationParams, .blend = blendParams, .subpassIx = subpassIx };