Skip to content

Commit 5fbe019

Browse files
committed
VIDEO/D3D11: GPU tonemap path for HDR screenshots
Move the HDR -> SDR conversion in d3d11_gfx_read_viewport from the CPU decoder in dxgi_hdr_readback_to_bgr24() to a single-pass GPU render using a new PSMainToSDR entry point in hdr_sm5.hlsl.h.

The GPU path:
- Lazily compiles PSMainToSDR into d3d11->hdr.ps_readback on the first HDR screenshot, so the D3DCompile cost is paid only once.
- Copies the swapchain backbuffer to a shader-readable intermediate (the swapchain itself is created with RENDER_TARGET_OUTPUT only and cannot be bound as an SRV directly).
- Draws a fullscreen quad through the new pixel shader, sampling the intermediate and writing sRGB-encoded bytes into a B8G8R8A8_UNORM render target.
- Copies the RT to a staging texture, maps it, and unswizzles into the caller's BGR24 bottom-up buffer.

Mirrors the same math as the CPU decoder and the Vulkan hdr_to_sdr pipeline, so output is visually consistent across all three paths.

On any failure (D3DCompile failure, OOM, unexpected driver state, Map failure) the caller logs a RARCH_WARN and falls through to the existing CPU decoder — HDR screenshots remain available even if the GPU path breaks on a particular driver/hardware combination.

The D3D12 port will follow in a separate commit; the descriptor-heap and resource-state work there is substantially more invasive and deserves its own focused test pass.
1 parent 7d3b452 commit 5fbe019

2 files changed

Lines changed: 313 additions & 3 deletions

File tree

gfx/drivers/d3d11.c

Lines changed: 240 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -352,6 +352,7 @@ typedef struct
352352
{
353353
dxgi_hdr_uniform_t ubo_values;
354354
D3D11Buffer ubo;
355+
D3D11PixelShader ps_readback; /* lazy: HDR -> SDR tonemap PS used by read_viewport */
355356
float menu_nits;
356357
float max_output_nits;
357358
float min_output_nits;
@@ -2783,6 +2784,7 @@ static void d3d11_gfx_free(void* data)
27832784

27842785
#ifdef HAVE_DXGI_HDR
27852786
Release(d3d11->hdr.ubo);
2787+
Release(d3d11->hdr.ps_readback);
27862788
#endif
27872789

27882790
d3d11_release_shader(&d3d11->sprites.shader);
@@ -5016,6 +5018,231 @@ static void d3d11_gfx_viewport_info(void* data, struct video_viewport* vp)
50165018
*vp = d3d11->vp;
50175019
}
50185020

5021+
#ifdef HAVE_DXGI_HDR
5022+
/* GPU path for HDR screenshot readback.
5023+
*
5024+
* Runs a single full-screen pass that samples the captured HDR backbuffer
5025+
* and writes sRGB-encoded SDR into a B8G8R8A8_UNORM render target, then
5026+
* copies that to a CPU-mappable staging texture and unswizzles into the
5027+
* caller's BGR24 output. Mirrors the Vulkan driver's hdr_to_sdr path
5028+
* and the CPU implementation in dxgi_hdr_readback_to_bgr24() — either
5029+
* one should produce visually identical screenshots.
5030+
*
5031+
* Returns false on any failure; the caller then falls back to the CPU
5032+
* decoder so HDR screenshots still work even if the GPU path breaks
5033+
* (driver PSO compile bug, OOM, unexpected state, etc.). */
5034+
static bool d3d11_gpu_hdr_readback_to_bgr24(
5035+
d3d11_video_t* d3d11,
5036+
ID3D11Resource* src_backbuffer_res,
5037+
DXGI_FORMAT src_format,
5038+
unsigned full_width,
5039+
unsigned full_height,
5040+
unsigned vp_x, unsigned vp_y,
5041+
unsigned vp_w, unsigned vp_h,
5042+
uint8_t* buffer)
5043+
{
5044+
ID3D11Device* device = d3d11->device;
5045+
ID3D11DeviceContext* context = d3d11->context;
5046+
d3d11_shader_t* hdr_shader = &d3d11->shaders[VIDEO_SHADER_STOCK_HDR];
5047+
d3d11_texture_t src_tex = { 0 };
5048+
d3d11_texture_t sdr_tex = { 0 };
5049+
ID3D11Texture2D* staging_tex = NULL;
5050+
ID3D11Resource* staging_res = NULL;
5051+
ID3D11Resource* sdr_res = NULL;
5052+
D3D11_TEXTURE2D_DESC staging_desc;
5053+
D3D11_MAPPED_SUBRESOURCE map;
5054+
D3D11_VIEWPORT vp;
5055+
D3D11_RECT sc;
5056+
unsigned hdr_mode;
5057+
unsigned y, x;
5058+
UINT stride = sizeof(d3d11_vertex_t);
5059+
UINT offset = 0;
5060+
bool mapped = false;
5061+
bool ret = false;
5062+
5063+
if (src_format == DXGI_FORMAT_R10G10B10A2_UNORM)
5064+
hdr_mode = 1;
5065+
else if (src_format == DXGI_FORMAT_R16G16B16A16_FLOAT)
5066+
hdr_mode = 2;
5067+
else
5068+
return false;
5069+
5070+
/* Lazy compile of the readback pixel shader. Only pays the cost
5071+
* the first time a screenshot is taken with HDR enabled. */
5072+
if (!d3d11->hdr.ps_readback)
5073+
{
5074+
static const char shader_src[] =
5075+
#include "d3d_shaders/hdr_sm5.hlsl.h"
5076+
;
5077+
D3DBlob ps_code = NULL;
5078+
if (!d3d_compile(shader_src, sizeof(shader_src), NULL,
5079+
"PSMainToSDR", "ps_5_0", &ps_code) || !ps_code)
5080+
{
5081+
RARCH_ERR("[D3D11] Failed to compile PSMainToSDR for HDR readback.\n");
5082+
return false;
5083+
}
5084+
if (FAILED(device->lpVtbl->CreatePixelShader(device,
5085+
ps_code->lpVtbl->GetBufferPointer(ps_code),
5086+
ps_code->lpVtbl->GetBufferSize(ps_code),
5087+
NULL, &d3d11->hdr.ps_readback)))
5088+
{
5089+
ps_code->lpVtbl->Release(ps_code);
5090+
RARCH_ERR("[D3D11] Failed to create readback PS.\n");
5091+
return false;
5092+
}
5093+
ps_code->lpVtbl->Release(ps_code);
5094+
}
5095+
5096+
/* HDR-format intermediate source: a shader-readable copy of the
5097+
* swapchain backbuffer. (The swapchain itself is created with
5098+
* RENDER_TARGET_OUTPUT only and cannot be bound as an SRV.) */
5099+
src_tex.desc.Width = full_width;
5100+
src_tex.desc.Height = full_height;
5101+
src_tex.desc.Format = src_format;
5102+
src_tex.desc.BindFlags = D3D11_BIND_SHADER_RESOURCE;
5103+
if (!d3d11_init_texture(device, &src_tex))
5104+
goto cleanup;
5105+
5106+
context->lpVtbl->CopyResource(context,
5107+
(ID3D11Resource*)src_tex.handle, src_backbuffer_res);
5108+
5109+
/* SDR render target: receives the tonemap output. */
5110+
sdr_tex.desc.Width = full_width;
5111+
sdr_tex.desc.Height = full_height;
5112+
sdr_tex.desc.Format = DXGI_FORMAT_B8G8R8A8_UNORM;
5113+
sdr_tex.desc.BindFlags = D3D11_BIND_RENDER_TARGET;
5114+
if (!d3d11_init_texture(device, &sdr_tex))
5115+
goto cleanup;
5116+
5117+
/* Populate the UBO with readback-specific values and push. */
5118+
{
5119+
const float prev_it = d3d11->hdr.ubo_values.inverse_tonemap;
5120+
const float prev_h = d3d11->hdr.ubo_values.hdr10;
5121+
const unsigned prev_m = d3d11->hdr.ubo_values.hdr_mode;
5122+
const float prev_sc = d3d11->hdr.ubo_values.scanlines;
5123+
D3D11_MAPPED_SUBRESOURCE mapped_ubo;
5124+
5125+
d3d11->hdr.ubo_values.inverse_tonemap = 0.0f;
5126+
d3d11->hdr.ubo_values.hdr10 = 0.0f;
5127+
d3d11->hdr.ubo_values.hdr_mode = hdr_mode;
5128+
d3d11->hdr.ubo_values.scanlines = 0.0f;
5129+
5130+
if (SUCCEEDED(context->lpVtbl->Map(context,
5131+
(ID3D11Resource*)d3d11->hdr.ubo, 0,
5132+
D3D11_MAP_WRITE_DISCARD, 0, &mapped_ubo)))
5133+
{
5134+
*(dxgi_hdr_uniform_t*)mapped_ubo.pData = d3d11->hdr.ubo_values;
5135+
context->lpVtbl->Unmap(context,
5136+
(ID3D11Resource*)d3d11->hdr.ubo, 0);
5137+
}
5138+
5139+
d3d11->hdr.ubo_values.inverse_tonemap = prev_it;
5140+
d3d11->hdr.ubo_values.hdr10 = prev_h;
5141+
d3d11->hdr.ubo_values.hdr_mode = prev_m;
5142+
d3d11->hdr.ubo_values.scanlines = prev_sc;
5143+
}
5144+
5145+
/* Bind state: VS / IL / GS from the HDR stock shader, PS from our
5146+
* readback shader. */
5147+
context->lpVtbl->IASetInputLayout(context, hdr_shader->layout);
5148+
context->lpVtbl->VSSetShader(context, hdr_shader->vs, NULL, 0);
5149+
context->lpVtbl->PSSetShader(context, d3d11->hdr.ps_readback, NULL, 0);
5150+
context->lpVtbl->GSSetShader(context, hdr_shader->gs, NULL, 0);
5151+
context->lpVtbl->IASetPrimitiveTopology(context,
5152+
D3D11_PRIMITIVE_TOPOLOGY_TRIANGLESTRIP);
5153+
5154+
context->lpVtbl->VSSetConstantBuffers(context, 0, 1, &d3d11->hdr.ubo);
5155+
context->lpVtbl->PSSetConstantBuffers(context, 0, 1, &d3d11->hdr.ubo);
5156+
context->lpVtbl->PSSetShaderResources(context, 0, 1, &src_tex.view);
5157+
context->lpVtbl->PSSetSamplers(context, 0, 1,
5158+
&d3d11->samplers[RARCH_FILTER_UNSPEC][RARCH_WRAP_DEFAULT]);
5159+
5160+
context->lpVtbl->IASetVertexBuffers(context, 0, 1,
5161+
&d3d11->frame.vbo, &stride, &offset);
5162+
5163+
context->lpVtbl->OMSetRenderTargets(context, 1, &sdr_tex.rt_view, NULL);
5164+
context->lpVtbl->OMSetBlendState(context, d3d11->blend_disable, NULL, 0xFFFFFFFF);
5165+
5166+
vp.TopLeftX = 0.0f;
5167+
vp.TopLeftY = 0.0f;
5168+
vp.Width = (float)full_width;
5169+
vp.Height = (float)full_height;
5170+
vp.MinDepth = 0.0f;
5171+
vp.MaxDepth = 1.0f;
5172+
sc.left = 0;
5173+
sc.top = 0;
5174+
sc.right = (LONG)full_width;
5175+
sc.bottom = (LONG)full_height;
5176+
context->lpVtbl->RSSetViewports(context, 1, &vp);
5177+
context->lpVtbl->RSSetScissorRects(context, 1, &sc);
5178+
context->lpVtbl->RSSetState(context, d3d11->scissor_disabled);
5179+
5180+
context->lpVtbl->Draw(context, 4, 0);
5181+
5182+
/* Unbind SRV before we may read from the same texture as a source
5183+
* for anything else (and to stop D3D11 complaining about RTV/SRV
5184+
* aliasing if anything upstream uses the same slot). */
5185+
{
5186+
ID3D11ShaderResourceView* null_srv = NULL;
5187+
context->lpVtbl->PSSetShaderResources(context, 0, 1, &null_srv);
5188+
}
5189+
5190+
/* Staging copy of the SDR RT. */
5191+
staging_desc = sdr_tex.desc;
5192+
staging_desc.MipLevels = 1;
5193+
staging_desc.BindFlags = 0;
5194+
staging_desc.MiscFlags = 0;
5195+
staging_desc.Usage = D3D11_USAGE_STAGING;
5196+
staging_desc.CPUAccessFlags = D3D11_CPU_ACCESS_READ;
5197+
if (FAILED(device->lpVtbl->CreateTexture2D(device, &staging_desc,
5198+
NULL, &staging_tex)))
5199+
goto cleanup;
5200+
5201+
#ifdef __cplusplus
5202+
staging_tex->lpVtbl->QueryInterface(staging_tex, IID_ID3D11Resource, (void**)&staging_res);
5203+
sdr_tex.handle->lpVtbl->QueryInterface(sdr_tex.handle, IID_ID3D11Resource, (void**)&sdr_res);
5204+
#else
5205+
staging_tex->lpVtbl->QueryInterface(staging_tex, &IID_ID3D11Resource, (void**)&staging_res);
5206+
sdr_tex.handle->lpVtbl->QueryInterface(sdr_tex.handle, &IID_ID3D11Resource, (void**)&sdr_res);
5207+
#endif
5208+
context->lpVtbl->CopyResource(context, staging_res, sdr_res);
5209+
5210+
if (FAILED(context->lpVtbl->Map(context, staging_res, 0,
5211+
D3D11_MAP_READ, 0, &map)))
5212+
goto cleanup;
5213+
mapped = true;
5214+
5215+
/* BGRA8 -> BGR24, bottom-up, clamped to viewport. */
5216+
{
5217+
const uint8_t* src_row = (const uint8_t*)map.pData + (size_t)map.RowPitch * vp_y;
5218+
for (y = 0; y < vp_h; y++, src_row += map.RowPitch)
5219+
{
5220+
uint8_t* dst = buffer + 3 * (size_t)(vp_h - y - 1) * vp_w;
5221+
for (x = 0; x < vp_w; x++)
5222+
{
5223+
dst[3 * x + 0] = src_row[4 * (x + vp_x) + 0];
5224+
dst[3 * x + 1] = src_row[4 * (x + vp_x) + 1];
5225+
dst[3 * x + 2] = src_row[4 * (x + vp_x) + 2];
5226+
}
5227+
}
5228+
}
5229+
ret = true;
5230+
5231+
cleanup:
5232+
if (mapped)
5233+
context->lpVtbl->Unmap(context, staging_res, 0);
5234+
if (staging_res)
5235+
staging_res->lpVtbl->Release(staging_res);
5236+
if (sdr_res)
5237+
sdr_res->lpVtbl->Release(sdr_res);
5238+
if (staging_tex)
5239+
staging_tex->lpVtbl->Release(staging_tex);
5240+
d3d11_release_texture(&sdr_tex);
5241+
d3d11_release_texture(&src_tex);
5242+
return ret;
5243+
}
5244+
#endif /* HAVE_DXGI_HDR */
5245+
50195246
static bool d3d11_gfx_read_viewport(void* data, uint8_t* buffer, bool is_idle)
50205247
{
50215248
d3d11_video_t* d3d11 = (d3d11_video_t*)data;
@@ -5118,9 +5345,19 @@ static bool d3d11_gfx_read_viewport(void* data, uint8_t* buffer, bool is_idle)
51185345
#ifdef HAVE_DXGI_HDR
51195346
case DXGI_FORMAT_R10G10B10A2_UNORM:
51205347
case DXGI_FORMAT_R16G16B16A16_FLOAT:
5121-
/* HDR10 PQ or scRGB: hand off to the CPU HDR decoder.
5122-
* It undoes the forward HDR encoding using paper_white_nits
5123-
* and writes sRGB-encoded BGR24 bottom-up. */
5348+
/* HDR10 PQ or scRGB. Try the GPU tonemap pass first — it's
5349+
* faster and avoids the per-pixel CPU cost at 4K — and fall
5350+
* back to the CPU decoder on any failure so HDR screenshots
5351+
* still work even if the GPU path breaks (driver PSO compile
5352+
* bug, OOM, etc.). */
5353+
if (d3d11_gpu_hdr_readback_to_bgr24(
5354+
d3d11, BackBufferResource, StagingDesc.Format,
5355+
StagingDesc.Width, StagingDesc.Height,
5356+
vp_x, vp_y, vp_width, vp_height,
5357+
buffer))
5358+
break;
5359+
5360+
RARCH_WARN("[D3D11] GPU HDR readback failed, falling back to CPU.\n");
51245361
if (!dxgi_hdr_readback_to_bgr24(
51255362
StagingDesc.Format,
51265363
Map.pData, Map.RowPitch,

gfx/drivers/d3d_shaders/hdr_sm5.hlsl.h

Lines changed: 73 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -558,4 +558,77 @@ float4 PSMain(PSInput input) : SV_TARGET
558558
return t0.Sample(s0, input.texcoord);
559559
}
560560
};
561+
562+
/* --- Readback path: HDR -> SDR (inverse of the forward composition) ---
563+
* Used by the read_viewport screenshot path in the D3D11/D3D12 drivers
564+
* to sample the HDR backbuffer (HDR10 PQ or scRGB FP16) and produce
565+
* sRGB-encoded bytes that a plain PNG viewer can display correctly.
566+
* Mirrors dxgi_hdr_readback_to_bgr24() in gfx/common/dxgi_common.c but
567+
* runs on the GPU. */
568+
569+
/* sRGB OETF applied per channel.
 * The input is first saturated to [0,1]. Channels at or below 0.0031308
 * take the linear segment (x * 12.92); the rest take the power segment
 * (1.055 * x^(1/2.4) - 0.055). The readback render target is
 * B8G8R8A8_UNORM (no automatic sRGB encode), so the shader does the
 * encoding itself. */
float3 LinearToSRGB(float3 c)
{
   const float3 v          = saturate(c);
   const float3 linear_seg = 12.92f * v;
   const float3 gamma_seg  = pow(v, 1.0f / 2.4f) * 1.055f - 0.055f;
   return (v <= 0.0031308f) ? linear_seg : gamma_seg;
}
578+
579+
/* Compresses a linear HDR colour by dividing all three channels by
 * the same factor (1 + peak * k). Here peak is the largest channel and
 * k = 1 - brightness_nits / max_nits. Using one divisor for all
 * channels keeps their ratios intact. Described by the original author
 * as the reverse of InverseTonemap() (defined elsewhere in this shader).
 *
 * Colours whose peak channel is below 0.0001 are returned untouched.
 * When max_nits equals brightness_nits k is 0 and the colour is
 * returned unchanged. The divisor is floored at 0.0001. */
float3 Tonemap(const float3 hdr_linear, const float max_nits, const float brightness_nits)
{
   const float peak = max(hdr_linear.r, max(hdr_linear.g, hdr_linear.b));

   if (peak < 0.0001f)
      return hdr_linear;

   const float ratio   = max_nits / brightness_nits;
   const float k       = 1.0f - (1.0f / ratio);
   const float divisor = max(1.0f + peak * k, 0.0001f);

   return hdr_linear / divisor;
}
589+
590+
/* Converts an HDR10 sample to linear light in three steps:
 *   1. ST2084ToLinear() decodes the input.
 *   2. The k2020to709 matrix is applied to the decoded colour.
 *   3. The result is scaled by kMaxNitsFor2084 / brightness_nits.
 * Per the original author, the forward pass multiplied by
 * (brightness_nits / 10000) before PQ encoding, so step 3 brings SDR
 * paper-white back to 1.0. */
float3 HDR10ToLinear(float3 hdr10, float brightness_nits)
{
   const float  rescale = kMaxNitsFor2084 / brightness_nits;
   const float3 rgb709  = mul(k2020to709, ST2084ToLinear(hdr10));
   return rgb709 * rescale;
}
601+
602+
/* Readback pixel shader: samples t0 and returns an sRGB-encoded colour
 * with alpha forced to 1. The conversion is chosen by global.hdr_mode:
 *   1     -> HDR10ToLinear() followed by Tonemap()
 *   2     -> scale by kscRGBWhiteNits / global.brightness_nits
 *   other -> the sampled colour is passed through unchanged
 * LinearToSRGB() is applied in every case because the readback render
 * target is B8G8R8A8_UNORM. */
float4 PSMainToSDR(PSInput input) : SV_TARGET
{
   const float3 hdr_rgb = t0.Sample(s0, input.texcoord).rgb;

   /* Default covers unknown modes. The readback path is only expected
    * to use modes 1 and 2. */
   float3 sdr_linear = hdr_rgb;

   if (global.hdr_mode == 1)
   {
      /* NOTE(review): Tonemap() receives brightness_nits for both its
       * max_nits and brightness_nits arguments. That makes its k term 0
       * so the call returns its input unchanged. Confirm whether a
       * peak-nits uniform was intended for the first of the two. */
      sdr_linear = Tonemap(
            HDR10ToLinear(hdr_rgb, global.brightness_nits),
            global.brightness_nits,
            global.brightness_nits);
   }
   else if (global.hdr_mode == 2)
   {
      /* scRGB: values below 0 or above 1 are not rejected here. They
       * are saturated later inside LinearToSRGB(). */
      sdr_linear = hdr_rgb * (kscRGBWhiteNits / global.brightness_nits);
   }

   return float4(LinearToSRGB(sdr_linear), 1.0f);
}
561634
)

0 commit comments

Comments
 (0)