Skip to content

Commit 2f5662a

Browse files
MarijnS95 and claude committed
Keep submitted command buffers alive until the GPU is done
Queue::submit() takes ownership of command buffers, but their destructors free backend resources (VkCommandPool, ID3D12CommandAllocator) that the GPU may still be reading from. Track in-flight batches per queue, each tagged with the fence value it signals, and at the start of each submit query fence progress without blocking so that completed batches can be released.

Co-Authored-By: Claude Opus 4.6 (1M context) <[email protected]>
1 parent b1b3f4e commit 2f5662a

3 files changed

Lines changed: 73 additions & 6 deletions

File tree

lib/API/DX/Device.cpp

Lines changed: 25 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -399,6 +399,16 @@ class DXQueue : public offloadtest::Queue {
399399
ComPtr<ID3D12CommandQueue> Queue;
400400
std::unique_ptr<DXFence> SubmitFence;
401401
uint64_t FenceCounter = 0;
402+
// Batches of command buffers submitted to the GPU that may still be
403+
// in-flight. The ID3D12CommandAllocator owns the backing memory for
404+
// recorded commands, so it must outlive GPU execution. Each batch
405+
// records the fence value it signals so we can non-blockingly query
406+
// progress and release completed batches.
407+
struct InFlightBatch {
408+
uint64_t FenceValue;
409+
llvm::SmallVector<std::unique_ptr<offloadtest::CommandBuffer>> CBs;
410+
};
411+
llvm::SmallVector<InFlightBatch> InFlightBatches;
402412

403413
DXQueue(ComPtr<ID3D12CommandQueue> Queue,
404414
std::unique_ptr<DXFence> SubmitFence)
@@ -460,12 +470,21 @@ class DXCommandBuffer : public offloadtest::CommandBuffer {
460470

461471
llvm::Expected<offloadtest::SubmitResult> DXQueue::submit(
462472
llvm::SmallVector<std::unique_ptr<offloadtest::CommandBuffer>> CBs) {
473+
// Non-blocking: query how far the GPU has progressed and release
474+
// command buffers from completed submissions.
475+
{
476+
const uint64_t Completed = SubmitFence->getFenceValue();
477+
llvm::erase_if(InFlightBatches, [Completed](const InFlightBatch &B) {
478+
return B.FenceValue <= Completed;
479+
});
480+
}
481+
463482
llvm::SmallVector<ID3D12CommandList *> CmdLists;
464483
CmdLists.reserve(CBs.size());
465484

466-
// Wait on the previous submit's fence value before executing this batch,
467-
// so that back-to-back submits don't overlap on the GPU. Skip on first
468-
// submit since Wait(fence, 0) triggers a D3D12 validation warning.
485+
// GPU-side wait so that back-to-back submits don't overlap on the GPU.
486+
// Skip on first submit since Wait(fence, 0) triggers a D3D12 validation
487+
// warning.
469488
if (FenceCounter > 0)
470489
if (auto Err =
471490
HR::toError(Queue->Wait(SubmitFence->Fence.Get(), FenceCounter),
@@ -488,6 +507,9 @@ llvm::Expected<offloadtest::SubmitResult> DXQueue::submit(
488507
"Failed to add signal."))
489508
return Err;
490509

510+
// Keep submitted command buffers alive until the GPU is done with them.
511+
InFlightBatches.push_back({CurrentCounter, std::move(CBs)});
512+
491513
return offloadtest::SubmitResult{SubmitFence.get(), CurrentCounter};
492514
}
493515
class DXDevice : public offloadtest::Device {

lib/API/MTL/MTLDevice.cpp

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -114,6 +114,15 @@ class MTLQueue : public offloadtest::Queue {
114114
std::unique_ptr<MTLFence> SubmitFence;
115115
uint64_t FenceCounter = 0;
116116

117+
// Batches of command buffers submitted to the GPU that may still be
118+
// in-flight. Each batch records the fence value it signals so we can
119+
// non-blockingly query progress and release completed batches.
120+
struct InFlightBatch {
121+
uint64_t FenceValue;
122+
llvm::SmallVector<std::unique_ptr<offloadtest::CommandBuffer>> CBs;
123+
};
124+
llvm::SmallVector<InFlightBatch> InFlightBatches;
125+
117126
MTLQueue(MTL::CommandQueue *Queue, std::unique_ptr<MTLFence> SubmitFence)
118127
: Queue(Queue), SubmitFence(std::move(SubmitFence)) {}
119128
~MTLQueue() override {
@@ -184,6 +193,15 @@ class MTLCommandBuffer : public offloadtest::CommandBuffer {
184193

185194
llvm::Expected<offloadtest::SubmitResult> MTLQueue::submit(
186195
llvm::SmallVector<std::unique_ptr<offloadtest::CommandBuffer>> CBs) {
196+
// Non-blocking: query how far the GPU has progressed and release
197+
// command buffers from completed submissions.
198+
{
199+
const uint64_t Completed = SubmitFence->getFenceValue();
200+
llvm::erase_if(InFlightBatches, [Completed](const InFlightBatch &B) {
201+
return B.FenceValue <= Completed;
202+
});
203+
}
204+
187205
// Metal serial queues guarantee that command buffers execute in commit order,
188206
// so no explicit wait on prior work is needed here.
189207
const uint64_t SignalValue = ++FenceCounter;
@@ -196,6 +214,9 @@ llvm::Expected<offloadtest::SubmitResult> MTLQueue::submit(
196214
MCB.CmdBuffer->commit();
197215
}
198216

217+
// Keep submitted command buffers alive until the GPU is done with them.
218+
InFlightBatches.push_back({SignalValue, std::move(CBs)});
219+
199220
return offloadtest::SubmitResult{SubmitFence.get(), SignalValue};
200221
}
201222
class MTLDevice : public offloadtest::Device {

lib/API/VK/Device.cpp

Lines changed: 27 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -480,6 +480,16 @@ class VulkanQueue : public offloadtest::Queue {
480480
VkDevice Device = VK_NULL_HANDLE;
481481
std::unique_ptr<VulkanFence> SubmitFence;
482482
uint64_t FenceCounter = 0;
483+
// Batches of command buffers submitted to the GPU that may still be
484+
// in-flight. VulkanCommandBuffer's destructor destroys the VkCommandPool,
485+
// which would invalidate any still-pending command buffers. Each batch
486+
// records the fence value it signals so we can non-blockingly query
487+
// progress and release completed batches.
488+
struct InFlightBatch {
489+
uint64_t FenceValue;
490+
llvm::SmallVector<std::unique_ptr<offloadtest::CommandBuffer>> CBs;
491+
};
492+
llvm::SmallVector<InFlightBatch> InFlightBatches;
483493

484494
VulkanQueue(VkQueue Q, uint32_t QueueFamilyIdx, VkDevice Device,
485495
std::unique_ptr<VulkanFence> SubmitFence)
@@ -545,12 +555,20 @@ class VulkanCommandBuffer : public offloadtest::CommandBuffer {
545555

546556
llvm::Expected<offloadtest::SubmitResult> VulkanQueue::submit(
547557
llvm::SmallVector<std::unique_ptr<offloadtest::CommandBuffer>> CBs) {
558+
// Non-blocking: query how far the GPU has progressed and release
559+
// command buffers from completed submissions.
560+
{
561+
const uint64_t Completed = SubmitFence->getFenceValue();
562+
llvm::erase_if(InFlightBatches, [Completed](const InFlightBatch &B) {
563+
return B.FenceValue <= Completed;
564+
});
565+
}
566+
548567
llvm::SmallVector<VkCommandBuffer> CmdBuffers;
549568
CmdBuffers.reserve(CBs.size());
550569

551-
// Wait on the previous submit's fence value before executing this batch,
552-
// so that back-to-back submits don't overlap on the GPU. Waiting for a
553-
// value that is already signaled (including 0 on first submit) is a no-op.
570+
// GPU-side wait so that back-to-back submits don't overlap on the GPU.
571+
// Waiting for a value that is already signaled (including 0) is a no-op.
554572
const uint64_t WaitValue = FenceCounter;
555573
const uint64_t SignalValue = ++FenceCounter;
556574
const VkPipelineStageFlags WaitStage = VK_PIPELINE_STAGE_ALL_COMMANDS_BIT;
@@ -585,6 +603,9 @@ llvm::Expected<offloadtest::SubmitResult> VulkanQueue::submit(
585603
return llvm::createStringError(std::errc::device_or_resource_busy,
586604
"Failed to submit to queue.");
587605

606+
// Keep submitted command buffers alive until the GPU is done with them.
607+
InFlightBatches.push_back({SignalValue, std::move(CBs)});
608+
588609
return offloadtest::SubmitResult{SubmitFence.get(), SignalValue};
589610
}
590611
class VulkanDevice : public offloadtest::Device {
@@ -831,6 +852,9 @@ class VulkanDevice : public offloadtest::Device {
831852
~VulkanDevice() override {
832853
if (Device != VK_NULL_HANDLE) {
833854
vkDeviceWaitIdle(Device);
855+
// Release in-flight command buffers before destroying the device,
856+
// since their destructors call vkDestroyCommandPool on the VkDevice.
857+
GraphicsQueue.InFlightBatches.clear();
834858
// Destroy the queue's fence before the device, since the fence
835859
// references the VkDevice for vkDestroySemaphore.
836860
GraphicsQueue.SubmitFence.reset();

0 commit comments

Comments
 (0)