Keep submitted command buffers alive until the GPU is done

MarijnS95 · claude · MarijnS95 · commit 2f5662ab9abb · 2026-04-21T09:11:10.000+02:00
Queue::submit() takes ownership of command buffers, but their destructors
free backend resources (VkCommandPool, ID3D12CommandAllocator) that the
GPU may still be reading from.  Track in-flight batches per queue, tagged
with their fence signal value, and non-blockingly query fence progress at
the start of each submit to release completed batches.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
diff --git a/lib/API/DX/Device.cpp b/lib/API/DX/Device.cpp
@@ -399,6 +399,16 @@ class DXQueue : public offloadtest::Queue {
   ComPtr<ID3D12CommandQueue> Queue;
   std::unique_ptr<DXFence> SubmitFence;
   uint64_t FenceCounter = 0;
+  // Batches of command buffers submitted to the GPU that may still be
+  // in-flight.  The ID3D12CommandAllocator owns the backing memory for
+  // recorded commands, so it must outlive GPU execution.  Each batch
+  // records the fence value it signals so we can non-blockingly query
+  // progress and release completed batches.
+  struct InFlightBatch {
+    uint64_t FenceValue;
+    llvm::SmallVector<std::unique_ptr<offloadtest::CommandBuffer>> CBs;
+  };
+  llvm::SmallVector<InFlightBatch> InFlightBatches;
 
   DXQueue(ComPtr<ID3D12CommandQueue> Queue,
           std::unique_ptr<DXFence> SubmitFence)
@@ -460,12 +470,21 @@ class DXCommandBuffer : public offloadtest::CommandBuffer {
 
 llvm::Expected<offloadtest::SubmitResult> DXQueue::submit(
     llvm::SmallVector<std::unique_ptr<offloadtest::CommandBuffer>> CBs) {
+  // Non-blocking: query how far the GPU has progressed and release
+  // command buffers from completed submissions.
+  {
+    const uint64_t Completed = SubmitFence->getFenceValue();
+    llvm::erase_if(InFlightBatches, [Completed](const InFlightBatch &B) {
+      return B.FenceValue <= Completed;
+    });
+  }
+
   llvm::SmallVector<ID3D12CommandList *> CmdLists;
   CmdLists.reserve(CBs.size());
 
-  // Wait on the previous submit's fence value before executing this batch,
-  // so that back-to-back submits don't overlap on the GPU. Skip on first
-  // submit since Wait(fence, 0) triggers a D3D12 validation warning.
+  // GPU-side wait so that back-to-back submits don't overlap on the GPU.
+  // Skip on first submit since Wait(fence, 0) triggers a D3D12 validation
+  // warning.
   if (FenceCounter > 0)
     if (auto Err =
             HR::toError(Queue->Wait(SubmitFence->Fence.Get(), FenceCounter),
@@ -488,6 +507,9 @@ llvm::Expected<offloadtest::SubmitResult> DXQueue::submit(
                       "Failed to add signal."))
     return Err;
 
+  // Keep submitted command buffers alive until the GPU is done with them.
+  InFlightBatches.push_back({CurrentCounter, std::move(CBs)});
+
   return offloadtest::SubmitResult{SubmitFence.get(), CurrentCounter};
 }
 class DXDevice : public offloadtest::Device {
diff --git a/lib/API/MTL/MTLDevice.cpp b/lib/API/MTL/MTLDevice.cpp
@@ -114,6 +114,15 @@ class MTLQueue : public offloadtest::Queue {
   std::unique_ptr<MTLFence> SubmitFence;
   uint64_t FenceCounter = 0;
 
+  // Batches of command buffers submitted to the GPU that may still be
+  // in-flight.  Each batch records the fence value it signals so we can
+  // non-blockingly query progress and release completed batches.
+  struct InFlightBatch {
+    uint64_t FenceValue;
+    llvm::SmallVector<std::unique_ptr<offloadtest::CommandBuffer>> CBs;
+  };
+  llvm::SmallVector<InFlightBatch> InFlightBatches;
+
   MTLQueue(MTL::CommandQueue *Queue, std::unique_ptr<MTLFence> SubmitFence)
       : Queue(Queue), SubmitFence(std::move(SubmitFence)) {}
   ~MTLQueue() override {
@@ -184,6 +193,15 @@ class MTLCommandBuffer : public offloadtest::CommandBuffer {
 
 llvm::Expected<offloadtest::SubmitResult> MTLQueue::submit(
     llvm::SmallVector<std::unique_ptr<offloadtest::CommandBuffer>> CBs) {
+  // Non-blocking: query how far the GPU has progressed and release
+  // command buffers from completed submissions.
+  {
+    const uint64_t Completed = SubmitFence->getFenceValue();
+    llvm::erase_if(InFlightBatches, [Completed](const InFlightBatch &B) {
+      return B.FenceValue <= Completed;
+    });
+  }
+
   // Metal serial queues guarantee that command buffers execute in commit order,
   // so no explicit wait on prior work is needed here.
   const uint64_t SignalValue = ++FenceCounter;
@@ -196,6 +214,9 @@ llvm::Expected<offloadtest::SubmitResult> MTLQueue::submit(
     MCB.CmdBuffer->commit();
   }
 
+  // Keep submitted command buffers alive until the GPU is done with them.
+  InFlightBatches.push_back({SignalValue, std::move(CBs)});
+
   return offloadtest::SubmitResult{SubmitFence.get(), SignalValue};
 }
 class MTLDevice : public offloadtest::Device {
diff --git a/lib/API/VK/Device.cpp b/lib/API/VK/Device.cpp
@@ -480,6 +480,16 @@ class VulkanQueue : public offloadtest::Queue {
   VkDevice Device = VK_NULL_HANDLE;
   std::unique_ptr<VulkanFence> SubmitFence;
   uint64_t FenceCounter = 0;
+  // Batches of command buffers submitted to the GPU that may still be
+  // in-flight.  VulkanCommandBuffer's destructor destroys the VkCommandPool,
+  // which would invalidate any still-pending command buffers.  Each batch
+  // records the fence value it signals so we can non-blockingly query
+  // progress and release completed batches.
+  struct InFlightBatch {
+    uint64_t FenceValue;
+    llvm::SmallVector<std::unique_ptr<offloadtest::CommandBuffer>> CBs;
+  };
+  llvm::SmallVector<InFlightBatch> InFlightBatches;
 
   VulkanQueue(VkQueue Q, uint32_t QueueFamilyIdx, VkDevice Device,
               std::unique_ptr<VulkanFence> SubmitFence)
@@ -545,12 +555,20 @@ class VulkanCommandBuffer : public offloadtest::CommandBuffer {
 
 llvm::Expected<offloadtest::SubmitResult> VulkanQueue::submit(
     llvm::SmallVector<std::unique_ptr<offloadtest::CommandBuffer>> CBs) {
+  // Non-blocking: query how far the GPU has progressed and release
+  // command buffers from completed submissions.
+  {
+    const uint64_t Completed = SubmitFence->getFenceValue();
+    llvm::erase_if(InFlightBatches, [Completed](const InFlightBatch &B) {
+      return B.FenceValue <= Completed;
+    });
+  }
+
   llvm::SmallVector<VkCommandBuffer> CmdBuffers;
   CmdBuffers.reserve(CBs.size());
 
-  // Wait on the previous submit's fence value before executing this batch,
-  // so that back-to-back submits don't overlap on the GPU. Waiting for a
-  // value that is already signaled (including 0 on first submit) is a no-op.
+  // GPU-side wait so that back-to-back submits don't overlap on the GPU.
+  // Waiting for a value that is already signaled (including 0) is a no-op.
   const uint64_t WaitValue = FenceCounter;
   const uint64_t SignalValue = ++FenceCounter;
   const VkPipelineStageFlags WaitStage = VK_PIPELINE_STAGE_ALL_COMMANDS_BIT;
@@ -585,6 +603,9 @@ llvm::Expected<offloadtest::SubmitResult> VulkanQueue::submit(
     return llvm::createStringError(std::errc::device_or_resource_busy,
                                    "Failed to submit to queue.");
 
+  // Keep submitted command buffers alive until the GPU is done with them.
+  InFlightBatches.push_back({SignalValue, std::move(CBs)});
+
   return offloadtest::SubmitResult{SubmitFence.get(), SignalValue};
 }
 class VulkanDevice : public offloadtest::Device {
@@ -831,6 +852,9 @@ class VulkanDevice : public offloadtest::Device {
   ~VulkanDevice() override {
     if (Device != VK_NULL_HANDLE) {
       vkDeviceWaitIdle(Device);
+      // Release in-flight command buffers before destroying the device,
+      // since their destructors call vkDestroyCommandPool on the VkDevice.
+      GraphicsQueue.InFlightBatches.clear();
       // Destroy the queue's fence before the device, since the fence
       // references the VkDevice for vkDestroySemaphore.
       GraphicsQueue.SubmitFence.reset();