Skip to content

Commit cfb2665

Browse files
MarijnS95 and claude committed
Add Queue::submit() for command buffer submission
Move command buffer submission logic from each backend's Device into Queue::submit(), which takes ownership of the command buffers. Each backend uses its Fence abstraction for GPU synchronization:

- Metal: commit() + waitUntilCompleted()
- Vulkan: vkQueueSubmit() signaling a timeline semaphore (VulkanFence), then VulkanFence::waitForCompletion()
- DX12: ExecuteCommandLists() + Queue::Signal() on the queue-owned DXFence, then DXFence::waitForCompletion()

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 8c6a168 commit cfb2665

4 files changed

Lines changed: 135 additions & 78 deletions

File tree

include/API/Device.h

Lines changed: 11 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -58,6 +58,17 @@ class Queue {
5858
public:
5959
virtual ~Queue() = 0;
6060

61+
/// Submit command buffers for execution and block until completion.
62+
virtual llvm::Error
63+
submit(llvm::SmallVector<std::unique_ptr<CommandBuffer>> CBs) = 0;
64+
65+
/// Convenience overload for submitting a single command buffer.
66+
llvm::Error submit(std::unique_ptr<CommandBuffer> CB) {
67+
llvm::SmallVector<std::unique_ptr<CommandBuffer>> CBs;
68+
CBs.push_back(std::move(CB));
69+
return submit(std::move(CBs));
70+
}
71+
6172
protected:
6273
Queue() = default;
6374
};

lib/API/DX/Device.cpp

Lines changed: 50 additions & 21 deletions
Original file line number | Diff line number | Diff line change
@@ -394,22 +394,35 @@ class DXFence : public offloadtest::Fence {
394394

395395
class DXQueue : public offloadtest::Queue {
396396
public:
397+
using Queue::submit;
398+
397399
ComPtr<ID3D12CommandQueue> Queue;
400+
std::unique_ptr<DXFence> SubmitFence;
401+
uint64_t FenceCounter = 0;
398402

399-
DXQueue(ComPtr<ID3D12CommandQueue> Queue) : Queue(Queue) {}
400-
virtual ~DXQueue() {}
403+
DXQueue(ComPtr<ID3D12CommandQueue> Queue,
404+
std::unique_ptr<DXFence> SubmitFence)
405+
: Queue(Queue), SubmitFence(std::move(SubmitFence)) {}
406+
~DXQueue() override {}
401407

402408
static llvm::Expected<DXQueue>
403409
createGraphicsQueue(ComPtr<ID3D12Device> Device) {
404410
const D3D12_COMMAND_QUEUE_DESC Desc = {D3D12_COMMAND_LIST_TYPE_DIRECT, 0,
405411
D3D12_COMMAND_QUEUE_FLAG_NONE, 0};
406-
ComPtr<ID3D12CommandQueue> Queue;
407-
if (auto Err =
408-
HR::toError(Device->CreateCommandQueue(&Desc, IID_PPV_ARGS(&Queue)),
409-
"Failed to create command queue."))
412+
ComPtr<ID3D12CommandQueue> CmdQueue;
413+
if (auto Err = HR::toError(
414+
Device->CreateCommandQueue(&Desc, IID_PPV_ARGS(&CmdQueue)),
415+
"Failed to create command queue."))
410416
return Err;
411-
return DXQueue(Queue);
417+
auto FenceOrErr = DXFence::create(Device.Get(), "QueueSubmitFence");
418+
if (!FenceOrErr)
419+
return FenceOrErr.takeError();
420+
return DXQueue(CmdQueue, std::move(*FenceOrErr));
412421
}
422+
423+
llvm::Error
424+
submit(llvm::SmallVector<std::unique_ptr<offloadtest::CommandBuffer>> CBs)
425+
override;
413426
};
414427

415428
class DXCommandBuffer : public offloadtest::CommandBuffer {
@@ -442,6 +455,28 @@ class DXCommandBuffer : public offloadtest::CommandBuffer {
442455
DXCommandBuffer() : CommandBuffer(GPUAPI::DirectX) {}
443456
};
444457

458+
llvm::Error DXQueue::submit(
459+
llvm::SmallVector<std::unique_ptr<offloadtest::CommandBuffer>> CBs) {
460+
for (auto &CB : CBs) {
461+
auto &DCB = CB->as<DXCommandBuffer>();
462+
if (auto Err =
463+
HR::toError(DCB.CmdList->Close(), "Failed to close command list."))
464+
return Err;
465+
466+
ID3D12CommandList *const CmdLists[] = {DCB.CmdList.Get()};
467+
Queue->ExecuteCommandLists(1, CmdLists);
468+
469+
const uint64_t CurrentCounter = ++FenceCounter;
470+
if (auto Err =
471+
HR::toError(Queue->Signal(SubmitFence->Fence.Get(), CurrentCounter),
472+
"Failed to add signal."))
473+
return Err;
474+
475+
if (auto Err = SubmitFence->waitForCompletion(CurrentCounter))
476+
return Err;
477+
}
478+
return llvm::Error::success();
479+
}
445480
class DXDevice : public offloadtest::Device {
446481
private:
447482
ComPtr<IDXCoreAdapter> Adapter;
@@ -489,10 +524,10 @@ class DXDevice : public offloadtest::Device {
489524
public:
490525
DXDevice(ComPtr<IDXCoreAdapter> A, ComPtr<ID3D12Device> D, DXQueue Q,
491526
std::string Desc)
492-
: Adapter(A), Device(D), GraphicsQueue(Q) {
527+
: Adapter(A), Device(D), GraphicsQueue(std::move(Q)) {
493528
Description = Desc;
494529
}
495-
DXDevice(const DXDevice &) = default;
530+
DXDevice(const DXDevice &) = delete;
496531

497532
~DXDevice() override = default;
498533

@@ -644,9 +679,8 @@ class DXDevice : public offloadtest::Device {
644679
auto GraphicsQueueOrErr = DXQueue::createGraphicsQueue(Device);
645680
if (!GraphicsQueueOrErr)
646681
return GraphicsQueueOrErr.takeError();
647-
const DXQueue GraphicsQueue = *GraphicsQueueOrErr;
648-
649-
return std::make_unique<DXDevice>(Adapter, Device, std::move(GraphicsQueue),
682+
return std::make_unique<DXDevice>(Adapter, Device,
683+
std::move(*GraphicsQueueOrErr),
650684
std::string(DescVec.data()));
651685
}
652686

@@ -1334,8 +1368,10 @@ class DXDevice : public offloadtest::Device {
13341368
IS.CB->CmdList->ResourceBarrier(1, &Barrier);
13351369
}
13361370

1371+
// waitForSignal is used for tile mapping synchronization, not command buffer
1372+
// submission. TODO: Replace with a proper fence abstraction.
13371373
llvm::Error waitForSignal(InvocationState &IS) {
1338-
// This is a hack but it works since this is all single threaded code.
1374+
// Reuse the command buffer's fence for a quick queue-level signal/wait.
13391375
static uint64_t FenceCounter = 0;
13401376
const uint64_t CurrentCounter = FenceCounter + 1;
13411377
auto *F = static_cast<DXFence *>(IS.Fence.get());
@@ -1353,14 +1389,7 @@ class DXDevice : public offloadtest::Device {
13531389
}
13541390

13551391
llvm::Error executeCommandList(InvocationState &IS) {
1356-
if (auto Err = HR::toError(IS.CB->CmdList->Close(),
1357-
"Failed to close command list."))
1358-
return Err;
1359-
1360-
ID3D12CommandList *const CmdLists[] = {IS.CB->CmdList.Get()};
1361-
GraphicsQueue.Queue->ExecuteCommandLists(1, CmdLists);
1362-
1363-
return waitForSignal(IS);
1392+
return GraphicsQueue.submit(std::move(IS.CB));
13641393
}
13651394

13661395
llvm::Error createComputeCommands(Pipeline &P, InvocationState &IS) {

lib/API/MTL/MTLDevice.cpp

Lines changed: 21 additions & 19 deletions
Original file line number | Diff line number | Diff line change
@@ -77,12 +77,18 @@ static MTL::VertexFormat getMTLVertexFormat(DataFormat Format, int Channels) {
7777
namespace {
7878
class MTLQueue : public offloadtest::Queue {
7979
public:
80+
using Queue::submit;
81+
8082
MTL::CommandQueue *Queue;
8183
MTLQueue(MTL::CommandQueue *Queue) : Queue(Queue) {}
82-
~MTLQueue() {
84+
~MTLQueue() override {
8385
if (Queue)
8486
Queue->release();
8587
}
88+
89+
llvm::Error
90+
submit(llvm::SmallVector<std::unique_ptr<offloadtest::CommandBuffer>> CBs)
91+
override;
8692
};
8793

8894
class MTLFence : public offloadtest::Fence {
@@ -170,6 +176,19 @@ class MTLCommandBuffer : public offloadtest::CommandBuffer {
170176
MTLCommandBuffer() : CommandBuffer(GPUAPI::Metal) {}
171177
};
172178

179+
llvm::Error MTLQueue::submit(
180+
llvm::SmallVector<std::unique_ptr<offloadtest::CommandBuffer>> CBs) {
181+
for (auto &CB : CBs) {
182+
auto &MCB = CB->as<MTLCommandBuffer>();
183+
MCB.CmdBuffer->commit();
184+
MCB.CmdBuffer->waitUntilCompleted();
185+
186+
NS::Error *Err = MCB.CmdBuffer->error();
187+
if (Err)
188+
return toError(Err);
189+
}
190+
return llvm::Error::success();
191+
}
173192
class MTLDevice : public offloadtest::Device {
174193
Capabilities Caps;
175194
MTL::Device *Device;
@@ -641,24 +660,7 @@ class MTLDevice : public offloadtest::Device {
641660
}
642661

643662
llvm::Error executeCommands(InvocationState &IS) {
644-
// This is a hack but it works since this is all single threaded code.
645-
static uint64_t FenceCounter = 0;
646-
const uint64_t CurrentCounter = FenceCounter + 1;
647-
auto *F = static_cast<MTLFence *>(IS.Fence.get());
648-
649-
IS.CB->CmdBuffer->encodeSignalEvent(F->Event, CurrentCounter);
650-
IS.CB->CmdBuffer->commit();
651-
652-
if (auto Err = IS.Fence->waitForCompletion(CurrentCounter))
653-
return Err;
654-
655-
// Check and surface any errors that occurred during execution.
656-
NS::Error *CBErr = IS.CB->CmdBuffer->error();
657-
if (CBErr)
658-
return toError(CBErr);
659-
660-
FenceCounter = CurrentCounter;
661-
return llvm::Error::success();
663+
return GraphicsQueue.submit(std::move(IS.CB));
662664
}
663665

664666
llvm::Error copyBack(Pipeline &P, InvocationState &IS) {

lib/API/VK/Device.cpp

Lines changed: 53 additions & 38 deletions
Original file line number | Diff line number | Diff line change
@@ -472,10 +472,23 @@ class VulkanFence : public offloadtest::Fence {
472472

473473
class VulkanQueue : public offloadtest::Queue {
474474
public:
475+
using Queue::submit;
476+
475477
VkQueue Queue = VK_NULL_HANDLE;
476478
uint32_t QueueFamilyIdx = 0;
477-
VulkanQueue(VkQueue Q, uint32_t QueueFamilyIdx)
478-
: Queue(Q), QueueFamilyIdx(QueueFamilyIdx) {}
479+
// TODO: Ensure device lifetime is managed (e.g. via shared_ptr).
480+
VkDevice Device = VK_NULL_HANDLE;
481+
std::unique_ptr<VulkanFence> SubmitFence;
482+
uint64_t FenceCounter = 0;
483+
484+
VulkanQueue(VkQueue Q, uint32_t QueueFamilyIdx, VkDevice Device,
485+
std::unique_ptr<VulkanFence> SubmitFence)
486+
: Queue(Q), QueueFamilyIdx(QueueFamilyIdx), Device(Device),
487+
SubmitFence(std::move(SubmitFence)) {}
488+
489+
llvm::Error
490+
submit(llvm::SmallVector<std::unique_ptr<offloadtest::CommandBuffer>> CBs)
491+
override;
479492
};
480493

481494
class VulkanCommandBuffer : public offloadtest::CommandBuffer {
@@ -528,6 +541,38 @@ class VulkanCommandBuffer : public offloadtest::CommandBuffer {
528541
VulkanCommandBuffer() : CommandBuffer(GPUAPI::Vulkan) {}
529542
};
530543

544+
llvm::Error VulkanQueue::submit(
545+
llvm::SmallVector<std::unique_ptr<offloadtest::CommandBuffer>> CBs) {
546+
for (auto &CB : CBs) {
547+
auto &VCB = CB->as<VulkanCommandBuffer>();
548+
if (vkEndCommandBuffer(VCB.CmdBuffer))
549+
return llvm::createStringError(std::errc::device_or_resource_busy,
550+
"Could not end command buffer.");
551+
552+
const uint64_t SignalValue = ++FenceCounter;
553+
554+
VkTimelineSemaphoreSubmitInfo TimelineInfo = {};
555+
TimelineInfo.sType = VK_STRUCTURE_TYPE_TIMELINE_SEMAPHORE_SUBMIT_INFO;
556+
TimelineInfo.signalSemaphoreValueCount = 1;
557+
TimelineInfo.pSignalSemaphoreValues = &SignalValue;
558+
559+
VkSubmitInfo SubmitInfo = {};
560+
SubmitInfo.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO;
561+
SubmitInfo.pNext = &TimelineInfo;
562+
SubmitInfo.commandBufferCount = 1;
563+
SubmitInfo.pCommandBuffers = &VCB.CmdBuffer;
564+
SubmitInfo.signalSemaphoreCount = 1;
565+
SubmitInfo.pSignalSemaphores = &SubmitFence->Semaphore;
566+
567+
if (vkQueueSubmit(Queue, 1, &SubmitInfo, VK_NULL_HANDLE))
568+
return llvm::createStringError(std::errc::device_or_resource_busy,
569+
"Failed to submit to queue.");
570+
571+
if (auto Err = SubmitFence->waitForCompletion(SignalValue))
572+
return Err;
573+
}
574+
return llvm::Error::success();
575+
}
531576
class VulkanDevice : public offloadtest::Device {
532577
private:
533578
std::shared_ptr<VulkanInstance> Instance;
@@ -726,7 +771,11 @@ class VulkanDevice : public offloadtest::Device {
726771
VkQueue DeviceQueue = VK_NULL_HANDLE;
727772
vkGetDeviceQueue(Device, QueueFamilyIdx, 0, &DeviceQueue);
728773

729-
const VulkanQueue GraphicsQueue = VulkanQueue(DeviceQueue, QueueFamilyIdx);
774+
auto SubmitFenceOrErr = VulkanFence::create(Device, "QueueSubmitFence");
775+
if (!SubmitFenceOrErr)
776+
return SubmitFenceOrErr.takeError();
777+
VulkanQueue GraphicsQueue(DeviceQueue, QueueFamilyIdx, Device,
778+
std::move(*SubmitFenceOrErr));
730779

731780
return std::make_unique<VulkanDevice>(Instance, PhysicalDevice, Props,
732781
Device, std::move(GraphicsQueue),
@@ -1298,41 +1347,7 @@ class VulkanDevice : public offloadtest::Device {
12981347
}
12991348

13001349
llvm::Error executeCommandBuffer(InvocationState &IS) {
1301-
// This is a hack but it works since this is all single threaded code.
1302-
static uint64_t FenceCounter = 0;
1303-
const uint64_t CurrentCounter = FenceCounter + 1;
1304-
1305-
if (vkEndCommandBuffer(IS.CB->CmdBuffer))
1306-
return llvm::createStringError(std::errc::device_or_resource_busy,
1307-
"Could not end command buffer.");
1308-
1309-
auto *F = static_cast<VulkanFence *>(IS.Fence.get());
1310-
1311-
VkTimelineSemaphoreSubmitInfo TimelineSubmitInfo = {};
1312-
TimelineSubmitInfo.sType = VK_STRUCTURE_TYPE_TIMELINE_SEMAPHORE_SUBMIT_INFO;
1313-
TimelineSubmitInfo.signalSemaphoreValueCount = 1;
1314-
TimelineSubmitInfo.pSignalSemaphoreValues = &CurrentCounter;
1315-
1316-
VkSubmitInfo SubmitInfo = {};
1317-
SubmitInfo.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO;
1318-
SubmitInfo.pNext = &TimelineSubmitInfo;
1319-
SubmitInfo.commandBufferCount = 1;
1320-
SubmitInfo.pCommandBuffers = &IS.CB->CmdBuffer;
1321-
SubmitInfo.signalSemaphoreCount = 1;
1322-
SubmitInfo.pSignalSemaphores = &F->Semaphore;
1323-
1324-
// Submit to the queue
1325-
if (vkQueueSubmit(GraphicsQueue.Queue, 1, &SubmitInfo, VK_NULL_HANDLE))
1326-
return llvm::createStringError(std::errc::device_or_resource_busy,
1327-
"Failed to submit to queue.");
1328-
1329-
if (auto Err = IS.Fence->waitForCompletion(CurrentCounter))
1330-
return Err;
1331-
1332-
vkFreeCommandBuffers(Device, IS.CB->CmdPool, 1, &IS.CB->CmdBuffer);
1333-
1334-
FenceCounter = CurrentCounter;
1335-
return llvm::Error::success();
1350+
return GraphicsQueue.submit(std::move(IS.CB));
13361351
}
13371352

13381353
llvm::Error createDescriptorPool(Pipeline &P, InvocationState &IS) {

0 commit comments

Comments
 (0)