Skip to content

Commit 83e0ad9

Browse files
MarijnS95 and claude committed
Add Queue::submit() for command buffer submission
Move command buffer submission logic from each backend's Device into Queue::submit(), which takes ownership of the command buffers.

Each backend uses its Fence abstraction for GPU synchronization:
- Metal: commit() + waitUntilCompleted()
- Vulkan: vkQueueSubmit() signaling a timeline semaphore (VulkanFence), then VulkanFence::waitForCompletion()
- DX12: ExecuteCommandLists() + Queue::Signal() on the queue-owned DXFence, then DXFence::waitForCompletion()

Co-Authored-By: Claude Opus 4.6 (1M context) <[email protected]>
1 parent 99a155f commit 83e0ad9

4 files changed

Lines changed: 187 additions & 96 deletions

File tree

include/API/Device.h

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,19 @@ class Queue {
5858
public:
5959
virtual ~Queue() = 0;
6060

61+
/// Submit command buffers for execution and block until completion.
62+
/// Command buffers execute in array order, but dependencies between them
63+
/// require appropriate barriers within the command buffers themselves.
64+
virtual llvm::Error
65+
submit(llvm::SmallVector<std::unique_ptr<CommandBuffer>> CBs) = 0;
66+
67+
/// Convenience overload for submitting a single command buffer.
68+
llvm::Error submit(std::unique_ptr<CommandBuffer> CB) {
69+
llvm::SmallVector<std::unique_ptr<CommandBuffer>> CBs;
70+
CBs.push_back(std::move(CB));
71+
return submit(std::move(CBs));
72+
}
73+
6174
protected:
6275
Queue() = default;
6376
};

lib/API/DX/Device.cpp

Lines changed: 76 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -394,22 +394,36 @@ class DXFence : public offloadtest::Fence {
394394

395395
class DXQueue : public offloadtest::Queue {
396396
public:
397+
using Queue::submit;
398+
397399
ComPtr<ID3D12CommandQueue> Queue;
400+
std::unique_ptr<DXFence> SubmitFence;
401+
uint64_t FenceCounter = 0;
398402

399-
DXQueue(ComPtr<ID3D12CommandQueue> Queue) : Queue(Queue) {}
400-
virtual ~DXQueue() {}
403+
DXQueue(ComPtr<ID3D12CommandQueue> Queue,
404+
std::unique_ptr<DXFence> SubmitFence)
405+
: Queue(Queue), SubmitFence(std::move(SubmitFence)) {}
406+
DXQueue(DXQueue &&) = default;
407+
~DXQueue() override {}
401408

402409
static llvm::Expected<DXQueue>
403410
createGraphicsQueue(ComPtr<ID3D12Device> Device) {
404411
const D3D12_COMMAND_QUEUE_DESC Desc = {D3D12_COMMAND_LIST_TYPE_DIRECT, 0,
405412
D3D12_COMMAND_QUEUE_FLAG_NONE, 0};
406-
ComPtr<ID3D12CommandQueue> Queue;
407-
if (auto Err =
408-
HR::toError(Device->CreateCommandQueue(&Desc, IID_PPV_ARGS(&Queue)),
409-
"Failed to create command queue."))
413+
ComPtr<ID3D12CommandQueue> CmdQueue;
414+
if (auto Err = HR::toError(
415+
Device->CreateCommandQueue(&Desc, IID_PPV_ARGS(&CmdQueue)),
416+
"Failed to create command queue."))
410417
return Err;
411-
return DXQueue(Queue);
418+
auto FenceOrErr = DXFence::create(Device.Get(), "QueueSubmitFence");
419+
if (!FenceOrErr)
420+
return FenceOrErr.takeError();
421+
return DXQueue(CmdQueue, std::move(*FenceOrErr));
412422
}
423+
424+
llvm::Error
425+
submit(llvm::SmallVector<std::unique_ptr<offloadtest::CommandBuffer>> CBs)
426+
override;
413427
};
414428

415429
class DXCommandBuffer : public offloadtest::CommandBuffer {
@@ -444,6 +458,42 @@ class DXCommandBuffer : public offloadtest::CommandBuffer {
444458
DXCommandBuffer() : CommandBuffer(GPUAPI::DirectX) {}
445459
};
446460

461+
/// Submit \p CBs to the D3D12 command queue and block until they complete.
///
/// Every command list is closed, the batch is handed to
/// ExecuteCommandLists() in array order, and the queue-owned SubmitFence is
/// signaled with the next counter value, which is then waited on.
///
/// \param CBs Command buffers to execute; each must be a DXCommandBuffer.
///            They are owned by this call and destroyed when it returns.
/// \returns An error if closing a command list, the queue-side Wait/Signal,
///          or the fence wait fails; success otherwise.
llvm::Error DXQueue::submit(
    llvm::SmallVector<std::unique_ptr<offloadtest::CommandBuffer>> CBs) {
  llvm::SmallVector<ID3D12CommandList *> CmdLists;
  CmdLists.reserve(CBs.size());

  for (auto &CB : CBs) {
    auto &DCB = *llvm::cast<DXCommandBuffer>(CB.get());
    if (auto Err =
            HR::toError(DCB.CmdList->Close(), "Failed to close command list."))
      return Err;
    CmdLists.push_back(DCB.CmdList.Get());
  }

  // Wait on the previous submit's fence value before executing this batch,
  // so that back-to-back submits don't overlap on the GPU. Skip on first
  // submit since Wait(fence, 0) triggers a D3D12 validation warning.
  if (FenceCounter > 0)
    if (auto Err =
            HR::toError(Queue->Wait(SubmitFence->Fence.Get(), FenceCounter),
                        "Failed to wait on previous submit."))
      return Err;

  Queue->ExecuteCommandLists(CmdLists.size(), CmdLists.data());

  // Only commit the new counter value once the signal has actually been
  // enqueued. If Signal() fails and the counter had already been advanced,
  // the next submit would make the queue wait on a value that is never
  // signaled.
  const uint64_t CurrentCounter = FenceCounter + 1;
  if (auto Err =
          HR::toError(Queue->Signal(SubmitFence->Fence.Get(), CurrentCounter),
                      "Failed to add signal."))
    return Err;
  FenceCounter = CurrentCounter;

  // TODO: Return a Fence+value with keepalive lists instead of blocking here.
  if (auto Err = SubmitFence->waitForCompletion(CurrentCounter))
    return Err;

  return llvm::Error::success();
}
447497
class DXDevice : public offloadtest::Device {
448498
private:
449499
ComPtr<IDXCoreAdapter> Adapter;
@@ -491,10 +541,10 @@ class DXDevice : public offloadtest::Device {
491541
public:
492542
DXDevice(ComPtr<IDXCoreAdapter> A, ComPtr<ID3D12Device> D, DXQueue Q,
493543
std::string Desc)
494-
: Adapter(A), Device(D), GraphicsQueue(Q) {
544+
: Adapter(A), Device(D), GraphicsQueue(std::move(Q)) {
495545
Description = Desc;
496546
}
497-
DXDevice(const DXDevice &) = default;
547+
DXDevice(const DXDevice &) = delete;
498548

499549
~DXDevice() override = default;
500550

@@ -646,9 +696,8 @@ class DXDevice : public offloadtest::Device {
646696
auto GraphicsQueueOrErr = DXQueue::createGraphicsQueue(Device);
647697
if (!GraphicsQueueOrErr)
648698
return GraphicsQueueOrErr.takeError();
649-
const DXQueue GraphicsQueue = *GraphicsQueueOrErr;
650-
651-
return std::make_unique<DXDevice>(Adapter, Device, std::move(GraphicsQueue),
699+
return std::make_unique<DXDevice>(Adapter, Device,
700+
std::move(*GraphicsQueueOrErr),
652701
std::string(DescVec.data()));
653702
}
654703

@@ -897,7 +946,20 @@ class DXDevice : public offloadtest::Device {
897946
Buffer.Get(), 1, &StartCoord, &RegionSize, Heap.Get(), 1, &RangeFlag,
898947
&HeapRangeStartOffset, &RangeTileCount, D3D12_TILE_MAPPING_FLAG_NONE);
899948

900-
return waitForSignal(IS);
949+
// Synchronize after UpdateTileMappings, which is a queue operation (not
950+
// recorded into a command list). This is a hack but it works since this
951+
// is all single threaded code.
952+
// TODO: Replace with a proper fence abstraction.
953+
static uint64_t TileMappingFenceCounter = 0;
954+
const uint64_t CurrentCounter = ++TileMappingFenceCounter;
955+
auto *F = static_cast<DXFence *>(IS.Fence.get());
956+
957+
if (auto Err =
958+
HR::toError(CommandQueue->Signal(F->Fence.Get(), CurrentCounter),
959+
"Failed to add signal."))
960+
return Err;
961+
962+
return IS.Fence->waitForCompletion(CurrentCounter);
901963
}
902964

903965
llvm::Expected<ResourceBundle> createSRV(Resource &R, InvocationState &IS) {
@@ -1336,33 +1398,8 @@ class DXDevice : public offloadtest::Device {
13361398
IS.CB->CmdList->ResourceBarrier(1, &Barrier);
13371399
}
13381400

1339-
llvm::Error waitForSignal(InvocationState &IS) {
1340-
// This is a hack but it works since this is all single threaded code.
1341-
static uint64_t FenceCounter = 0;
1342-
const uint64_t CurrentCounter = FenceCounter + 1;
1343-
auto *F = static_cast<DXFence *>(IS.Fence.get());
1344-
1345-
if (auto Err = HR::toError(
1346-
GraphicsQueue.Queue->Signal(F->Fence.Get(), CurrentCounter),
1347-
"Failed to add signal."))
1348-
return Err;
1349-
1350-
if (auto Err = IS.Fence->waitForCompletion(CurrentCounter))
1351-
return Err;
1352-
1353-
FenceCounter = CurrentCounter;
1354-
return llvm::Error::success();
1355-
}
1356-
13571401
llvm::Error executeCommandList(InvocationState &IS) {
1358-
if (auto Err = HR::toError(IS.CB->CmdList->Close(),
1359-
"Failed to close command list."))
1360-
return Err;
1361-
1362-
ID3D12CommandList *const CmdLists[] = {IS.CB->CmdList.Get()};
1363-
GraphicsQueue.Queue->ExecuteCommandLists(1, CmdLists);
1364-
1365-
return waitForSignal(IS);
1402+
return GraphicsQueue.submit(std::move(IS.CB));
13661403
}
13671404

13681405
llvm::Error createComputeCommands(Pipeline &P, InvocationState &IS) {

lib/API/MTL/MTLDevice.cpp

Lines changed: 26 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -77,12 +77,18 @@ static MTL::VertexFormat getMTLVertexFormat(DataFormat Format, int Channels) {
7777
namespace {
7878
class MTLQueue : public offloadtest::Queue {
7979
public:
80+
using Queue::submit;
81+
8082
MTL::CommandQueue *Queue;
8183
MTLQueue(MTL::CommandQueue *Queue) : Queue(Queue) {}
82-
~MTLQueue() {
84+
~MTLQueue() override {
8385
if (Queue)
8486
Queue->release();
8587
}
88+
89+
llvm::Error
90+
submit(llvm::SmallVector<std::unique_ptr<offloadtest::CommandBuffer>> CBs)
91+
override;
8692
};
8793

8894
class MTLFence : public offloadtest::Fence {
@@ -172,6 +178,24 @@ class MTLCommandBuffer : public offloadtest::CommandBuffer {
172178
MTLCommandBuffer() : CommandBuffer(GPUAPI::Metal) {}
173179
};
174180

181+
/// Commit \p CBs to the Metal queue and block until all of them complete.
///
/// \param CBs Command buffers to execute; each must be an MTLCommandBuffer.
///            They are owned by this call and destroyed when it returns.
/// \returns The first execution error reported by any command buffer, in
///          array order, or success if none reported an error.
llvm::Error MTLQueue::submit(
    llvm::SmallVector<std::unique_ptr<offloadtest::CommandBuffer>> CBs) {
  // Metal serial queues guarantee that command buffers execute in commit order,
  // so no explicit wait on prior work is needed here.
  for (auto &CB : CBs)
    llvm::cast<MTLCommandBuffer>(CB.get())->CmdBuffer->commit();

  // TODO: Return a Fence+value with keepalive lists instead of blocking here.
  // Wait for every committed buffer, even after one has failed: returning
  // early would destroy CBs while later buffers may still be executing, and
  // would break the "block until completion" contract of Queue::submit().
  NS::Error *FirstFailure = nullptr;
  for (auto &CB : CBs) {
    auto &MCB = *llvm::cast<MTLCommandBuffer>(CB.get());
    MCB.CmdBuffer->waitUntilCompleted();

    if (!FirstFailure)
      FirstFailure = MCB.CmdBuffer->error();
  }

  // Convert while CBs (and therefore the buffer that reported the error) are
  // still alive.
  if (FirstFailure)
    return toError(FirstFailure);
  return llvm::Error::success();
}
175199
class MTLDevice : public offloadtest::Device {
176200
Capabilities Caps;
177201
MTL::Device *Device;
@@ -643,24 +667,7 @@ class MTLDevice : public offloadtest::Device {
643667
}
644668

645669
llvm::Error executeCommands(InvocationState &IS) {
646-
// This is a hack but it works since this is all single threaded code.
647-
static uint64_t FenceCounter = 0;
648-
const uint64_t CurrentCounter = FenceCounter + 1;
649-
auto *F = static_cast<MTLFence *>(IS.Fence.get());
650-
651-
IS.CB->CmdBuffer->encodeSignalEvent(F->Event, CurrentCounter);
652-
IS.CB->CmdBuffer->commit();
653-
654-
if (auto Err = IS.Fence->waitForCompletion(CurrentCounter))
655-
return Err;
656-
657-
// Check and surface any errors that occurred during execution.
658-
NS::Error *CBErr = IS.CB->CmdBuffer->error();
659-
if (CBErr)
660-
return toError(CBErr);
661-
662-
FenceCounter = CurrentCounter;
663-
return llvm::Error::success();
670+
return GraphicsQueue.submit(std::move(IS.CB));
664671
}
665672

666673
llvm::Error copyBack(Pipeline &P, InvocationState &IS) {

0 commit comments

Comments
 (0)