Skip to content

Commit 3b4f680

Browse files
MarijnS95 and claude committed
Add Queue::submit() for command buffer submission
Move command buffer submission logic from each backend's Device into Queue::submit(), which takes ownership of the command buffers. For now it blocks internally until completion; a TODO marks that it will return a Fence once the Fence abstraction from PR #1007 is available.

- Metal: commit() + waitUntilCompleted()
- Vulkan: vkEndCommandBuffer() + vkQueueSubmit() with temporary fence + vkWaitForFences()
- DX12: CmdList::Close() + ExecuteCommandLists() + Queue::Signal()/Fence::SetEventOnCompletion() wait

VulkanQueue now stores a VkDevice handle (with a TODO for lifetime management) so it can create/destroy fences independently.

Co-Authored-By: Claude Opus 4.6 (1M context) <[email protected]>
1 parent f1852e8 commit 3b4f680

4 files changed

Lines changed: 140 additions & 72 deletions

File tree

include/API/Device.h

Lines changed: 13 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -76,6 +76,19 @@ class Queue {
7676
public:
7777
virtual ~Queue() = 0;
7878

79+
/// Submit command buffers for execution and block until completion.
80+
// TODO: Return a Fence instead of blocking, once the Fence abstraction
81+
// from PR #1007 is available.
82+
virtual llvm::Error
83+
submit(llvm::SmallVector<std::unique_ptr<CommandBuffer>> CBs) = 0;
84+
85+
/// Convenience overload for submitting a single command buffer.
86+
llvm::Error submit(std::unique_ptr<CommandBuffer> CB) {
87+
llvm::SmallVector<std::unique_ptr<CommandBuffer>> CBs;
88+
CBs.push_back(std::move(CB));
89+
return submit(std::move(CBs));
90+
}
91+
7992
protected:
8093
Queue() = default;
8194
};

lib/API/DX/Device.cpp

Lines changed: 62 additions & 15 deletions
Original file line number | Diff line number | Diff line change
@@ -371,22 +371,28 @@ class DXFence : public offloadtest::Fence {
371371

372372
class DXQueue : public offloadtest::Queue {
373373
public:
374+
using Queue::submit;
375+
374376
ComPtr<ID3D12CommandQueue> Queue;
375377

376378
DXQueue(ComPtr<ID3D12CommandQueue> Queue) : Queue(Queue) {}
377-
virtual ~DXQueue() {}
379+
~DXQueue() override {}
378380

379381
static llvm::Expected<DXQueue>
380382
createGraphicsQueue(ComPtr<ID3D12Device> Device) {
381383
const D3D12_COMMAND_QUEUE_DESC Desc = {D3D12_COMMAND_LIST_TYPE_DIRECT, 0,
382384
D3D12_COMMAND_QUEUE_FLAG_NONE, 0};
383-
ComPtr<ID3D12CommandQueue> Queue;
384-
if (auto Err =
385-
HR::toError(Device->CreateCommandQueue(&Desc, IID_PPV_ARGS(&Queue)),
386-
"Failed to create command queue."))
385+
ComPtr<ID3D12CommandQueue> CmdQueue;
386+
if (auto Err = HR::toError(
387+
Device->CreateCommandQueue(&Desc, IID_PPV_ARGS(&CmdQueue)),
388+
"Failed to create command queue."))
387389
return Err;
388-
return DXQueue(Queue);
390+
return DXQueue(CmdQueue);
389391
}
392+
393+
llvm::Error
394+
submit(llvm::SmallVector<std::unique_ptr<offloadtest::CommandBuffer>> CBs)
395+
override;
390396
};
391397

392398
class DXCommandBuffer : public offloadtest::CommandBuffer {
@@ -419,6 +425,52 @@ class DXCommandBuffer : public offloadtest::CommandBuffer {
419425
DXCommandBuffer() : CommandBuffer(GPUAPI::DirectX) {}
420426
};
421427

428+
/// Submit each command buffer to the D3D12 queue and block until the GPU has
/// finished executing it.
///
/// For every buffer, in order: the command list is closed and executed on the
/// queue, a signal is enqueued on the buffer's own fence, and the call waits
/// for that fence value through the buffer's event. Buffers are processed one
/// at a time, so buffer N+1 is not submitted before buffer N has completed.
///
/// \param CBs Command buffers to execute. They are taken by value, so they are
///            owned by this call for its duration.
/// \returns Success, or the first error encountered. On error the remaining
///          buffers are not submitted.
llvm::Error DXQueue::submit(
    llvm::SmallVector<std::unique_ptr<offloadtest::CommandBuffer>> CBs) {
  // This is a hack but it works since this is all single threaded code.
  static uint64_t FenceCounter = 0;

  for (auto &CB : CBs) {
    auto &DCB = CB->as<DXCommandBuffer>();
    if (auto Err =
            HR::toError(DCB.CmdList->Close(), "Failed to close command list."))
      return Err;

    ID3D12CommandList *const CmdLists[] = {DCB.CmdList.Get()};
    Queue->ExecuteCommandLists(1, CmdLists);

    // Claim the value before signalling. If a later step fails after the
    // signal has been enqueued, the next submission must not reuse a value
    // the GPU may already have written, or its wait would be skipped.
    const uint64_t CurrentCounter = ++FenceCounter;
    if (auto Err = HR::toError(Queue->Signal(DCB.Fence.Get(), CurrentCounter),
                               "Failed to add signal."))
      return Err;

    // Only block when the GPU has not already reached the signalled value.
    if (DCB.Fence->GetCompletedValue() < CurrentCounter) {
#ifdef _WIN32
      HANDLE Event = DCB.Event;
#else // WSL
      HANDLE Event = reinterpret_cast<HANDLE>(DCB.Event);
#endif
      if (auto Err = HR::toError(
              DCB.Fence->SetEventOnCompletion(CurrentCounter, Event),
              "Failed to register end event."))
        return Err;

#ifdef _WIN32
      // A failed wait must not be reported as completed GPU work.
      if (WaitForSingleObject(DCB.Event, INFINITE) != WAIT_OBJECT_0)
        return llvm::createStringError(
            std::error_code(static_cast<int>(GetLastError()),
                            std::system_category()),
            "Failed to wait for end event.");
#else // WSL
      pollfd PollEvent;
      PollEvent.fd = DCB.Event;
      PollEvent.events = POLLIN;
      PollEvent.revents = 0;
      if (poll(&PollEvent, 1, -1) == -1)
        return llvm::createStringError(
            std::error_code(errno, std::system_category()), strerror(errno));
#endif
    }
  }
  return llvm::Error::success();
}
422474
class DXDevice : public offloadtest::Device {
423475
private:
424476
ComPtr<IDXCoreAdapter> Adapter;
@@ -1231,8 +1283,10 @@ class DXDevice : public offloadtest::Device {
12311283
IS.CB->CmdList->ResourceBarrier(1, &Barrier);
12321284
}
12331285

1286+
// waitForSignal is used for tile mapping synchronization, not command buffer
1287+
// submission. TODO: Replace with a proper fence abstraction.
12341288
llvm::Error waitForSignal(InvocationState &IS) {
1235-
// This is a hack but it works since this is all single threaded code.
1289+
// Reuse the command buffer's fence for a quick queue-level signal/wait.
12361290
static uint64_t FenceCounter = 0;
12371291
const uint64_t CurrentCounter = FenceCounter + 1;
12381292
auto *F = static_cast<DXFence *>(IS.Fence.get());
@@ -1250,14 +1304,7 @@ class DXDevice : public offloadtest::Device {
12501304
}
12511305

12521306
llvm::Error executeCommandList(InvocationState &IS) {
1253-
if (auto Err = HR::toError(IS.CB->CmdList->Close(),
1254-
"Failed to close command list."))
1255-
return Err;
1256-
1257-
ID3D12CommandList *const CmdLists[] = {IS.CB->CmdList.Get()};
1258-
GraphicsQueue.Queue->ExecuteCommandLists(1, CmdLists);
1259-
1260-
return waitForSignal(IS);
1307+
return GraphicsQueue.submit(std::move(IS.CB));
12611308
}
12621309

12631310
llvm::Error createComputeCommands(Pipeline &P, InvocationState &IS) {

lib/API/MTL/MTLDevice.cpp

Lines changed: 21 additions & 19 deletions
Original file line number | Diff line number | Diff line change
@@ -76,12 +76,18 @@ static MTL::VertexFormat getMTLVertexFormat(DataFormat Format, int Channels) {
7676
namespace {
7777
class MTLQueue : public offloadtest::Queue {
7878
public:
79+
using Queue::submit;
80+
7981
MTL::CommandQueue *Queue;
8082
MTLQueue(MTL::CommandQueue *Queue) : Queue(Queue) {}
81-
~MTLQueue() {
83+
~MTLQueue() override {
8284
if (Queue)
8385
Queue->release();
8486
}
87+
88+
llvm::Error
89+
submit(llvm::SmallVector<std::unique_ptr<offloadtest::CommandBuffer>> CBs)
90+
override;
8591
};
8692

8793
class MTLFence : public offloadtest::Fence {
@@ -154,6 +160,19 @@ class MTLCommandBuffer : public offloadtest::CommandBuffer {
154160
MTLCommandBuffer() : CommandBuffer(GPUAPI::Metal) {}
155161
};
156162

163+
/// Commit each Metal command buffer and block until it has completed.
///
/// Buffers are handled strictly one after another: a buffer is committed,
/// waited on, and checked for an execution error before the next one is
/// touched.
///
/// \param CBs Command buffers to execute, taken by value.
/// \returns Success, or the error reported by the first command buffer that
///          failed. Buffers after a failing one are not committed.
llvm::Error MTLQueue::submit(
    llvm::SmallVector<std::unique_ptr<offloadtest::CommandBuffer>> CBs) {
  for (auto &Owned : CBs) {
    auto &Buffer = Owned->as<MTLCommandBuffer>();

    Buffer.CmdBuffer->commit();
    Buffer.CmdBuffer->waitUntilCompleted();

    // Surface any error recorded on the command buffer during execution.
    if (NS::Error *Failure = Buffer.CmdBuffer->error())
      return toError(Failure);
  }
  return llvm::Error::success();
}
157176
class MTLDevice : public offloadtest::Device {
158177
Capabilities Caps;
159178
MTL::Device *Device;
@@ -541,24 +560,7 @@ class MTLDevice : public offloadtest::Device {
541560
}
542561

543562
llvm::Error executeCommands(InvocationState &IS) {
544-
// This is a hack but it works since this is all single threaded code.
545-
static uint64_t FenceCounter = 0;
546-
const uint64_t CurrentCounter = FenceCounter + 1;
547-
auto *F = static_cast<MTLFence *>(IS.Fence.get());
548-
549-
IS.CB->CmdBuffer->encodeSignalEvent(F->Event, CurrentCounter);
550-
IS.CB->CmdBuffer->commit();
551-
552-
if (auto Err = IS.Fence->waitForCompletion(CurrentCounter))
553-
return Err;
554-
555-
// Check and surface any errors that occurred during execution.
556-
NS::Error *CBErr = IS.CB->CmdBuffer->error();
557-
if (CBErr)
558-
return toError(CBErr);
559-
560-
FenceCounter = CurrentCounter;
561-
return llvm::Error::success();
563+
return GraphicsQueue.submit(std::move(IS.CB));
562564
}
563565

564566
llvm::Error copyBack(Pipeline &P, InvocationState &IS) {

lib/API/VK/Device.cpp

Lines changed: 44 additions & 38 deletions
Original file line number | Diff line number | Diff line change
@@ -445,10 +445,18 @@ class VulkanFence : public offloadtest::Fence {
445445

446446
class VulkanQueue : public offloadtest::Queue {
447447
public:
448+
using Queue::submit;
449+
448450
VkQueue Queue = VK_NULL_HANDLE;
449451
uint32_t QueueFamilyIdx = 0;
450-
VulkanQueue(VkQueue Q, uint32_t QueueFamilyIdx)
451-
: Queue(Q), QueueFamilyIdx(QueueFamilyIdx) {}
452+
// TODO: Ensure device lifetime is managed (e.g. via shared_ptr).
453+
VkDevice Device = VK_NULL_HANDLE;
454+
VulkanQueue(VkQueue Q, uint32_t QueueFamilyIdx, VkDevice Device)
455+
: Queue(Q), QueueFamilyIdx(QueueFamilyIdx), Device(Device) {}
456+
457+
llvm::Error
458+
submit(llvm::SmallVector<std::unique_ptr<offloadtest::CommandBuffer>> CBs)
459+
override;
452460
};
453461

454462
class VulkanCommandBuffer : public offloadtest::CommandBuffer {
@@ -501,6 +509,37 @@ class VulkanCommandBuffer : public offloadtest::CommandBuffer {
501509
VulkanCommandBuffer() : CommandBuffer(GPUAPI::Vulkan) {}
502510
};
503511

512+
/// End and submit each command buffer to the Vulkan queue, blocking until the
/// GPU has finished executing it.
///
/// Each buffer gets its own temporary VkFence, which is created before the
/// submission and destroyed on every path once the wait has returned. Buffers
/// are processed one at a time.
///
/// NOTE(review): the code this replaces called vkFreeCommandBuffers after the
/// wait. This function does not; presumably VulkanCommandBuffer releases its
/// own handle on destruction. Confirm.
///
/// \param CBs Command buffers to execute, taken by value.
/// \returns Success, or the first error encountered. On error the remaining
///          buffers are not submitted.
llvm::Error VulkanQueue::submit(
    llvm::SmallVector<std::unique_ptr<offloadtest::CommandBuffer>> CBs) {
  for (auto &CB : CBs) {
    auto &VCB = CB->as<VulkanCommandBuffer>();
    if (vkEndCommandBuffer(VCB.CmdBuffer))
      return llvm::createStringError(std::errc::device_or_resource_busy,
                                     "Could not end command buffer.");

    VkSubmitInfo SubmitInfo = {};
    SubmitInfo.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO;
    SubmitInfo.commandBufferCount = 1;
    SubmitInfo.pCommandBuffers = &VCB.CmdBuffer;

    VkFenceCreateInfo FenceInfo = {};
    FenceInfo.sType = VK_STRUCTURE_TYPE_FENCE_CREATE_INFO;
    VkFence Fence = VK_NULL_HANDLE;
    if (vkCreateFence(Device, &FenceInfo, nullptr, &Fence))
      return llvm::createStringError(std::errc::device_or_resource_busy,
                                     "Could not create fence.");

    // Run submit and wait first, then destroy the fence exactly once before
    // inspecting the results, so that no error path leaks it. The wait is
    // skipped when the submission itself failed, since the fence was never
    // handed to the queue in that case.
    const VkResult SubmitResult = vkQueueSubmit(Queue, 1, &SubmitInfo, Fence);
    const VkResult WaitResult =
        SubmitResult == VK_SUCCESS
            ? vkWaitForFences(Device, 1, &Fence, VK_TRUE, UINT64_MAX)
            : VK_SUCCESS;
    vkDestroyFence(Device, Fence, nullptr);

    if (SubmitResult)
      return llvm::createStringError(std::errc::device_or_resource_busy,
                                     "Failed to submit to queue.");
    if (WaitResult)
      return llvm::createStringError(std::errc::device_or_resource_busy,
                                     "Failed waiting for fence.");
  }
  return llvm::Error::success();
}
504543
class VulkanDevice : public offloadtest::Device {
505544
private:
506545
std::shared_ptr<VulkanInstance> Instance;
@@ -699,7 +738,8 @@ class VulkanDevice : public offloadtest::Device {
699738
VkQueue DeviceQueue = VK_NULL_HANDLE;
700739
vkGetDeviceQueue(Device, QueueFamilyIdx, 0, &DeviceQueue);
701740

702-
const VulkanQueue GraphicsQueue = VulkanQueue(DeviceQueue, QueueFamilyIdx);
741+
const VulkanQueue GraphicsQueue =
742+
VulkanQueue(DeviceQueue, QueueFamilyIdx, Device);
703743

704744
return std::make_unique<VulkanDevice>(Instance, PhysicalDevice, Props,
705745
Device, std::move(GraphicsQueue),
@@ -1239,41 +1279,7 @@ class VulkanDevice : public offloadtest::Device {
12391279
}
12401280

12411281
llvm::Error executeCommandBuffer(InvocationState &IS) {
1242-
// This is a hack but it works since this is all single threaded code.
1243-
static uint64_t FenceCounter = 0;
1244-
const uint64_t CurrentCounter = FenceCounter + 1;
1245-
1246-
if (vkEndCommandBuffer(IS.CB->CmdBuffer))
1247-
return llvm::createStringError(std::errc::device_or_resource_busy,
1248-
"Could not end command buffer.");
1249-
1250-
auto *F = static_cast<VulkanFence *>(IS.Fence.get());
1251-
1252-
VkTimelineSemaphoreSubmitInfo TimelineSubmitInfo = {};
1253-
TimelineSubmitInfo.sType = VK_STRUCTURE_TYPE_TIMELINE_SEMAPHORE_SUBMIT_INFO;
1254-
TimelineSubmitInfo.signalSemaphoreValueCount = 1;
1255-
TimelineSubmitInfo.pSignalSemaphoreValues = &CurrentCounter;
1256-
1257-
VkSubmitInfo SubmitInfo = {};
1258-
SubmitInfo.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO;
1259-
SubmitInfo.pNext = &TimelineSubmitInfo;
1260-
SubmitInfo.commandBufferCount = 1;
1261-
SubmitInfo.pCommandBuffers = &IS.CB->CmdBuffer;
1262-
SubmitInfo.signalSemaphoreCount = 1;
1263-
SubmitInfo.pSignalSemaphores = &F->Semaphore;
1264-
1265-
// Submit to the queue
1266-
if (vkQueueSubmit(GraphicsQueue.Queue, 1, &SubmitInfo, VK_NULL_HANDLE))
1267-
return llvm::createStringError(std::errc::device_or_resource_busy,
1268-
"Failed to submit to queue.");
1269-
1270-
if (auto Err = IS.Fence->waitForCompletion(CurrentCounter))
1271-
return Err;
1272-
1273-
vkFreeCommandBuffers(Device, IS.CB->CmdPool, 1, &IS.CB->CmdBuffer);
1274-
1275-
FenceCounter = CurrentCounter;
1276-
return llvm::Error::success();
1282+
return GraphicsQueue.submit(std::move(IS.CB));
12771283
}
12781284

12791285
llvm::Error createDescriptorPool(Pipeline &P, InvocationState &IS) {

0 commit comments

Comments
 (0)