Skip to content

Commit a57700f

Browse files
MarijnS95claude
andcommitted
Return SubmitResult from Queue::submit() instead of blocking
Change `Queue::submit()` to return `Expected<SubmitResult>` (a fence + signal value pair) instead of blocking until completion. Each backend (Vulkan, DirectX, Metal) now returns its queue fence and counter value, and callers explicitly wait via `SubmitResult::waitForCompletion()` before reading back results. This allows command buffers submitted between the first and last submit to remain in-flight on the GPU without host-side synchronization. - Vulkan: `cleanup()` waits on the queue fence for error-path safety - DirectX: tile mapping sync reuses the queue's `SubmitFence` - Metal: `MTLQueue` gains a `SharedEvent` fence; the last command buffer in each batch encodes a signal on it Co-Authored-By: Claude Opus 4.6 (1M context) <[email protected]>
1 parent 6823d34 commit a57700f

4 files changed

Lines changed: 105 additions & 94 deletions

File tree

include/API/Device.h

Lines changed: 17 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -54,18 +54,29 @@ class Fence {
5454
Fence() = default;
5555
};
5656

57+
/// Returned by Queue::submit() so that callers can decide when to block.
58+
struct SubmitResult {
59+
Fence *F;
60+
uint64_t Value;
61+
62+
llvm::Error waitForCompletion() const { return F->waitForCompletion(Value); }
63+
};
64+
5765
class Queue {
5866
public:
5967
virtual ~Queue() = 0;
60-
61-
/// Submit command buffers for execution and block until completion.
62-
/// Command buffers execute in array order, but dependencies between them
63-
/// require appropriate barriers within the command buffers themselves.
64-
virtual llvm::Error
68+
Queue(const Queue &) = delete;
69+
Queue &operator=(const Queue &) = delete;
70+
Queue(Queue &&) = default;
71+
Queue &operator=(Queue &&) = default;
72+
73+
/// Submit command buffers for GPU execution. Returns a fence + value that
74+
/// the caller can wait on; the call itself does not block.
75+
virtual llvm::Expected<SubmitResult>
6576
submit(llvm::SmallVector<std::unique_ptr<CommandBuffer>> CBs) = 0;
6677

6778
/// Convenience overload for submitting a single command buffer.
68-
llvm::Error submit(std::unique_ptr<CommandBuffer> CB) {
79+
llvm::Expected<SubmitResult> submit(std::unique_ptr<CommandBuffer> CB) {
6980
llvm::SmallVector<std::unique_ptr<CommandBuffer>> CBs;
7081
CBs.push_back(std::move(CB));
7182
return submit(std::move(CBs));

lib/API/DX/Device.cpp

Lines changed: 17 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -421,7 +421,7 @@ class DXQueue : public offloadtest::Queue {
421421
return DXQueue(CmdQueue, std::move(*FenceOrErr));
422422
}
423423

424-
llvm::Error
424+
llvm::Expected<offloadtest::SubmitResult>
425425
submit(llvm::SmallVector<std::unique_ptr<offloadtest::CommandBuffer>> CBs)
426426
override;
427427
};
@@ -458,7 +458,7 @@ class DXCommandBuffer : public offloadtest::CommandBuffer {
458458
DXCommandBuffer() : CommandBuffer(GPUAPI::DirectX) {}
459459
};
460460

461-
llvm::Error DXQueue::submit(
461+
llvm::Expected<offloadtest::SubmitResult> DXQueue::submit(
462462
llvm::SmallVector<std::unique_ptr<offloadtest::CommandBuffer>> CBs) {
463463
llvm::SmallVector<ID3D12CommandList *> CmdLists;
464464
CmdLists.reserve(CBs.size());
@@ -488,11 +488,7 @@ llvm::Error DXQueue::submit(
488488
"Failed to add signal."))
489489
return Err;
490490

491-
// TODO: Return a Fence+value with keepalive lists instead of blocking here.
492-
if (auto Err = SubmitFence->waitForCompletion(CurrentCounter))
493-
return Err;
494-
495-
return llvm::Error::success();
491+
return offloadtest::SubmitResult{SubmitFence.get(), CurrentCounter};
496492
}
497493
class DXDevice : public offloadtest::Device {
498494
private:
@@ -526,7 +522,6 @@ class DXDevice : public offloadtest::Device {
526522
ComPtr<ID3D12DescriptorHeap> DescHeap;
527523
ComPtr<ID3D12PipelineState> PSO;
528524
std::unique_ptr<DXCommandBuffer> CB;
529-
std::unique_ptr<offloadtest::Fence> Fence;
530525

531526
// Resources for graphics pipelines.
532527
std::shared_ptr<DXTexture> RT;
@@ -947,19 +942,15 @@ class DXDevice : public offloadtest::Device {
947942
&HeapRangeStartOffset, &RangeTileCount, D3D12_TILE_MAPPING_FLAG_NONE);
948943

949944
// Synchronize after UpdateTileMappings, which is a queue operation (not
950-
// recorded into a command list). This is a hack but it works since this
951-
// is all single threaded code.
952-
// TODO: Replace with a proper fence abstraction.
953-
static uint64_t TileMappingFenceCounter = 0;
954-
const uint64_t CurrentCounter = ++TileMappingFenceCounter;
955-
auto *F = static_cast<DXFence *>(IS.Fence.get());
956-
957-
if (auto Err =
958-
HR::toError(CommandQueue->Signal(F->Fence.Get(), CurrentCounter),
959-
"Failed to add signal."))
945+
// recorded into a command list).
946+
const uint64_t CurrentCounter = ++GraphicsQueue.FenceCounter;
947+
if (auto Err = HR::toError(
948+
CommandQueue->Signal(GraphicsQueue.SubmitFence->Fence.Get(),
949+
CurrentCounter),
950+
"Failed to add signal."))
960951
return Err;
961952

962-
return IS.Fence->waitForCompletion(CurrentCounter);
953+
return GraphicsQueue.SubmitFence->waitForCompletion(CurrentCounter);
963954
}
964955

965956
llvm::Expected<ResourceBundle> createSRV(Resource &R, InvocationState &IS) {
@@ -1398,7 +1389,8 @@ class DXDevice : public offloadtest::Device {
13981389
IS.CB->CmdList->ResourceBarrier(1, &Barrier);
13991390
}
14001391

1401-
llvm::Error executeCommandList(InvocationState &IS) {
1392+
llvm::Expected<offloadtest::SubmitResult>
1393+
executeCommandList(InvocationState &IS) {
14021394
return GraphicsQueue.submit(std::move(IS.CB));
14031395
}
14041396

@@ -1874,11 +1866,6 @@ class DXDevice : public offloadtest::Device {
18741866
State.CB = std::move(*CBOrErr);
18751867
llvm::outs() << "Command buffer created.\n";
18761868

1877-
auto FenceOrErr = createFence("Fence");
1878-
if (!FenceOrErr)
1879-
return FenceOrErr.takeError();
1880-
State.Fence = std::move(*FenceOrErr);
1881-
18821869
if (auto Err = createBuffers(P, State))
18831870
return Err;
18841871
llvm::outs() << "Buffers created.\n";
@@ -1918,9 +1905,12 @@ class DXDevice : public offloadtest::Device {
19181905
llvm::outs() << "Graphics command list created complete.\n";
19191906
}
19201907

1921-
if (auto Err = executeCommandList(State))
1922-
return Err;
1908+
auto SubmitResult = executeCommandList(State);
1909+
if (!SubmitResult)
1910+
return SubmitResult.takeError();
19231911
llvm::outs() << "Compute commands executed.\n";
1912+
if (auto Err = SubmitResult->waitForCompletion())
1913+
return Err;
19241914
if (auto Err = readBack(P, State))
19251915
return Err;
19261916
llvm::outs() << "Read data back.\n";

lib/API/MTL/MTLDevice.cpp

Lines changed: 47 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -75,22 +75,6 @@ static MTL::VertexFormat getMTLVertexFormat(DataFormat Format, int Channels) {
7575
}
7676

7777
namespace {
78-
class MTLQueue : public offloadtest::Queue {
79-
public:
80-
using Queue::submit;
81-
82-
MTL::CommandQueue *Queue;
83-
MTLQueue(MTL::CommandQueue *Queue) : Queue(Queue) {}
84-
~MTLQueue() override {
85-
if (Queue)
86-
Queue->release();
87-
}
88-
89-
llvm::Error
90-
submit(llvm::SmallVector<std::unique_ptr<offloadtest::CommandBuffer>> CBs)
91-
override;
92-
};
93-
9478
class MTLFence : public offloadtest::Fence {
9579
public:
9680
MTLFence(MTL::SharedEvent *Event, llvm::StringRef Name)
@@ -122,6 +106,26 @@ class MTLFence : public offloadtest::Fence {
122106
}
123107
};
124108

109+
class MTLQueue : public offloadtest::Queue {
110+
public:
111+
using Queue::submit;
112+
113+
MTL::CommandQueue *Queue;
114+
std::unique_ptr<MTLFence> SubmitFence;
115+
uint64_t FenceCounter = 0;
116+
117+
MTLQueue(MTL::CommandQueue *Queue, std::unique_ptr<MTLFence> SubmitFence)
118+
: Queue(Queue), SubmitFence(std::move(SubmitFence)) {}
119+
~MTLQueue() override {
120+
if (Queue)
121+
Queue->release();
122+
}
123+
124+
llvm::Expected<offloadtest::SubmitResult>
125+
submit(llvm::SmallVector<std::unique_ptr<offloadtest::CommandBuffer>> CBs)
126+
override;
127+
};
128+
125129
class MTLBuffer : public offloadtest::Buffer {
126130
public:
127131
MTL::Buffer *Buf;
@@ -178,23 +182,21 @@ class MTLCommandBuffer : public offloadtest::CommandBuffer {
178182
MTLCommandBuffer() : CommandBuffer(GPUAPI::Metal) {}
179183
};
180184

181-
llvm::Error MTLQueue::submit(
185+
llvm::Expected<offloadtest::SubmitResult> MTLQueue::submit(
182186
llvm::SmallVector<std::unique_ptr<offloadtest::CommandBuffer>> CBs) {
183187
// Metal serial queues guarantee that command buffers execute in commit order,
184188
// so no explicit wait on prior work is needed here.
185-
for (auto &CB : CBs)
186-
llvm::cast<MTLCommandBuffer>(CB.get())->CmdBuffer->commit();
187-
188-
// TODO: Return a Fence+value with keepalive lists instead of blocking here.
189-
for (auto &CB : CBs) {
190-
auto &MCB = *llvm::cast<MTLCommandBuffer>(CB.get());
191-
MCB.CmdBuffer->waitUntilCompleted();
192-
193-
NS::Error *Err = MCB.CmdBuffer->error();
194-
if (Err)
195-
return toError(Err);
189+
const uint64_t SignalValue = ++FenceCounter;
190+
191+
for (size_t I = 0; I < CBs.size(); ++I) {
192+
auto &MCB = *llvm::cast<MTLCommandBuffer>(CBs[I].get());
193+
// Signal the submit fence when the last command buffer completes.
194+
if (I == CBs.size() - 1)
195+
MCB.CmdBuffer->encodeSignalEvent(SubmitFence->Event, SignalValue);
196+
MCB.CmdBuffer->commit();
196197
}
197-
return llvm::Error::success();
198+
199+
return offloadtest::SubmitResult{SubmitFence.get(), SignalValue};
198200
}
199201
class MTLDevice : public offloadtest::Device {
200202
Capabilities Caps;
@@ -228,7 +230,6 @@ class MTLDevice : public offloadtest::Device {
228230
std::shared_ptr<MTLBuffer> FrameBufferReadback;
229231
std::shared_ptr<MTLTexture> DepthStencil;
230232
std::unique_ptr<MTLCommandBuffer> CB;
231-
std::unique_ptr<offloadtest::Fence> Fence;
232233
};
233234

234235
llvm::Error setupVertexShader(InvocationState &IS, const Pipeline &P,
@@ -666,7 +667,8 @@ class MTLDevice : public offloadtest::Device {
666667
return llvm::Error::success();
667668
}
668669

669-
llvm::Error executeCommands(InvocationState &IS) {
670+
llvm::Expected<offloadtest::SubmitResult>
671+
executeCommands(InvocationState &IS) {
670672
return GraphicsQueue.submit(std::move(IS.CB));
671673
}
672674

@@ -720,8 +722,9 @@ class MTLDevice : public offloadtest::Device {
720722
}
721723

722724
public:
723-
MTLDevice(MTL::Device *D, MTL::CommandQueue *Q)
724-
: Device(D), GraphicsQueue(MTLQueue(Q)) {
725+
MTLDevice(MTL::Device *D, MTL::CommandQueue *Q,
726+
std::unique_ptr<MTLFence> SubmitFence)
727+
: Device(D), GraphicsQueue(Q, std::move(SubmitFence)) {
725728
Description = Device->name()->utf8String();
726729
}
727730
const Capabilities &getCapabilities() override {
@@ -783,11 +786,6 @@ class MTLDevice : public offloadtest::Device {
783786
return CBOrErr.takeError();
784787
IS.CB = std::move(*CBOrErr);
785788

786-
auto FenceOrErr = createFence("Fence");
787-
if (!FenceOrErr)
788-
return FenceOrErr.takeError();
789-
IS.Fence = std::move(*FenceOrErr);
790-
791789
if (auto Err = createBuffers(P, IS))
792790
return Err;
793791

@@ -804,7 +802,11 @@ class MTLDevice : public offloadtest::Device {
804802
llvm::outs() << "Created graphics commands.\n";
805803
}
806804

807-
if (auto Err = executeCommands(IS))
805+
auto SubmitResult = executeCommands(IS);
806+
if (!SubmitResult)
807+
return SubmitResult.takeError();
808+
809+
if (auto Err = SubmitResult->waitForCompletion())
808810
return Err;
809811

810812
if (auto Err = copyBack(P, IS))
@@ -825,7 +827,12 @@ llvm::Error offloadtest::initializeMetalDevices(
825827
MTL::Device *MetalDevice = MTL::CreateSystemDefaultDevice();
826828
MTL::CommandQueue *MetalQueue = MetalDevice->newCommandQueue();
827829

828-
auto DefaultDev = std::make_unique<MTLDevice>(MetalDevice, MetalQueue);
830+
auto SubmitFenceOrErr = MTLFence::create(MetalDevice, "QueueSubmitFence");
831+
if (!SubmitFenceOrErr)
832+
return SubmitFenceOrErr.takeError();
833+
834+
auto DefaultDev = std::make_unique<MTLDevice>(MetalDevice, MetalQueue,
835+
std::move(*SubmitFenceOrErr));
829836
Devices.push_back(std::move(DefaultDev));
830837

831838
return llvm::Error::success();

0 commit comments

Comments
 (0)