Skip to content

Commit 78cc483

Browse files
authored
[HLSL] Add GroupWaveIndex/Count execution test (#8277)
Add an HLK execution test for the SM 6.10 GetGroupWaveIndex() and GetGroupWaveCount() intrinsics per the spec at https://microsoft.github.io/hlsl-specs/proposals/0048-group-wave-index/.

The test dispatches a compute shader that writes per-thread wave index, wave count, lane index, lane count, and first-lane group index to a UAV buffer, then verifies:
- GetGroupWaveCount() is uniform across all threads in the group
- GetGroupWaveCount() >= ceil(threadGroupSize / WaveGetLaneCount())
- GetGroupWaveIndex() is in range [0, waveCount)
- GetGroupWaveIndex() is uniform within each wave
- Each wave has a distinct index covering [0, waveCount)

Test configurations cover:
- Multiple thread group sizes: 8, 64, 256, 1024 threads
- 1D, 2D, and 3D thread group dimensions
- Non-power-of-2 thread group size
- WaveSize attribute interaction for each supported wave size
- Single-wave edge case (numthreads <= waveSize)
1 parent 44efa29 commit 78cc483

2 files changed

Lines changed: 217 additions & 0 deletions

File tree

tools/clang/unittests/HLSLExec/ExecutionTest.cpp

Lines changed: 206 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@
2424
#include <array>
2525
#include <string>
2626
#include <map>
27+
#include <set>
2728
#include <unordered_set>
2829
#include <sstream>
2930
#include <iomanip>
@@ -212,6 +213,7 @@ class ExecutionTest {
212213
TEST_METHOD(GroupSharedLimitTest);
213214
TEST_METHOD(GroupSharedLimitASTest);
214215
TEST_METHOD(GroupSharedLimitMSTest);
216+
TEST_METHOD(GroupWaveIndexTest);
215217
TEST_METHOD(PartialDerivTest);
216218
TEST_METHOD(DerivativesTest);
217219
TEST_METHOD(ComputeSampleTest);
@@ -10931,6 +10933,210 @@ void ExecutionTest::GroupSharedLimitMSTest() {
1093110933
}
1093210934
}
1093310935

10936+
void ExecutionTest::GroupWaveIndexTest() {
10937+
WEX::TestExecution::SetVerifyOutput VerifySettings(
10938+
WEX::TestExecution::VerifyOutputSettings::LogOnlyFailures);
10939+
10940+
BEGIN_TEST_METHOD_PROPERTIES()
10941+
TEST_METHOD_PROPERTY(L"Kits.TestId", L"c3f60f00-8e91-4acb-b4be-9f483fbe836b")
10942+
TEST_METHOD_PROPERTY(
10943+
L"Kits.Specification",
10944+
L"Device.Graphics.D3D12.DXILCore.ShaderModel610.CoreRequirement")
10945+
END_TEST_METHOD_PROPERTIES()
10946+
10947+
bool FailIfRequirementsNotMet = false;
10948+
#ifdef _HLK_CONF
10949+
FailIfRequirementsNotMet = true;
10950+
#endif
10951+
WEX::TestExecution::RuntimeParameters::TryGetValue(
10952+
L"FailIfRequirementsNotMet", FailIfRequirementsNotMet);
10953+
10954+
CComPtr<ID3D12Device> Device;
10955+
const bool SkipUnsupported = !FailIfRequirementsNotMet;
10956+
if (!createDevice(&Device, D3D_SHADER_MODEL_6_10, SkipUnsupported)) {
10957+
if (FailIfRequirementsNotMet)
10958+
LogErrorFmt(L"Device creation failed, resulting in test failure, since "
10959+
L"FailIfRequirementsNotMet is set.");
10960+
return;
10961+
}
10962+
10963+
// Get supported wave sizes for WaveSize attribute tests.
10964+
D3D12_FEATURE_DATA_D3D12_OPTIONS1 WaveOpts = {};
10965+
VERIFY_SUCCEEDED(
10966+
Device->CheckFeatureSupport((D3D12_FEATURE)D3D12_FEATURE_D3D12_OPTIONS1,
10967+
&WaveOpts, sizeof(WaveOpts)));
10968+
const UINT MinWaveSize = WaveOpts.WaveLaneCountMin;
10969+
const UINT MaxWaveSize = WaveOpts.WaveLaneCountMax;
10970+
10971+
struct GroupWaveData {
10972+
uint32_t GroupIndex;
10973+
uint32_t WaveIndex;
10974+
uint32_t WaveCount;
10975+
uint32_t LaneIndex;
10976+
uint32_t LaneCount;
10977+
uint32_t FirstLaneGroupIndex;
10978+
};
10979+
10980+
// Shader source uses defines for thread group dimensions and optional
10981+
// WaveSize attribute, injected via compiler -D options.
10982+
const char Shader[] =
10983+
R"(struct GroupWaveData {
10984+
uint GroupIndex;
10985+
uint WaveIndex;
10986+
uint WaveCount;
10987+
uint LaneIndex;
10988+
uint LaneCount;
10989+
uint FirstLaneGroupIndex;
10990+
};
10991+
RWStructuredBuffer<GroupWaveData> Data : register(u0);
10992+
10993+
WAVE_SIZE_ATTR
10994+
[numthreads(NUMTHREADS_X, NUMTHREADS_Y, NUMTHREADS_Z)]
10995+
void main(uint GI : SV_GroupIndex) {
10996+
GroupWaveData D;
10997+
D.GroupIndex = GI;
10998+
D.WaveIndex = GetGroupWaveIndex();
10999+
D.WaveCount = GetGroupWaveCount();
11000+
D.LaneIndex = WaveGetLaneIndex();
11001+
D.LaneCount = WaveGetLaneCount();
11002+
D.FirstLaneGroupIndex = WaveReadLaneFirst(GI);
11003+
Data[GI] = D;
11004+
})";
11005+
11006+
CComPtr<IStream> Stream;
11007+
std::shared_ptr<st::ShaderOpSet> ShaderOpSet =
11008+
std::make_shared<st::ShaderOpSet>();
11009+
readHlslDataIntoNewStream(L"ShaderOpArith.xml", &Stream, m_support);
11010+
st::ParseShaderOpSetFromStream(Stream, ShaderOpSet.get());
11011+
11012+
// Test configurations: {numthreadsX, numthreadsY, numthreadsZ, WaveSize}
11013+
// WaveSize 0 means no [WaveSize] attribute.
11014+
struct TestConfig {
11015+
UINT X, Y, Z;
11016+
UINT WaveSize;
11017+
};
11018+
11019+
std::vector<TestConfig> Configs = {
11020+
{8, 1, 1, 0}, // 1D small (8 threads)
11021+
{8, 8, 1, 0}, // 2D medium (64 threads)
11022+
{16, 16, 1, 0}, // 2D large (256 threads)
11023+
{32, 32, 1, 0}, // 2D max (1024 threads)
11024+
{4, 4, 4, 0}, // 3D (64 threads)
11025+
{10, 1, 1, 0}, // 1D non-power-of-2
11026+
};
11027+
11028+
// Add WaveSize-attributed variants for each supported wave size.
11029+
for (UINT WS = MinWaveSize; WS <= MaxWaveSize; WS *= 2) {
11030+
Configs.push_back({8, 8, 1, WS});
11031+
// Single wave case: numthreads <= WaveSize.
11032+
if (WS >= 8)
11033+
Configs.push_back({8, 1, 1, WS});
11034+
}
11035+
11036+
for (const auto &Cfg : Configs) {
11037+
const UINT NumThreads = Cfg.X * Cfg.Y * Cfg.Z;
11038+
if (Cfg.WaveSize > 0) {
11039+
LogCommentFmt(L"Testing [numthreads(%u,%u,%u)] [WaveSize(%u)] "
11040+
L"(%u threads)",
11041+
Cfg.X, Cfg.Y, Cfg.Z, Cfg.WaveSize, NumThreads);
11042+
} else {
11043+
LogCommentFmt(L"Testing [numthreads(%u,%u,%u)] (%u threads)", Cfg.X,
11044+
Cfg.Y, Cfg.Z, NumThreads);
11045+
}
11046+
11047+
// Build compiler options with thread group defines.
11048+
char CompilerOptions[256];
11049+
if (Cfg.WaveSize > 0) {
11050+
VERIFY_IS_TRUE(
11051+
sprintf_s(CompilerOptions, sizeof(CompilerOptions),
11052+
"-D NUMTHREADS_X=%u -D NUMTHREADS_Y=%u "
11053+
"-D NUMTHREADS_Z=%u -D WAVE_SIZE_ATTR=[wavesize(%u)]",
11054+
Cfg.X, Cfg.Y, Cfg.Z, Cfg.WaveSize) != -1);
11055+
} else {
11056+
VERIFY_IS_TRUE(sprintf_s(CompilerOptions, sizeof(CompilerOptions),
11057+
"-D NUMTHREADS_X=%u -D NUMTHREADS_Y=%u "
11058+
"-D NUMTHREADS_Z=%u -D WAVE_SIZE_ATTR=",
11059+
Cfg.X, Cfg.Y, Cfg.Z) != -1);
11060+
}
11061+
11062+
std::shared_ptr<st::ShaderOpTestResult> Test =
11063+
st::RunShaderOpTestAfterParse(
11064+
Device, m_support, "GroupWaveIndexTest",
11065+
[&](LPCSTR Name, std::vector<BYTE> &Data, st::ShaderOp *ShaderOp) {
11066+
VERIFY_IS_TRUE(0 == strcmp(Name, "UAVBuffer0"));
11067+
ShaderOp->Shaders.at(0).Text = Shader;
11068+
ShaderOp->Shaders.at(0).Arguments = CompilerOptions;
11069+
11070+
VERIFY_IS_TRUE(sizeof(GroupWaveData) * NumThreads <= Data.size());
11071+
GroupWaveData *InData = (GroupWaveData *)Data.data();
11072+
memset(InData, 0, sizeof(GroupWaveData) * NumThreads);
11073+
},
11074+
ShaderOpSet);
11075+
11076+
MappedData DataUav;
11077+
Test->Test->GetReadBackData("UAVBuffer0", &DataUav);
11078+
VERIFY_IS_TRUE(sizeof(GroupWaveData) * NumThreads <= DataUav.size());
11079+
const GroupWaveData *Results = (const GroupWaveData *)DataUav.data();
11080+
11081+
// Verify WaveCount is uniform across all threads and >= 1.
11082+
const uint32_t GroupWaveCount = Results[0].WaveCount;
11083+
VERIFY_IS_GREATER_THAN_OR_EQUAL(GroupWaveCount, 1u);
11084+
for (UINT I = 0; I < NumThreads; ++I) {
11085+
VERIFY_ARE_EQUAL(Results[I].WaveCount, GroupWaveCount);
11086+
}
11087+
11088+
// Verify WaveCount >= ceil(threadGroupSize / LaneCount) per spec.
11089+
const uint32_t GroupLaneCount = Results[0].LaneCount;
11090+
const uint32_t MinWaves =
11091+
(NumThreads + GroupLaneCount - 1) / GroupLaneCount;
11092+
LogCommentFmt(L" waveCount=%u, laneCount=%u, minWaves=%u", GroupWaveCount,
11093+
GroupLaneCount, MinWaves);
11094+
VERIFY_IS_GREATER_THAN_OR_EQUAL(GroupWaveCount, MinWaves);
11095+
11096+
// If a specific WaveSize was requested, verify LaneCount matches.
11097+
if (Cfg.WaveSize > 0) {
11098+
VERIFY_ARE_EQUAL(GroupLaneCount, Cfg.WaveSize);
11099+
}
11100+
11101+
// Verify WaveIndex is in range [0, WaveCount).
11102+
for (UINT I = 0; I < NumThreads; ++I) {
11103+
VERIFY_IS_LESS_THAN(Results[I].WaveIndex, GroupWaveCount);
11104+
}
11105+
11106+
// Group threads by wave using FirstLaneGroupIndex.
11107+
std::map<uint32_t, std::vector<const GroupWaveData *>> Waves;
11108+
for (UINT I = 0; I < NumThreads; ++I) {
11109+
Waves[Results[I].FirstLaneGroupIndex].push_back(&Results[I]);
11110+
}
11111+
11112+
// Verify number of distinct waves matches WaveCount.
11113+
VERIFY_ARE_EQUAL(Waves.size(), static_cast<size_t>(GroupWaveCount));
11114+
11115+
// Verify WaveIndex is uniform within each wave and unique across waves.
11116+
std::set<uint32_t> SeenWaveIndices;
11117+
for (auto &WavePair : Waves) {
11118+
const std::vector<const GroupWaveData *> &Lanes = WavePair.second;
11119+
VERIFY_IS_GREATER_THAN_OR_EQUAL(Lanes.size(), 1u);
11120+
11121+
uint32_t ExpectedWaveIndex = Lanes[0]->WaveIndex;
11122+
for (size_t J = 1; J < Lanes.size(); ++J) {
11123+
VERIFY_ARE_EQUAL(Lanes[J]->WaveIndex, ExpectedWaveIndex);
11124+
}
11125+
11126+
VERIFY_IS_TRUE(SeenWaveIndices.find(ExpectedWaveIndex) ==
11127+
SeenWaveIndices.end());
11128+
SeenWaveIndices.insert(ExpectedWaveIndex);
11129+
}
11130+
11131+
// Verify all wave indices from 0 to WaveCount-1 are present.
11132+
VERIFY_ARE_EQUAL(SeenWaveIndices.size(),
11133+
static_cast<size_t>(GroupWaveCount));
11134+
for (uint32_t I = 0; I < GroupWaveCount; ++I) {
11135+
VERIFY_IS_TRUE(SeenWaveIndices.count(I) == 1);
11136+
}
11137+
}
11138+
}
11139+
1093411140
// Atomic operation testing
1093511141

1093611142
// Atomic tests take a single integer index as input and contort it into some

tools/clang/unittests/HLSLExec/ShaderOpArith.xml

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1915,6 +1915,17 @@
19151915
<Shader Name="PS" Target="ps_6_0" EntryPoint="PSMain" Text="@MS" />
19161916
</ShaderOp>
19171917

1918+
<!-- GroupWaveIndexTest: shader op used by ExecutionTest::GroupWaveIndexTest.
     Runs a single compute shader (cs_6_10) with one read-back buffer,
     UAVBuffer0, bound as root UAV u0 at root index 0 and initialised by name.
     The shader text below is only a placeholder: the test assigns the real
     source and the compiler arguments at runtime.
     The test requires the buffer to hold sizeof(GroupWaveData) * NumThreads
     bytes, i.e. 24 * 1024 = 24576 for its largest configuration.
     NOTE(review): Width is presumably a byte count; confirm it maps to the
     size the test checks against. -->
<ShaderOp Name="GroupWaveIndexTest" CS="CS">
1919+
<RootSignature>RootFlags(0), UAV(u0)</RootSignature>
1920+
<Resource Name="UAVBuffer0" Dimension="BUFFER" Width="32768" InitialResourceState="COPY_DEST" Init="ByName" Flags="ALLOW_UNORDERED_ACCESS" TransitionTo="UNORDERED_ACCESS" ReadBack="true" Format="R32_TYPELESS" />
1921+
<RootValues>
1922+
<RootValue Index="0" ResName="UAVBuffer0" />
1923+
</RootValues>
1924+
<Shader Name="CS" Target="cs_6_10">
1925+
<![CDATA[// Shader source code will be set at runtime]]>
1926+
</Shader>
1927+
</ShaderOp>
1928+
19181929
<ShaderOp Name="PackUnpackOp" CS="CS" DispatchX="1" DispatchY="1">
19191930
<RootSignature>RootFlags(0), UAV(u0), UAV(u1), UAV(u2)</RootSignature>
19201931
<Resource Name="g_bufIn" Dimension="BUFFER" Width="1024" Flags="ALLOW_UNORDERED_ACCESS" InitialResourceState="COPY_DEST" TransitionTo="UNORDERED_ACCESS" Init="ByName" ReadBack="false" />

0 commit comments

Comments
 (0)