Skip to content

Commit c44a383

Browse files
JoeCitizen, tex3d, damyanp
authored
Add GroupSharedLimit attribute support for Mesh, Amp and Node shaders (#8140)
- Adds support for the GroupSharedLimit feature for Mesh, Amplification and Node shaders.
- Adds tests for each of those shader types.
---------
Co-authored-by: Tex Riddell <[email protected]>
Co-authored-by: Damyan Pepper <[email protected]>
1 parent 0eed3f7 commit c44a383

17 files changed

Lines changed: 743 additions & 232 deletions

File tree

docs/DXIL.rst

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -3329,6 +3329,7 @@ SM.CSNOSIGNATURES Compute shaders mu
33293329
SM.DOMAINLOCATIONIDXOOB DomainLocation component index out of bounds for the domain.
33303330
SM.DSINPUTCONTROLPOINTCOUNTRANGE DS input control point count must be [0..%0]. %1 specified.
33313331
SM.DXILVERSION Target shader model requires specific Dxil Version
3332+
SM.EXPLICITTGSMSIZEONENTRY Total Thread Group Shared Memory used by entry must not exceed limit specified by entry attribute.
33323333
SM.GSINSTANCECOUNTRANGE GS instance count must be [1..%0]. %1 specified.
33333334
SM.GSOUTPUTVERTEXCOUNTRANGE GS output vertex count must be [0..%0]. %1 specified.
33343335
SM.GSTOTALOUTPUTVERTEXDATARANGE Declared output vertex count (%0) multiplied by the total number of declared scalar components of output data (%1) equals %2. This value cannot be greater than %3.
@@ -3351,8 +3352,7 @@ SM.INVALIDSAMPLERFEEDBACKTYPE Invalid sampler fe
33513352
SM.INVALIDTEXTUREKINDONUAV TextureCube[Array] resources are not supported with UAVs.
33523353
SM.ISOLINEOUTPUTPRIMITIVEMISMATCH Hull Shader declared with IsoLine Domain must specify output primitive point or line. Triangle_cw or triangle_ccw output are not compatible with the IsoLine Domain.
33533354
SM.ISSPECIALFLOAT 16 bit IsSpecialFloat overloads require Shader Model 6.9 or higher.
3354-
SM.MAXMSSMSIZE Total Thread Group Shared Memory storage is %0, exceeded %1.
3355-
SM.MAXTGSMSIZE Total Thread Group Shared Memory storage is %0, exceeded %1.
3355+
SM.MAXTGSMSIZEONENTRY Total Thread Group Shared Memory used by entry must not exceed maximum for shader model.
33563356
SM.MAXTHEADGROUP Declared Thread Group Count %0 (X*Y*Z) is beyond the valid maximum of %1.
33573357
SM.MESHPSIGROWCOUNT For shader '%0', primitive output signatures are taking up more than %1 rows.
33583358
SM.MESHSHADERINOUTSIZE For shader '%0', payload plus output size is greater than %1.

include/dxc/DXIL/DxilFunctionProps.h

Lines changed: 4 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -109,6 +109,8 @@ struct DxilWaveSize {
109109
};
110110

111111
struct DxilFunctionProps {
112+
static constexpr int kGroupSharedLimitUnset = -1;
113+
112114
DxilFunctionProps() {
113115
memset(&ShaderProps, 0, sizeof(ShaderProps));
114116
shaderKind = DXIL::ShaderKind::Invalid;
@@ -117,7 +119,7 @@ struct DxilFunctionProps {
117119
memset(&Node, 0, sizeof(Node));
118120
Node.LaunchType = DXIL::NodeLaunchType::Invalid;
119121
Node.LocalRootArgumentsTableIndex = -1;
120-
groupSharedLimitBytes = 0;
122+
groupSharedLimitBytes = kGroupSharedLimitUnset;
121123
}
122124
union {
123125
// Geometry shader.
@@ -175,7 +177,7 @@ struct DxilFunctionProps {
175177
// numThreads shared between multiple shader types and node shaders.
176178
unsigned numThreads[3];
177179

178-
unsigned groupSharedLimitBytes;
180+
int groupSharedLimitBytes;
179181

180182
struct NodeProps {
181183
DXIL::NodeLaunchType LaunchType = DXIL::NodeLaunchType::Invalid;

include/dxc/DXIL/DxilModule.h

Lines changed: 0 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -254,7 +254,6 @@ class DxilModule {
254254
void SetNumThreads(unsigned x, unsigned y, unsigned z);
255255
unsigned GetNumThreads(unsigned idx) const;
256256

257-
unsigned GetGroupSharedLimit() const;
258257
// The total amount of group shared memory (in bytes) used by the shader.
259258
unsigned GetTGSMSizeInBytes() const;
260259

lib/DXIL/DxilMetadataHelper.cpp

Lines changed: 32 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -1628,7 +1628,9 @@ MDTuple *DxilMDHelper::EmitDxilEntryProperties(uint64_t rawShaderFlag,
16281628
}
16291629

16301630
const hlsl::ShaderModel *SM = GetShaderModel();
1631-
if (SM->IsSMAtLeast(6, 10)) {
1631+
if (SM->IsSMAtLeast(6, 10) &&
1632+
props.groupSharedLimitBytes !=
1633+
DxilFunctionProps::kGroupSharedLimitUnset) {
16321634
MDVals.emplace_back(
16331635
Uint32ToConstMD(DxilMDHelper::kDxilGroupSharedLimitTag));
16341636
MDVals.emplace_back(Uint32ToConstMD(props.groupSharedLimitBytes));
@@ -1697,13 +1699,31 @@ MDTuple *DxilMDHelper::EmitDxilEntryProperties(uint64_t rawShaderFlag,
16971699
MS.maxPrimitiveCount, MS.outputTopology,
16981700
MS.payloadSizeInBytes);
16991701
MDVals.emplace_back(pMDTuple);
1702+
1703+
const hlsl::ShaderModel *SM = GetShaderModel();
1704+
if (SM->IsSMAtLeast(6, 10) &&
1705+
props.groupSharedLimitBytes !=
1706+
DxilFunctionProps::kGroupSharedLimitUnset) {
1707+
MDVals.emplace_back(
1708+
Uint32ToConstMD(DxilMDHelper::kDxilGroupSharedLimitTag));
1709+
MDVals.emplace_back(Uint32ToConstMD(props.groupSharedLimitBytes));
1710+
}
17001711
} break;
17011712
case DXIL::ShaderKind::Amplification: {
17021713
auto &AS = props.ShaderProps.AS;
17031714
MDVals.emplace_back(Uint32ToConstMD(DxilMDHelper::kDxilASStateTag));
17041715
MDTuple *pMDTuple =
17051716
EmitDxilASState(props.numThreads, AS.payloadSizeInBytes);
17061717
MDVals.emplace_back(pMDTuple);
1718+
1719+
const hlsl::ShaderModel *SM = GetShaderModel();
1720+
if (SM->IsSMAtLeast(6, 10) &&
1721+
props.groupSharedLimitBytes !=
1722+
DxilFunctionProps::kGroupSharedLimitUnset) {
1723+
MDVals.emplace_back(
1724+
Uint32ToConstMD(DxilMDHelper::kDxilGroupSharedLimitTag));
1725+
MDVals.emplace_back(Uint32ToConstMD(props.groupSharedLimitBytes));
1726+
}
17071727
} break;
17081728
case DXIL::ShaderKind::Node: {
17091729
// The Node specific properties have already been handled by
@@ -1716,6 +1736,15 @@ MDTuple *DxilMDHelper::EmitDxilEntryProperties(uint64_t rawShaderFlag,
17161736
NumThreadVals.emplace_back(Uint32ToConstMD(props.numThreads[1]));
17171737
NumThreadVals.emplace_back(Uint32ToConstMD(props.numThreads[2]));
17181738
MDVals.emplace_back(MDNode::get(m_Ctx, NumThreadVals));
1739+
1740+
const hlsl::ShaderModel *SM = GetShaderModel();
1741+
if (SM->IsSMAtLeast(6, 10) &&
1742+
props.groupSharedLimitBytes !=
1743+
DxilFunctionProps::kGroupSharedLimitUnset) {
1744+
MDVals.emplace_back(
1745+
Uint32ToConstMD(DxilMDHelper::kDxilGroupSharedLimitTag));
1746+
MDVals.emplace_back(Uint32ToConstMD(props.groupSharedLimitBytes));
1747+
}
17191748
} break;
17201749
default:
17211750
break;
@@ -1783,7 +1812,8 @@ void DxilMDHelper::LoadDxilEntryProperties(const MDOperand &MDO,
17831812
} break;
17841813

17851814
case DxilMDHelper::kDxilGroupSharedLimitTag: {
1786-
DXASSERT(props.IsCS(), "else invalid shader kind");
1815+
DXASSERT(props.IsCS() || props.IsMS() || props.IsAS() || props.IsNode(),
1816+
"else invalid shader kind");
17871817
props.groupSharedLimitBytes = ConstMDToUint32(MDO);
17881818
if (!m_pSM->IsSMAtLeast(6, 10))
17891819
m_bExtraMetadata = true;

lib/DXIL/DxilModule.cpp

Lines changed: 6 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -234,6 +234,12 @@ void DxilModule::SetEntryFunction(Function *pEntryFunc) {
234234
// Move entry props to new function in order to preserve them.
235235
std::unique_ptr<DxilEntryProps> Props =
236236
std::move(m_DxilEntryPropsMap.begin()->second);
237+
// For HS, make sure we add the patch constant function to the set of patch
238+
// constant functions.
239+
m_PatchConstantFunctions.clear();
240+
if (Props->props.IsHS() && Props->props.ShaderProps.HS.patchConstantFunc)
241+
m_PatchConstantFunctions.insert(
242+
Props->props.ShaderProps.HS.patchConstantFunc);
237243
m_DxilEntryPropsMap.clear();
238244
m_DxilEntryPropsMap[m_pEntryFunc] = std::move(Props);
239245
}
@@ -412,15 +418,6 @@ unsigned DxilModule::GetNumThreads(unsigned idx) const {
412418
return props.numThreads[idx];
413419
}
414420

415-
unsigned DxilModule::GetGroupSharedLimit() const {
416-
DXASSERT(m_DxilEntryPropsMap.size() == 1 &&
417-
(m_pSM->IsCS() || m_pSM->IsMS() || m_pSM->IsAS()),
418-
"only works for CS/MS/AS profiles");
419-
const DxilFunctionProps &props = m_DxilEntryPropsMap.begin()->second->props;
420-
DXASSERT_NOMSG(m_pSM->GetKind() == props.shaderKind);
421-
return props.groupSharedLimitBytes;
422-
}
423-
424421
unsigned DxilModule::GetTGSMSizeInBytes() const {
425422
const DataLayout &DL = m_pModule->getDataLayout();
426423
unsigned TGSMSize = 0;

lib/DxilValidation/DxilValidation.cpp

Lines changed: 136 additions & 50 deletions
Original file line number | Diff line number | Diff line change
@@ -3916,70 +3916,156 @@ static void ValidateGlobalVariables(ValidationContext &ValCtx) {
39163916
DxilModule &M = ValCtx.DxilMod;
39173917

39183918
const ShaderModel *pSM = ValCtx.DxilMod.GetShaderModel();
3919-
bool TGSMAllowed = pSM->IsCS() || pSM->IsAS() || pSM->IsMS() || pSM->IsLib();
3920-
3921-
unsigned TGSMSize = 0;
3922-
std::vector<StoreInst *> FixAddrTGSMList;
39233919
const DataLayout &DL = M.GetModule()->getDataLayout();
3920+
std::vector<StoreInst *> FixAddrTGSMList;
3921+
3922+
auto isTGSMEntry = [](DXIL::ShaderKind Kind) -> bool {
3923+
return Kind == DXIL::ShaderKind::Compute ||
3924+
Kind == DXIL::ShaderKind::Amplification ||
3925+
Kind == DXIL::ShaderKind::Mesh || Kind == DXIL::ShaderKind::Node;
3926+
};
3927+
3928+
auto getMaxTGSM = [](const DxilFunctionProps &Props) -> unsigned {
3929+
if (Props.groupSharedLimitBytes >= 0)
3930+
return static_cast<unsigned>(Props.groupSharedLimitBytes);
3931+
if (Props.IsCS() || Props.IsAS() || Props.IsNode())
3932+
return DXIL::kMaxTGSMSize;
3933+
else if (Props.IsMS())
3934+
return DXIL::kMaxMSSMSize;
3935+
return 0;
3936+
};
3937+
3938+
DenseMap<const Function *, uint32_t> TGSMInFunc;
3939+
// Initialize all function TGSM usage to zero
3940+
for (auto &function : M.GetModule()->getFunctionList())
3941+
TGSMInFunc[&function] = 0;
3942+
3943+
// Map TGSM overages per function, used for error reporting
3944+
// Tracks first user per GV that caused overage.
3945+
typedef MapVector<GlobalVariable *, Instruction *> FirstUserMap;
3946+
typedef DenseMap<const Function *, FirstUserMap> TGSMOverageMap;
3947+
TGSMOverageMap TGSMOverages;
3948+
3949+
auto ReportTGSMOverages = [&](Function *EntryFunc) {
3950+
unsigned Size = TGSMInFunc[EntryFunc];
3951+
if (!Size)
3952+
return; // No TGSM used.
3953+
3954+
// Several possibilities:
3955+
// - Entry point or library function with function properties
3956+
// - Patch constant function without function properties, TGSM not allowed
3957+
// - No-inline function without function properties, TGSM counted in entry
3958+
DXIL::ShaderKind Kind = DXIL::ShaderKind::Invalid;
3959+
bool IsPatchConstant = M.IsPatchConstantShader(EntryFunc);
3960+
if (M.HasDxilFunctionProps(EntryFunc))
3961+
Kind = M.GetDxilEntryProps(EntryFunc).props.shaderKind;
3962+
else if (!IsPatchConstant)
3963+
return; // no-inline function, accounted for in entry
3964+
3965+
auto Overages = TGSMOverages.find(EntryFunc);
3966+
if (Overages == TGSMOverages.end())
3967+
return;
3968+
3969+
unsigned MaxSize = 0;
3970+
ValidationRule Rule = ValidationRule::SmMaxTGSMSizeOnEntry;
3971+
3972+
// Props only exist if not a patch constant function.
3973+
if (!IsPatchConstant) {
3974+
DxilFunctionProps &Props = M.GetDxilFunctionProps(EntryFunc);
3975+
MaxSize = getMaxTGSM(Props);
3976+
Rule = Props.groupSharedLimitBytes !=
3977+
DxilFunctionProps::kGroupSharedLimitUnset
3978+
? ValidationRule::SmExplicitTGSMSizeOnEntry
3979+
: ValidationRule::SmMaxTGSMSizeOnEntry;
3980+
}
3981+
3982+
for (auto &GVAndUser : Overages->second) {
3983+
Instruction *UseInst = GVAndUser.second;
3984+
if (!isTGSMEntry(Kind))
3985+
ValCtx.EmitInstrFormatError(UseInst, ValidationRule::SmTGSMUnsupported,
3986+
{"from non-compute entry points"});
3987+
else
3988+
ValCtx.EmitInstrFormatError(UseInst, Rule,
3989+
{EntryFunc->getName(), std::to_string(Size),
3990+
std::to_string(MaxSize)});
3991+
}
3992+
};
3993+
3994+
struct WorkListEntry {
3995+
User *U;
3996+
// FirstUser tracks the first (inner-most) instruction user of the TGSM
3997+
// variable for this worklist entry.
3998+
Instruction *FirstUser;
3999+
};
4000+
4001+
// Collect total groupshared memory potentially used by every function
39244002
for (GlobalVariable &GV : M.GetModule()->globals()) {
39254003
ValidateGlobalVariable(GV, ValCtx);
39264004
if (GV.getType()->getAddressSpace() == DXIL::kTGSMAddrSpace) {
3927-
if (!TGSMAllowed)
3928-
ValCtx.EmitGlobalVariableFormatError(
3929-
&GV, ValidationRule::SmTGSMUnsupported,
3930-
{std::string("in Shader Model ") + M.GetShaderModel()->GetName()});
3931-
// Lib targets need to check the usage to know if it's allowed
3932-
if (pSM->IsLib()) {
3933-
for (User *U : GV.users()) {
3934-
if (Instruction *I = dyn_cast<Instruction>(U)) {
3935-
llvm::Function *F = I->getParent()->getParent();
4005+
SmallPtrSet<llvm::Function *, 8> completeFuncs;
4006+
SmallVector<WorkListEntry, 16> WorkList;
4007+
auto AddUsers = [&WorkList](User *U, Instruction *FirstUser) {
4008+
for (User *U : U->users()) {
4009+
if (!FirstUser && isa<Instruction>(U))
4010+
WorkList.push_back({U, cast<Instruction>(U)});
4011+
else
4012+
WorkList.push_back({U, FirstUser});
4013+
}
4014+
};
4015+
uint32_t GVSize = DL.getTypeAllocSize(GV.getType()->getElementType());
4016+
4017+
AddUsers(&GV, nullptr);
4018+
4019+
while (!WorkList.empty()) {
4020+
WorkListEntry Info = WorkList.pop_back_val();
4021+
// If const, keep going until we find something we can use
4022+
if (isa<Constant>(Info.U)) {
4023+
AddUsers(Info.U, Info.FirstUser);
4024+
continue;
4025+
}
4026+
4027+
if (Instruction *I = dyn_cast<Instruction>(Info.U)) {
4028+
llvm::Function *F = I->getParent()->getParent();
4029+
if (completeFuncs.insert(F).second) {
4030+
// If function is new, process it and its users
4031+
// Add users to the worklist
4032+
Instruction *FirstUser = Info.FirstUser ? Info.FirstUser : I;
4033+
AddUsers(F, FirstUser);
4034+
// Add groupshared size to function's total
4035+
unsigned &TotalSize = TGSMInFunc[F];
4036+
TotalSize += GVSize;
4037+
// If this is an entry function, check the TotalSize against the
4038+
// limits.
39364039
if (M.HasDxilEntryProps(F)) {
3937-
DxilFunctionProps &Props = M.GetDxilEntryProps(F).props;
3938-
if (!Props.IsCS() && !Props.IsAS() && !Props.IsMS() &&
3939-
!Props.IsNode()) {
3940-
ValCtx.EmitInstrFormatError(I,
3941-
ValidationRule::SmTGSMUnsupported,
3942-
{"from non-compute entry points"});
3943-
}
4040+
const DxilFunctionProps &Props = M.GetDxilEntryProps(F).props;
4041+
unsigned MaxSize = getMaxTGSM(Props);
4042+
if (TotalSize > MaxSize && TGSMOverages[F].count(&GV) == 0)
4043+
TGSMOverages[F][&GV] = FirstUser;
4044+
} else if (M.IsPatchConstantShader(F)) {
4045+
// Collect illegal usage for error reporting
4046+
if (TGSMOverages[F].count(&GV) == 0)
4047+
TGSMOverages[F][&GV] = FirstUser;
39444048
}
39454049
}
39464050
}
39474051
}
3948-
TGSMSize += DL.getTypeAllocSize(GV.getType()->getElementType());
39494052
CollectFixAddressAccess(&GV, FixAddrTGSMList);
39504053
}
39514054
}
39524055

3953-
ValidationRule Rule = ValidationRule::SmMaxTGSMSize;
3954-
unsigned MaxSize = DXIL::kMaxTGSMSize;
3955-
3956-
if (M.GetShaderModel()->IsMS()) {
3957-
Rule = ValidationRule::SmMaxMSSMSize;
3958-
MaxSize = DXIL::kMaxMSSMSize;
3959-
}
3960-
3961-
// Check if the entry function has attribute to override TGSM size.
3962-
if (M.HasDxilEntryProps(M.GetEntryFunction())) {
3963-
DxilEntryProps &EntryProps = M.GetDxilEntryProps(M.GetEntryFunction());
3964-
if (EntryProps.props.IsCS()) {
3965-
unsigned SpecifiedTGSMSize = EntryProps.props.groupSharedLimitBytes;
3966-
if (SpecifiedTGSMSize > 0) {
3967-
MaxSize = SpecifiedTGSMSize;
3968-
}
4056+
if (pSM->IsLib()) {
4057+
for (auto &F : M.GetModule()->functions()) {
4058+
if (F.isDeclaration() ||
4059+
!(M.HasDxilEntryProps(&F) || M.IsPatchConstantShader(&F)))
4060+
continue;
4061+
ReportTGSMOverages(&F);
39694062
}
3970-
}
3971-
3972-
if (TGSMSize > MaxSize) {
3973-
Module::global_iterator GI = M.GetModule()->global_end();
3974-
GlobalVariable *GV = &*GI;
3975-
do {
3976-
GI--;
3977-
GV = &*GI;
3978-
if (GV->getType()->getAddressSpace() == hlsl::DXIL::kTGSMAddrSpace)
3979-
break;
3980-
} while (GI != M.GetModule()->global_begin());
3981-
ValCtx.EmitGlobalVariableFormatError(
3982-
GV, Rule, {std::to_string(TGSMSize), std::to_string(MaxSize)});
4063+
} else {
4064+
Function *EntryFunc = M.GetEntryFunction();
4065+
if (EntryFunc)
4066+
ReportTGSMOverages(EntryFunc);
4067+
if (pSM->IsHS())
4068+
ReportTGSMOverages(M.GetPatchConstantFunction());
39834069
}
39844070

39854071
if (!FixAddrTGSMList.empty()) {

tools/clang/lib/CodeGen/CGHLSLMS.cpp

Lines changed: 2 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -1665,13 +1665,8 @@ void CGMSHLSLRuntime::AddHLSLFunctionInfo(Function *F, const FunctionDecl *FD) {
16651665
FD->getAttr<HLSLGroupSharedLimitAttr>()) {
16661666
funcProps->groupSharedLimitBytes = Attr->getLimit();
16671667
} else {
1668-
if (SM->IsMS()) { // Fallback to default limits
1669-
funcProps->groupSharedLimitBytes = DXIL::kMaxMSSMSize; // 28k For MS
1670-
} else if (SM->IsAS() || SM->IsCS()) {
1671-
funcProps->groupSharedLimitBytes = DXIL::kMaxTGSMSize; // 32k For AS/CS
1672-
} else {
1673-
funcProps->groupSharedLimitBytes = 0;
1674-
}
1668+
funcProps->groupSharedLimitBytes =
1669+
DxilFunctionProps::kGroupSharedLimitUnset; // not specified
16751670
}
16761671

16771672
// Hull shader.

tools/clang/test/CodeGenHLSL/mesh-val/oversizeSM.hlsl

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,6 @@
11
// RUN: %dxc -E main -T ms_6_5 %s | FileCheck %s
22

3-
// CHECK: Total Thread Group Shared Memory storage is 28676, exceeded 28672
3+
// CHECK: Total Thread Group Shared Memory used by 'main' is 28676, exceeding maximum: 28672.
44

55
#define MAX_VERT 32
66
#define MAX_PRIM 16

0 commit comments

Comments
 (0)