-
Notifications
You must be signed in to change notification settings - Fork 14.9k
[Clang][HIP][CUDA] Add __cluster_dims__
and __no_cluster__
attributes
#156686
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Conversation
This stack of pull requests is managed by Graphite. Learn more about stacking.
@llvm/pr-subscribers-clang-codegen @llvm/pr-subscribers-backend-amdgpu Author: Shilei Tian (shiltian). Changes: This PR adds basic frontend support for the __cluster_dims__ and __no_cluster__ attributes. Co-authored-by: Yaxun (Sam) Liu <yaxun.liu@amd.com>. Patch is 22.58 KiB, truncated to 20.00 KiB below; full version: https://github.com/llvm/llvm-project/pull/156686.diff — 12 Files Affected:
diff --git a/clang/include/clang/Basic/Attr.td b/clang/include/clang/Basic/Attr.td
index 29364c5903d31..efb019d43cbe4 100644
--- a/clang/include/clang/Basic/Attr.td
+++ b/clang/include/clang/Basic/Attr.td
@@ -979,7 +979,7 @@ def AnalyzerNoReturn : InheritableAttr {
}
def InferredNoReturn : InheritableAttr {
- let Spellings = [];
+ let Spellings = [];
let SemaHandler = 0;
let Subjects = SubjectList<[Function], ErrorDiag>;
let Documentation = [InternalOnly];
@@ -1557,6 +1557,23 @@ def HIPManaged : InheritableAttr {
let Documentation = [HIPManagedAttrDocs];
}
+def CUDAClusterDims : InheritableAttr {
+ let Spellings = [GNU<"cluster_dims">, Declspec<"__cluster_dims__">];
+ let Args = [ExprArgument<"X">, ExprArgument<"Y", 1>, ExprArgument<"Z", 1>];
+ let Subjects = SubjectList<[Function], ErrorDiag, "kernel functions">;
+ let LangOpts = [CUDA];
+ let Documentation = [Undocumented];
+}
+
+def CUDANoCluster : InheritableAttr {
+ let Spellings = [GNU<"no_cluster">, Declspec<"__no_cluster__">];
+ let Subjects = SubjectList<[Function], ErrorDiag, "kernel functions">;
+ let LangOpts = [CUDA];
+ let Documentation = [Undocumented];
+}
+
+def : MutualExclusions<[CUDAClusterDims, CUDANoCluster]>;
+
def CUDAInvalidTarget : InheritableAttr {
let Spellings = [];
let Subjects = SubjectList<[Function]>;
diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td
index 3146f20da1424..32b6944f2e038 100644
--- a/clang/include/clang/Basic/DiagnosticSemaKinds.td
+++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td
@@ -10698,7 +10698,7 @@ def warn_dangling_reference_captured_by_unknown : Warning<
// Diagnostics based on the Lifetime safety analysis.
def warn_lifetime_safety_loan_expires_permissive : Warning<
- "object whose reference is captured does not live long enough">,
+ "object whose reference is captured does not live long enough">,
InGroup<LifetimeSafetyPermissive>, DefaultIgnore;
def warn_lifetime_safety_loan_expires_strict : Warning<
"object whose reference is captured may not live long enough">,
@@ -13027,6 +13027,14 @@ def warn_cuda_maxclusterrank_sm_90 : Warning<
"maxclusterrank requires sm_90 or higher, CUDA arch provided: %0, ignoring "
"%1 attribute">, InGroup<IgnoredAttributes>;
+def err_cuda_cluster_attr_not_supported : Error<
+ "%select{__cluster_dims__|__no_cluster__}0 is not supported for this GPU architecture"
+>;
+
+def err_cuda_cluster_dims_too_large : Error<
+ "only a maximum of %0 thread blocks in a cluster is supported"
+>;
+
// VTable pointer authentication errors
def err_non_polymorphic_vtable_pointer_auth : Error<
"cannot set vtable pointer authentication on monomorphic type %0">;
diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h
index aa035a1555950..c9c77bd565260 100644
--- a/clang/include/clang/Sema/Sema.h
+++ b/clang/include/clang/Sema/Sema.h
@@ -5002,6 +5002,14 @@ class Sema final : public SemaBase {
void AddLaunchBoundsAttr(Decl *D, const AttributeCommonInfo &CI,
Expr *MaxThreads, Expr *MinBlocks, Expr *MaxBlocks);
+ /// Add a cluster_dims attribute to a particular declaration.
+ CUDAClusterDimsAttr *createClusterDimsAttr(const AttributeCommonInfo &CI,
+ Expr *X, Expr *Y, Expr *Z);
+ void addClusterDimsAttr(Decl *D, const AttributeCommonInfo &CI, Expr *X,
+ Expr *Y, Expr *Z);
+ /// Add a no_cluster attribute to a particular declaration.
+ void addNoClusterAttr(Decl *D, const AttributeCommonInfo &CI);
+
enum class RetainOwnershipKind { NS, CF, OS };
UuidAttr *mergeUuidAttr(Decl *D, const AttributeCommonInfo &CI,
diff --git a/clang/lib/CodeGen/Targets/AMDGPU.cpp b/clang/lib/CodeGen/Targets/AMDGPU.cpp
index 0fcbf7e458a34..48855ce485f91 100644
--- a/clang/lib/CodeGen/Targets/AMDGPU.cpp
+++ b/clang/lib/CodeGen/Targets/AMDGPU.cpp
@@ -342,6 +342,9 @@ static bool requiresAMDGPUProtectedVisibility(const Decl *D,
void AMDGPUTargetCodeGenInfo::setFunctionDeclAttributes(
const FunctionDecl *FD, llvm::Function *F, CodeGenModule &M) const {
+ llvm::StringMap<bool> TargetFetureMap;
+ M.getContext().getFunctionFeatureMap(TargetFetureMap, FD);
+
const auto *ReqdWGS =
M.getLangOpts().OpenCL ? FD->getAttr<ReqdWorkGroupSizeAttr>() : nullptr;
const bool IsOpenCLKernel =
@@ -402,6 +405,29 @@ void AMDGPUTargetCodeGenInfo::setFunctionDeclAttributes(
F->addFnAttr("amdgpu-max-num-workgroups", AttrVal.str());
}
+
+ if (auto *Attr = FD->getAttr<CUDAClusterDimsAttr>()) {
+ uint32_t X =
+ Attr->getX()->EvaluateKnownConstInt(M.getContext()).getExtValue();
+ uint32_t Y =
+ Attr->getY()
+ ? Attr->getY()->EvaluateKnownConstInt(M.getContext()).getExtValue()
+ : 1;
+ uint32_t Z =
+ Attr->getZ()
+ ? Attr->getZ()->EvaluateKnownConstInt(M.getContext()).getExtValue()
+ : 1;
+
+ llvm::SmallString<32> AttrVal;
+ llvm::raw_svector_ostream OS(AttrVal);
+ OS << X << ',' << Y << ',' << Z;
+ F->addFnAttr("amdgpu-cluster-dims", AttrVal.str());
+ }
+
+ // OpenCL doesn't support cluster feature.
+ if ((IsOpenCLKernel && TargetFetureMap.lookup("gfx1250-insts")) ||
+ FD->getAttr<CUDANoClusterAttr>())
+ F->addFnAttr("amdgpu-cluster-dims", "0,0,0");
}
void AMDGPUTargetCodeGenInfo::setTargetAttributes(
diff --git a/clang/lib/Headers/__clang_hip_runtime_wrapper.h b/clang/lib/Headers/__clang_hip_runtime_wrapper.h
index da1e39ac7270e..fb0ece96e1418 100644
--- a/clang/lib/Headers/__clang_hip_runtime_wrapper.h
+++ b/clang/lib/Headers/__clang_hip_runtime_wrapper.h
@@ -25,6 +25,8 @@
#define __constant__ __attribute__((constant))
#define __managed__ __attribute__((managed))
+#define __cluster_dims__(...) __attribute__((cluster_dims(__VA_ARGS__)))
+
#if !defined(__cplusplus) || __cplusplus < 201103L
#define nullptr NULL;
#endif
diff --git a/clang/lib/Sema/SemaDeclAttr.cpp b/clang/lib/Sema/SemaDeclAttr.cpp
index 3ded60cd8b073..ad2c28843f970 100644
--- a/clang/lib/Sema/SemaDeclAttr.cpp
+++ b/clang/lib/Sema/SemaDeclAttr.cpp
@@ -5654,6 +5654,130 @@ static void handleLaunchBoundsAttr(Sema &S, Decl *D, const ParsedAttr &AL) {
AL.getNumArgs() > 2 ? AL.getArgAsExpr(2) : nullptr);
}
+static std::pair<Expr *, int>
+makeClusterDimsArgExpr(Sema &S, Expr *E, const CUDAClusterDimsAttr &AL,
+ const unsigned Idx) {
+ if (S.DiagnoseUnexpandedParameterPack(E))
+ return {nullptr, 0};
+
+ // Accept template arguments for now as they depend on something else.
+ // We'll get to check them when they eventually get instantiated.
+ if (E->isValueDependent())
+ return {E, 1};
+
+ std::optional<llvm::APSInt> I = llvm::APSInt(64);
+ if (!(I = E->getIntegerConstantExpr(S.Context))) {
+ S.Diag(E->getExprLoc(), diag::err_attribute_argument_n_type)
+ << &AL << Idx << AANT_ArgumentIntegerConstant << E->getSourceRange();
+ return {nullptr, 0};
+ }
+ // Make sure we can fit it in 4 bits.
+ if (!I->isIntN(4)) {
+ S.Diag(E->getExprLoc(), diag::err_ice_too_large)
+ << toString(*I, 10, false) << 4 << /* Unsigned */ 1;
+ return {nullptr, 0};
+ }
+ if (*I < 0)
+ S.Diag(E->getExprLoc(), diag::warn_attribute_argument_n_negative)
+ << &AL << Idx << E->getSourceRange();
+
+ // We may need to perform implicit conversion of the argument.
+ InitializedEntity Entity = InitializedEntity::InitializeParameter(
+ S.Context, S.Context.getConstType(S.Context.IntTy), /*consume*/ false);
+ ExprResult ValArg = S.PerformCopyInitialization(Entity, SourceLocation(), E);
+ assert(!ValArg.isInvalid() &&
+ "Unexpected PerformCopyInitialization() failure.");
+
+ return {ValArg.getAs<Expr>(), I->getZExtValue()};
+}
+
+CUDAClusterDimsAttr *Sema::createClusterDimsAttr(const AttributeCommonInfo &CI,
+ Expr *X, Expr *Y, Expr *Z) {
+ CUDAClusterDimsAttr TmpAttr(Context, CI, X, Y, Z);
+
+ int ValX = 1;
+ int ValY = 1;
+ int ValZ = 1;
+
+ std::tie(X, ValX) = makeClusterDimsArgExpr(*this, X, TmpAttr, /*Idx=*/0);
+ if (!X)
+ return nullptr;
+
+ if (Y) {
+ std::tie(Y, ValY) = makeClusterDimsArgExpr(*this, Y, TmpAttr, /*Idx=*/1);
+ if (!Y)
+ return nullptr;
+ }
+
+ if (Z) {
+ std::tie(Z, ValZ) = makeClusterDimsArgExpr(*this, Z, TmpAttr, /*Idx=*/2);
+ if (!Z)
+ return nullptr;
+ }
+
+ int FlatDim = ValX * ValY * ValZ;
+ auto TT = (!Context.getLangOpts().CUDAIsDevice && Context.getAuxTargetInfo())
+ ? Context.getAuxTargetInfo()->getTriple()
+ : Context.getTargetInfo().getTriple();
+ int MaxDim = 1;
+ if (TT.isNVPTX())
+ MaxDim = 8;
+ else if (TT.isAMDGPU())
+ MaxDim = 16;
+ else
+ return nullptr;
+
+ // A maximum of 8 thread blocks in a cluster is supported as a portable
+ // cluster size in CUDA. The number is 16 for AMDGPU.
+ if (FlatDim > MaxDim) {
+ Diag(CI.getLoc(), diag::err_cuda_cluster_dims_too_large) << MaxDim;
+ return nullptr;
+ }
+
+ return ::new (Context) CUDAClusterDimsAttr(Context, CI, X, Y, Z);
+}
+
+void Sema::addClusterDimsAttr(Decl *D, const AttributeCommonInfo &CI, Expr *X,
+ Expr *Y, Expr *Z) {
+ if (auto *Attr = createClusterDimsAttr(CI, X, Y, Z))
+ D->addAttr(Attr);
+}
+
+void Sema::addNoClusterAttr(Decl *D, const AttributeCommonInfo &CI) {
+ if (CUDANoClusterAttr *Attr = ::new (Context) CUDANoClusterAttr(Context, CI))
+ D->addAttr(Attr);
+}
+
+static void handleClusterDimsAttr(Sema &S, Decl *D, const ParsedAttr &AL) {
+ auto &TTI = S.Context.getTargetInfo();
+ auto Arch = StringToOffloadArch(TTI.getTargetOpts().CPU);
+ if ((TTI.getTriple().isNVPTX() && Arch < clang::OffloadArch::SM_90) ||
+ (TTI.getTriple().isAMDGPU() && Arch < clang::OffloadArch::GFX1250)) {
+ S.Diag(AL.getLoc(), diag::err_cuda_cluster_attr_not_supported) << 0;
+ return;
+ }
+
+ if (!AL.checkAtLeastNumArgs(S, /*Num=*/1) ||
+ !AL.checkAtMostNumArgs(S, /*Num=*/3))
+ return;
+
+ S.addClusterDimsAttr(D, AL, AL.getArgAsExpr(0),
+ AL.getNumArgs() > 1 ? AL.getArgAsExpr(1) : nullptr,
+ AL.getNumArgs() > 2 ? AL.getArgAsExpr(2) : nullptr);
+}
+
+static void handleNoClusterAttr(Sema &S, Decl *D, const ParsedAttr &AL) {
+ auto &TTI = S.Context.getTargetInfo();
+ auto Arch = StringToOffloadArch(TTI.getTargetOpts().CPU);
+ if ((TTI.getTriple().isNVPTX() && Arch < clang::OffloadArch::SM_90) ||
+ (TTI.getTriple().isAMDGPU() && Arch < clang::OffloadArch::GFX1250)) {
+ S.Diag(AL.getLoc(), diag::err_cuda_cluster_attr_not_supported) << 1;
+ return;
+ }
+
+ S.addNoClusterAttr(D, AL);
+}
+
static void handleArgumentWithTypeTagAttr(Sema &S, Decl *D,
const ParsedAttr &AL) {
if (!AL.isArgIdent(0)) {
@@ -7105,6 +7229,12 @@ ProcessDeclAttribute(Sema &S, Scope *scope, Decl *D, const ParsedAttr &AL,
case ParsedAttr::AT_CUDALaunchBounds:
handleLaunchBoundsAttr(S, D, AL);
break;
+ case ParsedAttr::AT_CUDAClusterDims:
+ handleClusterDimsAttr(S, D, AL);
+ break;
+ case ParsedAttr::AT_CUDANoCluster:
+ handleNoClusterAttr(S, D, AL);
+ break;
case ParsedAttr::AT_Restrict:
handleRestrictAttr(S, D, AL);
break;
diff --git a/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp b/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp
index ee1b520fa46e9..aab93a93ba95b 100644
--- a/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp
+++ b/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp
@@ -681,6 +681,38 @@ static void instantiateDependentAMDGPUMaxNumWorkGroupsAttr(
S.AMDGPU().addAMDGPUMaxNumWorkGroupsAttr(New, Attr, XExpr, YExpr, ZExpr);
}
+static void instantiateDependentCUDAClusterDimsAttr(
+ Sema &S, const MultiLevelTemplateArgumentList &TemplateArgs,
+ const CUDAClusterDimsAttr &Attr, Decl *New) {
+ EnterExpressionEvaluationContext Unevaluated(
+ S, Sema::ExpressionEvaluationContext::ConstantEvaluated);
+
+ Expr *XExpr = nullptr;
+ Expr *YExpr = nullptr;
+ Expr *ZExpr = nullptr;
+
+ if (Attr.getX()) {
+ ExprResult ResultX = S.SubstExpr(Attr.getX(), TemplateArgs);
+ if (ResultX.isUsable())
+ XExpr = ResultX.getAs<Expr>();
+ }
+
+ if (Attr.getY()) {
+ ExprResult ResultY = S.SubstExpr(Attr.getY(), TemplateArgs);
+ if (ResultY.isUsable())
+ YExpr = ResultY.getAs<Expr>();
+ }
+
+ if (Attr.getZ()) {
+ ExprResult ResultZ = S.SubstExpr(Attr.getZ(), TemplateArgs);
+ if (ResultZ.isUsable())
+ ZExpr = ResultZ.getAs<Expr>();
+ }
+
+ if (XExpr)
+ S.addClusterDimsAttr(New, Attr, XExpr, YExpr, ZExpr);
+}
+
// This doesn't take any template parameters, but we have a custom action that
// needs to happen when the kernel itself is instantiated. We need to run the
// ItaniumMangler to mark the names required to name this kernel.
@@ -883,6 +915,11 @@ void Sema::InstantiateAttrs(const MultiLevelTemplateArgumentList &TemplateArgs,
*this, TemplateArgs, *AMDGPUMaxNumWorkGroups, New);
}
+ if (const auto *CUDAClusterDims = dyn_cast<CUDAClusterDimsAttr>(TmplAttr)) {
+ instantiateDependentCUDAClusterDimsAttr(*this, TemplateArgs,
+ *CUDAClusterDims, New);
+ }
+
if (const auto *ParamAttr = dyn_cast<HLSLParamModifierAttr>(TmplAttr)) {
instantiateDependentHLSLParamModifierAttr(*this, TemplateArgs, ParamAttr,
New);
diff --git a/clang/test/CodeGenCUDA/Inputs/cuda.h b/clang/test/CodeGenCUDA/Inputs/cuda.h
index dc85eae0c5178..e7ad784335027 100644
--- a/clang/test/CodeGenCUDA/Inputs/cuda.h
+++ b/clang/test/CodeGenCUDA/Inputs/cuda.h
@@ -13,6 +13,8 @@
#endif
#define __launch_bounds__(...) __attribute__((launch_bounds(__VA_ARGS__)))
#define __grid_constant__ __attribute__((grid_constant))
+#define __cluster_dims__(...) __attribute__((cluster_dims(__VA_ARGS__)))
+#define __no_cluster__ __attribute__((no_cluster))
#else
#define __constant__
#define __device__
@@ -22,6 +24,8 @@
#define __managed__
#define __launch_bounds__(...)
#define __grid_constant__
+#define __cluster_dims__(...)
+#define __no_cluster__
#endif
struct dim3 {
diff --git a/clang/test/CodeGenCUDA/cluster_dims.cu b/clang/test/CodeGenCUDA/cluster_dims.cu
new file mode 100644
index 0000000000000..00635e3572a7f
--- /dev/null
+++ b/clang/test/CodeGenCUDA/cluster_dims.cu
@@ -0,0 +1,38 @@
+// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -target-cpu gfx1250 -fcuda-is-device -emit-llvm -x hip -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple x86_64-pc-linux-gnu -aux-triple amdgcn-amd-amdhsa -emit-llvm -x hip -o - %s | FileCheck --check-prefix=HOST %s
+
+#include "Inputs/cuda.h"
+
+const int constint = 4;
+
+// HOST-NOT: "amdgpu-cluster-dims"
+
+// CHECK: "amdgpu-cluster-dims"="2,2,2"
+__global__ void __cluster_dims__(2, 2, 2) test_literal_3d() {}
+
+// CHECK: "amdgpu-cluster-dims"="2,2,1"
+__global__ void __cluster_dims__(2, 2) test_literal_2d() {}
+
+// CHECK: "amdgpu-cluster-dims"="4,1,1"
+__global__ void __cluster_dims__(4) test_literal_1d() {}
+
+// CHECK: "amdgpu-cluster-dims"="4,2,1"
+__global__ void __cluster_dims__(constint, constint / 2, 1) test_constant() {}
+
+// CHECK: "amdgpu-cluster-dims"="0,0,0"
+__global__ void __no_cluster__ test_no_cluster() {}
+
+// CHECK: "amdgpu-cluster-dims"="7,1,1"
+template<unsigned a>
+__global__ void __cluster_dims__(a) test_template_1d() {}
+template __global__ void test_template_1d<7>();
+
+// CHECK: "amdgpu-cluster-dims"="2,6,1"
+template<unsigned a, unsigned b>
+__global__ void __cluster_dims__(a, b) test_template_2d() {}
+template __global__ void test_template_2d<2, 6>();
+
+// CHECK: "amdgpu-cluster-dims"="1,2,3"
+template<unsigned a, unsigned b, unsigned c>
+__global__ void __cluster_dims__(a, b, c) test_template_3d() {}
+template __global__ void test_template_3d<1, 2, 3>();
diff --git a/clang/test/Misc/pragma-attribute-supported-attributes-list.test b/clang/test/Misc/pragma-attribute-supported-attributes-list.test
index 37ff33e5a1523..c8c913448d968 100644
--- a/clang/test/Misc/pragma-attribute-supported-attributes-list.test
+++ b/clang/test/Misc/pragma-attribute-supported-attributes-list.test
@@ -35,6 +35,7 @@
// CHECK-NEXT: CFUnknownTransfer (SubjectMatchRule_function)
// CHECK-NEXT: CPUDispatch (SubjectMatchRule_function)
// CHECK-NEXT: CPUSpecific (SubjectMatchRule_function)
+// CHECK-NEXT: CUDAClusterDims (SubjectMatchRule_function)
// CHECK-NEXT: CUDAConstant (SubjectMatchRule_variable)
// CHECK-NEXT: CUDADevice (SubjectMatchRule_function, SubjectMatchRule_variable)
// CHECK-NEXT: CUDADeviceBuiltinSurfaceType (SubjectMatchRule_record)
@@ -43,6 +44,7 @@
// CHECK-NEXT: CUDAGridConstant (SubjectMatchRule_variable_is_parameter)
// CHECK-NEXT: CUDAHost (SubjectMatchRule_function)
// CHECK-NEXT: CUDALaunchBounds (SubjectMatchRule_objc_method, SubjectMatchRule_hasType_functionType)
+// CHECK-NEXT: CUDANoCluster (SubjectMatchRule_function)
// CHECK-NEXT: CUDAShared (SubjectMatchRule_variable)
// CHECK-NEXT: CXX11NoReturn (SubjectMatchRule_function)
// CHECK-NEXT: CallableWhen (SubjectMatchRule_function_is_member)
diff --git a/clang/test/SemaCUDA/Inputs/cuda.h b/clang/test/SemaCUDA/Inputs/cuda.h
index 10db947d8246c..2bf45e03d91c7 100644
--- a/clang/test/SemaCUDA/Inputs/cuda.h
+++ b/clang/test/SemaCUDA/Inputs/cuda.h
@@ -13,6 +13,8 @@
#define __managed__ __attribute__((managed))
#define __grid_constant__ __attribute__((grid_constant))
#define __launch_bounds__(...) __attribute__((launch_bounds(__VA_ARGS__)))
+#define __cluster_dims__(...) __attribute__((cluster_dims(__VA_ARGS__)))
+#define __no_cluster__ __attribute__((no_cluster))
struct dim3 {
unsigned x, y, z;
diff --git a/clang/test/SemaCUDA/cluster_dims.cu b/clang/test/SemaCUDA/cluster_dims.cu
new file mode 100644
index 0000000000000..3cd0e0197c29b
--- /dev/null
+++ b/clang/test/SemaCUDA/cluster_dims.cu
@@ -0,0 +1,64 @@
+// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -fcuda-is-device -ast-print -x hip -verify=NS,all %s
+// RUN: %clang_cc1 -triple nvptx-nvidia-cuda -fcuda-is-device -ast-print -x hip -verify=NS,all %s
+// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -target-cpu gfx1250 -fcuda-is-device -ast-print -x hip -verify=amd,common,all %s | FileCheck -check-prefixes=CHECK %s
+// RUN: %clang_cc1 -triple nvptx-nvidia-cuda -target-cpu sm_90 -fcuda-is-device -ast-print -x hip -verify=cuda,common,all %s | FileCheck -check-prefixes=CHECK %s
+// RUN: %clang_cc1 -triple x86_64-pc-linux-gnu -aux-triple amdgcn-amd-amdhsa -ast-print -x hip -verify=amd,common,all %s | FileCheck -check-prefixes=CHECK %s
+
+#include "Inputs/cuda.h"
+
+const int constint = 4;
+
+// CHECK: __attribute__((global)) __attribute__((cluster_dims(2, 2, 2))) void test_literal_3d()
+__global__ void __cluster_dims__(2, 2, 2) test_literal_3d() {} //NS-error {{__cluster_dims__ is not supported for this GPU architecture}}
+
+// CHECK: __attribute__((global)) __attribute__((cluster_dims(2, 2))) void test_literal_2d()
+__global__ void __cluster_dims__(2, 2) test_literal_2d() {} //NS-error {{__cluster_dims__ is not supported for this GPU architecture}}
+
+// CHECK: __attribute__((global)) __attribute__((cluster_dims(4))) void test_literal_1d()
+__global__ void __cluster_dims__(4) test_literal_1d() {} //NS-error {{__cluster_dims__ is not supported for this GPU architecture}}
+
+// CHECK: __attribute__((global)) __attribute__((cluster_dims(constint, constint / 4, 1))) void test_constant()
+__global__ void __cluster_dims__(constint, constint / 4, 1) test_constant() {} //NS-error {{__cluster_dims__ is not supported for this GPU architecture}}
+
+// CHECK: template <int x, int y, int z> void test_template() __attribute__((cluster_dims(x, y, z)))
+template <int x, int y, int z> void test_template(void) __cluster_dims__(x, y, z){} //NS-error {{__cluster_dims__ is not supported for this GPU architecture}}
+
+// CHECK: template <int x, int y, int z> void test_template_expr() __attribute__((cluster_dims(x + constint, y, z)))
+template <int x, int y, int z> void test_template_expr(void) __cluster_dims__(x + constint, y, z) {} //NS-error {{__cluster_dims__ is not supported for this GPU architecture}}
+
+//NS-error@+1 {{__cluster_dims__ is not supported for this GPU architecture}}
+__global__ void __cluster_dims__(32, 2, 4) test...
[truncated]
|
@llvm/pr-subscribers-clang Author: Shilei Tian (shiltian). Changes: This PR adds basic frontend support for the __cluster_dims__ and __no_cluster__ attributes. Co-authored-by: Yaxun (Sam) Liu <yaxun.liu@amd.com>. Patch is 22.58 KiB, truncated to 20.00 KiB below; full version: https://github.com/llvm/llvm-project/pull/156686.diff — 12 Files Affected:
diff --git a/clang/include/clang/Basic/Attr.td b/clang/include/clang/Basic/Attr.td
index 29364c5903d31..efb019d43cbe4 100644
--- a/clang/include/clang/Basic/Attr.td
+++ b/clang/include/clang/Basic/Attr.td
@@ -979,7 +979,7 @@ def AnalyzerNoReturn : InheritableAttr {
}
def InferredNoReturn : InheritableAttr {
- let Spellings = [];
+ let Spellings = [];
let SemaHandler = 0;
let Subjects = SubjectList<[Function], ErrorDiag>;
let Documentation = [InternalOnly];
@@ -1557,6 +1557,23 @@ def HIPManaged : InheritableAttr {
let Documentation = [HIPManagedAttrDocs];
}
+def CUDAClusterDims : InheritableAttr {
+ let Spellings = [GNU<"cluster_dims">, Declspec<"__cluster_dims__">];
+ let Args = [ExprArgument<"X">, ExprArgument<"Y", 1>, ExprArgument<"Z", 1>];
+ let Subjects = SubjectList<[Function], ErrorDiag, "kernel functions">;
+ let LangOpts = [CUDA];
+ let Documentation = [Undocumented];
+}
+
+def CUDANoCluster : InheritableAttr {
+ let Spellings = [GNU<"no_cluster">, Declspec<"__no_cluster__">];
+ let Subjects = SubjectList<[Function], ErrorDiag, "kernel functions">;
+ let LangOpts = [CUDA];
+ let Documentation = [Undocumented];
+}
+
+def : MutualExclusions<[CUDAClusterDims, CUDANoCluster]>;
+
def CUDAInvalidTarget : InheritableAttr {
let Spellings = [];
let Subjects = SubjectList<[Function]>;
diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td
index 3146f20da1424..32b6944f2e038 100644
--- a/clang/include/clang/Basic/DiagnosticSemaKinds.td
+++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td
@@ -10698,7 +10698,7 @@ def warn_dangling_reference_captured_by_unknown : Warning<
// Diagnostics based on the Lifetime safety analysis.
def warn_lifetime_safety_loan_expires_permissive : Warning<
- "object whose reference is captured does not live long enough">,
+ "object whose reference is captured does not live long enough">,
InGroup<LifetimeSafetyPermissive>, DefaultIgnore;
def warn_lifetime_safety_loan_expires_strict : Warning<
"object whose reference is captured may not live long enough">,
@@ -13027,6 +13027,14 @@ def warn_cuda_maxclusterrank_sm_90 : Warning<
"maxclusterrank requires sm_90 or higher, CUDA arch provided: %0, ignoring "
"%1 attribute">, InGroup<IgnoredAttributes>;
+def err_cuda_cluster_attr_not_supported : Error<
+ "%select{__cluster_dims__|__no_cluster__}0 is not supported for this GPU architecture"
+>;
+
+def err_cuda_cluster_dims_too_large : Error<
+ "only a maximum of %0 thread blocks in a cluster is supported"
+>;
+
// VTable pointer authentication errors
def err_non_polymorphic_vtable_pointer_auth : Error<
"cannot set vtable pointer authentication on monomorphic type %0">;
diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h
index aa035a1555950..c9c77bd565260 100644
--- a/clang/include/clang/Sema/Sema.h
+++ b/clang/include/clang/Sema/Sema.h
@@ -5002,6 +5002,14 @@ class Sema final : public SemaBase {
void AddLaunchBoundsAttr(Decl *D, const AttributeCommonInfo &CI,
Expr *MaxThreads, Expr *MinBlocks, Expr *MaxBlocks);
+ /// Add a cluster_dims attribute to a particular declaration.
+ CUDAClusterDimsAttr *createClusterDimsAttr(const AttributeCommonInfo &CI,
+ Expr *X, Expr *Y, Expr *Z);
+ void addClusterDimsAttr(Decl *D, const AttributeCommonInfo &CI, Expr *X,
+ Expr *Y, Expr *Z);
+ /// Add a no_cluster attribute to a particular declaration.
+ void addNoClusterAttr(Decl *D, const AttributeCommonInfo &CI);
+
enum class RetainOwnershipKind { NS, CF, OS };
UuidAttr *mergeUuidAttr(Decl *D, const AttributeCommonInfo &CI,
diff --git a/clang/lib/CodeGen/Targets/AMDGPU.cpp b/clang/lib/CodeGen/Targets/AMDGPU.cpp
index 0fcbf7e458a34..48855ce485f91 100644
--- a/clang/lib/CodeGen/Targets/AMDGPU.cpp
+++ b/clang/lib/CodeGen/Targets/AMDGPU.cpp
@@ -342,6 +342,9 @@ static bool requiresAMDGPUProtectedVisibility(const Decl *D,
void AMDGPUTargetCodeGenInfo::setFunctionDeclAttributes(
const FunctionDecl *FD, llvm::Function *F, CodeGenModule &M) const {
+ llvm::StringMap<bool> TargetFetureMap;
+ M.getContext().getFunctionFeatureMap(TargetFetureMap, FD);
+
const auto *ReqdWGS =
M.getLangOpts().OpenCL ? FD->getAttr<ReqdWorkGroupSizeAttr>() : nullptr;
const bool IsOpenCLKernel =
@@ -402,6 +405,29 @@ void AMDGPUTargetCodeGenInfo::setFunctionDeclAttributes(
F->addFnAttr("amdgpu-max-num-workgroups", AttrVal.str());
}
+
+ if (auto *Attr = FD->getAttr<CUDAClusterDimsAttr>()) {
+ uint32_t X =
+ Attr->getX()->EvaluateKnownConstInt(M.getContext()).getExtValue();
+ uint32_t Y =
+ Attr->getY()
+ ? Attr->getY()->EvaluateKnownConstInt(M.getContext()).getExtValue()
+ : 1;
+ uint32_t Z =
+ Attr->getZ()
+ ? Attr->getZ()->EvaluateKnownConstInt(M.getContext()).getExtValue()
+ : 1;
+
+ llvm::SmallString<32> AttrVal;
+ llvm::raw_svector_ostream OS(AttrVal);
+ OS << X << ',' << Y << ',' << Z;
+ F->addFnAttr("amdgpu-cluster-dims", AttrVal.str());
+ }
+
+ // OpenCL doesn't support cluster feature.
+ if ((IsOpenCLKernel && TargetFetureMap.lookup("gfx1250-insts")) ||
+ FD->getAttr<CUDANoClusterAttr>())
+ F->addFnAttr("amdgpu-cluster-dims", "0,0,0");
}
void AMDGPUTargetCodeGenInfo::setTargetAttributes(
diff --git a/clang/lib/Headers/__clang_hip_runtime_wrapper.h b/clang/lib/Headers/__clang_hip_runtime_wrapper.h
index da1e39ac7270e..fb0ece96e1418 100644
--- a/clang/lib/Headers/__clang_hip_runtime_wrapper.h
+++ b/clang/lib/Headers/__clang_hip_runtime_wrapper.h
@@ -25,6 +25,8 @@
#define __constant__ __attribute__((constant))
#define __managed__ __attribute__((managed))
+#define __cluster_dims__(...) __attribute__((cluster_dims(__VA_ARGS__)))
+
#if !defined(__cplusplus) || __cplusplus < 201103L
#define nullptr NULL;
#endif
diff --git a/clang/lib/Sema/SemaDeclAttr.cpp b/clang/lib/Sema/SemaDeclAttr.cpp
index 3ded60cd8b073..ad2c28843f970 100644
--- a/clang/lib/Sema/SemaDeclAttr.cpp
+++ b/clang/lib/Sema/SemaDeclAttr.cpp
@@ -5654,6 +5654,130 @@ static void handleLaunchBoundsAttr(Sema &S, Decl *D, const ParsedAttr &AL) {
AL.getNumArgs() > 2 ? AL.getArgAsExpr(2) : nullptr);
}
+static std::pair<Expr *, int>
+makeClusterDimsArgExpr(Sema &S, Expr *E, const CUDAClusterDimsAttr &AL,
+ const unsigned Idx) {
+ if (S.DiagnoseUnexpandedParameterPack(E))
+ return {nullptr, 0};
+
+ // Accept template arguments for now as they depend on something else.
+ // We'll get to check them when they eventually get instantiated.
+ if (E->isValueDependent())
+ return {E, 1};
+
+ std::optional<llvm::APSInt> I = llvm::APSInt(64);
+ if (!(I = E->getIntegerConstantExpr(S.Context))) {
+ S.Diag(E->getExprLoc(), diag::err_attribute_argument_n_type)
+ << &AL << Idx << AANT_ArgumentIntegerConstant << E->getSourceRange();
+ return {nullptr, 0};
+ }
+ // Make sure we can fit it in 4 bits.
+ if (!I->isIntN(4)) {
+ S.Diag(E->getExprLoc(), diag::err_ice_too_large)
+ << toString(*I, 10, false) << 4 << /* Unsigned */ 1;
+ return {nullptr, 0};
+ }
+ if (*I < 0)
+ S.Diag(E->getExprLoc(), diag::warn_attribute_argument_n_negative)
+ << &AL << Idx << E->getSourceRange();
+
+ // We may need to perform implicit conversion of the argument.
+ InitializedEntity Entity = InitializedEntity::InitializeParameter(
+ S.Context, S.Context.getConstType(S.Context.IntTy), /*consume*/ false);
+ ExprResult ValArg = S.PerformCopyInitialization(Entity, SourceLocation(), E);
+ assert(!ValArg.isInvalid() &&
+ "Unexpected PerformCopyInitialization() failure.");
+
+ return {ValArg.getAs<Expr>(), I->getZExtValue()};
+}
+
+CUDAClusterDimsAttr *Sema::createClusterDimsAttr(const AttributeCommonInfo &CI,
+ Expr *X, Expr *Y, Expr *Z) {
+ CUDAClusterDimsAttr TmpAttr(Context, CI, X, Y, Z);
+
+ int ValX = 1;
+ int ValY = 1;
+ int ValZ = 1;
+
+ std::tie(X, ValX) = makeClusterDimsArgExpr(*this, X, TmpAttr, /*Idx=*/0);
+ if (!X)
+ return nullptr;
+
+ if (Y) {
+ std::tie(Y, ValY) = makeClusterDimsArgExpr(*this, Y, TmpAttr, /*Idx=*/1);
+ if (!Y)
+ return nullptr;
+ }
+
+ if (Z) {
+ std::tie(Z, ValZ) = makeClusterDimsArgExpr(*this, Z, TmpAttr, /*Idx=*/2);
+ if (!Z)
+ return nullptr;
+ }
+
+ int FlatDim = ValX * ValY * ValZ;
+ auto TT = (!Context.getLangOpts().CUDAIsDevice && Context.getAuxTargetInfo())
+ ? Context.getAuxTargetInfo()->getTriple()
+ : Context.getTargetInfo().getTriple();
+ int MaxDim = 1;
+ if (TT.isNVPTX())
+ MaxDim = 8;
+ else if (TT.isAMDGPU())
+ MaxDim = 16;
+ else
+ return nullptr;
+
+ // A maximum of 8 thread blocks in a cluster is supported as a portable
+ // cluster size in CUDA. The number is 16 for AMDGPU.
+ if (FlatDim > MaxDim) {
+ Diag(CI.getLoc(), diag::err_cuda_cluster_dims_too_large) << MaxDim;
+ return nullptr;
+ }
+
+ return ::new (Context) CUDAClusterDimsAttr(Context, CI, X, Y, Z);
+}
+
+void Sema::addClusterDimsAttr(Decl *D, const AttributeCommonInfo &CI, Expr *X,
+ Expr *Y, Expr *Z) {
+ if (auto *Attr = createClusterDimsAttr(CI, X, Y, Z))
+ D->addAttr(Attr);
+}
+
+void Sema::addNoClusterAttr(Decl *D, const AttributeCommonInfo &CI) {
+ if (CUDANoClusterAttr *Attr = ::new (Context) CUDANoClusterAttr(Context, CI))
+ D->addAttr(Attr);
+}
+
+static void handleClusterDimsAttr(Sema &S, Decl *D, const ParsedAttr &AL) {
+ auto &TTI = S.Context.getTargetInfo();
+ auto Arch = StringToOffloadArch(TTI.getTargetOpts().CPU);
+ if ((TTI.getTriple().isNVPTX() && Arch < clang::OffloadArch::SM_90) ||
+ (TTI.getTriple().isAMDGPU() && Arch < clang::OffloadArch::GFX1250)) {
+ S.Diag(AL.getLoc(), diag::err_cuda_cluster_attr_not_supported) << 0;
+ return;
+ }
+
+ if (!AL.checkAtLeastNumArgs(S, /*Num=*/1) ||
+ !AL.checkAtMostNumArgs(S, /*Num=*/3))
+ return;
+
+ S.addClusterDimsAttr(D, AL, AL.getArgAsExpr(0),
+ AL.getNumArgs() > 1 ? AL.getArgAsExpr(1) : nullptr,
+ AL.getNumArgs() > 2 ? AL.getArgAsExpr(2) : nullptr);
+}
+
+static void handleNoClusterAttr(Sema &S, Decl *D, const ParsedAttr &AL) {
+ auto &TTI = S.Context.getTargetInfo();
+ auto Arch = StringToOffloadArch(TTI.getTargetOpts().CPU);
+ if ((TTI.getTriple().isNVPTX() && Arch < clang::OffloadArch::SM_90) ||
+ (TTI.getTriple().isAMDGPU() && Arch < clang::OffloadArch::GFX1250)) {
+ S.Diag(AL.getLoc(), diag::err_cuda_cluster_attr_not_supported) << 1;
+ return;
+ }
+
+ S.addNoClusterAttr(D, AL);
+}
+
static void handleArgumentWithTypeTagAttr(Sema &S, Decl *D,
const ParsedAttr &AL) {
if (!AL.isArgIdent(0)) {
@@ -7105,6 +7229,12 @@ ProcessDeclAttribute(Sema &S, Scope *scope, Decl *D, const ParsedAttr &AL,
case ParsedAttr::AT_CUDALaunchBounds:
handleLaunchBoundsAttr(S, D, AL);
break;
+ case ParsedAttr::AT_CUDAClusterDims:
+ handleClusterDimsAttr(S, D, AL);
+ break;
+ case ParsedAttr::AT_CUDANoCluster:
+ handleNoClusterAttr(S, D, AL);
+ break;
case ParsedAttr::AT_Restrict:
handleRestrictAttr(S, D, AL);
break;
diff --git a/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp b/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp
index ee1b520fa46e9..aab93a93ba95b 100644
--- a/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp
+++ b/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp
@@ -681,6 +681,38 @@ static void instantiateDependentAMDGPUMaxNumWorkGroupsAttr(
S.AMDGPU().addAMDGPUMaxNumWorkGroupsAttr(New, Attr, XExpr, YExpr, ZExpr);
}
+static void instantiateDependentCUDAClusterDimsAttr(
+ Sema &S, const MultiLevelTemplateArgumentList &TemplateArgs,
+ const CUDAClusterDimsAttr &Attr, Decl *New) {
+ EnterExpressionEvaluationContext Unevaluated(
+ S, Sema::ExpressionEvaluationContext::ConstantEvaluated);
+
+ Expr *XExpr = nullptr;
+ Expr *YExpr = nullptr;
+ Expr *ZExpr = nullptr;
+
+ if (Attr.getX()) {
+ ExprResult ResultX = S.SubstExpr(Attr.getX(), TemplateArgs);
+ if (ResultX.isUsable())
+ XExpr = ResultX.getAs<Expr>();
+ }
+
+ if (Attr.getY()) {
+ ExprResult ResultY = S.SubstExpr(Attr.getY(), TemplateArgs);
+ if (ResultY.isUsable())
+ YExpr = ResultY.getAs<Expr>();
+ }
+
+ if (Attr.getZ()) {
+ ExprResult ResultZ = S.SubstExpr(Attr.getZ(), TemplateArgs);
+ if (ResultZ.isUsable())
+ ZExpr = ResultZ.getAs<Expr>();
+ }
+
+ if (XExpr)
+ S.addClusterDimsAttr(New, Attr, XExpr, YExpr, ZExpr);
+}
+
// This doesn't take any template parameters, but we have a custom action that
// needs to happen when the kernel itself is instantiated. We need to run the
// ItaniumMangler to mark the names required to name this kernel.
@@ -883,6 +915,11 @@ void Sema::InstantiateAttrs(const MultiLevelTemplateArgumentList &TemplateArgs,
*this, TemplateArgs, *AMDGPUMaxNumWorkGroups, New);
}
+ if (const auto *CUDAClusterDims = dyn_cast<CUDAClusterDimsAttr>(TmplAttr)) {
+ instantiateDependentCUDAClusterDimsAttr(*this, TemplateArgs,
+ *CUDAClusterDims, New);
+ }
+
if (const auto *ParamAttr = dyn_cast<HLSLParamModifierAttr>(TmplAttr)) {
instantiateDependentHLSLParamModifierAttr(*this, TemplateArgs, ParamAttr,
New);
diff --git a/clang/test/CodeGenCUDA/Inputs/cuda.h b/clang/test/CodeGenCUDA/Inputs/cuda.h
index dc85eae0c5178..e7ad784335027 100644
--- a/clang/test/CodeGenCUDA/Inputs/cuda.h
+++ b/clang/test/CodeGenCUDA/Inputs/cuda.h
@@ -13,6 +13,8 @@
#endif
#define __launch_bounds__(...) __attribute__((launch_bounds(__VA_ARGS__)))
#define __grid_constant__ __attribute__((grid_constant))
+#define __cluster_dims__(...) __attribute__((cluster_dims(__VA_ARGS__)))
+#define __no_cluster__ __attribute__((no_cluster))
#else
#define __constant__
#define __device__
@@ -22,6 +24,8 @@
#define __managed__
#define __launch_bounds__(...)
#define __grid_constant__
+#define __cluster_dims__(...)
+#define __no_cluster__
#endif
struct dim3 {
diff --git a/clang/test/CodeGenCUDA/cluster_dims.cu b/clang/test/CodeGenCUDA/cluster_dims.cu
new file mode 100644
index 0000000000000..00635e3572a7f
--- /dev/null
+++ b/clang/test/CodeGenCUDA/cluster_dims.cu
@@ -0,0 +1,38 @@
+// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -target-cpu gfx1250 -fcuda-is-device -emit-llvm -x hip -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple x86_64-pc-linux-gnu -aux-triple amdgcn-amd-amdhsa -emit-llvm -x hip -o - %s | FileCheck --check-prefix=HOST %s
+
+#include "Inputs/cuda.h"
+
+const int constint = 4;
+
+// HOST-NOT: "amdgpu-cluster-dims"
+
+// CHECK: "amdgpu-cluster-dims"="2,2,2"
+__global__ void __cluster_dims__(2, 2, 2) test_literal_3d() {}
+
+// CHECK: "amdgpu-cluster-dims"="2,2,1"
+__global__ void __cluster_dims__(2, 2) test_literal_2d() {}
+
+// CHECK: "amdgpu-cluster-dims"="4,1,1"
+__global__ void __cluster_dims__(4) test_literal_1d() {}
+
+// CHECK: "amdgpu-cluster-dims"="4,2,1"
+__global__ void __cluster_dims__(constint, constint / 2, 1) test_constant() {}
+
+// CHECK: "amdgpu-cluster-dims"="0,0,0"
+__global__ void __no_cluster__ test_no_cluster() {}
+
+// CHECK: "amdgpu-cluster-dims"="7,1,1"
+template<unsigned a>
+__global__ void __cluster_dims__(a) test_template_1d() {}
+template __global__ void test_template_1d<7>();
+
+// CHECK: "amdgpu-cluster-dims"="2,6,1"
+template<unsigned a, unsigned b>
+__global__ void __cluster_dims__(a, b) test_template_2d() {}
+template __global__ void test_template_2d<2, 6>();
+
+// CHECK: "amdgpu-cluster-dims"="1,2,3"
+template<unsigned a, unsigned b, unsigned c>
+__global__ void __cluster_dims__(a, b, c) test_template_3d() {}
+template __global__ void test_template_3d<1, 2, 3>();
diff --git a/clang/test/Misc/pragma-attribute-supported-attributes-list.test b/clang/test/Misc/pragma-attribute-supported-attributes-list.test
index 37ff33e5a1523..c8c913448d968 100644
--- a/clang/test/Misc/pragma-attribute-supported-attributes-list.test
+++ b/clang/test/Misc/pragma-attribute-supported-attributes-list.test
@@ -35,6 +35,7 @@
// CHECK-NEXT: CFUnknownTransfer (SubjectMatchRule_function)
// CHECK-NEXT: CPUDispatch (SubjectMatchRule_function)
// CHECK-NEXT: CPUSpecific (SubjectMatchRule_function)
+// CHECK-NEXT: CUDAClusterDims (SubjectMatchRule_function)
// CHECK-NEXT: CUDAConstant (SubjectMatchRule_variable)
// CHECK-NEXT: CUDADevice (SubjectMatchRule_function, SubjectMatchRule_variable)
// CHECK-NEXT: CUDADeviceBuiltinSurfaceType (SubjectMatchRule_record)
@@ -43,6 +44,7 @@
// CHECK-NEXT: CUDAGridConstant (SubjectMatchRule_variable_is_parameter)
// CHECK-NEXT: CUDAHost (SubjectMatchRule_function)
// CHECK-NEXT: CUDALaunchBounds (SubjectMatchRule_objc_method, SubjectMatchRule_hasType_functionType)
+// CHECK-NEXT: CUDANoCluster (SubjectMatchRule_function)
// CHECK-NEXT: CUDAShared (SubjectMatchRule_variable)
// CHECK-NEXT: CXX11NoReturn (SubjectMatchRule_function)
// CHECK-NEXT: CallableWhen (SubjectMatchRule_function_is_member)
diff --git a/clang/test/SemaCUDA/Inputs/cuda.h b/clang/test/SemaCUDA/Inputs/cuda.h
index 10db947d8246c..2bf45e03d91c7 100644
--- a/clang/test/SemaCUDA/Inputs/cuda.h
+++ b/clang/test/SemaCUDA/Inputs/cuda.h
@@ -13,6 +13,8 @@
#define __managed__ __attribute__((managed))
#define __grid_constant__ __attribute__((grid_constant))
#define __launch_bounds__(...) __attribute__((launch_bounds(__VA_ARGS__)))
+#define __cluster_dims__(...) __attribute__((cluster_dims(__VA_ARGS__)))
+#define __no_cluster__ __attribute__((no_cluster))
struct dim3 {
unsigned x, y, z;
diff --git a/clang/test/SemaCUDA/cluster_dims.cu b/clang/test/SemaCUDA/cluster_dims.cu
new file mode 100644
index 0000000000000..3cd0e0197c29b
--- /dev/null
+++ b/clang/test/SemaCUDA/cluster_dims.cu
@@ -0,0 +1,64 @@
+// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -fcuda-is-device -ast-print -x hip -verify=NS,all %s
+// RUN: %clang_cc1 -triple nvptx-nvidia-cuda -fcuda-is-device -ast-print -x hip -verify=NS,all %s
+// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -target-cpu gfx1250 -fcuda-is-device -ast-print -x hip -verify=amd,common,all %s | FileCheck -check-prefixes=CHECK %s
+// RUN: %clang_cc1 -triple nvptx-nvidia-cuda -target-cpu sm_90 -fcuda-is-device -ast-print -x hip -verify=cuda,common,all %s | FileCheck -check-prefixes=CHECK %s
+// RUN: %clang_cc1 -triple x86_64-pc-linux-gnu -aux-triple amdgcn-amd-amdhsa -ast-print -x hip -verify=amd,common,all %s | FileCheck -check-prefixes=CHECK %s
+
+#include "Inputs/cuda.h"
+
+const int constint = 4;
+
+// CHECK: __attribute__((global)) __attribute__((cluster_dims(2, 2, 2))) void test_literal_3d()
+__global__ void __cluster_dims__(2, 2, 2) test_literal_3d() {} //NS-error {{__cluster_dims__ is not supported for this GPU architecture}}
+
+// CHECK: __attribute__((global)) __attribute__((cluster_dims(2, 2))) void test_literal_2d()
+__global__ void __cluster_dims__(2, 2) test_literal_2d() {} //NS-error {{__cluster_dims__ is not supported for this GPU architecture}}
+
+// CHECK: __attribute__((global)) __attribute__((cluster_dims(4))) void test_literal_1d()
+__global__ void __cluster_dims__(4) test_literal_1d() {} //NS-error {{__cluster_dims__ is not supported for this GPU architecture}}
+
+// CHECK: __attribute__((global)) __attribute__((cluster_dims(constint, constint / 4, 1))) void test_constant()
+__global__ void __cluster_dims__(constint, constint / 4, 1) test_constant() {} //NS-error {{__cluster_dims__ is not supported for this GPU architecture}}
+
+// CHECK: template <int x, int y, int z> void test_template() __attribute__((cluster_dims(x, y, z)))
+template <int x, int y, int z> void test_template(void) __cluster_dims__(x, y, z){} //NS-error {{__cluster_dims__ is not supported for this GPU architecture}}
+
+// CHECK: template <int x, int y, int z> void test_template_expr() __attribute__((cluster_dims(x + constint, y, z)))
+template <int x, int y, int z> void test_template_expr(void) __cluster_dims__(x + constint, y, z) {} //NS-error {{__cluster_dims__ is not supported for this GPU architecture}}
+
+//NS-error@+1 {{__cluster_dims__ is not supported for this GPU architecture}}
+__global__ void __cluster_dims__(32, 2, 4) test...
[truncated]
|
@llvm/pr-subscribers-backend-x86 Author: Shilei Tian (shiltian) ChangesThis PR adds basic frontend support for Co-authored-by: Yaxun (Sam) Liu <yaxun.liu@amd.com> Patch is 22.58 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/156686.diff 12 Files Affected:
diff --git a/clang/include/clang/Basic/Attr.td b/clang/include/clang/Basic/Attr.td
index 29364c5903d31..efb019d43cbe4 100644
--- a/clang/include/clang/Basic/Attr.td
+++ b/clang/include/clang/Basic/Attr.td
@@ -979,7 +979,7 @@ def AnalyzerNoReturn : InheritableAttr {
}
def InferredNoReturn : InheritableAttr {
- let Spellings = [];
+ let Spellings = [];
let SemaHandler = 0;
let Subjects = SubjectList<[Function], ErrorDiag>;
let Documentation = [InternalOnly];
@@ -1557,6 +1557,23 @@ def HIPManaged : InheritableAttr {
let Documentation = [HIPManagedAttrDocs];
}
+def CUDAClusterDims : InheritableAttr {
+ let Spellings = [GNU<"cluster_dims">, Declspec<"__cluster_dims__">];
+ let Args = [ExprArgument<"X">, ExprArgument<"Y", 1>, ExprArgument<"Z", 1>];
+ let Subjects = SubjectList<[Function], ErrorDiag, "kernel functions">;
+ let LangOpts = [CUDA];
+ let Documentation = [Undocumented];
+}
+
+def CUDANoCluster : InheritableAttr {
+ let Spellings = [GNU<"no_cluster">, Declspec<"__no_cluster__">];
+ let Subjects = SubjectList<[Function], ErrorDiag, "kernel functions">;
+ let LangOpts = [CUDA];
+ let Documentation = [Undocumented];
+}
+
+def : MutualExclusions<[CUDAClusterDims, CUDANoCluster]>;
+
def CUDAInvalidTarget : InheritableAttr {
let Spellings = [];
let Subjects = SubjectList<[Function]>;
diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td
index 3146f20da1424..32b6944f2e038 100644
--- a/clang/include/clang/Basic/DiagnosticSemaKinds.td
+++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td
@@ -10698,7 +10698,7 @@ def warn_dangling_reference_captured_by_unknown : Warning<
// Diagnostics based on the Lifetime safety analysis.
def warn_lifetime_safety_loan_expires_permissive : Warning<
- "object whose reference is captured does not live long enough">,
+ "object whose reference is captured does not live long enough">,
InGroup<LifetimeSafetyPermissive>, DefaultIgnore;
def warn_lifetime_safety_loan_expires_strict : Warning<
"object whose reference is captured may not live long enough">,
@@ -13027,6 +13027,14 @@ def warn_cuda_maxclusterrank_sm_90 : Warning<
"maxclusterrank requires sm_90 or higher, CUDA arch provided: %0, ignoring "
"%1 attribute">, InGroup<IgnoredAttributes>;
+def err_cuda_cluster_attr_not_supported : Error<
+ "%select{__cluster_dims__|__no_cluster__}0 is not supported for this GPU architecture"
+>;
+
+def err_cuda_cluster_dims_too_large : Error<
+ "only a maximum of %0 thread blocks in a cluster is supported"
+>;
+
// VTable pointer authentication errors
def err_non_polymorphic_vtable_pointer_auth : Error<
"cannot set vtable pointer authentication on monomorphic type %0">;
diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h
index aa035a1555950..c9c77bd565260 100644
--- a/clang/include/clang/Sema/Sema.h
+++ b/clang/include/clang/Sema/Sema.h
@@ -5002,6 +5002,14 @@ class Sema final : public SemaBase {
void AddLaunchBoundsAttr(Decl *D, const AttributeCommonInfo &CI,
Expr *MaxThreads, Expr *MinBlocks, Expr *MaxBlocks);
+ /// Add a cluster_dims attribute to a particular declaration.
+ CUDAClusterDimsAttr *createClusterDimsAttr(const AttributeCommonInfo &CI,
+ Expr *X, Expr *Y, Expr *Z);
+ void addClusterDimsAttr(Decl *D, const AttributeCommonInfo &CI, Expr *X,
+ Expr *Y, Expr *Z);
+ /// Add a no_cluster attribute to a particular declaration.
+ void addNoClusterAttr(Decl *D, const AttributeCommonInfo &CI);
+
enum class RetainOwnershipKind { NS, CF, OS };
UuidAttr *mergeUuidAttr(Decl *D, const AttributeCommonInfo &CI,
diff --git a/clang/lib/CodeGen/Targets/AMDGPU.cpp b/clang/lib/CodeGen/Targets/AMDGPU.cpp
index 0fcbf7e458a34..48855ce485f91 100644
--- a/clang/lib/CodeGen/Targets/AMDGPU.cpp
+++ b/clang/lib/CodeGen/Targets/AMDGPU.cpp
@@ -342,6 +342,9 @@ static bool requiresAMDGPUProtectedVisibility(const Decl *D,
void AMDGPUTargetCodeGenInfo::setFunctionDeclAttributes(
const FunctionDecl *FD, llvm::Function *F, CodeGenModule &M) const {
+ llvm::StringMap<bool> TargetFetureMap;
+ M.getContext().getFunctionFeatureMap(TargetFetureMap, FD);
+
const auto *ReqdWGS =
M.getLangOpts().OpenCL ? FD->getAttr<ReqdWorkGroupSizeAttr>() : nullptr;
const bool IsOpenCLKernel =
@@ -402,6 +405,29 @@ void AMDGPUTargetCodeGenInfo::setFunctionDeclAttributes(
F->addFnAttr("amdgpu-max-num-workgroups", AttrVal.str());
}
+
+ if (auto *Attr = FD->getAttr<CUDAClusterDimsAttr>()) {
+ uint32_t X =
+ Attr->getX()->EvaluateKnownConstInt(M.getContext()).getExtValue();
+ uint32_t Y =
+ Attr->getY()
+ ? Attr->getY()->EvaluateKnownConstInt(M.getContext()).getExtValue()
+ : 1;
+ uint32_t Z =
+ Attr->getZ()
+ ? Attr->getZ()->EvaluateKnownConstInt(M.getContext()).getExtValue()
+ : 1;
+
+ llvm::SmallString<32> AttrVal;
+ llvm::raw_svector_ostream OS(AttrVal);
+ OS << X << ',' << Y << ',' << Z;
+ F->addFnAttr("amdgpu-cluster-dims", AttrVal.str());
+ }
+
+ // OpenCL doesn't support cluster feature.
+ if ((IsOpenCLKernel && TargetFetureMap.lookup("gfx1250-insts")) ||
+ FD->getAttr<CUDANoClusterAttr>())
+ F->addFnAttr("amdgpu-cluster-dims", "0,0,0");
}
void AMDGPUTargetCodeGenInfo::setTargetAttributes(
diff --git a/clang/lib/Headers/__clang_hip_runtime_wrapper.h b/clang/lib/Headers/__clang_hip_runtime_wrapper.h
index da1e39ac7270e..fb0ece96e1418 100644
--- a/clang/lib/Headers/__clang_hip_runtime_wrapper.h
+++ b/clang/lib/Headers/__clang_hip_runtime_wrapper.h
@@ -25,6 +25,8 @@
#define __constant__ __attribute__((constant))
#define __managed__ __attribute__((managed))
+#define __cluster_dims__(...) __attribute__((cluster_dims(__VA_ARGS__)))
+
#if !defined(__cplusplus) || __cplusplus < 201103L
#define nullptr NULL;
#endif
diff --git a/clang/lib/Sema/SemaDeclAttr.cpp b/clang/lib/Sema/SemaDeclAttr.cpp
index 3ded60cd8b073..ad2c28843f970 100644
--- a/clang/lib/Sema/SemaDeclAttr.cpp
+++ b/clang/lib/Sema/SemaDeclAttr.cpp
@@ -5654,6 +5654,130 @@ static void handleLaunchBoundsAttr(Sema &S, Decl *D, const ParsedAttr &AL) {
AL.getNumArgs() > 2 ? AL.getArgAsExpr(2) : nullptr);
}
+static std::pair<Expr *, int>
+makeClusterDimsArgExpr(Sema &S, Expr *E, const CUDAClusterDimsAttr &AL,
+ const unsigned Idx) {
+ if (S.DiagnoseUnexpandedParameterPack(E))
+ return {nullptr, 0};
+
+ // Accept template arguments for now as they depend on something else.
+ // We'll get to check them when they eventually get instantiated.
+ if (E->isValueDependent())
+ return {E, 1};
+
+ std::optional<llvm::APSInt> I = llvm::APSInt(64);
+ if (!(I = E->getIntegerConstantExpr(S.Context))) {
+ S.Diag(E->getExprLoc(), diag::err_attribute_argument_n_type)
+ << &AL << Idx << AANT_ArgumentIntegerConstant << E->getSourceRange();
+ return {nullptr, 0};
+ }
+ // Make sure we can fit it in 4 bits.
+ if (!I->isIntN(4)) {
+ S.Diag(E->getExprLoc(), diag::err_ice_too_large)
+ << toString(*I, 10, false) << 4 << /* Unsigned */ 1;
+ return {nullptr, 0};
+ }
+ if (*I < 0)
+ S.Diag(E->getExprLoc(), diag::warn_attribute_argument_n_negative)
+ << &AL << Idx << E->getSourceRange();
+
+ // We may need to perform implicit conversion of the argument.
+ InitializedEntity Entity = InitializedEntity::InitializeParameter(
+ S.Context, S.Context.getConstType(S.Context.IntTy), /*consume*/ false);
+ ExprResult ValArg = S.PerformCopyInitialization(Entity, SourceLocation(), E);
+ assert(!ValArg.isInvalid() &&
+ "Unexpected PerformCopyInitialization() failure.");
+
+ return {ValArg.getAs<Expr>(), I->getZExtValue()};
+}
+
+CUDAClusterDimsAttr *Sema::createClusterDimsAttr(const AttributeCommonInfo &CI,
+ Expr *X, Expr *Y, Expr *Z) {
+ CUDAClusterDimsAttr TmpAttr(Context, CI, X, Y, Z);
+
+ int ValX = 1;
+ int ValY = 1;
+ int ValZ = 1;
+
+ std::tie(X, ValX) = makeClusterDimsArgExpr(*this, X, TmpAttr, /*Idx=*/0);
+ if (!X)
+ return nullptr;
+
+ if (Y) {
+ std::tie(Y, ValY) = makeClusterDimsArgExpr(*this, Y, TmpAttr, /*Idx=*/1);
+ if (!Y)
+ return nullptr;
+ }
+
+ if (Z) {
+ std::tie(Z, ValZ) = makeClusterDimsArgExpr(*this, Z, TmpAttr, /*Idx=*/2);
+ if (!Z)
+ return nullptr;
+ }
+
+ int FlatDim = ValX * ValY * ValZ;
+ auto TT = (!Context.getLangOpts().CUDAIsDevice && Context.getAuxTargetInfo())
+ ? Context.getAuxTargetInfo()->getTriple()
+ : Context.getTargetInfo().getTriple();
+ int MaxDim = 1;
+ if (TT.isNVPTX())
+ MaxDim = 8;
+ else if (TT.isAMDGPU())
+ MaxDim = 16;
+ else
+ return nullptr;
+
+ // A maximum of 8 thread blocks in a cluster is supported as a portable
+ // cluster size in CUDA. The number is 16 for AMDGPU.
+ if (FlatDim > MaxDim) {
+ Diag(CI.getLoc(), diag::err_cuda_cluster_dims_too_large) << MaxDim;
+ return nullptr;
+ }
+
+ return ::new (Context) CUDAClusterDimsAttr(Context, CI, X, Y, Z);
+}
+
+void Sema::addClusterDimsAttr(Decl *D, const AttributeCommonInfo &CI, Expr *X,
+ Expr *Y, Expr *Z) {
+ if (auto *Attr = createClusterDimsAttr(CI, X, Y, Z))
+ D->addAttr(Attr);
+}
+
+void Sema::addNoClusterAttr(Decl *D, const AttributeCommonInfo &CI) {
+ if (CUDANoClusterAttr *Attr = ::new (Context) CUDANoClusterAttr(Context, CI))
+ D->addAttr(Attr);
+}
+
+static void handleClusterDimsAttr(Sema &S, Decl *D, const ParsedAttr &AL) {
+ auto &TTI = S.Context.getTargetInfo();
+ auto Arch = StringToOffloadArch(TTI.getTargetOpts().CPU);
+ if ((TTI.getTriple().isNVPTX() && Arch < clang::OffloadArch::SM_90) ||
+ (TTI.getTriple().isAMDGPU() && Arch < clang::OffloadArch::GFX1250)) {
+ S.Diag(AL.getLoc(), diag::err_cuda_cluster_attr_not_supported) << 0;
+ return;
+ }
+
+ if (!AL.checkAtLeastNumArgs(S, /*Num=*/1) ||
+ !AL.checkAtMostNumArgs(S, /*Num=*/3))
+ return;
+
+ S.addClusterDimsAttr(D, AL, AL.getArgAsExpr(0),
+ AL.getNumArgs() > 1 ? AL.getArgAsExpr(1) : nullptr,
+ AL.getNumArgs() > 2 ? AL.getArgAsExpr(2) : nullptr);
+}
+
+static void handleNoClusterAttr(Sema &S, Decl *D, const ParsedAttr &AL) {
+ auto &TTI = S.Context.getTargetInfo();
+ auto Arch = StringToOffloadArch(TTI.getTargetOpts().CPU);
+ if ((TTI.getTriple().isNVPTX() && Arch < clang::OffloadArch::SM_90) ||
+ (TTI.getTriple().isAMDGPU() && Arch < clang::OffloadArch::GFX1250)) {
+ S.Diag(AL.getLoc(), diag::err_cuda_cluster_attr_not_supported) << 1;
+ return;
+ }
+
+ S.addNoClusterAttr(D, AL);
+}
+
static void handleArgumentWithTypeTagAttr(Sema &S, Decl *D,
const ParsedAttr &AL) {
if (!AL.isArgIdent(0)) {
@@ -7105,6 +7229,12 @@ ProcessDeclAttribute(Sema &S, Scope *scope, Decl *D, const ParsedAttr &AL,
case ParsedAttr::AT_CUDALaunchBounds:
handleLaunchBoundsAttr(S, D, AL);
break;
+ case ParsedAttr::AT_CUDAClusterDims:
+ handleClusterDimsAttr(S, D, AL);
+ break;
+ case ParsedAttr::AT_CUDANoCluster:
+ handleNoClusterAttr(S, D, AL);
+ break;
case ParsedAttr::AT_Restrict:
handleRestrictAttr(S, D, AL);
break;
diff --git a/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp b/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp
index ee1b520fa46e9..aab93a93ba95b 100644
--- a/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp
+++ b/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp
@@ -681,6 +681,38 @@ static void instantiateDependentAMDGPUMaxNumWorkGroupsAttr(
S.AMDGPU().addAMDGPUMaxNumWorkGroupsAttr(New, Attr, XExpr, YExpr, ZExpr);
}
+static void instantiateDependentCUDAClusterDimsAttr(
+ Sema &S, const MultiLevelTemplateArgumentList &TemplateArgs,
+ const CUDAClusterDimsAttr &Attr, Decl *New) {
+ EnterExpressionEvaluationContext Unevaluated(
+ S, Sema::ExpressionEvaluationContext::ConstantEvaluated);
+
+ Expr *XExpr = nullptr;
+ Expr *YExpr = nullptr;
+ Expr *ZExpr = nullptr;
+
+ if (Attr.getX()) {
+ ExprResult ResultX = S.SubstExpr(Attr.getX(), TemplateArgs);
+ if (ResultX.isUsable())
+ XExpr = ResultX.getAs<Expr>();
+ }
+
+ if (Attr.getY()) {
+ ExprResult ResultY = S.SubstExpr(Attr.getY(), TemplateArgs);
+ if (ResultY.isUsable())
+ YExpr = ResultY.getAs<Expr>();
+ }
+
+ if (Attr.getZ()) {
+ ExprResult ResultZ = S.SubstExpr(Attr.getZ(), TemplateArgs);
+ if (ResultZ.isUsable())
+ ZExpr = ResultZ.getAs<Expr>();
+ }
+
+ if (XExpr)
+ S.addClusterDimsAttr(New, Attr, XExpr, YExpr, ZExpr);
+}
+
// This doesn't take any template parameters, but we have a custom action that
// needs to happen when the kernel itself is instantiated. We need to run the
// ItaniumMangler to mark the names required to name this kernel.
@@ -883,6 +915,11 @@ void Sema::InstantiateAttrs(const MultiLevelTemplateArgumentList &TemplateArgs,
*this, TemplateArgs, *AMDGPUMaxNumWorkGroups, New);
}
+ if (const auto *CUDAClusterDims = dyn_cast<CUDAClusterDimsAttr>(TmplAttr)) {
+ instantiateDependentCUDAClusterDimsAttr(*this, TemplateArgs,
+ *CUDAClusterDims, New);
+ }
+
if (const auto *ParamAttr = dyn_cast<HLSLParamModifierAttr>(TmplAttr)) {
instantiateDependentHLSLParamModifierAttr(*this, TemplateArgs, ParamAttr,
New);
diff --git a/clang/test/CodeGenCUDA/Inputs/cuda.h b/clang/test/CodeGenCUDA/Inputs/cuda.h
index dc85eae0c5178..e7ad784335027 100644
--- a/clang/test/CodeGenCUDA/Inputs/cuda.h
+++ b/clang/test/CodeGenCUDA/Inputs/cuda.h
@@ -13,6 +13,8 @@
#endif
#define __launch_bounds__(...) __attribute__((launch_bounds(__VA_ARGS__)))
#define __grid_constant__ __attribute__((grid_constant))
+#define __cluster_dims__(...) __attribute__((cluster_dims(__VA_ARGS__)))
+#define __no_cluster__ __attribute__((no_cluster))
#else
#define __constant__
#define __device__
@@ -22,6 +24,8 @@
#define __managed__
#define __launch_bounds__(...)
#define __grid_constant__
+#define __cluster_dims__(...)
+#define __no_cluster__
#endif
struct dim3 {
diff --git a/clang/test/CodeGenCUDA/cluster_dims.cu b/clang/test/CodeGenCUDA/cluster_dims.cu
new file mode 100644
index 0000000000000..00635e3572a7f
--- /dev/null
+++ b/clang/test/CodeGenCUDA/cluster_dims.cu
@@ -0,0 +1,38 @@
+// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -target-cpu gfx1250 -fcuda-is-device -emit-llvm -x hip -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple x86_64-pc-linux-gnu -aux-triple amdgcn-amd-amdhsa -emit-llvm -x hip -o - %s | FileCheck --check-prefix=HOST %s
+
+#include "Inputs/cuda.h"
+
+const int constint = 4;
+
+// HOST-NOT: "amdgpu-cluster-dims"
+
+// CHECK: "amdgpu-cluster-dims"="2,2,2"
+__global__ void __cluster_dims__(2, 2, 2) test_literal_3d() {}
+
+// CHECK: "amdgpu-cluster-dims"="2,2,1"
+__global__ void __cluster_dims__(2, 2) test_literal_2d() {}
+
+// CHECK: "amdgpu-cluster-dims"="4,1,1"
+__global__ void __cluster_dims__(4) test_literal_1d() {}
+
+// CHECK: "amdgpu-cluster-dims"="4,2,1"
+__global__ void __cluster_dims__(constint, constint / 2, 1) test_constant() {}
+
+// CHECK: "amdgpu-cluster-dims"="0,0,0"
+__global__ void __no_cluster__ test_no_cluster() {}
+
+// CHECK: "amdgpu-cluster-dims"="7,1,1"
+template<unsigned a>
+__global__ void __cluster_dims__(a) test_template_1d() {}
+template __global__ void test_template_1d<7>();
+
+// CHECK: "amdgpu-cluster-dims"="2,6,1"
+template<unsigned a, unsigned b>
+__global__ void __cluster_dims__(a, b) test_template_2d() {}
+template __global__ void test_template_2d<2, 6>();
+
+// CHECK: "amdgpu-cluster-dims"="1,2,3"
+template<unsigned a, unsigned b, unsigned c>
+__global__ void __cluster_dims__(a, b, c) test_template_3d() {}
+template __global__ void test_template_3d<1, 2, 3>();
diff --git a/clang/test/Misc/pragma-attribute-supported-attributes-list.test b/clang/test/Misc/pragma-attribute-supported-attributes-list.test
index 37ff33e5a1523..c8c913448d968 100644
--- a/clang/test/Misc/pragma-attribute-supported-attributes-list.test
+++ b/clang/test/Misc/pragma-attribute-supported-attributes-list.test
@@ -35,6 +35,7 @@
// CHECK-NEXT: CFUnknownTransfer (SubjectMatchRule_function)
// CHECK-NEXT: CPUDispatch (SubjectMatchRule_function)
// CHECK-NEXT: CPUSpecific (SubjectMatchRule_function)
+// CHECK-NEXT: CUDAClusterDims (SubjectMatchRule_function)
// CHECK-NEXT: CUDAConstant (SubjectMatchRule_variable)
// CHECK-NEXT: CUDADevice (SubjectMatchRule_function, SubjectMatchRule_variable)
// CHECK-NEXT: CUDADeviceBuiltinSurfaceType (SubjectMatchRule_record)
@@ -43,6 +44,7 @@
// CHECK-NEXT: CUDAGridConstant (SubjectMatchRule_variable_is_parameter)
// CHECK-NEXT: CUDAHost (SubjectMatchRule_function)
// CHECK-NEXT: CUDALaunchBounds (SubjectMatchRule_objc_method, SubjectMatchRule_hasType_functionType)
+// CHECK-NEXT: CUDANoCluster (SubjectMatchRule_function)
// CHECK-NEXT: CUDAShared (SubjectMatchRule_variable)
// CHECK-NEXT: CXX11NoReturn (SubjectMatchRule_function)
// CHECK-NEXT: CallableWhen (SubjectMatchRule_function_is_member)
diff --git a/clang/test/SemaCUDA/Inputs/cuda.h b/clang/test/SemaCUDA/Inputs/cuda.h
index 10db947d8246c..2bf45e03d91c7 100644
--- a/clang/test/SemaCUDA/Inputs/cuda.h
+++ b/clang/test/SemaCUDA/Inputs/cuda.h
@@ -13,6 +13,8 @@
#define __managed__ __attribute__((managed))
#define __grid_constant__ __attribute__((grid_constant))
#define __launch_bounds__(...) __attribute__((launch_bounds(__VA_ARGS__)))
+#define __cluster_dims__(...) __attribute__((cluster_dims(__VA_ARGS__)))
+#define __no_cluster__ __attribute__((no_cluster))
struct dim3 {
unsigned x, y, z;
diff --git a/clang/test/SemaCUDA/cluster_dims.cu b/clang/test/SemaCUDA/cluster_dims.cu
new file mode 100644
index 0000000000000..3cd0e0197c29b
--- /dev/null
+++ b/clang/test/SemaCUDA/cluster_dims.cu
@@ -0,0 +1,64 @@
+// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -fcuda-is-device -ast-print -x hip -verify=NS,all %s
+// RUN: %clang_cc1 -triple nvptx-nvidia-cuda -fcuda-is-device -ast-print -x hip -verify=NS,all %s
+// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -target-cpu gfx1250 -fcuda-is-device -ast-print -x hip -verify=amd,common,all %s | FileCheck -check-prefixes=CHECK %s
+// RUN: %clang_cc1 -triple nvptx-nvidia-cuda -target-cpu sm_90 -fcuda-is-device -ast-print -x hip -verify=cuda,common,all %s | FileCheck -check-prefixes=CHECK %s
+// RUN: %clang_cc1 -triple x86_64-pc-linux-gnu -aux-triple amdgcn-amd-amdhsa -ast-print -x hip -verify=amd,common,all %s | FileCheck -check-prefixes=CHECK %s
+
+#include "Inputs/cuda.h"
+
+const int constint = 4;
+
+// CHECK: __attribute__((global)) __attribute__((cluster_dims(2, 2, 2))) void test_literal_3d()
+__global__ void __cluster_dims__(2, 2, 2) test_literal_3d() {} //NS-error {{__cluster_dims__ is not supported for this GPU architecture}}
+
+// CHECK: __attribute__((global)) __attribute__((cluster_dims(2, 2))) void test_literal_2d()
+__global__ void __cluster_dims__(2, 2) test_literal_2d() {} //NS-error {{__cluster_dims__ is not supported for this GPU architecture}}
+
+// CHECK: __attribute__((global)) __attribute__((cluster_dims(4))) void test_literal_1d()
+__global__ void __cluster_dims__(4) test_literal_1d() {} //NS-error {{__cluster_dims__ is not supported for this GPU architecture}}
+
+// CHECK: __attribute__((global)) __attribute__((cluster_dims(constint, constint / 4, 1))) void test_constant()
+__global__ void __cluster_dims__(constint, constint / 4, 1) test_constant() {} //NS-error {{__cluster_dims__ is not supported for this GPU architecture}}
+
+// CHECK: template <int x, int y, int z> void test_template() __attribute__((cluster_dims(x, y, z)))
+template <int x, int y, int z> void test_template(void) __cluster_dims__(x, y, z){} //NS-error {{__cluster_dims__ is not supported for this GPU architecture}}
+
+// CHECK: template <int x, int y, int z> void test_template_expr() __attribute__((cluster_dims(x + constint, y, z)))
+template <int x, int y, int z> void test_template_expr(void) __cluster_dims__(x + constint, y, z) {} //NS-error {{__cluster_dims__ is not supported for this GPU architecture}}
+
+//NS-error@+1 {{__cluster_dims__ is not supported for this GPU architecture}}
+__global__ void __cluster_dims__(32, 2, 4) test...
[truncated]
|
4d6e309
to
190dd7a
Compare
It isn't clear what these are, why/whether we want them, what their signature is, etc. Additionally, we do not allow adding undocumented attributes anymore, so please make sure these get documented. And why can't these just use the other dimensions-attributes that we have (of which we have plenty). |
It is the support for
Noted. Will add some documents.
I don't follow this one. Can you explain more? These are for different purposes. |
ALL of this (including a summary that doesn't require a link) should be in the commit message and documentation.
are they? There is nothing on this patch that indicates that (being a bit coy, the documentation should probably make it clear what the differences are/how it is used differently than all the other 'dim' based attributes). |
64f5398
to
7d41777
Compare
This PR adds basic frontend support for `__cluster_dims__` and `__no_cluster__` attribute. Co-authored-by: Yaxun (Sam) Liu <yaxun.liu@amd.com> Co-authored-by: Jay Foad <jay.foad@amd.com>
7d41777
to
e57c037
Compare
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM, but I will defer to people more familiar with clang.
let Category = DocCatDecl; | ||
let Content = [{ | ||
In CUDA/HIP programming, the ``__cluster_dims__`` attribute can be applied to a kernel function | ||
to set the dimensions of a thread block cluster. This allows to group multiple thread blocks into |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
"This allows to group" reads awkwardly, awkwardly enough I don't have a suggestion on how to fix it, but please try a reword.
to set the dimensions of a thread block cluster. This allows to group multiple thread blocks into | ||
a larger unit called a "cluster". `__cluster_dims__` defines the cluster size as ``(X, Y, Z)``, | ||
where each value is the number of thread blocks in that dimension. | ||
}]; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Add a code sample.
let Category = DocCatDecl; | ||
let Content = [{ | ||
In CUDA/HIP programming, the ``__no_cluster__`` attribute can be applied to a kernel function to | ||
indicate that the thread block cluster feature will not be enabled at both compile time and kernel |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
what is a thread block cluster feature
? Can that be explained in these docs?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I don't think that needs to be explained in LLVM. It's like we don't explain what a grid or a thread block is in LLVM either. LLVM docs are not a CUDA/HIP programming guide.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Perhaps we should :) The point of the docs here is to help newbies have SOME level of idea what the thing does so they can research it more. Right now it is an expert-only documentation.
let Content = [{ | ||
In CUDA/HIP programming, the ``__no_cluster__`` attribute can be applied to a kernel function to | ||
indicate that the thread block cluster feature will not be enabled at either compile time or kernel | ||
launch time. Note: this is an LLVM/Clang-only attribute. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Instead of as a note, I'd suggest making the first sentence be: ..., the Clang/LLVM-exclusive __no_cluster__ attribute
@@ -13027,6 +13027,14 @@ def warn_cuda_maxclusterrank_sm_90 : Warning< | |||
"maxclusterrank requires sm_90 or higher, CUDA arch provided: %0, ignoring " | |||
"%1 attribute">, InGroup<IgnoredAttributes>; | |||
|
|||
def err_cuda_cluster_attr_not_supported : Error< | |||
"%select{__cluster_dims__|__no_cluster__}0 is not supported for this GPU architecture" |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Just use a %0
, that way we can accept other spellings as well.
if (E->isValueDependent()) | ||
return {E, 1}; | ||
|
||
std::optional<llvm::APSInt> I = llvm::APSInt(64); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
std::optional<llvm::APSInt> I = llvm::APSInt(64); | |
std::optional<llvm::APSInt> I; |
You're immediately replacing this on 5669.
return {E, 1}; | ||
|
||
std::optional<llvm::APSInt> I = llvm::APSInt(64); | ||
if (!(I = E->getIntegerConstantExpr(S.Context))) { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I'd prefer splitting this into 2 lines: Put the E->getIntegerConstantExpr
in the initializer above, then just do !I
here.
// We may need to perform implicit conversion of the argument. | ||
InitializedEntity Entity = InitializedEntity::InitializeParameter( | ||
S.Context, S.Context.getConstType(S.Context.IntTy), /*consume*/ false); | ||
ExprResult ValArg = S.PerformCopyInitialization(Entity, SourceLocation(), E); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
We shouldn't need to do this. We already know this is an integral value, we can just store it as a number.
The AST here can/should just store a `ConstantExpr' (https://clang.llvm.org/doxygen/classclang_1_1ConstantExpr.html) with the value that we have, so that we don't have to do the work again.
} | ||
|
||
void Sema::addNoClusterAttr(Decl *D, const AttributeCommonInfo &CI) { | ||
if (CUDANoClusterAttr *Attr = ::new (Context) CUDANoClusterAttr(Context, CI)) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
We don't really do this?
Instead just do: D->addAttr(CUDANoClusterAttr::Create(...))
.
return nullptr; | ||
} | ||
|
||
return ::new (Context) CUDAClusterDimsAttr(Context, CI, X, Y, Z); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
use CUDAClusterDimsAttr::Create
.
This PR adds basic frontend support for
__cluster_dims__
and__no_cluster__
attribute.In CUDA/HIP programming, the
__cluster_dims__
attribute can be applied to a kernel function to set the dimensions of a thread block cluster. The__no_cluster__
attribute can be applied to a kernel function to indicate that the thread block cluster feature will not be enabled at either compile time or kernel launch time. Note that __no_cluster__
is an LLVM/Clang-only attribute. Co-authored-by: Yaxun (Sam) Liu yaxun.liu@amd.com
Co-authored-by: Jay Foad jay.foad@amd.com