23#include "llvm/ADT/StringRef.h"
24#include "llvm/Frontend/Offloading/Utility.h"
25#include "llvm/IR/BasicBlock.h"
26#include "llvm/IR/Constants.h"
27#include "llvm/IR/DerivedTypes.h"
28#include "llvm/IR/ReplaceConstant.h"
29#include "llvm/Support/Format.h"
30#include "llvm/Support/VirtualFileSystem.h"
33using namespace CodeGen;
36constexpr unsigned CudaFatMagic = 0x466243b1;
37constexpr unsigned HIPFatMagic = 0x48495046;
45 llvm::IntegerType *IntTy, *SizeTy;
47 llvm::PointerType *PtrTy;
50 llvm::LLVMContext &Context;
52 llvm::Module &TheModule;
62 llvm::DenseMap<StringRef, llvm::GlobalValue *> KernelHandles;
64 llvm::DenseMap<llvm::GlobalValue *, llvm::Function *> KernelStubs;
66 llvm::GlobalVariable *Var;
74 llvm::GlobalVariable *GpuBinaryHandle =
nullptr;
76 bool RelocatableDeviceCode;
78 std::unique_ptr<MangleContext> DeviceMC;
80 llvm::FunctionCallee getSetupArgumentFn()
const;
81 llvm::FunctionCallee getLaunchFn()
const;
83 llvm::FunctionType *getRegisterGlobalsFnTy()
const;
84 llvm::FunctionType *getCallbackFnTy()
const;
85 llvm::FunctionType *getRegisterLinkedBinaryFnTy()
const;
86 std::string addPrefixToName(StringRef FuncName)
const;
87 std::string addUnderscoredPrefixToName(StringRef FuncName)
const;
90 llvm::Function *makeRegisterGlobalsFn();
95 llvm::Constant *makeConstantString(
const std::string &Str,
96 const std::string &Name =
"") {
97 return CGM.GetAddrOfConstantCString(Str, Name.c_str()).getPointer();
103 llvm::Constant *makeConstantArray(StringRef Str,
105 StringRef SectionName =
"",
106 unsigned Alignment = 0,
107 bool AddNull =
false) {
108 llvm::Constant *
Value =
109 llvm::ConstantDataArray::getString(Context, Str, AddNull);
110 auto *GV =
new llvm::GlobalVariable(
112 llvm::GlobalValue::PrivateLinkage,
Value, Name);
113 if (!SectionName.empty()) {
114 GV->setSection(SectionName);
117 GV->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::None);
120 GV->setAlignment(llvm::Align(Alignment));
125 llvm::Function *makeDummyFunction(llvm::FunctionType *FnTy) {
126 assert(FnTy->getReturnType()->isVoidTy() &&
127 "Can only generate dummy functions returning void!");
128 llvm::Function *DummyFunc = llvm::Function::Create(
129 FnTy, llvm::GlobalValue::InternalLinkage,
"dummy", &TheModule);
131 llvm::BasicBlock *DummyBlock =
132 llvm::BasicBlock::Create(Context,
"", DummyFunc);
134 FuncBuilder.SetInsertPoint(DummyBlock);
135 FuncBuilder.CreateRetVoid();
147 void registerDeviceVar(
const VarDecl *VD, llvm::GlobalVariable &Var,
148 bool Extern,
bool Constant) {
149 DeviceVars.push_back({&Var,
152 VD->hasAttr<HIPManagedAttr>(),
155 void registerDeviceSurf(
const VarDecl *VD, llvm::GlobalVariable &Var,
156 bool Extern,
int Type) {
157 DeviceVars.push_back({&Var,
163 void registerDeviceTex(
const VarDecl *VD, llvm::GlobalVariable &Var,
164 bool Extern,
int Type,
bool Normalized) {
165 DeviceVars.push_back({&Var,
168 false, Normalized,
Type}});
172 llvm::Function *makeModuleCtorFunction();
174 llvm::Function *makeModuleDtorFunction();
176 void transformManagedVars();
178 void createOffloadingEntries();
184 llvm::Function *
getKernelStub(llvm::GlobalValue *Handle)
override {
185 auto Loc = KernelStubs.find(Handle);
186 assert(
Loc != KernelStubs.end());
191 llvm::GlobalVariable &Var)
override;
194 llvm::GlobalValue::LinkageTypes &
Linkage)
override;
201std::string CGNVCUDARuntime::addPrefixToName(StringRef FuncName)
const {
202 return (Prefix + FuncName).str();
205CGNVCUDARuntime::addUnderscoredPrefixToName(StringRef FuncName)
const {
206 return (
"__" + Prefix + FuncName).str();
216 return std::unique_ptr<MangleContext>(
227 TheModule(CGM.getModule()),
228 RelocatableDeviceCode(CGM.getLangOpts().GPURelocatableDeviceCode),
243llvm::FunctionCallee CGNVCUDARuntime::getSetupArgumentFn()
const {
245 llvm::Type *Params[] = {PtrTy, SizeTy, SizeTy};
247 llvm::FunctionType::get(IntTy, Params,
false),
248 addPrefixToName(
"SetupArgument"));
251llvm::FunctionCallee CGNVCUDARuntime::getLaunchFn()
const {
255 llvm::FunctionType::get(IntTy, PtrTy,
false),
"hipLaunchByPtr");
262llvm::FunctionType *CGNVCUDARuntime::getRegisterGlobalsFnTy()
const {
263 return llvm::FunctionType::get(VoidTy, PtrTy,
false);
266llvm::FunctionType *CGNVCUDARuntime::getCallbackFnTy()
const {
267 return llvm::FunctionType::get(VoidTy, PtrTy,
false);
270llvm::FunctionType *CGNVCUDARuntime::getRegisterLinkedBinaryFnTy()
const {
271 llvm::Type *Params[] = {llvm::PointerType::getUnqual(Context), PtrTy, PtrTy,
272 llvm::PointerType::getUnqual(Context)};
273 return llvm::FunctionType::get(VoidTy, Params,
false);
276std::string CGNVCUDARuntime::getDeviceSideName(
const NamedDecl *ND) {
279 if (
auto *FD = dyn_cast<FunctionDecl>(ND))
280 GD =
GlobalDecl(FD, KernelReferenceKind::Kernel);
283 std::string DeviceSideName;
291 llvm::raw_svector_ostream Out(Buffer);
293 DeviceSideName = std::string(Out.str());
301 llvm::raw_svector_ostream Out(Buffer);
302 Out << DeviceSideName;
304 DeviceSideName = std::string(Out.str());
306 return DeviceSideName;
313 dyn_cast<llvm::GlobalVariable>(KernelHandles[CGF.
CurFn->getName()])) {
314 GV->setLinkage(CGF.
CurFn->getLinkage());
315 GV->setInitializer(CGF.
CurFn);
318 CudaFeature::CUDA_USES_NEW_LAUNCH) ||
321 emitDeviceStubBodyNew(CGF, Args);
323 emitDeviceStubBodyLegacy(CGF, Args);
336 for (
auto &Arg : Args)
338 llvm::StructType *KernelArgsTy = llvm::StructType::create(ArgTypes);
340 auto *Int64Ty = CGF.
Builder.getInt64Ty();
341 KernelLaunchParamsTypes.push_back(Int64Ty);
342 KernelLaunchParamsTypes.push_back(PtrTy);
343 KernelLaunchParamsTypes.push_back(PtrTy);
345 llvm::StructType *KernelLaunchParamsTy =
346 llvm::StructType::create(KernelLaunchParamsTypes);
351 "kernel_launch_params");
353 auto KernelArgsSize = CGM.
getDataLayout().getTypeAllocSize(KernelArgsTy);
361 for (
unsigned i = 0; i < Args.size(); ++i) {
366 return KernelLaunchParams;
376 llvm::ConstantInt::get(SizeTy, std::max<size_t>(1, Args.size())));
378 for (
unsigned i = 0; i < Args.size(); ++i) {
380 llvm::Value *VoidVarPtr = CGF.
Builder.CreatePointerCast(VarPtr, PtrTy);
382 VoidVarPtr, CGF.
Builder.CreateConstGEP1_32(
394 ? prepareKernelArgsLLVMOffload(CGF, Args)
395 : prepareKernelArgs(CGF, Args);
411 std::string KernelLaunchAPI =
"LaunchKernel";
413 LangOptions::GPUDefaultStreamKind::PerThread) {
415 KernelLaunchAPI = KernelLaunchAPI +
"_spt";
417 KernelLaunchAPI = KernelLaunchAPI +
"_ptsz";
419 auto LaunchKernelName = addPrefixToName(KernelLaunchAPI);
423 for (
auto *Result : DC->
lookup(&cudaLaunchKernelII)) {
425 cudaLaunchKernelFD = FD;
428 if (cudaLaunchKernelFD ==
nullptr) {
430 "Can't find declaration for " + LaunchKernelName);
444 llvm::FunctionType::get(IntTy,
450 addUnderscoredPrefixToName(
"PopCallConfiguration"));
459 CGF.
Builder.CreatePointerCast(KernelHandles[CGF.
CurFn->getName()], PtrTy);
475 llvm::FunctionType *FTy = cast<llvm::FunctionType>(Ty);
479 llvm::FunctionCallee cudaLaunchKernelFn =
489 llvm::Function *KernelFunction = llvm::cast<llvm::Function>(
Kernel);
490 std::string GlobalVarName = (KernelFunction->getName() +
".id").str();
492 llvm::GlobalVariable *HandleVar =
493 CGM.
getModule().getNamedGlobal(GlobalVarName);
495 HandleVar =
new llvm::GlobalVariable(
497 false, KernelFunction->getLinkage(),
498 llvm::ConstantInt::get(CGM.
Int8Ty, 0), GlobalVarName);
499 HandleVar->setDSOLocal(KernelFunction->isDSOLocal());
500 HandleVar->setVisibility(KernelFunction->getVisibility());
501 if (KernelFunction->hasComdat())
502 HandleVar->setComdat(CGM.
getModule().getOrInsertComdat(GlobalVarName));
518 llvm::FunctionCallee cudaSetupArgFn = getSetupArgumentFn();
521 for (
const VarDecl *A : Args) {
523 Offset = Offset.alignTo(TInfo.Align);
524 llvm::Value *Args[] = {
527 llvm::ConstantInt::get(SizeTy, TInfo.Width.getQuantity()),
528 llvm::ConstantInt::get(SizeTy, Offset.getQuantity()),
531 llvm::Constant *
Zero = llvm::ConstantInt::get(IntTy, 0);
532 llvm::Value *CBZero = CGF.
Builder.CreateICmpEQ(CB,
Zero);
534 CGF.
Builder.CreateCondBr(CBZero, NextBlock, EndBlock);
536 Offset += TInfo.Width;
540 llvm::FunctionCallee cudaLaunchFn = getLaunchFn();
542 CGF.
Builder.CreatePointerCast(KernelHandles[CGF.
CurFn->getName()], PtrTy);
552 llvm::GlobalVariable *ManagedVar) {
554 for (
auto &&VarUse : Var->uses()) {
555 WorkList.push_back({VarUse.getUser()});
557 while (!WorkList.empty()) {
558 auto &&WorkItem = WorkList.pop_back_val();
559 auto *
U = WorkItem.back();
560 if (isa<llvm::ConstantExpr>(
U)) {
561 for (
auto &&UU :
U->uses()) {
562 WorkItem.push_back(UU.getUser());
563 WorkList.push_back(WorkItem);
568 if (
auto *I = dyn_cast<llvm::Instruction>(
U)) {
569 llvm::Value *OldV = Var;
570 llvm::Instruction *NewV =
new llvm::LoadInst(
571 Var->getType(), ManagedVar,
"ld.managed",
false,
572 llvm::Align(Var->getAlignment()), I->getIterator());
576 for (
auto &&Op : WorkItem) {
577 auto *CE = cast<llvm::ConstantExpr>(Op);
578 auto *NewInst = CE->getAsInstruction();
579 NewInst->insertBefore(*I->getParent(), I->getIterator());
580 NewInst->replaceUsesOfWith(OldV, NewV);
584 I->replaceUsesOfWith(OldV, NewV);
586 llvm_unreachable(
"Invalid use of managed variable");
605llvm::Function *CGNVCUDARuntime::makeRegisterGlobalsFn() {
607 if (EmittedKernels.empty() && DeviceVars.empty())
610 llvm::Function *RegisterKernelsFunc = llvm::Function::Create(
611 getRegisterGlobalsFnTy(), llvm::GlobalValue::InternalLinkage,
612 addUnderscoredPrefixToName(
"_register_globals"), &TheModule);
613 llvm::BasicBlock *EntryBB =
614 llvm::BasicBlock::Create(Context,
"entry", RegisterKernelsFunc);
616 Builder.SetInsertPoint(EntryBB);
620 llvm::Type *RegisterFuncParams[] = {
621 PtrTy, PtrTy, PtrTy, PtrTy, IntTy,
622 PtrTy, PtrTy, PtrTy, PtrTy, llvm::PointerType::getUnqual(Context)};
624 llvm::FunctionType::get(IntTy, RegisterFuncParams,
false),
625 addUnderscoredPrefixToName(
"RegisterFunction"));
630 llvm::Argument &GpuBinaryHandlePtr = *RegisterKernelsFunc->arg_begin();
631 for (
auto &&I : EmittedKernels) {
632 llvm::Constant *KernelName =
633 makeConstantString(getDeviceSideName(cast<NamedDecl>(I.D)));
634 llvm::Constant *NullPtr = llvm::ConstantPointerNull::get(PtrTy);
635 llvm::Value *Args[] = {
637 KernelHandles[I.Kernel->getName()],
640 llvm::ConstantInt::get(IntTy, -1),
645 llvm::ConstantPointerNull::get(llvm::PointerType::getUnqual(Context))};
646 Builder.CreateCall(RegisterFunc, Args);
649 llvm::Type *VarSizeTy = IntTy;
657 llvm::Type *RegisterVarParams[] = {PtrTy, PtrTy, PtrTy, PtrTy,
658 IntTy, VarSizeTy, IntTy, IntTy};
660 llvm::FunctionType::get(VoidTy, RegisterVarParams,
false),
661 addUnderscoredPrefixToName(
"RegisterVar"));
664 llvm::Type *RegisterManagedVarParams[] = {PtrTy, PtrTy, PtrTy,
665 PtrTy, VarSizeTy, IntTy};
667 llvm::FunctionType::get(VoidTy, RegisterManagedVarParams,
false),
668 addUnderscoredPrefixToName(
"RegisterManagedVar"));
672 llvm::FunctionType::get(
673 VoidTy, {PtrTy, PtrTy, PtrTy, PtrTy, IntTy, IntTy},
false),
674 addUnderscoredPrefixToName(
"RegisterSurface"));
678 llvm::FunctionType::get(
679 VoidTy, {PtrTy, PtrTy, PtrTy, PtrTy, IntTy, IntTy, IntTy},
false),
680 addUnderscoredPrefixToName(
"RegisterTexture"));
681 for (
auto &&Info : DeviceVars) {
682 llvm::GlobalVariable *Var = Info.Var;
683 assert((!Var->isDeclaration() || Info.Flags.isManaged()) &&
684 "External variables should not show up here, except HIP managed "
686 llvm::Constant *VarName = makeConstantString(getDeviceSideName(Info.D));
687 switch (Info.Flags.getKind()) {
688 case DeviceVarFlags::Variable: {
691 if (Info.Flags.isManaged()) {
692 assert(Var->getName().ends_with(
".managed") &&
693 "HIP managed variables not transformed");
694 auto *ManagedVar = CGM.
getModule().getNamedGlobal(
695 Var->getName().drop_back(StringRef(
".managed").size()));
696 llvm::Value *Args[] = {
701 llvm::ConstantInt::get(VarSizeTy, VarSize),
702 llvm::ConstantInt::get(IntTy, Var->getAlignment())};
703 if (!Var->isDeclaration())
704 Builder.CreateCall(RegisterManagedVar, Args);
706 llvm::Value *Args[] = {
711 llvm::ConstantInt::get(IntTy, Info.Flags.isExtern()),
712 llvm::ConstantInt::get(VarSizeTy, VarSize),
713 llvm::ConstantInt::get(IntTy, Info.Flags.isConstant()),
714 llvm::ConstantInt::get(IntTy, 0)};
715 Builder.CreateCall(RegisterVar, Args);
719 case DeviceVarFlags::Surface:
722 {&GpuBinaryHandlePtr, Var, VarName, VarName,
723 llvm::ConstantInt::get(IntTy, Info.Flags.getSurfTexType()),
724 llvm::ConstantInt::get(IntTy, Info.Flags.isExtern())});
726 case DeviceVarFlags::Texture:
729 {&GpuBinaryHandlePtr, Var, VarName, VarName,
730 llvm::ConstantInt::get(IntTy, Info.Flags.getSurfTexType()),
731 llvm::ConstantInt::get(IntTy, Info.Flags.isNormalized()),
732 llvm::ConstantInt::get(IntTy, Info.Flags.isExtern())});
737 Builder.CreateRetVoid();
738 return RegisterKernelsFunc;
760llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() {
765 if (CudaGpuBinaryFileName.empty() && !IsHIP)
767 if ((IsHIP || (IsCUDA && !RelocatableDeviceCode)) && EmittedKernels.empty() &&
772 llvm::Function *RegisterGlobalsFunc = makeRegisterGlobalsFn();
775 if (RelocatableDeviceCode && !RegisterGlobalsFunc)
776 RegisterGlobalsFunc = makeDummyFunction(getRegisterGlobalsFnTy());
780 llvm::FunctionType::get(PtrTy, PtrTy,
false),
781 addUnderscoredPrefixToName(
"RegisterFatBinary"));
783 llvm::StructType *FatbinWrapperTy =
784 llvm::StructType::get(IntTy, IntTy, PtrTy, PtrTy);
790 std::unique_ptr<llvm::MemoryBuffer> CudaGpuBinary =
nullptr;
791 if (!CudaGpuBinaryFileName.empty()) {
793 auto CudaGpuBinaryOrErr =
794 VFS->getBufferForFile(CudaGpuBinaryFileName, -1,
false);
795 if (std::error_code EC = CudaGpuBinaryOrErr.getError()) {
797 << CudaGpuBinaryFileName << EC.message();
800 CudaGpuBinary = std::move(CudaGpuBinaryOrErr.get());
803 llvm::Function *ModuleCtorFunc = llvm::Function::Create(
804 llvm::FunctionType::get(VoidTy,
false),
805 llvm::GlobalValue::InternalLinkage,
806 addUnderscoredPrefixToName(
"_module_ctor"), &TheModule);
807 llvm::BasicBlock *CtorEntryBB =
808 llvm::BasicBlock::Create(Context,
"entry", ModuleCtorFunc);
811 CtorBuilder.SetInsertPoint(CtorEntryBB);
813 const char *FatbinConstantName;
814 const char *FatbinSectionName;
815 const char *ModuleIDSectionName;
816 StringRef ModuleIDPrefix;
817 llvm::Constant *FatBinStr;
820 FatbinConstantName =
".hip_fatbin";
821 FatbinSectionName =
".hipFatBinSegment";
823 ModuleIDSectionName =
"__hip_module_id";
824 ModuleIDPrefix =
"__hip_";
829 const unsigned HIPCodeObjectAlign = 4096;
830 FatBinStr = makeConstantArray(std::string(CudaGpuBinary->getBuffer()),
"",
831 FatbinConstantName, HIPCodeObjectAlign);
837 FatBinStr =
new llvm::GlobalVariable(
839 true, llvm::GlobalValue::ExternalLinkage,
nullptr,
843 nullptr, llvm::GlobalVariable::NotThreadLocal);
844 cast<llvm::GlobalVariable>(FatBinStr)->setSection(FatbinConstantName);
847 FatMagic = HIPFatMagic;
849 if (RelocatableDeviceCode)
850 FatbinConstantName = CGM.
getTriple().isMacOSX()
851 ?
"__NV_CUDA,__nv_relfatbin"
855 CGM.
getTriple().isMacOSX() ?
"__NV_CUDA,__nv_fatbin" :
".nv_fatbin";
858 CGM.
getTriple().isMacOSX() ?
"__NV_CUDA,__fatbin" :
".nvFatBinSegment";
860 ModuleIDSectionName = CGM.
getTriple().isMacOSX()
861 ?
"__NV_CUDA,__nv_module_id"
863 ModuleIDPrefix =
"__nv_";
867 FatBinStr = makeConstantArray(std::string(CudaGpuBinary->getBuffer()),
"",
868 FatbinConstantName, 8);
869 FatMagic = CudaFatMagic;
874 auto Values = Builder.beginStruct(FatbinWrapperTy);
876 Values.addInt(IntTy, FatMagic);
878 Values.addInt(IntTy, 1);
880 Values.add(FatBinStr);
882 Values.add(llvm::ConstantPointerNull::get(PtrTy));
883 llvm::GlobalVariable *FatbinWrapper = Values.finishAndCreateGlobal(
886 FatbinWrapper->setSection(FatbinSectionName);
896 auto Linkage = RelocatableDeviceCode ? llvm::GlobalValue::ExternalLinkage
897 : llvm::GlobalValue::InternalLinkage;
898 llvm::BasicBlock *IfBlock =
899 llvm::BasicBlock::Create(Context,
"if", ModuleCtorFunc);
900 llvm::BasicBlock *ExitBlock =
901 llvm::BasicBlock::Create(Context,
"exit", ModuleCtorFunc);
904 GpuBinaryHandle =
new llvm::GlobalVariable(
905 TheModule, PtrTy,
false,
Linkage,
907 !RelocatableDeviceCode ? llvm::ConstantPointerNull::get(PtrTy)
914 if (
Linkage != llvm::GlobalValue::InternalLinkage)
915 GpuBinaryHandle->setVisibility(llvm::GlobalValue::HiddenVisibility);
917 GpuBinaryHandle, PtrTy,
920 auto *HandleValue = CtorBuilder.CreateLoad(GpuBinaryAddr);
921 llvm::Constant *
Zero =
922 llvm::Constant::getNullValue(HandleValue->getType());
923 llvm::Value *EQZero = CtorBuilder.CreateICmpEQ(HandleValue,
Zero);
924 CtorBuilder.CreateCondBr(EQZero, IfBlock, ExitBlock);
927 CtorBuilder.SetInsertPoint(IfBlock);
929 llvm::CallInst *RegisterFatbinCall =
930 CtorBuilder.CreateCall(RegisterFatbinFunc, FatbinWrapper);
931 CtorBuilder.CreateStore(RegisterFatbinCall, GpuBinaryAddr);
932 CtorBuilder.CreateBr(ExitBlock);
935 CtorBuilder.SetInsertPoint(ExitBlock);
937 if (RegisterGlobalsFunc) {
938 auto *HandleValue = CtorBuilder.CreateLoad(GpuBinaryAddr);
939 CtorBuilder.CreateCall(RegisterGlobalsFunc, HandleValue);
942 }
else if (!RelocatableDeviceCode) {
946 llvm::CallInst *RegisterFatbinCall =
947 CtorBuilder.CreateCall(RegisterFatbinFunc, FatbinWrapper);
948 GpuBinaryHandle =
new llvm::GlobalVariable(
949 TheModule, PtrTy,
false, llvm::GlobalValue::InternalLinkage,
950 llvm::ConstantPointerNull::get(PtrTy),
"__cuda_gpubin_handle");
952 CtorBuilder.CreateAlignedStore(RegisterFatbinCall, GpuBinaryHandle,
956 if (RegisterGlobalsFunc)
957 CtorBuilder.CreateCall(RegisterGlobalsFunc, RegisterFatbinCall);
961 CudaFeature::CUDA_USES_FATBIN_REGISTER_END)) {
964 llvm::FunctionType::get(VoidTy, PtrTy,
false),
965 "__cudaRegisterFatBinaryEnd");
966 CtorBuilder.CreateCall(RegisterFatbinEndFunc, RegisterFatbinCall);
971 llvm::raw_svector_ostream OS(ModuleID);
972 OS << ModuleIDPrefix << llvm::format(
"%" PRIx64, FatbinWrapper->getGUID());
973 llvm::Constant *ModuleIDConstant = makeConstantArray(
974 std::string(ModuleID),
"", ModuleIDSectionName, 32,
true);
977 llvm::GlobalAlias::create(llvm::GlobalValue::ExternalLinkage,
978 Twine(
"__fatbinwrap") + ModuleID, FatbinWrapper);
983 RegisterLinkedBinaryName += ModuleID;
985 getRegisterLinkedBinaryFnTy(), RegisterLinkedBinaryName);
987 assert(RegisterGlobalsFunc &&
"Expecting at least dummy function!");
988 llvm::Value *Args[] = {RegisterGlobalsFunc, FatbinWrapper, ModuleIDConstant,
989 makeDummyFunction(getCallbackFnTy())};
990 CtorBuilder.CreateCall(RegisterLinkedBinaryFunc, Args);
996 if (llvm::Function *CleanupFn = makeModuleDtorFunction()) {
998 llvm::FunctionType *AtExitTy =
999 llvm::FunctionType::get(IntTy, CleanupFn->getType(),
false);
1000 llvm::FunctionCallee AtExitFunc =
1003 CtorBuilder.CreateCall(AtExitFunc, CleanupFn);
1006 CtorBuilder.CreateRetVoid();
1007 return ModuleCtorFunc;
1029llvm::Function *CGNVCUDARuntime::makeModuleDtorFunction() {
1031 if (!GpuBinaryHandle)
1036 llvm::FunctionType::get(VoidTy, PtrTy,
false),
1037 addUnderscoredPrefixToName(
"UnregisterFatBinary"));
1039 llvm::Function *ModuleDtorFunc = llvm::Function::Create(
1040 llvm::FunctionType::get(VoidTy,
false),
1041 llvm::GlobalValue::InternalLinkage,
1042 addUnderscoredPrefixToName(
"_module_dtor"), &TheModule);
1044 llvm::BasicBlock *DtorEntryBB =
1045 llvm::BasicBlock::Create(Context,
"entry", ModuleDtorFunc);
1047 DtorBuilder.SetInsertPoint(DtorEntryBB);
1050 GpuBinaryHandle, GpuBinaryHandle->getValueType(),
1052 auto *HandleValue = DtorBuilder.CreateLoad(GpuBinaryAddr);
1057 llvm::BasicBlock *IfBlock =
1058 llvm::BasicBlock::Create(Context,
"if", ModuleDtorFunc);
1059 llvm::BasicBlock *ExitBlock =
1060 llvm::BasicBlock::Create(Context,
"exit", ModuleDtorFunc);
1061 llvm::Constant *
Zero = llvm::Constant::getNullValue(HandleValue->getType());
1062 llvm::Value *NEZero = DtorBuilder.CreateICmpNE(HandleValue,
Zero);
1063 DtorBuilder.CreateCondBr(NEZero, IfBlock, ExitBlock);
1065 DtorBuilder.SetInsertPoint(IfBlock);
1066 DtorBuilder.CreateCall(UnregisterFatbinFunc, HandleValue);
1067 DtorBuilder.CreateStore(
Zero, GpuBinaryAddr);
1068 DtorBuilder.CreateBr(ExitBlock);
1070 DtorBuilder.SetInsertPoint(ExitBlock);
1072 DtorBuilder.CreateCall(UnregisterFatbinFunc, HandleValue);
1074 DtorBuilder.CreateRetVoid();
1075 return ModuleDtorFunc;
1079 return new CGNVCUDARuntime(CGM);
1082void CGNVCUDARuntime::internalizeDeviceSideVar(
1101 D->getType()->isCUDADeviceBuiltinSurfaceType() ||
1102 D->getType()->isCUDADeviceBuiltinTextureType()) {
1103 Linkage = llvm::GlobalValue::InternalLinkage;
1107void CGNVCUDARuntime::handleVarRegistration(
const VarDecl *
D,
1108 llvm::GlobalVariable &GV) {
1123 if ((!
D->hasExternalStorage() && !
D->isInline()) ||
1126 registerDeviceVar(
D, GV, !
D->hasDefinition(),
1129 }
else if (
D->getType()->isCUDADeviceBuiltinSurfaceType() ||
1130 D->getType()->isCUDADeviceBuiltinTextureType()) {
1133 const auto *TD = cast<ClassTemplateSpecializationDecl>(
1134 D->getType()->castAsCXXRecordDecl());
1136 if (TD->hasAttr<CUDADeviceBuiltinSurfaceTypeAttr>()) {
1137 assert(Args.
size() == 2 &&
1138 "Unexpected number of template arguments of CUDA device "
1139 "builtin surface type.");
1140 auto SurfType = Args[1].getAsIntegral();
1141 if (!
D->hasExternalStorage())
1142 registerDeviceSurf(
D, GV, !
D->hasDefinition(), SurfType.getSExtValue());
1144 assert(Args.
size() == 3 &&
1145 "Unexpected number of template arguments of CUDA device "
1146 "builtin texture type.");
1147 auto TexType = Args[1].getAsIntegral();
1148 auto Normalized = Args[2].getAsIntegral();
1149 if (!
D->hasExternalStorage())
1150 registerDeviceTex(
D, GV, !
D->hasDefinition(), TexType.getSExtValue(),
1151 Normalized.getZExtValue());
1160void CGNVCUDARuntime::transformManagedVars() {
1161 for (
auto &&Info : DeviceVars) {
1162 llvm::GlobalVariable *Var = Info.Var;
1163 if (Info.Flags.getKind() == DeviceVarFlags::Variable &&
1164 Info.Flags.isManaged()) {
1165 auto *ManagedVar =
new llvm::GlobalVariable(
1167 false, Var->getLinkage(),
1168 Var->isDeclaration()
1170 : llvm::ConstantPointerNull::get(Var->getType()),
1172 llvm::GlobalVariable::NotThreadLocal,
1174 ? LangAS::cuda_device
1175 : LangAS::Default));
1176 ManagedVar->setDSOLocal(Var->isDSOLocal());
1177 ManagedVar->setVisibility(Var->getVisibility());
1178 ManagedVar->setExternallyInitialized(
true);
1180 ManagedVar->takeName(Var);
1181 Var->setName(Twine(ManagedVar->getName()) +
".managed");
1184 if (CGM.
getLangOpts().CUDAIsDevice && !Var->isDeclaration()) {
1185 assert(!ManagedVar->isDeclaration());
1196void CGNVCUDARuntime::createOffloadingEntries() {
1198 ? llvm::object::OffloadKind::OFK_HIP
1199 : llvm::object::OffloadKind::OFK_Cuda;
1202 Kind = llvm::object::OffloadKind::OFK_OpenMP;
1205 for (KernelInfo &I : EmittedKernels)
1206 llvm::offloading::emitOffloadingEntry(
1207 M, Kind, KernelHandles[I.Kernel->getName()],
1208 getDeviceSideName(cast<NamedDecl>(I.D)), 0, 0,
1209 llvm::offloading::OffloadGlobalEntry);
1211 for (VarInfo &I : DeviceVars) {
1213 CGM.
getDataLayout().getTypeAllocSize(I.Var->getValueType());
1216 ?
static_cast<int32_t>(llvm::offloading::OffloadGlobalExtern)
1218 (I.Flags.isConstant()
1219 ?
static_cast<int32_t>(llvm::offloading::OffloadGlobalConstant)
1221 (I.Flags.isNormalized()
1222 ?
static_cast<int32_t>(llvm::offloading::OffloadGlobalNormalized)
1224 if (I.Flags.getKind() == DeviceVarFlags::Variable) {
1225 if (I.Flags.isManaged()) {
1226 assert(I.Var->getName().ends_with(
".managed") &&
1227 "HIP managed variables not transformed");
1229 auto *ManagedVar = M.getNamedGlobal(
1230 I.Var->getName().drop_back(StringRef(
".managed").size()));
1231 llvm::offloading::emitOffloadingEntry(
1232 M, Kind, I.Var, getDeviceSideName(I.D), VarSize,
1233 llvm::offloading::OffloadGlobalManagedEntry | Flags,
1234 I.Var->getAlignment(), ManagedVar);
1236 llvm::offloading::emitOffloadingEntry(
1237 M, Kind, I.Var, getDeviceSideName(I.D), VarSize,
1238 llvm::offloading::OffloadGlobalEntry | Flags,
1241 }
else if (I.Flags.getKind() == DeviceVarFlags::Surface) {
1242 llvm::offloading::emitOffloadingEntry(
1243 M, Kind, I.Var, getDeviceSideName(I.D), VarSize,
1244 llvm::offloading::OffloadGlobalSurfaceEntry | Flags,
1245 I.Flags.getSurfTexType());
1246 }
else if (I.Flags.getKind() == DeviceVarFlags::Texture) {
1247 llvm::offloading::emitOffloadingEntry(
1248 M, Kind, I.Var, getDeviceSideName(I.D), VarSize,
1249 llvm::offloading::OffloadGlobalTextureEntry | Flags,
1250 I.Flags.getSurfTexType());
1256llvm::Function *CGNVCUDARuntime::finalizeModule() {
1257 transformManagedVars();
1269 for (
auto &&Info : DeviceVars) {
1270 auto Kind = Info.Flags.getKind();
1271 if (!Info.Var->isDeclaration() &&
1272 !llvm::GlobalValue::isLocalLinkage(Info.Var->getLinkage()) &&
1273 (Kind == DeviceVarFlags::Variable ||
1274 Kind == DeviceVarFlags::Surface ||
1275 Kind == DeviceVarFlags::Texture) &&
1276 Info.D->isUsed() && !Info.D->hasAttr<UsedAttr>()) {
1284 (CGM.
getLangOpts().HIP || RelocatableDeviceCode)))
1285 createOffloadingEntries();
1287 return makeModuleCtorFunction();
1292llvm::GlobalValue *CGNVCUDARuntime::getKernelHandle(llvm::Function *F,
1294 auto Loc = KernelHandles.find(F->getName());
1295 if (
Loc != KernelHandles.end()) {
1296 auto OldHandle =
Loc->second;
1297 if (KernelStubs[OldHandle] == F)
1305 KernelStubs[OldHandle] = F;
1310 KernelStubs.erase(OldHandle);
1314 KernelHandles[F->getName()] = F;
1319 auto *Var =
new llvm::GlobalVariable(
1320 TheModule, F->getType(),
true, F->getLinkage(),
1325 Var->setDSOLocal(F->isDSOLocal());
1326 Var->setVisibility(F->getVisibility());
1327 auto *FD = cast<FunctionDecl>(GD.
getDecl());
1328 auto *FT = FD->getPrimaryTemplate();
1329 if (!FT || FT->isThisDeclarationADefinition())
1331 KernelHandles[F->getName()] = Var;
1332 KernelStubs[Var] = F;
static std::unique_ptr< MangleContext > InitDeviceMC(CodeGenModule &CGM)
static void replaceManagedVar(llvm::GlobalVariable *Var, llvm::GlobalVariable *ManagedVar)
TranslationUnitDecl * getTranslationUnitDecl() const
MangleContext * createMangleContext(const TargetInfo *T=nullptr)
If T is null pointer, assume the target in ASTContext.
bool shouldExternalize(const Decl *D) const
Whether a C++ static variable or CUDA/HIP kernel should be externalized.
StringRef getCUIDHash() const
const TargetInfo * getAuxTargetInfo() const
llvm::DenseSet< const VarDecl * > CUDADeviceVarODRUsedByHost
Keep track of CUDA/HIP device-side variables ODR-used by host code.
MangleContext * createDeviceMangleContext(const TargetInfo &T)
Creates a device mangle context to correctly mangle lambdas in a mixed architecture compile by settin...
TypeInfoChars getTypeInfoInChars(const Type *T) const
const TargetInfo & getTargetInfo() const
unsigned getTargetAddressSpace(LangAS AS) const
CharUnits - This is an opaque type for sizes expressed in character units.
llvm::Align getAsAlign() const
getAsAlign - Returns Quantity as a valid llvm::Align. Beware: llvm::Align assumes power of two 8-bit b...
static CharUnits One()
One - Construct a CharUnits quantity of one.
static CharUnits fromQuantity(QuantityType Quantity)
fromQuantity - Construct a CharUnits quantity from a raw integer type.
static CharUnits Zero()
Zero - Construct a CharUnits quantity of zero.
std::string CudaGpuBinaryFileName
Name of file passed with -fcuda-include-gpubinary option to forward to CUDA runtime back-end for inco...
Like RawAddress, an abstract representation of an aligned address, but the pointer contained in this ...
llvm::Value * emitRawPointer(CodeGenFunction &CGF) const
Return the pointer contained in this class after authenticating it and adding offset to it if necessa...
llvm::PointerType * getType() const
Return the type of the pointer value.
llvm::StoreInst * CreateStore(llvm::Value *Val, Address Addr, bool IsVolatile=false)
llvm::StoreInst * CreateAlignedStore(llvm::Value *Val, llvm::Value *Addr, CharUnits Align, bool IsVolatile=false)
llvm::StoreInst * CreateDefaultAlignedStore(llvm::Value *Val, llvm::Value *Addr, bool IsVolatile=false)
Address CreateStructGEP(Address Addr, unsigned Index, const llvm::Twine &Name="")
llvm::LoadInst * CreateLoad(Address Addr, const llvm::Twine &Name="")
virtual std::string getDeviceSideName(const NamedDecl *ND)=0
Returns function or variable name on device side even if the current compilation is for host.
virtual void emitDeviceStub(CodeGenFunction &CGF, FunctionArgList &Args)=0
Emits a kernel launch stub.
virtual llvm::Function * getKernelStub(llvm::GlobalValue *Handle)=0
Get kernel stub by kernel handle.
virtual void handleVarRegistration(const VarDecl *VD, llvm::GlobalVariable &Var)=0
Check whether a variable is a device variable and register it if true.
virtual llvm::Function * finalizeModule()=0
Finalize generated LLVM module.
virtual llvm::GlobalValue * getKernelHandle(llvm::Function *Stub, GlobalDecl GD)=0
Get kernel handle by stub function.
virtual void internalizeDeviceSideVar(const VarDecl *D, llvm::GlobalValue::LinkageTypes &Linkage)=0
Adjust linkage of shadow variables in host compilation.
MangleContext & getMangleContext()
Gets the mangle context.
static CGCallee forDirect(llvm::Constant *functionPtr, const CGCalleeInfo &abstractInfo=CGCalleeInfo())
CGFunctionInfo - Class to encapsulate the information about a function definition.
CallArgList - Type for representing both the value and type of arguments in a call.
void add(RValue rvalue, QualType type)
CodeGenFunction - This class organizes the per-function state that is used while generating LLVM code...
llvm::CallBase * EmitRuntimeCallOrInvoke(llvm::FunctionCallee callee, ArrayRef< llvm::Value * > args, const Twine &name="")
Emits a call or invoke instruction to the given runtime function.
llvm::BasicBlock * createBasicBlock(const Twine &name="", llvm::Function *parent=nullptr, llvm::BasicBlock *before=nullptr)
createBasicBlock - Create an LLVM basic block.
const LangOptions & getLangOpts() const
llvm::AllocaInst * CreateTempAlloca(llvm::Type *Ty, const Twine &Name="tmp", llvm::Value *ArraySize=nullptr)
CreateTempAlloca - This creates an alloca and inserts it into the entry block if ArraySize is nullptr...
RValue EmitCall(const CGFunctionInfo &CallInfo, const CGCallee &Callee, ReturnValueSlot ReturnValue, const CallArgList &Args, llvm::CallBase **CallOrInvoke, bool IsMustTail, SourceLocation Loc, bool IsVirtualFunctionPointerThunk=false)
EmitCall - Generate a call of the given function, expecting the given result type,...
RawAddress CreateTempAllocaWithoutCast(llvm::Type *Ty, CharUnits align, const Twine &Name="tmp", llvm::Value *ArraySize=nullptr)
CreateTempAlloca - This creates an alloca and inserts it into the entry block.
const Decl * CurFuncDecl
CurFuncDecl - Holds the Decl for the current outermost non-closure context.
llvm::Type * ConvertTypeForMem(QualType T)
void EmitBranch(llvm::BasicBlock *Block)
EmitBranch - Emit a branch to the specified basic block from the current insert block,...
RawAddress CreateMemTemp(QualType T, const Twine &Name="tmp", RawAddress *Alloca=nullptr)
CreateMemTemp - Create a temporary memory object of the given type, with appropriate alignment and cas...
Address GetAddrOfLocalVar(const VarDecl *VD)
GetAddrOfLocalVar - Return the address of a local variable.
void EmitBlock(llvm::BasicBlock *BB, bool IsFinished=false)
EmitBlock - Emit the given block.
This class organizes the cross-function state that is used while generating LLVM code.
llvm::Module & getModule() const
llvm::FunctionCallee CreateRuntimeFunction(llvm::FunctionType *Ty, StringRef Name, llvm::AttributeList ExtraAttrs=llvm::AttributeList(), bool Local=false, bool AssumeConvergent=false)
Create or return a runtime function declaration with the specified type and name.
void addCompilerUsedGlobal(llvm::GlobalValue *GV)
Add a global to a list to be added to the llvm.compiler.used metadata.
const IntrusiveRefCntPtr< llvm::vfs::FileSystem > & getFileSystem() const
DiagnosticsEngine & getDiags() const
const LangOptions & getLangOpts() const
CodeGenTypes & getTypes()
const TargetInfo & getTarget() const
const llvm::DataLayout & getDataLayout() const
void Error(SourceLocation loc, StringRef error)
Emit a general error that something can't be done.
CGCXXABI & getCXXABI() const
const llvm::Triple & getTriple() const
ASTContext & getContext() const
const CodeGenOptions & getCodeGenOpts() const
StringRef getMangledName(GlobalDecl GD)
void maybeSetTrivialComdat(const Decl &D, llvm::GlobalObject &GO)
void printPostfixForExternalizedDecl(llvm::raw_ostream &OS, const Decl *D) const
Print the postfix for externalized static variable or kernels for single source offloading languages ...
llvm::Type * ConvertType(QualType T)
ConvertType - Convert type T into a llvm::Type.
const CGFunctionInfo & arrangeFunctionDeclaration(const GlobalDecl GD)
Free functions are functions that are compatible with an ordinary C function pointer type.
The standard implementation of ConstantInitBuilder used in Clang.
FunctionArgList - Type for representing both the decl and type of parameters to a function.
static RValue get(llvm::Value *V)
static RValue getAggregate(Address addr, bool isVolatile=false)
Convert an Address to an RValue.
ReturnValueSlot - Contains the address where the return value of a function can be stored,...
DeclContext - This is used only as base class of specific decl types that can act as declaration cont...
lookup_result lookup(DeclarationName Name) const
lookup - Find the declarations (if any) with the given Name in this context.
Decl - This represents one declaration (or definition), e.g.
SourceLocation getLocation() const
DiagnosticBuilder Report(SourceLocation Loc, unsigned DiagID)
Issue the message to the client.
Represents a function declaration or definition.
const ParmVarDecl * getParamDecl(unsigned i) const
GlobalDecl - represents a global declaration.
GlobalDecl getWithKernelReferenceKind(KernelReferenceKind Kind)
const Decl * getDecl() const
One of these records is kept for each identifier that is lexed.
StringRef getName() const
Return the actual identifier string.
IdentifierInfo & get(StringRef Name)
Return the identifier token info for the specified named identifier.
std::string CUID
The user provided compilation unit ID, if non-empty.
GPUDefaultStreamKind GPUDefaultStream
The default stream kind used for HIP kernel launching.
MangleContext - Context for tracking state which persists across multiple calls to the C++ name mangl...
bool shouldMangleDeclName(const NamedDecl *D)
void mangleName(GlobalDecl GD, raw_ostream &)
This represents a decl that may have a name.
IdentifierInfo * getIdentifier() const
Get the identifier that names this declaration, if there is one.
Represents a parameter to a function.
A (possibly-)qualified type.
QualType getCanonicalType() const
bool isMicrosoft() const
Is this ABI an MSVC-compatible ABI?
bool isItaniumFamily() const
Does this ABI generally fall into the Itanium family of ABIs?
TargetCXXABI getCXXABI() const
Get the C++ ABI currently in use.
const llvm::VersionTuple & getSDKVersion() const
A template argument list.
unsigned size() const
Retrieve the number of template arguments in this template argument list.
The top declaration context.
static DeclContext * castToDeclContext(const TranslationUnitDecl *D)
The base class of the type hierarchy.
Represents a variable declaration or definition.
CGCUDARuntime * CreateNVCUDARuntime(CodeGenModule &CGM)
Creates an instance of a CUDA runtime class.
The JSON file list parser is used to communicate input to InstallAPI.
CudaVersion ToCudaVersion(llvm::VersionTuple)
bool CudaFeatureEnabled(llvm::VersionTuple, CudaFeature)
Linkage
Describes the different kinds of linkage (C++ [basic.link], C99 6.2.2) that an entity may have.
llvm::IntegerType * Int8Ty
i8, i16, i32, and i64
llvm::IntegerType * SizeTy
llvm::IntegerType * IntTy
int
CharUnits getSizeAlign() const
llvm::PointerType * UnqualPtrTy
CharUnits getPointerAlign() const