Skip to content

Commit 2af74e2

Browse files
committed
[MS] Overhaul how clang passes overaligned args on x86_32
MSVC 2013 would refuse to pass highly aligned things (typically vectors and aggregates) by value. Users would receive this error: t.cpp(11) : error C2719: 'w': formal parameter with __declspec(align('32')) won't be aligned t.cpp(11) : error C2719: 'q': formal parameter with __declspec(align('32')) won't be aligned However, in MSVC 2015, this behavior was changed, and highly aligned things are now passed indirectly. To avoid breaking backwards incompatibility, objects that do not have a *required* high alignment (i.e. double) are still passed directly, even though they are not naturally aligned. This change implements the new behavior of passing things indirectly. The new behavior is: - up to three vector parameters can be passed in [XYZ]MM0-2 - remaining arguments with required alignment greater than 4 bytes are passed indirectly Previously, MSVC never passed things truly indirectly, meaning clang would always apply the byval attribute to indirect arguments. We had to go to the trouble of adding inalloca so that non-trivially copyable C++ types could be passed in place without copying the object representation. When inalloca was added, we asserted that all arguments passed indirectly must use byval. With this change, that assert no longer holds, and I had to update inalloca to handle that case. The implicit sret pointer parameter was already handled this way, and this change generalizes some of that logic to arguments. There are two cases that this change leaves unfixed: 1. objects that are non-trivially copyable *and* overaligned 2. vectorcall + inalloca + vectors For case 1, I need to touch C++ ABI code in MicrosoftCXXABI.cpp, so I want to do it in a follow-up. For case 2, my fix is one line, but it will require updating IR tests to use lots of inreg, so I wanted to separate it out. Related to D71915 and D72110 Fixes most of PR44395 Reviewed By: rjmccall, craig.topper, erichkeane Differential Revision: https://reviews.llvm.org/D72114
1 parent 44b4967 commit 2af74e2

File tree

6 files changed

+269
-33
lines changed

6 files changed

+269
-33
lines changed

clang/include/clang/CodeGen/CGFunctionInfo.h

Lines changed: 14 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -88,6 +88,7 @@ class ABIArgInfo {
8888
Kind TheKind;
8989
bool PaddingInReg : 1;
9090
bool InAllocaSRet : 1; // isInAlloca()
91+
bool InAllocaIndirect : 1;// isInAlloca()
9192
bool IndirectByVal : 1; // isIndirect()
9293
bool IndirectRealign : 1; // isIndirect()
9394
bool SRetAfterThis : 1; // isIndirect()
@@ -110,8 +111,8 @@ class ABIArgInfo {
110111

111112
public:
112113
ABIArgInfo(Kind K = Direct)
113-
: TypeData(nullptr), PaddingType(nullptr), DirectOffset(0),
114-
TheKind(K), PaddingInReg(false), InAllocaSRet(false),
114+
: TypeData(nullptr), PaddingType(nullptr), DirectOffset(0), TheKind(K),
115+
PaddingInReg(false), InAllocaSRet(false), InAllocaIndirect(false),
115116
IndirectByVal(false), IndirectRealign(false), SRetAfterThis(false),
116117
InReg(false), CanBeFlattened(false), SignExt(false) {}
117118

@@ -185,9 +186,10 @@ class ABIArgInfo {
185186
AI.setInReg(true);
186187
return AI;
187188
}
188-
static ABIArgInfo getInAlloca(unsigned FieldIndex) {
189+
static ABIArgInfo getInAlloca(unsigned FieldIndex, bool Indirect = false) {
189190
auto AI = ABIArgInfo(InAlloca);
190191
AI.setInAllocaFieldIndex(FieldIndex);
192+
AI.setInAllocaIndirect(Indirect);
191193
return AI;
192194
}
193195
static ABIArgInfo getExpand() {
@@ -380,6 +382,15 @@ class ABIArgInfo {
380382
AllocaFieldIndex = FieldIndex;
381383
}
382384

385+
unsigned getInAllocaIndirect() const {
386+
assert(isInAlloca() && "Invalid kind!");
387+
return InAllocaIndirect;
388+
}
389+
void setInAllocaIndirect(bool Indirect) {
390+
assert(isInAlloca() && "Invalid kind!");
391+
InAllocaIndirect = Indirect;
392+
}
393+
383394
/// Return true if this field of an inalloca struct should be returned
384395
/// to implement a struct return calling convention.
385396
bool getInAllocaSRet() const {

clang/lib/CodeGen/CGCall.cpp

Lines changed: 30 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -2339,6 +2339,9 @@ void CodeGenFunction::EmitFunctionProlog(const CGFunctionInfo &FI,
23392339
auto FieldIndex = ArgI.getInAllocaFieldIndex();
23402340
Address V =
23412341
Builder.CreateStructGEP(ArgStruct, FieldIndex, Arg->getName());
2342+
if (ArgI.getInAllocaIndirect())
2343+
V = Address(Builder.CreateLoad(V),
2344+
getContext().getTypeAlignInChars(Ty));
23422345
ArgVals.push_back(ParamValue::forIndirect(V));
23432346
break;
23442347
}
@@ -4038,18 +4041,39 @@ RValue CodeGenFunction::EmitCall(const CGFunctionInfo &CallInfo,
40384041
assert(NumIRArgs == 0);
40394042
assert(getTarget().getTriple().getArch() == llvm::Triple::x86);
40404043
if (I->isAggregate()) {
4041-
// Replace the placeholder with the appropriate argument slot GEP.
40424044
Address Addr = I->hasLValue()
40434045
? I->getKnownLValue().getAddress(*this)
40444046
: I->getKnownRValue().getAggregateAddress();
40454047
llvm::Instruction *Placeholder =
40464048
cast<llvm::Instruction>(Addr.getPointer());
4047-
CGBuilderTy::InsertPoint IP = Builder.saveIP();
4048-
Builder.SetInsertPoint(Placeholder);
4049-
Addr =
4050-
Builder.CreateStructGEP(ArgMemory, ArgInfo.getInAllocaFieldIndex());
4051-
Builder.restoreIP(IP);
4049+
4050+
if (!ArgInfo.getInAllocaIndirect()) {
4051+
// Replace the placeholder with the appropriate argument slot GEP.
4052+
CGBuilderTy::InsertPoint IP = Builder.saveIP();
4053+
Builder.SetInsertPoint(Placeholder);
4054+
Addr = Builder.CreateStructGEP(ArgMemory,
4055+
ArgInfo.getInAllocaFieldIndex());
4056+
Builder.restoreIP(IP);
4057+
} else {
4058+
// For indirect things such as overaligned structs, replace the
4059+
// placeholder with a regular aggregate temporary alloca. Store the
4060+
// address of this alloca into the struct.
4061+
Addr = CreateMemTemp(info_it->type, "inalloca.indirect.tmp");
4062+
Address ArgSlot = Builder.CreateStructGEP(
4063+
ArgMemory, ArgInfo.getInAllocaFieldIndex());
4064+
Builder.CreateStore(Addr.getPointer(), ArgSlot);
4065+
}
40524066
deferPlaceholderReplacement(Placeholder, Addr.getPointer());
4067+
} else if (ArgInfo.getInAllocaIndirect()) {
4068+
// Make a temporary alloca and store the address of it into the argument
4069+
// struct.
4070+
Address Addr = CreateMemTempWithoutCast(
4071+
I->Ty, getContext().getTypeAlignInChars(I->Ty),
4072+
"indirect-arg-temp");
4073+
I->copyInto(*this, Addr);
4074+
Address ArgSlot =
4075+
Builder.CreateStructGEP(ArgMemory, ArgInfo.getInAllocaFieldIndex());
4076+
Builder.CreateStore(Addr.getPointer(), ArgSlot);
40534077
} else {
40544078
// Store the RValue into the argument struct.
40554079
Address Addr =

clang/lib/CodeGen/TargetInfo.cpp

Lines changed: 50 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -1676,6 +1676,7 @@ ABIArgInfo X86_32ABIInfo::classifyArgumentType(QualType Ty,
16761676
bool IsVectorCall = State.CC == llvm::CallingConv::X86_VectorCall;
16771677

16781678
Ty = useFirstFieldIfTransparentUnion(Ty);
1679+
TypeInfo TI = getContext().getTypeInfo(Ty);
16791680

16801681
// Check with the C++ ABI first.
16811682
const RecordType *RT = Ty->getAs<RecordType>();
@@ -1725,7 +1726,7 @@ ABIArgInfo X86_32ABIInfo::classifyArgumentType(QualType Ty,
17251726
bool NeedsPadding = false;
17261727
bool InReg;
17271728
if (shouldAggregateUseDirect(Ty, State, InReg, NeedsPadding)) {
1728-
unsigned SizeInRegs = (getContext().getTypeSize(Ty) + 31) / 32;
1729+
unsigned SizeInRegs = (TI.Width + 31) / 32;
17291730
SmallVector<llvm::Type*, 3> Elements(SizeInRegs, Int32);
17301731
llvm::Type *Result = llvm::StructType::get(LLVMContext, Elements);
17311732
if (InReg)
@@ -1735,29 +1736,44 @@ ABIArgInfo X86_32ABIInfo::classifyArgumentType(QualType Ty,
17351736
}
17361737
llvm::IntegerType *PaddingType = NeedsPadding ? Int32 : nullptr;
17371738

1739+
// Pass over-aligned aggregates on Windows indirectly. This behavior was
1740+
// added in MSVC 2015.
1741+
if (IsWin32StructABI && TI.AlignIsRequired && TI.Align > 32)
1742+
return getIndirectResult(Ty, /*ByVal=*/false, State);
1743+
17381744
// Expand small (<= 128-bit) record types when we know that the stack layout
17391745
// of those arguments will match the struct. This is important because the
17401746
// LLVM backend isn't smart enough to remove byval, which inhibits many
17411747
// optimizations.
17421748
// Don't do this for the MCU if there are still free integer registers
17431749
// (see X86_64 ABI for full explanation).
1744-
if (getContext().getTypeSize(Ty) <= 4 * 32 &&
1745-
(!IsMCUABI || State.FreeRegs == 0) && canExpandIndirectArgument(Ty))
1750+
if (TI.Width <= 4 * 32 && (!IsMCUABI || State.FreeRegs == 0) &&
1751+
canExpandIndirectArgument(Ty))
17461752
return ABIArgInfo::getExpandWithPadding(
17471753
IsFastCall || IsVectorCall || IsRegCall, PaddingType);
17481754

17491755
return getIndirectResult(Ty, true, State);
17501756
}
17511757

17521758
if (const VectorType *VT = Ty->getAs<VectorType>()) {
1759+
// On Windows, vectors are passed directly if registers are available, or
1760+
// indirectly if not. This avoids the need to align argument memory. Pass
1761+
// user-defined vector types larger than 512 bits indirectly for simplicity.
1762+
if (IsWin32StructABI) {
1763+
if (TI.Width <= 512 && State.FreeSSERegs > 0) {
1764+
--State.FreeSSERegs;
1765+
return ABIArgInfo::getDirectInReg();
1766+
}
1767+
return getIndirectResult(Ty, /*ByVal=*/false, State);
1768+
}
1769+
17531770
// On Darwin, some vectors are passed in memory, we handle this by passing
17541771
// it as an i8/i16/i32/i64.
17551772
if (IsDarwinVectorABI) {
1756-
uint64_t Size = getContext().getTypeSize(Ty);
1757-
if ((Size == 8 || Size == 16 || Size == 32) ||
1758-
(Size == 64 && VT->getNumElements() == 1))
1759-
return ABIArgInfo::getDirect(llvm::IntegerType::get(getVMContext(),
1760-
Size));
1773+
if ((TI.Width == 8 || TI.Width == 16 || TI.Width == 32) ||
1774+
(TI.Width == 64 && VT->getNumElements() == 1))
1775+
return ABIArgInfo::getDirect(
1776+
llvm::IntegerType::get(getVMContext(), TI.Width));
17611777
}
17621778

17631779
if (IsX86_MMXType(CGT.ConvertType(Ty)))
@@ -1787,16 +1803,22 @@ void X86_32ABIInfo::computeInfo(CGFunctionInfo &FI) const {
17871803
CCState State(FI);
17881804
if (IsMCUABI)
17891805
State.FreeRegs = 3;
1790-
else if (State.CC == llvm::CallingConv::X86_FastCall)
1806+
else if (State.CC == llvm::CallingConv::X86_FastCall) {
17911807
State.FreeRegs = 2;
1792-
else if (State.CC == llvm::CallingConv::X86_VectorCall) {
1808+
State.FreeSSERegs = 3;
1809+
} else if (State.CC == llvm::CallingConv::X86_VectorCall) {
17931810
State.FreeRegs = 2;
17941811
State.FreeSSERegs = 6;
17951812
} else if (FI.getHasRegParm())
17961813
State.FreeRegs = FI.getRegParm();
17971814
else if (State.CC == llvm::CallingConv::X86_RegCall) {
17981815
State.FreeRegs = 5;
17991816
State.FreeSSERegs = 8;
1817+
} else if (IsWin32StructABI) {
1818+
// Since MSVC 2015, the first three SSE vectors have been passed in
1819+
// registers. The rest are passed indirectly.
1820+
State.FreeRegs = DefaultNumRegisterParameters;
1821+
State.FreeSSERegs = 3;
18001822
} else
18011823
State.FreeRegs = DefaultNumRegisterParameters;
18021824

@@ -1843,16 +1865,25 @@ X86_32ABIInfo::addFieldToArgStruct(SmallVector<llvm::Type *, 6> &FrameFields,
18431865
CharUnits &StackOffset, ABIArgInfo &Info,
18441866
QualType Type) const {
18451867
// Arguments are always 4-byte-aligned.
1846-
CharUnits FieldAlign = CharUnits::fromQuantity(4);
1868+
CharUnits WordSize = CharUnits::fromQuantity(4);
1869+
assert(StackOffset.isMultipleOf(WordSize) && "unaligned inalloca struct");
18471870

1848-
assert(StackOffset.isMultipleOf(FieldAlign) && "unaligned inalloca struct");
1849-
Info = ABIArgInfo::getInAlloca(FrameFields.size());
1850-
FrameFields.push_back(CGT.ConvertTypeForMem(Type));
1851-
StackOffset += getContext().getTypeSizeInChars(Type);
1871+
// sret pointers and indirect things will require an extra pointer
1872+
// indirection, unless they are byval. Most things are byval, and will not
1873+
// require this indirection.
1874+
bool IsIndirect = false;
1875+
if (Info.isIndirect() && !Info.getIndirectByVal())
1876+
IsIndirect = true;
1877+
Info = ABIArgInfo::getInAlloca(FrameFields.size(), IsIndirect);
1878+
llvm::Type *LLTy = CGT.ConvertTypeForMem(Type);
1879+
if (IsIndirect)
1880+
LLTy = LLTy->getPointerTo(0);
1881+
FrameFields.push_back(LLTy);
1882+
StackOffset += IsIndirect ? WordSize : getContext().getTypeSizeInChars(Type);
18521883

18531884
// Insert padding bytes to respect alignment.
18541885
CharUnits FieldEnd = StackOffset;
1855-
StackOffset = FieldEnd.alignTo(FieldAlign);
1886+
StackOffset = FieldEnd.alignTo(WordSize);
18561887
if (StackOffset != FieldEnd) {
18571888
CharUnits NumBytes = StackOffset - FieldEnd;
18581889
llvm::Type *Ty = llvm::Type::getInt8Ty(getVMContext());
@@ -1866,16 +1897,12 @@ static bool isArgInAlloca(const ABIArgInfo &Info) {
18661897
switch (Info.getKind()) {
18671898
case ABIArgInfo::InAlloca:
18681899
return true;
1869-
case ABIArgInfo::Indirect:
1870-
assert(Info.getIndirectByVal());
1871-
return true;
18721900
case ABIArgInfo::Ignore:
18731901
return false;
1902+
case ABIArgInfo::Indirect:
18741903
case ABIArgInfo::Direct:
18751904
case ABIArgInfo::Extend:
1876-
if (Info.getInReg())
1877-
return false;
1878-
return true;
1905+
return !Info.getInReg();
18791906
case ABIArgInfo::Expand:
18801907
case ABIArgInfo::CoerceAndExpand:
18811908
// These are aggregate types which are never passed in registers when
@@ -1909,8 +1936,7 @@ void X86_32ABIInfo::rewriteWithInAlloca(CGFunctionInfo &FI) const {
19091936

19101937
// Put the sret parameter into the inalloca struct if it's in memory.
19111938
if (Ret.isIndirect() && !Ret.getInReg()) {
1912-
CanQualType PtrTy = getContext().getPointerType(FI.getReturnType());
1913-
addFieldToArgStruct(FrameFields, StackOffset, Ret, PtrTy);
1939+
addFieldToArgStruct(FrameFields, StackOffset, Ret, FI.getReturnType());
19141940
// On Windows, the hidden sret parameter is always returned in eax.
19151941
Ret.setInAllocaSRet(IsWin32StructABI);
19161942
}

clang/test/CodeGen/x86_32-arguments-win32.c

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,3 +46,47 @@ struct s6 {
4646
struct s6 f6_1(void) { while (1) {} }
4747
void f6_2(struct s6 a0) {}
4848

49+
50+
// MSVC passes up to three vectors in registers, and the rest indirectly. We
51+
// (arbitrarily) pass oversized vectors indirectly, since that is the safest way
52+
// to do it.
53+
typedef float __m128 __attribute__((__vector_size__(16), __aligned__(16)));
54+
typedef float __m256 __attribute__((__vector_size__(32), __aligned__(32)));
55+
typedef float __m512 __attribute__((__vector_size__(64), __aligned__(64)));
56+
typedef float __m1024 __attribute__((__vector_size__(128), __aligned__(128)));
57+
58+
__m128 gv128;
59+
__m256 gv256;
60+
__m512 gv512;
61+
__m1024 gv1024;
62+
63+
void receive_vec_128(__m128 x, __m128 y, __m128 z, __m128 w, __m128 q) {
64+
gv128 = x + y + z + w + q;
65+
}
66+
void receive_vec_256(__m256 x, __m256 y, __m256 z, __m256 w, __m256 q) {
67+
gv256 = x + y + z + w + q;
68+
}
69+
void receive_vec_512(__m512 x, __m512 y, __m512 z, __m512 w, __m512 q) {
70+
gv512 = x + y + z + w + q;
71+
}
72+
void receive_vec_1024(__m1024 x, __m1024 y, __m1024 z, __m1024 w, __m1024 q) {
73+
gv1024 = x + y + z + w + q;
74+
}
75+
// CHECK-LABEL: define dso_local void @receive_vec_128(<4 x float> inreg %x, <4 x float> inreg %y, <4 x float> inreg %z, <4 x float>* %0, <4 x float>* %1)
76+
// CHECK-LABEL: define dso_local void @receive_vec_256(<8 x float> inreg %x, <8 x float> inreg %y, <8 x float> inreg %z, <8 x float>* %0, <8 x float>* %1)
77+
// CHECK-LABEL: define dso_local void @receive_vec_512(<16 x float> inreg %x, <16 x float> inreg %y, <16 x float> inreg %z, <16 x float>* %0, <16 x float>* %1)
78+
// CHECK-LABEL: define dso_local void @receive_vec_1024(<32 x float>* %0, <32 x float>* %1, <32 x float>* %2, <32 x float>* %3, <32 x float>* %4)
79+
80+
void pass_vec_128() {
81+
__m128 z = {0};
82+
receive_vec_128(z, z, z, z, z);
83+
}
84+
85+
// CHECK-LABEL: define dso_local void @pass_vec_128()
86+
// CHECK: call void @receive_vec_128(<4 x float> inreg %{{[^,)]*}}, <4 x float> inreg %{{[^,)]*}}, <4 x float> inreg %{{[^,)]*}}, <4 x float>* %{{[^,)]*}}, <4 x float>* %{{[^,)]*}})
87+
88+
89+
void __fastcall fastcall_indirect_vec(__m128 x, __m128 y, __m128 z, __m128 w, int edx, __m128 q) {
90+
gv128 = x + y + z + w + q;
91+
}
92+
// CHECK-LABEL: define dso_local x86_fastcallcc void @"\01@fastcall_indirect_vec@84"(<4 x float> inreg %x, <4 x float> inreg %y, <4 x float> inreg %z, <4 x float>* inreg %0, i32 inreg %edx, <4 x float>* %1)
Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,52 @@
1+
// RUN: %clang_cc1 -fms-extensions -w -triple i386-pc-win32 -emit-llvm -o - %s | FileCheck %s
2+
3+
// PR44395
4+
// MSVC passes overaligned types indirectly since MSVC 2015. Make sure that
5+
// works with inalloca.
6+
7+
// FIXME: Pass non-trivial *and* overaligned types indirectly. Right now the C++
8+
// ABI rules say to use inalloca, and they take precedence, so it's not easy to
9+
// implement this.
10+
11+
12+
struct NonTrivial {
13+
NonTrivial();
14+
NonTrivial(const NonTrivial &o);
15+
int x;
16+
};
17+
18+
struct __declspec(align(64)) OverAligned {
19+
OverAligned();
20+
int buf[16];
21+
};
22+
23+
extern int gvi32;
24+
25+
int receive_inalloca_overaligned(NonTrivial nt, OverAligned o) {
26+
return nt.x + o.buf[0];
27+
}
28+
29+
// CHECK-LABEL: define dso_local i32 @"?receive_inalloca_overaligned@@Y{{.*}}"
30+
// CHECK-SAME: (<{ %struct.NonTrivial, %struct.OverAligned* }>* inalloca %0)
31+
32+
int pass_inalloca_overaligned() {
33+
gvi32 = receive_inalloca_overaligned(NonTrivial(), OverAligned());
34+
return gvi32;
35+
}
36+
37+
// CHECK-LABEL: define dso_local i32 @"?pass_inalloca_overaligned@@Y{{.*}}"
38+
// CHECK: [[TMP:%[^ ]*]] = alloca %struct.OverAligned, align 64
39+
// CHECK: call i8* @llvm.stacksave()
40+
// CHECK: alloca inalloca <{ %struct.NonTrivial, %struct.OverAligned* }>
41+
42+
// Construct OverAligned into TMP.
43+
// CHECK: call x86_thiscallcc %struct.OverAligned* @"??0OverAligned@@QAE@XZ"(%struct.OverAligned* [[TMP]])
44+
45+
// Construct NonTrivial into the GEP.
46+
// CHECK: [[GEP:%[^ ]*]] = getelementptr inbounds <{ %struct.NonTrivial, %struct.OverAligned* }>, <{ %struct.NonTrivial, %struct.OverAligned* }>* %{{.*}}, i32 0, i32 0
47+
// CHECK: call x86_thiscallcc %struct.NonTrivial* @"??0NonTrivial@@QAE@XZ"(%struct.NonTrivial* [[GEP]])
48+
49+
// Store the address of an OverAligned temporary into the struct.
50+
// CHECK: getelementptr inbounds <{ %struct.NonTrivial, %struct.OverAligned* }>, <{ %struct.NonTrivial, %struct.OverAligned* }>* %{{.*}}, i32 0, i32 1
51+
// CHECK: store %struct.OverAligned* [[TMP]], %struct.OverAligned** %{{.*}}, align 4
52+
// CHECK: call i32 @"?receive_inalloca_overaligned@@Y{{.*}}"(<{ %struct.NonTrivial, %struct.OverAligned* }>* inalloca %argmem)

0 commit comments

Comments
 (0)