-
Notifications
You must be signed in to change notification settings - Fork 14.9k
[AMDGPU][MC] GFX9 - Support NV bit in FLAT instructions in pre-GFX90A #154237
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
[AMDGPU][MC] GFX9 - Support NV bit in FLAT instructions in pre-GFX90A #154237
Conversation
@llvm/pr-subscribers-backend-amdgpu @llvm/pr-subscribers-mc Author: Jun Wang (jwanggit86) Changestargets This patch enables support of the NV (non-volatile) bit in FLAT instructions in GFX9 (pre-GFX90A) targets. Patch is 119.66 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/154237.diff 6 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
index 0d2feeb4edea3..0847b7faa6167 100644
--- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
+++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
@@ -5405,7 +5405,8 @@ bool AMDGPUAsmParser::validateCoherencyBits(const MCInst &Inst,
S = SMLoc::getFromPointer(&CStr.data()[CStr.find("scale_offset")]);
Error(S, "scale_offset is not supported on this GPU");
}
- if (CPol & CPol::NV) {
+ if ((CPol & CPol::NV) && (!isGFX9() || isGFX90A())) {
+ // nv not supported on GFX90A+
SMLoc S = getImmLoc(AMDGPUOperand::ImmTyCPol, Operands);
StringRef CStr(S.getPointer());
S = SMLoc::getFromPointer(&CStr.data()[CStr.find("nv")]);
@@ -7167,6 +7168,13 @@ ParseStatus AMDGPUAsmParser::parseCPol(OperandVector &Operands) {
unsigned Enabled = 0, Seen = 0;
for (;;) {
SMLoc S = getLoc();
+
+ if (isGFX9() && trySkipId("nv")) {
+ Enabled |= CPol::NV;
+ Seen |= CPol::NV;
+ continue;
+ }
+
bool Disabling;
unsigned CPol = getCPolKind(getId(), Mnemo, Disabling);
if (!CPol)
diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
index fb7d634e62272..6fabb32b9d7e9 100644
--- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
+++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
@@ -822,6 +822,19 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
}
}
+ if (MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::FLAT) {
+ if (isGFX9() && !isGFX90A()) {
+ // Pre-GFX90A GFX9's use bit 55 as NV.
+ assert(Bytes_.size() >= 8);
+ if (Bytes_[6] & 0x80) { // check bit 55
+ int CPolIdx =
+ AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::cpol);
+ MI.getOperand(CPolIdx).setImm(MI.getOperand(CPolIdx).getImm() |
+ AMDGPU::CPol::NV);
+ }
+ }
+ }
+
if ((MCII->get(MI.getOpcode()).TSFlags &
(SIInstrFlags::MTBUF | SIInstrFlags::MUBUF)) &&
(STI.hasFeature(AMDGPU::FeatureGFX90AInsts))) {
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
index ee8683a549a80..ffd474bc6ab2e 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
@@ -180,7 +180,10 @@ void AMDGPUInstPrinter::printCPol(const MCInst *MI, unsigned OpNo,
if ((Imm & CPol::SCC) && AMDGPU::isGFX90A(STI))
O << (AMDGPU::isGFX940(STI) ? " sc1" : " scc");
if (Imm & ~CPol::ALL_pregfx12)
- O << " /* unexpected cache policy bit */";
+ if ((Imm & CPol::NV) && AMDGPU::isGFX9(STI) && !AMDGPU::isGFX90A(STI))
+ O << " nv";
+ else
+ O << " /* unexpected cache policy bit */";
}
void AMDGPUInstPrinter::printTH(const MCInst *MI, int64_t TH, int64_t Scope,
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp
index f3580842c6ff0..a3662f6c3ca41 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp
@@ -394,6 +394,16 @@ void AMDGPUMCCodeEmitter::encodeInstruction(const MCInst &MI,
Encoding |= getImplicitOpSelHiEncoding(Opcode);
}
+ // For GFX90A+ targets, bit 55 of the FLAT instructions is the ACC bit
+ // indicating the use of AGPRs. However, pre-GFX90A, the same bit is for NV.
+ if ((Desc.TSFlags & SIInstrFlags::FLAT) && AMDGPU::isGFX9(STI) &&
+ !AMDGPU::isGFX90A(STI)) {
+ int Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::cpol);
+ unsigned Cpol = MI.getOperand(Idx).getImm();
+ if (Cpol & AMDGPU::CPol::NV)
+ Encoding |= (UINT64_C(1) << 55);
+ }
+
// GFX10+ v_cmpx opcodes promoted to VOP3 have implied dst=EXEC.
// Documentation requires dst to be encoded as EXEC (0x7E),
// but it looks like the actual value encoded for dst operand
diff --git a/llvm/test/MC/AMDGPU/gfx9_asm_flat.s b/llvm/test/MC/AMDGPU/gfx9_asm_flat.s
index 5cc3d2533a149..7687c0a478bd9 100644
--- a/llvm/test/MC/AMDGPU/gfx9_asm_flat.s
+++ b/llvm/test/MC/AMDGPU/gfx9_asm_flat.s
@@ -24,6 +24,18 @@ flat_load_ubyte v5, v[1:2] offset:4095 glc
flat_load_ubyte v5, v[1:2] offset:4095 slc
// CHECK: [0xff,0x0f,0x42,0xdc,0x01,0x00,0x00,0x05]
+flat_load_ubyte v5, v[1:2] nv
+// CHECK: [0x00,0x00,0x40,0xdc,0x01,0x00,0x80,0x05]
+
+flat_load_ubyte v5, v[1:2] offset:7 nv
+// CHECK: [0x07,0x00,0x40,0xdc,0x01,0x00,0x80,0x05]
+
+flat_load_ubyte v5, v[1:2] offset:4095 glc nv
+// CHECK: [0xff,0x0f,0x41,0xdc,0x01,0x00,0x80,0x05]
+
+flat_load_ubyte v5, v[1:2] offset:4095 slc nv
+// CHECK: [0xff,0x0f,0x42,0xdc,0x01,0x00,0x80,0x05]
+
flat_load_sbyte v5, v[1:2] offset:4095
// CHECK: [0xff,0x0f,0x44,0xdc,0x01,0x00,0x00,0x05]
@@ -48,6 +60,18 @@ flat_load_sbyte v5, v[1:2] offset:4095 glc
flat_load_sbyte v5, v[1:2] offset:4095 slc
// CHECK: [0xff,0x0f,0x46,0xdc,0x01,0x00,0x00,0x05]
+flat_load_sbyte v5, v[1:2] nv
+// CHECK: [0x00,0x00,0x44,0xdc,0x01,0x00,0x80,0x05]
+
+flat_load_sbyte v5, v[1:2] offset:7 nv
+// CHECK: [0x07,0x00,0x44,0xdc,0x01,0x00,0x80,0x05]
+
+flat_load_sbyte v5, v[1:2] offset:4095 glc nv
+// CHECK: [0xff,0x0f,0x45,0xdc,0x01,0x00,0x80,0x05]
+
+flat_load_sbyte v5, v[1:2] offset:4095 slc nv
+// CHECK: [0xff,0x0f,0x46,0xdc,0x01,0x00,0x80,0x05]
+
flat_load_ushort v5, v[1:2] offset:4095
// CHECK: [0xff,0x0f,0x48,0xdc,0x01,0x00,0x00,0x05]
@@ -72,6 +96,18 @@ flat_load_ushort v5, v[1:2] offset:4095 glc
flat_load_ushort v5, v[1:2] offset:4095 slc
// CHECK: [0xff,0x0f,0x4a,0xdc,0x01,0x00,0x00,0x05]
+flat_load_ushort v5, v[1:2] nv
+// CHECK: [0x00,0x00,0x48,0xdc,0x01,0x00,0x80,0x05]
+
+flat_load_ushort v5, v[1:2] offset:7 nv
+// CHECK: [0x07,0x00,0x48,0xdc,0x01,0x00,0x80,0x05]
+
+flat_load_ushort v5, v[1:2] offset:4095 glc nv
+// CHECK: [0xff,0x0f,0x49,0xdc,0x01,0x00,0x80,0x05]
+
+flat_load_ushort v5, v[1:2] offset:4095 slc nv
+// CHECK: [0xff,0x0f,0x4a,0xdc,0x01,0x00,0x80,0x05]
+
flat_load_sshort v5, v[1:2] offset:4095
// CHECK: [0xff,0x0f,0x4c,0xdc,0x01,0x00,0x00,0x05]
@@ -96,6 +132,18 @@ flat_load_sshort v5, v[1:2] offset:4095 glc
flat_load_sshort v5, v[1:2] offset:4095 slc
// CHECK: [0xff,0x0f,0x4e,0xdc,0x01,0x00,0x00,0x05]
+flat_load_sshort v5, v[1:2] nv
+// CHECK: [0x00,0x00,0x4c,0xdc,0x01,0x00,0x80,0x05]
+
+flat_load_sshort v5, v[1:2] offset:7 nv
+// CHECK: [0x07,0x00,0x4c,0xdc,0x01,0x00,0x80,0x05]
+
+flat_load_sshort v5, v[1:2] offset:4095 glc nv
+// CHECK: [0xff,0x0f,0x4d,0xdc,0x01,0x00,0x80,0x05]
+
+flat_load_sshort v5, v[1:2] offset:4095 slc nv
+// CHECK: [0xff,0x0f,0x4e,0xdc,0x01,0x00,0x80,0x05]
+
flat_load_dword v5, v[1:2] offset:4095
// CHECK: [0xff,0x0f,0x50,0xdc,0x01,0x00,0x00,0x05]
@@ -120,6 +168,18 @@ flat_load_dword v5, v[1:2] offset:4095 glc
flat_load_dword v5, v[1:2] offset:4095 slc
// CHECK: [0xff,0x0f,0x52,0xdc,0x01,0x00,0x00,0x05]
+flat_load_dword v5, v[1:2] nv
+// CHECK: [0x00,0x00,0x50,0xdc,0x01,0x00,0x80,0x05]
+
+flat_load_dword v5, v[1:2] offset:7 nv
+// CHECK: [0x07,0x00,0x50,0xdc,0x01,0x00,0x80,0x05]
+
+flat_load_dword v5, v[1:2] offset:4095 glc nv
+// CHECK: [0xff,0x0f,0x51,0xdc,0x01,0x00,0x80,0x05]
+
+flat_load_dword v5, v[1:2] offset:4095 slc nv
+// CHECK: [0xff,0x0f,0x52,0xdc,0x01,0x00,0x80,0x05]
+
flat_load_dwordx2 v[5:6], v[1:2] offset:4095
// CHECK: [0xff,0x0f,0x54,0xdc,0x01,0x00,0x00,0x05]
@@ -144,6 +204,18 @@ flat_load_dwordx2 v[5:6], v[1:2] offset:4095 glc
flat_load_dwordx2 v[5:6], v[1:2] offset:4095 slc
// CHECK: [0xff,0x0f,0x56,0xdc,0x01,0x00,0x00,0x05]
+flat_load_dwordx2 v[5:6], v[1:2] nv
+// CHECK: [0x00,0x00,0x54,0xdc,0x01,0x00,0x80,0x05]
+
+flat_load_dwordx2 v[5:6], v[1:2] offset:7 nv
+// CHECK: [0x07,0x00,0x54,0xdc,0x01,0x00,0x80,0x05]
+
+flat_load_dwordx2 v[5:6], v[1:2] offset:4095 glc nv
+// CHECK: [0xff,0x0f,0x55,0xdc,0x01,0x00,0x80,0x05]
+
+flat_load_dwordx2 v[5:6], v[1:2] offset:4095 slc nv
+// CHECK: [0xff,0x0f,0x56,0xdc,0x01,0x00,0x80,0x05]
+
flat_load_dwordx3 v[5:7], v[1:2] offset:4095
// CHECK: [0xff,0x0f,0x58,0xdc,0x01,0x00,0x00,0x05]
@@ -168,6 +240,18 @@ flat_load_dwordx3 v[5:7], v[1:2] offset:4095 glc
flat_load_dwordx3 v[5:7], v[1:2] offset:4095 slc
// CHECK: [0xff,0x0f,0x5a,0xdc,0x01,0x00,0x00,0x05]
+flat_load_dwordx3 v[5:7], v[1:2] nv
+// CHECK: [0x00,0x00,0x58,0xdc,0x01,0x00,0x80,0x05]
+
+flat_load_dwordx3 v[5:7], v[1:2] offset:7 nv
+// CHECK: [0x07,0x00,0x58,0xdc,0x01,0x00,0x80,0x05]
+
+flat_load_dwordx3 v[5:7], v[1:2] offset:4095 glc nv
+// CHECK: [0xff,0x0f,0x59,0xdc,0x01,0x00,0x80,0x05]
+
+flat_load_dwordx3 v[5:7], v[1:2] offset:4095 slc nv
+// CHECK: [0xff,0x0f,0x5a,0xdc,0x01,0x00,0x80,0x05]
+
flat_load_dwordx4 v[5:8], v[1:2] offset:4095
// CHECK: [0xff,0x0f,0x5c,0xdc,0x01,0x00,0x00,0x05]
@@ -192,6 +276,18 @@ flat_load_dwordx4 v[5:8], v[1:2] offset:4095 glc
flat_load_dwordx4 v[5:8], v[1:2] offset:4095 slc
// CHECK: [0xff,0x0f,0x5e,0xdc,0x01,0x00,0x00,0x05]
+flat_load_dwordx4 v[5:8], v[1:2] nv
+// CHECK: [0x00,0x00,0x5c,0xdc,0x01,0x00,0x80,0x05]
+
+flat_load_dwordx4 v[5:8], v[1:2] offset:7 nv
+// CHECK: [0x07,0x00,0x5c,0xdc,0x01,0x00,0x80,0x05]
+
+flat_load_dwordx4 v[5:8], v[1:2] offset:4095 glc nv
+// CHECK: [0xff,0x0f,0x5d,0xdc,0x01,0x00,0x80,0x05]
+
+flat_load_dwordx4 v[5:8], v[1:2] offset:4095 slc nv
+// CHECK: [0xff,0x0f,0x5e,0xdc,0x01,0x00,0x80,0x05]
+
flat_store_byte v[1:2], v2 offset:4095
// CHECK: [0xff,0x0f,0x60,0xdc,0x01,0x02,0x00,0x00]
@@ -216,6 +312,18 @@ flat_store_byte v[1:2], v2 offset:4095 glc
flat_store_byte v[1:2], v2 offset:4095 slc
// CHECK: [0xff,0x0f,0x62,0xdc,0x01,0x02,0x00,0x00]
+flat_store_byte v[1:2], v2 nv
+// CHECK: [0x00,0x00,0x60,0xdc,0x01,0x02,0x80,0x00]
+
+flat_store_byte v[1:2], v2 offset:7 nv
+// CHECK: [0x07,0x00,0x60,0xdc,0x01,0x02,0x80,0x00]
+
+flat_store_byte v[1:2], v2 offset:4095 glc nv
+// CHECK: [0xff,0x0f,0x61,0xdc,0x01,0x02,0x80,0x00]
+
+flat_store_byte v[1:2], v2 offset:4095 slc nv
+// CHECK: [0xff,0x0f,0x62,0xdc,0x01,0x02,0x80,0x00]
+
flat_store_byte_d16_hi v[1:2], v2 offset:4095
// CHECK: [0xff,0x0f,0x64,0xdc,0x01,0x02,0x00,0x00]
@@ -240,6 +348,18 @@ flat_store_byte_d16_hi v[1:2], v2 offset:4095 glc
flat_store_byte_d16_hi v[1:2], v2 offset:4095 slc
// CHECK: [0xff,0x0f,0x66,0xdc,0x01,0x02,0x00,0x00]
+flat_store_byte_d16_hi v[1:2], v2 nv
+// CHECK: [0x00,0x00,0x64,0xdc,0x01,0x02,0x80,0x00]
+
+flat_store_byte_d16_hi v[1:2], v2 offset:7 nv
+// CHECK: [0x07,0x00,0x64,0xdc,0x01,0x02,0x80,0x00]
+
+flat_store_byte_d16_hi v[1:2], v2 offset:4095 glc nv
+// CHECK: [0xff,0x0f,0x65,0xdc,0x01,0x02,0x80,0x00]
+
+flat_store_byte_d16_hi v[1:2], v2 offset:4095 slc nv
+// CHECK: [0xff,0x0f,0x66,0xdc,0x01,0x02,0x80,0x00]
+
flat_store_short v[1:2], v2 offset:4095
// CHECK: [0xff,0x0f,0x68,0xdc,0x01,0x02,0x00,0x00]
@@ -264,6 +384,18 @@ flat_store_short v[1:2], v2 offset:4095 glc
flat_store_short v[1:2], v2 offset:4095 slc
// CHECK: [0xff,0x0f,0x6a,0xdc,0x01,0x02,0x00,0x00]
+flat_store_short v[1:2], v2 nv
+// CHECK: [0x00,0x00,0x68,0xdc,0x01,0x02,0x80,0x00]
+
+flat_store_short v[1:2], v2 offset:7 nv
+// CHECK: [0x07,0x00,0x68,0xdc,0x01,0x02,0x80,0x00]
+
+flat_store_short v[1:2], v2 offset:4095 glc nv
+// CHECK: [0xff,0x0f,0x69,0xdc,0x01,0x02,0x80,0x00]
+
+flat_store_short v[1:2], v2 offset:4095 slc nv
+// CHECK: [0xff,0x0f,0x6a,0xdc,0x01,0x02,0x80,0x00]
+
flat_store_short_d16_hi v[1:2], v2 offset:4095
// CHECK: [0xff,0x0f,0x6c,0xdc,0x01,0x02,0x00,0x00]
@@ -288,6 +420,18 @@ flat_store_short_d16_hi v[1:2], v2 offset:4095 glc
flat_store_short_d16_hi v[1:2], v2 offset:4095 slc
// CHECK: [0xff,0x0f,0x6e,0xdc,0x01,0x02,0x00,0x00]
+flat_store_short_d16_hi v[1:2], v2 nv
+// CHECK: [0x00,0x00,0x6c,0xdc,0x01,0x02,0x80,0x00]
+
+flat_store_short_d16_hi v[1:2], v2 offset:7 nv
+// CHECK: [0x07,0x00,0x6c,0xdc,0x01,0x02,0x80,0x00]
+
+flat_store_short_d16_hi v[1:2], v2 offset:4095 glc nv
+// CHECK: [0xff,0x0f,0x6d,0xdc,0x01,0x02,0x80,0x00]
+
+flat_store_short_d16_hi v[1:2], v2 offset:4095 slc nv
+// CHECK: [0xff,0x0f,0x6e,0xdc,0x01,0x02,0x80,0x00]
+
flat_store_dword v[1:2], v2 offset:4095
// CHECK: [0xff,0x0f,0x70,0xdc,0x01,0x02,0x00,0x00]
@@ -312,6 +456,18 @@ flat_store_dword v[1:2], v2 offset:4095 glc
flat_store_dword v[1:2], v2 offset:4095 slc
// CHECK: [0xff,0x0f,0x72,0xdc,0x01,0x02,0x00,0x00]
+flat_store_dword v[1:2], v2 nv
+// CHECK: [0x00,0x00,0x70,0xdc,0x01,0x02,0x80,0x00]
+
+flat_store_dword v[1:2], v2 offset:7 nv
+// CHECK: [0x07,0x00,0x70,0xdc,0x01,0x02,0x80,0x00]
+
+flat_store_dword v[1:2], v2 offset:4095 glc nv
+// CHECK: [0xff,0x0f,0x71,0xdc,0x01,0x02,0x80,0x00]
+
+flat_store_dword v[1:2], v2 offset:4095 slc nv
+// CHECK: [0xff,0x0f,0x72,0xdc,0x01,0x02,0x80,0x00]
+
flat_store_dwordx2 v[1:2], v[2:3] offset:4095
// CHECK: [0xff,0x0f,0x74,0xdc,0x01,0x02,0x00,0x00]
@@ -336,6 +492,18 @@ flat_store_dwordx2 v[1:2], v[2:3] offset:4095 glc
flat_store_dwordx2 v[1:2], v[2:3] offset:4095 slc
// CHECK: [0xff,0x0f,0x76,0xdc,0x01,0x02,0x00,0x00]
+flat_store_dwordx2 v[1:2], v[2:3] nv
+// CHECK: [0x00,0x00,0x74,0xdc,0x01,0x02,0x80,0x00]
+
+flat_store_dwordx2 v[1:2], v[2:3] offset:7 nv
+// CHECK: [0x07,0x00,0x74,0xdc,0x01,0x02,0x80,0x00]
+
+flat_store_dwordx2 v[1:2], v[2:3] offset:4095 glc nv
+// CHECK: [0xff,0x0f,0x75,0xdc,0x01,0x02,0x80,0x00]
+
+flat_store_dwordx2 v[1:2], v[2:3] offset:4095 slc nv
+// CHECK: [0xff,0x0f,0x76,0xdc,0x01,0x02,0x80,0x00]
+
flat_store_dwordx3 v[1:2], v[2:4] offset:4095
// CHECK: [0xff,0x0f,0x78,0xdc,0x01,0x02,0x00,0x00]
@@ -360,6 +528,18 @@ flat_store_dwordx3 v[1:2], v[2:4] offset:4095 glc
flat_store_dwordx3 v[1:2], v[2:4] offset:4095 slc
// CHECK: [0xff,0x0f,0x7a,0xdc,0x01,0x02,0x00,0x00]
+flat_store_dwordx3 v[1:2], v[2:4] nv
+// CHECK: [0x00,0x00,0x78,0xdc,0x01,0x02,0x80,0x00]
+
+flat_store_dwordx3 v[1:2], v[2:4] offset:7 nv
+// CHECK: [0x07,0x00,0x78,0xdc,0x01,0x02,0x80,0x00]
+
+flat_store_dwordx3 v[1:2], v[2:4] offset:4095 glc nv
+// CHECK: [0xff,0x0f,0x79,0xdc,0x01,0x02,0x80,0x00]
+
+flat_store_dwordx3 v[1:2], v[2:4] offset:4095 slc nv
+// CHECK: [0xff,0x0f,0x7a,0xdc,0x01,0x02,0x80,0x00]
+
flat_store_dwordx4 v[1:2], v[2:5] offset:4095
// CHECK: [0xff,0x0f,0x7c,0xdc,0x01,0x02,0x00,0x00]
@@ -384,6 +564,18 @@ flat_store_dwordx4 v[1:2], v[2:5] offset:4095 glc
flat_store_dwordx4 v[1:2], v[2:5] offset:4095 slc
// CHECK: [0xff,0x0f,0x7e,0xdc,0x01,0x02,0x00,0x00]
+flat_store_dwordx4 v[1:2], v[2:5] nv
+// CHECK: [0x00,0x00,0x7c,0xdc,0x01,0x02,0x80,0x00]
+
+flat_store_dwordx4 v[1:2], v[2:5] offset:7 nv
+// CHECK: [0x07,0x00,0x7c,0xdc,0x01,0x02,0x80,0x00]
+
+flat_store_dwordx4 v[1:2], v[2:5] offset:4095 glc nv
+// CHECK: [0xff,0x0f,0x7d,0xdc,0x01,0x02,0x80,0x00]
+
+flat_store_dwordx4 v[1:2], v[2:5] offset:4095 slc nv
+// CHECK: [0xff,0x0f,0x7e,0xdc,0x01,0x02,0x80,0x00]
+
flat_load_ubyte_d16 v5, v[1:2] offset:4095
// CHECK: [0xff,0x0f,0x80,0xdc,0x01,0x00,0x00,0x05]
@@ -408,6 +600,18 @@ flat_load_ubyte_d16 v5, v[1:2] offset:4095 glc
flat_load_ubyte_d16 v5, v[1:2] offset:4095 slc
// CHECK: [0xff,0x0f,0x82,0xdc,0x01,0x00,0x00,0x05]
+flat_load_ubyte_d16 v5, v[1:2] nv
+// CHECK: [0x00,0x00,0x80,0xdc,0x01,0x00,0x80,0x05]
+
+flat_load_ubyte_d16 v5, v[1:2] offset:7 nv
+// CHECK: [0x07,0x00,0x80,0xdc,0x01,0x00,0x80,0x05]
+
+flat_load_ubyte_d16 v5, v[1:2] offset:4095 glc nv
+// CHECK: [0xff,0x0f,0x81,0xdc,0x01,0x00,0x80,0x05]
+
+flat_load_ubyte_d16 v5, v[1:2] offset:4095 slc nv
+// CHECK: [0xff,0x0f,0x82,0xdc,0x01,0x00,0x80,0x05]
+
flat_load_ubyte_d16_hi v5, v[1:2] offset:4095
// CHECK: [0xff,0x0f,0x84,0xdc,0x01,0x00,0x00,0x05]
@@ -432,6 +636,18 @@ flat_load_ubyte_d16_hi v5, v[1:2] offset:4095 glc
flat_load_ubyte_d16_hi v5, v[1:2] offset:4095 slc
// CHECK: [0xff,0x0f,0x86,0xdc,0x01,0x00,0x00,0x05]
+flat_load_ubyte_d16_hi v5, v[1:2] nv
+// CHECK: [0x00,0x00,0x84,0xdc,0x01,0x00,0x80,0x05]
+
+flat_load_ubyte_d16_hi v5, v[1:2] offset:7 nv
+// CHECK: [0x07,0x00,0x84,0xdc,0x01,0x00,0x80,0x05]
+
+flat_load_ubyte_d16_hi v5, v[1:2] offset:4095 glc nv
+// CHECK: [0xff,0x0f,0x85,0xdc,0x01,0x00,0x80,0x05]
+
+flat_load_ubyte_d16_hi v5, v[1:2] offset:4095 slc nv
+// CHECK: [0xff,0x0f,0x86,0xdc,0x01,0x00,0x80,0x05]
+
flat_load_sbyte_d16 v5, v[1:2] offset:4095
// CHECK: [0xff,0x0f,0x88,0xdc,0x01,0x00,0x00,0x05]
@@ -456,6 +672,18 @@ flat_load_sbyte_d16 v5, v[1:2] offset:4095 glc
flat_load_sbyte_d16 v5, v[1:2] offset:4095 slc
// CHECK: [0xff,0x0f,0x8a,0xdc,0x01,0x00,0x00,0x05]
+flat_load_sbyte_d16 v5, v[1:2] nv
+// CHECK: [0x00,0x00,0x88,0xdc,0x01,0x00,0x80,0x05]
+
+flat_load_sbyte_d16 v5, v[1:2] offset:7 nv
+// CHECK: [0x07,0x00,0x88,0xdc,0x01,0x00,0x80,0x05]
+
+flat_load_sbyte_d16 v5, v[1:2] offset:4095 glc nv
+// CHECK: [0xff,0x0f,0x89,0xdc,0x01,0x00,0x80,0x05]
+
+flat_load_sbyte_d16 v5, v[1:2] offset:4095 slc nv
+// CHECK: [0xff,0x0f,0x8a,0xdc,0x01,0x00,0x80,0x05]
+
flat_load_sbyte_d16_hi v5, v[1:2] offset:4095
// CHECK: [0xff,0x0f,0x8c,0xdc,0x01,0x00,0x00,0x05]
@@ -480,6 +708,18 @@ flat_load_sbyte_d16_hi v5, v[1:2] offset:4095 glc
flat_load_sbyte_d16_hi v5, v[1:2] offset:4095 slc
// CHECK: [0xff,0x0f,0x8e,0xdc,0x01,0x00,0x00,0x05]
+flat_load_sbyte_d16_hi v5, v[1:2] nv
+// CHECK: [0x00,0x00,0x8c,0xdc,0x01,0x00,0x80,0x05]
+
+flat_load_sbyte_d16_hi v5, v[1:2] offset:7 nv
+// CHECK: [0x07,0x00,0x8c,0xdc,0x01,0x00,0x80,0x05]
+
+flat_load_sbyte_d16_hi v5, v[1:2] offset:4095 glc nv
+// CHECK: [0xff,0x0f,0x8d,0xdc,0x01,0x00,0x80,0x05]
+
+flat_load_sbyte_d16_hi v5, v[1:2] offset:4095 slc nv
+// CHECK: [0xff,0x0f,0x8e,0xdc,0x01,0x00,0x80,0x05]
+
flat_load_short_d16 v5, v[1:2] offset:4095
// CHECK: [0xff,0x0f,0x90,0xdc,0x01,0x00,0x00,0x05]
@@ -504,6 +744,18 @@ flat_load_short_d16 v5, v[1:2] offset:4095 glc
flat_load_short_d16 v5, v[1:2] offset:4095 slc
// CHECK: [0xff,0x0f,0x92,0xdc,0x01,0x00,0x00,0x05]
+flat_load_short_d16 v5, v[1:2] nv
+// CHECK: [0x00,0x00,0x90,0xdc,0x01,0x00,0x80,0x05]
+
+flat_load_short_d16 v5, v[1:2] offset:7 nv
+// CHECK: [0x07,0x00,0x90,0xdc,0x01,0x00,0x80,0x05]
+
+flat_load_short_d16 v5, v[1:2] offset:4095 glc nv
+// CHECK: [0xff,0x0f,0x91,0xdc,0x01,0x00,0x80,0x05]
+
+flat_load_short_d16 v5, v[1:2] offset:4095 slc nv
+// CHECK: [0xff,0x0f,0x92,0xdc,0x01,0x00,0x80,0x05]
+
flat_load_short_d16_hi v5, v[1:2] offset:4095
// CHECK: [0xff,0x0f,0x94,0xdc,0x01,0x00,0x00,0x05]
@@ -528,6 +780,18 @@ flat_load_short_d16_hi v5, v[1:2] offset:4095 glc
flat_load_short_d16_hi v5, v[1:2] offset:4095 slc
// CHECK: [0xff,0x0f,0x96,0xdc,0x01,0x00,0x00,0x05]
+flat_load_short_d16_hi v5, v[1:2] nv
+// CHECK: [0x00,0x00,0x94,0xdc,0x01,0x00,0x80,0x05]
+
+flat_load_short_d16_hi v5, v[1:2] offset:7 nv
+// CHECK: [0x07,0x00,0x94,0xdc,0x01,0x00,0x80,0x05]
+
+flat_load_short_d16_hi v5, v[1:2] offset:4095 glc nv
+// CHECK: [0xff,0x0f,0x95,0xdc,0x01,0x00,0x80,0x05]
+
+flat_load_short_d16_hi v5, v[1:2] offset:4095 slc nv
+// CHECK: [0xff,0x0f,0x96,0xdc,0x01,0x00,0x80,0x05]
+
flat_atomic_swap v[1:2], v2 offset:4095
// CHECK: [0xff,0x0f,0x00,0xdd,0x01,0x02,0x00,0x00]
@@ -552,6 +816,18 @@ flat_atomic_swap v0, v[1:2], v2 offset:4095 glc
flat_atomic_swap v[1:2], v2 offset:4095 slc
// CHECK: [0xff,0x0f,0x02,0xdd,0x01,0x02,0x00,0x00]
+flat_atomic_swap v[1:2], v2 nv
+// CHECK: [0x00,0x00,0x00,0xdd,0x01,0x02,0x80,0x00]
+
+flat_atomic_swap v[1:2], v2 offset:7 nv
+// CHECK: [0x07,0x00,0x00,0xdd,0x01,0x02,0x80,0x00]
+
+flat_atomic_swap v0, v[1:2], v2 offset:4095 glc nv
+// CHECK: [0xff,0x0f,0x01,0xdd,0x01,0x02,0x80,0x00]
+
+flat_atomic_swap v[1:2], v2 offset:4095 slc nv
+// CHECK: [0xff,0x0f,0x02,0xdd,0x01,0x02,0x80,0x00]
+
flat_atomic_cmpswap v[1:2], v[2:3] offset:4095
// CHECK: [0xff,0x0f,0x04,0xdd,0x01,0x02,0x00,0x00]
@@ -576,6 +852,18 @@ flat_atomic_cmpswap v0, v[1:2], v[2:3] offset:4095 glc
flat_atomic_cmpswap v[1:2], v[2:3] offset:4095 slc
// CHECK: [0xff,0x0f,0x06,0xdd,0x01,0x02,0x00,0x00]
+flat_atomic_cmpswap v[1:2], v[2:3] nv
+// CHECK: [0x00,0x00,0x04,0xdd,0x01,0x02,0x80,0x00]
+
+flat_atomic_cmpswap v[1:2], v[2:3] offset:7 nv
+// CHECK: [0x07,0x00,0x04,0xdd,0x01,0x02,0x80,0x00]
+
+flat_atomic_cmpswap v0, v[1:2], v[2:3] offset:4095 glc nv
+// CHECK: [0xff,0x0f,0x05,0xdd,0x01,0x02,0x80,0x00]
+
+flat_atomic_cmpswap v[1:2], v[2:3] offset:4095 slc nv
+// CHECK: [0xff,0x0f,0x06,0xdd,0x01,0x02,0x80,0x00]
+
flat_atomic_add v[1:2], v2 offset:4095
// CHECK:...
[truncated]
|
@@ -5405,7 +5405,8 @@ bool AMDGPUAsmParser::validateCoherencyBits(const MCInst &Inst, | |||
S = SMLoc::getFromPointer(&CStr.data()[CStr.find("scale_offset")]); | |||
Error(S, "scale_offset is not supported on this GPU"); | |||
} | |||
if (CPol & CPol::NV) { | |||
if ((CPol & CPol::NV) && (!isGFX9() || isGFX90A())) { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Avoid generation checks, should have a named predicate for this
if (MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::FLAT) { | ||
if (isGFX9() && !isGFX90A()) { | ||
// Pre-GFX90A GFX9's use bit 55 as NV. | ||
assert(Bytes_.size() >= 8); | ||
if (Bytes_[6] & 0x80) { // check bit 55 |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
We should have less special case code like this. It would be preferable to have a separate instruction definition
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I agree, it should be a separate Real instruction (for the same pseudo).
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Agreed. In this case, however, bit 55 is used for something else for GFX90A+. I tried defining a new instruction, but then it had decoding conflict. Any suggestions would be appreciated.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
You need a separate Real in a separate DecoderNamespace with SIEncodingFamily.GFX90A.
targets This patch enables support of the NV (non-volatile) bit in FLAT instructions in GFX9 (pre-GFX90A) targets.
no need to do the encoding/decoding in CPP code.
c51b033
to
ea20bff
Compare
targets
This patch enables support of the NV (non-volatile) bit in FLAT instructions in GFX9 (pre-GFX90A) targets.