From 538ca37da7de45e2329dd5c6796f50626ca2cabb Mon Sep 17 00:00:00 2001 From: Stanislav Mekhanoshin Date: Tue, 26 Aug 2025 13:25:16 -0700 Subject: [PATCH] [AMDGPU] wmma_scale* IR verification --- llvm/lib/IR/Verifier.cpp | 4 +- llvm/test/Verifier/AMDGPU/wmma-f8f6f4.ll | 124 +++++++++++++++++++++++ 2 files changed, 127 insertions(+), 1 deletion(-) diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp index 4eb4b58d022e8..9fda08645e118 100644 --- a/llvm/lib/IR/Verifier.cpp +++ b/llvm/lib/IR/Verifier.cpp @@ -6724,7 +6724,9 @@ void Verifier::visitIntrinsicCall(Intrinsic::ID ID, CallBase &Call) { "invalid vector type for format", &Call, Src1, Call.getArgOperand(5)); break; } - case Intrinsic::amdgcn_wmma_f32_16x16x128_f8f6f4: { + case Intrinsic::amdgcn_wmma_f32_16x16x128_f8f6f4: + case Intrinsic::amdgcn_wmma_scale_f32_16x16x128_f8f6f4: + case Intrinsic::amdgcn_wmma_scale16_f32_16x16x128_f8f6f4: { Value *Src0 = Call.getArgOperand(1); Value *Src1 = Call.getArgOperand(3); diff --git a/llvm/test/Verifier/AMDGPU/wmma-f8f6f4.ll b/llvm/test/Verifier/AMDGPU/wmma-f8f6f4.ll index af0d7f1169189..553c64dcd87fe 100644 --- a/llvm/test/Verifier/AMDGPU/wmma-f8f6f4.ll +++ b/llvm/test/Verifier/AMDGPU/wmma-f8f6f4.ll @@ -24,6 +24,46 @@ bb: ret void } +; CHECK: operand 1 must be 8, 12 or 16 element i32 vector +; CHECK-NEXT: %res = call <8 x float> @llvm.amdgcn.wmma.scale.f32.16x16x128.f8f6f4.v8f32.v16i64.v16i32(i32 0, <16 x i64> %A, i32 0, <16 x i32> %B, i16 0, <8 x float> %C, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i1 false, i1 false) +; CHECK-NEXT: <16 x i64> %A +define amdgpu_ps void @test_wmma_scale_f32_16x16x128_f8f6f4___v16i64_fp8___v16i32_fp8(<16 x i64> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.scale.f32.16x16x128.f8f6f4.v8f32.v16i64.v16i32(i32 0, <16 x i64> %A, i32 0, <16 x i32> %B, i16 0, <8 x float> %C, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i1 false, i1 false) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +; CHECK: operand 3 must be 8, 12 or 16 element i32 vector +; CHECK-NEXT: %res = call <8 x float> @llvm.amdgcn.wmma.scale.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i64(i32 0, <16 x i32> %A, i32 0, <16 x i64> %B, i16 0, <8 x float> %C, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i1 false, i1 false) +; CHECK-NEXT: <16 x i64> %B +define amdgpu_ps void @test_wmma_scale_f32_16x16x128_f8f6f4___v16i32_fp8___v16i64_fp8(<16 x i32> %A, <16 x i64> %B, <8 x float> %C, ptr addrspace(1) %out) { +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.scale.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i64(i32 0, <16 x i32> %A, i32 0, <16 x i64> %B, i16 0, <8 x float> %C, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i1 false, i1 false) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +; CHECK: operand 1 must be 8, 12 or 16 element i32 vector +; CHECK-NEXT: %res = call <8 x float> @llvm.amdgcn.wmma.scale16.f32.16x16x128.f8f6f4.v8f32.v16i64.v16i32(i32 0, <16 x i64> %A, i32 0, <16 x i32> %B, i16 0, <8 x float> %C, i32 0, i32 0, i64 0, i32 0, i32 0, i64 0, i1 false, i1 false) +; CHECK-NEXT: <16 x i64> %A +define amdgpu_ps void @test_wmma_scale16_f32_16x16x128_f8f6f4___v16i64_fp8___v16i32_fp8(<16 x i64> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.scale16.f32.16x16x128.f8f6f4.v8f32.v16i64.v16i32(i32 0, <16 x i64> %A, i32 0, <16 x i32> %B, i16 0, <8 x float> %C, i32 0, i32 0, i64 0, i32 0, i32 0, i64 0, i1 false, i1 false) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +; CHECK: operand 3 must be 8, 12 or 16 element i32 vector +; CHECK-NEXT: %res = call <8 x float> @llvm.amdgcn.wmma.scale16.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i64(i32 0, <16 x i32> %A, i32 0, <16 x i64> %B, i16 0, <8 x float> %C, i32 0, i32 0, i64 0, i32 0, i32 0, i64 0, i1 false, i1 false) +; CHECK-NEXT: <16 x i64> %B +define amdgpu_ps void @test_wmma_scale16_f32_16x16x128_f8f6f4___v16i32_fp8___v16i64_fp8(<16 x i32> %A, <16 x i64> %B, <8 x float> %C, ptr addrspace(1) %out) { +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.scale16.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i64(i32 0, <16 x i32> %A, i32 0, <16 x i64> %B, i16 0, <8 x float> %C, i32 0, i32 0, i64 0, i32 0, i32 0, i64 0, i1 false, i1 false) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + ; -------------------------------------------------------------------- ; Impossible vector types ; -------------------------------------------------------------------- @@ -48,6 +88,26 @@ bb: ret void } +; CHECK: operand 1 must be 8, 12 or 16 element i32 vector +; CHECK-NEXT: call <8 x float> @llvm.amdgcn.wmma.scale.f32.16x16x128.f8f6f4.v8f32.v15i32.v16i32(i32 0, <15 x i32> %A, i32 0, <16 x i32> %B, i16 0, <8 x float> %C, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i1 false, i1 false) +; CHECK-NEXT: <15 x i32> %A +define amdgpu_ps void @test_wmma_scale_f32_16x16x128_f8f6f4___v15i32_fp8___v16i32_fp8(<15 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.scale.f32.16x16x128.f8f6f4.v8f32.v15i32.v16i32(i32 0, <15 x i32> %A, i32 0, <16 x i32> %B, i16 0, <8 x float> %C, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i1 false, i1 false) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +; CHECK: operand 3 must be 8, 12 or 16 element i32 vector +; CHECK-NEXT: call <8 x float> @llvm.amdgcn.wmma.scale.f32.16x16x128.f8f6f4.v8f32.v16i32.v15i32(i32 0, <16 x i32> %A, i32 0, <15 x i32> %B, i16 0, <8 x float> %C, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i1 false, i1 false) +; CHECK-NEXT: <15 x i32> %B +define amdgpu_ps void @test_wmma_scale_f32_16x16x128_f8f6f4___v16i32_fp8___v15i32_fp8(<16 x i32> %A, <15 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.scale.f32.16x16x128.f8f6f4.v8f32.v16i32.v15i32(i32 0, <16 x i32> %A, i32 0, <15 x i32> %B, i16 0, <8 x float> %C, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i1 false, i1 false) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + ; -------------------------------------------------------------------- ; Out of bounds format ; -------------------------------------------------------------------- @@ -72,6 +132,26 @@ bb: ret void } +; CHECK: invalid value for matrix format +; CHECK-NEXT: %res = call <8 x float> @llvm.amdgcn.wmma.scale.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 5, <16 x i32> %A, i32 0, <16 x i32> %B, i16 0, <8 x float> %C, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i1 false, i1 false) +; CHECK-NEXT: i32 5 +define amdgpu_ps void @test_wmma_scale_f32_16x16x128_f8f6f4___v16i32_invalid0___v16i32_fp8(<16 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.scale.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 5, <16 x i32> %A, i32 0, <16 x i32> %B, i16 0, <8 x float> %C, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i1 false, i1 false) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +; CHECK: invalid value for matrix format +; CHECK-NEXT: %res = call <8 x float> @llvm.amdgcn.wmma.scale.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 0, <16 x i32> %A, i32 5, <16 x i32> %B, i16 0, <8 x float> %C, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i1 false, i1 false) +; CHECK-NEXT: i32 5 +define amdgpu_ps void @test_wmma_scale_f32_16x16x128_f8f6f4___v16i32_fp8___v16i32_invalid1(<16 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.scale.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 0, <16 x i32> %A, i32 5, <16 x i32> %B, i16 0, <8 x float> %C, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i1 false, i1 false) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + ; -------------------------------------------------------------------- ; Incorrect signature for format cases (IR vector too small) ; -------------------------------------------------------------------- @@ -163,3 +243,47 @@ bb: store <8 x float> %res, ptr addrspace(1) %out ret void } + +; CHECK: invalid vector type for format +; CHECK-NEXT: %res = call <8 x float> @llvm.amdgcn.wmma.scale.f32.16x16x128.f8f6f4.v8f32.v8i32.v16i32(i32 2, <8 x i32> %A, i32 0, <16 x i32> %B, i16 0, <8 x float> %C, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i1 false, i1 false) +; CHECK-NEXT: <8 x i32> %A +; CHECK-NEXT: i32 2 +define amdgpu_ps void @test_wmma_scale_f32_16x16x128_f8f6f4___v8i32_fp6___v16i32_fp8(<8 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.scale.f32.16x16x128.f8f6f4.v8f32.v8i32.v16i32(i32 2, <8 x i32> %A, i32 0, <16 x i32> %B, i16 0, <8 x float> %C, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i1 false, i1 false) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +; CHECK: invalid vector type for format +; CHECK-NEXT: %res = call <8 x float> @llvm.amdgcn.wmma.scale.f32.16x16x128.f8f6f4.v8f32.v16i32.v8i32(i32 0, <16 x i32> %A, i32 2, <8 x i32> %B, i16 0, <8 x float> %C, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i1 false, i1 false) +; CHECK-NEXT: <8 x i32> %B +; CHECK-NEXT: i32 2 +define amdgpu_ps void @test_wmma_scale_f32_16x16x128_f8f6f4___v16i32_fp8___v8i32_fp6(<16 x i32> %A, <8 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.scale.f32.16x16x128.f8f6f4.v8f32.v8i32.v16i32(i32 0, <16 x i32> %A, i32 2, <8 x i32> %B, i16 0, <8 x float> %C, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i1 false, i1 false) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +; CHECK: invalid vector type for format +; CHECK-NEXT: %res = call <8 x float> @llvm.amdgcn.wmma.scale16.f32.16x16x128.f8f6f4.v8f32.v8i32.v16i32(i32 3, <8 x i32> %A, i32 0, <16 x i32> %B, i16 0, <8 x float> %C, i32 0, i32 0, i64 0, i32 0, i32 0, i64 0, i1 false, i1 false) +; CHECK-NEXT: <8 x i32> %A +; CHECK-NEXT: i32 3 +define amdgpu_ps void @test_wmma_scale16_f32_16x16x128_f8f6f4___v8i32_bf6___v16i32_fp8(<8 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.scale16.f32.16x16x128.f8f6f4.v8f32.v8i32.v16i32(i32 3, <8 x i32> %A, i32 0, <16 x i32> %B, i16 0, <8 x float> %C, i32 0, i32 0, i64 0, i32 0, i32 0, i64 0, i1 false, i1 false) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +; CHECK: invalid vector type for format +; CHECK-NEXT: %res = call <8 x float> @llvm.amdgcn.wmma.scale16.f32.16x16x128.f8f6f4.v8f32.v16i32.v8i32(i32 0, <16 x i32> %A, i32 3, <8 x i32> %B, i16 0, <8 x float> %C, i32 0, i32 0, i64 0, i32 0, i32 0, i64 0, i1 false, i1 false) +; CHECK-NEXT: <8 x i32> %B +; CHECK-NEXT: i32 3 +define amdgpu_ps void @test_wmma_scale_f32_16x16x128_f8f6f4___v16i32_fp8___v8i32_bf6(<16 x i32> %A, <8 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.scale16.f32.16x16x128.f8f6f4.v8f32.v8i32.v16i32(i32 0, <16 x i32> %A, i32 3, <8 x i32> %B, i16 0, <8 x float> %C, i32 0, i32 0, i64 0, i32 0, i32 0, i64 0, i1 false, i1 false) + store <8 x float> %res, ptr addrspace(1) %out + ret void +}