Skip to content

Conversation

rampitec
Copy link
Collaborator

No description provided.

@rampitec rampitec requested review from changpeng and shiltian August 26, 2025 20:26
Copy link
Collaborator Author

This stack of pull requests is managed by Graphite. Learn more about stacking.

@rampitec rampitec marked this pull request as ready for review August 26, 2025 20:26
@llvmbot
Copy link
Member

llvmbot commented Aug 26, 2025

@llvm/pr-subscribers-llvm-ir

@llvm/pr-subscribers-backend-amdgpu

Author: Stanislav Mekhanoshin (rampitec)

Changes

Full diff: https://github.com/llvm/llvm-project/pull/155493.diff

2 Files Affected:

  • (modified) llvm/lib/IR/Verifier.cpp (+3-1)
  • (modified) llvm/test/Verifier/AMDGPU/wmma-f8f6f4.ll (+124)
diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp
index 4eb4b58d022e8..9fda08645e118 100644
--- a/llvm/lib/IR/Verifier.cpp
+++ b/llvm/lib/IR/Verifier.cpp
@@ -6724,7 +6724,9 @@ void Verifier::visitIntrinsicCall(Intrinsic::ID ID, CallBase &Call) {
           "invalid vector type for format", &Call, Src1, Call.getArgOperand(5));
     break;
   }
-  case Intrinsic::amdgcn_wmma_f32_16x16x128_f8f6f4: {
+  case Intrinsic::amdgcn_wmma_f32_16x16x128_f8f6f4:
+  case Intrinsic::amdgcn_wmma_scale_f32_16x16x128_f8f6f4:
+  case Intrinsic::amdgcn_wmma_scale16_f32_16x16x128_f8f6f4: {
     Value *Src0 = Call.getArgOperand(1);
     Value *Src1 = Call.getArgOperand(3);
 
diff --git a/llvm/test/Verifier/AMDGPU/wmma-f8f6f4.ll b/llvm/test/Verifier/AMDGPU/wmma-f8f6f4.ll
index af0d7f1169189..553c64dcd87fe 100644
--- a/llvm/test/Verifier/AMDGPU/wmma-f8f6f4.ll
+++ b/llvm/test/Verifier/AMDGPU/wmma-f8f6f4.ll
@@ -24,6 +24,46 @@ bb:
   ret void
 }
 
+; CHECK: operand 1 must be 8, 12 or 16 element i32 vector
+; CHECK-NEXT: %res = call <8 x float> @llvm.amdgcn.wmma.scale.f32.16x16x128.f8f6f4.v8f32.v16i64.v16i32(i32 0, <16 x i64> %A, i32 0, <16 x i32> %B, i16 0, <8 x float> %C, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i1 false, i1 false)
+; CHECK-NEXT: <16 x i64> %A
+define amdgpu_ps void @test_wmma_scale_f32_16x16x128_f8f6f4___v16i64_fp8___v16i32_fp8(<16 x i64> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
+bb:
+  %res = call <8 x float> @llvm.amdgcn.wmma.scale.f32.16x16x128.f8f6f4.v8f32.v16i64.v16i32(i32 0, <16 x i64> %A, i32 0, <16 x i32> %B, i16 0, <8 x float> %C, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i1 false, i1 false)
+  store <8 x float> %res, ptr addrspace(1) %out
+  ret void
+}
+
+; CHECK: operand 3 must be 8, 12 or 16 element i32 vector
+; CHECK-NEXT: %res = call <8 x float> @llvm.amdgcn.wmma.scale.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i64(i32 0, <16 x i32> %A, i32 0, <16 x i64> %B, i16 0, <8 x float> %C, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i1 false, i1 false)
+; CHECK-NEXT: <16 x i64> %B
+define amdgpu_ps void @test_wmma_scale_f32_16x16x128_f8f6f4___v16i32_fp8___v16i64_fp8(<16 x i32> %A, <16 x i64> %B, <8 x float> %C, ptr addrspace(1) %out) {
+bb:
+  %res = call <8 x float> @llvm.amdgcn.wmma.scale.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i64(i32 0, <16 x i32> %A, i32 0, <16 x i64> %B, i16 0, <8 x float> %C, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i1 false, i1 false)
+  store <8 x float> %res, ptr addrspace(1) %out
+  ret void
+}
+
+; CHECK: operand 1 must be 8, 12 or 16 element i32 vector
+; CHECK-NEXT: %res = call <8 x float> @llvm.amdgcn.wmma.scale16.f32.16x16x128.f8f6f4.v8f32.v16i64.v16i32(i32 0, <16 x i64> %A, i32 0, <16 x i32> %B, i16 0, <8 x float> %C, i32 0, i32 0, i64 0, i32 0, i32 0, i64 0, i1 false, i1 false)
+; CHECK-NEXT: <16 x i64> %A
+define amdgpu_ps void @test_wmma_scale16_f32_16x16x128_f8f6f4___v16i64_fp8___v16i32_fp8(<16 x i64> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
+bb:
+  %res = call <8 x float> @llvm.amdgcn.wmma.scale16.f32.16x16x128.f8f6f4.v8f32.v16i64.v16i32(i32 0, <16 x i64> %A, i32 0, <16 x i32> %B, i16 0, <8 x float> %C, i32 0, i32 0, i64 0, i32 0, i32 0, i64 0, i1 false, i1 false)
+  store <8 x float> %res, ptr addrspace(1) %out
+  ret void
+}
+
+; CHECK: operand 3 must be 8, 12 or 16 element i32 vector
+; CHECK-NEXT: %res = call <8 x float> @llvm.amdgcn.wmma.scale16.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i64(i32 0, <16 x i32> %A, i32 0, <16 x i64> %B, i16 0, <8 x float> %C, i32 0, i32 0, i64 0, i32 0, i32 0, i64 0, i1 false, i1 false)
+; CHECK-NEXT: <16 x i64> %B
+define amdgpu_ps void @test_wmma_scale16_f32_16x16x128_f8f6f4___v16i32_fp8___v16i64_fp8(<16 x i32> %A, <16 x i64> %B, <8 x float> %C, ptr addrspace(1) %out) {
+bb:
+  %res = call <8 x float> @llvm.amdgcn.wmma.scale16.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i64(i32 0, <16 x i32> %A, i32 0, <16 x i64> %B, i16 0, <8 x float> %C, i32 0, i32 0, i64 0, i32 0, i32 0, i64 0, i1 false, i1 false)
+  store <8 x float> %res, ptr addrspace(1) %out
+  ret void
+}
+
 ; --------------------------------------------------------------------
 ; Impossible vector types
 ; --------------------------------------------------------------------
@@ -48,6 +88,26 @@ bb:
   ret void
 }
 
+; CHECK: operand 1 must be 8, 12 or 16 element i32 vector
+; CHECK-NEXT: call <8 x float> @llvm.amdgcn.wmma.scale.f32.16x16x128.f8f6f4.v8f32.v15i32.v16i32(i32 0, <15 x i32> %A, i32 0, <16 x i32> %B, i16 0, <8 x float> %C, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i1 false, i1 false)
+; CHECK-NEXT: <15 x i32> %A
+define amdgpu_ps void @test_wmma_scale_f32_16x16x128_f8f6f4___v15i32_fp8___v16i32_fp8(<15 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
+bb:
+  %res = call <8 x float> @llvm.amdgcn.wmma.scale.f32.16x16x128.f8f6f4.v8f32.v15i32.v16i32(i32 0, <15 x i32> %A, i32 0, <16 x i32> %B, i16 0, <8 x float> %C, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i1 false, i1 false)
+  store <8 x float> %res, ptr addrspace(1) %out
+  ret void
+}
+
+; CHECK: operand 3 must be 8, 12 or 16 element i32 vector
+; CHECK-NEXT: call <8 x float> @llvm.amdgcn.wmma.scale.f32.16x16x128.f8f6f4.v8f32.v16i32.v15i32(i32 0, <16 x i32> %A, i32 0, <15 x i32> %B, i16 0, <8 x float> %C, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i1 false, i1 false)
+; CHECK-NEXT: <15 x i32> %B
+define amdgpu_ps void @test_wmma_scale_f32_16x16x128_f8f6f4___v16i32_fp8___v15i32_fp8(<16 x i32> %A, <15 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
+bb:
+  %res = call <8 x float> @llvm.amdgcn.wmma.scale.f32.16x16x128.f8f6f4.v8f32.v16i32.v15i32(i32 0, <16 x i32> %A, i32 0, <15 x i32> %B, i16 0, <8 x float> %C, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i1 false, i1 false)
+  store <8 x float> %res, ptr addrspace(1) %out
+  ret void
+}
+
 ; --------------------------------------------------------------------
 ; Out of bounds format
 ; --------------------------------------------------------------------
@@ -72,6 +132,26 @@ bb:
   ret void
 }
 
+; CHECK: invalid value for matrix format
+; CHECK-NEXT: %res = call <8 x float> @llvm.amdgcn.wmma.scale.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 5, <16 x i32> %A, i32 0, <16 x i32> %B, i16 0, <8 x float> %C, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i1 false, i1 false)
+; CHECK-NEXT: i32 5
+define amdgpu_ps void @test_wmma_scale_f32_16x16x128_f8f6f4___v16i32_invalid0___v16i32_fp8(<16 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
+bb:
+  %res = call <8 x float> @llvm.amdgcn.wmma.scale.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 5, <16 x i32> %A, i32 0, <16 x i32> %B, i16 0, <8 x float> %C, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i1 false, i1 false)
+  store <8 x float> %res, ptr addrspace(1) %out
+  ret void
+}
+
+; CHECK: invalid value for matrix format
+; CHECK-NEXT: %res = call <8 x float> @llvm.amdgcn.wmma.scale.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 0, <16 x i32> %A, i32 5, <16 x i32> %B, i16 0, <8 x float> %C, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i1 false, i1 false)
+; CHECK-NEXT: i32 5
+define amdgpu_ps void @test_wmma_scale_f32_16x16x128_f8f6f4___v16i32_fp8___v16i32_invalid1(<16 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
+bb:
+  %res = call <8 x float> @llvm.amdgcn.wmma.scale.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 0, <16 x i32> %A, i32 5, <16 x i32> %B, i16 0, <8 x float> %C, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i1 false, i1 false)
+  store <8 x float> %res, ptr addrspace(1) %out
+  ret void
+}
+
 ; --------------------------------------------------------------------
 ; Incorrect signature for format cases (IR vector too small)
 ; --------------------------------------------------------------------
@@ -163,3 +243,47 @@ bb:
   store <8 x float> %res, ptr addrspace(1) %out
   ret void
 }
+
+; CHECK: invalid vector type for format
+; CHECK-NEXT: %res = call <8 x float> @llvm.amdgcn.wmma.scale.f32.16x16x128.f8f6f4.v8f32.v8i32.v16i32(i32 2, <8 x i32> %A, i32 0, <16 x i32> %B, i16 0, <8 x float> %C, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i1 false, i1 false)
+; CHECK-NEXT: <8 x i32> %A
+; CHECK-NEXT: i32 2
+define amdgpu_ps void @test_wmma_scale_f32_16x16x128_f8f6f4___v8i32_fp6___v16i32_fp8(<8 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
+bb:
+  %res = call <8 x float> @llvm.amdgcn.wmma.scale.f32.16x16x128.f8f6f4.v8f32.v8i32.v16i32(i32 2, <8 x i32> %A, i32 0, <16 x i32> %B, i16 0, <8 x float> %C, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i1 false, i1 false)
+  store <8 x float> %res, ptr addrspace(1) %out
+  ret void
+}
+
+; CHECK: invalid vector type for format
+; CHECK-NEXT: %res = call <8 x float> @llvm.amdgcn.wmma.scale.f32.16x16x128.f8f6f4.v8f32.v16i32.v8i32(i32 0, <16 x i32> %A, i32 2, <8 x i32> %B, i16 0, <8 x float> %C, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i1 false, i1 false)
+; CHECK-NEXT: <8 x i32> %B
+; CHECK-NEXT: i32 2
+define amdgpu_ps void @test_wmma_scale_f32_16x16x128_f8f6f4___v16i32_fp8___v8i32_fp6(<16 x i32> %A, <8 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
+bb:
+  %res = call <8 x float> @llvm.amdgcn.wmma.scale.f32.16x16x128.f8f6f4.v8f32.v8i32.v16i32(i32 0, <16 x i32> %A, i32 2, <8 x i32> %B, i16 0, <8 x float> %C, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i1 false, i1 false)
+  store <8 x float> %res, ptr addrspace(1) %out
+  ret void
+}
+
+; CHECK: invalid vector type for format
+; CHECK-NEXT: %res = call <8 x float> @llvm.amdgcn.wmma.scale16.f32.16x16x128.f8f6f4.v8f32.v8i32.v16i32(i32 3, <8 x i32> %A, i32 0, <16 x i32> %B, i16 0, <8 x float> %C, i32 0, i32 0, i64 0, i32 0, i32 0, i64 0, i1 false, i1 false)
+; CHECK-NEXT: <8 x i32> %A
+; CHECK-NEXT: i32 3
+define amdgpu_ps void @test_wmma_scale16_f32_16x16x128_f8f6f4___v8i32_bf6___v16i32_fp8(<8 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
+bb:
+  %res = call <8 x float> @llvm.amdgcn.wmma.scale16.f32.16x16x128.f8f6f4.v8f32.v8i32.v16i32(i32 3, <8 x i32> %A, i32 0, <16 x i32> %B, i16 0, <8 x float> %C, i32 0, i32 0, i64 0, i32 0, i32 0, i64 0, i1 false, i1 false)
+  store <8 x float> %res, ptr addrspace(1) %out
+  ret void
+}
+
+; CHECK: invalid vector type for format
+; CHECK-NEXT: %res = call <8 x float> @llvm.amdgcn.wmma.scale16.f32.16x16x128.f8f6f4.v8f32.v16i32.v8i32(i32 0, <16 x i32> %A, i32 3, <8 x i32> %B, i16 0, <8 x float> %C, i32 0, i32 0, i64 0, i32 0, i32 0, i64 0, i1 false, i1 false)
+; CHECK-NEXT: <8 x i32> %B
+; CHECK-NEXT: i32 3
+define amdgpu_ps void @test_wmma_scale_f32_16x16x128_f8f6f4___v16i32_fp8___v8i32_bf6(<16 x i32> %A, <8 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
+bb:
+  %res = call <8 x float> @llvm.amdgcn.wmma.scale16.f32.16x16x128.f8f6f4.v8f32.v8i32.v16i32(i32 0, <16 x i32> %A, i32 3, <8 x i32> %B, i16 0, <8 x float> %C, i32 0, i32 0, i64 0, i32 0, i32 0, i64 0, i1 false, i1 false)
+  store <8 x float> %res, ptr addrspace(1) %out
+  ret void
+}

@rampitec rampitec merged commit 2c920a1 into main Aug 26, 2025
14 checks passed
@rampitec rampitec deleted the users/rampitec/08-26-_amdgpu_wmma_scale_ir_verification branch August 26, 2025 21:10
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Projects
None yet
Development

Successfully merging this pull request may close these issues.

3 participants