From 0d8b5147e37e53e98019eea8e0d0a3aed0e875c5 Mon Sep 17 00:00:00 2001
From: Brian Cain <brian.cain@oss.qualcomm.com>
Date: Wed, 19 Feb 2025 12:09:17 -0600
Subject: [PATCH 1/2] Enable gcc-c-torture suite for hexagon ; Add HVX test for
 hexagon (#205) (#208)

* Add HVX intrinsic test for Hexagon

* Enable gcc torture suite for hexagon
---
 SingleSource/Regression/C/CMakeLists.txt      |   2 +-
 .../C/gcc-c-torture/execute/CMakeLists.txt    |  14 +
 .../gcc-c-torture/execute/ieee/CMakeLists.txt |  13 +
 SingleSource/UnitTests/Vector/CMakeLists.txt  |   3 +
 .../UnitTests/Vector/HVX/CMakeLists.txt       |   3 +
 SingleSource/UnitTests/Vector/HVX/QFloat.c    | 325 ++++++++++++++++++
 .../Vector/HVX/QFloat.reference_output        |  45 +++
 7 files changed, 404 insertions(+), 1 deletion(-)
 create mode 100644 SingleSource/UnitTests/Vector/HVX/CMakeLists.txt
 create mode 100644 SingleSource/UnitTests/Vector/HVX/QFloat.c
 create mode 100644 SingleSource/UnitTests/Vector/HVX/QFloat.reference_output

diff --git a/SingleSource/Regression/C/CMakeLists.txt b/SingleSource/Regression/C/CMakeLists.txt
index 3f224e2768..6288d2c64d 100644
--- a/SingleSource/Regression/C/CMakeLists.txt
+++ b/SingleSource/Regression/C/CMakeLists.txt
@@ -1,7 +1,7 @@
 if(ARCH MATCHES "x86" OR ARCH MATCHES "riscv" OR
    ARCH MATCHES "SystemZ" OR ARCH MATCHES "Mips" OR
    ARCH MATCHES "AArch64" OR ARCH MATCHES "ARM" OR
-   ARCH MATCHES "LoongArch")
+   ARCH MATCHES "LoongArch" OR ARCH MATCHES "Hexagon")
   add_subdirectory(gcc-c-torture)
 endif()
 
diff --git a/SingleSource/Regression/C/gcc-c-torture/execute/CMakeLists.txt b/SingleSource/Regression/C/gcc-c-torture/execute/CMakeLists.txt
index 034258119a..a2a25927cb 100644
--- a/SingleSource/Regression/C/gcc-c-torture/execute/CMakeLists.txt
+++ b/SingleSource/Regression/C/gcc-c-torture/execute/CMakeLists.txt
@@ -337,6 +337,20 @@ if(ARCH MATCHES "ARM")
   list(APPEND TestsToSkip ${ARMTestsToSkip})
 endif()
 
+# Hexagon tests to skip; the matched files are appended to TestsToSkip
+if(ARCH MATCHES "Hexagon")
+  file(GLOB HexagonTestsToSkip CONFIGURE_DEPENDS
+    990127-1.c
+    alloca-1.c
+    va-arg-22.c
+    # No support for __int128 on Hexagon
+    pr84748.c
+    built-in-setjmp.c
+    pr84521.c
+  )
+  list(APPEND TestsToSkip ${HexagonTestsToSkip})
+endif()
+
 # Darwin Test Blacklist
 if(TARGET_OS STREQUAL "Darwin")
   file(GLOB DarwinTestsToSkip CONFIGURE_DEPENDS
diff --git a/SingleSource/Regression/C/gcc-c-torture/execute/ieee/CMakeLists.txt b/SingleSource/Regression/C/gcc-c-torture/execute/ieee/CMakeLists.txt
index e4ba9009e5..21f1081c81 100644
--- a/SingleSource/Regression/C/gcc-c-torture/execute/ieee/CMakeLists.txt
+++ b/SingleSource/Regression/C/gcc-c-torture/execute/ieee/CMakeLists.txt
@@ -19,6 +19,19 @@ file(GLOB UnsupportedTests
 )
 list(APPEND TestsToSkip ${UnsupportedTests})
 
+# Skip these ieee tests when ARCH matches Hexagon.
+if(ARCH MATCHES "Hexagon")
+  file(GLOB HexagonTestsToSkip CONFIGURE_DEPENDS
+    fp-cmp-8.c
+    fp-cmp-8f.c
+    fp-cmp-8l.c
+    pr38016.c
+    pr50310.c
+  )
+  list(APPEND TestsToSkip ${HexagonTestsToSkip})
+endif()
+
+
 ##
 ## Tests that require extra CFLAGS in Clang
 ##
diff --git a/SingleSource/UnitTests/Vector/CMakeLists.txt b/SingleSource/UnitTests/Vector/CMakeLists.txt
index e39c33a68c..3926d46076 100644
--- a/SingleSource/UnitTests/Vector/CMakeLists.txt
+++ b/SingleSource/UnitTests/Vector/CMakeLists.txt
@@ -53,4 +53,7 @@ if(CMAKE_C_COMPILER_ID STREQUAL "Clang")
     endif()
   endif()
 endif()
+if(ARCH STREQUAL "Hexagon")  # the HVX tests are only added when ARCH is Hexagon
+  add_subdirectory(HVX)
+endif()
 llvm_singlesource(PREFIX "Vector-")
diff --git a/SingleSource/UnitTests/Vector/HVX/CMakeLists.txt b/SingleSource/UnitTests/Vector/HVX/CMakeLists.txt
new file mode 100644
index 0000000000..0e5f574683
--- /dev/null
+++ b/SingleSource/UnitTests/Vector/HVX/CMakeLists.txt
@@ -0,0 +1,3 @@
+# Compile every test in this directory with -mhvx and -mv69.
+list(APPEND CFLAGS -mhvx -mv69)
+llvm_singlesource(PREFIX "Vector-HVX-")
diff --git a/SingleSource/UnitTests/Vector/HVX/QFloat.c b/SingleSource/UnitTests/Vector/HVX/QFloat.c
new file mode 100644
index 0000000000..e11ba64eca
--- /dev/null
+++ b/SingleSource/UnitTests/Vector/HVX/QFloat.c
@@ -0,0 +1,325 @@
+
+#include <stdio.h>
+#include <stdlib.h>
+
+#include <hexagon_types.h>
+#if !defined(__linux__)
+#include <hexagon_standalone.h>
+#endif
+
+union ui32f { // overlays an int32_t and a float so one can be read as the other
+  int32_t i;
+  float f;
+};
+union ui16f16 { // overlays an int16_t and an __fp16 for the same purpose
+  int16_t i;
+  __fp16 f16;
+};
+
+// 128 byte vectors
+#define VSIZE_BYTES 128
+#define VSIZE_WORDS (VSIZE_BYTES / 4) // count of 4-byte words in one vector
+
+void print_vector_words(HVX_Vector x) {
+  // Dump VSIZE_WORDS elements of x in hex, starting a new row every 8 elements.
+  for (int word = 0; word != VSIZE_WORDS; ++word) {
+    if (word % 8 == 0)
+      printf("\n");
+    printf("0x%08lx ", x[word]);
+  }
+  printf("\n");
+}
+
+//
+// Create vectors
+//
+
+// Create a vector of floats by splatting the bit pattern of `value`.
+static __attribute__((always_inline)) HVX_Vector
+create_sfv_from_sf(float value) {
+  // The splat intrinsic is handed the integer member, so the float is
+  // reinterpreted through the union first.
+  union ui32f bits = {.f = value};
+  return Q6_V_vsplat_R(bits.i);
+}
+
+// Create a vector of half floats by narrowing `value` and splatting it.
+static __attribute__((always_inline)) HVX_Vector
+create_hfv_from_sf(float value) {
+  // Convert to __fp16 first, then read the 16-bit pattern back through the
+  // union so it can be passed to the halfword splat as an integer.
+  union ui16f16 bits;
+  bits.f16 = (__fp16)value;
+  return Q6_Vh_vsplat_R(bits.i);
+}
+
+// Create a qf32 vector from `value` by adding its sf splat to a zero vector.
+static __attribute__((always_inline)) HVX_Vector
+create_qf32v_from_sf(float value) {
+  HVX_Vector zero = Q6_V_vsplat_R(0);
+  HVX_Vector sf = create_sfv_from_sf(value);
+  return Q6_Vqf32_vadd_Vqf32Vsf(zero, sf);
+}
+
+// Create a qf16 vector from `value` by adding its hf splat to a zero vector.
+static __attribute__((always_inline)) HVX_Vector
+create_qf16v_from_sf(float value) {
+  // Adding the hf vector to zero is what produces the qf16 representation.
+  HVX_Vector zero = Q6_V_vsplat_R(0);
+  HVX_Vector hf = create_hfv_from_sf(value);
+  return Q6_Vqf16_vadd_Vqf16Vhf(zero, hf);
+}
+
+//
+// Conversion vectors
+//
+
+// Convert a qf32 vector to a single-float (sf) vector.
+static __attribute__((always_inline)) HVX_Vector
+convert_qf32v_to_fltv(HVX_Vector vect) {
+  // Thin wrapper over the qf32 -> sf conversion intrinsic.
+  return Q6_Vsf_equals_Vqf32(vect);
+}
+
+// Convert a qf16 vector to a half-float (hf) vector.
+static __attribute__((always_inline)) HVX_Vector
+convert_qf16v_to_hfv(HVX_Vector vect) {
+  // Thin wrapper over the qf16 -> hf conversion intrinsic.
+  return Q6_Vhf_equals_Vqf16(vect);
+}
+
+//
+// Extraction routines
+//
+
+// Return element 0 of a vector of floats as a scalar float.
+static __attribute__((always_inline)) float
+get_flt0_from_fltv(HVX_Vector vect) {
+  // Element 0 is read as an integer and reinterpreted through the union.
+  union ui32f lane0 = {.i = vect[0]};
+  return lane0.f;
+}
+
+// Return element 0 of a vector of qf32's as a scalar float.
+static __attribute__((always_inline)) float
+get_flt0_from_qf32v(HVX_Vector vect) {
+  // Convert the whole vector to sf first, then extract element 0 the same
+  // way get_flt0_from_fltv does; this mirrors the structure of
+  // get_flt0_from_qf16v.
+  return get_flt0_from_fltv(convert_qf32v_to_fltv(vect));
+}
+
+// Return the lowest half float of a vector of halfs, widened to float.
+static __attribute__((always_inline)) float
+get_flt0_from_halfv(HVX_Vector vect) {
+  // Keep only the low 16 bits of element 0 and reinterpret them as __fp16.
+  union ui16f16 low = {.i = (vect[0] & 0xffff)};
+  return (float)low.f16;
+}
+
+// Return the lowest element of a qf16 vector as float (qf16 -> hf, extract).
+static __attribute__((always_inline)) float
+get_flt0_from_qf16v(HVX_Vector vect) {
+  return get_flt0_from_halfv(Q6_Vhf_equals_Vqf16(vect));
+}
+
+// Return element 0 of the V0 vector of a qf32 vector pair as a scalar float.
+static __attribute__((always_inline)) float
+get_flt0_from_qf32vp(HVX_VectorPair vect) {
+  // Only V0 of the pair is inspected: it is converted to sf and its element 0
+  // is then reinterpreted as a float.
+  HVX_Vector sf = convert_qf32v_to_fltv(HEXAGON_HVX_GET_V0(vect));
+  return get_flt0_from_fltv(sf);
+}
+
+int main(int argc, char **argv) {
+#if !defined(__linux__) // HVX setup macros, presumably from hexagon_standalone.h
+  SIM_ACQUIRE_HVX;
+  SIM_SET_HVX_DOUBLE_MODE;
+#endif
+
+  // create 2 sf vectors in IEEE-754 format
+  HVX_Vector v1 = create_sfv_from_sf(0.5);
+  HVX_Vector v2 = create_sfv_from_sf(0.25);
+
+  // create 2 vectors in the qf32 format
+  HVX_Vector v3 = create_qf32v_from_sf(0.5);
+  HVX_Vector v4 = create_qf32v_from_sf(0.25);
+
+  printf("\nAdd intrinsics with a qf32 result\n");
+
+  // add the IEEE vectors into a qf32 vector
+  HVX_Vector result1 = Q6_Vqf32_vadd_VsfVsf(v1, v2);
+  printf("The sum of flt  %f and flt  %f is %f\n", get_flt0_from_fltv(v1),
+         get_flt0_from_fltv(v2), get_flt0_from_qf32v(result1));
+
+  // add the qf32 vectors into a qf32 vector
+  HVX_Vector result2 = Q6_Vqf32_vadd_Vqf32Vqf32(v3, v4);
+  printf("The sum of qf32 %f and qf32 %f is %f\n", get_flt0_from_qf32v(v3),
+         get_flt0_from_qf32v(v4), get_flt0_from_qf32v(result2));
+
+  // add a qf32 vector and an IEEE vector into a qf32 vector
+  HVX_Vector result3 = Q6_Vqf32_vadd_Vqf32Vsf(v3, v2);
+  printf("The sum of qf32 %f and flt  %f is %f\n", get_flt0_from_qf32v(v3),
+         get_flt0_from_fltv(v2), get_flt0_from_qf32v(result3));
+
+  printf("\nSubtract intrinsics with a qf32 result\n");
+
+  // subtract the IEEE vectors into a qf32 vector
+  HVX_Vector result4 = Q6_Vqf32_vsub_VsfVsf(v1, v2);
+  printf("The sum of flt  %f and flt  -%f is %f\n", get_flt0_from_fltv(v1),
+         get_flt0_from_fltv(v2), get_flt0_from_qf32v(result4));
+
+  // subtract the qf32 vectors into a qf32 vector
+  HVX_Vector result5 = Q6_Vqf32_vsub_Vqf32Vqf32(v3, v4);
+  printf("The sum of qf32 %f and qf32 -%f is %f\n", get_flt0_from_qf32v(v3),
+         get_flt0_from_qf32v(v4), get_flt0_from_qf32v(result5));
+
+  // subtract an IEEE vector from a qf32 vector into a qf32 vector
+  HVX_Vector result6 = Q6_Vqf32_vsub_Vqf32Vsf(v3, v2);
+  printf("The sum of qf32 %f and flt  -%f is %f\n", get_flt0_from_qf32v(v3),
+         get_flt0_from_fltv(v2), get_flt0_from_qf32v(result6));
+
+  printf("\nMultiply intrinsics with a qf32 result\n");
+
+  // multiply the IEEE vectors into a qf32 vector
+  HVX_Vector result7 = Q6_Vqf32_vmpy_VsfVsf(v1, v2);
+  printf("The result of flt  %f * flt  %f is %f\n", get_flt0_from_fltv(v1),
+         get_flt0_from_fltv(v2), get_flt0_from_qf32v(result7));
+
+  // multiply the qf32 vectors into a qf32 vector
+  HVX_Vector result8 = Q6_Vqf32_vmpy_Vqf32Vqf32(v3, v4);
+  printf("The result of qf32 %f * qf32 %f is %f\n", get_flt0_from_qf32v(v3),
+         get_flt0_from_qf32v(v4), get_flt0_from_qf32v(result8));
+
+  // create 2 half vectors in the IEEE-754 format
+  HVX_Vector v5 = create_hfv_from_sf(0.5);
+  HVX_Vector v6 = create_hfv_from_sf(0.25);
+
+  // create 2 vectors in the qf16 format
+  HVX_Vector v7 = create_qf16v_from_sf(0.5);
+  HVX_Vector v8 = create_qf16v_from_sf(0.25);
+
+  printf("\nAdd intrinsics with a qf16 result\n");
+
+  // add the IEEE hf vectors into a qf16 vector
+  HVX_Vector result9 = Q6_Vqf16_vadd_VhfVhf(v5, v6);
+  printf("The sum of hf   %.3f and hf   %.3f is %.3f\n",
+         get_flt0_from_halfv(v5), get_flt0_from_halfv(v6),
+         get_flt0_from_qf16v(result9));
+
+  // add the qf16 vectors into a qf16 vector
+  HVX_Vector result10 = Q6_Vqf16_vadd_Vqf16Vqf16(v7, v8);
+  printf("The sum of qf16 %.3f and qf16 %.3f is %.3f\n",
+         get_flt0_from_qf16v(v7), get_flt0_from_qf16v(v8),
+         get_flt0_from_qf16v(result10));
+
+  // add a qf16 vector and an IEEE hf vector into a qf16 vector
+  HVX_Vector result11 = Q6_Vqf16_vadd_Vqf16Vhf(v7, v6);
+  printf("The sum of qf16 %.3f and hf   %.3f is %.3f\n",
+         get_flt0_from_qf16v(v7), get_flt0_from_halfv(v6),
+         get_flt0_from_qf16v(result11));
+
+  printf("\nSubtract intrinsics with a qf16 result\n");
+
+  // subtract the IEEE hf vectors into a qf16 vector
+  HVX_Vector result12 = Q6_Vqf16_vsub_VhfVhf(v5, v6);
+  printf("The sum of hf   %.3f and hf   -%.3f is %.3f\n",
+         get_flt0_from_halfv(v5), get_flt0_from_halfv(v6),
+         get_flt0_from_qf16v(result12));
+
+  // subtract the qf16 vectors into a qf16 vector
+  HVX_Vector result13 = Q6_Vqf16_vsub_Vqf16Vqf16(v7, v8);
+  printf("The sum of qf16 %.3f and qf16 -%.3f is %.3f\n",
+         get_flt0_from_qf16v(v7), get_flt0_from_qf16v(v8),
+         get_flt0_from_qf16v(result13));
+
+  // subtract an IEEE hf vector from a qf16 vector into a qf16 vector
+  HVX_Vector result14 = Q6_Vqf16_vsub_Vqf16Vhf(v7, v6);
+  printf("The sum of qf16 %.3f and hf   -%.3f is %.3f\n",
+         get_flt0_from_qf16v(v7), get_flt0_from_halfv(v6),
+         get_flt0_from_qf16v(result14));
+
+  printf("\nMultiply intrinsics with a qf16 result\n");
+
+  // multiply the IEEE hf vectors into a qf16 vector
+  HVX_Vector result15 = Q6_Vqf16_vmpy_VhfVhf(v5, v6);
+  printf("The result of hf   %.3f * hf   %.3f is %.3f\n",
+         get_flt0_from_halfv(v5), get_flt0_from_halfv(v6),
+         get_flt0_from_qf16v(result15));
+
+  // multiply the qf16 vectors into a qf16 vector
+  HVX_Vector result16 = Q6_Vqf16_vmpy_Vqf16Vqf16(v7, v8);
+  printf("The result of qf16 %.3f * qf16 %.3f is %.3f\n",
+         get_flt0_from_qf16v(v7), get_flt0_from_qf16v(v8),
+         get_flt0_from_qf16v(result16));
+
+  // multiply the qf16 vector with a hf vector into a qf16 vector
+  HVX_Vector result17 = Q6_Vqf16_vmpy_Vqf16Vhf(v7, v6);
+  printf("The result of qf16 %.3f * hf   %.3f is %.3f\n",
+         get_flt0_from_qf16v(v7), get_flt0_from_halfv(v6),
+         get_flt0_from_qf16v(result17));
+
+  // multiply with pair results
+
+  printf("\nMultiply hf/qf16 intrinsics with a qf32 result\n");
+
+  // multiply the IEEE hf vectors into a qf32 vector pair
+  HVX_VectorPair result18 = Q6_Wqf32_vmpy_VhfVhf(v5, v6);
+  printf("The result of hf   %.3f * hf   %.3f is %.3f\n",
+         get_flt0_from_halfv(v5), get_flt0_from_halfv(v6),
+         get_flt0_from_qf32vp(result18));
+
+  // multiply the qf16 vectors into a qf32 vector pair
+  HVX_VectorPair result19 = Q6_Wqf32_vmpy_Vqf16Vqf16(v7, v8);
+  printf("The result of qf16 %.3f * qf16 %.3f is %.3f\n",
+         get_flt0_from_qf16v(v7), get_flt0_from_qf16v(v8),
+         get_flt0_from_qf32vp(result19));
+
+  // multiply the qf16 vector with a hf vector into a qf32 vector pair
+  HVX_VectorPair result20 = Q6_Wqf32_vmpy_Vqf16Vhf(v7, v6);
+  printf("The result of qf16 %.3f * hf   %.3f is %.3f\n",
+         get_flt0_from_qf16v(v7), get_flt0_from_halfv(v6),
+         get_flt0_from_qf32vp(result20));
+
+  // NOTE(review): leftover stub; create_qf32v_from_qf16v is not defined here
+
+  printf("\nCompare instrinsics\n");
+
+  // compare 2 single float vectors
+  HVX_VectorPred Pred = Q6_Q_vcmp_gt_VsfVsf(v1, v2);
+
+  // prefix-sum the predicate bits per word; element 31 is printed as the total
+  HVX_Vector sum = Q6_Vw_prefixsum_Q(Pred);
+  printf("The sum of the predicate bits from the sf compare is %ld\n", sum[31]);
+
+  // compare 2 half float vectors
+  Pred = Q6_Q_vcmp_gt_VhfVhf(v5, v6);
+
+  // prefix-sum per halfword; the total is read from the top 16 bits of word 31
+  sum = Q6_Vh_prefixsum_Q(Pred);
+  printf("The sum of the predicate bits from the hf compare is %ld\n",
+         sum[31] >> 16);
+
+  printf("\nMin/Max instrinsics\n");
+
+  // get a vector that is the max of 2 sf vectors
+  HVX_Vector sfmax = Q6_Vsf_vmax_VsfVsf(v1, v2);
+  printf("The max value of sf v1 and sf v2 is %f\n", get_flt0_from_fltv(sfmax));
+
+  // get a vector that is the min of 2 sf vectors
+  HVX_Vector sfmin = Q6_Vsf_vmin_VsfVsf(v1, v2);
+  printf("The min value of sf v1 and sf v2 is %f\n", get_flt0_from_fltv(sfmin));
+
+  // get a vector that is the max of 2 hf vectors
+  HVX_Vector hfmax = Q6_Vhf_vmax_VhfVhf(v5, v6);
+  printf("The max value of hf v5 and sf v6 is %f\n",
+         get_flt0_from_halfv(hfmax));
+
+  // get a vector that is the min of 2 hf vectors
+  HVX_Vector hfmin = Q6_Vhf_vmin_VhfVhf(v5, v6);
+  printf("The min value of hf v5 and sf v6 is %f\n",
+         get_flt0_from_halfv(hfmin));
+
+  return 0;
+}
diff --git a/SingleSource/UnitTests/Vector/HVX/QFloat.reference_output b/SingleSource/UnitTests/Vector/HVX/QFloat.reference_output
new file mode 100644
index 0000000000..ca4e88fcd8
--- /dev/null
+++ b/SingleSource/UnitTests/Vector/HVX/QFloat.reference_output
@@ -0,0 +1,45 @@
+
+Add intrinsics with a qf32 result
+The sum of flt  0.500000 and flt  0.250000 is 0.750000
+The sum of qf32 0.500000 and qf32 0.250000 is 0.750000
+The sum of qf32 0.500000 and flt  0.250000 is 0.750000
+
+Subtract intrinsics with a qf32 result
+The sum of flt  0.500000 and flt  -0.250000 is 0.250000
+The sum of qf32 0.500000 and qf32 -0.250000 is 0.250000
+The sum of qf32 0.500000 and flt  -0.250000 is 0.250000
+
+Multiply intrinsics with a qf32 result
+The result of flt  0.500000 * flt  0.250000 is 0.125000
+The result of qf32 0.500000 * qf32 0.250000 is 0.125000
+
+Add intrinsics with a qf16 result
+The sum of hf   0.500 and hf   0.250 is 0.750
+The sum of qf16 0.500 and qf16 0.250 is 0.750
+The sum of qf16 0.500 and hf   0.250 is 0.750
+
+Subtract intrinsics with a qf16 result
+The sum of hf   0.500 and hf   -0.250 is 0.250
+The sum of qf16 0.500 and qf16 -0.250 is 0.250
+The sum of qf16 0.500 and hf   -0.250 is 0.250
+
+Multiply intrinsics with a qf16 result
+The result of hf   0.500 * hf   0.250 is 0.125
+The result of qf16 0.500 * qf16 0.250 is 0.125
+The result of qf16 0.500 * hf   0.250 is 0.125
+
+Multiply hf/qf16 intrinsics with a qf32 result
+The result of hf   0.500 * hf   0.250 is 0.125
+The result of qf16 0.500 * qf16 0.250 is 0.125
+The result of qf16 0.500 * hf   0.250 is 0.125
+
+Compare instrinsics
+The sum of the predicate bits from the sf compare is 128
+The sum of the predicate bits from the hf compare is 128
+
+Min/Max instrinsics
+The max value of sf v1 and sf v2 is 0.500000
+The min value of sf v1 and sf v2 is 0.250000
+The max value of hf v5 and sf v6 is 0.500000
+The min value of hf v5 and sf v6 is 0.250000
+exit 0

From acf9e8d53639848de444adc4e5f06bb3cc7dbcdd Mon Sep 17 00:00:00 2001
From: Brian Cain <brian.cain@oss.qualcomm.com>
Date: Mon, 24 Feb 2025 15:48:48 -0600
Subject: [PATCH 2/2] [Hexagon] Add v79 HVX cmake cache file (#210)

---
 cmake/caches/target-hexagon-v79-O2.cmake | 6 ++++++
 1 file changed, 6 insertions(+)
 create mode 100644 cmake/caches/target-hexagon-v79-O2.cmake

diff --git a/cmake/caches/target-hexagon-v79-O2.cmake b/cmake/caches/target-hexagon-v79-O2.cmake
new file mode 100644
index 0000000000..cc8d0b4289
--- /dev/null
+++ b/cmake/caches/target-hexagon-v79-O2.cmake
@@ -0,0 +1,6 @@
+# Hexagon v79 + HVX (with -mhvx-ieee-fp) at -O2, applied to Release builds.
+string(APPEND OPTFLAGS " -mv79 -mhvx -mhvx-ieee-fp")
+string(APPEND OPTFLAGS " -O2")
+set(CMAKE_C_FLAGS_RELEASE "${OPTFLAGS}" CACHE STRING "")
+set(CMAKE_CXX_FLAGS_RELEASE "${OPTFLAGS}" CACHE STRING "")
+set(CMAKE_BUILD_TYPE "Release" CACHE STRING "")