Skip to content

Commit 4d14ccd

Browse files
committed
Use native CRC instructions on 64-bit LoongArch
As with the Intel and Arm CRC instructions, compiler intrinsics for them must be supported by the compiler. In contrast, no runtime check is needed. Aligned memory access is faster, so use the Arm coding as a model. YANG Xudong Discussion: https://postgr.es/m/b522a0c5-e3b2-99cc-6387-58134fb88cbe%40ymatrix.cn
1 parent fa2e874 commit 4d14ccd

File tree

8 files changed

+240
-17
lines changed

8 files changed

+240
-17
lines changed

config/c-compiler.m4

+33
Original file line numberDiff line numberDiff line change
@@ -661,3 +661,36 @@ if test x"$Ac_cachevar" = x"yes"; then
661661
fi
662662
undefine([Ac_cachevar])dnl
663663
])# PGAC_ARMV8_CRC32C_INTRINSICS
664+
665+
# PGAC_LOONGARCH_CRC32C_INTRINSICS
666+
# ---------------------------
667+
# Check if the compiler supports the LoongArch CRCC instructions, using
668+
# __builtin_loongarch_crcc_w_b_w, __builtin_loongarch_crcc_w_h_w,
669+
# __builtin_loongarch_crcc_w_w_w and __builtin_loongarch_crcc_w_d_w
670+
# intrinsic functions.
671+
#
672+
# We test for the 8-byte variant since platforms capable of running
673+
# Postgres are 64-bit only (as of PG17), and we know CRC instructions
674+
# are available there without a runtime check.
675+
#
676+
# If the intrinsics are supported, sets pgac_loongarch_crc32c_intrinsics to yes.
677+
AC_DEFUN([PGAC_LOONGARCH_CRC32C_INTRINSICS],
678+
[define([Ac_cachevar], [AS_TR_SH([pgac_cv_loongarch_crc32c_intrinsics])])dnl
679+
AC_CACHE_CHECK(
680+
[for __builtin_loongarch_crcc_w_b_w, __builtin_loongarch_crcc_w_h_w, __builtin_loongarch_crcc_w_w_w and __builtin_loongarch_crcc_w_d_w],
681+
[Ac_cachevar],
682+
[AC_LINK_IFELSE([AC_LANG_PROGRAM([],
683+
[unsigned int crc = 0;
684+
crc = __builtin_loongarch_crcc_w_b_w(0, crc);
685+
crc = __builtin_loongarch_crcc_w_h_w(0, crc);
686+
crc = __builtin_loongarch_crcc_w_w_w(0, crc);
687+
crc = __builtin_loongarch_crcc_w_d_w(0, crc);
688+
/* return computed value, to prevent the above being optimized away */
689+
return crc == 0;])],
690+
[Ac_cachevar=yes],
691+
[Ac_cachevar=no])])
692+
if test x"$Ac_cachevar" = x"yes"; then
693+
pgac_loongarch_crc32c_intrinsics=yes
694+
fi
695+
undefine([Ac_cachevar])dnl
696+
])# PGAC_LOONGARCH_CRC32C_INTRINSICS

configure

+66-8
Original file line numberDiff line numberDiff line change
@@ -18047,6 +18047,47 @@ fi
1804718047

1804818048
fi
1804918049

18050+
# Check for LoongArch CRC intrinsics to do CRC calculations.
18051+
#
18052+
# Check if __builtin_loongarch_crcc_* intrinsics can be used
18053+
# with the default compiler flags.
18054+
{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for __builtin_loongarch_crcc_w_b_w, __builtin_loongarch_crcc_w_h_w, __builtin_loongarch_crcc_w_w_w and __builtin_loongarch_crcc_w_d_w" >&5
18055+
$as_echo_n "checking for __builtin_loongarch_crcc_w_b_w, __builtin_loongarch_crcc_w_h_w, __builtin_loongarch_crcc_w_w_w and __builtin_loongarch_crcc_w_d_w... " >&6; }
18056+
if ${pgac_cv_loongarch_crc32c_intrinsics+:} false; then :
18057+
$as_echo_n "(cached) " >&6
18058+
else
18059+
cat confdefs.h - <<_ACEOF >conftest.$ac_ext
18060+
/* end confdefs.h. */
18061+
18062+
int
18063+
main ()
18064+
{
18065+
unsigned int crc = 0;
18066+
crc = __builtin_loongarch_crcc_w_b_w(0, crc);
18067+
crc = __builtin_loongarch_crcc_w_h_w(0, crc);
18068+
crc = __builtin_loongarch_crcc_w_w_w(0, crc);
18069+
crc = __builtin_loongarch_crcc_w_d_w(0, crc);
18070+
/* return computed value, to prevent the above being optimized away */
18071+
return crc == 0;
18072+
;
18073+
return 0;
18074+
}
18075+
_ACEOF
18076+
if ac_fn_c_try_link "$LINENO"; then :
18077+
pgac_cv_loongarch_crc32c_intrinsics=yes
18078+
else
18079+
pgac_cv_loongarch_crc32c_intrinsics=no
18080+
fi
18081+
rm -f core conftest.err conftest.$ac_objext \
18082+
conftest$ac_exeext conftest.$ac_ext
18083+
fi
18084+
{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv_loongarch_crc32c_intrinsics" >&5
18085+
$as_echo "$pgac_cv_loongarch_crc32c_intrinsics" >&6; }
18086+
if test x"$pgac_cv_loongarch_crc32c_intrinsics" = x"yes"; then
18087+
pgac_loongarch_crc32c_intrinsics=yes
18088+
fi
18089+
18090+
1805018091

1805118092

1805218093
# Select CRC-32C implementation.
@@ -18063,9 +18104,12 @@ fi
1806318104
# we're not targeting such a processor, but can nevertheless produce code that
1806418105
# uses the CRC instructions, compile both, and select at runtime.
1806518106
#
18066-
# You can override this logic by setting the appropriate USE_*_CRC32 flag to 1
18107+
# You can skip the runtime check by setting the appropriate USE_*_CRC32C flag to 1
1806718108
# in the template or configure command line.
18068-
if test x"$USE_SLICING_BY_8_CRC32C" = x"" && test x"$USE_SSE42_CRC32C" = x"" && test x"$USE_SSE42_CRC32C_WITH_RUNTIME_CHECK" = x"" && test x"$USE_ARMV8_CRC32C" = x"" && test x"$USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK" = x""; then
18109+
#
18110+
# If we are targeting a LoongArch processor, CRC instructions are
18111+
# always available (at least on 64 bit), so no runtime check is needed.
18112+
if test x"$USE_SLICING_BY_8_CRC32C" = x"" && test x"$USE_SSE42_CRC32C" = x"" && test x"$USE_SSE42_CRC32C_WITH_RUNTIME_CHECK" = x"" && test x"$USE_ARMV8_CRC32C" = x"" && test x"$USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK" = x"" && test x"$USE_LOONGARCH_CRC32C" = x""; then
1806918113
# Use Intel SSE 4.2 if available.
1807018114
if test x"$pgac_sse42_crc32_intrinsics" = x"yes" && test x"$SSE4_2_TARGETED" = x"1" ; then
1807118115
USE_SSE42_CRC32C=1
@@ -18083,10 +18127,15 @@ if test x"$USE_SLICING_BY_8_CRC32C" = x"" && test x"$USE_SSE42_CRC32C" = x"" &&
1808318127
if test x"$pgac_armv8_crc32c_intrinsics" = x"yes"; then
1808418128
USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK=1
1808518129
else
18086-
# fall back to slicing-by-8 algorithm, which doesn't require any
18087-
# special CPU support.
18088-
USE_SLICING_BY_8_CRC32C=1
18089-
fi
18130+
# LoongArch CRCC instructions.
18131+
if test x"$pgac_loongarch_crc32c_intrinsics" = x"yes"; then
18132+
USE_LOONGARCH_CRC32C=1
18133+
else
18134+
# fall back to slicing-by-8 algorithm, which doesn't require any
18135+
# special CPU support.
18136+
USE_SLICING_BY_8_CRC32C=1
18137+
fi
18138+
fi
1809018139
fi
1809118140
fi
1809218141
fi
@@ -18127,12 +18176,21 @@ $as_echo "#define USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK 1" >>confdefs.h
1812718176
{ $as_echo "$as_me:${as_lineno-$LINENO}: result: ARMv8 CRC instructions with runtime check" >&5
1812818177
$as_echo "ARMv8 CRC instructions with runtime check" >&6; }
1812918178
else
18179+
if test x"$USE_LOONGARCH_CRC32C" = x"1"; then
18180+
18181+
$as_echo "#define USE_LOONGARCH_CRC32C 1" >>confdefs.h
18182+
18183+
PG_CRC32C_OBJS="pg_crc32c_loongarch.o"
18184+
{ $as_echo "$as_me:${as_lineno-$LINENO}: result: LoongArch CRCC instructions" >&5
18185+
$as_echo "LoongArch CRCC instructions" >&6; }
18186+
else
1813018187

1813118188
$as_echo "#define USE_SLICING_BY_8_CRC32C 1" >>confdefs.h
1813218189

18133-
PG_CRC32C_OBJS="pg_crc32c_sb8.o"
18134-
{ $as_echo "$as_me:${as_lineno-$LINENO}: result: slicing-by-8" >&5
18190+
PG_CRC32C_OBJS="pg_crc32c_sb8.o"
18191+
{ $as_echo "$as_me:${as_lineno-$LINENO}: result: slicing-by-8" >&5
1813518192
$as_echo "slicing-by-8" >&6; }
18193+
fi
1813618194
fi
1813718195
fi
1813818196
fi

configure.ac

+29-9
Original file line numberDiff line numberDiff line change
@@ -2099,6 +2099,12 @@ if test x"$pgac_armv8_crc32c_intrinsics" != x"yes"; then
20992099
PGAC_ARMV8_CRC32C_INTRINSICS([-march=armv8-a+crc])
21002100
fi
21012101

2102+
# Check for LoongArch CRC intrinsics to do CRC calculations.
2103+
#
2104+
# Check if __builtin_loongarch_crcc_* intrinsics can be used
2105+
# with the default compiler flags.
2106+
PGAC_LOONGARCH_CRC32C_INTRINSICS()
2107+
21022108
AC_SUBST(CFLAGS_CRC)
21032109

21042110
# Select CRC-32C implementation.
@@ -2115,9 +2121,12 @@ AC_SUBST(CFLAGS_CRC)
21152121
# we're not targeting such a processor, but can nevertheless produce code that
21162122
# uses the CRC instructions, compile both, and select at runtime.
21172123
#
2118-
# You can override this logic by setting the appropriate USE_*_CRC32 flag to 1
2124+
# You can skip the runtime check by setting the appropriate USE_*_CRC32C flag to 1
21192125
# in the template or configure command line.
2120-
if test x"$USE_SLICING_BY_8_CRC32C" = x"" && test x"$USE_SSE42_CRC32C" = x"" && test x"$USE_SSE42_CRC32C_WITH_RUNTIME_CHECK" = x"" && test x"$USE_ARMV8_CRC32C" = x"" && test x"$USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK" = x""; then
2126+
#
2127+
# If we are targeting a LoongArch processor, CRC instructions are
2128+
# always available (at least on 64 bit), so no runtime check is needed.
2129+
if test x"$USE_SLICING_BY_8_CRC32C" = x"" && test x"$USE_SSE42_CRC32C" = x"" && test x"$USE_SSE42_CRC32C_WITH_RUNTIME_CHECK" = x"" && test x"$USE_ARMV8_CRC32C" = x"" && test x"$USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK" = x"" && test x"$USE_LOONGARCH_CRC32C" = x""; then
21212130
# Use Intel SSE 4.2 if available.
21222131
if test x"$pgac_sse42_crc32_intrinsics" = x"yes" && test x"$SSE4_2_TARGETED" = x"1" ; then
21232132
USE_SSE42_CRC32C=1
@@ -2135,10 +2144,15 @@ if test x"$USE_SLICING_BY_8_CRC32C" = x"" && test x"$USE_SSE42_CRC32C" = x"" &&
21352144
if test x"$pgac_armv8_crc32c_intrinsics" = x"yes"; then
21362145
USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK=1
21372146
else
2138-
# fall back to slicing-by-8 algorithm, which doesn't require any
2139-
# special CPU support.
2140-
USE_SLICING_BY_8_CRC32C=1
2141-
fi
2147+
# LoongArch CRCC instructions.
2148+
if test x"$pgac_loongarch_crc32c_intrinsics" = x"yes"; then
2149+
USE_LOONGARCH_CRC32C=1
2150+
else
2151+
# fall back to slicing-by-8 algorithm, which doesn't require any
2152+
# special CPU support.
2153+
USE_SLICING_BY_8_CRC32C=1
2154+
fi
2155+
fi
21422156
fi
21432157
fi
21442158
fi
@@ -2166,9 +2180,15 @@ else
21662180
PG_CRC32C_OBJS="pg_crc32c_armv8.o pg_crc32c_sb8.o pg_crc32c_armv8_choose.o"
21672181
AC_MSG_RESULT(ARMv8 CRC instructions with runtime check)
21682182
else
2169-
AC_DEFINE(USE_SLICING_BY_8_CRC32C, 1, [Define to 1 to use software CRC-32C implementation (slicing-by-8).])
2170-
PG_CRC32C_OBJS="pg_crc32c_sb8.o"
2171-
AC_MSG_RESULT(slicing-by-8)
2183+
if test x"$USE_LOONGARCH_CRC32C" = x"1"; then
2184+
AC_DEFINE(USE_LOONGARCH_CRC32C, 1, [Define to 1 to use LoongArch CRCC instructions.])
2185+
PG_CRC32C_OBJS="pg_crc32c_loongarch.o"
2186+
AC_MSG_RESULT(LoongArch CRCC instructions)
2187+
else
2188+
AC_DEFINE(USE_SLICING_BY_8_CRC32C, 1, [Define to 1 to use software CRC-32C implementation (slicing-by-8).])
2189+
PG_CRC32C_OBJS="pg_crc32c_sb8.o"
2190+
AC_MSG_RESULT(slicing-by-8)
2191+
fi
21722192
fi
21732193
fi
21742194
fi

meson.build

+24
Original file line numberDiff line numberDiff line change
@@ -2065,6 +2065,30 @@ int main(void)
20652065
cdata.set('USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK', 1)
20662066
have_optimized_crc = true
20672067
endif
2068+
2069+
elif host_cpu == 'loongarch64'
2070+
2071+
prog = '''
2072+
int main(void)
2073+
{
2074+
unsigned int crc = 0;
2075+
crc = __builtin_loongarch_crcc_w_b_w(0, crc);
2076+
crc = __builtin_loongarch_crcc_w_h_w(0, crc);
2077+
crc = __builtin_loongarch_crcc_w_w_w(0, crc);
2078+
crc = __builtin_loongarch_crcc_w_d_w(0, crc);
2079+
2080+
/* return computed value, to prevent the above being optimized away */
2081+
return crc == 0;
2082+
}
2083+
'''
2084+
2085+
if cc.links(prog, name: '__builtin_loongarch_crcc_w_b_w, __builtin_loongarch_crcc_w_h_w, __builtin_loongarch_crcc_w_w_w, and __builtin_loongarch_crcc_w_d_w',
2086+
args: test_c_args)
2087+
# Use LoongArch CRC instructions unconditionally
2088+
cdata.set('USE_LOONGARCH_CRC32C', 1)
2089+
have_optimized_crc = true
2090+
endif
2091+
20682092
endif
20692093

20702094
if not have_optimized_crc

src/include/pg_config.h.in

+3
Original file line numberDiff line numberDiff line change
@@ -714,6 +714,9 @@
714714
/* Define to 1 to build with LLVM based JIT support. (--with-llvm) */
715715
#undef USE_LLVM
716716

717+
/* Define to 1 to use LoongArch CRCC instructions. */
718+
#undef USE_LOONGARCH_CRC32C
719+
717720
/* Define to 1 to build with LZ4 support. (--with-lz4) */
718721
#undef USE_LZ4
719722

src/include/port/pg_crc32c.h

+9
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,15 @@ extern pg_crc32c pg_comp_crc32c_sse42(pg_crc32c crc, const void *data, size_t le
5858

5959
extern pg_crc32c pg_comp_crc32c_armv8(pg_crc32c crc, const void *data, size_t len);
6060

61+
#elif defined(USE_LOONGARCH_CRC32C)
62+
/* Use LoongArch CRCC instructions. */
63+
64+
#define COMP_CRC32C(crc, data, len) \
65+
((crc) = pg_comp_crc32c_loongarch((crc), (data), (len)))
66+
#define FIN_CRC32C(crc) ((crc) ^= 0xFFFFFFFF)
67+
68+
extern pg_crc32c pg_comp_crc32c_loongarch(pg_crc32c crc, const void *data, size_t len);
69+
6170
#elif defined(USE_SSE42_CRC32C_WITH_RUNTIME_CHECK) || defined(USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK)
6271

6372
/*

src/port/meson.build

+3
Original file line numberDiff line numberDiff line change
@@ -92,6 +92,9 @@ replace_funcs_pos = [
9292
['pg_crc32c_armv8_choose', 'USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK'],
9393
['pg_crc32c_sb8', 'USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK'],
9494

95+
# loongarch
96+
['pg_crc32c_loongarch', 'USE_LOONGARCH_CRC32C'],
97+
9598
# generic fallback
9699
['pg_crc32c_sb8', 'USE_SLICING_BY_8_CRC32C'],
97100
]

src/port/pg_crc32c_loongarch.c

+73
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,73 @@
1+
/*-------------------------------------------------------------------------
2+
*
3+
* pg_crc32c_loongarch.c
4+
* Compute CRC-32C checksum using LoongArch CRCC instructions
5+
*
6+
* Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
7+
* Portions Copyright (c) 1994, Regents of the University of California
8+
*
9+
*
10+
* IDENTIFICATION
11+
* src/port/pg_crc32c_loongarch.c
12+
*
13+
*-------------------------------------------------------------------------
14+
*/
15+
#include "c.h"
16+
17+
#include "port/pg_crc32c.h"
18+
19+
pg_crc32c
20+
pg_comp_crc32c_loongarch(pg_crc32c crc, const void *data, size_t len)
21+
{
22+
const unsigned char *p = data;
23+
const unsigned char *pend = p + len;
24+
25+
/*
26+
* LoongArch doesn't require alignment, but aligned memory access is
27+
* significantly faster. Process leading bytes so that the loop below
28+
* starts with a pointer aligned to eight bytes.
29+
*/
30+
if (!PointerIsAligned(p, uint16) &&
31+
p + 1 <= pend)
32+
{
33+
crc = __builtin_loongarch_crcc_w_b_w(*p, crc);
34+
p += 1;
35+
}
36+
if (!PointerIsAligned(p, uint32) &&
37+
p + 2 <= pend)
38+
{
39+
crc = __builtin_loongarch_crcc_w_h_w(*(uint16 *) p, crc);
40+
p += 2;
41+
}
42+
if (!PointerIsAligned(p, uint64) &&
43+
p + 4 <= pend)
44+
{
45+
crc = __builtin_loongarch_crcc_w_w_w(*(uint32 *) p, crc);
46+
p += 4;
47+
}
48+
49+
/* Process eight bytes at a time, as far as we can. */
50+
while (p + 8 <= pend)
51+
{
52+
crc = __builtin_loongarch_crcc_w_d_w(*(uint64 *) p, crc);
53+
p += 8;
54+
}
55+
56+
/* Process remaining 0-7 bytes. */
57+
if (p + 4 <= pend)
58+
{
59+
crc = __builtin_loongarch_crcc_w_w_w(*(uint32 *) p, crc);
60+
p += 4;
61+
}
62+
if (p + 2 <= pend)
63+
{
64+
crc = __builtin_loongarch_crcc_w_h_w(*(uint16 *) p, crc);
65+
p += 2;
66+
}
67+
if (p < pend)
68+
{
69+
crc = __builtin_loongarch_crcc_w_b_w(*p, crc);
70+
}
71+
72+
return crc;
73+
}

0 commit comments

Comments
 (0)