From 8daa5370ab517ac0165b424f89bdc7add540ea42 Mon Sep 17 00:00:00 2001 From: Victor Stinner Date: Fri, 29 May 2020 18:32:25 +0200 Subject: [PATCH 1/6] bpo-29882: Add _Py_popcount32() function * Rename pycore_byteswap.h to pycore_bitutils.h. * Move popcount_digit() to pycore_bitutils.h as _Py_popcount32(). * _Py_popcount32() uses GCC and clang builtin function if available. * Add unit tests to _Py_popcount32(). --- .../{pycore_byteswap.h => pycore_bitutils.h} | 29 +++++++++++- Makefile.pre.in | 2 +- Modules/_ctypes/cfield.c | 2 +- Modules/_testinternalcapi.c | 45 ++++++++++++++++++- Modules/sha256module.c | 2 +- Modules/sha512module.c | 2 +- Objects/longobject.c | 13 +++--- Objects/stringlib/codecs.h | 2 +- PCbuild/pythoncore.vcxproj | 2 +- PCbuild/pythoncore.vcxproj.filters | 2 +- Python/hamt.c | 25 ++--------- 11 files changed, 87 insertions(+), 39 deletions(-) rename Include/internal/{pycore_byteswap.h => pycore_bitutils.h} (77%) diff --git a/Include/internal/pycore_byteswap.h b/Include/internal/pycore_bitutils.h similarity index 77% rename from Include/internal/pycore_byteswap.h rename to Include/internal/pycore_bitutils.h index 5e64704a004c82..be7bdc1da9987e 100644 --- a/Include/internal/pycore_byteswap.h +++ b/Include/internal/pycore_bitutils.h @@ -1,4 +1,6 @@ -/* Bytes swap functions, reverse order of bytes: +/* Bit and bytes utilities. + + Bytes swap functions, reverse order of bytes: - _Py_bswap16(uint16_t) - _Py_bswap32(uint32_t) @@ -82,6 +84,31 @@ _Py_bswap64(uint64_t word) } +// Population count: count the number of 1's in 'u' +// (number of bits set to 1) +static inline int +_Py_popcount32(uint32_t u) +{ +#if defined(__clang__) || defined(__GNUC__) + +#if SIZEOF_INT == 4 + return __builtin_popcount(u); +#elif SIZEOF_LONG == 4 + return __builtin_popcountl(u); +#else +# error "unsupported configuration" +#endif + +#else + /* 32-bit SWAR (SIMD Within A Register) popcount */ + u -= (u >> 1) & UINT32_C(0x55555555); + u = (u & UINT32_C(0x33333333)) + ((u >> 2) & UINT32_C(0x33333333)); + u = (u + (u >> 4)) & UINT32_C(0x0f0f0f0f); + return (uint32_t)(u * UINT32_C(0x01010101)) >> 24; +#endif +} + + #ifdef __cplusplus } #endif diff --git a/Makefile.pre.in b/Makefile.pre.in index 5a18704e441985..b115e7fc01f74c 100644 --- a/Makefile.pre.in +++ b/Makefile.pre.in @@ -1121,7 +1121,7 @@ PYTHON_HEADERS= \ $(srcdir)/Include/internal/pycore_abstract.h \ $(srcdir)/Include/internal/pycore_accu.h \ $(srcdir)/Include/internal/pycore_atomic.h \ - $(srcdir)/Include/internal/pycore_byteswap.h \ + $(srcdir)/Include/internal/pycore_bitutils.h \ $(srcdir)/Include/internal/pycore_bytes_methods.h \ $(srcdir)/Include/internal/pycore_call.h \ $(srcdir)/Include/internal/pycore_ceval.h \ diff --git a/Modules/_ctypes/cfield.c b/Modules/_ctypes/cfield.c index 32a2beeb744f7c..3a9b7119201cf0 100644 --- a/Modules/_ctypes/cfield.c +++ b/Modules/_ctypes/cfield.c @@ -1,5 +1,5 @@ #include "Python.h" -#include "pycore_byteswap.h" // _Py_bswap32() +#include "pycore_bitutils.h" // _Py_bswap32() #include #ifdef MS_WIN32 diff --git a/Modules/_testinternalcapi.c b/Modules/_testinternalcapi.c index 5f217dcb8978e2..acc478b0b27cfe 100644 --- a/Modules/_testinternalcapi.c +++ b/Modules/_testinternalcapi.c @@ -12,7 +12,7 @@ #define PY_SSIZE_T_CLEAN #include "Python.h" -#include "pycore_byteswap.h" // _Py_bswap32() +#include "pycore_bitutils.h" // _Py_bswap32() #include "pycore_initconfig.h" // _Py_GetConfigsAsDict() #include "pycore_hashtable.h" // _Py_hashtable_new() #include "pycore_gc.h" // PyGC_Head @@ -63,6 +63,48 @@ test_bswap(PyObject *self, PyObject *Py_UNUSED(args)) } +static int +check_popcount(uint32_t x, int expected) +{ + // Use volatile to prevent the compiler to optimize out the whole test + volatile uint32_t u = x; + int bits = _Py_popcount32(u); + if (bits != expected) { + PyErr_Format(PyExc_AssertionError, + "_Py_popcount32(%lu) returns %i, expected %i", + (unsigned long)x, bits, expected); + return -1; + } + return 0; +} + + +static PyObject* +test_popcount(PyObject *self, PyObject *Py_UNUSED(args)) +{ + if (check_popcount(0, 0) < 0) { + return NULL; + } + if (check_popcount(1, 1) < 0) { + return NULL; + } + if (check_popcount(UINT32_C(0x100), 1) < 0) { + return NULL; + } + if (check_popcount(UINT32_C(0xffff), 16) < 0) { + return NULL; + } + if (check_popcount(UINT32_C(0x10101010), 4) < 0) { + return NULL; + } + if (check_popcount(UINT32_C(0xDEADCAFE), 22) < 0) { + return NULL; + } + + Py_RETURN_NONE; +} + + #define TO_PTR(ch) ((void*)(uintptr_t)ch) #define FROM_PTR(ptr) ((uintptr_t)ptr) #define VALUE(key) (1 + ((int)(key) - 'a')) @@ -157,6 +199,7 @@ static PyMethodDef TestMethods[] = { {"get_configs", get_configs, METH_NOARGS}, {"get_recursion_depth", get_recursion_depth, METH_NOARGS}, {"test_bswap", test_bswap, METH_NOARGS}, + {"test_popcount", test_popcount, METH_NOARGS}, {"test_hashtable", test_hashtable, METH_NOARGS}, {NULL, NULL} /* sentinel */ }; diff --git a/Modules/sha256module.c b/Modules/sha256module.c index 8edb1d53828835..261f9daee28072 100644 --- a/Modules/sha256module.c +++ b/Modules/sha256module.c @@ -17,7 +17,7 @@ /* SHA objects */ #include "Python.h" -#include "pycore_byteswap.h" // _Py_bswap32() +#include "pycore_bitutils.h" // _Py_bswap32() #include "structmember.h" // PyMemberDef #include "hashlib.h" #include "pystrhex.h" diff --git a/Modules/sha512module.c b/Modules/sha512module.c index 561ef8ef0e8676..aa2aeedcc6c649 100644 --- a/Modules/sha512module.c +++ b/Modules/sha512module.c @@ -17,7 +17,7 @@ /* SHA objects */ #include "Python.h" -#include "pycore_byteswap.h" // _Py_bswap32() +#include "pycore_bitutils.h" // _Py_bswap32() #include "structmember.h" // PyMemberDef #include "hashlib.h" #include "pystrhex.h" diff --git a/Objects/longobject.c b/Objects/longobject.c index 0b209a403c4b76..9c0aac38aa313a 100644 --- a/Objects/longobject.c +++ b/Objects/longobject.c @@ -3,8 +3,9 @@ /* XXX The functional organization of this file is terrible */ #include "Python.h" -#include "pycore_interp.h" // _PY_NSMALLPOSINTS -#include "pycore_pystate.h" // _Py_IsMainInterpreter() +#include "pycore_bitutils.h" // _Py_popcount32() +#include "pycore_interp.h" // _PY_NSMALLPOSINTS +#include "pycore_pystate.h" // _Py_IsMainInterpreter() #include "longintrepr.h" #include @@ -5307,12 +5308,8 @@ int_bit_length_impl(PyObject *self) static int popcount_digit(digit d) { - /* 32bit SWAR popcount. */ - uint32_t u = d; - u -= (u >> 1) & 0x55555555U; - u = (u & 0x33333333U) + ((u >> 2) & 0x33333333U); - u = (u + (u >> 4)) & 0x0f0f0f0fU; - return (uint32_t)(u * 0x01010101U) >> 24; + Py_BUILD_ASSERT(sizeof(digit) <= sizeof(uint32_t)); + return _Py_popcount32((uint32_t)d); } /*[clinic input] diff --git a/Objects/stringlib/codecs.h b/Objects/stringlib/codecs.h index 9b2a29ba3b8c2a..197605b012e5c6 100644 --- a/Objects/stringlib/codecs.h +++ b/Objects/stringlib/codecs.h @@ -4,7 +4,7 @@ # error "codecs.h is specific to Unicode" #endif -#include "pycore_byteswap.h" // _Py_bswap32() +#include "pycore_bitutils.h" // _Py_bswap32() /* Mask to quickly check whether a C 'long' contains a non-ASCII, UTF8-encoded char. */ diff --git a/PCbuild/pythoncore.vcxproj b/PCbuild/pythoncore.vcxproj index b6b0cf3e991ba7..8d5f99f8336a3d 100644 --- a/PCbuild/pythoncore.vcxproj +++ b/PCbuild/pythoncore.vcxproj @@ -170,7 +170,7 @@ - + diff --git a/PCbuild/pythoncore.vcxproj.filters b/PCbuild/pythoncore.vcxproj.filters index 10dfffba6113e5..7bc9f8f1664569 100644 --- a/PCbuild/pythoncore.vcxproj.filters +++ b/PCbuild/pythoncore.vcxproj.filters @@ -201,7 +201,7 @@ Include - + Include diff --git a/Python/hamt.c b/Python/hamt.c index 8801c5ea418c7f..47784baa1ef8f1 100644 --- a/Python/hamt.c +++ b/Python/hamt.c @@ -1,5 +1,6 @@ #include "Python.h" +#include "pycore_bitutils.h" // _Py_popcount32 #include "pycore_hamt.h" #include "pycore_object.h" // _PyObject_GC_TRACK() #include // offsetof() @@ -433,30 +434,10 @@ hamt_bitpos(int32_t hash, uint32_t shift) return (uint32_t)1 << hamt_mask(hash, shift); } -static inline uint32_t -hamt_bitcount(uint32_t i) -{ - /* We could use native popcount instruction but that would - require to either add configure flags to enable SSE4.2 - support or to detect it dynamically. Otherwise, we have - a risk of CPython not working properly on older hardware. - - In practice, there's no observable difference in - performance between using a popcount instruction or the - following fallback code. - - The algorithm is copied from: - https://graphics.stanford.edu/~seander/bithacks.html - */ - i = i - ((i >> 1) & 0x55555555); - i = (i & 0x33333333) + ((i >> 2) & 0x33333333); - return (((i + (i >> 4)) & 0xF0F0F0F) * 0x1010101) >> 24; -} - static inline uint32_t hamt_bitindex(uint32_t bitmap, uint32_t bit) { - return hamt_bitcount(bitmap & (bit - 1)); + return _Py_popcount32(bitmap & (bit - 1)); } @@ -820,7 +801,7 @@ hamt_node_bitmap_assoc(PyHamtNode_Bitmap *self, else { /* There was no key before with the same (shift,hash). */ - uint32_t n = hamt_bitcount(self->b_bitmap); + uint32_t n = _Py_popcount32(self->b_bitmap); if (n >= 16) { /* When we have a situation where we want to store more From 9e813d102832e71d36f87119be542be438461eff Mon Sep 17 00:00:00 2001 From: Victor Stinner Date: Fri, 29 May 2020 19:32:33 +0200 Subject: [PATCH 2/6] Enhance unit test --- Modules/_testinternalcapi.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Modules/_testinternalcapi.c b/Modules/_testinternalcapi.c index acc478b0b27cfe..de99a16fa52dde 100644 --- a/Modules/_testinternalcapi.c +++ b/Modules/_testinternalcapi.c @@ -94,7 +94,7 @@ test_popcount(PyObject *self, PyObject *Py_UNUSED(args)) if (check_popcount(UINT32_C(0xffff), 16) < 0) { return NULL; } - if (check_popcount(UINT32_C(0x10101010), 4) < 0) { + if (check_popcount(UINT32_C(0x10204080), 4) < 0) { return NULL; } if (check_popcount(UINT32_C(0xDEADCAFE), 22) < 0) { From b793ff826e57335594b61162d3e481d8ab1ba42d Mon Sep 17 00:00:00 2001 From: Victor Stinner Date: Tue, 2 Jun 2020 02:04:52 +0200 Subject: [PATCH 3/6] Use constants --- Include/internal/pycore_bitutils.h | 42 ++++++++++++++++++------------ Modules/_testinternalcapi.c | 35 ++++++++++++------------- 2 files changed, 41 insertions(+), 36 deletions(-) diff --git a/Include/internal/pycore_bitutils.h b/Include/internal/pycore_bitutils.h index be7bdc1da9987e..71027059be4cbd 100644 --- a/Include/internal/pycore_bitutils.h +++ b/Include/internal/pycore_bitutils.h @@ -85,26 +85,34 @@ _Py_bswap64(uint64_t word) // Population count: count the number of 1's in 'u' -// (number of bits set to 1) +// (number of bits set to 1), also known as the hamming weight. static inline int -_Py_popcount32(uint32_t u) +_Py_popcount32(uint32_t x) { -#if defined(__clang__) || defined(__GNUC__) - -#if SIZEOF_INT == 4 - return __builtin_popcount(u); -#elif SIZEOF_LONG == 4 - return __builtin_popcountl(u); -#else -# error "unsupported configuration" -#endif - +#if (defined(__clang__) || defined(__GNUC__)) && SIZEOF_INT == 4 + return __builtin_popcount(x); +#elif (defined(__clang__) || defined(__GNUC__)) && SIZEOF_INT == 8 + return __builtin_popcountl(x); #else - /* 32-bit SWAR (SIMD Within A Register) popcount */ - u -= (u >> 1) & UINT32_C(0x55555555); - u = (u & UINT32_C(0x33333333)) + ((u >> 2) & UINT32_C(0x33333333)); - u = (u + (u >> 4)) & UINT32_C(0x0f0f0f0f); - return (uint32_t)(u * UINT32_C(0x01010101)) >> 24; + // 32-bit SWAR (SIMD Within A Register) popcount + + // Binary: 0 1 0 1 ... + const uint32_t M1 = UINT32_C(0x55555555); + // Binary: 00 11 00 11. .. + const uint32_t M2 = UINT32_C(0x33333333); + // Binary: 0000 1111 0000 1111 ... + const uint32_t M4 = UINT32_C(0x0F0F0F0F); + // 256^4 + 256^3 + 256^2 + 256^1 + const uint32_t SUM = UINT32_C(0x01010101); + + // Put count of each 2 bits into those 2 bits + x = x - ((x >> 1) & M1); + // Put count of each 4 bits into those 4 bits + x = (x & M2) + ((x >> 2) & M2); + // Put count of each 8 bits into those 8 bits + x = (x + (x >> 4)) & M4; + // Sum of the 4 byte counts + return (uint32_t)((uint64_t)x * (uint64_t)SUM) >> 24; #endif } diff --git a/Modules/_testinternalcapi.c b/Modules/_testinternalcapi.c index de99a16fa52dde..abe3dc950214c6 100644 --- a/Modules/_testinternalcapi.c +++ b/Modules/_testinternalcapi.c @@ -82,26 +82,23 @@ check_popcount(uint32_t x, int expected) static PyObject* test_popcount(PyObject *self, PyObject *Py_UNUSED(args)) { - if (check_popcount(0, 0) < 0) { - return NULL; - } - if (check_popcount(1, 1) < 0) { - return NULL; - } - if (check_popcount(UINT32_C(0x100), 1) < 0) { - return NULL; - } - if (check_popcount(UINT32_C(0xffff), 16) < 0) { - return NULL; - } - if (check_popcount(UINT32_C(0x10204080), 4) < 0) { - return NULL; - } - if (check_popcount(UINT32_C(0xDEADCAFE), 22) < 0) { - return NULL; - } - +#define CHECK(X, RESULT) \ + do { \ + if (check_popcount(UINT32_C(X), RESULT) < 0) { \ + return NULL; \ + } \ + } while (0) + + CHECK(0, 0); + CHECK(1, 1); + CHECK(0x08080808, 4); + CHECK(0x10101010, 4); + CHECK(0x10204080, 4); + CHECK(0xDEADCAFE, 22); + CHECK(0xFFFFFFFF, 32); Py_RETURN_NONE; + +#undef CHECK } From 585f9067c752a13f1794be7e0a04cb2fe79019cb Mon Sep 17 00:00:00 2001 From: Victor Stinner Date: Tue, 2 Jun 2020 02:35:28 +0200 Subject: [PATCH 4/6] Add implementation note on SSE4a --- Include/internal/pycore_bitutils.h | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/Include/internal/pycore_bitutils.h b/Include/internal/pycore_bitutils.h index 71027059be4cbd..ca8dc981cc5645 100644 --- a/Include/internal/pycore_bitutils.h +++ b/Include/internal/pycore_bitutils.h @@ -86,6 +86,12 @@ _Py_bswap64(uint64_t word) // Population count: count the number of 1's in 'u' // (number of bits set to 1), also known as the hamming weight. +// +// Implementation note. CPUID is not used, to test if x86 POPCNT instruction +// can be used, to keep the implementation simple. For example, Visual Studio +// __popcnt() is not used this reason. The clang and GCC builtin function can +// use the x86 POPCNT instruction if the target architecture has SSE4a or +// newer. static inline int _Py_popcount32(uint32_t x) { From 48df7e3b3880c877f714c009adf8db5418b2f12b Mon Sep 17 00:00:00 2001 From: Victor Stinner Date: Tue, 2 Jun 2020 02:41:55 +0200 Subject: [PATCH 5/6] hamt.c: cast result explicitly --- Python/hamt.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Python/hamt.c b/Python/hamt.c index 47784baa1ef8f1..e272e8808fd956 100644 --- a/Python/hamt.c +++ b/Python/hamt.c @@ -437,7 +437,7 @@ hamt_bitpos(int32_t hash, uint32_t shift) static inline uint32_t hamt_bitindex(uint32_t bitmap, uint32_t bit) { - return _Py_popcount32(bitmap & (bit - 1)); + return (uint32_t)_Py_popcount32(bitmap & (bit - 1)); } @@ -801,7 +801,7 @@ hamt_node_bitmap_assoc(PyHamtNode_Bitmap *self, else { /* There was no key before with the same (shift,hash). */ - uint32_t n = _Py_popcount32(self->b_bitmap); + uint32_t n = (uint32_t)_Py_popcount32(self->b_bitmap); if (n >= 16) { /* When we have a situation where we want to store more From 5838d2751e34c106df9299d18f1125a8844c738e Mon Sep 17 00:00:00 2001 From: Victor Stinner Date: Sun, 7 Jun 2020 23:09:06 +0200 Subject: [PATCH 6/6] Address Mark's review * Remove UINT32_C() * Fix #ifdef to decide if builtin functions are used of not: use SIZEOF_INT >= 4. Always use __builtin_popcountl() otherwise. * popcount_digit(): check PyLong_SHIFT rather than sizeof(digit). * Fix also two comments --- Include/internal/pycore_bitutils.h | 24 ++++++++++++++++-------- Modules/_testinternalcapi.c | 2 +- Objects/longobject.c | 4 +++- 3 files changed, 20 insertions(+), 10 deletions(-) diff --git a/Include/internal/pycore_bitutils.h b/Include/internal/pycore_bitutils.h index ca8dc981cc5645..36ffe23b9ff264 100644 --- a/Include/internal/pycore_bitutils.h +++ b/Include/internal/pycore_bitutils.h @@ -84,7 +84,7 @@ _Py_bswap64(uint64_t word) } -// Population count: count the number of 1's in 'u' +// Population count: count the number of 1's in 'x' // (number of bits set to 1), also known as the hamming weight. // // Implementation note. CPUID is not used, to test if x86 POPCNT instruction @@ -95,21 +95,29 @@ _Py_bswap64(uint64_t word) static inline int _Py_popcount32(uint32_t x) { -#if (defined(__clang__) || defined(__GNUC__)) && SIZEOF_INT == 4 +#if (defined(__clang__) || defined(__GNUC__)) + +#if SIZEOF_INT >= 4 + Py_BUILD_ASSERT(sizeof(x) <= sizeof(unsigned int)); return __builtin_popcount(x); -#elif (defined(__clang__) || defined(__GNUC__)) && SIZEOF_INT == 8 +#else + // The C standard guarantees that unsigned long will always be big enough + // to hold a uint32_t value without losing information. + Py_BUILD_ASSERT(sizeof(x) <= sizeof(unsigned long)); return __builtin_popcountl(x); +#endif + #else // 32-bit SWAR (SIMD Within A Register) popcount // Binary: 0 1 0 1 ... - const uint32_t M1 = UINT32_C(0x55555555); + const uint32_t M1 = 0x55555555; // Binary: 00 11 00 11. .. - const uint32_t M2 = UINT32_C(0x33333333); + const uint32_t M2 = 0x33333333; // Binary: 0000 1111 0000 1111 ... - const uint32_t M4 = UINT32_C(0x0F0F0F0F); - // 256^4 + 256^3 + 256^2 + 256^1 - const uint32_t SUM = UINT32_C(0x01010101); + const uint32_t M4 = 0x0F0F0F0F; + // 256**4 + 256**3 + 256**2 + 256**1 + const uint32_t SUM = 0x01010101; // Put count of each 2 bits into those 2 bits x = x - ((x >> 1) & M1); diff --git a/Modules/_testinternalcapi.c b/Modules/_testinternalcapi.c index abe3dc950214c6..6d5af5917f1f07 100644 --- a/Modules/_testinternalcapi.c +++ b/Modules/_testinternalcapi.c @@ -84,7 +84,7 @@ test_popcount(PyObject *self, PyObject *Py_UNUSED(args)) { #define CHECK(X, RESULT) \ do { \ - if (check_popcount(UINT32_C(X), RESULT) < 0) { \ + if (check_popcount(X, RESULT) < 0) { \ return NULL; \ } \ } while (0) diff --git a/Objects/longobject.c b/Objects/longobject.c index 9c0aac38aa313a..ce10c4f66586a1 100644 --- a/Objects/longobject.c +++ b/Objects/longobject.c @@ -5308,7 +5308,9 @@ int_bit_length_impl(PyObject *self) static int popcount_digit(digit d) { - Py_BUILD_ASSERT(sizeof(digit) <= sizeof(uint32_t)); + // digit can be larger than uint32_t, but only PyLong_SHIFT bits + // of it will be ever used. + Py_BUILD_ASSERT(PyLong_SHIFT <= 32); return _Py_popcount32((uint32_t)d); }