micropython · dpgeorge · Jun 24, 2024 · Apr 16, 2024 · Nov 29, 2023 · projectgus
diff --git a/docs/library/builtins.rst b/docs/library/builtins.rst
@@ -82,6 +82,10 @@ Functions and types
       In MicroPython, `byteorder` parameter must be positional (this is
       compatible with CPython).
 
+      .. note:: The optional ``signed`` kwarg from CPython is not supported.
+                MicroPython currently converts negative integers as signed,
+                and positive as unsigned. (:ref:`Details <cpydiff_types_int_to_bytes>`.)
+
 .. function:: isinstance()
 
 .. function:: issubclass()

diff --git a/py/asmthumb.c b/py/asmthumb.c
@@ -35,23 +35,7 @@
 
 #include "py/mpstate.h"
 #include "py/asmthumb.h"
-
-#ifdef _MSC_VER
-#include <intrin.h>
-
-static uint32_t mp_clz(uint32_t x) {
-    unsigned long lz = 0;
-    return _BitScanReverse(&lz, x) ? (sizeof(x) * 8 - 1) - lz : 0;
-}
-
-static uint32_t mp_ctz(uint32_t x) {
-    unsigned long tz = 0;
-    return _BitScanForward(&tz, x) ? tz : 0;
-}
-#else
-#define mp_clz(x) __builtin_clz(x)
-#define mp_ctz(x) __builtin_ctz(x)
-#endif
+#include "py/misc.h"
 
 #define UNSIGNED_FIT5(x) ((uint32_t)(x) < 32)
 #define UNSIGNED_FIT7(x) ((uint32_t)(x) < 128)

diff --git a/py/misc.h b/py/misc.h
@@ -334,4 +334,55 @@ typedef const char *mp_rom_error_text_t;
 // For now, forward directly to MP_COMPRESSED_ROM_TEXT.
 #define MP_ERROR_TEXT(x) (mp_rom_error_text_t)MP_COMPRESSED_ROM_TEXT(x)
 
+// Portable implementations of CLZ and CTZ intrinsics
+#ifdef _MSC_VER
+#include <intrin.h>
+
+static inline uint32_t mp_clz(uint32_t x) { // count of leading zero bits in x; this MSVC version yields 0 for x == 0
+    unsigned long lz = 0; // receives the index of the highest set bit
+    return _BitScanReverse(&lz, x) ? (sizeof(x) * 8 - 1) - lz : 0;
+}
+
+static inline uint32_t mp_clzl(unsigned long x) { // count of leading zero bits in an unsigned long; yields 0 for x == 0
+    unsigned long lz = 0; // receives the index of the highest set bit
+    return _BitScanReverse(&lz, x) ? (sizeof(x) * 8 - 1) - lz : 0;
+}
+
+#ifdef _WIN64
+static inline uint32_t mp_clzll(unsigned long long x) { // count of leading zero bits in a 64-bit value; yields 0 for x == 0
+    unsigned long lz = 0; // receives the index of the highest set bit
+    return _BitScanReverse64(&lz, x) ? (sizeof(x) * 8 - 1) - lz : 0;
+}
+#else
+// Microsoft doesn't ship _BitScanReverse64 on Win32, so emulate it with two 32-bit scans
+static inline uint32_t mp_clzll(unsigned long long x) {
+    unsigned long h = x >> 32; // upper half of x
+    return h ? mp_clzl(h) : (mp_clzl(x) + 32); // x is narrowed to unsigned long here (its low half, assuming 32-bit long as on MSVC)
+}
+#endif
+
+static inline uint32_t mp_ctz(uint32_t x) { // count of trailing zero bits in x; this MSVC version yields 0 for x == 0
+    unsigned long tz = 0; // receives the index of the lowest set bit
+    return _BitScanForward(&tz, x) ? tz : 0;
+}
+#else
+#define mp_clz(x) __builtin_clz(x)
+#define mp_clzl(x) __builtin_clzl(x)
+#define mp_clzll(x) __builtin_clzll(x)
+#define mp_ctz(x) __builtin_ctz(x)
+#endif
+
+// Leading-zero count for an mp_int_t, which can be larger than long (e.g. Windows 64-bit, nan-box variants)
+static inline uint32_t mp_clz_mpi(mp_int_t x) {
+    MP_STATIC_ASSERT(sizeof(mp_int_t) == sizeof(long long)
+        || sizeof(mp_int_t) == sizeof(long));
+
+    // ugly, but the sizeof comparison is a constant so this should compile to a single intrinsic unless O0 is set
+    if (sizeof(mp_int_t) == sizeof(long)) {
+        return mp_clzl(x);
+    } else {
+        return mp_clzll(x);
+    }
+}
+
 #endif // MICROPY_INCLUDED_PY_MISC_H
diff --git a/py/mpz.c b/py/mpz.c
@@ -1589,7 +1589,7 @@ bool mpz_as_uint_checked(const mpz_t *i, mp_uint_t *value) {
     return true;
 }
 
-void mpz_as_bytes(const mpz_t *z, bool big_endian, size_t len, byte *buf) {
+bool mpz_as_bytes(const mpz_t *z, bool big_endian, bool as_signed, size_t len, byte *buf) {
     byte *b = buf;
     if (big_endian) {
         b += len;
@@ -1598,6 +1598,8 @@ void mpz_as_bytes(const mpz_t *z, bool big_endian, size_t len, byte *buf) {
     int bits = 0;
     mpz_dbl_dig_t d = 0;
     mpz_dbl_dig_t carry = 1;
+    size_t olen = len; // bytes in output buffer
+    bool ok = true;
     for (size_t zlen = z->len; zlen > 0; --zlen) {
         bits += DIG_SIZE;
         d = (d << DIG_SIZE) | *zdig++;
@@ -1607,28 +1609,32 @@ void mpz_as_bytes(const mpz_t *z, bool big_endian, size_t len, byte *buf) {
                 val = (~val & 0xff) + carry;
                 carry = val >> 8;
             }
+
+            if (!olen) {
+                // Buffer is full, only OK if all remaining bytes are zeroes
+                ok = ok && ((byte)val == 0);
+                continue;
+            }
+
             if (big_endian) {
                 *--b = val;
-                if (b == buf) {
-                    return;
-                }
             } else {
                 *b++ = val;
-                if (b == buf + len) {
-                    return;
-                }
             }
+            olen--;
         }
     }
 
-    // fill remainder of buf with zero/sign extension of the integer
-    if (big_endian) {
-        len = b - buf;
+    if (as_signed && olen == 0 && len > 0) {
+        // If output exhausted then ensure there was enough space for the sign bit
+        byte most_sig = big_endian ? buf[0] : buf[len - 1];
+        ok = ok && (bool)(most_sig & 0x80) == (bool)z->neg;
     } else {
-        len = buf + len - b;
-        buf = b;
+        // fill remainder of buf with zero/sign extension of the integer
+        memset(big_endian ? buf : b, z->neg ? 0xff : 0x00, olen);
     }
-    memset(buf, z->neg ? 0xff : 0x00, len);
+
+    return ok;
 }
 
 #if MICROPY_PY_BUILTINS_FLOAT

diff --git a/py/mpz.h b/py/mpz.h
@@ -93,9 +93,9 @@ typedef int8_t mpz_dbl_dig_signed_t;
 typedef struct _mpz_t {
     // Zero has neg=0, len=0.  Negative zero is not allowed.
     size_t neg : 1;
-    size_t fixed_dig : 1;
-    size_t alloc : (8 * sizeof(size_t) - 2);
-    size_t len;
+    size_t fixed_dig : 1; // flag, 'dig' buffer cannot be reallocated
+    size_t alloc : (8 * sizeof(size_t) - 2); // number of entries allocated in 'dig'
+    size_t len; // number of entries used in 'dig'
     mpz_dig_t *dig;
 } mpz_t;
 
@@ -145,7 +145,8 @@ static inline size_t mpz_max_num_bits(const mpz_t *z) {
 mp_int_t mpz_hash(const mpz_t *z);
 bool mpz_as_int_checked(const mpz_t *z, mp_int_t *value);
 bool mpz_as_uint_checked(const mpz_t *z, mp_uint_t *value);
-void mpz_as_bytes(const mpz_t *z, bool big_endian, size_t len, byte *buf);
+// Returns true if 'z' fit into 'len' bytes of 'buf' without overflowing, 'buf' is truncated otherwise.
+bool mpz_as_bytes(const mpz_t *z, bool big_endian, bool as_signed, size_t len, byte *buf);
 #if MICROPY_PY_BUILTINS_FLOAT
 mp_float_t mpz_as_float(const mpz_t *z);
 #endif

diff --git a/py/objint.c b/py/objint.c
@@ -421,29 +421,50 @@ static MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(int_from_bytes_fun_obj, 3, 4, int_fro
 static MP_DEFINE_CONST_CLASSMETHOD_OBJ(int_from_bytes_obj, MP_ROM_PTR(&int_from_bytes_fun_obj));
 
 static mp_obj_t int_to_bytes(size_t n_args, const mp_obj_t *args) {
-    // TODO: Support signed param (assumes signed=False)
+    // TODO: Support signed (currently behaves as if signed=(val < 0))
     (void)n_args;
+    bool overflow;
 
-    mp_int_t len = mp_obj_get_int(args[1]);
-    if (len < 0) {
+    mp_int_t dlen = mp_obj_get_int(args[1]);
+    if (dlen < 0) {
         mp_raise_ValueError(NULL);
     }
     bool big_endian = args[2] != MP_OBJ_NEW_QSTR(MP_QSTR_little);
 
     vstr_t vstr;
-    vstr_init_len(&vstr, len);
+    vstr_init_len(&vstr, dlen);
     byte *data = (byte *)vstr.buf;
-    memset(data, 0, len);
 
     #if MICROPY_LONGINT_IMPL != MICROPY_LONGINT_IMPL_NONE
     if (!mp_obj_is_small_int(args[0])) {
-        mp_obj_int_to_bytes_impl(args[0], big_endian, len, data);
+        overflow = !mp_obj_int_to_bytes_impl(args[0], big_endian, dlen, data);
     } else
     #endif
     {
         mp_int_t val = MP_OBJ_SMALL_INT_VALUE(args[0]);
-        size_t l = MIN((size_t)len, sizeof(val));
-        mp_binary_set_int(l, big_endian, data + (big_endian ? (len - l) : 0), val);
+        int slen = 0;  // Number of bytes to represent val
+
+        // This logic has a twin in objint_longlong.c
+        if (val > 0) {
+            slen = (sizeof(mp_int_t) * 8 - mp_clz_mpi(val) + 7) / 8;
+        } else if (val < -1) {
+            slen = (sizeof(mp_int_t) * 8 - mp_clz_mpi(~val) + 8) / 8;
+        } else {
+            // clz of 0 is undefined, so 0 and -1 map to 0 and 1
+            slen = -val;
+        }
+
+        if (slen <= dlen) {
+            memset(data, val < 0 ? 0xFF : 0x00, dlen);
+            mp_binary_set_int(slen, big_endian, data + (big_endian ? (dlen - slen) : 0), val);
+            overflow = false;
+        } else {
+            overflow = true;
+        }
+    }
+
+    if (overflow) {
+        mp_raise_msg(&mp_type_OverflowError, MP_ERROR_TEXT("buffer too small"));
     }
 
     return mp_obj_new_bytes_from_vstr(&vstr);

diff --git a/py/objint.h b/py/objint.h
@@ -55,7 +55,8 @@ char *mp_obj_int_formatted_impl(char **buf, size_t *buf_size, size_t *fmt_size,
     int base, const char *prefix, char base_char, char comma);
 mp_int_t mp_obj_int_hash(mp_obj_t self_in);
 mp_obj_t mp_obj_int_from_bytes_impl(bool big_endian, size_t len, const byte *buf);
-void mp_obj_int_to_bytes_impl(mp_obj_t self_in, bool big_endian, size_t len, byte *buf);
+// Returns true if 'self_in' fit into 'len' bytes of 'buf' without overflowing; otherwise returns false and 'buf' holds either a truncated value or is left unwritten, depending on the longint implementation.
+bool mp_obj_int_to_bytes_impl(mp_obj_t self_in, bool big_endian, size_t len, byte *buf);
 int mp_obj_int_sign(mp_obj_t self_in);
 mp_obj_t mp_obj_int_unary_op(mp_unary_op_t op, mp_obj_t o_in);
 mp_obj_t mp_obj_int_binary_op(mp_binary_op_t op, mp_obj_t lhs_in, mp_obj_t rhs_in);

diff --git a/py/objint_longlong.c b/py/objint_longlong.c
@@ -57,10 +57,27 @@ mp_obj_t mp_obj_int_from_bytes_impl(bool big_endian, size_t len, const byte *buf
     return mp_obj_new_int_from_ll(value);
 }
 
-void mp_obj_int_to_bytes_impl(mp_obj_t self_in, bool big_endian, size_t len, byte *buf) {
+bool mp_obj_int_to_bytes_impl(mp_obj_t self_in, bool big_endian, size_t len, byte *buf) {
     assert(mp_obj_is_exact_type(self_in, &mp_type_int));
     mp_obj_int_t *self = self_in;
     long long val = self->val;
+    size_t slen; // Number of bytes to represent val
+
+    // This logic has a twin in objint.c
+    if (val > 0) {
+        slen = (sizeof(long long) * 8 - mp_clzll(val) + 7) / 8;
+    } else if (val < -1) {
+        slen = (sizeof(long long) * 8 - mp_clzll(~val) + 8) / 8;
+    } else {
+        // clz of 0 is undefined, so 0 and -1 map to 0 and 1
+        slen = -val;
+    }
+
+    if (slen > len) {
+        return false; // Would overflow
+        // TODO: Determine whether to copy and truncate, as some callers probably expect this...?
+    }
+
     if (big_endian) {
         byte *b = buf + len;
         while (b > buf) {
@@ -73,6 +90,7 @@ void mp_obj_int_to_bytes_impl(mp_obj_t self_in, bool big_endian, size_t len, byt
             val >>= 8;
         }
     }
+    return true;
 }
 
 int mp_obj_int_sign(mp_obj_t self_in) {

diff --git a/py/objint_mpz.c b/py/objint_mpz.c
@@ -112,10 +112,10 @@ mp_obj_t mp_obj_int_from_bytes_impl(bool big_endian, size_t len, const byte *buf
     return MP_OBJ_FROM_PTR(o);
 }
 
-void mp_obj_int_to_bytes_impl(mp_obj_t self_in, bool big_endian, size_t len, byte *buf) {
+bool mp_obj_int_to_bytes_impl(mp_obj_t self_in, bool big_endian, size_t len, byte *buf) {
     assert(mp_obj_is_exact_type(self_in, &mp_type_int));
     mp_obj_int_t *self = MP_OBJ_TO_PTR(self_in);
-    mpz_as_bytes(&self->mpz, big_endian, len, buf);
+    return mpz_as_bytes(&self->mpz, big_endian, self->mpz.neg, len, buf);
 }
 
 int mp_obj_int_sign(mp_obj_t self_in) {

diff --git a/tests/basics/int_bytes.py b/tests/basics/int_bytes.py
@@ -1,3 +1,5 @@
+import sys
+
 print((10).to_bytes(1, "little"))
 print((111111).to_bytes(4, "little"))
 print((100).to_bytes(10, "little"))
@@ -20,3 +22,74 @@
     (1).to_bytes(-1, "little")
 except ValueError:
     print("ValueError")
+
+# zero byte destination should also raise an error
+try:
+    (1).to_bytes(0, "little")
+except OverflowError:
+    print("OverflowError")
+
+# except for converting 0 to a zero-length byte array
+print((0).to_bytes(0, "big"))
+
+# byte length can fit the integer directly
+print((0xFF).to_bytes(1, "little"))
+print((0xFF).to_bytes(1, "big"))
+print((0xEFF).to_bytes(2, "little"))
+print((0xEFF).to_bytes(2, "big"))
+print((0xCDEFF).to_bytes(3, "little"))
+print((0xCDEFF).to_bytes(3, "big"))
+
+# OverflowError if not big enough
+
+try:
+    (0x123).to_bytes(1, "big")
+except OverflowError:
+    print("OverflowError")
+
+try:
+    (0x12345).to_bytes(2, "big")
+except OverflowError:
+    print("OverflowError")
+
+try:
+    (0x1234567).to_bytes(3, "big")
+except OverflowError:
+    print("OverflowError")
+
+
+# negative representations
+
+# MicroPython int.to_bytes() behaves as if signed=True for negative numbers
+if "micropython" in repr(sys.implementation):
+
+    def to_bytes_compat(i, l, e):
+        return i.to_bytes(l, e)
+else:
+    # Implement MicroPython compatible behaviour for CPython
+    def to_bytes_compat(i, l, e):
+        return i.to_bytes(l, e, signed=i < 0)
+
+
+print(to_bytes_compat(-1, 1, "little"))
+print(to_bytes_compat(-1, 3, "little"))
+print(to_bytes_compat(-1, 1, "big"))
+print(to_bytes_compat(-1, 3, "big"))
+print(to_bytes_compat(-128, 1, "big"))
+print(to_bytes_compat(-32768, 2, "big"))
+print(to_bytes_compat(-(1 << 23), 3, "big"))
+
+try:
+    print(to_bytes_compat(-129, 1, "big"))
+except OverflowError:
+    print("OverflowError")
+
+try:
+    print(to_bytes_compat(-32769, 2, "big"))
+except OverflowError:
+    print("OverflowError")
+
+try:
+    print(to_bytes_compat(-(1 << 23) - 1, 2, "big"))
+except OverflowError:
+    print("OverflowError")