From da051a988de98d1fba23f3f35f5cf20f2f0258eb Mon Sep 17 00:00:00 2001 From: Peter Kese Date: Tue, 8 Jun 2021 14:35:46 +0200 Subject: [PATCH 1/8] Add a failing test for Unicode conversion --- src/embed_tests/TestPyString.cs | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/src/embed_tests/TestPyString.cs b/src/embed_tests/TestPyString.cs index 0de436e35..561fd4eaa 100644 --- a/src/embed_tests/TestPyString.cs +++ b/src/embed_tests/TestPyString.cs @@ -94,5 +94,15 @@ public void TestUnicode() PyObject actual = new PyString(expected); Assert.AreEqual(expected, actual.ToString()); } + + [Test] + public void TestUnicodeSurrogate() + { + const string expected = "foo\ud83d\udc3c"; // "foo🐼" + PyObject actual = new PyString(expected); + // python treats "foo🐼" as 4 characters, dotnet as 5 + Assert.AreEqual(4, actual.Length()); + Assert.AreEqual(expected, actual.ToString()); + } } } From 1b6c6c059b5adc6b09d3673a1d4034c40902b777 Mon Sep 17 00:00:00 2001 From: Peter Kese Date: Tue, 8 Jun 2021 20:15:07 +0200 Subject: [PATCH 2/8] Fix Python -> .Net unicode string conversion --- src/runtime/runtime.cs | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/runtime/runtime.cs b/src/runtime/runtime.cs index 789b71f3e..1065fda19 100644 --- a/src/runtime/runtime.cs +++ b/src/runtime/runtime.cs @@ -1646,11 +1646,12 @@ internal static string GetManagedString(IntPtr op) if (type == PyUnicodeType) { using var p = PyUnicode_AsUTF16String(new BorrowedReference(op)); - int length = (int)PyUnicode_GetSize(op); - char* codePoints = (char*)PyBytes_AsString(p.DangerousGetAddress()); + var bytesPtr = p.DangerousGetAddress(); + int bytesLength = (int)Runtime.PyBytes_Size(bytesPtr); + char* codePoints = (char*)PyBytes_AsString(bytesPtr); return new string(codePoints, startIndex: 1, // skip BOM - length: length); + length: bytesLength/2-1); // utf16 - BOM } return null; From 4674b5c55d419a50801c6813ab4238db2d9860ad Mon Sep 17 00:00:00 2001 From: Peter Kese Date: Tue, 8 Jun 2021 20:44:33 +0200 Subject: [PATCH 3/8] Add separate test for the initial problem (passing); disable failing test --- AUTHORS.md | 1 + CHANGELOG.md | 1 + src/embed_tests/TestPyString.cs | 10 ++++++++++ 3 files changed, 12 insertions(+) diff --git a/AUTHORS.md b/AUTHORS.md index 6cfa216b1..ebd2021b3 100644 --- a/AUTHORS.md +++ b/AUTHORS.md @@ -55,6 +55,7 @@ - Meinrad Recheis ([@henon](https://github.com/henon)) - Mohamed Koubaa ([@koubaa](https://github.com/koubaa)) - Patrick Stewart ([@patstew](https://github.com/patstew)) +- Peter Kese ([@patstew](https://github.com/pkese)) - Raphael Nestler ([@rnestler](https://github.com/rnestler)) - Rickard Holmberg ([@rickardraysearch](https://github.com/rickardraysearch)) - Sam Winstanley ([@swinstanley](https://github.com/swinstanley)) diff --git a/CHANGELOG.md b/CHANGELOG.md index 5871e7ffb..c453f2b4b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -71,6 +71,7 @@ One must now either use enum members (e.g. `MyEnum.Option`), or use enum constru - Exception stacktraces on `PythonException.StackTrace` are now properly formatted - Providing an invalid type parameter to a generic type or method produces a helpful Python error - Empty parameter names (as can be generated from F#) do not cause crashes +- Unicode strings with surrogates get truncated when converting from Python ### Removed diff --git a/src/embed_tests/TestPyString.cs b/src/embed_tests/TestPyString.cs index 561fd4eaa..c2f2a0d6a 100644 --- a/src/embed_tests/TestPyString.cs +++ b/src/embed_tests/TestPyString.cs @@ -96,6 +96,16 @@ public void TestUnicode() } [Test] + public void TestUnicodeSurrogateToString() + { + var expected = "foo\ud83d\udc3c"; + var actual = PythonEngine.Eval("'foo\ud83d\udc3c'"); + Assert.AreEqual(4, actual.Length()); + Assert.AreEqual(expected, actual.ToString()); + } + + [Test] + [Ignore("Bug: Unicode conversion issue #1466")] public void TestUnicodeSurrogate() { const string expected = "foo\ud83d\udc3c"; // "foo🐼" From 1629116c08ac90759b1c4283b156078c394bd710 Mon Sep 17 00:00:00 2001 From: Benedikt Reinartz Date: Wed, 9 Jun 2021 08:09:34 +0200 Subject: [PATCH 4/8] Use exclusively PyUnicode_DecodeUTF16 for .NET->Python string conversions --- src/embed_tests/TestCustomMarshal.cs | 2 +- src/embed_tests/TestRuntime.cs | 2 +- src/runtime/converter.cs | 2 +- src/runtime/exceptions.cs | 6 ++-- src/runtime/pystring.cs | 2 +- src/runtime/runtime.cs | 47 ++++++---------------------- src/runtime/typemanager.cs | 2 +- 7 files changed, 17 insertions(+), 46 deletions(-) diff --git a/src/embed_tests/TestCustomMarshal.cs b/src/embed_tests/TestCustomMarshal.cs index 5860857a3..99911bdb0 100644 --- a/src/embed_tests/TestCustomMarshal.cs +++ b/src/embed_tests/TestCustomMarshal.cs @@ -23,7 +23,7 @@ public static void GetManagedStringTwice() { const string expected = "FooBar"; - IntPtr op = Runtime.Runtime.PyUnicode_FromString(expected); + IntPtr op = Runtime.Runtime.PyString_FromString(expected); string s1 = Runtime.Runtime.GetManagedString(op); string s2 = Runtime.Runtime.GetManagedString(op); diff --git a/src/embed_tests/TestRuntime.cs b/src/embed_tests/TestRuntime.cs index 9ca6cf139..9fb2e8b22 100644 --- a/src/embed_tests/TestRuntime.cs +++ b/src/embed_tests/TestRuntime.cs @@ -36,7 +36,7 @@ public static void Py_IsInitializedValue() public static void RefCountTest() { Runtime.Runtime.Py_Initialize(); - IntPtr op = Runtime.Runtime.PyUnicode_FromString("FooBar"); + IntPtr op = Runtime.Runtime.PyString_FromString("FooBar"); // New object RefCount should be one Assert.AreEqual(1, Runtime.Runtime.Refcount(op)); diff --git a/src/runtime/converter.cs b/src/runtime/converter.cs index 47263e8c4..80f31f058 100644 --- a/src/runtime/converter.cs +++ b/src/runtime/converter.cs @@ -221,7 +221,7 @@ internal static IntPtr ToPython(object value, Type type) return CLRObject.GetInstHandle(value, type); case TypeCode.String: - return Runtime.PyUnicode_FromString((string)value); + return Runtime.PyString_FromString((string)value); case TypeCode.Int32: return Runtime.PyInt_FromInt32((int)value); diff --git a/src/runtime/exceptions.cs b/src/runtime/exceptions.cs index bbdcdad30..a612e34e3 100644 --- a/src/runtime/exceptions.cs +++ b/src/runtime/exceptions.cs @@ -50,7 +50,7 @@ internal static Exception ToException(BorrowedReference ob) { message = String.Format("{0}()", name); } - return Runtime.PyUnicode_FromString(message); + return Runtime.PyString_FromString(message); } /// @@ -75,7 +75,7 @@ internal static Exception ToException(BorrowedReference ob) { message = message.Substring(fullTypeName.Length); } - return Runtime.PyUnicode_FromString(message); + return Runtime.PyString_FromString(message); } } @@ -153,7 +153,7 @@ internal static void SetArgsAndCause(BorrowedReference ob, Exception e) if (!string.IsNullOrEmpty(e.Message)) { args = Runtime.PyTuple_New(1); - IntPtr msg = Runtime.PyUnicode_FromString(e.Message); + IntPtr msg = Runtime.PyString_FromString(e.Message); Runtime.PyTuple_SetItem(args, 0, msg); } else diff --git a/src/runtime/pystring.cs b/src/runtime/pystring.cs index 07eabba14..172c09ebd 100644 --- a/src/runtime/pystring.cs +++ b/src/runtime/pystring.cs @@ -51,7 +51,7 @@ public PyString(PyObject o) : base(FromObject(o)) private static IntPtr FromString(string s) { - IntPtr val = Runtime.PyUnicode_FromUnicode(s, s.Length); + IntPtr val = Runtime.PyString_FromString(s); PythonException.ThrowIfIsNull(val); return val; } diff --git a/src/runtime/runtime.cs b/src/runtime/runtime.cs index 1065fda19..1d736f1ec 100644 --- a/src/runtime/runtime.cs +++ b/src/runtime/runtime.cs @@ -230,7 +230,7 @@ private static void InitPyMembers() () => PyStringType = IntPtr.Zero); XDecref(op); - op = PyUnicode_FromString("unicode"); + op = PyString_FromString("unicode"); SetPyMemberTypeOf(ref PyUnicodeType, op, () => PyUnicodeType = IntPtr.Zero); XDecref(op); @@ -1527,7 +1527,12 @@ internal static bool PyString_Check(IntPtr ob) internal static IntPtr PyString_FromString(string value) { fixed(char* ptr = value) - return PyUnicode_FromKindAndData(2, (IntPtr)ptr, value.Length); + return Delegates.PyUnicode_DecodeUTF16( + (IntPtr)ptr, + new IntPtr(value.Length * sizeof(Char)), + IntPtr.Zero, + IntPtr.Zero + ).DangerousGetAddress(); } @@ -1553,16 +1558,6 @@ internal static long PyBytes_Size(IntPtr op) private static IntPtr _PyBytes_Size(IntPtr op) => Delegates._PyBytes_Size(op); - - internal static IntPtr PyUnicode_FromStringAndSize(IntPtr value, long size) - { - return PyUnicode_FromStringAndSize(value, new IntPtr(size)); - } - - - private static IntPtr PyUnicode_FromStringAndSize(IntPtr value, IntPtr size) => Delegates.PyUnicode_FromStringAndSize(value, size); - - internal static IntPtr PyUnicode_AsUTF8(IntPtr unicode) => Delegates.PyUnicode_AsUTF8(unicode); internal static bool PyUnicode_Check(IntPtr ob) @@ -1576,22 +1571,6 @@ internal static bool PyUnicode_Check(IntPtr ob) internal static IntPtr PyUnicode_FromEncodedObject(IntPtr ob, IntPtr enc, IntPtr err) => Delegates.PyUnicode_FromEncodedObject(ob, enc, err); - internal static IntPtr PyUnicode_FromKindAndData(int kind, IntPtr s, long size) - { - return PyUnicode_FromKindAndData(kind, s, new IntPtr(size)); - } - - - private static IntPtr PyUnicode_FromKindAndData(int kind, IntPtr s, IntPtr size) - => Delegates.PyUnicode_FromKindAndData(kind, s, size); - - internal static IntPtr PyUnicode_FromUnicode(string s, long size) - { - fixed(char* ptr = s) - return PyUnicode_FromKindAndData(2, (IntPtr)ptr, size); - } - - internal static int PyUnicode_GetMax() => Delegates.PyUnicode_GetMax(); internal static long PyUnicode_GetSize(IntPtr ob) @@ -1610,12 +1589,6 @@ internal static long PyUnicode_GetSize(IntPtr ob) internal static IntPtr PyUnicode_FromOrdinal(int c) => Delegates.PyUnicode_FromOrdinal(c); - internal static IntPtr PyUnicode_FromString(string s) - { - return PyUnicode_FromUnicode(s, s.Length); - } - - internal static IntPtr PyUnicode_InternFromString(string s) { using var ptr = new StrPtr(s, Encoding.UTF8); @@ -2443,11 +2416,10 @@ static Delegates() PyBytes_AsString = (delegate* unmanaged[Cdecl])GetFunctionByName(nameof(PyBytes_AsString), GetUnmanagedDll(_PythonDll)); PyBytes_FromString = (delegate* unmanaged[Cdecl])GetFunctionByName(nameof(PyBytes_FromString), GetUnmanagedDll(_PythonDll)); _PyBytes_Size = (delegate* unmanaged[Cdecl])GetFunctionByName("PyBytes_Size", GetUnmanagedDll(_PythonDll)); - PyUnicode_FromStringAndSize = (delegate* unmanaged[Cdecl])GetFunctionByName(nameof(PyUnicode_FromStringAndSize), GetUnmanagedDll(_PythonDll)); PyUnicode_AsUTF8 = (delegate* unmanaged[Cdecl])GetFunctionByName(nameof(PyUnicode_AsUTF8), GetUnmanagedDll(_PythonDll)); PyUnicode_FromObject = (delegate* unmanaged[Cdecl])GetFunctionByName(nameof(PyUnicode_FromObject), GetUnmanagedDll(_PythonDll)); + PyUnicode_DecodeUTF16 = (delegate* unmanaged[Cdecl])GetFunctionByName(nameof(PyUnicode_DecodeUTF16), GetUnmanagedDll(_PythonDll)); PyUnicode_FromEncodedObject = (delegate* unmanaged[Cdecl])GetFunctionByName(nameof(PyUnicode_FromEncodedObject), GetUnmanagedDll(_PythonDll)); - PyUnicode_FromKindAndData = (delegate* unmanaged[Cdecl])GetFunctionByName(nameof(PyUnicode_FromKindAndData), GetUnmanagedDll(_PythonDll)); PyUnicode_GetMax = (delegate* unmanaged[Cdecl])GetFunctionByName(nameof(PyUnicode_GetMax), GetUnmanagedDll(_PythonDll)); _PyUnicode_GetSize = (delegate* unmanaged[Cdecl])GetFunctionByName("PyUnicode_GetSize", GetUnmanagedDll(_PythonDll)); PyUnicode_AsUnicode = (delegate* unmanaged[Cdecl])GetFunctionByName(nameof(PyUnicode_AsUnicode), GetUnmanagedDll(_PythonDll)); @@ -2739,11 +2711,10 @@ static Delegates() internal static delegate* unmanaged[Cdecl] PyBytes_AsString { get; } internal static delegate* unmanaged[Cdecl] PyBytes_FromString { get; } internal static delegate* unmanaged[Cdecl] _PyBytes_Size { get; } - internal static delegate* unmanaged[Cdecl] PyUnicode_FromStringAndSize { get; } internal static delegate* unmanaged[Cdecl] PyUnicode_AsUTF8 { get; } internal static delegate* unmanaged[Cdecl] PyUnicode_FromObject { get; } internal static delegate* unmanaged[Cdecl] PyUnicode_FromEncodedObject { get; } - internal static delegate* unmanaged[Cdecl] PyUnicode_FromKindAndData { get; } + internal static delegate* unmanaged[Cdecl] PyUnicode_DecodeUTF16 { get; } internal static delegate* unmanaged[Cdecl] PyUnicode_GetMax { get; } internal static delegate* unmanaged[Cdecl] _PyUnicode_GetSize { get; } internal static delegate* unmanaged[Cdecl] PyUnicode_AsUnicode { get; } diff --git a/src/runtime/typemanager.cs b/src/runtime/typemanager.cs index 13d822c09..e1bfe6aef 100644 --- a/src/runtime/typemanager.cs +++ b/src/runtime/typemanager.cs @@ -580,7 +580,7 @@ internal static IntPtr AllocateTypeObject(string name, IntPtr metatype) // Cheat a little: we'll set tp_name to the internal char * of // the Python version of the type name - otherwise we'd have to // allocate the tp_name and would have no way to free it. - IntPtr temp = Runtime.PyUnicode_FromString(name); + IntPtr temp = Runtime.PyString_FromString(name); IntPtr raw = Runtime.PyUnicode_AsUTF8(temp); Marshal.WriteIntPtr(type, TypeOffset.tp_name, raw); Marshal.WriteIntPtr(type, TypeOffset.name, temp); From df5ebc2ec4eb55411fbfb2113a6305ec927d6044 Mon Sep 17 00:00:00 2001 From: Benedikt Reinartz Date: Wed, 9 Jun 2021 08:35:34 +0200 Subject: [PATCH 5/8] Activate UnicodeSurrogate test --- src/embed_tests/TestPyString.cs | 1 - 1 file changed, 1 deletion(-) diff --git a/src/embed_tests/TestPyString.cs b/src/embed_tests/TestPyString.cs index c2f2a0d6a..669ecde0d 100644 --- a/src/embed_tests/TestPyString.cs +++ b/src/embed_tests/TestPyString.cs @@ -105,7 +105,6 @@ public void TestUnicodeSurrogateToString() } [Test] - [Ignore("Bug: Unicode conversion issue #1466")] public void TestUnicodeSurrogate() { const string expected = "foo\ud83d\udc3c"; // "foo🐼" From d084b2e0d53293d26bb2a72d7e078306fa9cdaf2 Mon Sep 17 00:00:00 2001 From: Peter Kese Date: Wed, 9 Jun 2021 18:40:35 +0200 Subject: [PATCH 6/8] Apply suggested changes --- CHANGELOG.md | 2 +- src/runtime/runtime.cs | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index c453f2b4b..8aba9e9b4 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -71,7 +71,7 @@ One must now either use enum members (e.g. `MyEnum.Option`), or use enum constru - Exception stacktraces on `PythonException.StackTrace` are now properly formatted - Providing an invalid type parameter to a generic type or method produces a helpful Python error - Empty parameter names (as can be generated from F#) do not cause crashes -- Unicode strings with surrogates get truncated when converting from Python +- Unicode strings with surrogates were truncated when converting from Python ### Removed diff --git a/src/runtime/runtime.cs b/src/runtime/runtime.cs index 1d736f1ec..80a0e5f01 100644 --- a/src/runtime/runtime.cs +++ b/src/runtime/runtime.cs @@ -1619,12 +1619,12 @@ internal static string GetManagedString(IntPtr op) if (type == PyUnicodeType) { using var p = PyUnicode_AsUTF16String(new BorrowedReference(op)); - var bytesPtr = p.DangerousGetAddress(); - int bytesLength = (int)Runtime.PyBytes_Size(bytesPtr); + var bytesPtr = p.DangerousMoveToPointerOrNull(); + nint bytesLength = (nint)Runtime.PyBytes_Size(bytesPtr); char* codePoints = (char*)PyBytes_AsString(bytesPtr); return new string(codePoints, startIndex: 1, // skip BOM - length: bytesLength/2-1); // utf16 - BOM + length: (int) (bytesLength/2-1)); // utf16 - BOM } return null; From f061d287e43407d96a8ac4c75b8a7a030f45a80e Mon Sep 17 00:00:00 2001 From: Peter Kese Date: Wed, 9 Jun 2021 18:58:02 +0200 Subject: [PATCH 7/8] Revert stuff that I don't understand --- AUTHORS.md | 2 +- src/runtime/runtime.cs | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/AUTHORS.md b/AUTHORS.md index ebd2021b3..912831836 100644 --- a/AUTHORS.md +++ b/AUTHORS.md @@ -55,7 +55,7 @@ - Meinrad Recheis ([@henon](https://github.com/henon)) - Mohamed Koubaa ([@koubaa](https://github.com/koubaa)) - Patrick Stewart ([@patstew](https://github.com/patstew)) -- Peter Kese ([@patstew](https://github.com/pkese)) +- Peter Kese ([@pkese](https://github.com/pkese)) - Raphael Nestler ([@rnestler](https://github.com/rnestler)) - Rickard Holmberg ([@rickardraysearch](https://github.com/rickardraysearch)) - Sam Winstanley ([@swinstanley](https://github.com/swinstanley)) diff --git a/src/runtime/runtime.cs b/src/runtime/runtime.cs index 80a0e5f01..1d736f1ec 100644 --- a/src/runtime/runtime.cs +++ b/src/runtime/runtime.cs @@ -1619,12 +1619,12 @@ internal static string GetManagedString(IntPtr op) if (type == PyUnicodeType) { using var p = PyUnicode_AsUTF16String(new BorrowedReference(op)); - var bytesPtr = p.DangerousMoveToPointerOrNull(); - nint bytesLength = (nint)Runtime.PyBytes_Size(bytesPtr); + var bytesPtr = p.DangerousGetAddress(); + int bytesLength = (int)Runtime.PyBytes_Size(bytesPtr); char* codePoints = (char*)PyBytes_AsString(bytesPtr); return new string(codePoints, startIndex: 1, // skip BOM - length: (int) (bytesLength/2-1)); // utf16 - BOM + length: bytesLength/2-1); // utf16 - BOM } return null; From 49ccc1e5cf4d1af3cca1ee937bc3db69299cd2da Mon Sep 17 00:00:00 2001 From: Benedikt Reinartz Date: Fri, 11 Jun 2021 07:27:29 +0200 Subject: [PATCH 8/8] Apply code review suggestions --- src/runtime/runtime.cs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/runtime/runtime.cs b/src/runtime/runtime.cs index 1d736f1ec..537e5348f 100644 --- a/src/runtime/runtime.cs +++ b/src/runtime/runtime.cs @@ -1529,10 +1529,10 @@ internal static IntPtr PyString_FromString(string value) fixed(char* ptr = value) return Delegates.PyUnicode_DecodeUTF16( (IntPtr)ptr, - new IntPtr(value.Length * sizeof(Char)), + value.Length * sizeof(Char), IntPtr.Zero, IntPtr.Zero - ).DangerousGetAddress(); + ).DangerousMoveToPointerOrNull(); } @@ -2418,7 +2418,7 @@ static Delegates() _PyBytes_Size = (delegate* unmanaged[Cdecl])GetFunctionByName("PyBytes_Size", GetUnmanagedDll(_PythonDll)); PyUnicode_AsUTF8 = (delegate* unmanaged[Cdecl])GetFunctionByName(nameof(PyUnicode_AsUTF8), GetUnmanagedDll(_PythonDll)); PyUnicode_FromObject = (delegate* unmanaged[Cdecl])GetFunctionByName(nameof(PyUnicode_FromObject), GetUnmanagedDll(_PythonDll)); - PyUnicode_DecodeUTF16 = (delegate* unmanaged[Cdecl])GetFunctionByName(nameof(PyUnicode_DecodeUTF16), GetUnmanagedDll(_PythonDll)); + PyUnicode_DecodeUTF16 = (delegate* unmanaged[Cdecl])GetFunctionByName(nameof(PyUnicode_DecodeUTF16), GetUnmanagedDll(_PythonDll)); PyUnicode_FromEncodedObject = (delegate* unmanaged[Cdecl])GetFunctionByName(nameof(PyUnicode_FromEncodedObject), GetUnmanagedDll(_PythonDll)); PyUnicode_GetMax = (delegate* unmanaged[Cdecl])GetFunctionByName(nameof(PyUnicode_GetMax), GetUnmanagedDll(_PythonDll)); _PyUnicode_GetSize = (delegate* unmanaged[Cdecl])GetFunctionByName("PyUnicode_GetSize", GetUnmanagedDll(_PythonDll)); @@ -2714,7 +2714,7 @@ static Delegates() internal static delegate* unmanaged[Cdecl] PyUnicode_AsUTF8 { get; } internal static delegate* unmanaged[Cdecl] PyUnicode_FromObject { get; } internal static delegate* unmanaged[Cdecl] PyUnicode_FromEncodedObject { get; } - internal static delegate* unmanaged[Cdecl] PyUnicode_DecodeUTF16 { get; } + internal static delegate* unmanaged[Cdecl] PyUnicode_DecodeUTF16 { get; } internal static delegate* unmanaged[Cdecl] PyUnicode_GetMax { get; } internal static delegate* unmanaged[Cdecl] _PyUnicode_GetSize { get; } internal static delegate* unmanaged[Cdecl] PyUnicode_AsUnicode { get; }