diff --git a/AUTHORS.md b/AUTHORS.md index 6cfa216b1..912831836 100644 --- a/AUTHORS.md +++ b/AUTHORS.md @@ -55,6 +55,7 @@ - Meinrad Recheis ([@henon](https://github.com/henon)) - Mohamed Koubaa ([@koubaa](https://github.com/koubaa)) - Patrick Stewart ([@patstew](https://github.com/patstew)) +- Peter Kese ([@pkese](https://github.com/pkese)) - Raphael Nestler ([@rnestler](https://github.com/rnestler)) - Rickard Holmberg ([@rickardraysearch](https://github.com/rickardraysearch)) - Sam Winstanley ([@swinstanley](https://github.com/swinstanley)) diff --git a/CHANGELOG.md b/CHANGELOG.md index 5871e7ffb..8aba9e9b4 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -71,6 +71,7 @@ One must now either use enum members (e.g. `MyEnum.Option`), or use enum constru - Exception stacktraces on `PythonException.StackTrace` are now properly formatted - Providing an invalid type parameter to a generic type or method produces a helpful Python error - Empty parameter names (as can be generated from F#) do not cause crashes +- Unicode strings with surrogates are no longer truncated when converting from Python ### Removed diff --git a/src/embed_tests/TestCustomMarshal.cs b/src/embed_tests/TestCustomMarshal.cs index 5860857a3..99911bdb0 100644 --- a/src/embed_tests/TestCustomMarshal.cs +++ b/src/embed_tests/TestCustomMarshal.cs @@ -23,7 +23,7 @@ public static void GetManagedStringTwice() { const string expected = "FooBar"; - IntPtr op = Runtime.Runtime.PyUnicode_FromString(expected); + IntPtr op = Runtime.Runtime.PyString_FromString(expected); string s1 = Runtime.Runtime.GetManagedString(op); string s2 = Runtime.Runtime.GetManagedString(op); diff --git a/src/embed_tests/TestPyString.cs b/src/embed_tests/TestPyString.cs index 0de436e35..669ecde0d 100644 --- a/src/embed_tests/TestPyString.cs +++ b/src/embed_tests/TestPyString.cs @@ -94,5 +94,24 @@ public void TestUnicode() PyObject actual = new PyString(expected); Assert.AreEqual(expected, actual.ToString()); } + + [Test] + public void 
TestUnicodeSurrogateToString() + { + var expected = "foo\ud83d\udc3c"; + var actual = PythonEngine.Eval("'foo\ud83d\udc3c'"); + Assert.AreEqual(4, actual.Length()); + Assert.AreEqual(expected, actual.ToString()); + } + + [Test] + public void TestUnicodeSurrogate() + { + const string expected = "foo\ud83d\udc3c"; // "foo🐼" + PyObject actual = new PyString(expected); + // python treats "foo🐼" as 4 characters, dotnet as 5 + Assert.AreEqual(4, actual.Length()); + Assert.AreEqual(expected, actual.ToString()); + } } } diff --git a/src/embed_tests/TestRuntime.cs b/src/embed_tests/TestRuntime.cs index 9ca6cf139..9fb2e8b22 100644 --- a/src/embed_tests/TestRuntime.cs +++ b/src/embed_tests/TestRuntime.cs @@ -36,7 +36,7 @@ public static void Py_IsInitializedValue() public static void RefCountTest() { Runtime.Runtime.Py_Initialize(); - IntPtr op = Runtime.Runtime.PyUnicode_FromString("FooBar"); + IntPtr op = Runtime.Runtime.PyString_FromString("FooBar"); // New object RefCount should be one Assert.AreEqual(1, Runtime.Runtime.Refcount(op)); diff --git a/src/runtime/converter.cs b/src/runtime/converter.cs index 47263e8c4..80f31f058 100644 --- a/src/runtime/converter.cs +++ b/src/runtime/converter.cs @@ -221,7 +221,7 @@ internal static IntPtr ToPython(object value, Type type) return CLRObject.GetInstHandle(value, type); case TypeCode.String: - return Runtime.PyUnicode_FromString((string)value); + return Runtime.PyString_FromString((string)value); case TypeCode.Int32: return Runtime.PyInt_FromInt32((int)value); diff --git a/src/runtime/exceptions.cs b/src/runtime/exceptions.cs index bbdcdad30..a612e34e3 100644 --- a/src/runtime/exceptions.cs +++ b/src/runtime/exceptions.cs @@ -50,7 +50,7 @@ internal static Exception ToException(BorrowedReference ob) { message = String.Format("{0}()", name); } - return Runtime.PyUnicode_FromString(message); + return Runtime.PyString_FromString(message); } /// @@ -75,7 +75,7 @@ internal static Exception ToException(BorrowedReference ob) { message = 
message.Substring(fullTypeName.Length); } - return Runtime.PyUnicode_FromString(message); + return Runtime.PyString_FromString(message); } } @@ -153,7 +153,7 @@ internal static void SetArgsAndCause(BorrowedReference ob, Exception e) if (!string.IsNullOrEmpty(e.Message)) { args = Runtime.PyTuple_New(1); - IntPtr msg = Runtime.PyUnicode_FromString(e.Message); + IntPtr msg = Runtime.PyString_FromString(e.Message); Runtime.PyTuple_SetItem(args, 0, msg); } else diff --git a/src/runtime/pystring.cs b/src/runtime/pystring.cs index 07eabba14..172c09ebd 100644 --- a/src/runtime/pystring.cs +++ b/src/runtime/pystring.cs @@ -51,7 +51,7 @@ public PyString(PyObject o) : base(FromObject(o)) private static IntPtr FromString(string s) { - IntPtr val = Runtime.PyUnicode_FromUnicode(s, s.Length); + IntPtr val = Runtime.PyString_FromString(s); PythonException.ThrowIfIsNull(val); return val; } diff --git a/src/runtime/runtime.cs b/src/runtime/runtime.cs index 789b71f3e..537e5348f 100644 --- a/src/runtime/runtime.cs +++ b/src/runtime/runtime.cs @@ -230,7 +230,7 @@ private static void InitPyMembers() () => PyStringType = IntPtr.Zero); XDecref(op); - op = PyUnicode_FromString("unicode"); + op = PyString_FromString("unicode"); SetPyMemberTypeOf(ref PyUnicodeType, op, () => PyUnicodeType = IntPtr.Zero); XDecref(op); @@ -1527,7 +1527,12 @@ internal static bool PyString_Check(IntPtr ob) internal static IntPtr PyString_FromString(string value) { fixed(char* ptr = value) - return PyUnicode_FromKindAndData(2, (IntPtr)ptr, value.Length); + return Delegates.PyUnicode_DecodeUTF16( + (IntPtr)ptr, + value.Length * sizeof(Char), + IntPtr.Zero, + IntPtr.Zero + ).DangerousMoveToPointerOrNull(); } @@ -1553,16 +1558,6 @@ internal static long PyBytes_Size(IntPtr op) private static IntPtr _PyBytes_Size(IntPtr op) => Delegates._PyBytes_Size(op); - - internal static IntPtr PyUnicode_FromStringAndSize(IntPtr value, long size) - { - return PyUnicode_FromStringAndSize(value, new IntPtr(size)); - } - - - private 
static IntPtr PyUnicode_FromStringAndSize(IntPtr value, IntPtr size) => Delegates.PyUnicode_FromStringAndSize(value, size); - - internal static IntPtr PyUnicode_AsUTF8(IntPtr unicode) => Delegates.PyUnicode_AsUTF8(unicode); internal static bool PyUnicode_Check(IntPtr ob) @@ -1576,22 +1571,6 @@ internal static bool PyUnicode_Check(IntPtr ob) internal static IntPtr PyUnicode_FromEncodedObject(IntPtr ob, IntPtr enc, IntPtr err) => Delegates.PyUnicode_FromEncodedObject(ob, enc, err); - internal static IntPtr PyUnicode_FromKindAndData(int kind, IntPtr s, long size) - { - return PyUnicode_FromKindAndData(kind, s, new IntPtr(size)); - } - - - private static IntPtr PyUnicode_FromKindAndData(int kind, IntPtr s, IntPtr size) - => Delegates.PyUnicode_FromKindAndData(kind, s, size); - - internal static IntPtr PyUnicode_FromUnicode(string s, long size) - { - fixed(char* ptr = s) - return PyUnicode_FromKindAndData(2, (IntPtr)ptr, size); - } - - internal static int PyUnicode_GetMax() => Delegates.PyUnicode_GetMax(); internal static long PyUnicode_GetSize(IntPtr ob) @@ -1610,12 +1589,6 @@ internal static long PyUnicode_GetSize(IntPtr ob) internal static IntPtr PyUnicode_FromOrdinal(int c) => Delegates.PyUnicode_FromOrdinal(c); - internal static IntPtr PyUnicode_FromString(string s) - { - return PyUnicode_FromUnicode(s, s.Length); - } - - internal static IntPtr PyUnicode_InternFromString(string s) { using var ptr = new StrPtr(s, Encoding.UTF8); @@ -1646,11 +1619,12 @@ internal static string GetManagedString(IntPtr op) if (type == PyUnicodeType) { using var p = PyUnicode_AsUTF16String(new BorrowedReference(op)); - int length = (int)PyUnicode_GetSize(op); - char* codePoints = (char*)PyBytes_AsString(p.DangerousGetAddress()); + var bytesPtr = p.DangerousGetAddress(); + int bytesLength = (int)Runtime.PyBytes_Size(bytesPtr); + char* codePoints = (char*)PyBytes_AsString(bytesPtr); return new string(codePoints, startIndex: 1, // skip BOM - length: length); + length: bytesLength/2-1); // 
utf16 - BOM } return null; @@ -2442,11 +2416,10 @@ static Delegates() PyBytes_AsString = (delegate* unmanaged[Cdecl])GetFunctionByName(nameof(PyBytes_AsString), GetUnmanagedDll(_PythonDll)); PyBytes_FromString = (delegate* unmanaged[Cdecl])GetFunctionByName(nameof(PyBytes_FromString), GetUnmanagedDll(_PythonDll)); _PyBytes_Size = (delegate* unmanaged[Cdecl])GetFunctionByName("PyBytes_Size", GetUnmanagedDll(_PythonDll)); - PyUnicode_FromStringAndSize = (delegate* unmanaged[Cdecl])GetFunctionByName(nameof(PyUnicode_FromStringAndSize), GetUnmanagedDll(_PythonDll)); PyUnicode_AsUTF8 = (delegate* unmanaged[Cdecl])GetFunctionByName(nameof(PyUnicode_AsUTF8), GetUnmanagedDll(_PythonDll)); PyUnicode_FromObject = (delegate* unmanaged[Cdecl])GetFunctionByName(nameof(PyUnicode_FromObject), GetUnmanagedDll(_PythonDll)); + PyUnicode_DecodeUTF16 = (delegate* unmanaged[Cdecl])GetFunctionByName(nameof(PyUnicode_DecodeUTF16), GetUnmanagedDll(_PythonDll)); PyUnicode_FromEncodedObject = (delegate* unmanaged[Cdecl])GetFunctionByName(nameof(PyUnicode_FromEncodedObject), GetUnmanagedDll(_PythonDll)); - PyUnicode_FromKindAndData = (delegate* unmanaged[Cdecl])GetFunctionByName(nameof(PyUnicode_FromKindAndData), GetUnmanagedDll(_PythonDll)); PyUnicode_GetMax = (delegate* unmanaged[Cdecl])GetFunctionByName(nameof(PyUnicode_GetMax), GetUnmanagedDll(_PythonDll)); _PyUnicode_GetSize = (delegate* unmanaged[Cdecl])GetFunctionByName("PyUnicode_GetSize", GetUnmanagedDll(_PythonDll)); PyUnicode_AsUnicode = (delegate* unmanaged[Cdecl])GetFunctionByName(nameof(PyUnicode_AsUnicode), GetUnmanagedDll(_PythonDll)); @@ -2738,11 +2711,10 @@ static Delegates() internal static delegate* unmanaged[Cdecl] PyBytes_AsString { get; } internal static delegate* unmanaged[Cdecl] PyBytes_FromString { get; } internal static delegate* unmanaged[Cdecl] _PyBytes_Size { get; } - internal static delegate* unmanaged[Cdecl] PyUnicode_FromStringAndSize { get; } internal static delegate* unmanaged[Cdecl] PyUnicode_AsUTF8 { get; 
} internal static delegate* unmanaged[Cdecl] PyUnicode_FromObject { get; } internal static delegate* unmanaged[Cdecl] PyUnicode_FromEncodedObject { get; } - internal static delegate* unmanaged[Cdecl] PyUnicode_FromKindAndData { get; } + internal static delegate* unmanaged[Cdecl] PyUnicode_DecodeUTF16 { get; } internal static delegate* unmanaged[Cdecl] PyUnicode_GetMax { get; } internal static delegate* unmanaged[Cdecl] _PyUnicode_GetSize { get; } internal static delegate* unmanaged[Cdecl] PyUnicode_AsUnicode { get; } diff --git a/src/runtime/typemanager.cs b/src/runtime/typemanager.cs index 13d822c09..e1bfe6aef 100644 --- a/src/runtime/typemanager.cs +++ b/src/runtime/typemanager.cs @@ -580,7 +580,7 @@ internal static IntPtr AllocateTypeObject(string name, IntPtr metatype) // Cheat a little: we'll set tp_name to the internal char * of // the Python version of the type name - otherwise we'd have to // allocate the tp_name and would have no way to free it. - IntPtr temp = Runtime.PyUnicode_FromString(name); + IntPtr temp = Runtime.PyString_FromString(name); IntPtr raw = Runtime.PyUnicode_AsUTF8(temp); Marshal.WriteIntPtr(type, TypeOffset.tp_name, raw); Marshal.WriteIntPtr(type, TypeOffset.name, temp);