Skip to content

Add a failing test for Unicode conversion #1467

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 8 commits into from
Jun 11, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions AUTHORS.md
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,7 @@
- Meinrad Recheis ([@henon](https://github.com/henon))
- Mohamed Koubaa ([@koubaa](https://github.com/koubaa))
- Patrick Stewart ([@patstew](https://github.com/patstew))
- Peter Kese ([@pkese](https://github.com/pkese))
- Raphael Nestler ([@rnestler](https://github.com/rnestler))
- Rickard Holmberg ([@rickardraysearch](https://github.com/rickardraysearch))
- Sam Winstanley ([@swinstanley](https://github.com/swinstanley))
Expand Down
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,7 @@ One must now either use enum members (e.g. `MyEnum.Option`), or use enum constru
- Exception stacktraces on `PythonException.StackTrace` are now properly formatted
- Providing an invalid type parameter to a generic type or method produces a helpful Python error
- Empty parameter names (as can be generated from F#) do not cause crashes
- Unicode strings with surrogates are no longer truncated when converting from Python

### Removed

Expand Down
2 changes: 1 addition & 1 deletion src/embed_tests/TestCustomMarshal.cs
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ public static void GetManagedStringTwice()
{
const string expected = "FooBar";

IntPtr op = Runtime.Runtime.PyUnicode_FromString(expected);
IntPtr op = Runtime.Runtime.PyString_FromString(expected);
string s1 = Runtime.Runtime.GetManagedString(op);
string s2 = Runtime.Runtime.GetManagedString(op);

Expand Down
19 changes: 19 additions & 0 deletions src/embed_tests/TestPyString.cs
Original file line number Diff line number Diff line change
Expand Up @@ -94,5 +94,24 @@ public void TestUnicode()
PyObject actual = new PyString(expected);
Assert.AreEqual(expected, actual.ToString());
}

// Evaluates a Python string literal that contains a surrogate pair and checks
// both the length Python reports and the text returned by ToString().
[Test]
public void TestUnicodeSurrogateToString()
{
    const string pandaText = "foo\ud83d\udc3c";
    var evaluated = PythonEngine.Eval("'foo\ud83d\udc3c'");

    // Length() is expected to be 4 on the Python side, even though the
    // .NET literal above is written with five escape/char units.
    Assert.AreEqual(4, evaluated.Length());
    Assert.AreEqual(pandaText, evaluated.ToString());
}

// Builds a PyString from a .NET string that contains a surrogate pair and
// checks that the length and the round-tripped text are both intact.
[Test]
public void TestUnicodeSurrogate()
{
    const string pandaText = "foo\ud83d\udc3c"; // "foo🐼"
    PyObject pyText = new PyString(pandaText);

    // Python counts "foo🐼" as 4 characters, whereas .NET counts 5.
    Assert.AreEqual(4, pyText.Length());
    Assert.AreEqual(pandaText, pyText.ToString());
}
}
}
2 changes: 1 addition & 1 deletion src/embed_tests/TestRuntime.cs
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ public static void Py_IsInitializedValue()
public static void RefCountTest()
{
Runtime.Runtime.Py_Initialize();
IntPtr op = Runtime.Runtime.PyUnicode_FromString("FooBar");
IntPtr op = Runtime.Runtime.PyString_FromString("FooBar");

// New object RefCount should be one
Assert.AreEqual(1, Runtime.Runtime.Refcount(op));
Expand Down
2 changes: 1 addition & 1 deletion src/runtime/converter.cs
Original file line number Diff line number Diff line change
Expand Up @@ -221,7 +221,7 @@ internal static IntPtr ToPython(object value, Type type)
return CLRObject.GetInstHandle(value, type);

case TypeCode.String:
return Runtime.PyUnicode_FromString((string)value);
return Runtime.PyString_FromString((string)value);

case TypeCode.Int32:
return Runtime.PyInt_FromInt32((int)value);
Expand Down
6 changes: 3 additions & 3 deletions src/runtime/exceptions.cs
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ internal static Exception ToException(BorrowedReference ob)
{
message = String.Format("{0}()", name);
}
return Runtime.PyUnicode_FromString(message);
return Runtime.PyString_FromString(message);
}

/// <summary>
Expand All @@ -75,7 +75,7 @@ internal static Exception ToException(BorrowedReference ob)
{
message = message.Substring(fullTypeName.Length);
}
return Runtime.PyUnicode_FromString(message);
return Runtime.PyString_FromString(message);
}
}

Expand Down Expand Up @@ -153,7 +153,7 @@ internal static void SetArgsAndCause(BorrowedReference ob, Exception e)
if (!string.IsNullOrEmpty(e.Message))
{
args = Runtime.PyTuple_New(1);
IntPtr msg = Runtime.PyUnicode_FromString(e.Message);
IntPtr msg = Runtime.PyString_FromString(e.Message);
Runtime.PyTuple_SetItem(args, 0, msg);
}
else
Expand Down
2 changes: 1 addition & 1 deletion src/runtime/pystring.cs
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ public PyString(PyObject o) : base(FromObject(o))

private static IntPtr FromString(string s)
{
IntPtr val = Runtime.PyUnicode_FromUnicode(s, s.Length);
IntPtr val = Runtime.PyString_FromString(s);
PythonException.ThrowIfIsNull(val);
return val;
}
Expand Down
54 changes: 13 additions & 41 deletions src/runtime/runtime.cs
Original file line number Diff line number Diff line change
Expand Up @@ -230,7 +230,7 @@ private static void InitPyMembers()
() => PyStringType = IntPtr.Zero);
XDecref(op);

op = PyUnicode_FromString("unicode");
op = PyString_FromString("unicode");
SetPyMemberTypeOf(ref PyUnicodeType, op,
() => PyUnicodeType = IntPtr.Zero);
XDecref(op);
Expand Down Expand Up @@ -1527,7 +1527,12 @@ internal static bool PyString_Check(IntPtr ob)
internal static IntPtr PyString_FromString(string value)
{
fixed(char* ptr = value)
return PyUnicode_FromKindAndData(2, (IntPtr)ptr, value.Length);
return Delegates.PyUnicode_DecodeUTF16(
(IntPtr)ptr,
value.Length * sizeof(Char),
IntPtr.Zero,
IntPtr.Zero
).DangerousMoveToPointerOrNull();
}


Expand All @@ -1553,16 +1558,6 @@ internal static long PyBytes_Size(IntPtr op)

private static IntPtr _PyBytes_Size(IntPtr op) => Delegates._PyBytes_Size(op);


internal static IntPtr PyUnicode_FromStringAndSize(IntPtr value, long size)
{
return PyUnicode_FromStringAndSize(value, new IntPtr(size));
}


private static IntPtr PyUnicode_FromStringAndSize(IntPtr value, IntPtr size) => Delegates.PyUnicode_FromStringAndSize(value, size);


internal static IntPtr PyUnicode_AsUTF8(IntPtr unicode) => Delegates.PyUnicode_AsUTF8(unicode);

internal static bool PyUnicode_Check(IntPtr ob)
Expand All @@ -1576,22 +1571,6 @@ internal static bool PyUnicode_Check(IntPtr ob)

internal static IntPtr PyUnicode_FromEncodedObject(IntPtr ob, IntPtr enc, IntPtr err) => Delegates.PyUnicode_FromEncodedObject(ob, enc, err);

internal static IntPtr PyUnicode_FromKindAndData(int kind, IntPtr s, long size)
{
return PyUnicode_FromKindAndData(kind, s, new IntPtr(size));
}


private static IntPtr PyUnicode_FromKindAndData(int kind, IntPtr s, IntPtr size)
=> Delegates.PyUnicode_FromKindAndData(kind, s, size);

internal static IntPtr PyUnicode_FromUnicode(string s, long size)
{
fixed(char* ptr = s)
return PyUnicode_FromKindAndData(2, (IntPtr)ptr, size);
}


internal static int PyUnicode_GetMax() => Delegates.PyUnicode_GetMax();

internal static long PyUnicode_GetSize(IntPtr ob)
Expand All @@ -1610,12 +1589,6 @@ internal static long PyUnicode_GetSize(IntPtr ob)

internal static IntPtr PyUnicode_FromOrdinal(int c) => Delegates.PyUnicode_FromOrdinal(c);

internal static IntPtr PyUnicode_FromString(string s)
{
return PyUnicode_FromUnicode(s, s.Length);
}


internal static IntPtr PyUnicode_InternFromString(string s)
{
using var ptr = new StrPtr(s, Encoding.UTF8);
Expand Down Expand Up @@ -1646,11 +1619,12 @@ internal static string GetManagedString(IntPtr op)
if (type == PyUnicodeType)
{
using var p = PyUnicode_AsUTF16String(new BorrowedReference(op));
int length = (int)PyUnicode_GetSize(op);
char* codePoints = (char*)PyBytes_AsString(p.DangerousGetAddress());
var bytesPtr = p.DangerousGetAddress();
int bytesLength = (int)Runtime.PyBytes_Size(bytesPtr);
char* codePoints = (char*)PyBytes_AsString(bytesPtr);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

NIT: it would be better to add an overload of PyBytes_AsString that takes a BorrowedReference. This would remove the unnecessary call to DangerousGetAddress above.

return new string(codePoints,
startIndex: 1, // skip BOM
length: length);
length: bytesLength/2-1); // utf16 - BOM
}

return null;
Expand Down Expand Up @@ -2442,11 +2416,10 @@ static Delegates()
PyBytes_AsString = (delegate* unmanaged[Cdecl]<BorrowedReference, IntPtr>)GetFunctionByName(nameof(PyBytes_AsString), GetUnmanagedDll(_PythonDll));
PyBytes_FromString = (delegate* unmanaged[Cdecl]<IntPtr, IntPtr>)GetFunctionByName(nameof(PyBytes_FromString), GetUnmanagedDll(_PythonDll));
_PyBytes_Size = (delegate* unmanaged[Cdecl]<IntPtr, IntPtr>)GetFunctionByName("PyBytes_Size", GetUnmanagedDll(_PythonDll));
PyUnicode_FromStringAndSize = (delegate* unmanaged[Cdecl]<IntPtr, IntPtr, IntPtr>)GetFunctionByName(nameof(PyUnicode_FromStringAndSize), GetUnmanagedDll(_PythonDll));
PyUnicode_AsUTF8 = (delegate* unmanaged[Cdecl]<IntPtr, IntPtr>)GetFunctionByName(nameof(PyUnicode_AsUTF8), GetUnmanagedDll(_PythonDll));
PyUnicode_FromObject = (delegate* unmanaged[Cdecl]<IntPtr, IntPtr>)GetFunctionByName(nameof(PyUnicode_FromObject), GetUnmanagedDll(_PythonDll));
PyUnicode_DecodeUTF16 = (delegate* unmanaged[Cdecl]<IntPtr, nint, IntPtr, IntPtr, NewReference>)GetFunctionByName(nameof(PyUnicode_DecodeUTF16), GetUnmanagedDll(_PythonDll));
PyUnicode_FromEncodedObject = (delegate* unmanaged[Cdecl]<IntPtr, IntPtr, IntPtr, IntPtr>)GetFunctionByName(nameof(PyUnicode_FromEncodedObject), GetUnmanagedDll(_PythonDll));
PyUnicode_FromKindAndData = (delegate* unmanaged[Cdecl]<int, IntPtr, IntPtr, IntPtr>)GetFunctionByName(nameof(PyUnicode_FromKindAndData), GetUnmanagedDll(_PythonDll));
PyUnicode_GetMax = (delegate* unmanaged[Cdecl]<int>)GetFunctionByName(nameof(PyUnicode_GetMax), GetUnmanagedDll(_PythonDll));
_PyUnicode_GetSize = (delegate* unmanaged[Cdecl]<IntPtr, IntPtr>)GetFunctionByName("PyUnicode_GetSize", GetUnmanagedDll(_PythonDll));
PyUnicode_AsUnicode = (delegate* unmanaged[Cdecl]<IntPtr, IntPtr>)GetFunctionByName(nameof(PyUnicode_AsUnicode), GetUnmanagedDll(_PythonDll));
Expand Down Expand Up @@ -2738,11 +2711,10 @@ static Delegates()
internal static delegate* unmanaged[Cdecl]<BorrowedReference, IntPtr> PyBytes_AsString { get; }
internal static delegate* unmanaged[Cdecl]<IntPtr, IntPtr> PyBytes_FromString { get; }
internal static delegate* unmanaged[Cdecl]<IntPtr, IntPtr> _PyBytes_Size { get; }
internal static delegate* unmanaged[Cdecl]<IntPtr, IntPtr, IntPtr> PyUnicode_FromStringAndSize { get; }
internal static delegate* unmanaged[Cdecl]<IntPtr, IntPtr> PyUnicode_AsUTF8 { get; }
internal static delegate* unmanaged[Cdecl]<IntPtr, IntPtr> PyUnicode_FromObject { get; }
internal static delegate* unmanaged[Cdecl]<IntPtr, IntPtr, IntPtr, IntPtr> PyUnicode_FromEncodedObject { get; }
internal static delegate* unmanaged[Cdecl]<int, IntPtr, IntPtr, IntPtr> PyUnicode_FromKindAndData { get; }
internal static delegate* unmanaged[Cdecl]<IntPtr, nint, IntPtr, IntPtr, NewReference> PyUnicode_DecodeUTF16 { get; }
internal static delegate* unmanaged[Cdecl]<int> PyUnicode_GetMax { get; }
internal static delegate* unmanaged[Cdecl]<IntPtr, IntPtr> _PyUnicode_GetSize { get; }
internal static delegate* unmanaged[Cdecl]<IntPtr, IntPtr> PyUnicode_AsUnicode { get; }
Expand Down
2 changes: 1 addition & 1 deletion src/runtime/typemanager.cs
Original file line number Diff line number Diff line change
Expand Up @@ -580,7 +580,7 @@ internal static IntPtr AllocateTypeObject(string name, IntPtr metatype)
// Cheat a little: we'll set tp_name to the internal char * of
// the Python version of the type name - otherwise we'd have to
// allocate the tp_name and would have no way to free it.
IntPtr temp = Runtime.PyUnicode_FromString(name);
IntPtr temp = Runtime.PyString_FromString(name);
IntPtr raw = Runtime.PyUnicode_AsUTF8(temp);
Marshal.WriteIntPtr(type, TypeOffset.tp_name, raw);
Marshal.WriteIntPtr(type, TypeOffset.name, temp);
Expand Down