From 4c46c6d4bdf0175f55f498c1ca2d6ebb7a0fc8d5 Mon Sep 17 00:00:00 2001 From: Benedikt Reinartz Date: Sat, 4 May 2024 16:53:14 +0200 Subject: [PATCH 1/2] Use non-BOM encodings --- src/embed_tests/TestPyType.cs | 2 +- src/runtime/Loader.cs | 6 ++-- src/runtime/Native/CustomMarshaler.cs | 2 +- src/runtime/Native/NativeTypeSpec.cs | 2 +- src/runtime/PythonTypes/PyType.cs | 2 +- src/runtime/Runtime.cs | 42 +++++++++++++-------------- src/runtime/Util/Encodings.cs | 10 +++++++ tests/test_conversion.py | 3 ++ 8 files changed, 41 insertions(+), 28 deletions(-) create mode 100644 src/runtime/Util/Encodings.cs diff --git a/src/embed_tests/TestPyType.cs b/src/embed_tests/TestPyType.cs index 34645747d..0470070c3 100644 --- a/src/embed_tests/TestPyType.cs +++ b/src/embed_tests/TestPyType.cs @@ -28,7 +28,7 @@ public void CanCreateHeapType() const string name = "nÁmæ"; const string docStr = "dÁcæ"; - using var doc = new StrPtr(docStr, Encoding.UTF8); + using var doc = new StrPtr(docStr, Encodings.UTF8); var spec = new TypeSpec( name: name, basicSize: Util.ReadInt32(Runtime.Runtime.PyBaseObjectType, TypeOffset.tp_basicsize), diff --git a/src/runtime/Loader.cs b/src/runtime/Loader.cs index 516b9ab9c..c0e964abc 100644 --- a/src/runtime/Loader.cs +++ b/src/runtime/Loader.cs @@ -12,7 +12,7 @@ public unsafe static int Initialize(IntPtr data, int size) { try { - var dllPath = Encoding.UTF8.GetString((byte*)data.ToPointer(), size); + var dllPath = Encodings.UTF8.GetString((byte*)data.ToPointer(), size); if (!string.IsNullOrEmpty(dllPath)) { @@ -33,7 +33,7 @@ public unsafe static int Initialize(IntPtr data, int size) ); return 1; } - + return 0; } @@ -41,7 +41,7 @@ public unsafe static int Shutdown(IntPtr data, int size) { try { - var command = Encoding.UTF8.GetString((byte*)data.ToPointer(), size); + var command = Encodings.UTF8.GetString((byte*)data.ToPointer(), size); if (command == "full_shutdown") { diff --git a/src/runtime/Native/CustomMarshaler.cs b/src/runtime/Native/CustomMarshaler.cs index 62c027150..299af3a33 100644 --- a/src/runtime/Native/CustomMarshaler.cs +++ b/src/runtime/Native/CustomMarshaler.cs @@ -42,7 +42,7 @@ public int GetNativeDataSize() internal class UcsMarshaler : MarshalerBase { internal static readonly int _UCS = RuntimeInformation.IsOSPlatform(OSPlatform.Windows) ? 2 : 4; - internal static readonly Encoding PyEncoding = _UCS == 2 ? Encoding.Unicode : Encoding.UTF32; + internal static readonly Encoding PyEncoding = _UCS == 2 ? Encodings.UTF16 : Encodings.UTF32; private static readonly MarshalerBase Instance = new UcsMarshaler(); public override IntPtr MarshalManagedToNative(object managedObj) diff --git a/src/runtime/Native/NativeTypeSpec.cs b/src/runtime/Native/NativeTypeSpec.cs index 8b84df536..50019a148 100644 --- a/src/runtime/Native/NativeTypeSpec.cs +++ b/src/runtime/Native/NativeTypeSpec.cs @@ -17,7 +17,7 @@ public NativeTypeSpec(TypeSpec spec) { if (spec is null) throw new ArgumentNullException(nameof(spec)); - this.Name = new StrPtr(spec.Name, Encoding.UTF8); + this.Name = new StrPtr(spec.Name, Encodings.UTF8); this.BasicSize = spec.BasicSize; this.ItemSize = spec.ItemSize; this.Flags = (int)spec.Flags; diff --git a/src/runtime/PythonTypes/PyType.cs b/src/runtime/PythonTypes/PyType.cs index af796a5c5..28bda5d3e 100644 --- a/src/runtime/PythonTypes/PyType.cs +++ b/src/runtime/PythonTypes/PyType.cs @@ -53,7 +53,7 @@ public string Name { RawPointer = Util.ReadIntPtr(this, TypeOffset.tp_name), }; - return namePtr.ToString(System.Text.Encoding.UTF8)!; + return namePtr.ToString(Encodings.UTF8)!; } } diff --git a/src/runtime/Runtime.cs b/src/runtime/Runtime.cs index 4e1c6156a..3d51b8bc9 100644 --- a/src/runtime/Runtime.cs +++ b/src/runtime/Runtime.cs @@ -795,13 +795,13 @@ public static int Py_Main(int argc, string[] argv) internal static int PyRun_SimpleString(string code) { - using var codePtr = new StrPtr(code, Encoding.UTF8); + using var codePtr = new StrPtr(code, Encodings.UTF8); return Delegates.PyRun_SimpleStringFlags(codePtr, Utf8String); } internal static NewReference PyRun_String(string code, RunFlagType st, BorrowedReference globals, BorrowedReference locals) { - using var codePtr = new StrPtr(code, Encoding.UTF8); + using var codePtr = new StrPtr(code, Encodings.UTF8); return Delegates.PyRun_StringFlags(codePtr, st, globals, locals, Utf8String); } @@ -813,14 +813,14 @@ internal static NewReference PyRun_String(string code, RunFlagType st, BorrowedR /// internal static NewReference Py_CompileString(string str, string file, int start) { - using var strPtr = new StrPtr(str, Encoding.UTF8); + using var strPtr = new StrPtr(str, Encodings.UTF8); using var fileObj = new PyString(file); return Delegates.Py_CompileStringObject(strPtr, fileObj, start, Utf8String, -1); } internal static NewReference PyImport_ExecCodeModule(string name, BorrowedReference code) { - using var namePtr = new StrPtr(name, Encoding.UTF8); + using var namePtr = new StrPtr(name, Encodings.UTF8); return Delegates.PyImport_ExecCodeModule(namePtr, code); } @@ -867,13 +867,13 @@ internal static bool PyObject_IsIterable(BorrowedReference ob) internal static int PyObject_HasAttrString(BorrowedReference pointer, string name) { - using var namePtr = new StrPtr(name, Encoding.UTF8); + using var namePtr = new StrPtr(name, Encodings.UTF8); return Delegates.PyObject_HasAttrString(pointer, namePtr); } internal static NewReference PyObject_GetAttrString(BorrowedReference pointer, string name) { - using var namePtr = new StrPtr(name, Encoding.UTF8); + using var namePtr = new StrPtr(name, Encodings.UTF8); return Delegates.PyObject_GetAttrString(pointer, namePtr); } @@ -884,12 +884,12 @@ internal static NewReference PyObject_GetAttrString(BorrowedReference pointer, S internal static int PyObject_DelAttr(BorrowedReference @object, BorrowedReference name) => Delegates.PyObject_SetAttr(@object, name, null); internal static int PyObject_DelAttrString(BorrowedReference @object, string name) { - using var namePtr = new StrPtr(name, Encoding.UTF8); + using var namePtr = new StrPtr(name, Encodings.UTF8); return Delegates.PyObject_SetAttrString(@object, namePtr, null); } internal static int PyObject_SetAttrString(BorrowedReference @object, string name, BorrowedReference value) { - using var namePtr = new StrPtr(name, Encoding.UTF8); + using var namePtr = new StrPtr(name, Encodings.UTF8); return Delegates.PyObject_SetAttrString(@object, namePtr, value); } @@ -1071,7 +1071,7 @@ internal static bool PyBool_CheckExact(BorrowedReference ob) internal static NewReference PyLong_FromString(string value, int radix) { - using var valPtr = new StrPtr(value, Encoding.UTF8); + using var valPtr = new StrPtr(value, Encodings.UTF8); return Delegates.PyLong_FromString(valPtr, IntPtr.Zero, radix); } @@ -1272,7 +1272,7 @@ internal static NewReference EmptyPyBytes() internal static NewReference PyByteArray_FromStringAndSize(IntPtr strPtr, nint len) => Delegates.PyByteArray_FromStringAndSize(strPtr, len); internal static NewReference PyByteArray_FromStringAndSize(string s) { - using var ptr = new StrPtr(s, Encoding.UTF8); + using var ptr = new StrPtr(s, Encodings.UTF8); return PyByteArray_FromStringAndSize(ptr.RawPointer, checked((nint)ptr.ByteCount)); } @@ -1300,7 +1300,7 @@ internal static IntPtr PyBytes_AsString(BorrowedReference ob) internal static NewReference PyUnicode_InternFromString(string s) { - using var ptr = new StrPtr(s, Encoding.UTF8); + using var ptr = new StrPtr(s, Encodings.UTF8); return Delegates.PyUnicode_InternFromString(ptr); } @@ -1375,7 +1375,7 @@ internal static bool PyDict_Check(BorrowedReference ob) internal static BorrowedReference PyDict_GetItemString(BorrowedReference pointer, string key) { - using var keyStr = new StrPtr(key, Encoding.UTF8); + using var keyStr = new StrPtr(key, Encodings.UTF8); return Delegates.PyDict_GetItemString(pointer, keyStr); } @@ -1391,7 +1391,7 @@ internal static BorrowedReference PyDict_GetItemString(BorrowedReference pointer /// internal static int PyDict_SetItemString(BorrowedReference dict, string key, BorrowedReference value) { - using var keyPtr = new StrPtr(key, Encoding.UTF8); + using var keyPtr = new StrPtr(key, Encodings.UTF8); return Delegates.PyDict_SetItemString(dict, keyPtr, value); } @@ -1400,7 +1400,7 @@ internal static int PyDict_SetItemString(BorrowedReference dict, string key, Bor internal static int PyDict_DelItemString(BorrowedReference pointer, string key) { - using var keyPtr = new StrPtr(key, Encoding.UTF8); + using var keyPtr = new StrPtr(key, Encodings.UTF8); return Delegates.PyDict_DelItemString(pointer, keyPtr); } @@ -1515,7 +1515,7 @@ internal static bool PyIter_Check(BorrowedReference ob) internal static NewReference PyModule_New(string name) { - using var namePtr = new StrPtr(name, Encoding.UTF8); + using var namePtr = new StrPtr(name, Encodings.UTF8); return Delegates.PyModule_New(namePtr); } @@ -1529,7 +1529,7 @@ internal static NewReference PyModule_New(string name) /// Return -1 on error, 0 on success. internal static int PyModule_AddObject(BorrowedReference module, string name, StolenReference value) { - using var namePtr = new StrPtr(name, Encoding.UTF8); + using var namePtr = new StrPtr(name, Encodings.UTF8); IntPtr valueAddr = value.DangerousGetAddressOrNull(); int res = Delegates.PyModule_AddObject(module, namePtr, valueAddr); // We can't just exit here because the reference is stolen only on success. @@ -1547,7 +1547,7 @@ internal static int PyModule_AddObject(BorrowedReference module, string name, St internal static NewReference PyImport_ImportModule(string name) { - using var namePtr = new StrPtr(name, Encoding.UTF8); + using var namePtr = new StrPtr(name, Encodings.UTF8); return Delegates.PyImport_ImportModule(namePtr); } @@ -1556,7 +1556,7 @@ internal static NewReference PyImport_ImportModule(string name) internal static BorrowedReference PyImport_AddModule(string name) { - using var namePtr = new StrPtr(name, Encoding.UTF8); + using var namePtr = new StrPtr(name, Encodings.UTF8); return Delegates.PyImport_AddModule(namePtr); } @@ -1584,13 +1584,13 @@ internal static void PySys_SetArgvEx(int argc, string[] argv, int updatepath) internal static BorrowedReference PySys_GetObject(string name) { - using var namePtr = new StrPtr(name, Encoding.UTF8); + using var namePtr = new StrPtr(name, Encodings.UTF8); return Delegates.PySys_GetObject(namePtr); } internal static int PySys_SetObject(string name, BorrowedReference ob) { - using var namePtr = new StrPtr(name, Encoding.UTF8); + using var namePtr = new StrPtr(name, Encodings.UTF8); return Delegates.PySys_SetObject(namePtr, ob); } @@ -1689,7 +1689,7 @@ internal static IntPtr PyMem_Malloc(long size) internal static void PyErr_SetString(BorrowedReference ob, string message) { - using var msgPtr = new StrPtr(message, Encoding.UTF8); + using var msgPtr = new StrPtr(message, Encodings.UTF8); Delegates.PyErr_SetString(ob, msgPtr); } diff --git a/src/runtime/Util/Encodings.cs b/src/runtime/Util/Encodings.cs new file mode 100644 index 000000000..d5a0c6ff8 --- /dev/null +++ b/src/runtime/Util/Encodings.cs @@ -0,0 +1,10 @@ +using System; +using System.Text; + +namespace Python.Runtime; + +static class Encodings { + public static System.Text.Encoding UTF8 = new UTF8Encoding(false, true); + public static System.Text.Encoding UTF16 = new UnicodeEncoding(!BitConverter.IsLittleEndian, false, true); + public static System.Text.Encoding UTF32 = new UTF32Encoding(!BitConverter.IsLittleEndian, false, true); +} diff --git a/tests/test_conversion.py b/tests/test_conversion.py index bb686dd52..dd70f900a 100644 --- a/tests/test_conversion.py +++ b/tests/test_conversion.py @@ -510,6 +510,9 @@ def test_string_conversion(): ob.StringField = System.String(u'\uffff\uffff') assert ob.StringField == u'\uffff\uffff' + ob.StringField = System.String("\ufeffbom") + assert ob.StringField == "\ufeffbom" + ob.StringField = None assert ob.StringField is None From dc6f5efb7905ff695d244a65e0afc161be6ef1af Mon Sep 17 00:00:00 2001 From: Benedikt Reinartz Date: Sun, 5 May 2024 20:38:24 +0200 Subject: [PATCH 2/2] Copy potential BOM to the output of PyString_FromString The documentation of the used `PyUnicode_DecodeUTF16` states that not passing `*byteorder` or passing a 0 results in the first two bytes, if they are the BOM (U+FEFF, zero-width no-break space), to be interpreted and skipped, which is incorrect when we convert a known "non BOM" string, which all strings from C# are. --- src/runtime/Runtime.cs | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/runtime/Runtime.cs b/src/runtime/Runtime.cs index 3d51b8bc9..2f9e18f65 100644 --- a/src/runtime/Runtime.cs +++ b/src/runtime/Runtime.cs @@ -1252,12 +1252,14 @@ internal static bool PyString_CheckExact(BorrowedReference ob) internal static NewReference PyString_FromString(string value) { + int byteorder = BitConverter.IsLittleEndian ? -1 : 1; + int* byteorderPtr = &byteorder; fixed(char* ptr = value) return Delegates.PyUnicode_DecodeUTF16( (IntPtr)ptr, value.Length * sizeof(Char), IntPtr.Zero, - IntPtr.Zero + (IntPtr)byteorderPtr ); }