Skip to content

Commit 195cde6

Browse files
authored
Use non-BOM encodings (pythonnet#2370)
* Use non-BOM encodings. The documentation of `PyUnicode_DecodeUTF16` states that when `byteorder` is not passed (NULL), or when `*byteorder` is 0, the first two bytes are interpreted as a byte order mark and skipped if they are the BOM (U+FEFF, zero-width no-break space). That behaviour is incorrect when we convert a string that is known to carry no BOM, which is the case for all strings coming from C#.
1 parent 6a8a97d commit 195cde6

File tree

8 files changed

+44
-29
lines changed

8 files changed

+44
-29
lines changed

src/embed_tests/TestPyType.cs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@ public void CanCreateHeapType()
2828
const string name = "nÁmæ";
2929
const string docStr = "dÁcæ";
3030

31-
using var doc = new StrPtr(docStr, Encoding.UTF8);
31+
using var doc = new StrPtr(docStr, Encodings.UTF8);
3232
var spec = new TypeSpec(
3333
name: name,
3434
basicSize: Util.ReadInt32(Runtime.Runtime.PyBaseObjectType, TypeOffset.tp_basicsize),

src/runtime/Loader.cs

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ public unsafe static int Initialize(IntPtr data, int size)
1212
{
1313
try
1414
{
15-
var dllPath = Encoding.UTF8.GetString((byte*)data.ToPointer(), size);
15+
var dllPath = Encodings.UTF8.GetString((byte*)data.ToPointer(), size);
1616

1717
if (!string.IsNullOrEmpty(dllPath))
1818
{
@@ -33,15 +33,15 @@ public unsafe static int Initialize(IntPtr data, int size)
3333
);
3434
return 1;
3535
}
36-
36+
3737
return 0;
3838
}
3939

4040
public unsafe static int Shutdown(IntPtr data, int size)
4141
{
4242
try
4343
{
44-
var command = Encoding.UTF8.GetString((byte*)data.ToPointer(), size);
44+
var command = Encodings.UTF8.GetString((byte*)data.ToPointer(), size);
4545

4646
if (command == "full_shutdown")
4747
{

src/runtime/Native/CustomMarshaler.cs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,7 @@ public int GetNativeDataSize()
4242
internal class UcsMarshaler : MarshalerBase
4343
{
4444
internal static readonly int _UCS = RuntimeInformation.IsOSPlatform(OSPlatform.Windows) ? 2 : 4;
45-
internal static readonly Encoding PyEncoding = _UCS == 2 ? Encoding.Unicode : Encoding.UTF32;
45+
internal static readonly Encoding PyEncoding = _UCS == 2 ? Encodings.UTF16 : Encodings.UTF32;
4646
private static readonly MarshalerBase Instance = new UcsMarshaler();
4747

4848
public override IntPtr MarshalManagedToNative(object managedObj)

src/runtime/Native/NativeTypeSpec.cs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ public NativeTypeSpec(TypeSpec spec)
1717
{
1818
if (spec is null) throw new ArgumentNullException(nameof(spec));
1919

20-
this.Name = new StrPtr(spec.Name, Encoding.UTF8);
20+
this.Name = new StrPtr(spec.Name, Encodings.UTF8);
2121
this.BasicSize = spec.BasicSize;
2222
this.ItemSize = spec.ItemSize;
2323
this.Flags = (int)spec.Flags;

src/runtime/PythonTypes/PyType.cs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -53,7 +53,7 @@ public string Name
5353
{
5454
RawPointer = Util.ReadIntPtr(this, TypeOffset.tp_name),
5555
};
56-
return namePtr.ToString(System.Text.Encoding.UTF8)!;
56+
return namePtr.ToString(Encodings.UTF8)!;
5757
}
5858
}
5959

src/runtime/Runtime.cs

Lines changed: 24 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -795,13 +795,13 @@ public static int Py_Main(int argc, string[] argv)
795795

796796
internal static int PyRun_SimpleString(string code)
797797
{
798-
using var codePtr = new StrPtr(code, Encoding.UTF8);
798+
using var codePtr = new StrPtr(code, Encodings.UTF8);
799799
return Delegates.PyRun_SimpleStringFlags(codePtr, Utf8String);
800800
}
801801

802802
internal static NewReference PyRun_String(string code, RunFlagType st, BorrowedReference globals, BorrowedReference locals)
803803
{
804-
using var codePtr = new StrPtr(code, Encoding.UTF8);
804+
using var codePtr = new StrPtr(code, Encodings.UTF8);
805805
return Delegates.PyRun_StringFlags(codePtr, st, globals, locals, Utf8String);
806806
}
807807

@@ -813,14 +813,14 @@ internal static NewReference PyRun_String(string code, RunFlagType st, BorrowedR
813813
/// </summary>
814814
internal static NewReference Py_CompileString(string str, string file, int start)
815815
{
816-
using var strPtr = new StrPtr(str, Encoding.UTF8);
816+
using var strPtr = new StrPtr(str, Encodings.UTF8);
817817
using var fileObj = new PyString(file);
818818
return Delegates.Py_CompileStringObject(strPtr, fileObj, start, Utf8String, -1);
819819
}
820820

821821
internal static NewReference PyImport_ExecCodeModule(string name, BorrowedReference code)
822822
{
823-
using var namePtr = new StrPtr(name, Encoding.UTF8);
823+
using var namePtr = new StrPtr(name, Encodings.UTF8);
824824
return Delegates.PyImport_ExecCodeModule(namePtr, code);
825825
}
826826

@@ -867,13 +867,13 @@ internal static bool PyObject_IsIterable(BorrowedReference ob)
867867

868868
internal static int PyObject_HasAttrString(BorrowedReference pointer, string name)
869869
{
870-
using var namePtr = new StrPtr(name, Encoding.UTF8);
870+
using var namePtr = new StrPtr(name, Encodings.UTF8);
871871
return Delegates.PyObject_HasAttrString(pointer, namePtr);
872872
}
873873

874874
internal static NewReference PyObject_GetAttrString(BorrowedReference pointer, string name)
875875
{
876-
using var namePtr = new StrPtr(name, Encoding.UTF8);
876+
using var namePtr = new StrPtr(name, Encodings.UTF8);
877877
return Delegates.PyObject_GetAttrString(pointer, namePtr);
878878
}
879879

@@ -884,12 +884,12 @@ internal static NewReference PyObject_GetAttrString(BorrowedReference pointer, S
884884
internal static int PyObject_DelAttr(BorrowedReference @object, BorrowedReference name) => Delegates.PyObject_SetAttr(@object, name, null);
885885
internal static int PyObject_DelAttrString(BorrowedReference @object, string name)
886886
{
887-
using var namePtr = new StrPtr(name, Encoding.UTF8);
887+
using var namePtr = new StrPtr(name, Encodings.UTF8);
888888
return Delegates.PyObject_SetAttrString(@object, namePtr, null);
889889
}
890890
internal static int PyObject_SetAttrString(BorrowedReference @object, string name, BorrowedReference value)
891891
{
892-
using var namePtr = new StrPtr(name, Encoding.UTF8);
892+
using var namePtr = new StrPtr(name, Encodings.UTF8);
893893
return Delegates.PyObject_SetAttrString(@object, namePtr, value);
894894
}
895895

@@ -1071,7 +1071,7 @@ internal static bool PyBool_CheckExact(BorrowedReference ob)
10711071

10721072
internal static NewReference PyLong_FromString(string value, int radix)
10731073
{
1074-
using var valPtr = new StrPtr(value, Encoding.UTF8);
1074+
using var valPtr = new StrPtr(value, Encodings.UTF8);
10751075
return Delegates.PyLong_FromString(valPtr, IntPtr.Zero, radix);
10761076
}
10771077

@@ -1252,12 +1252,14 @@ internal static bool PyString_CheckExact(BorrowedReference ob)
12521252

12531253
internal static NewReference PyString_FromString(string value)
12541254
{
1255+
int byteorder = BitConverter.IsLittleEndian ? -1 : 1;
1256+
int* byteorderPtr = &byteorder;
12551257
fixed(char* ptr = value)
12561258
return Delegates.PyUnicode_DecodeUTF16(
12571259
(IntPtr)ptr,
12581260
value.Length * sizeof(Char),
12591261
IntPtr.Zero,
1260-
IntPtr.Zero
1262+
(IntPtr)byteorderPtr
12611263
);
12621264
}
12631265

@@ -1272,7 +1274,7 @@ internal static NewReference EmptyPyBytes()
12721274
internal static NewReference PyByteArray_FromStringAndSize(IntPtr strPtr, nint len) => Delegates.PyByteArray_FromStringAndSize(strPtr, len);
12731275
internal static NewReference PyByteArray_FromStringAndSize(string s)
12741276
{
1275-
using var ptr = new StrPtr(s, Encoding.UTF8);
1277+
using var ptr = new StrPtr(s, Encodings.UTF8);
12761278
return PyByteArray_FromStringAndSize(ptr.RawPointer, checked((nint)ptr.ByteCount));
12771279
}
12781280

@@ -1300,7 +1302,7 @@ internal static IntPtr PyBytes_AsString(BorrowedReference ob)
13001302

13011303
internal static NewReference PyUnicode_InternFromString(string s)
13021304
{
1303-
using var ptr = new StrPtr(s, Encoding.UTF8);
1305+
using var ptr = new StrPtr(s, Encodings.UTF8);
13041306
return Delegates.PyUnicode_InternFromString(ptr);
13051307
}
13061308

@@ -1375,7 +1377,7 @@ internal static bool PyDict_Check(BorrowedReference ob)
13751377

13761378
internal static BorrowedReference PyDict_GetItemString(BorrowedReference pointer, string key)
13771379
{
1378-
using var keyStr = new StrPtr(key, Encoding.UTF8);
1380+
using var keyStr = new StrPtr(key, Encodings.UTF8);
13791381
return Delegates.PyDict_GetItemString(pointer, keyStr);
13801382
}
13811383

@@ -1391,7 +1393,7 @@ internal static BorrowedReference PyDict_GetItemString(BorrowedReference pointer
13911393
/// </summary>
13921394
internal static int PyDict_SetItemString(BorrowedReference dict, string key, BorrowedReference value)
13931395
{
1394-
using var keyPtr = new StrPtr(key, Encoding.UTF8);
1396+
using var keyPtr = new StrPtr(key, Encodings.UTF8);
13951397
return Delegates.PyDict_SetItemString(dict, keyPtr, value);
13961398
}
13971399

@@ -1400,7 +1402,7 @@ internal static int PyDict_SetItemString(BorrowedReference dict, string key, Bor
14001402

14011403
internal static int PyDict_DelItemString(BorrowedReference pointer, string key)
14021404
{
1403-
using var keyPtr = new StrPtr(key, Encoding.UTF8);
1405+
using var keyPtr = new StrPtr(key, Encodings.UTF8);
14041406
return Delegates.PyDict_DelItemString(pointer, keyPtr);
14051407
}
14061408

@@ -1515,7 +1517,7 @@ internal static bool PyIter_Check(BorrowedReference ob)
15151517

15161518
internal static NewReference PyModule_New(string name)
15171519
{
1518-
using var namePtr = new StrPtr(name, Encoding.UTF8);
1520+
using var namePtr = new StrPtr(name, Encodings.UTF8);
15191521
return Delegates.PyModule_New(namePtr);
15201522
}
15211523

@@ -1529,7 +1531,7 @@ internal static NewReference PyModule_New(string name)
15291531
/// <returns>Return -1 on error, 0 on success.</returns>
15301532
internal static int PyModule_AddObject(BorrowedReference module, string name, StolenReference value)
15311533
{
1532-
using var namePtr = new StrPtr(name, Encoding.UTF8);
1534+
using var namePtr = new StrPtr(name, Encodings.UTF8);
15331535
IntPtr valueAddr = value.DangerousGetAddressOrNull();
15341536
int res = Delegates.PyModule_AddObject(module, namePtr, valueAddr);
15351537
// We can't just exit here because the reference is stolen only on success.
@@ -1547,7 +1549,7 @@ internal static int PyModule_AddObject(BorrowedReference module, string name, St
15471549

15481550
internal static NewReference PyImport_ImportModule(string name)
15491551
{
1550-
using var namePtr = new StrPtr(name, Encoding.UTF8);
1552+
using var namePtr = new StrPtr(name, Encodings.UTF8);
15511553
return Delegates.PyImport_ImportModule(namePtr);
15521554
}
15531555

@@ -1556,7 +1558,7 @@ internal static NewReference PyImport_ImportModule(string name)
15561558

15571559
internal static BorrowedReference PyImport_AddModule(string name)
15581560
{
1559-
using var namePtr = new StrPtr(name, Encoding.UTF8);
1561+
using var namePtr = new StrPtr(name, Encodings.UTF8);
15601562
return Delegates.PyImport_AddModule(namePtr);
15611563
}
15621564

@@ -1584,13 +1586,13 @@ internal static void PySys_SetArgvEx(int argc, string[] argv, int updatepath)
15841586

15851587
internal static BorrowedReference PySys_GetObject(string name)
15861588
{
1587-
using var namePtr = new StrPtr(name, Encoding.UTF8);
1589+
using var namePtr = new StrPtr(name, Encodings.UTF8);
15881590
return Delegates.PySys_GetObject(namePtr);
15891591
}
15901592

15911593
internal static int PySys_SetObject(string name, BorrowedReference ob)
15921594
{
1593-
using var namePtr = new StrPtr(name, Encoding.UTF8);
1595+
using var namePtr = new StrPtr(name, Encodings.UTF8);
15941596
return Delegates.PySys_SetObject(namePtr, ob);
15951597
}
15961598

@@ -1689,7 +1691,7 @@ internal static IntPtr PyMem_Malloc(long size)
16891691

16901692
internal static void PyErr_SetString(BorrowedReference ob, string message)
16911693
{
1692-
using var msgPtr = new StrPtr(message, Encoding.UTF8);
1694+
using var msgPtr = new StrPtr(message, Encodings.UTF8);
16931695
Delegates.PyErr_SetString(ob, msgPtr);
16941696
}
16951697

src/runtime/Util/Encodings.cs

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
using System;
2+
using System.Text;
3+
4+
namespace Python.Runtime;
5+
6+
static class Encodings {
7+
public static System.Text.Encoding UTF8 = new UTF8Encoding(false, true);
8+
public static System.Text.Encoding UTF16 = new UnicodeEncoding(!BitConverter.IsLittleEndian, false, true);
9+
public static System.Text.Encoding UTF32 = new UTF32Encoding(!BitConverter.IsLittleEndian, false, true);
10+
}

tests/test_conversion.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -510,6 +510,9 @@ def test_string_conversion():
510510
ob.StringField = System.String(u'\uffff\uffff')
511511
assert ob.StringField == u'\uffff\uffff'
512512

513+
ob.StringField = System.String("\ufeffbom")
514+
assert ob.StringField == "\ufeffbom"
515+
513516
ob.StringField = None
514517
assert ob.StringField is None
515518

0 commit comments

Comments
 (0)