Skip to content

bpo-27580: Add support of null characters in csv #28808

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Oct 9, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
44 changes: 39 additions & 5 deletions Lib/test/test_csv.py
Original file line number Diff line number Diff line change
Expand Up @@ -217,6 +217,17 @@ def test_write_escape(self):
self._write_test(['C\\', '6', '7', 'X"'], 'C\\\\,6,7,"X"""',
escapechar='\\', quoting=csv.QUOTE_MINIMAL)

# Check that csv.writer ends every row with the configured lineterminator.
# The candidates cover the usual newlines, a multi-character terminator
# ('!@#') and a lone NUL character ('\0').
def test_write_lineterminator(self):
for lineterminator in '\r\n', '\n', '\r', '!@#', '\0':
with self.subTest(lineterminator=lineterminator):
with StringIO() as sio:
writer = csv.writer(sio, lineterminator=lineterminator)
# One row of strings and one row of ints; each must be followed by
# the terminator in the captured output.
writer.writerow(['a', 'b'])
writer.writerow([1, 2])
self.assertEqual(sio.getvalue(),
f'a,b{lineterminator}'
f'1,2{lineterminator}')

def test_write_iterable(self):
self._write_test(iter(['a', 1, 'p,q']), 'a,1,"p,q"')
self._write_test(iter(['a', 1, None]), 'a,1,')
Expand Down Expand Up @@ -286,14 +297,10 @@ def test_read_oddinputs(self):
self._read_test([''], [[]])
self.assertRaises(csv.Error, self._read_test,
['"ab"c'], None, strict = 1)
# cannot handle null bytes for the moment
self.assertRaises(csv.Error, self._read_test,
['ab\0c'], None, strict = 1)
self._read_test(['"ab"c'], [['abc']], doublequote = 0)

self.assertRaises(csv.Error, self._read_test,
[b'ab\0c'], None)

[b'abc'], None)

def test_read_eol(self):
self._read_test(['a,b'], [['a','b']])
Expand All @@ -313,13 +320,30 @@ def test_read_eof(self):
self.assertRaises(csv.Error, self._read_test,
['^'], [], escapechar='^', strict=True)

# NUL characters in the input are expected to come back as ordinary field
# data: as the whole field, at the start or end of a field, right after an
# escapechar, and inside a quoted field.
# NOTE(review): _read_test is defined outside this view; presumably it takes
# (input lines, expected rows, **reader kwargs) - confirm.
def test_read_nul(self):
self._read_test(['\0'], [['\0']])
self._read_test(['a,\0b,c'], [['a', '\0b', 'c']])
self._read_test(['a,b\0,c'], [['a', 'b\0', 'c']])
self._read_test(['a,b\\\0,c'], [['a', 'b\0', 'c']], escapechar='\\')
self._read_test(['a,"\0b",c'], [['a', '\0b', 'c']])

# The same three fields must be produced with the default ',' delimiter,
# with an explicit ';' delimiter, and with NUL ('\0') as the delimiter.
def test_read_delimiter(self):
self._read_test(['a,b,c'], [['a', 'b', 'c']])
self._read_test(['a;b;c'], [['a', 'b', 'c']], delimiter=';')
self._read_test(['a\0b\0c'], [['a', 'b', 'c']], delimiter='\0')

# Exercise escapechar handling when reading.
def test_read_escape(self):
# With escapechar='\\' the escape character is dropped and the character
# after it is kept literally, both outside and inside quoted fields.
self._read_test(['a,\\b,c'], [['a', 'b', 'c']], escapechar='\\')
self._read_test(['a,b\\,c'], [['a', 'b,c']], escapechar='\\')
self._read_test(['a,"b\\,c"'], [['a', 'b,c']], escapechar='\\')
self._read_test(['a,"b,\\c"'], [['a', 'b,c']], escapechar='\\')
self._read_test(['a,"b,c\\""'], [['a', 'b,c"']], escapechar='\\')
# An escapechar that follows the closing quote at the end of the line is
# expected to be kept in the field.
self._read_test(['a,"b,c"\\'], [['a', 'b,c\\']], escapechar='\\')
# Other escape characters behave the same way, including NUL.
self._read_test(['a,^b,c'], [['a', 'b', 'c']], escapechar='^')
self._read_test(['a,\0b,c'], [['a', 'b', 'c']], escapechar='\0')
# With escapechar None, empty, or not given, the backslash is plain data.
self._read_test(['a,\\b,c'], [['a', '\\b', 'c']], escapechar=None)
self._read_test(['a,\\b,c'], [['a', '\\b', 'c']], escapechar='')
self._read_test(['a,\\b,c'], [['a', '\\b', 'c']])

def test_read_quoting(self):
self._read_test(['1,",3,",5'], [['1', ',3,', '5']])
Expand All @@ -334,6 +358,8 @@ def test_read_quoting(self):
self.assertRaises(ValueError, self._read_test,
['abc,3'], [[]],
quoting=csv.QUOTE_NONNUMERIC)
self._read_test(['1,@,3,@,5'], [['1', ',3,', '5']], quotechar='@')
self._read_test(['1,\0,3,\0,5'], [['1', ',3,', '5']], quotechar='\0')

def test_read_bigfield(self):
# This exercises the buffer realloc functionality and field size
Expand Down Expand Up @@ -1074,6 +1100,12 @@ class TestSniffer(unittest.TestCase):
a,b
""")

# Sniffer sample in which every line contains exactly one NUL character;
# test_delimiters expects sniff() to report '\0' as the delimiter for it.
sample14 = """\
abc\0def
ghijkl\0mno
ghi\0jkl
"""

def test_issue43625(self):
sniffer = csv.Sniffer()
self.assertTrue(sniffer.has_header(self.sample12))
Expand Down Expand Up @@ -1142,6 +1174,8 @@ def test_delimiters(self):
dialect = sniffer.sniff(self.sample9)
self.assertEqual(dialect.delimiter, '+')
self.assertEqual(dialect.quotechar, "'")
dialect = sniffer.sniff(self.sample14)
self.assertEqual(dialect.delimiter, '\0')

def test_doublequote(self):
sniffer = csv.Sniffer()
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Add support for null characters in :mod:`csv`.
63 changes: 30 additions & 33 deletions Modules/_csv.c
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,9 @@ module instead.
#include "structmember.h" // PyMemberDef
#include <stdbool.h>

/* Sentinel values carried in Py_UCS4 variables in place of a real character.
 *
 * NOT_SET marks a dialect character (delimiter, quotechar, escapechar) that
 * has no value; get_char_or_None() reports it to Python as None.
 * EOL is passed to parse_process_char() after the last character of each
 * input line to signal the end of that line.
 *
 * NOTE(review): both rely on the cast of a negative value wrapping to the top
 * of the Py_UCS4 range, i.e. above any valid code point - confirm that
 * Py_UCS4 is an unsigned 32-bit type on all supported builds. */
#define NOT_SET ((Py_UCS4)-1)
#define EOL ((Py_UCS4)-2)


typedef struct {
PyObject *error_obj; /* CSV exception */
Expand Down Expand Up @@ -153,9 +156,9 @@ get_dialect_from_registry(PyObject *name_obj, _csvstate *module_state)
}

static PyObject *
get_nullchar_as_None(Py_UCS4 c)
get_char_or_None(Py_UCS4 c)
{
if (c == '\0') {
if (c == NOT_SET) {
Py_RETURN_NONE;
}
else
Expand All @@ -172,19 +175,19 @@ Dialect_get_lineterminator(DialectObj *self, void *Py_UNUSED(ignored))
static PyObject *
Dialect_get_delimiter(DialectObj *self, void *Py_UNUSED(ignored))
{
return get_nullchar_as_None(self->delimiter);
return get_char_or_None(self->delimiter);
}

static PyObject *
Dialect_get_escapechar(DialectObj *self, void *Py_UNUSED(ignored))
{
return get_nullchar_as_None(self->escapechar);
return get_char_or_None(self->escapechar);
}

static PyObject *
Dialect_get_quotechar(DialectObj *self, void *Py_UNUSED(ignored))
{
return get_nullchar_as_None(self->quotechar);
return get_char_or_None(self->quotechar);
}

static PyObject *
Expand Down Expand Up @@ -235,7 +238,7 @@ _set_char_or_none(const char *name, Py_UCS4 *target, PyObject *src, Py_UCS4 dflt
*target = dflt;
}
else {
*target = '\0';
*target = NOT_SET;
if (src != Py_None) {
if (!PyUnicode_Check(src)) {
PyErr_Format(PyExc_TypeError,
Expand All @@ -254,7 +257,7 @@ _set_char_or_none(const char *name, Py_UCS4 *target, PyObject *src, Py_UCS4 dflt
return -1;
}
/* PyUnicode_READY() is called in PyUnicode_GetLength() */
else {
else if (len > 0) {
*target = PyUnicode_READ_CHAR(src, 0);
}
}
Expand All @@ -269,7 +272,7 @@ _set_char(const char *name, Py_UCS4 *target, PyObject *src, Py_UCS4 dflt)
*target = dflt;
}
else {
*target = '\0';
*target = NOT_SET;
if (!PyUnicode_Check(src)) {
PyErr_Format(PyExc_TypeError,
"\"%s\" must be string, not %.200s", name,
Expand All @@ -287,7 +290,7 @@ _set_char(const char *name, Py_UCS4 *target, PyObject *src, Py_UCS4 dflt)
return -1;
}
/* PyUnicode_READY() is called in PyUnicode_GetLength() */
else {
else if (len > 0) {
*target = PyUnicode_READ_CHAR(src, 0);
}
}
Expand Down Expand Up @@ -481,7 +484,7 @@ dialect_new(PyTypeObject *type, PyObject *args, PyObject *kwargs)
goto err
DIASET(_set_char, "delimiter", &self->delimiter, delimiter, ',');
DIASET(_set_bool, "doublequote", &self->doublequote, doublequote, true);
DIASET(_set_char_or_none, "escapechar", &self->escapechar, escapechar, 0);
DIASET(_set_char_or_none, "escapechar", &self->escapechar, escapechar, NOT_SET);
DIASET(_set_str, "lineterminator", &self->lineterminator, lineterminator, "\r\n");
DIASET(_set_char_or_none, "quotechar", &self->quotechar, quotechar, '"');
DIASET(_set_int, "quoting", &self->quoting, quoting, QUOTE_MINIMAL);
Expand All @@ -491,19 +494,19 @@ dialect_new(PyTypeObject *type, PyObject *args, PyObject *kwargs)
/* validate options */
if (dialect_check_quoting(self->quoting))
goto err;
if (self->delimiter == 0) {
if (self->delimiter == NOT_SET) {
PyErr_SetString(PyExc_TypeError,
"\"delimiter\" must be a 1-character string");
goto err;
}
if (quotechar == Py_None && quoting == NULL)
self->quoting = QUOTE_NONE;
if (self->quoting != QUOTE_NONE && self->quotechar == 0) {
if (self->quoting != QUOTE_NONE && self->quotechar == NOT_SET) {
PyErr_SetString(PyExc_TypeError,
"quotechar must be set if quoting enabled");
goto err;
}
if (self->lineterminator == 0) {
if (self->lineterminator == NULL) {
PyErr_SetString(PyExc_TypeError, "lineterminator must be set");
goto err;
}
Expand Down Expand Up @@ -670,7 +673,7 @@ parse_process_char(ReaderObj *self, _csvstate *module_state, Py_UCS4 c)
switch (self->state) {
case START_RECORD:
/* start of record */
if (c == '\0')
if (c == EOL)
/* empty line - return [] */
break;
else if (c == '\n' || c == '\r') {
Expand All @@ -682,11 +685,11 @@ parse_process_char(ReaderObj *self, _csvstate *module_state, Py_UCS4 c)
/* fallthru */
case START_FIELD:
/* expecting field */
if (c == '\n' || c == '\r' || c == '\0') {
if (c == '\n' || c == '\r' || c == EOL) {
/* save empty field - return [fields] */
if (parse_save_field(self) < 0)
return -1;
self->state = (c == '\0' ? START_RECORD : EAT_CRNL);
self->state = (c == EOL ? START_RECORD : EAT_CRNL);
}
else if (c == dialect->quotechar &&
dialect->quoting != QUOTE_NONE) {
Expand Down Expand Up @@ -722,25 +725,25 @@ parse_process_char(ReaderObj *self, _csvstate *module_state, Py_UCS4 c)
self->state = AFTER_ESCAPED_CRNL;
break;
}
if (c == '\0')
if (c == EOL)
c = '\n';
if (parse_add_char(self, module_state, c) < 0)
return -1;
self->state = IN_FIELD;
break;

case AFTER_ESCAPED_CRNL:
if (c == '\0')
if (c == EOL)
break;
/*fallthru*/

case IN_FIELD:
/* in unquoted field */
if (c == '\n' || c == '\r' || c == '\0') {
if (c == '\n' || c == '\r' || c == EOL) {
/* end of line - return [fields] */
if (parse_save_field(self) < 0)
return -1;
self->state = (c == '\0' ? START_RECORD : EAT_CRNL);
self->state = (c == EOL ? START_RECORD : EAT_CRNL);
}
else if (c == dialect->escapechar) {
/* possible escaped character */
Expand All @@ -761,7 +764,7 @@ parse_process_char(ReaderObj *self, _csvstate *module_state, Py_UCS4 c)

case IN_QUOTED_FIELD:
/* in quoted field */
if (c == '\0')
if (c == EOL)
;
else if (c == dialect->escapechar) {
/* Possible escape character */
Expand All @@ -786,7 +789,7 @@ parse_process_char(ReaderObj *self, _csvstate *module_state, Py_UCS4 c)
break;

case ESCAPE_IN_QUOTED_FIELD:
if (c == '\0')
if (c == EOL)
c = '\n';
if (parse_add_char(self, module_state, c) < 0)
return -1;
Expand All @@ -808,11 +811,11 @@ parse_process_char(ReaderObj *self, _csvstate *module_state, Py_UCS4 c)
return -1;
self->state = START_FIELD;
}
else if (c == '\n' || c == '\r' || c == '\0') {
else if (c == '\n' || c == '\r' || c == EOL) {
/* end of line - return [fields] */
if (parse_save_field(self) < 0)
return -1;
self->state = (c == '\0' ? START_RECORD : EAT_CRNL);
self->state = (c == EOL ? START_RECORD : EAT_CRNL);
}
else if (!dialect->strict) {
if (parse_add_char(self, module_state, c) < 0)
Expand All @@ -831,7 +834,7 @@ parse_process_char(ReaderObj *self, _csvstate *module_state, Py_UCS4 c)
case EAT_CRNL:
if (c == '\n' || c == '\r')
;
else if (c == '\0')
else if (c == EOL)
self->state = START_RECORD;
else {
PyErr_Format(module_state->error_obj,
Expand Down Expand Up @@ -909,20 +912,14 @@ Reader_iternext(ReaderObj *self)
linelen = PyUnicode_GET_LENGTH(lineobj);
while (linelen--) {
c = PyUnicode_READ(kind, data, pos);
if (c == '\0') {
Py_DECREF(lineobj);
PyErr_Format(module_state->error_obj,
"line contains NUL");
goto err;
}
if (parse_process_char(self, module_state, c) < 0) {
Py_DECREF(lineobj);
goto err;
}
pos++;
}
Py_DECREF(lineobj);
if (parse_process_char(self, module_state, 0) < 0)
if (parse_process_char(self, module_state, EOL) < 0)
goto err;
} while (self->state != START_RECORD);

Expand Down Expand Up @@ -1127,7 +1124,7 @@ join_append_data(WriterObj *self, unsigned int field_kind, const void *field_dat
*quoted = 1;
}
if (want_escape) {
if (!dialect->escapechar) {
if (dialect->escapechar == NOT_SET) {
PyErr_Format(self->error_obj,
"need to escape, but no escapechar set");
return -1;
Expand Down