From e74e1958c4540ce65d9735dd0410e156cbfee7e0 Mon Sep 17 00:00:00 2001 From: Noah <33094578+coolreader18@users.noreply.github.com> Date: Mon, 25 Jan 2021 15:11:24 -0600 Subject: [PATCH 1/2] Switch to csv-core for the _csv module --- Cargo.lock | 2 +- vm/Cargo.toml | 2 +- vm/src/stdlib/csv.rs | 241 ++++++++++++++++++++----------------------- 3 files changed, 114 insertions(+), 131 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index f3698d6084..d890259104 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1961,7 +1961,7 @@ dependencies = [ "crc", "crc32fast", "crossbeam-utils", - "csv", + "csv-core", "digest", "dns-lookup", "exitcode", diff --git a/vm/Cargo.toml b/vm/Cargo.toml index b1f3b4824e..0b370588aa 100644 --- a/vm/Cargo.toml +++ b/vm/Cargo.toml @@ -67,7 +67,7 @@ crc = "^1.0.0" bitflags = "1.2.1" libc = "0.2" nix = "0.18" -csv = "1.1.1" +csv-core = "0.1" paste = "0.1" base64 = "0.13" is-macro = "0.1" diff --git a/vm/src/stdlib/csv.rs b/vm/src/stdlib/csv.rs index 9ffcd2088e..6e06724b61 100644 --- a/vm/src/stdlib/csv.rs +++ b/vm/src/stdlib/csv.rs @@ -1,15 +1,16 @@ -use csv as rust_csv; use itertools::{self, Itertools}; use std::fmt::{self, Debug, Formatter}; -use crate::builtins::pystr::{self, PyStr}; +use crate::builtins::pystr::{PyStr, PyStrRef}; use crate::builtins::pytype::PyTypeRef; -use crate::common::lock::PyRwLock; -use crate::function::FuncArgs; +use crate::common::lock::PyMutex; +use crate::function::{ArgumentError, FromArgs, FuncArgs}; +use crate::iterator; use crate::pyobject::{ - BorrowValue, IntoPyObject, PyClassImpl, PyIterable, PyObjectRef, PyResult, PyValue, StaticType, - TryFromObject, TypeProtocol, + BorrowValue, PyClassImpl, PyObjectRef, PyRef, PyResult, PyValue, StaticType, TryFromObject, + TypeProtocol, }; +use crate::slots::PyIter; use crate::types::create_simple_type; use crate::VirtualMachine; @@ -26,12 +27,12 @@ struct ReaderOption { quotechar: u8, } -impl ReaderOption { - fn new(args: FuncArgs, vm: &VirtualMachine) -> PyResult { - let delimiter = if let Some(delimiter) = args.get_optional_kwarg("delimiter") { - *pystr::borrow_value(&delimiter) - .as_bytes() - .iter() +impl FromArgs for ReaderOption { + fn from_args(vm: &VirtualMachine, args: &mut FuncArgs) -> Result { + let delimiter = if let Some(delimiter) = args.kwargs.remove("delimiter") { + PyStrRef::try_from_object(vm, delimiter)? + .borrow_value() + .bytes() .exactly_one() .map_err(|_| { let msg = r#""delimiter" must be a 1-character string"#; @@ -41,10 +42,10 @@ impl ReaderOption { b',' }; - let quotechar = if let Some(quotechar) = args.get_optional_kwarg("quotechar") { - *pystr::borrow_value("echar) - .as_bytes() - .iter() + let quotechar = if let Some(quotechar) = args.kwargs.remove("quotechar") { + PyStrRef::try_from_object(vm, quotechar)? + .borrow_value() + .bytes() .exactly_one() .map_err(|_| { let msg = r#""quotechar" must be a 1-character string"#; @@ -61,71 +62,25 @@ impl ReaderOption { } } -pub fn build_reader( - iterable: PyIterable, - args: FuncArgs, - vm: &VirtualMachine, -) -> PyResult { - let config = ReaderOption::new(args, vm)?; - - Ok(Reader::new(iterable, config).into_object(vm)) -} - -fn into_strings(iterable: &PyIterable, vm: &VirtualMachine) -> PyResult> { - iterable - .iter(vm)? - .map(|py_obj_ref| { - match_class!(match py_obj_ref? { - py_str @ PyStr => Ok(py_str.borrow_value().trim().to_owned()), - obj => { - let msg = format!( - "iterator should return strings, not {} (did you open the file in text mode?)", - obj.class().name - ); - Err(vm.new_type_error(msg)) - } - }) - }) - .collect::>>() +impl ReaderOption { + fn to_reader(&self) -> csv_core::Reader { + csv_core::ReaderBuilder::new() + .delimiter(self.delimiter) + .quote(self.quotechar) + .build() + } } -type MemIO = std::io::Cursor>; - -#[allow(dead_code)] -enum ReadState { - PyIter(PyIterable, ReaderOption), - CsvIter(rust_csv::StringRecordsIntoIter), -} - -impl ReadState { - fn new(iter: PyIterable, config: ReaderOption) -> Self { - ReadState::PyIter(iter, config) - } - - fn cast_to_reader(&mut self, vm: &VirtualMachine) -> PyResult<()> { - if let ReadState::PyIter(ref iterable, ref config) = self { - let lines = into_strings(iterable, vm)?; - let contents = itertools::join(lines, "\n"); - - let bytes = Vec::from(contents.as_bytes()); - let reader = MemIO::new(bytes); - - let csv_iter = rust_csv::ReaderBuilder::new() - .delimiter(config.delimiter) - .quote(config.quotechar) - .has_headers(false) - .from_reader(reader) - .into_records(); - - *self = ReadState::CsvIter(csv_iter); - } - Ok(()) - } +struct ReadState { + buffer: Vec, + output_ends: Vec, + reader: csv_core::Reader, } #[pyclass(module = "csv", name = "Reader")] struct Reader { - state: PyRwLock, + iter: PyObjectRef, + state: PyMutex, } impl Debug for Reader { @@ -140,66 +95,94 @@ impl PyValue for Reader { } } -impl Reader { - fn new(iter: PyIterable, config: ReaderOption) -> Self { - let state = PyRwLock::new(ReadState::new(iter, config)); - Reader { state } - } -} - -#[pyimpl] -impl Reader { - #[pyslot] - #[pymethod(magic)] - fn iter(this: PyObjectRef, vm: &VirtualMachine) -> PyResult { - let this = match this.downcast::() { - Ok(reader) => reader, - Err(_) => return Err(vm.new_type_error("unexpected payload for __iter__".to_owned())), - }; - this.state.write().cast_to_reader(vm)?; - Ok(this.into_pyobject(vm)) - } - - #[pyslot] - fn tp_iternext(zelf: &PyObjectRef, vm: &VirtualMachine) -> PyResult { - let zelf = match zelf.downcast_ref::() { - Some(reader) => reader, - None => return Err(vm.new_type_error("unexpected payload for __next__".to_owned())), - }; - let mut state = zelf.state.write(); - state.cast_to_reader(vm)?; - - if let ReadState::CsvIter(ref mut reader) = &mut *state { - if let Some(row) = reader.next() { - match row { - Ok(records) => { - let iter = records - .into_iter() - .map(|bytes| bytes.into_pyobject(vm)) - .collect::>(); - Ok(vm.ctx.new_list(iter)) - } - Err(_err) => { - let msg = String::from("Decode Error"); - let decode_error = vm.new_unicode_decode_error(msg); - Err(decode_error) - } +#[pyimpl(with(PyIter))] +impl Reader {} +impl PyIter for Reader { + fn next(zelf: &PyRef, vm: &VirtualMachine) -> PyResult { + let string = iterator::call_next(vm, &zelf.iter)?; + let string = string.downcast::().map_err(|obj| { + vm.new_type_error(format!( + "iterator should return strings, not {} (the file should be opened in text mode)", + obj.class().name + )) + })?; + let input = string.borrow_value().as_bytes(); + + let mut state = zelf.state.lock(); + let ReadState { + buffer, + output_ends, + reader, + } = &mut *state; + + let mut input_offset = 0; + let mut output_offset = 0; + let mut output_ends_offset = 0; + + loop { + let (res, nread, nwritten, nends) = reader.read_record( + &input[input_offset..], + &mut buffer[output_offset..], + &mut output_ends[output_ends_offset..], + ); + input_offset += nread; + output_offset += nwritten; + output_ends_offset += nends; + match res { + csv_core::ReadRecordResult::InputEmpty => {} + csv_core::ReadRecordResult::OutputFull => { + let new_size = buffer.len() * 2; + buffer.resize(new_size, 0u8); } - } else { - Err(vm.new_stop_iteration()) + csv_core::ReadRecordResult::OutputEndsFull => { + let new_size = output_ends.len() * 2; + output_ends.resize(new_size, 0); + } + csv_core::ReadRecordResult::Record => break, + csv_core::ReadRecordResult::End => return Err(vm.new_stop_iteration()), } - } else { - unreachable!() } + let rest = &input[input_offset..]; + if !rest.iter().all(|&c| matches!(c, b'\r' | b'\n')) { + return Err(vm.new_value_error( + "new-line character seen in unquoted field - \ + do you need to open the file in universal-newline mode?" + .to_owned(), + )); + } + + let mut prev_end = 0; + let out = output_ends[..output_ends_offset] + .iter() + .map(|&end| { + let range = prev_end..end; + prev_end = end; + std::str::from_utf8(&buffer[range]) + .map(|s| vm.ctx.new_str(s.to_owned())) + // not sure if this is possible - the input was all strings + .map_err(|_e| vm.new_unicode_decode_error("csv not utf8".to_owned())) + }) + .collect::>()?; + Ok(vm.ctx.new_list(out)) } } -fn _csv_reader(fp: PyObjectRef, args: FuncArgs, vm: &VirtualMachine) -> PyResult { - if let Ok(iterable) = PyIterable::::try_from_object(vm, fp) { - build_reader(iterable, args, vm) - } else { - Err(vm.new_type_error("argument 1 must be an iterator".to_owned())) - } +fn _csv_reader( + iter: PyObjectRef, + options: ReaderOption, + // TODO: handle quote style, etc + _rest: FuncArgs, + vm: &VirtualMachine, +) -> PyResult { + let iter = iterator::get_iter(vm, iter)?; + Ok(Reader { + iter, + state: PyMutex::new(ReadState { + buffer: vec![0; 1024], + output_ends: vec![0; 16], + reader: options.to_reader(), + }), + }) } pub fn make_module(vm: &VirtualMachine) -> PyObjectRef { From 3aafd1a85fcf24a6d70fcd135d9975ac0b49731d Mon Sep 17 00:00:00 2001 From: Noah <33094578+coolreader18@users.noreply.github.com> Date: Mon, 25 Jan 2021 16:35:45 -0600 Subject: [PATCH 2/2] Add csv.writer --- Lib/csv.py | 2 +- vm/src/stdlib/csv.rs | 162 +++++++++++++++++++++++++++++++++++++------ 2 files changed, 142 insertions(+), 22 deletions(-) diff --git a/Lib/csv.py b/Lib/csv.py index dd896f28c1..2f38bb1a19 100644 --- a/Lib/csv.py +++ b/Lib/csv.py @@ -4,7 +4,7 @@ """ import re -from _csv import Error, reader, \ +from _csv import Error, writer, reader, \ QUOTE_MINIMAL, QUOTE_ALL, QUOTE_NONNUMERIC, QUOTE_NONE, \ __doc__ diff --git a/vm/src/stdlib/csv.rs b/vm/src/stdlib/csv.rs index 6e06724b61..fd297ab514 100644 --- a/vm/src/stdlib/csv.rs +++ b/vm/src/stdlib/csv.rs @@ -1,5 +1,5 @@ use itertools::{self, Itertools}; -use std::fmt::{self, Debug, Formatter}; +use std::fmt; use crate::builtins::pystr::{PyStr, PyStrRef}; use crate::builtins::pytype::PyTypeRef; @@ -7,8 +7,8 @@ use crate::common::lock::PyMutex; use crate::function::{ArgumentError, FromArgs, FuncArgs}; use crate::iterator; use crate::pyobject::{ - BorrowValue, PyClassImpl, PyObjectRef, PyRef, PyResult, PyValue, StaticType, TryFromObject, - TypeProtocol, + BorrowValue, PyClassImpl, PyIterable, PyObjectRef, PyRef, PyResult, PyValue, StaticType, + TryFromObject, TypeProtocol, }; use crate::slots::PyIter; use crate::types::create_simple_type; @@ -22,12 +22,12 @@ pub enum QuoteStyle { QuoteNone, } -struct ReaderOption { +struct FormatOptions { delimiter: u8, quotechar: u8, } -impl FromArgs for ReaderOption { +impl FromArgs for FormatOptions { fn from_args(vm: &VirtualMachine, args: &mut FuncArgs) -> Result { let delimiter = if let Some(delimiter) = args.kwargs.remove("delimiter") { PyStrRef::try_from_object(vm, delimiter)? @@ -55,18 +55,26 @@ impl FromArgs for ReaderOption { b'"' }; - Ok(ReaderOption { + Ok(FormatOptions { delimiter, quotechar, }) } } -impl ReaderOption { +impl FormatOptions { fn to_reader(&self) -> csv_core::Reader { csv_core::ReaderBuilder::new() .delimiter(self.delimiter) .quote(self.quotechar) + .terminator(csv_core::Terminator::CRLF) + .build() + } + fn to_writer(&self) -> csv_core::Writer { + csv_core::WriterBuilder::new() + .delimiter(self.delimiter) + .quote(self.quotechar) + .terminator(csv_core::Terminator::CRLF) .build() } } @@ -77,14 +85,14 @@ struct ReadState { reader: csv_core::Reader, } -#[pyclass(module = "csv", name = "Reader")] +#[pyclass(module = "_csv", name = "reader")] struct Reader { iter: PyObjectRef, state: PyMutex, } -impl Debug for Reader { - fn fmt(&self, f: &mut Formatter) -> fmt::Result { +impl fmt::Debug for Reader { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { write!(f, "_csv.reader") } } @@ -130,14 +138,8 @@ impl PyIter for Reader { output_ends_offset += nends; match res { csv_core::ReadRecordResult::InputEmpty => {} - csv_core::ReadRecordResult::OutputFull => { - let new_size = buffer.len() * 2; - buffer.resize(new_size, 0u8); - } - csv_core::ReadRecordResult::OutputEndsFull => { - let new_size = output_ends.len() * 2; - output_ends.resize(new_size, 0); - } + csv_core::ReadRecordResult::OutputFull => resize_buf(buffer), + csv_core::ReadRecordResult::OutputEndsFull => resize_buf(output_ends), csv_core::ReadRecordResult::Record => break, csv_core::ReadRecordResult::End => return Err(vm.new_stop_iteration()), } @@ -169,7 +171,7 @@ impl PyIter for Reader { fn _csv_reader( iter: PyObjectRef, - options: ReaderOption, + options: FormatOptions, // TODO: handle quote style, etc _rest: FuncArgs, vm: &VirtualMachine, @@ -185,16 +187,134 @@ fn _csv_reader( }) } +struct WriteState { + buffer: Vec, + writer: csv_core::Writer, +} + +#[pyclass(module = "_csv", name = "writer")] +struct Writer { + write: PyObjectRef, + state: PyMutex, +} + +impl fmt::Debug for Writer { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!(f, "_csv.writer") + } +} + +impl PyValue for Writer { + fn class(_vm: &VirtualMachine) -> &PyTypeRef { + Self::static_type() + } +} + +#[pyimpl] +impl Writer { + #[pymethod] + fn writerow(&self, row: PyObjectRef, vm: &VirtualMachine) -> PyResult { + let mut state = self.state.lock(); + let WriteState { buffer, writer } = &mut *state; + + let mut buffer_offset = 0; + + macro_rules! handle_res { + ($x:expr) => {{ + let (res, nwritten) = $x; + buffer_offset += nwritten; + match res { + csv_core::WriteResult::InputEmpty => break, + csv_core::WriteResult::OutputFull => resize_buf(buffer), + } + }}; + } + + let row = PyIterable::try_from_object(vm, row)?; + for field in row.iter(vm)? { + let field: PyObjectRef = field?; + let stringified; + let data: &[u8] = match_class!(match field { + ref s @ PyStr => s.borrow_value().as_bytes(), + crate::builtins::PyNone => b"", + ref obj => { + stringified = vm.to_str(obj)?; + stringified.borrow_value().as_bytes() + } + }); + + let mut input_offset = 0; + + loop { + let (res, nread, nwritten) = + writer.field(&data[input_offset..], &mut buffer[buffer_offset..]); + input_offset += nread; + handle_res!((res, nwritten)); + } + + loop { + handle_res!(writer.delimiter(&mut buffer[buffer_offset..])); + } + } + + loop { + handle_res!(writer.terminator(&mut buffer[buffer_offset..])); + } + + let s = std::str::from_utf8(&buffer[..buffer_offset]) + .map_err(|_| vm.new_unicode_decode_error("csv not utf8".to_owned()))?; + + vm.invoke(&self.write, (s.to_owned(),)) + } + + #[pymethod] + fn writerows(&self, rows: PyIterable, vm: &VirtualMachine) -> PyResult<()> { + for row in rows.iter(vm)? { + self.writerow(row?, vm)?; + } + Ok(()) + } +} + +fn _csv_writer( + file: PyObjectRef, + options: FormatOptions, + // TODO: handle quote style, etc + _rest: FuncArgs, + vm: &VirtualMachine, +) -> PyResult { + let write = match vm.get_attribute_opt(file.clone(), "write")? { + Some(write_meth) => write_meth, + None if vm.is_callable(&file) => file, + None => return Err(vm.new_type_error("argument 1 must have a \"write\" method".to_owned())), + }; + + Ok(Writer { + write, + state: PyMutex::new(WriteState { + buffer: vec![0; 1024], + writer: options.to_writer(), + }), + }) +} + +#[inline] +fn resize_buf(buf: &mut Vec) { + let new_size = buf.len() * 2; + buf.resize(new_size, T::zero()); +} + pub fn make_module(vm: &VirtualMachine) -> PyObjectRef { let ctx = &vm.ctx; - let reader_type = Reader::make_class(ctx); + Reader::make_class(ctx); + Writer::make_class(ctx); let error = create_simple_type("Error", &ctx.exceptions.exception_type); py_module!(vm, "_csv", { "reader" => named_function!(ctx, _csv, reader), - "Reader" => reader_type, + "writer" => named_function!(ctx, _csv, writer), "Error" => error, // constants "QUOTE_MINIMAL" => ctx.new_int(QuoteStyle::QuoteMinimal as i32),