Skip to content

fix dtypes not used on csv parse #657

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -3,3 +3,5 @@ node_modules
test/fixtures/*
test/samples/*
*.xlsx

testsss
11 changes: 7 additions & 4 deletions src/danfojs-base/io/browser/io.csv.ts
Original file line number Diff line number Diff line change
Expand Up @@ -48,14 +48,16 @@ import Papa from 'papaparse'
*/
const $readCSV = async (file: any, options?: CsvInputOptionsBrowser): Promise<DataFrame> => {
const frameConfig = options?.frameConfig || {}
const hasStringType = frameConfig.dtypes?.includes("string")

return new Promise((resolve, reject) => {
let hasError = false;

Papa.parse(file, {
header: true,
dynamicTyping: true,
dynamicTyping: !hasStringType,
skipEmptyLines: 'greedy',
delimiter: ",",
...options,
error: (error) => {
hasError = true;
Expand Down Expand Up @@ -108,12 +110,13 @@ const $streamCSV = async (file: string, callback: (df: DataFrame) => void, optio
return new Promise((resolve, reject) => {
let count = 0
let hasError = false;

const hasStringType = frameConfig.dtypes?.includes("string")
Papa.parse(file, {
...options,
dynamicTyping: true,
header: true,
download: true,
dynamicTyping: !hasStringType,
delimiter: ",",
...options,
step: results => {
if (hasError) return;
try {
Expand Down
7 changes: 5 additions & 2 deletions src/danfojs-base/io/node/io.csv.ts
Original file line number Diff line number Diff line change
Expand Up @@ -50,14 +50,16 @@ import fs from 'fs'
*/
const $readCSV = async (filePath: string, options?: CsvInputOptionsNode): Promise<DataFrame> => {
const frameConfig = options?.frameConfig || {}
const hasStringType = frameConfig.dtypes?.includes("string")

if (filePath.startsWith("http") || filePath.startsWith("https")) {
return new Promise((resolve, reject) => {
let hasError = false;
const optionsWithDefaults = {
header: true,
dynamicTyping: true,
dynamicTyping: !hasStringType,
skipEmptyLines: 'greedy',
delimiter: ",",
...options,
}

Expand Down Expand Up @@ -116,7 +118,8 @@ const $readCSV = async (filePath: string, options?: CsvInputOptionsNode): Promis

Papa.parse(fileStream, {
header: true,
dynamicTyping: true,
dynamicTyping: !hasStringType,
delimiter: ",",
...options,
error: (error) => {
hasError = true;
Expand Down
35 changes: 35 additions & 0 deletions src/danfojs-browser/tests/io/csv.reader.test.js
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,41 @@ describe("readCSV", function () {
assert.ok(error instanceof Error);
}
});

it("Preserves leading zeros when dtype is string", async function () {
  // In-memory CSV: one "codes" column holding two zero-padded values.
  const rawCsv = "codes\n012345\n001234";
  const upload = new File([ rawCsv ], "leading_zeros.csv", { type: "text/csv" });
  const readOptions = { frameConfig: { dtypes: [ "string" ] } };

  const frame = await dfd.readCSV(upload, readOptions);

  // With a string dtype requested, the cells must come back untouched.
  assert.deepEqual(frame.values, [ [ "012345" ], [ "001234" ] ]);
  assert.deepEqual(frame.dtypes, [ "string" ]);

  // The JSON export must carry the same string values.
  const asJson = dfd.toJSON(frame);
  assert.deepEqual(asJson, [ { codes: "012345" }, { codes: "001234" } ]);
});

it("Converts to numbers when dtype is not string", async function () {
  // Same zero-padded fixture as the string-dtype case, but read with defaults.
  const rawCsv = "codes\n012345\n001234";
  const upload = new File([ rawCsv ], "leading_zeros.csv", { type: "text/csv" });

  // No options passed: exercises the default behavior without a string dtype.
  const frame = await dfd.readCSV(upload);

  // The cells are expected to be parsed as numbers, dropping the leading zeros.
  assert.deepEqual(frame.values, [ [ 12345 ], [ 1234 ] ]);
  assert.deepEqual(frame.dtypes, [ "int32" ]);

  // The JSON export must carry the same numeric values.
  const asJson = dfd.toJSON(frame);
  assert.deepEqual(asJson, [ { codes: 12345 }, { codes: 1234 } ]);
});
});

// describe("streamCSV", function () {
Expand Down
55 changes: 54 additions & 1 deletion src/danfojs-node/test/io/csv.reader.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ import path from "path";
import chai, { assert, expect } from "chai";
import { describe, it } from "mocha";
import chaiAsPromised from "chai-as-promised";
import { DataFrame, readCSV, Series, streamCSV, toCSV } from "../../dist/danfojs-node/src";
import { DataFrame, readCSV, Series, streamCSV, toCSV, toJSON } from "../../dist/danfojs-node/src";
import fs from 'fs';
import process from 'process';

Expand Down Expand Up @@ -112,6 +112,59 @@ describe("readCSV", function () {
const filePath = path.join(testSamplesDir, "invalid.csv");
await expect(readCSV(filePath)).to.be.rejectedWith("ENOENT: no such file or directory");
});

it("Preserves leading zeros when dtype is string", async function () {
  const filePath = path.join(testSamplesDir, "leading_zeros.csv");
  // Create the temporary fixture: one "codes" column with zero-padded values.
  fs.writeFileSync(filePath, "codes\n012345\n001234");

  try {
    const df = await readCSV(filePath, {
      frameConfig: {
        dtypes: ["string"]
      }
    });

    // With a string dtype requested, the cells must come back untouched.
    assert.deepEqual(df.values, [["012345"], ["001234"]]);
    assert.deepEqual(df.dtypes, ["string"]);

    // Verify the values are actually strings in the JSON export as well.
    const jsonData = toJSON(df);
    assert.deepEqual(jsonData, [{ codes: "012345" }, { codes: "001234" }]);
  } finally {
    // Single cleanup point: runs whether the assertions pass or throw, so the
    // fixture is removed exactly once and the original error propagates.
    fs.unlinkSync(filePath);
  }
});

it("Converts to numbers when dtype is not string", async function () {
  const filePath = path.join(testSamplesDir, "leading_zeros.csv");
  // Create the temporary fixture: one "codes" column with zero-padded values.
  fs.writeFileSync(filePath, "codes\n012345\n001234");

  try {
    const df = await readCSV(filePath); // default behavior without string dtype

    // Values should be converted to numbers, dropping the leading zeros.
    assert.deepEqual(df.values, [[12345], [1234]]);
    assert.deepEqual(df.dtypes, ["int32"]);

    // Verify JSON output carries the same numeric values.
    const jsonData = toJSON(df);
    assert.deepEqual(jsonData, [{ codes: 12345 }, { codes: 1234 }]);
  } finally {
    // Single cleanup point: runs whether the assertions pass or throw, so the
    // fixture is removed exactly once and the original error propagates.
    fs.unlinkSync(filePath);
  }
});
});

describe("streamCSV", function () {
Expand Down