From a4201ed42fb19eb789dda5e5d6a57c4202d266f6 Mon Sep 17 00:00:00 2001 From: Ajay Kumar Date: Fri, 9 Feb 2024 19:50:29 +0530 Subject: [PATCH 1/9] First commit with setup and DVC files --- .dvc/.gitignore | 3 +++ .dvc/config | 5 +++++ .dvcignore | 3 +++ data/raw/.gitignore | 2 ++ data/raw/train.dvc | 6 ++++++ data/raw/val.dvc | 6 ++++++ 6 files changed, 25 insertions(+) create mode 100644 .dvc/.gitignore create mode 100644 .dvc/config create mode 100644 .dvcignore create mode 100644 data/raw/train.dvc create mode 100644 data/raw/val.dvc diff --git a/.dvc/.gitignore b/.dvc/.gitignore new file mode 100644 index 00000000..528f30c7 --- /dev/null +++ b/.dvc/.gitignore @@ -0,0 +1,3 @@ +/config.local +/tmp +/cache diff --git a/.dvc/config b/.dvc/config new file mode 100644 index 00000000..9ff72bdc --- /dev/null +++ b/.dvc/config @@ -0,0 +1,5 @@ +[core] + analytics = false + remote = remote_storage +['remote "remote_storage"'] + url = C:\dvcData diff --git a/.dvcignore b/.dvcignore new file mode 100644 index 00000000..51973055 --- /dev/null +++ b/.dvcignore @@ -0,0 +1,3 @@ +# Add patterns of files dvc should ignore, which could improve +# the performance. Learn more at +# https://dvc.org/doc/user-guide/dvcignore diff --git a/data/raw/.gitignore b/data/raw/.gitignore index e69de29b..a5d9d98f 100644 --- a/data/raw/.gitignore +++ b/data/raw/.gitignore @@ -0,0 +1,2 @@ +/train +/val diff --git a/data/raw/train.dvc b/data/raw/train.dvc new file mode 100644 index 00000000..77878d32 --- /dev/null +++ b/data/raw/train.dvc @@ -0,0 +1,6 @@ +outs: +- md5: 7adc7abb69056f4d7afb512c78f2fce9.dir + size: 75309082 + nfiles: 9470 + hash: md5 + path: train diff --git a/data/raw/val.dvc b/data/raw/val.dvc new file mode 100644 index 00000000..03717a08 --- /dev/null +++ b/data/raw/val.dvc @@ -0,0 +1,6 @@ +outs: +- md5: 0ad4dcf197b452735726bf8d8777201d.dir + size: 31248080 + nfiles: 3925 + hash: md5 + path: val From 9da488b90092176a245a96805529051177c44e60 Mon Sep 17 00:00:00 2001 From: Ajay Kumar Date: Sat, 10 Feb 2024 17:27:41 +0530 Subject: [PATCH 2/9] Updated csv files --- data/prepared/.gitignore | 2 ++ data/prepared/test.csv.dvc | 5 +++++ data/prepared/train.csv.dvc | 5 +++++ 3 files changed, 12 insertions(+) create mode 100644 data/prepared/test.csv.dvc create mode 100644 data/prepared/train.csv.dvc diff --git a/data/prepared/.gitignore b/data/prepared/.gitignore index e69de29b..22a65dd9 100644 --- a/data/prepared/.gitignore +++ b/data/prepared/.gitignore @@ -0,0 +1,2 @@ +/train.csv +/test.csv diff --git a/data/prepared/test.csv.dvc b/data/prepared/test.csv.dvc new file mode 100644 index 00000000..8c87ee84 --- /dev/null +++ b/data/prepared/test.csv.dvc @@ -0,0 +1,5 @@ +outs: +- md5: 7ae101f311cd0d5073ad3c40c2616345 + size: 76226 + hash: md5 + path: test.csv diff --git a/data/prepared/train.csv.dvc b/data/prepared/train.csv.dvc new file mode 100644 index 00000000..2e1e1be9 --- /dev/null +++ b/data/prepared/train.csv.dvc @@ -0,0 +1,5 @@ +outs: +- md5: d5e8793c48b0502964966ae2fd3ffdc5 + size: 189527 + hash: md5 + path: train.csv From 57609f13db1888ce22fd2cba765416e76e6193e1 Mon Sep 17 00:00:00 2001 From: Ajay Kumar Date: Sat, 10 Feb 2024 17:32:08 +0530 Subject: [PATCH 3/9] Trained an SGD classifier --- model/.gitignore | 1 + model/model.joblib.dvc | 5 +++++ 2 files changed, 6 insertions(+) create mode 100644 model/model.joblib.dvc diff --git a/model/.gitignore b/model/.gitignore index e69de29b..565a9d50 100644 --- a/model/.gitignore +++ b/model/.gitignore @@ -0,0 +1 @@ +/model.joblib diff --git a/model/model.joblib.dvc b/model/model.joblib.dvc new file mode 100644 index 00000000..e3935789 --- /dev/null +++ b/model/model.joblib.dvc @@ -0,0 +1,5 @@ +outs: +- md5: eb42cc13808e1f4dba42403cd9a8b4f0 + size: 241149 + hash: md5 + path: model.joblib From 82033cb74b4c0907f06dc1c1a23231edddab2c90 Mon Sep 17 00:00:00 2001 From: Ajay Kumar Date: Sat, 10 Feb 2024 17:33:58 +0530 Subject: [PATCH 4/9] Evaluate the SGD model accuracy --- metrics/accuracy.json | 1 + 1 file changed, 1 insertion(+) create mode 100644 metrics/accuracy.json diff --git a/metrics/accuracy.json b/metrics/accuracy.json new file mode 100644 index 00000000..e8e46ac2 --- /dev/null +++ b/metrics/accuracy.json @@ -0,0 +1 @@ +{"accuracy": 0.7021546261089987} \ No newline at end of file From e9b11d965def78670dfef8cd9cc6fdfd62393409 Mon Sep 17 00:00:00 2001 From: Ajay Kumar Date: Sat, 10 Feb 2024 19:46:06 +0530 Subject: [PATCH 5/9] modified file --- src/train.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/train.py b/src/train.py index e5feeda6..5d3c7c2c 100644 --- a/src/train.py +++ b/src/train.py @@ -37,7 +37,7 @@ def load_data(data_path): def main(repo_path): train_csv_path = repo_path / "data/prepared/train.csv" train_data, labels = load_data(train_csv_path) - sgd = SGDClassifier(max_iter=10) + sgd = SGDClassifier(max_iter=100) trained_model = sgd.fit(train_data, labels) dump(trained_model, repo_path / "model/model.joblib") From 19d935fe5616053893b3bb244b675e6bdba82f90 Mon Sep 17 00:00:00 2001 From: Ajayk Date: Mon, 18 Mar 2024 10:15:51 +0000 Subject: [PATCH 6/9] First commit with setup and DVC files --- .dvc/config | 2 +- data/prepared/test.csv.dvc | 5 ----- data/prepared/train.csv.dvc | 5 ----- data/raw/train.dvc | 1 - data/raw/val.dvc | 1 - model/model.joblib.dvc | 5 ----- 6 files changed, 1 insertion(+), 18 deletions(-) delete mode 100644 data/prepared/test.csv.dvc delete mode 100644 data/prepared/train.csv.dvc delete mode 100644 model/model.joblib.dvc diff --git a/.dvc/config b/.dvc/config index 9ff72bdc..184e6da5 100644 --- a/.dvc/config +++ b/.dvc/config @@ -2,4 +2,4 @@ analytics = false remote = remote_storage ['remote "remote_storage"'] - url = C:\dvcData + url = /home/ajkumar/hackathonDVC/dvcDataStore diff --git a/data/prepared/test.csv.dvc b/data/prepared/test.csv.dvc deleted file mode 100644 index 8c87ee84..00000000 --- a/data/prepared/test.csv.dvc +++ /dev/null @@ -1,5 +0,0 @@ -outs: -- md5: 7ae101f311cd0d5073ad3c40c2616345 - size: 76226 - hash: md5 - path: test.csv diff --git a/data/prepared/train.csv.dvc b/data/prepared/train.csv.dvc deleted file mode 100644 index 2e1e1be9..00000000 --- a/data/prepared/train.csv.dvc +++ /dev/null @@ -1,5 +0,0 @@ -outs: -- md5: d5e8793c48b0502964966ae2fd3ffdc5 - size: 189527 - hash: md5 - path: train.csv diff --git a/data/raw/train.dvc b/data/raw/train.dvc index 77878d32..14162364 100644 --- a/data/raw/train.dvc +++ b/data/raw/train.dvc @@ -2,5 +2,4 @@ outs: - md5: 7adc7abb69056f4d7afb512c78f2fce9.dir size: 75309082 nfiles: 9470 - hash: md5 path: train diff --git a/data/raw/val.dvc b/data/raw/val.dvc index 03717a08..237148f4 100644 --- a/data/raw/val.dvc +++ b/data/raw/val.dvc @@ -2,5 +2,4 @@ outs: - md5: 0ad4dcf197b452735726bf8d8777201d.dir size: 31248080 nfiles: 3925 - hash: md5 path: val diff --git a/model/model.joblib.dvc b/model/model.joblib.dvc deleted file mode 100644 index e3935789..00000000 --- a/model/model.joblib.dvc +++ /dev/null @@ -1,5 +0,0 @@ -outs: -- md5: eb42cc13808e1f4dba42403cd9a8b4f0 - size: 241149 - hash: md5 - path: model.joblib From 509f38f6ad1fdd56acb2ba727fd83abf65f468ad Mon Sep 17 00:00:00 2001 From: Ajayk Date: Mon, 18 Mar 2024 10:52:05 +0000 Subject: [PATCH 7/9] Created train and test CSV files --- data/prepared/test.csv.dvc | 4 ++++ data/prepared/train.csv.dvc | 4 ++++ 2 files changed, 8 insertions(+) create mode 100644 data/prepared/test.csv.dvc create mode 100644 data/prepared/train.csv.dvc diff --git a/data/prepared/test.csv.dvc b/data/prepared/test.csv.dvc new file mode 100644 index 00000000..d9b3d28d --- /dev/null +++ b/data/prepared/test.csv.dvc @@ -0,0 +1,4 @@ +outs: +- md5: 5151fcd60de0b43c2d18fde128ee5e09 + size: 83326 + path: test.csv diff --git a/data/prepared/train.csv.dvc b/data/prepared/train.csv.dvc new file mode 100644 index 00000000..53c085ca --- /dev/null +++ b/data/prepared/train.csv.dvc @@ -0,0 +1,4 @@ +outs: +- md5: 0004b6e0e6d489f6902d6d6db47c24fe + size: 206725 + path: train.csv From b60b222d90246e224cc020d033694b8661d76263 Mon Sep 17 00:00:00 2001 From: Ajayk Date: Mon, 18 Mar 2024 10:53:25 +0000 Subject: [PATCH 8/9] Trained an SGD classifier --- model/model.joblib.dvc | 4 ++++ 1 file changed, 4 insertions(+) create mode 100644 model/model.joblib.dvc diff --git a/model/model.joblib.dvc b/model/model.joblib.dvc new file mode 100644 index 00000000..fd3b52d6 --- /dev/null +++ b/model/model.joblib.dvc @@ -0,0 +1,4 @@ +outs: +- md5: 13c7384218e443fdf941f153ce53d134 + size: 241222 + path: model.joblib From 2333ba6926ad4b79f7043870cb3aa198b6f5fcb7 Mon Sep 17 00:00:00 2001 From: Ajayk Date: Mon, 18 Mar 2024 10:54:34 +0000 Subject: [PATCH 9/9] Evaluate the SGD model accuracy --- metrics/accuracy.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/metrics/accuracy.json b/metrics/accuracy.json index e8e46ac2..959bf973 100644 --- a/metrics/accuracy.json +++ b/metrics/accuracy.json @@ -1 +1 @@ -{"accuracy": 0.7021546261089987} \ No newline at end of file +{"accuracy": 0.7351077313054499} \ No newline at end of file