Skip to content

Commit 067875e

Browse files
committed
Add repository template
1 parent 75c6d1c commit 067875e

File tree

9 files changed

+125
-1
lines changed

9 files changed

+125
-1
lines changed

README.md

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1,15 @@
1-
# data-version-control
1+
# Data Version Control Tutorial
2+
3+
Example repository for the [Data Version Control With Python and DVC](https://realpython.com/python-data-version-control/) article on [Real Python](https://realpython.com/).
4+
5+
To use this repo as part of the tutorial, you first need to get your own copy by clicking the _Fork_ button in the top right corner of the screen. Select your personal GitHub account, and GitHub will create your own copy.
6+
7+
Then clone the fork you just created to your computer with the `git clone` command:
8+
9+
```console
10+
git clone git@github.com:YourUsername/data-version-control.git
11+
```
12+
13+
Make sure to replace `YourUsername` in the above command with your actual GitHub username.
14+
15+
Happy coding!

data/prepared/.DS_Store

6 KB
Binary file not shown.

data/prepared/.gitignore

Whitespace-only changes.

data/raw/.gitignore

Whitespace-only changes.

metrics/.gitignore

Whitespace-only changes.

model/.gitignore

Whitespace-only changes.

src/evaluate.py

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
from joblib import load
2+
import json
3+
from pathlib import Path
4+
5+
from sklearn.metrics import accuracy_score
6+
7+
from train import load_data
8+
9+
10+
def main(repo_path):
    """Score the saved model on the prepared test set and record the result.

    Reads ``data/prepared/test.csv`` and ``model/model.joblib`` below
    ``repo_path``, computes the accuracy of the model's predictions against
    the test labels, and writes ``{"accuracy": <value>}`` as JSON to
    ``metrics/accuracy.json``.

    Args:
        repo_path: Path object pointing at the repository root.
    """
    features, expected = load_data(repo_path / "data/prepared/test.csv")
    classifier = load(repo_path / "model/model.joblib")
    score = accuracy_score(expected, classifier.predict(features))
    report = json.dumps({"accuracy": score})
    (repo_path / "metrics/accuracy.json").write_text(report)
19+
20+
21+
if __name__ == "__main__":
    # The repository root is taken to be two directory levels above this file.
    script_dir = Path(__file__).parent
    repo_path = script_dir.parent
    main(repo_path)

src/prepare.py

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
from pathlib import Path
2+
3+
import pandas as pd
4+
5+
# Maps the name of an image's parent folder to the label recorded for it.
FOLDERS_TO_LABELS = {"n03445777": "golf ball", "n03888257": "parachute"}


def get_files_and_labels(source_path):
    """Collect labelled ``.JPEG`` files found in subfolders of ``source_path``.

    Only images whose immediate parent folder name is a key of
    ``FOLDERS_TO_LABELS`` are kept; everything else is skipped.

    Args:
        source_path: Path object for the directory to search recursively.

    Returns:
        A ``(images, labels)`` tuple of equal-length lists. ``images`` holds
        the absolute path of each kept file and ``labels`` the matching label
        string. Entries are ordered by path.
    """
    images = []
    labels = []
    # Directory enumeration order depends on the filesystem; sorting keeps the
    # output (and any CSV written from it) identical from run to run.
    for image_path in sorted(source_path.rglob("*/*.JPEG")):
        label = FOLDERS_TO_LABELS.get(image_path.parent.name)
        if label is None:
            continue
        images.append(image_path.absolute())
        labels.append(label)
    return images, labels
19+
20+
21+
def save_as_csv(filenames, labels, destination):
    """Write filenames and labels to a CSV file at ``destination``.

    The file has a ``filename`` column and a ``label`` column, preceded by
    the data frame's index column, which pandas writes by default.

    Args:
        filenames: Sequence of file paths, one per row.
        labels: Sequence of labels, aligned with ``filenames``.
        destination: Path (or path string) of the CSV file to create.
    """
    table = pd.DataFrame({"filename": filenames, "label": labels})
    table.to_csv(destination)
25+
26+
27+
def main(repo_path):
    """Build the train and test CSV listings from the raw image folders.

    Scans ``data/raw/train`` and ``data/raw/val`` below ``repo_path`` and
    writes the results to ``data/prepared/train.csv`` and
    ``data/prepared/test.csv`` respectively.

    Args:
        repo_path: Path object pointing at the repository root.
    """
    data_path = repo_path / "data"
    prepared = data_path / "prepared"
    # Both folders are scanned before anything is written.
    train_split = get_files_and_labels(data_path / "raw/train")
    test_split = get_files_and_labels(data_path / "raw/val")
    save_as_csv(*train_split, prepared / "train.csv")
    save_as_csv(*test_split, prepared / "test.csv")
36+
37+
38+
if __name__ == "__main__":
    # The repository root is taken to be two directory levels above this file.
    script_dir = Path(__file__).parent
    repo_path = script_dir.parent
    main(repo_path)

src/train.py

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
from joblib import dump
2+
from pathlib import Path
3+
4+
import numpy as np
5+
import pandas as pd
6+
from skimage.io import imread_collection
7+
from skimage.transform import resize
8+
from sklearn.linear_model import SGDClassifier
9+
10+
11+
def load_images(data_frame, column_name):
    """Load the images whose paths are listed in one column of a data frame.

    Args:
        data_frame: pandas DataFrame containing the paths.
        column_name: Name of the column that holds the image file paths.

    Returns:
        The collection returned by ``imread_collection`` for those paths.
    """
    paths = data_frame[column_name].to_list()
    return imread_collection(paths)
15+
16+
17+
def load_labels(data_frame, column_name):
    """Return the values of one data frame column as a plain list.

    Args:
        data_frame: pandas DataFrame containing the labels.
        column_name: Name of the column to extract.

    Returns:
        A list with one entry per row of ``data_frame``.
    """
    return data_frame[column_name].tolist()
20+
21+
22+
def preprocess(image):
    """Resize an image to 100x100x3 and flatten it into a single row.

    Args:
        image: Image array accepted by ``skimage.transform.resize``.

    Returns:
        An array of shape ``(1, 30000)`` holding the resized pixel values.
    """
    # 100 * 100 * 3 == 30000, so each image becomes one feature row.
    return resize(image, (100, 100, 3)).reshape((1, 30000))
26+
27+
28+
def load_data(data_path):
    """Load the images and labels listed in a prepared CSV file.

    The CSV must have a ``filename`` column with image paths and a ``label``
    column. Each image is passed through ``preprocess`` and the resulting
    rows are joined along the first axis.

    Args:
        data_path: Path of the CSV file to read.

    Returns:
        A ``(data, labels)`` tuple: ``data`` is the array of preprocessed
        image rows and ``labels`` is the list of label values.
    """
    frame = pd.read_csv(data_path)
    labels = load_labels(data_frame=frame, column_name="label")
    images = load_images(data_frame=frame, column_name="filename")
    rows = list(map(preprocess, images))
    return np.concatenate(rows, axis=0), labels
35+
36+
37+
def main(repo_path):
    """Train an SGD classifier on the prepared training set and save it.

    Reads ``data/prepared/train.csv`` below ``repo_path``, fits an
    ``SGDClassifier`` limited to 10 iterations, and stores the fitted model
    at ``model/model.joblib``.

    Args:
        repo_path: Path object pointing at the repository root.
    """
    features, targets = load_data(repo_path / "data/prepared/train.csv")
    classifier = SGDClassifier(max_iter=10)
    # fit() returns the estimator itself, so the fitted object is dumped.
    classifier.fit(features, targets)
    dump(classifier, repo_path / "model/model.joblib")
43+
44+
45+
if __name__ == "__main__":
    # The repository root is taken to be two directory levels above this file.
    script_dir = Path(__file__).parent
    repo_path = script_dir.parent
    main(repo_path)

0 commit comments

Comments
 (0)