realpython
diff --git a/‎README.md
Lines changed: 15 additions & 1 deletion b/‎README.md
Lines changed: 15 additions & 1 deletion
diff --git a/‎data/prepared/.DS_Store
6 KB b/‎data/prepared/.DS_Store
6 KB
diff --git a/‎data/prepared/.gitignore b/‎data/prepared/.gitignore
diff --git a/‎data/raw/.gitignore b/‎data/raw/.gitignore
diff --git a/‎metrics/.gitignore b/‎metrics/.gitignore
diff --git a/‎model/.gitignore b/‎model/.gitignore
diff --git a/‎src/evaluate.py
Lines changed: 23 additions & 0 deletions b/‎src/evaluate.py
Lines changed: 23 additions & 0 deletions
diff --git a/‎src/prepare.py
Lines changed: 40 additions & 0 deletions b/‎src/prepare.py
Lines changed: 40 additions & 0 deletions
diff --git a/‎src/train.py
Lines changed: 47 additions & 0 deletions b/‎src/train.py
Lines changed: 47 additions & 0 deletions
@@ -1 +1,15 @@
-# data-version-control
+# Data Version Control Tutorial
+
+Example repository for the [Data Version Control With Python and DVC](https://realpython.com/python-data-version-control/) article on [Real Python](https://realpython.com/).
+
+To use this repo as part of the tutorial, you first need to get your own copy by clicking the _Fork_ button in the top right corner of the screen. Select your personal GitHub account, and GitHub will create your own copy.
+
+Then clone the fork you just created to your computer with the `git clone` command:
+
+```console
+git clone git@github.com:YourUsername/data-version-control.git
+```
+
+Make sure to replace `YourUsername` in the above command with your actual GitHub username.
+
+Happy coding!
@@ -0,0 +1,23 @@
+from joblib import load
+import json
+from pathlib import Path
+
+from sklearn.metrics import accuracy_score
+
+from train import load_data
+
+
+def main(repo_path):
+    """Score the saved model on the prepared test set and record the result.
+
+    Reads ``data/prepared/test.csv`` and ``model/model.joblib`` beneath
+    *repo_path*, then writes ``{"accuracy": <score>}`` as JSON to
+    ``metrics/accuracy.json``.
+    """
+    features, expected = load_data(repo_path / "data/prepared/test.csv")
+    classifier = load(repo_path / "model/model.joblib")
+    score = accuracy_score(expected, classifier.predict(features))
+    report_file = repo_path / "metrics/accuracy.json"
+    report_file.write_text(json.dumps({"accuracy": score}))
+
+
+if __name__ == "__main__":
+    # Make the script path absolute before walking up to the repository root:
+    # before Python 3.9 ``__file__`` can be relative (e.g. "evaluate.py" when
+    # launched from inside src/), and the parents of a bare relative filename
+    # stop at "." instead of reaching the directory above src/.
+    repo_path = Path(__file__).absolute().parent.parent
+    main(repo_path)
@@ -0,0 +1,40 @@
+from pathlib import Path
+
+import pandas as pd
+
+# Dataset folder name -> class label. Images whose parent folder is not listed
+# here are left out of the prepared CSV files.
+FOLDERS_TO_LABELS = {
+    "n03445777": "golf ball",
+    "n03888257": "parachute",
+}
+
+
+def get_files_and_labels(source_path):
+    """Collect labelled image paths found under *source_path*.
+
+    Searches recursively for ``*.JPEG`` files located inside a sub-folder and
+    keeps those whose immediate parent folder name is a key of
+    ``FOLDERS_TO_LABELS``.
+
+    Args:
+        source_path: ``pathlib.Path`` of the directory tree to scan.
+
+    Returns:
+        A ``(images, labels)`` tuple of equally long lists: the absolute path
+        of every kept image and, at the same index, its label.
+    """
+    images = []
+    labels = []
+    # Sort the matches: rglob yields entries in directory-listing order, which
+    # varies between filesystems, so an unsorted scan would produce CSV files
+    # (and training order) that differ from one machine to the next.
+    for image_path in sorted(source_path.rglob("*/*.JPEG")):
+        folder = image_path.parent.name
+        if folder not in FOLDERS_TO_LABELS:
+            continue
+        images.append(image_path.absolute())
+        labels.append(FOLDERS_TO_LABELS[folder])
+    return images, labels
+
+
+def save_as_csv(filenames, labels, destination):
+    """Write *filenames* and *labels* to *destination* as a two-column CSV.
+
+    The columns are named ``filename`` and ``label``; pandas' default row
+    index is written as well.
+    """
+    table = pd.DataFrame({"filename": filenames, "label": labels})
+    table.to_csv(destination)
+
+
+def main(repo_path):
+    """Build the train and test CSV listings under ``data/prepared``.
+
+    Images are looked up in ``data/raw/train`` and ``data/raw/val`` beneath
+    *repo_path*; the listings are saved as ``train.csv`` and ``test.csv``.
+    """
+    data_root = repo_path / "data"
+    output_dir = data_root / "prepared"
+    # Scan both splits before writing anything, then write train before test.
+    listings = [
+        ("train.csv", get_files_and_labels(data_root / "raw/train")),
+        ("test.csv", get_files_and_labels(data_root / "raw/val")),
+    ]
+    for csv_name, (files, labels) in listings:
+        save_as_csv(files, labels, output_dir / csv_name)
+
+
+if __name__ == "__main__":
+    # Make the script path absolute before walking up to the repository root:
+    # before Python 3.9 ``__file__`` can be relative (e.g. "prepare.py" when
+    # launched from inside src/), and the parents of a bare relative filename
+    # stop at "." instead of reaching the directory above src/.
+    repo_path = Path(__file__).absolute().parent.parent
+    main(repo_path)
@@ -0,0 +1,47 @@
+from joblib import dump
+from pathlib import Path
+
+import numpy as np
+import pandas as pd
+from skimage.io import imread_collection
+from skimage.transform import resize
+from sklearn.linear_model import SGDClassifier
+
+
+def load_images(data_frame, column_name):
+    """Return an image collection for the file paths in *column_name*.
+
+    The column of *data_frame* is converted to a list and handed to
+    ``imread_collection``.
+    """
+    return imread_collection(data_frame[column_name].to_list())
+
+
+def load_labels(data_frame, column_name):
+    """Return the values of *column_name* in *data_frame* as a plain list."""
+    return data_frame[column_name].to_list()
+
+
+def preprocess(image, output_shape=(100, 100, 3)):
+    """Resize *image* and flatten it into a single feature row.
+
+    Args:
+        image: Image array accepted by ``skimage.transform.resize``.
+        output_shape: Shape the image is resized to before flattening.
+            Defaults to ``(100, 100, 3)``, which yields 30000 features.
+
+    Returns:
+        A 2-D array with one row whose length is the product of
+        *output_shape*.
+    """
+    resized = resize(image, output_shape)
+    # -1 lets NumPy derive the row length from output_shape, so the feature
+    # count cannot drift out of sync with the resize dimensions.
+    return resized.reshape((1, -1))
+
+
+def load_data(data_path):
+    """Load the features and labels listed in the CSV at *data_path*.
+
+    The CSV must provide ``label`` and ``filename`` columns. Every listed
+    image is read and passed through ``preprocess``; the resulting rows are
+    stacked along axis 0.
+
+    Returns:
+        A ``(data, labels)`` tuple: the stacked feature array and the list of
+        labels.
+    """
+    frame = pd.read_csv(data_path)
+    labels = load_labels(frame, "label")
+    feature_rows = [preprocess(picture) for picture in load_images(frame, "filename")]
+    return np.concatenate(feature_rows, axis=0), labels
+
+
+def main(repo_path):
+    """Train an SGD classifier on the prepared training set and save it.
+
+    Reads ``data/prepared/train.csv`` beneath *repo_path* and dumps the
+    fitted model to ``model/model.joblib``.
+    """
+    features, targets = load_data(repo_path / "data/prepared/train.csv")
+    classifier = SGDClassifier(max_iter=10)
+    # fit() returns the estimator itself, so the fitted object is `classifier`.
+    classifier.fit(features, targets)
+    dump(classifier, repo_path / "model/model.joblib")
+
+
+if __name__ == "__main__":
+    # Make the script path absolute before walking up to the repository root:
+    # before Python 3.9 ``__file__`` can be relative (e.g. "train.py" when
+    # launched from inside src/), and the parents of a bare relative filename
+    # stop at "." instead of reaching the directory above src/.
+    repo_path = Path(__file__).absolute().parent.parent
+    main(repo_path)