diff --git a/.devcontainer.json b/.devcontainer.json index acfcc56f..14c968a3 100644 --- a/.devcontainer.json +++ b/.devcontainer.json @@ -1,6 +1,7 @@ { "name": "example-repos-dev", "image": "mcr.microsoft.com/devcontainers/python:3.10", + "runArgs": ["--ipc=host"], "extensions": ["Iterative.dvc", "ms-python.python", "redhat.vscode-yaml"], "features": { "ghcr.io/devcontainers/features/nvidia-cuda:1": { diff --git a/example-get-started-experiments/code/README.md b/example-get-started-experiments/code/README.md index d5b51b64..1575308f 100644 --- a/example-get-started-experiments/code/README.md +++ b/example-get-started-experiments/code/README.md @@ -1,7 +1,5 @@ -[![DVC](https://img.shields.io/badge/-Open_in_Studio-grey.svg?style=flat-square&logo=dvc)](https://studio.iterative.ai/team/Iterative/projects/example-get-started-experiments-y8toqd433r) -[![DVC-metrics](https://img.shields.io/badge/dynamic/json?style=flat-square&colorA=grey&colorB=F46737&label=Dice%20Metric&url=https://github.com/iterative/example-get-started-experiments/raw/main/results/evaluate/metrics.json&query=dice_multi)](https://github.com/iterative/example-get-started-experiments/raw/main/results/evaluate/metrics.json) - -[Train Report](./results/train/report.md) - [Evaluation Report](./results/evaluate/report.md) +[![DVC Studio](https://img.shields.io/badge/-Open_in_Studio-grey.svg?style=flat-square&logo=dvc)](https://studio.iterative.ai/team/Iterative/projects/example-get-started-experiments-y8toqd433r) +[![DVC-metrics](https://img.shields.io/badge/dynamic/json?style=flat-square&colorA=grey&colorB=F46737&label=Dice%20Metric&url=https://github.com/iterative/example-get-started-experiments/raw/main/dvclive/metrics.json&query=metrics/mAP50(M))](https://github.com/iterative/example-get-started-experiments/raw/main/dvclive/metrics.json) # DVC Get Started: Experiments @@ -11,8 +9,6 @@ This is an auto-generated repository for use in [DVC](https://dvc.org) This is a Computer Vision (CV) project that solves the problem of segmenting out swimming pools from satellite images. -[Example results](./results/evaluate/plots/images/) - We use a slightly modified version of the [BH-Pools dataset](http://patreo.dcc.ufmg.br/2020/07/29/bh-pools-watertanks-datasets/): we split the original 4k images into tiles of 1024x1024 pixels. @@ -58,7 +54,7 @@ $ dvc pull ## Running in your environment Run [`dvc exp run`](https://man.dvc.org/exp/run) to reproduce the -[pipeline](https://dvc.org/doc/user-guide/pipelines/defining-pipelinese): +[pipeline](https://dvc.org/doc/user-guide/pipelines/defining-pipelines): ```console $ dvc exp run @@ -107,47 +103,3 @@ This tag also contains a GitHub Actions workflow that reruns the pipeline if any changes are introduced to the pipeline-related files. [CML](https://cml.dev/) is used in this workflow to provision a cloud-based GPU machine as well as report model performance results in Pull Requests. - -## Deploying the model - -Check out the [PR](https://github.com/iterative/example-get-started-experiments/pulls) -that adds this model to -[Iterative Studio Model Registry](https://dvc.org/doc/studio/user-guide/model-registry/what-is-a-model-registry). -You can [trigger CI/CD](https://dvc.org/doc/studio/user-guide/model-registry/use-models#deploying-and-publishing-models-in-cicd) -by [registering versions](https://dvc.org/doc/studio/user-guide/model-registry/register-version) -and [assigning stages](https://dvc.org/doc/studio/user-guide/model-registry/assign-stage) -in Model Registry, building and publishing Docker images with the model, -or deploying the model to the cloud. - -## Project structure - -The data files, DVC files, and results change as stages are created one by one. -After cloning and using [`dvc pull`](https://man.dvc.org/pull) to download -data, models, and plots tracked by DVC, the workspace should look like this: - -```console -$ tree -L 2 -. -├── LICENSE -├── README.md -├── data. # <-- Directory with raw and intermediate data -│ ├── pool_data # <-- Raw image data -│ ├── pool_data.dvc # <-- .dvc file - a placeholder/pointer to raw data -│ ├── test_data # <-- Processed test data -│ └── train_data # <-- Processed train data -├── dvc.lock -├── dvc.yaml # <-- DVC pipeline file -├── models -│ └── model.pkl # <-- Trained model file -├── notebooks -│ └── TrainSegModel.ipynb # <-- Initial notebook (refactored into `dvc.yaml`) -├── params.yaml # <-- Parameters file -├── requirements.txt # <-- Python dependencies needed in the project -├── results # <-- DVCLive reports and plots -│ ├── evaluate -│ └── train -└── src # <-- Source code to run the pipeline stages - ├── data_split.py - ├── evaluate.py - └── train.py -``` diff --git a/example-get-started-experiments/code/TrainSegModel.ipynb b/example-get-started-experiments/code/TrainSegModel.ipynb new file mode 100644 index 00000000..cebc38b3 --- /dev/null +++ b/example-get-started-experiments/code/TrainSegModel.ipynb @@ -0,0 +1,178 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import shutil\n", + "from pathlib import Path\n", + "\n", + "import cv2\n", + "from ultralytics import YOLO\n", + "\n", + "DATA = Path(\"datasets\")" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Load data and split it into train/test\n", + "\n", + "We have some [data in DVC](https://dvc.org/doc/start/data-management/data-versioning) that we can pull. \n", + "\n", + "This data includes:\n", + "* satellite images\n", + "* masks of the swimming pools in each satellite image\n", + "\n", + "DVC can help connect your data to your repo, but it isn't necessary to have your data in DVC to start tracking experiments with DVC and DVCLive." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!dvc pull" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Convert to YOLO Dataset format\n", + "\n", + "https://docs.ultralytics.com/datasets/segment/" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def mask_to_yolo_annotation(mask):\n", + " contours, _ = cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)\n", + " annotation = \"\"\n", + " for contour in contours:\n", + " single_annotation = \"0\"\n", + " for row, col in contour.squeeze():\n", + " single_annotation += f\" {round(col / mask.shape[1], 3)} {round(row / mask.shape[0], 3)}\"\n", + " annotation += f\"{single_annotation}\\n\"\n", + " return annotation" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "test_regions = [\"REGION_1-\"]\n", + "\n", + "train_data_dir = DATA / \"yolo_dataset\" / \"train\"\n", + "train_data_dir.mkdir(exist_ok=True, parents=True)\n", + "test_data_dir = DATA / \"yolo_dataset\" / \"val\"\n", + "test_data_dir.mkdir(exist_ok=True, parents=True)\n", + "\n", + "for img_path in DATA.glob(\"pool_data/images/*.jpg\"):\n", + " yolo_annotation = mask_to_yolo_annotation(\n", + " cv2.imread(\n", + " str(DATA / \"pool_data\" / \"masks\" / f\"{img_path.stem}.png\"),\n", + " cv2.IMREAD_GRAYSCALE\n", + " )\n", + " )\n", + "\n", + " if any(region in str(img_path) for region in test_regions):\n", + " dst = test_data_dir / img_path.name\n", + " else:\n", + " dst = train_data_dir / img_path.name\n", + "\n", + " shutil.copy(img_path, dst)\n", + " dst.with_suffix(\".txt\").write_text(yolo_annotation)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "yolo_dataset_yaml = DATA / \"yolo_dataset.yaml\"\n", + "yolo_dataset_yaml.write_text(\n", + " \"\"\"\n", + "path: ./yolo_dataset\n", + "train: train\n", + "val: val\n", + "\n", + "names:\n", + " 0: pool\n", + " \"\"\"\n", + ")" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Train model\n", + "Set up model training, using DVCLive to capture the results of each experiment." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "imgsz = 512\n", + "epochs = 20\n", + "model = \"yolov8n-seg.pt\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": false + }, + "outputs": [], + "source": [ + "yolo = YOLO(model)\n", + "\n", + "yolo.train(data=yolo_dataset_yaml, epochs=epochs, imgsz=imgsz)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.6" + }, + "vscode": { + "interpreter": { + "hash": "949777d72b0d2535278d3dc13498b2535136f6dfe0678499012e853ee9abcab1" + } + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/example-get-started-experiments/code/data/.gitignore b/example-get-started-experiments/code/data/.gitignore deleted file mode 100644 index 6830f1fe..00000000 --- a/example-get-started-experiments/code/data/.gitignore +++ /dev/null @@ -1,3 +0,0 @@ -/pool_data -/test_data -/train_data diff --git a/example-get-started-experiments/code/datasets/.gitignore b/example-get-started-experiments/code/datasets/.gitignore new file mode 100644 index 00000000..d2268e0e --- /dev/null +++ b/example-get-started-experiments/code/datasets/.gitignore @@ -0,0 +1,2 @@ +/pool_data +/yolo_dataset diff --git a/example-get-started-experiments/code/data/pool_data.dvc b/example-get-started-experiments/code/datasets/pool_data.dvc similarity index 100% rename from example-get-started-experiments/code/data/pool_data.dvc rename to example-get-started-experiments/code/datasets/pool_data.dvc diff --git a/example-get-started-experiments/code/notebooks/TrainSegModel.ipynb b/example-get-started-experiments/code/notebooks/TrainSegModel.ipynb deleted file mode 100644 index 6ebc1a0d..00000000 --- a/example-get-started-experiments/code/notebooks/TrainSegModel.ipynb +++ /dev/null @@ -1,311 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from pathlib import Path\n", - "\n", - "ROOT = Path(\"../\")\n", - "DATA = ROOT / \"data\"" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import warnings\n", - "warnings.filterwarnings(\"ignore\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import shutil\n", - "from functools import partial\n", - "\n", - "import numpy as np\n", - "import torch\n", - "from box import ConfigBox\n", - "from dvclive import Live\n", - "from dvclive.fastai import DVCLiveCallback\n", - "from fastai.data.all import Normalize, get_files\n", - "from fastai.metrics import DiceMulti\n", - "from fastai.vision.all import (Resize, SegmentationDataLoaders,\n", - " imagenet_stats, models, unet_learner)\n", - "from ruamel.yaml import YAML\n", - "from PIL import Image" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Load data and split it into train/test\n", - "\n", - "We have some [data in DVC](https://dvc.org/doc/start/data-management/data-versioning) that we can pull. \n", - "\n", - "This data includes:\n", - "* satellite images\n", - "* masks of the swimming pools in each satellite image\n", - "\n", - "DVC can help connect your data to your repo, but it isn't necessary to have your data in DVC to start tracking experiments with DVC and DVCLive." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "!dvc pull" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "test_regions = [\"REGION_1-\"]\n", - "\n", - "img_fpaths = get_files(DATA / \"pool_data\" / \"images\", extensions=\".jpg\")\n", - "\n", - "train_data_dir = DATA / \"train_data\"\n", - "train_data_dir.mkdir(exist_ok=True)\n", - "test_data_dir = DATA / \"test_data\"\n", - "test_data_dir.mkdir(exist_ok=True)\n", - "for img_path in img_fpaths:\n", - " msk_path = DATA / \"pool_data\" / \"masks\" / f\"{img_path.stem}.png\"\n", - " if any(region in str(img_path) for region in test_regions):\n", - " shutil.copy(img_path, test_data_dir)\n", - " shutil.copy(msk_path, test_data_dir)\n", - " else:\n", - " shutil.copy(img_path, train_data_dir)\n", - " shutil.copy(msk_path, train_data_dir)" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Create a data loader\n", - "\n", - "Load and prepare the images and masks by creating a data loader." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "def get_mask_path(x, train_data_dir):\n", - " return Path(train_data_dir) / f\"{Path(x).stem}.png\"" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "bs = 8\n", - "valid_pct = 0.20\n", - "img_size = 256\n", - "\n", - "data_loader = SegmentationDataLoaders.from_label_func(\n", - " path=train_data_dir,\n", - " fnames=get_files(train_data_dir, extensions=\".jpg\"),\n", - " label_func=partial(get_mask_path, train_data_dir=train_data_dir),\n", - " codes=[\"not-pool\", \"pool\"],\n", - " bs=bs,\n", - " valid_pct=valid_pct,\n", - " item_tfms=Resize(img_size),\n", - " batch_tfms=[\n", - " Normalize.from_stats(*imagenet_stats),\n", - " ],\n", - " )" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Review a sample batch of data\n", - "\n", - "Below are some examples of the images overlaid with their masks." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "data_loader.show_batch(alpha=0.7)" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Train multiple models with different learning rates using `DVCLiveCallback`\n", - "\n", - "Set up model training, using DVCLive to capture the results of each experiment." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "def dice(mask_pred, mask_true, classes=[0, 1], eps=1e-6):\n", - " dice_list = []\n", - " for c in classes:\n", - " y_true = mask_true == c\n", - " y_pred = mask_pred == c\n", - " intersection = 2.0 * np.sum(y_true * y_pred)\n", - " dice = intersection / (np.sum(y_true) + np.sum(y_pred) + eps)\n", - " dice_list.append(dice)\n", - " return np.mean(dice_list)\n", - "\n", - "def evaluate(learn):\n", - " test_img_fpaths = sorted(get_files(DATA / \"test_data\", extensions=\".jpg\"))\n", - " test_dl = learn.dls.test_dl(test_img_fpaths)\n", - " preds, _ = learn.get_preds(dl=test_dl)\n", - " masks_pred = np.array(preds[:, 1, :] > 0.5, dtype=np.uint8)\n", - " test_mask_fpaths = [\n", - " get_mask_path(fpath, DATA / \"test_data\") for fpath in test_img_fpaths\n", - " ]\n", - " masks_true = [Image.open(mask_path) for mask_path in test_mask_fpaths]\n", - " dice_multi = 0.0\n", - " for ii in range(len(masks_true)):\n", - " mask_pred, mask_true = masks_pred[ii], masks_true[ii]\n", - " width, height = mask_true.shape[1], mask_true.shape[0]\n", - " mask_pred = np.array(\n", - " Image.fromarray(mask_pred).resize((width, height)),\n", - " dtype=int\n", - " )\n", - " mask_true = np.array(mask_true, dtype=int)\n", - " dice_multi += dice(mask_true, mask_pred) / len(masks_true)\n", - " return dice_multi" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "scrolled": false - }, - "outputs": [], - "source": [ - "train_arch = 'shufflenet_v2_x2_0'\n", - "models_dir = ROOT / \"models\"\n", - "models_dir.mkdir(exist_ok=True)\n", - "results_dir = ROOT / \"results\" / \"train\"\n", - "\n", - "for base_lr in [0.001, 0.005, 0.01]:\n", - " # initialize dvclive, optionally provide output path, and save results as a dvc experiment\n", - " with Live(str(results_dir), save_dvc_exp=True, report=\"notebook\") as live:\n", - " # log a parameter\n", - " live.log_param(\"train_arch\", train_arch)\n", - " fine_tune_args = {\n", - " 'epochs': 8,\n", - " 'base_lr': base_lr\n", - " }\n", - " # log a dict of parameters\n", - " live.log_params(fine_tune_args)\n", - "\n", - " learn = unet_learner(data_loader, \n", - " arch=getattr(models, train_arch), \n", - " metrics=DiceMulti)\n", - " # train model and automatically capture metrics with DVCLiveCallback\n", - " learn.fine_tune(\n", - " **fine_tune_args,\n", - " cbs=[DVCLiveCallback(live=live)])\n", - "\n", - " learn.export(fname=(models_dir / \"model.pkl\").absolute())\n", - "\n", - " # add additional post-training summary metrics\n", - " live.summary[\"evaluate/dice_multi\"] = evaluate(learn)\n", - "\n", - " # save model artifact to dvc\n", - " live.log_artifact(\n", - " str(models_dir / \"model.pkl\"),\n", - " type=\"model\",\n", - " name=\"pool-segmentation\",\n", - " desc=\"This is a Computer Vision (CV) model that's segmenting out swimming pools from satellite images.\",\n", - " labels=[\"cv\", \"segmentation\", \"satellite-images\", \"unet\"],\n", - " )\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Compare experiments\n", - "!dvc exp show --only-changed" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Review sample preditions vs ground truth\n", - "\n", - "Below are some example of the predicted masks." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "learn.show_results(max_n=6, alpha=0.7)" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.6" - }, - "vscode": { - "interpreter": { - "hash": "949777d72b0d2535278d3dc13498b2535136f6dfe0678499012e853ee9abcab1" - } - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/example-get-started-experiments/code/params.yaml b/example-get-started-experiments/code/params.yaml index d0c05908..61c353ed 100644 --- a/example-get-started-experiments/code/params.yaml +++ b/example-get-started-experiments/code/params.yaml @@ -1,18 +1,8 @@ -base: - random_seed: 42 - -data_split: +create_yolo_dataset: test_regions: - REGION_1 train: - valid_pct: 0.1 - arch: shufflenet_v2_x2_0 - img_size: 256 - batch_size: 8 - fine_tune_args: - epochs: 8 - base_lr: 0.01 - -evaluate: - n_samples_to_save: 10 + epochs: 30 + imgsz: 512 + model: yolov8n-seg.pt diff --git a/example-get-started-experiments/code/requirements.txt b/example-get-started-experiments/code/requirements.txt index aeeffe22..f8326de5 100644 --- a/example-get-started-experiments/code/requirements.txt +++ b/example-get-started-experiments/code/requirements.txt @@ -1,4 +1,5 @@ dvc[s3]>=3.0 dvclive>=2.11.3 -fastai -python-box \ No newline at end of file +fire +ultralytics +shapely \ No newline at end of file diff --git a/example-get-started-experiments/code/src/create_yolo_dataset.py b/example-get-started-experiments/code/src/create_yolo_dataset.py new file mode 100644 index 00000000..680f976c --- /dev/null +++ b/example-get-started-experiments/code/src/create_yolo_dataset.py @@ -0,0 +1,49 @@ +import shutil +from pathlib import Path +from typing import List, Union + +import cv2 +from fire import Fire +from shapely import Polygon + + +def mask_to_yolo_annotation(mask): + contours, _ = cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) + annotation = "" + for contour in contours: + if contour.shape[0] < 3: + continue + polygon = Polygon(contour.squeeze()).simplify(1, preserve_topology=False) + single_annotation = "0" + for col, row in polygon.exterior.coords: + single_annotation += f" {round(col / mask.shape[1], 3)} {round(row / mask.shape[0], 3)}" + annotation += f"{single_annotation}\n" + return annotation + + +def create_yolo_dataset(test_regions: Union[str, List[str]]): + if isinstance(test_regions, str): + test_regions = [test_regions] + data = Path("datasets") + train_data_dir = data / "yolo_dataset" / "train" + train_data_dir.mkdir(exist_ok=True, parents=True) + test_data_dir = data / "yolo_dataset" / "val" + test_data_dir.mkdir(exist_ok=True, parents=True) + + for img_path in data.glob("pool_data/images/*.jpg"): + yolo_annotation = mask_to_yolo_annotation( + cv2.imread( + str(data / "pool_data" / "masks" / f"{img_path.stem}.png"), + cv2.IMREAD_GRAYSCALE + ) + ) + + if any(region in str(img_path) for region in test_regions): + dst = test_data_dir / img_path.name + else: + dst = train_data_dir / img_path.name + shutil.copy(img_path, dst) + dst.with_suffix(".txt").write_text(yolo_annotation) + +if __name__ == "__main__": + Fire(create_yolo_dataset) diff --git a/example-get-started-experiments/code/src/data_split.py b/example-get-started-experiments/code/src/data_split.py deleted file mode 100644 index b7fb3d4c..00000000 --- a/example-get-started-experiments/code/src/data_split.py +++ /dev/null @@ -1,33 +0,0 @@ -import shutil -from pathlib import Path - -import numpy as np -from box import ConfigBox -from fastai.vision.all import get_files -from ruamel.yaml import YAML - - -yaml = YAML(typ="safe") - - -def data_split(): - params = ConfigBox(yaml.load(open("params.yaml", encoding="utf-8"))) - np.random.seed(params.base.random_seed) - img_fpaths = get_files(Path("data") / "pool_data" / "images", extensions=".jpg") - - train_data_dir = Path("data") / "train_data" - train_data_dir.mkdir(exist_ok=True) - test_data_dir = Path("data") / "test_data" - test_data_dir.mkdir(exist_ok=True) - for img_path in img_fpaths: - msk_path = Path("data") / "pool_data" / "masks" / f"{img_path.stem}.png" - if any(region in str(img_path) for region in params.data_split.test_regions): - shutil.copy(img_path, test_data_dir) - shutil.copy(msk_path, test_data_dir) - else: - shutil.copy(img_path, train_data_dir) - shutil.copy(msk_path, train_data_dir) - - -if __name__ == "__main__": - data_split() diff --git a/example-get-started-experiments/code/src/evaluate.py b/example-get-started-experiments/code/src/evaluate.py deleted file mode 100644 index 34d1b38b..00000000 --- a/example-get-started-experiments/code/src/evaluate.py +++ /dev/null @@ -1,101 +0,0 @@ -from pathlib import Path - -import numpy as np -from box import ConfigBox -from dvclive import Live -from fastai.vision.all import get_files, load_learner -from PIL import Image -from ruamel.yaml import YAML - - -yaml = YAML(typ="safe") - - -def dice(mask_pred, mask_true, classes=[0, 1], eps=1e-6): - dice_list = [] - for c in classes: - y_true = mask_true == c - y_pred = mask_pred == c - intersection = 2.0 * np.sum(y_true * y_pred) - dice = intersection / (np.sum(y_true) + np.sum(y_pred) + eps) - dice_list.append(dice) - return np.mean(dice_list) - - -def paint_mask(mask, color_map={0: (0, 0, 0), 1: (0, 0, 255)}): - vis_shape = mask.shape + (3,) - vis = np.zeros(vis_shape) - for i, c in color_map.items(): - vis[mask == i] = color_map[i] - return Image.fromarray(vis.astype(np.uint8)) - - -def stack_images(im1, im2): - dst = Image.new("RGB", (im1.width + im2.width, im1.height)) - dst.paste(im1, (0, 0)) - dst.paste(im2, (im1.width, 0)) - return dst - - -def get_overlay_image(img_fpath, mask_true, mask_pred): - img_pil = Image.open(img_fpath) - overlay_img_true = Image.blend( - img_pil.convert("RGBA"), paint_mask(mask_true).convert("RGBA"), 0.5 - ) - - new_color_map = { - 0: (0, 0, 0), # no color - TN - 1: (255, 0, 255), # purple - FN - 2: (255, 255, 0), # yellow - FP - 3: (0, 0, 255), # blue - TP - } - combined_mask = mask_true + 2 * mask_pred - - overlay_img_pred = Image.blend( - img_pil.convert("RGBA"), - paint_mask(combined_mask, color_map=new_color_map).convert("RGBA"), - 0.5, - ) - stacked_image = stack_images(overlay_img_true, overlay_img_pred) - return stacked_image - - -def get_mask_path(x, train_data_dir): - return Path(train_data_dir) / f"{Path(x).stem}.png" - - -def evaluate(): - params = ConfigBox(yaml.load(open("params.yaml", encoding="utf-8"))) - model_fpath = Path("models") / "model.pkl" - learn = load_learner(model_fpath, cpu=False) - test_img_fpaths = sorted(get_files(Path("data") / "test_data", extensions=".jpg")) - test_dl = learn.dls.test_dl(test_img_fpaths) - preds, _ = learn.get_preds(dl=test_dl) - masks_pred = np.array(preds[:, 1, :] > 0.5, dtype=np.uint8) - test_mask_fpaths = [ - get_mask_path(fpath, Path("data") / "test_data") for fpath in test_img_fpaths - ] - masks_true = [Image.open(mask_path) for mask_path in test_mask_fpaths] - with Live("results/evaluate", report=None, cache_images=True) as live: - dice_multi = 0.0 - for ii in range(len(masks_true)): - mask_pred, mask_true = masks_pred[ii], masks_true[ii] - mask_pred = np.array( - Image.fromarray(mask_pred).resize((mask_true.shape[1], mask_true.shape[0])), - dtype=int - ) - mask_true = np.array(mask_true, dtype=int) - dice_multi += dice(mask_true, mask_pred) / len(masks_true) - - if ii < params.evaluate.n_samples_to_save: - stacked_image = get_overlay_image( - test_img_fpaths[ii], mask_true, mask_pred - ) - stacked_image = stacked_image.resize((512, 256)) - live.log_image(f"{Path(test_img_fpaths[ii]).stem}.png", stacked_image) - - live.summary["dice_multi"] = dice_multi - - -if __name__ == "__main__": - evaluate() diff --git a/example-get-started-experiments/code/src/train.py b/example-get-started-experiments/code/src/train.py index 02bc43d6..72baeae1 100644 --- a/example-get-started-experiments/code/src/train.py +++ b/example-get-started-experiments/code/src/train.py @@ -1,81 +1,12 @@ -import random -from functools import partial -from pathlib import Path +from fire import Fire +from ultralytics import YOLO -import numpy as np -import torch -from box import ConfigBox -from dvclive import Live -from dvclive.fastai import DVCLiveCallback -from fastai.data.all import Normalize, get_files -from fastai.metrics import DiceMulti -from fastai.vision.all import ( - Resize, - SegmentationDataLoaders, - imagenet_stats, - models, - unet_learner, -) -from ruamel.yaml import YAML -yaml = YAML(typ="safe") +def train(epochs: int = 10, imgsz: int = 384, model: str = "yolov8n-seg.pt", **kwargs): + yolo = YOLO(model) - -def get_mask_path(x, train_data_dir): - return Path(train_data_dir) / f"{Path(x).stem}.png" - - -def train(): - params = ConfigBox(yaml.load(open("params.yaml", encoding="utf-8"))) - - np.random.seed(params.base.random_seed) - torch.manual_seed(params.base.random_seed) - random.seed(params.base.random_seed) - train_data_dir = Path("data") / "train_data" - - data_loader = SegmentationDataLoaders.from_label_func( - path=train_data_dir, - fnames=get_files(train_data_dir, extensions=".jpg"), - label_func=partial(get_mask_path, train_data_dir=train_data_dir), - codes=["not-pool", "pool"], - bs=params.train.batch_size, - valid_pct=params.train.valid_pct, - item_tfms=Resize(params.train.img_size), - batch_tfms=[ - Normalize.from_stats(*imagenet_stats), - ], - ) - - model_names = [ - name - for name in dir(models) - if not name.startswith("_") - and name.islower() - and name not in ("all", "tvm", "unet", "xresnet") - ] - if params.train.arch not in model_names: - raise ValueError(f"Unsupported model, must be one of:\n{model_names}") - - with Live("results/train", report=None) as live: - learn = unet_learner( - data_loader, arch=getattr(models, params.train.arch), metrics=DiceMulti - ) - - learn.fine_tune( - **params.train.fine_tune_args, - cbs=[DVCLiveCallback(live=live)], - ) - models_dir = Path("models") - models_dir.mkdir(exist_ok=True) - learn.export(fname=(models_dir / "model.pkl").absolute()) - live.log_artifact( - str(models_dir / "model.pkl"), - type="model", - name="pool-segmentation", - desc="This is a Computer Vision (CV) model that's segmenting out swimming pools from satellite images.", - labels=["cv", "segmentation", "satellite-images", params.train.arch], - ) + yolo.train(data="datasets/yolo_dataset.yaml", epochs=epochs, imgsz=imgsz, **kwargs) if __name__ == "__main__": - train() + Fire(train) \ No newline at end of file diff --git a/example-get-started-experiments/generate.sh b/example-get-started-experiments/generate.sh index 7e0a48fa..cd3b110d 100755 --- a/example-get-started-experiments/generate.sh +++ b/example-get-started-experiments/generate.sh @@ -27,8 +27,8 @@ tick(){ export GIT_COMMITTER_DATE="${TAG_TIME} +0000" } -export GIT_AUTHOR_NAME="Alex Kim" -export GIT_AUTHOR_EMAIL="alex000kim@gmail.com" +export GIT_AUTHOR_NAME="David de la Iglesia" +export GIT_AUTHOR_EMAIL="daviddelaiglesiacastro@gmail.com" export GIT_COMMITTER_NAME="$GIT_AUTHOR_NAME" export GIT_COMMITTER_EMAIL="$GIT_AUTHOR_EMAIL" @@ -38,7 +38,11 @@ pushd $REPO_PATH virtualenv -p python3 .venv export VIRTUAL_ENV_DISABLE_PROMPT=true source .venv/bin/activate -echo '.venv/' > .gitignore +echo '.venv/' >> .gitignore +echo 'yolo*.pt' >> .gitignore +echo '/runs' >> .gitignore +echo '/weights' >> .gitignore +echo 'dvclive/report.html' >> .gitignore # Installing from main since we'd like to update repo before # the release @@ -67,56 +71,45 @@ tick git commit -m "Initialize DVC project" -cp -r $HERE/code/data . -git add data/.gitignore data/pool_data.dvc +cp -r $HERE/code/datasets . +git add datasets/.gitignore datasets/pool_data.dvc tick git commit -m "Add data" dvc pull -cp -r $HERE/code/notebooks . +cp -r $HERE/code/TrainSegModel.ipynb . git add . git commit -m "Add notebook using DVCLive" +sudo apt-get update && sudo apt-get install ffmpeg libsm6 libxext6 -y pip install -r requirements.txt --extra-index-url https://download.pytorch.org/whl/cu118 pip install jupyter -jupyter nbconvert --execute 'notebooks/TrainSegModel.ipynb' --inplace -# Apply best experiment -BEST_EXP_ROW=$(dvc exp show --drop '.*' --keep 'Experiment|evaluate/dice_multi|base_lr' --csv --sort-by evaluate/dice_multi | tail -n 1) -BEST_EXP_NAME=$(echo $BEST_EXP_ROW | cut -d, -f 1) -BEST_EXP_BASE_LR=$(echo $BEST_EXP_ROW | cut -d, -f 3) -dvc exp apply $BEST_EXP_NAME +yolo settings datasets_dir="/workspaces/example-repos-dev/example-get-started-experiments/build/example-get-started-experiments/datasets/" +yolo settings runs_dir="/workspaces/example-repos-dev/example-get-started-experiments/build/example-get-started-experiments/runs/" +yolo settings weights_dir="/workspaces/example-repos-dev/example-get-started-experiments/build/example-get-started-experiments/weights/" +jupyter nbconvert --execute 'TrainSegModel.ipynb' --inplace git add . tick git commit -m "Run notebook and apply best experiment" git tag -a "1-notebook-dvclive" -m "Experiment using Notebook" -gto register results/train:pool-segmentation --version v1.0.0 -gto assign results/train:pool-segmentation --version v1.0.0 --stage dev +gto register dvclive:best --version v0.1.0 +gto assign dvclive:best --version v0.1.0 --stage dev cp -r $HERE/code/src . cp $HERE/code/params.yaml . -sed -e "s/base_lr: 0.01/base_lr: $BEST_EXP_BASE_LR/" -i".bkp" params.yaml -rm params.yaml.bkp -dvc stage add -n data_split \ - -p base,data_split \ - -d src/data_split.py -d data/pool_data \ - -o data/train_data -o data/test_data \ - python src/data_split.py +dvc stage add -n create_yolo_dataset \ + -d src/create_yolo_dataset.py -d datasets/pool_data \ + -o datasets/yolo_dataset/train -o datasets/yolo_dataset/val \ + "python src/create_yolo_dataset.py \${create_yolo_dataset}" -dvc remove models/model.pkl.dvc dvc stage add -n train \ - -p base,train \ - -d src/train.py -d data/train_data \ - -o models/model.pkl \ - python src/train.py - -dvc stage add -n evaluate \ - -p base,evaluate \ - -d src/evaluate.py -d models/model.pkl -d data/test_data \ - python src/evaluate.py + -d src/train.py -d datasets/yolo_dataset/train -d datasets/yolo_dataset/val \ + "python src/train.py \${train}" +git rm TrainSegModel.ipynb git add . tick git commit -m "Convert Notebook to dvc.yaml pipeline" @@ -127,16 +120,16 @@ git add . tick git commit -m "Run dvc.yaml pipeline" git tag -a "2-dvc-pipeline" -m "Experiment using dvc pipeline" -gto register results/train:pool-segmentation --version v1.0.1 -gto assign results/train:pool-segmentation --version v1.0.0 --stage prod -gto assign results/train:pool-segmentation --version v1.0.1 --stage dev +gto register dvclive:best --version v0.2.0 +gto assign dvclive:best --version v0.1.0 --stage prod +gto assign dvclive:best --version v0.2.0 --stage dev -export GIT_AUTHOR_NAME="David de la Iglesia" -export GIT_AUTHOR_EMAIL="daviddelaiglesiacastro@gmail.com" +export GIT_AUTHOR_NAME="Dave Berenbaum" +export GIT_AUTHOR_EMAIL="dave.berenbaum@gmail.com" export GIT_COMMITTER_NAME="$GIT_AUTHOR_NAME" export GIT_COMMITTER_EMAIL="$GIT_AUTHOR_EMAIL" -dvc exp run --queue --set-param 'train.arch=alexnet,resnet34,squeezenet1_1' --message 'Tune train.arch' +dvc exp run --queue --set-param 'train.model=yolov8s-seg.pt,yolov8m-seg.pt,yolov8l-seg.pt' --message 'Tune model' dvc exp run --run-all dvc push -A