From 30fdcd509146fc20ca20f531e49f121d260343db Mon Sep 17 00:00:00 2001 From: cosmic-cortex Date: Wed, 3 Oct 2018 09:18:16 +0200 Subject: [PATCH 001/182] fix: math fixed --- docs/source/content/examples/bayesian_optimization.ipynb | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/source/content/examples/bayesian_optimization.ipynb b/docs/source/content/examples/bayesian_optimization.ipynb index fa8458c..203bc5c 100644 --- a/docs/source/content/examples/bayesian_optimization.ipynb +++ b/docs/source/content/examples/bayesian_optimization.ipynb @@ -100,8 +100,8 @@ "source": [ "## Optimizing using *expected improvement*\n", "During the optimization, the utility of each point is given by the so-called *acquisition function*. In this case, we are going to use the *expected improvement*, which is defined by\n", - "

$$ EI(x) = (\\mu(x) - f(x^+)) \\psi\\Big( \\frac{\\mu(x) - f(x^+)}{\\sigma(x)} \\Big) + \\sigma(x) \\phi\\Big( \\frac{\\mu(x) - f(x^+)}{\\sigma(x)} \\Big), $$

\n", - "where $ \\mu(x) $ and $ \\sigma(x) $ are the mean and variance of the Gaussian process regressor at $ x $, $ f $ is the function to be optimized with estimated maximum at $ x^+ $, and $ \\psi(z) $, $ \\phi(z) $ denotes the cumulative distribution function and density function of a standard Gaussian distribution. After each query, the acquisition function is reevaluated and the new query is chosen to maximize the acquisition function. " + "$$EI(x) = (\\mu(x) - f(x^+)) \\psi\\Big( \\frac{\\mu(x) - f(x^+)}{\\sigma(x)} \\Big) + \\sigma(x) \\phi\\Big( \\frac{\\mu(x) - f(x^+)}{\\sigma(x)} \\Big),$$\n", + "where $\\mu(x)$ and $\\sigma(x)$ are the mean and variance of the Gaussian process regressor at $x$, $f$ is the function to be optimized with estimated maximum at $x^+$, and $\\psi(z)$, $\\phi(z)$ denotes the cumulative distribution function and density function of a standard Gaussian distribution. After each query, the acquisition function is reevaluated and the new query is chosen to maximize the acquisition function. " ] }, { From b7490f4ed28aa75fbe69820c154dca6cbee51a76 Mon Sep 17 00:00:00 2001 From: cosmic-cortex Date: Wed, 3 Oct 2018 09:24:45 +0200 Subject: [PATCH 002/182] fix: math fixed once more --- docs/source/content/examples/bayesian_optimization.ipynb | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/source/content/examples/bayesian_optimization.ipynb b/docs/source/content/examples/bayesian_optimization.ipynb index 203bc5c..83ab89d 100644 --- a/docs/source/content/examples/bayesian_optimization.ipynb +++ b/docs/source/content/examples/bayesian_optimization.ipynb @@ -100,7 +100,9 @@ "source": [ "## Optimizing using *expected improvement*\n", "During the optimization, the utility of each point is given by the so-called *acquisition function*. 
In this case, we are going to use the *expected improvement*, which is defined by\n", + "\n", "$$EI(x) = (\\mu(x) - f(x^+)) \\psi\\Big( \\frac{\\mu(x) - f(x^+)}{\\sigma(x)} \\Big) + \\sigma(x) \\phi\\Big( \\frac{\\mu(x) - f(x^+)}{\\sigma(x)} \\Big),$$\n", + "\n", "where $\\mu(x)$ and $\\sigma(x)$ are the mean and variance of the Gaussian process regressor at $x$, $f$ is the function to be optimized with estimated maximum at $x^+$, and $\\psi(z)$, $\\phi(z)$ denotes the cumulative distribution function and density function of a standard Gaussian distribution. After each query, the acquisition function is reevaluated and the new query is chosen to maximize the acquisition function. " ] }, From c66e37101c0a764cbfe1ef5e68630ece451bf970 Mon Sep 17 00:00:00 2001 From: cosmic-cortex Date: Tue, 9 Oct 2018 14:41:18 +0200 Subject: [PATCH 003/182] fix: broken image links fixed --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index f76a668..1ac84e6 100644 --- a/README.md +++ b/README.md @@ -128,7 +128,7 @@ regressor = ActiveLearner( ``` The initial regressor is not very accurate.

- +

The blue band enveloping the regressor represents the standard deviation of the Gaussian process at the given point. Now we are ready to do active learning! @@ -142,7 +142,7 @@ for idx in range(n_queries): After a few queries, we can see that the prediction is much improved.

- +

## Additional examples From fa61e068a194f545866379b95cb0dc4173492c46 Mon Sep 17 00:00:00 2001 From: cosmic-cortex Date: Thu, 18 Oct 2018 18:42:07 +0200 Subject: [PATCH 004/182] fix: modAL.models documentation fixed, docs for missing methods added --- docs/source/content/apireference/models.rst | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/docs/source/content/apireference/models.rst b/docs/source/content/apireference/models.rst index 20e8a41..c03cab5 100644 --- a/docs/source/content/apireference/models.rst +++ b/docs/source/content/apireference/models.rst @@ -1,5 +1,14 @@ modAL.models ============ -.. automodule:: modAL.models - :members: +.. autoclass:: modAL.models.ActiveLearner + :members: fit, predict, predict_proba, query, score, teach + +.. autoclass:: modAL.models.BayesianOptimizer + :members: fit, predict, predict_proba, query, score, teach + +.. autoclass:: modAL.models.Committee + :members: fit, predict, predict_proba, query, rebag, score, teach, vote, vote_proba + +.. 
autoclass:: modAL.models.CommitteeRegressor + :members: fit, predict, query, rebag, teach, vote From b5e0fb43f4608ca02a9026ab0c0b13efba56941e Mon Sep 17 00:00:00 2001 From: cosmic-cortex Date: Thu, 18 Oct 2018 19:11:54 +0200 Subject: [PATCH 005/182] fix: travis-ci, codecov and rtd badges fixed --- README.md | 2 +- docs/source/index.rst | 7 +++++++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 1ac84e6..e2a1664 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,7 @@ Modular Active Learning framework for Python3 -[![travis-ci-master](https://travis-ci.org/cosmic-cortex/modAL.svg?branch=master)](https://travis-ci.org/cosmic-cortex/modAL) [![codecov-master](https://codecov.io/gh/cosmic-cortex/modAL/branch/master/graph/badge.svg)](https://codecov.io/gh/cosmic-cortex/modAL) [![readthedocs](https://readthedocs.org/projects/modal-python/badge/?version=latest)](http://modal-python.readthedocs.io/en/latest/?badge=latest) +[![travis-ci-master](https://travis-ci.org/modAL-python/modAL.svg?branch=master)](https://travis-ci.org/cosmic-cortex/modAL) [![codecov-master](https://codecov.io/gh/modAL-python/modAL/branch/master/graph/badge.svg)](https://codecov.io/gh/cosmic-cortex/modAL) [![readthedocs](https://readthedocs.org/projects/modal-python/badge/?version=latest)](http://modal-python.readthedocs.io/en/latest/?badge=latest) ## Page contents - [Introduction](#introduction) diff --git a/docs/source/index.rst b/docs/source/index.rst index 3fb63cb..508677d 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -1,6 +1,13 @@ modAL: A modular active learning framework for Python3 ====================================================== +.. image:: https://travis-ci.org/modAL-python/modAL.svg?branch=master + :target: https://travis-ci.org/modAL-python/modAL +.. image:: https://codecov.io/gh/modAL-python/modAL/branch/master/graph/badge.svg + :target: https://codecov.io/gh/modAL-python/modAL +.. 
image:: https://readthedocs.org/projects/modal-python/badge/?version=latest + :target: https://modal-python.readthedocs.io/en/latest/?badge=latest + Welcome to the documentation for modAL! modAL is an active learning framework for Python3, designed with *modularity, flexibility* and *extensibility* in mind. Built on top of scikit-learn, it allows you to rapidly create active learning workflows with nearly complete freedom. What is more, you can easily replace parts with your custom built solutions, allowing you to design novel algorithms with ease. From 38182cc3b2141d02a5e2258ce14b98ba3e5db6a9 Mon Sep 17 00:00:00 2001 From: cosmic-cortex Date: Thu, 18 Oct 2018 19:25:11 +0200 Subject: [PATCH 006/182] fix: badge links fixed --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index e2a1664..5f0153b 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,7 @@ Modular Active Learning framework for Python3 -[![travis-ci-master](https://travis-ci.org/modAL-python/modAL.svg?branch=master)](https://travis-ci.org/cosmic-cortex/modAL) [![codecov-master](https://codecov.io/gh/modAL-python/modAL/branch/master/graph/badge.svg)](https://codecov.io/gh/cosmic-cortex/modAL) [![readthedocs](https://readthedocs.org/projects/modal-python/badge/?version=latest)](http://modal-python.readthedocs.io/en/latest/?badge=latest) +[![travis-ci-master](https://travis-ci.org/modAL-python/modAL.svg?branch=master)](https://travis-ci.org/modAL-python/modAL) [![codecov-master](https://codecov.io/gh/modAL-python/modAL/branch/master/graph/badge.svg)](https://codecov.io/gh/modAL-python/modAL) [![readthedocs](https://readthedocs.org/projects/modal-python/badge/?version=latest)](http://modal-python.readthedocs.io/en/latest/?badge=latest) ## Page contents - [Introduction](#introduction) From d16ca1d30ce55279e597003cac1c210ca314e18c Mon Sep 17 00:00:00 2001 From: cosmic-cortex Date: Thu, 18 Oct 2018 19:44:00 +0200 Subject: [PATCH 007/182] fix: installation commands 
fixed in docs --- README.md | 2 +- docs/source/content/overview/Installation.rst | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 5f0153b..a6e5db4 100644 --- a/README.md +++ b/README.md @@ -169,7 +169,7 @@ pip install modAL ``` Alternatively, you can install modAL directly from source: ``` -pip install git+https://github.com/cosmic-cortex/modAL.git +pip install git+https://github.com/modAL-python/modAL.git ``` # Documentation diff --git a/docs/source/content/overview/Installation.rst b/docs/source/content/overview/Installation.rst index c526317..16a209a 100644 --- a/docs/source/content/overview/Installation.rst +++ b/docs/source/content/overview/Installation.rst @@ -17,6 +17,6 @@ Alternatively, you can install modAL directly from source: :: - pip install git+https://github.com/cosmic-cortex/modAL.git + pip install git+https://github.com/modAL-python/modAL.git For running the examples, Matplotlib >= 2.0 is recommended. From 5740fe6091fdcb6f5a93c410b74387fc834a7b31 Mon Sep 17 00:00:00 2001 From: cosmic-cortex Date: Fri, 19 Oct 2018 19:48:29 +0200 Subject: [PATCH 008/182] add: jupyter notebook to demonstrate interactive labeling --- .../examples/interactive_labeling.ipynb | 236 ++++++++++++++++++ docs/source/index.rst | 1 + 2 files changed, 237 insertions(+) create mode 100644 docs/source/content/examples/interactive_labeling.ipynb diff --git a/docs/source/content/examples/interactive_labeling.ipynb b/docs/source/content/examples/interactive_labeling.ipynb new file mode 100644 index 0000000..2f1b9fb --- /dev/null +++ b/docs/source/content/examples/interactive_labeling.ipynb @@ -0,0 +1,236 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Interactive labeling with Jupyter\n", + "\n", + "In this example, the active learning workflow of modAL is demonstrated - with you in the loop! 
By running this notebook, you'll be queried to label digits using the [DIGITS dataset](http://archive.ics.uci.edu/ml/datasets/Optical+Recognition+of+Handwritten+Digits). If you would like to try this out, you can [download this notebook here](https://github.com/modAL-python/modAL/blob/master/docs/source/content/examples/interactive_labeling.ipynb)!" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/namazu/.local/lib/python3.6/site-packages/sklearn/ensemble/weight_boosting.py:29: DeprecationWarning: numpy.core.umath_tests is an internal NumPy module and should not be imported. It will be removed in a future NumPy release.\n", + " from numpy.core.umath_tests import inner1d\n" + ] + } + ], + "source": [ + "import numpy as np\n", + "\n", + "from modAL.models import ActiveLearner\n", + "from modAL.uncertainty import uncertainty_sampling\n", + "\n", + "from sklearn.datasets import load_digits\n", + "from sklearn.model_selection import train_test_split\n", + "from sklearn.ensemble import RandomForestClassifier\n", + "\n", + "from IPython import display\n", + "from matplotlib import pyplot as plt\n", + "%matplotlib inline" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## The dataset" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now we set up the initial training set for our classifier. If you would like to play around, you can try to modify the value ```n_initial``` below and see if it impacts the algorithm!" 
+ ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "n_initial = 100" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "X, y = load_digits(return_X_y=True)\n", + "X_train, X_test, y_train, y_test = train_test_split(X, y)\n", + "\n", + "initial_idx = np.random.choice(range(len(X_train)), size=n_initial, replace=False)\n", + "\n", + "X_initial, y_initial = X_train[initial_idx], y_train[initial_idx]\n", + "X_pool, y_pool = np.delete(X_train, initial_idx, axis=0), np.delete(y_train, initial_idx, axis=0)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Initializing the learner" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now we initialize the active learner. Feel free to change the underlying ```RandomForestClassifier``` or the ```uncertainty_sampling```!" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "learner = ActiveLearner(\n", + " estimator=RandomForestClassifier(),\n", + " query_strategy=uncertainty_sampling,\n", + " X_training=X_initial, y_training=y_initial\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We also set how many queries we want to make. The more the better! 
(Usually :) )" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "n_queries = 20" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## The active learning loop" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAkgAAAFICAYAAACr9gEyAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMi4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvhp/UCwAAIABJREFUeJzs3Xd4lGX28PHvZJJJ7yEJCR2kSJHQm4ihhGJhbURYcMV31dWfqyjsGnRFxUXRxbW7rmXXEgWjEEWRKAiuCoJAaCJdShpk0odMMpmZ5/0jzJCEhLSZeSaT87kur8upz5kRycm57/scjaIoCkIIIYQQws5L7QCEEEIIIdyNJEhCCCGEEHVIgiSEEEIIUYckSEIIIYQQdUiCJIQQQghRhyRIQgghhBB1SIIkhBACgOTkZK677jq1w3CqPXv2cNVVV3H33XerHYpDPfLII7z88suXfM7q1av5wx/+4JqAPIAkSEIIIThy5AjBwcHExcWRmZmpdjhO88MPPzBixAj+9a9/qR2KcHOSIAkhhGD16tVMnTqVa665hvT09FqPrVmzhqSkJJKSkli0aBEmk6nB+7dt28bkyZPtr615++WXX+bRRx/lpptu4r///S9Wq5UnnniCpKQkEhMTWbRoEVVVVQAUFhZy9913M3HiRK699lp++OEHNm3axDXXXFMrthtuuIENGzZc9Hnee+89pk+fztSpU/nTn/5EYWEh69ev57333mPTpk388Y9/rPX85cuX8+STT9pvl5SUcMUVV1BYWMjBgwdJTk5m6tSpXH/99Xz//ff276xmRabm7Ycffpinn36aa6+9lq+++qrWtbKyshg3bhxvvvmm/fvbvXs3d955J1deeSUpKSn253711Vdcc801TJ06lXnz5nHq1CkAioqKmD9/PomJidx5552UlZXZX3P06FF+//vfk5SUxLXXXsu+ffsu+n5EEyhCCCHaNbPZrEycOFEpKytTysvLlQkTJiiVlZWKoijK6dOnlVGjRil5eXmK1WpV7r33XuXNN99s8P6ffvpJmTRpkv29a95+6aWXlHHjxikFBQWKoijK+vXrlWuuuUYxmUxKRUWFMm3aNCU9PV1RFEVZvHix8uyzzyqKoii//PKLMmLECKWyslIZMWKE8uuvvyqKoig5OTnK0KFD7bHaZGZmKuPHj1f0er2iKIry5JNPKosXL7bHYPv3mvbv36+MHj1aqaqqUhRFUdasWaPMnz9fsVgsyrRp05S1a9cqiqIoe/fuVYYPH66UlZUpn376qXLbbbfZ36Pm7b/+9a/Ktddeq1RUVFx0rdOnTyuXX365smbNGkVRFOW+++5TJkyYoBQUFCiFhYXKgAEDlJMnTyrZ2dnK0KFDlRMnTiiKoihvv/22/f2XL1+uPPjgg/b3S0hIUF566SXFYrEo11xzjfLxxx8riqIoO3bsUMaNG6dUVVVdFK+4NKkgCSFEO/fDDz8wcOBAgoKC8Pf3Z8SIEWzatAmAH3/8kYSEBGJiYtBoNKxYsYI//OEPDd7fmCuuuIKIiAgAkpKS+PTTT/Hx8cHX15eBAwdy+vRpAL777jt7tejyyy9n48aN6HQ6kpKS+PLLLwH45ptv
mDhxIjqdrtY1Nm/eTFJSEpGRkQDcfPPN/Pjjj5eMq3///gQHB7N161YANmzYwPTp08nKykKv1zNjxgwABg4cSFxcXJOqMqNHj8bX17fex8xmM1OnTgWgd+/eDBw4kIiICMLDw+nQoQNnz57lxx9/ZOTIkXTt2tX+ObZt20ZVVRU7duxg2rRpAHTq1IkRI0YAcPz4cU6dOsWNN94IwNChQ4mIiPDoZVNn8VY7ACGEEOpavXo1//vf/xg2bBgAFouFkpISkpKSKCoqIiQkxP5c2w/8hu5vTGhoqP3fCwsLWbp0KQcOHECj0aDX67ntttsAKC4uJjg42P7coKAgAGbMmEFKSgoPPfQQGzZs4I477rjoGoWFhURHR9tvh4SEUFBQ0Ghs11xzDV988QXDhw9n+/bt/P3vf+e3334jODgYjUZT6/0KCwub9Vnr0mq1+Pn5AeDl5UVAQECtxywWy0XfcXBwMIqiUFxcTElJSa3vx/a80tJSLBYL06dPtz9mMBgoLi5uNF5RmyRIQgjRjpWWlrJ9+3a2bdtmr8SYzWauuuoqCgsLCQ8Pr1V9MBgMVFRUNHi/7Ye7TUlJSYPX/uc//4m3tzdr165Fp9Px0EMP2R8LCwujqKiITp06AdX7dmJiYhg+fDhms5lNmzZx5MgRxowZc9H7RkVF1UoIiouLiYqKavS7mDFjBrfccgvjx49nyJAhhIaGEhkZSUlJCYqi2JOk4uJiIiMjycvLa/JnbYnIyMha33FJSQleXl6Eh4cTEhJSa99RYWEhnTt3Jjo6msDAQNavX3/R+61evdqh8Xk6WWITQoh27IsvvmDUqFG1lqm8vb0ZN24cX3zxBVdddRW7du0iKysLRVFYsmQJn3zySYP3d+jQgfz8fAoKCrBYLHzxxRcNXrugoIDLLrsMnU7HwYMHyczM5Ny5cwAkJiayZs0aoHrT8Q033IDFYsHLy4vp06ezdOlSEhMT8fHxueh9J0yYwDfffENRUREAK1eu5Kqrrmr0u+jRowddunRhxYoVtZavYmNjWbduHQC7du1Cr9czaNAgoqOjOXHiBJWVlRiNRjIyMpr4rTfN2LFj2bFjh33ZceXKlYwdOxZvb28GDx5s35x+6tQpdu7cCUB8fDyxsbH2BKmwsJAHH3yQ8vJyh8bWHkiCJIQQ7Vh6ejqTJk266P7JkyeTnp5ObGwsTz75JLfddhtJSUkA3H777Q3e37VrV2688UZmzpzJ7NmzGTVqVIPXnj9/PitXrmTKlCmkpqby17/+lVWrVvHVV1+xaNEi8vLySExMZMGCBfzjH/+wL0nNmDGD7OzsWstINQ0aNIg777yTOXPmMHXqVMrKyliwYEGTvo8ZM2ag1+uZOHEiABqNhueff54PPviAadOm8dRTT/Hiiy8SEBDAyJEjGTRoEElJSfzxj3+s93tsjdjYWJYuXco999zDtGnT+Pnnn+0n7e666y6ys7NJTExk6dKlTJkypVa8qampTJ06ld///veMHj261hKeaBqNoiiK2kGIlunTpw9dunTBy8sLo9FIv379uPvuu0lISABgxYoVxMXFceuttzb4Ht988w3ffvstTz/9NMePH6egoIDhw4df9Lzvv/+enj17EhcX1+T4Hn74Ybp06cI999zT4HO2bdvGo48+yjfffNPk94Xqz/7dd98RGxvbrNcJIdo+vV7P7373OzZv3oxWq3Xoe69bt46MjAxefPFFh76vaHtkD1Ib9/777xMbG4uiKKxfv567776bV155heHDh9daz2/I5MmT7T1KNmzYgNlsrjdB+u9//8uf/vSnZiVIQgjhDC+99BK33nqrw5Mjo9HIW2+9xeLFix36vqJtkiU2D6HRaJg2bRr33nsvK1asAKorOK+99hpQXQFKSkpi+vTprFq1iiFDhpCVlWVvbPbtt9/yxhtv8N577/HMM8/Ueu8XXniBn376iUWLFrFu3ToqKyt57LHHSEpKYtq0aTzzzDO1NirWJzMzkxtu
uIGpU6cyffp0tmzZUuvx5cuXk5SUxNSpU9m1axcAJpOJp556yt5ETjrfCtG+2Za+9Ho98+fPd+h7b9q0iWnTpnH11VfbT/OJ9k0SJA8zY8YM9uzZQ0VFhf0+i8VCSkoKixcvZt26dZw4cQKj0VjrdYmJiUyePJl58+bx8MMP13rsgQceICYmhueee47p06fz7rvvkpeXx5dffsmaNWvYsWPHJTdiAjz22GPccccdrF+/njvvvJMlS5bYH8vOzmbAgAFkZGQwf/58+xr7+++/z9GjR1m7di1ffPEFGRkZ9t4sQoj2Jyoqio0bN/Laa6/Z9yM5ytVXX83mzZu57777HPq+ou2SBMnDBAUFYbVa7SdBAPspC9spjrlz52K1Wlt8jc2bN3PLLbfg7e2Nn58f1157baNN2NLT0+2nQoYOHWo/lQHV/VNsj02bNo1ff/2VyspKvvrqK2666SZ0Oh0BAQFcf/31fP311y2OW4i6zGYzWVlZmM1mtUMRQrhAc/6flz1IHiYrKwsfH59aDcTqNhSr2UCtJQoLC2s1QAsNDW20CdvatWt57733OHfuHFarlZpnA8LCwvDyqs7Vbc3gSkpKKCsrY8WKFbzyyitA9ZLboEGDWhW7EDVlZ2fbT1DJhn8hPF9eXh5z5szh66+/tncob4gkSB4mIyODESNG1OppEhQUVKuipNfrW3WN5jZhO3PmDI8++ihpaWn069ePEydO2I8FQ+3maqWlpUB10hQdHc38+fO5+uqrWxWvEA3Jz88HYM6cOSpHIoRwpfz8fEmQ2gtFUcjIyODdd9/lrbfeqvVYt27dsFqtbNu2jZEjR/LRRx/Vaptv4+3tXasza0OPXXXVVXzyySckJiZSWVnJZ599dtFk7JoKCwsJCAige/fumM1mVq1aBVR33gWoqKjgm2++YfLkyaxfv56BAwei0+lITEwkLS2N8ePH4+Xlxeuvv86AAQMYP358i74jIerq0KEDgFSQhGgnbBUk2//7lyIJUhs3d+5ctFotBoOBnj178u9//5uBAwfWeo5Op+Pxxx8nJSWF4OBgbr/9dry8vC5Kkq6++moWLlxIdnY2L730Uq3HkpKSWLBgAffffz/z5s0jKyuLGTNmoNFomDp1qn0PUX369u3L+PHjSUxMpGPHjjz88MPs2rWL2bNn88gjj9CjRw8yMzNZsWIFXl5e9lN0c+bMITs7mxkzZqAoCgMGDLDPaRLCEWzHxGNjY+0jLYQQnq8pLSKkUWQ7VF5eTkJCAjt27Ki1N0mI9iYrK4uJEyeyceNGSZCEaAea8/+8nGJrJ2688Ub7LKF169bRs2dPSY6EEEKIBsgSWzuRkpLCk08+yYsvvkhgYOBFzSCFEEIIcYEkSO3EsGHD+Pzzz9UOQwghhGgTZIlNCCGEEKKOVlWQKioq2L9/Px06dHD40EAhhHuyWCzk5+czYMAAh497EEIId9GqBGn//v3SYE2Idio1NVWGegohPFarEiR3aLJm67yshkWLFql2bUD1uWRTpkxR9fpvvPGGqtdvr5rTaE0IIdqqViVI7tBkrebIC1erOc5DDWoP2FT780vfGnXJsroQwpPJKTYhhBBCtBnpmdk8l3GInGIjcWH+LErqw8yEeIdfRxIkIYQQQrQJ6ZnZpKzeh7HKAkB2sZGU1fsAHJ4kyTF/IYQQQrQJz2UcsidHNsYqC89lHHL4tSRBEkIIIUSbkFNsbNb9rSEJkhBCCCHahLgw/2bd3xqSIAkhhBCiTViU1Adf79qpi7+PlkVJfRx+LUmQhBBCCNEmzEyI5+6retpvx4f58/QNA+UUmxBCCCHat34dQwD44r5xDIgPddp1pIIkhBBCiDYj31AJQHSwr1Ov06QK0rJly9izZw8ajYbFixczaNAgpwYlhBBCCFEffVklGg1EBDp3mkOjCdL27ds5efIkq1at4ujRo6SkpJCWlubUoIQQQggh6pNvqCQiQIe31rmLYI2++9atW5k0aRIAvXr1
orS0FIPB4NSghBBCCCHqoy+rJCrIuctr0IQESa/XEx4ebr8dGRlJfn6+U4MSQoi6li1bxqxZs0hOTmbv3r21HktNTWXWrFnceuut/P3vf6/1mF6vZ/jw4Wzbts2V4QohnCTfUEkHJ+8/giYkSIqiXHRbo9E4LSAhhKir5lL/U089xdKlS+2PGQwG3n77bVJTU/noo484duwYu3fvtj/+7LPP0rlzZzXCFkI4gd5QSVSQc/cfQRMSpJiYGPR6vf322bNniYqKcmpQQghR06WW+n18fPDx8aG8vByz2YzRaCQ0NNT+usDAQHr37q1a7EIIx1EUBX2ZyT0qSGPHjiUjIwOAAwcOEB0dTVBQkNMDE0IIm0st9fv6+nLvvfcyadIkEhMTGTx4MN27d8dkMvHqq6+yYMECtcIWQjjYOZMFY5XFJXuQGj3FNmTIEPr3709ycjIajYYlS5Y4PSghhKjpUkv9BoOBN954g/Xr1xMUFMRtt93GwYMH2bBhAzfffDMhISFqhCyEcAJ9WXUPJFdUkJrUB2nhwoXOjkMIIRp0qaX+Y8eO0blzZyIiIgAYNmwY+/fv54cffsBqtZKamsqpU6fYu3cvL774Ipdddpkqn0EI0Xq2JpFucYpNCCHUdqml/vj4eI4dO0ZFRQWKorB//366devGypUr+fjjj/n444+ZMGECS5YskeRIiDbO7SpIQgihpvqW+levXk1wcDCTJ0/mjjvuYN68eWi1WhISEhg2bJjaIQshnMCVFSRJkIQQbULdpf6+ffva/z05OZnk5OQGX/vMM884LS4hhOvoyyrxcsGYEZAlNiGEEEK0EfmGSiICfdF6Ob8foyRIQgghhGgT8l3UAwkkQRJCCCFEG5Hvoi7aIAmSEEIIIdoIfVklHVywQRskQRJCCCFEG6AoissG1YIkSEIIIYRoA8oqzZjMVpcc8QdJkIQQQgjRBuS7sEkkeEAfJFt3XTWkp6erdm2AMWPGqHp9tT9/dna2ateOj49X7dpCCNEe2bpoSwVJCCGEEOI8Wxdt2YMkhBBCCHHehQqSHPMXQgghhACqK0haLw3hAZIgCSGEEEIAoC8zERmow8sFY0ZAEiQhhBBCtAGu7IEEkiAJIYQQog3QGypddoINJEESQgghRBuQXyYVJCGEEEIIO0VRpIIkhBBCCFFTibGKKosiFSQhhBBCCBu9wbU9kEASJCGEEEK4ubMunsMGkiAJIYQQws3pDSYAOrjbHqTDhw8zadIkPvjgA2fHI4QQQghRi94dK0jl5eUsXbqU0aNHuyIeIYQQQoha8g2V+Gg1hPr7uOyajSZIOp2ON998k+joaFfEI4QQQghRi76s+oi/RuOaMSMA3o0+wdsbb+9GnyaEEEII4RT5Lu6BBLJJWwghhBBuTu/iOWwgCZIQQggh3Fx+WaVLeyCBJEhCCCGEcGNWq0KBweTyJbZGNxft37+f5cuXk52djbe3NxkZGbz88suEhYW5Ij4hhBBCtGPFxirMVteOGYEmJEgDBgzg/fffd0UsQgjRoGXLlrFnzx40Gg2LFy9m0KBB9sdSU1P5/PPP8fLyYsCAATzyyCOYzWYeeeQRTp8+jdls5i9/+QvDhg1T8RMIIVriwpgRN0uQhBBCbdu3b+fkyZOsWrWKo0ePkpKSQlpaGgAGg4G3336br7/+Gm9vb+bPn8/u3bs5duwY/v7+fPjhhxw5coSUlBQ++eQTlT+JEKK58lVoEgmSIAkh2oCtW7cyadIkAHr16kVpaSkGg4GgoCB8fHzw8fGhvLycgIAAjEYjoaGhXHfddVxzzTUAREREUFxcrOZHEEK0kFSQhBCiAXq9nv79+9tvR0ZGkp+fT1BQEL6+vtx7771MmjQJPz8/ZsyYQffu3Wu9/t1337UnS0KItkWtCpKcYhNCuD1FUS66beuoazAYeOONN1i/fj0bNmxg9+7dHDx40P7c1NRUfvnlF+69916XxiyEcIx8QyU6rRchfq6t6UiCJIRw
ezExMej1evvts2fPEhUVBcCxY8fo3LkzERER6HQ6hg0bxv79+wFIS0vj22+/5bXXXsPHx3UznIQQjpNfVt0k0pVjRkASJCFEGzB27FgyMjIAOHDgANHR0QQFBQEQHx/PsWPHqKioQFEU9u/fT7du3Th9+jQrV67klVdewdfXtaV5IYTj6A0mlzeJBNmDJIRoA4YMGUL//v1JTk5Go9GwZMkSVq9eTXBwMJMnT+aOO+5g3rx5aLVaEhISGDZsGM8//zzFxcXceeed9vd5++230elc/xetEKLl8ssqiQ/zc/l1JUESQrQJCxcurHW7b9++9n9PTk4mOTm51uMPPvggDz74oEtiE0I4j95QyRWdQl1+XVliE0IIIYRbslgVClQYVAseUEGaNWuWatceN26catcGyM3NVfX6w4cPV/X68fHxql5fCCGEcxWVm7Aqru+BBFJBEkIIIYSbUqsHEkiCJIQQQgg3pVYXbZAESQghhBBuSipIQgghhBB1XKggub49hyRIQgghhHBL+WWV+Pl4EeTr+jNlkiAJIYQQwi1Vd9F2/ZgRkARJCCGEEG7KNodNDZIgCSGEEMIt6Q2VqpxgA0mQhBBCCOGm8sskQRJCCCGEsDNbrBSWm2SJTQghhBDCprDchKJABxWO+IMkSEIIIYRwQ2o2iQRJkIQQQgjhhvQGE6DOmBGAJnVeevbZZ9m5cydms5m77rqLKVOmODsuIYQQQrRjaleQGk2QfvrpJ44cOcKqVasoKirid7/7nSRIQgghhHAqNQfVQhMSpOHDhzNo0CAAQkNDMRqNWCwWtFqt04MTQgghRPuUX1ZJgE5LoApjRqAJe5C0Wi0BAQEApKWlMX78eEmOhBBCCOFUajaJhCbuQQLYsGEDn3zyCe+8844z4xFCCCGEUHXMCDTxFNv333/Pv/71L958802Cg4OdHZMQQggh2rnqCpI6PZCgCQlSWVkZzz77LG+88QZhYWGuiEkIIYQQ7ZzaFaRGl9jWrVtHUVERDzzwgP2+5cuXExcX59TAhBBCCNE+VVmsFJVXufcepFmzZjFr1ixXxCKEEEIIQcH5JpFuvwdJCCGEEMJV1O6BBJIgCSGEEMLNqN1FGyRBEkIIIYSbyT9fQeogFSQhhBBCiGq2CpIssQkhhBBCnKc3VBLk642/Tr3JHeoMOBFCiGZatmwZe/bsQaPRsHjxYvuMSIDU1FQ+//xzvLy8GDBgAI888ghVVVU8/PDD5OTkoNVqefrpp+ncubOKn6Bp0jOzeS7jEDnFRuLC/FmU1IeZCfFqhyWES6ndAwkkQRJCtAHbt2/n5MmTrFq1iqNHj5KSkkJaWhoABoOBt99+m6+//hpvb2/mz5/P7t27+e233wgJCWHFihV89913rFixghdeeEHlT3Jp6ZnZpKzeh7HKAkB2sZGU1fsAJEkS7YraXbRBltiEEG3A1q1bmTRpEgC9evWitLQUg8EAgI+PDz4+PpSXl2M2mzEajYSGhrJ161YmT54MwLhx49i5c6dq8TfVcxmH7MmRjbHKwnMZh1SKSAh1uEMFSRIkIYTb0+v1hIeH229HRkaSn58PgK+vL/feey+TJk0iMTGRwYMH0717d/R6PREREQBotVq8vLwwmUyqxN9UOcXGZt0vhKfSG0yqbtAGWWJr06ZMmaLq9VeuXKnq9UX7oSjKRbc1Gg1QvcT2xhtvsH79eoKCgrjttts4ePDgJV/jruLC/MmuJxmKC/NXIZr2RfZ+uY9Ks4USo7pjRkAqSEKINiAmJga9Xm+/ffbsWaKiogA4duwYnTt3JiIiAp1Ox7Bhw9i/fz8xMTH2KlNVVRWKouDj46NK/E21KKkPOm3tv5b9fbQsSuqjUkTtg23vV3axEYULe7/SM7PVDq1dcocxIyAJkhCiDRg7diwZGRkAHDhwgOjoaIKCggCIj4/n2LFjVFRUoCgK+/fvp1u3bowdO5b169cDsGnTJkaOHKla/E01MyGeWcMvnLQL9fPh
6RsGSiXDyWTvl3txhx5IIEtsQog2YMiQIfTv35/k5GQ0Gg1Llixh9erVBAcHM3nyZO644w7mzZuHVqslISGBYcOGYbFY2LJlC7feeis6nY5nnnlG7Y/RJPHh1ctpMSG+9I8LleTIBWTvl3uxzWFTu4IkCZIQok1YuHBhrdt9+/a1/3tycjLJycm1Hrf1PmprcouNBPt6M6lfDOmZ2VRZrPhopdjvTO1x75c777m6UEGSY/5CCCHOyympoGOYH2N6RnHOZGFfdonaIXm89rb3y933XNkqSGovsUmCJIQQbiS3xEjHUH9G9ahuUbD1WIHKEXm+mQnxXBYThO2MY6i/Z+/9cvc9V3qDiWA/b/x81BszApIgCSGEW8ktriAuzI/IIF/6dQxhyzF94y8SrZJdbOTX3FLuntATjQZuG9PNY5MjcP89V+7QJBIkQRJCCLdRUWWh4JyJjqHVe1/G9Ixkx4kiKur8ti8ca9X2UyjAnJFdCPP3oeicezcUba2OoX713u8ue67yDZWqL6+BJEhCCOE28koqgAs/qMb0jKTSbGXXqSI1w/JoVRYrK38+zdV9oukUHkB4oI7Ccs9OkG4Y0umi+9xpz5VeKkhCCCFqsi1xxJ3/DX9E9wi0XhrZh+REG389w9mySuaM7AJARICOQoNnJ0h7s0sI8vUmNqT6z1mATutWe67yDZV0kAqSEEIIm5zzFaSO5ytIwX4+DIwPZYskSE6Tuu0UcaF+TOgTDUB4oI4iD64g7csq4X+H87nn6p78tHgiycM7oyhwdd9otUMDqpeZyyrMUkESQghxQe75ClLNPSJje0Wy53QxhkqzWmF5rBP6c3x/RM+tI7qg9ao+wxYZqKPQg/cgvf7dUYJ9vfn9qK4AzBnZFWOVxQ2P+KvbAwmakCAZjUbuv/9+fv/733PzzTezadMmV8QlhBDtTk5JBRGBulrHm8f0jMJsVfj5RKGKkXmmj7afQuulqTXexVZBqjvs2BMcyzfw1f485o3pSohf9VzCgZ1CGdQplNRtJ93iM9uaRLaJCtKmTZsYMGAAH3zwAS+88EKbadcvhBBtTXUPpNonjIZ2DUen9WLLUTnu70iVZgsf7zjNlMtjiA658J1HBOiosiiUeWDF7l+bj6HTenH72O617p8zsguHzxjYcVL9wwD68/u/2sQptunTp/PHP/4RgNzcXGJiYpwelBDCcy1fvpxffvlF7TDcUm5xhf2Iv42fj5YhXcNkH5KDrd+fR1F5FXNGdq11f3hg9dKOpx31zy42siYzm+ThnS9KPq69Io5gX29SfzqpUnQXtKkKkk1ycjILFy5k8eLFzoxHCOHh+vXrx5tvvsnNN9/Mq6++yqlTp9QOyW3klBiJC7u4R82YnlEcyC31uB/aakr96RTdIgMY0zOy1v2R5xMkT9uH9Ob/jgPwx/E9LnosQOfNDUPiWbcvT/XPbduDFBnYhhKklStX8vrrr7No0SK3WKcUQrRN111+4CJHAAAgAElEQVR3HS+88AIffvghAwcO5KGHHmLWrFmsWbOmXf/dUlZRRVmFud5mfWN6RqIosO03qSI5wuEzZWw/UcjskV3w8tLUesxeQXLwSbb0zGzGPvMt3R/+krHPfOvSTdEFhkpW/nyK6wfH0yk8oN7nzBnVFZPFyic7T7ssrvrkl1USFuCDzlv9M2SNRrB//35yc3OB6t/8LBYLhYWyWVAI0XK7d+/m2WefZcWKFVxxxRX85S9/ISsriwceeEDt0FSTazviX0+X4ys6hxGg08oym4N8uO0UOq0XNw3tfNFjEQG2ClKVw66n9nDY/245QaXZyp8mXFw9sukdE8yIbhF8uO0UVqt6v6jo3aSLNjQhQdqxYwfvvPMOAHq9nvLycsLDw50emBDCMyUlJfGf//yH0aNH8+mnn/Loo48ydOhQ7rvvPioqKtQOTzX2JpH1VJB8tF6M6B7Bj7JRu9XKTWY+3ZXF9IGxRARefJQ8PLD6dFfhuUqHXVPN
4bBlFVX8d8sJki6PpVd08CWfO2dUF04UlKuaiOeXuUeTSGhCgpScnExhYSGzZ8/mzjvv5LHHHsPLS/3SlxCibVq1ahV33HEHiYmJeHt7s3XrVvvS2htvvKFydOq5VAUJqpfZjuWf40xp+00iHeGLPbmUVZiZM6prvY8H+Xrjo9U4tIKk5nDY1G2nKKswc8/VPRt97tQB1Ulj6jb1NmvrDZVEucEGbQDvxp7g5+fHihUrXBGLEKIdWLZsGdHR0QwaNAiAn3/+mfT0dJYvX65yZOrKLTai0UBMSEMJUhQAW48VuM1IiLYoddtJescEMaxr/SshGo2GiECdQzfEx4X5k11PMuTs4bAVVRbe+v43xvWKYlCnsEaf7+ut5eahnXjrh984U1rR4J9FZ8ovq3SLJpEgnbSFEC6Wk5PDwoUL7bf//Oc/k5OTo2JE7iGnpILoYF98tPX/tdyvYwih/j5sOSbLbC21L6uEPVklzBnZFY1G0+DzwgMcO7B2UVIf/Gs0/wTQAAsmXeawa9QnbWcWekNlk6pHNreO6ILFqvDxz67frF1uMnPOZHGLI/4gCZIQwsU0Gg2bN2+mpKSEoqIivvrqK7y9Gy1me7zqJpENVxS0XhpG9YiQjdqt8OH2k/j7aPndkEtX4CIcPG5kZkI8f585oNb7K8CRswaHXaMus8XKv/93jMGdwxjdI7LxF5zXLSqQKy+L4qPtp7C4eLO2vsx9mkSCJEhCCBdbvnw5X375JbNnz2bevHn88MMP0qGf6iaR9fVAqmlsryiyioycKih3UVSeo7Siis9253DdFXH2MRsNCXfwEhvAlb07APDEdf3Z9bfJzBnZhTf+d9xpG++/2JvL6UIj90zoeclqWX3mjOxCTkkFmw+ddUpsDck3uE+TSJAESQjhYnFxcTz33HN8+eWXrF27lscff5yXX35Z7bBUpSgKOY1UkAB7U0NZZmu+zzKzKTdZmDOqS6PPjQx07BIbQMH5U3GR5/fXPDrjcnp0COShj/c4PBmzWhVe23yU3jFBTOrX/OkXE/vFEB3sS+o21zZxtXfRlgqSEKI9SktL48orr2TAgAEMHTqU4cOHYzA4b6mhLSgqr6Kiytropt2eHYLoEOwry2zNpCgKqdtOMTA+tEmblcMDdJQYqzBbrA6Loe7ykb9Oy0vJCRScq2Txmn0ObZK68eBZDp8x8KcJPS9qhNkUPlovkod3ZtOhs2QVua5aqZcKkhCiPVu1ahUbNmwgISGBnTt3smLFChISEtQOS1X2HkgNHPG30Wg0jOkZyZZjBe2663hz7TpVxMG8MuaMbLx6BOf3CClQbHTcUX/bD/+a+2sGxIeycEofvtqfR9qOLIdcR1EUXt10lE7h/lw7KK7F7zNrRBc0wMrtrtusbasg1defSg2SIAkhXMrX1xdfX1+qqqqwWq1MnDiRDRs2qB2Wquw9kJpw7HtMz0j0hkqOOnGDr6dJ/ekUwb7eXHtF0xIGZwystVdH6iwf/fHKHozpGcnja3/hN/25Vl9n6/ECdp8u5q6reuLdwInIpogP8yexbzQrfz5NlQMraZeiN1QSEahr8CSnq7lHFEKIdmPgwIF88MEHjBs3jttuu41Fixa16w7aUH2CDRqvIMGFfkiyzNa49MxsRi3byOrMbKyKwjcHzjTpdRfGjTgyQTLho9UQ4l/7xKaXl4YVt1yBj9aLB1ZmtjgZsc16m/3mNrw04OeAWWZzRnZFb6hs8vfWWu7UAwma0ChSCCEcaf78+YSFhaHT6Rg5ciRFRUWMGTNG7bBUlVNcgY9W06TjzZ0jAugc4c+PR/XcNqab84Nro2zzz2wjPs6ZLKSs3gfQaKPNCCcMrNUbKokM9K33RFnHUH+euWEgf0rdxYsbjrAwqU+z3rvuZ7Uq8Nhnv+Cj9WpVU9HxvTsQH+ZP6raTTB/YscXv01R6Q6Xb7D8CqSAJIVzswQcfRKer/gE0fPhwpkyZQlBQkMpRqSu3xEhMiF+TN9SO
6RHFT8cLXN6npi1pzfwzW4LkyHEj1SM0Gq6OTBvYkVuGdeLVzUfZdrx51UFnzXrTemmYPbILPx4t4Hi+85d0891oUC1IBalV/va3v6l6/X79+ql6/VmzZql6fdE2dejQgeTkZAYOHIiPz4V+NH/5y19UjEpducUVxDVyxL+mMb0iWbXjNAdyShnYKdSJkbVdrZl/Fhbg+IG1BQZToz/8l1zbn+2/FfLgx3tYd/+VhPpful+Txarw0/GCeseYgGNmvd08rBP//OYwH20/xSMzLm/1+12KvszkNkf8QRIkIYSLjR8//qL7mtLIbtmyZezZsweNRsPixYvts9zOnDlTa3TJ6dOneeihhxgxYgSLFy/GZDJhtVpJSUlhwIABDb29qnJKjAxtYDZYfWydkbcc00uC1ICOoX7klFy8t60p88/8fLQE6rQOryD1jgm+5HMCfb15ITmBG1/fwqPp+3kpefBF/28oisLerBI+253D2r055JdVogHqqyU6YtZbdLAfSf1jSduZxUNT+uBXZ2SKo5yrNGOssrjNoFqQBEkIoYLmdvbdvn07J0+eZNWqVRw9epSUlBTS0tIAiImJ4f333wfAbDYzd+5cEhMTeeWVV5g8eTLJycns2rWLf/7zn7z99tsO/yytZbEq5JVUNOuHWXSIH72ig9hyrIC7rmr6nK32ZEjXcHL25ta6z99Hy6Im7u+JCNI5bA+SoijVFaRLLLHZDO4cxoJJl/GPrw/zw5F8isuriAvzZ97orpwzWfh8dzYnCsrRab24um8HrrsinnOVZpZ8/kutZbbmfNbGdI0M4Mt9VfT923riw/xZlNTH4QOT3a1JJEiCJIRwscOHD9v/3Ww2s2fPHi677DJmzpzZ4Gu2bt3KpEmTAOjVqxelpaUYDIaL9i6tWbOGpKQkAgMDCQ8Pp7i4GIDS0lLCw5teoXElvaESs1Vp0gm2msb0jCRtRxYmsxWdA04seZKzZRVsPpTPgLgQisqryCk2EtfMH+wRAY6bx1ZqNGOyWJv8wz8u1B8vTXUDUYDsYiNPf3UQDdXLq/dM6EXSgNhaS3A6by+eyzjUos96KemZ2bzzw2/229nFxiZvdm8Oe58oqSAJIdqrv/71r7VuWywW/vznP1/yNXq9nv79+9tvR0ZGkp+ff1GClJaWxjvvvAPAH/7wB2666SbS09MxGAx89NFHDvoEjmXbJ9LYmJG6xvSM4r2tJ9mTVczwbhHOCK3Nev7rw1RUWXjp1gR6dGjZAYDwQB0FBsckSPpzFzeJvJQV3xymvv33MSF+pP6/UfW+ZmZCvMOrOlC9AbzCXLv1gG0DuCOv544VJPm1QwjhUkajsdY/2dnZHD9+/JKvqds1WlGUi5bpMjMz6dGjhz1peuutt5g2bRrr169n6dKlLF++3LEfxEEuNIlsXgVpVI8INBrYclT6IdX0S04Jq3ac5rYx3VqcHIFjK0j6stpz2BrT0ObqM6Wu7xfWms3uzXGhgiR9kIQQ7dSMGTPQaDT2JCc4OJj58+df8jUxMTHo9RcGtJ49e5aoqKhaz9m8eTOjR4+23961axcPPPAAAGPHjuWJJ55w4KdwnAtjRppXQQoL0NE/LoQtx/TcP+kyZ4TW5iiKwtIvDhDm78OfE1v3nYQHOm4Pkt5Qew5bY+LC/Os9meaITdfN5YpY0jOzeXZ9dUuCma/8yF+m9nVKNay5pIIkhHCpb7/9lnXr1vHtt9+yceNG3n//fW6++eZLvmbs2LFkZGQAcODAAaKjoy9aXtu3bx99+/a13+7atSt79uwBYO/evXTt2tXBn8Qxcksq8PPxsh8tb44xPaPIPFWM0WRp/MntwNcHzvDT8UIWTO5NaAu+z5oiAnWUmyxUVLX+u61vDtulLErqg3+d02KO3HTdHPXFotN6OSwWW5PLskozADklFaSs3kd6ZrZD3r81JEESQrjUe++9x/3332+/vWjRIt57771LvmbIkCH079+f5ORkli5dypIlS1i9
ejXffPON/Tn5+flERkbab991111s3ryZuXPn8uKLL5KSkuL4D+MAuSVG4kL9m32yD2B0z0hMFis7ThY6IbK2pdJsYdm6X7ksOojZI5o2lPZSHNlNu8BQiUbT9CGsMxPiefqGgcSH+aOhei7a0zcMVKWqUjMWAA0wvFu4w2JxVpNLR5AlNiGES61bt44PP/zQfvv111/n1ltvZd68eZd8Xc1eR0CtahHA2rVra92Ojo7mzTffbGW0zpdTXNHs/Uc2I7pF4O2lYcuxAq68rIODI2tb3ttykpMF5fz39uGtGtJqE35+HluBwdTsDfR15RtMRATo0DaxUzo4b9N1S9SM5f8+3MWPR/VUWawOGSrrqj1OLSEVJCGES5nNZkpLS+238/PzVYxGfbYKUksE+nrTOSKAt74/TveHv2TsM9+6xdKEqxUYKnnp2yNM6NOBCX2iHfKejqwg6d1shEZrzBwcT1F5Ff877Jj/bxvay6TGfqu6pIIkhHCpBQsWMGvWLHx9fbFarVitVh577DG1w1KFyWzlbFklHVv4wyA9M5vTheWYz58Jd1aPGnf3zw2HKTdZeHSG48YvXZjH5qAEyY1OZ7XG+N4dCAvwIX13DhP7xbT6/R6c3JuFaXtqdQJXa79VXZIgCSFcauzYsXz++eecO3cOLy8vtFotwcGXHsHgqc6UVqAoNLtJpM1zGYfsyZGNM3rUuLNDeWV8uO0Uc0d1pVe04/4c2StIDkiQCgwmunQJa/X7uAOdtxczBnbk011ZGCrNBPm2Lo0I8fdBobqtQlG5yaFNLlurSUtsFRUVTJw4kdWrVzs7HiGEh3v33Xe5//77iYiIICwsrEmbtD3VhR5ILasgufP+DVdQFIWnvjxAkK83D0zq7dD3DvX3QaNxXAUpMtAzltigujpZUWXl61/yWv1eqdtOEhPiy/ZHJvLbMzP48eFEt0iOoIkJ0uuvv05YmGdkv0IIdX311Ve89tpr9tuvv/4669atUzEi9eSW2HogtayC5M77N1xh06GzfH9Ez/2TehPexBNiTaX10hDm70NhK/cglZvMlJssHrPEBjC0SzjxYf6k785p1fucLiznu8P5JA/v4pCN9Y7WaETHjh3j6NGjTJgwwQXhCCE8nWzSviCnuHUVJHfql+NqVRYrT335Kz2iApk7yjk9rsIDdRSdq2rVe+jLmtcksi3w8tJw/eA4fjiSbx8R0hIfbT+FBkge0dlxwTlQo4uHy5cv529/+xvp6emuiEcI4eEefPBB+yZtRVGwWCzccccdaoelitwSI8F+3i3ex2Fbinh2/UFySirw9fZSrV+Oq33w00mO55/j7duGOW1Yb2Rg68eN2OawudOMMUeYmRDPa5uP8cXeHG4f273ZrzeZrXy84zQT+8W0uo2Cs1zyT1V6ejqDBw+mc2f3zO6EEG1PcHAwffr0wWKpbg4XEhLCv/71L5WjUkdOcUWLj/jbzEyIZ0vKRO6+qidmq8K4y6Iaf1EbV1xu4oUNRxjXK4rEvo451l+f8IDWjxtp7hy2tqJ3TDD9Ooa0eJnt6wN56A0m5oxsfVNPZ7lkgrR582Y2btzILbfcQlpaGq+99hpbtmxxVWxCCA/01FNPMXv2bAICAnj66acZOXIkixcvVjssVeSWGIlrYZPIumYmxGGxKny5N9ch7+eO0jOzGfvMtwx+8htKjFWM6RnZog7kTRURqKOgtRWkZs5ha0tmDo5jz+liftOfa/ZrU386Radwf8a7cYPTSyZIL7zwAp9++ikff/wxN998M/fccw9jxoxxVWxCCA/k5+fHqFGj8PHxYcCAASxYsIAPPvhA7bBUkVNsbPH+o7r6xobQNzaY9N2e2SjSNrOr5uDUl7896tTGmNV7kEwoitL4kxtgm8PmaRUkgOsGx6HRwGfN/DN39KyBrccLmD2yC17N6C7uau63bVwI4dH8/f3ZuHEjnTp14vnnnyctLY3cXM+tejTEaLJQVF7V4hNs9bl+cDyZp4o5
WdD83+jdnRozuyIDdZitin2QaksUGCoJ8fPG11vb+JPbmI6h/ozsHsFnu3OalUR+tP0UPloNNw917+07TU6Q7rvvPm644QZnxiKEaAf+8Y9/0LNnTx577DF0Oh2HDh1i+fLlaoflcrYj/o7coHrd4DgAPmvl8Wt3lK1CzyfbPLbWNIvUG0weubxmM3NwPL/pz7E3q6RJz6+osvDJziyS+sfSIdi9vxfppC2EcKmgoCCCgoIA+L//+z+Vo1HPhSaRjqsgxYf5M6J7BOm7s7kvsZdT9+e4itWq8O7WEw0+7syeT7Zu2gXnTHSNDGzRe+R70By2+kwb2JHHPvuF9N3ZXNG58X6JX+7NpcRYxZyRzmnN4EiyxCaEECqwVT5ae4qtrpmD4zmef4792aWNP9nNnSmt4A///Zkn1h7g8o4h+NU5zu/snk/hDhg34klz2OoT6u/D1X07sHZPLmaLtdHnp247SY8OgYzqEeGC6FpHEiQhhFCBrYIU68A9SADTB8bio9W0+c3a6/fnMvWF/7H9twKemjmAL/88jmduHER8mD8aqqtlzu75FBHQ+oG1BR6+xAbVSbneUMmWYwWXfN6BnFJ2nSpmzsiubaK6KUtsQgihgtwSI5GBOvx8HLt5NyxAx4Q+0azdk8Pi6f3QuvEpofoYKs08ufYXPt6RxcD4UF5IHkzPDtVLsjMT4l3aBDPi/MmzlvZCMpmtlBirPGoOW32u7htNsJ836buzGd+74WP7H24/ia+3FzcOaRuNTKWCJIQQKsgprnDa/pmZg+M5W1bJ1kZ+o3c3O08WMf3F7/lkZxb/d3UvVt8zxp4cqSFQp0Wn9aKwheNGCs530fbkJTYAPx8t0wbEkrE/D6PJUu9zDJVm1uzK5ppBcYQFtI3vQypIQgihgpxiI92jWrbxtzET+0UT5Fv9G707d9ZOz8zmuYxDZBcbCfb1xmAyEx/mz6q7RjO8m/p7VDQaDeGBPhSea9m8MU+cw9aQmYPj+XhHFhsPnuGaQXEXPf757hzOmSzMGeW+nbPrkgqSEEKoILfEeRUkPx8tUwfEsn5/HhVV9f9Gr7a6jR/LKs14oeH/ru7lFsmRTXiArsUVJNsctvaQII3sEUlMiC/pmRe3mFAUhdRtJ+nXMYSEJpx0cxeSIAkhhIuVVlRhqDTT0cEbtGuaOTgeQ6WZjb+eddo1WqO+xo8WReHlb4+qFFH9IoNaPo/NNoctygO7aNel9dJw3RVxfHf4LMV1vq89WSX8klPKnJFd2sTmbBtJkIQQwsVyi209kJzXw2d0z0iig33d9jSbGo0fWyI8QNfiY/6ePIetPtcPjqfKovDlvtqd8VN/OkmgTuvSDfaOIAmSEEK4WE6JrQeS8ypIWi8N114Rx+ZDF/9G7w7C/H3qvd+ZjR9bojUDa/WGSvx9tAT6to/tvv3jQugVHcRnNZbZSsqrWLs3h+sT4glqY99D24q2Hhs3blTt2ocOOW8GUFOo+dmFEC3nigoSVC+zvf3Db6zbl8fske6zOfboWQPnTGa8NGCtMcLL2Y0fWyI8QEeJsQqzxYq3tnk1hQIPbxJZl0ajYebgOP7x9WGyisrpFB7A6swsKqqszB7hPn/+mkoqSEII4WK5JUa8NBDj5FlUA+JD6NEh0K2W2UxmKw+syiTI15sl1/Z3aePHlrCNGyk2Nn+jtt5g8vgeSHVdP7j6v9/ne3LOb84+xeDOYQyID1U5suZr8xUkIYRoa3KKK4gJ8Wt2RaK5qn+jj+f5bw6TXWwk3g2Wr57/5jD7s0t5Y+5QkvrHctuYbmqHdEkRNcaNNHcvkd5QSafwAGeE5bY6RwQwtGs4n2XmMLRLOEfPGnjupkFqh9UiUkESQggXyyk2OvUEW03XD67uSfP57ouPX7valmN63vjfMW4d0YWk/rFqh9MktgSpJeNG9IZKOrSjJTab7pGBHDpTxqx//4RGA4rS+GvckSRIQgjhYrklRqfvP7LpGhlIQpcwPlN5ma24
3MSDq/bQPTKQv13TT9VYmiO8hfPYLFaFwhZUndq69Mxsvth7IRlXFFjy+S+kZ7rPMm9TSYIkhBAupChKdZNIF1WQoHqz9sG8Mg7mlbrsmjUpisLiNfvQGyp5MTmBAF3b2d1hryA18yRgUbkJqwKRge2rgvRcxiEqzNZa9xmrLDyXoe6hppaQBEkIIVyo8JyJSrOVjqGu2w80Y1BHtF6aerscu8InO7NYty+Ph6b0YWCntrVZNzywuh1Bc3sh6Q22OWztq4LUUB8rd+tv1RSSIAkh2oRly5Yxa9YskpOT2bt3r/3+M2fOMHfuXPs/EyZMYO3atQC8/fbbXH/99dx4443s27dPrdBryS2pPuIfF+a6ClJUkC9XXhbF57uzsVpduyHkhP4cj3/+C6N6RHDn+B4uvbYj+HprCfL1bva4kfY0h62mhvpYuVt/q6aQBEkI4fa2b9/OyZMnWbVqFU899RRLly61PxYTE8P777/P+++/z3/+8x86duxIYmIiR44c4csvv+TTTz/lySefZNOmTSp+ggtsv0m7soIE1ctsOSUV/Hyi0GXXrLJYeWDVbrReGp6/ZTBar7YzZqKmlgysLWhHc9hqWpTUB38fba373LG/VVO0nYVgIUS7tXXrViZNmgRAr169KC0txWAwEBQUVOt5a9asISkpicDAQDZt2sS0adPw9vamf//+9O/fX43QL2KrIHV0YQUJYPLlMfj7aEnfncPIHpEuuebLG4+w+3Qxr84e0iYrCDYRAToKy5tXQcpvR3PYarL1sXou4xA5xUbiwvxZlNTH7fpbNYUkSEIIt6fX62slOJGRkeTn51+UIKWlpfHOO+8AkJ2dTUBAAPfeey8Gg4GUlBT69u3r0rjrk1NixEerIcrFDQQDfb2Z0j+GdftyeeK6/ui8nbuA8POJQl7ZdJSbhnZixqCOTr2Ws4UH6igwNHcPkgkfrYbQBkaqeLKZCfFtMiGqSxIkIYTbU+o0UlEU5aKp4JmZmfTo0cOeNCmKgtFo5JVXXmHnzp088sgjfPrppy6LuSG5xRV0DPXHS4XlpphgP0qMVfR+9CvinfCbfXpmNs9lHCK72IhWoyE8QMfj17lH5a41IgJ1HDljaNZr9IZKIgN929T0elGb7EESQri9mJgY9Hq9/fbZs2eJioqq9ZzNmzczevRo++2oqCiGDRuGRqNh2LBhZGe7Rx8WVzaJrCk9M5v3tp6w384uNpKyep/D+tOkZ2aTsnof2ef3WFkUBUOFmQ0Hzjjk/dUUEaCjqJnH/NvbHDZP1GiCtH//fsaPH28/IVJzc6QQQrjC2LFjycjIAODAgQNER0dftLy2b9++Wkto48eP5/vvvwfg2LFjdOzoHss8uSUVquzHcXZ/mucyDmGsstS6r9JibZP9b+oKD9RRbrJQUefzXUp7nMPmaRpdYisvLycpKYlHHnnEFfEIIcRFhgwZQv/+/UlOTkaj0bBkyRJWr15NcHAwkydPBiA/P5/IyAubjwcPHsz333/P3LlzMZlMPPbYY2qFb2exKuSVVqhSQXJ2fxpP6n9TV81xI01NbvWGSnrHBDszLOFkjSZI586dc0UcQghxSQsXLqx1u+6Ga1vvo5ruu+8+p8bUXPlllVisisvGjNQUF+ZvX/6qe39beH81NTdBUhSFAoNJltjauEaX2MrLy9m5cyf/7//9P+bMmcNPP/3kiriEEMLj5JRUJxCuHDNi4+z+NIuS+uBX52RcW+1/U5ctQWrqPqTSCjMmi5UO7awHkqdptILUt29f7r33XiZOnMhvv/3G7bffztdff41OJ5mxEEI0R27x+R5ILm4SCbX702QXV7caePqGgQ47xTYzIZ4jZ8t4ddMxAKecklNLcwfW2seMSILUpjWaIPXs2ZOePXsC0L17d6Kiojhz5gydO3d2enBCCOFJcm0VJBc3ibSx9ad5et2v/OfHEw7vT2RLCLY/MpHoYHU+ozPUXGJrCv35JpGR7axJpKdpdIntk08+
4b333gOqN0EWFBQQExPj9MCEEMLT5BRXEKDTqt48sG/HYEwWKyf0jt1jeiivjIhAncctLYX6+6DRNH1grd7QPueweZpGK0iTJ09m4cKFZGRkYDKZePzxx2V5TQghWsDWA0nt5oF9YkIA+DWvjMsceNLq17wy+sYGq/75HE3rVd30srCJe5Da6xw2T9NoghQaGsqbb77piliEEMKj5ZYY3eJUV8/oQLy9NBzKK4Ur4hzynlarwuG8MpJHeOb2i/AAH4rONW0em76sEo3mwtKcaJukk7YQQrhITok6PZDq8vXW0qNDIAdzyxz2nqcKyzFWWegXG+Kw93QnEYE6e2WoMfkGExEBOrQqjJMRjiMJkhBCuIDJbEVvqFTlBFt9+saGcDDPcQmS7b36xHpmc8TwAF3TK0iGSlle8wCSIAkhhAucKeXRPzoAAB4RSURBVK1AUdQ7wVZXn9hgsouNlFY07Yd+Yw7mlaLR4LHdoyODmrEHSeaweQRJkIQQwgVsIzfcpYLUr2N1InPYQVWkQ3lldIsMxF+nbfzJbVB1BcmEoiiNPldvMEkFyQNIgiSEEC6QW1LdJNJ9KkgXTrI5wsHzJ9g8VUSgDrNVobTC3Ohz9YZKGVTrASRBEkIIF7CNGXGXClJcqB/Bft7VJ9layWiycKLgnMfuP4IL3bQb64VUbjJTbrLIEpsHkARJCCFcIKfYSKi/D4G+jXZXcQmNRkPf2GCHnGQ7fKYMRane+O2p7N20G9mHVCBNIj2GJEhCCOECucXuccS/pr6xIRzKK2vSvppLOXR+mc7Tl9ig8QpS/vk5bJ7WTbw9kgRJCCFcIKekwi2aRNbUJzaYskoz2ec3kLfUr3ml+Pto6RIR4KDI3I8tQSpoJEGSOWyewz1qvW3Uli1bVL3+Pffco+r1hw0bpur1d+zYodq12/t3L5ovt8TIkC5haodRi+0k26G8MjqFtzy5OZRXRu/YYLw8uDFieBMrSDKHzXNIBUkIIZzMaLJQXF7ldhUkW8+i1jSMVBSl+gSbh/Y/sgnUadFpvZqwB0kqSJ5CEiQhhHCyCyfY3GsPUrCfD53C/VuVIOUbKik8Z6JvR89OkDQaDRGBuiZUkCoJ8fPG19sz+0G1J5IgCSGEk+UWV/dAcpcj/jVVn2Rr+VF/2yk4Tz7ibxMeqKOwkXEj0iTSc0iCJIQQTmarIMW72RIbVJ9kO64/R6XZ0qLXXzjB5rlH/G0iAn0obGRgbb7MYfMYkiAJIYQTpWdms/SLAwAk/3sr6ZnZKkdUW5/YYCxWhaNnDS16/a95pUQH+9pPeXmy8AAdReWXriDJHDbPIQmSEEI4SXpmNimr91F2fjxFTkkFKav3uVWSVPMkW0scyiujb0fPrx5B9VH/wiacYpMKkmeQBEkIIZzkuYxDGKtqL10Zqyw8l3FIpYgu1i0yEJ23V4s2apstVo6cNXh0g8iaIgJ1lBirMFus9T5uMlspMVbJHDYPIQmSEEI4SU4DDRgbul8N3lovLosOalGCdKLgHCaztV0lSECDy2y26pIssXkGSZCEEMJJGup75G79kPq08CTbr+3oBBvUGFjbQC8k/fkeSLLE5hkkQRJCCCdZOKU3dXtL+/toWZTUR5V4GtIvNoSzZZWN7q+p61BeGVovDb2ig5wUmXuxD6xt4HvKlwTJo0iCJIQQTtIlMhAFCPP3QUP1Mf+nbxjIzIR4tUOrxVYBOpjXvCrSwbxSekQFtpumiI0NrLXNYYuSLtoeQWaxCSGEk6RuO0mQrzc/PJxIkK/7/nXbt8ZJtjE9o5r8uoN5ZSR0CXdWWG7HXkFqYInNNshWKkieoUkVpM8//5zrrruOG264ge+++87ZMQkhxEWWLVvGrFmzSE5OZu/evfb7z5w5w9y5c+3/TJgwgbVr19of1+v1DB8+nG3btrk03uJyE1/szWVmQpxbJ0cAHYKq+xjZumI3RVlFFVlFxnazQRsgLMAHgEJDwxUkfx8tgW7+31s0TaP/
FYuKinj11Vf59NNPKS8v5+WXX+aqq65yRWxCCAHA9u3bOXnyJKtWreLo0aOkpKSQlpYGQExMDO+//z4AZrOZuXPnkpiYaH/ts88+S+fOnV0e8yc7szCZrcwe0dXl124ujUZTPXLkTNMTpMNnbB2020+C5OutJcjXu8EKkl6aRHqURitIW7duZfTo0QQFBREdHc3SpUtdEZcQQtht3bqVSZMmAdCrVy9KS0sxGC7u/LxmzRqSkpIIDAy0vy4wMJDevXu7NF5FUfhw2ymGdAnj8ri20USxT2wwh/PKsFqVJj2/vZ1gswkP9Gl4D5LBJD2QPEijCVJWVhaKovDAAw8we/Zstm7d6oq4hBDCTq/XEx5+Ya9LZGQk+fn5Fz0vLS2Nm266CQCTycSrr77KggULXBanzdbjBRzXn2POSPevHtn0iw3BWGXhVGF5k55/KK+MYF9vt5wv50wRgb4UNtAHSS9z2DxKkxZKz5w5wyuvvEJOTg7z5s1j06ZNaDR1D68KIYRzKIpy0e26fwdlZmbSo0cPgoKqj5z/+9//5uabbyYkxPUVnNRtpwj192HGoI4uv3ZL1TzJ1i0qsNHnH8wrpU9scLv7WRAR4GM/zl+X3mAioUuYiyMSztJoBSkyMpKEhAS8vb3p0qULgYGBFBYWuiI2IYQAqvcZ6fV6++2zZ88SFVX7tNXmzZsZPXq0/fYPP/xAamoqt9xyC5s3b+aJJ57gyJEjTo81v6ySjP153DS0E34+bef4e++YYDQamtRRW/n/7d1pVJNn2gfw/0MADSHKIiToqRtWULAtjr5zFEfbIuV0Ou0ZneGVUqnTsZ1xbD3TsbggWHvEFe02aB2PS20dOiIMMtra4unr0n5AqJZRUVmLiigQBAokYCTk/YCJbCoIeR548v99SnKS3FfygFzey3WZzcgrr7eefrMn7ipn1Og7zyCZWsyo1nMGSU4emiDNmDEDp0+fRktLC6qrq2EwGNpNdRMR2VpwcDAyMjIAAJcuXYK3t7d1psjiwoUL8Pf3t94/cOAADh48iIMHD+Lpp5/GmjVr8Pjjj9s81oNnStHcYkbkL0fafKy+pHRWYLSnqlsn2W783IT6pmb4aQfG/qq+5OHSdcPaGoMRLWbAU8VN2nLx0CU2jUaDsLAwLFiwAI2NjYiLi4ODA+tLEpF4Jk+ejICAAEREREAQBKxZswZpaWlQq9UIDQ0FAOh0Onh6ekoap6nFjH9lX8O0sZ7w9Rp41aX9tepuzSDl3y0oOcHONmgDgIerMxrvmNBoNEHpfG+G8FaDpQ8bZ5Dkolt7kCIiIhAREWHrWIiI7is6Orrd/bazRQDa1T7qaNOmTTaJqaPvCnW4XtOIlc/7P/zJ/ZCfVo1vLpbDYGyGi/P9/zxYTrCNt8cEqU0/NqXzvQ3q7MMmP5wKIiLqI0mnr2GYqzOem6iVOpRH4q8dArMZKKzoXEKhrfzyeoxwU2LIYCeRIus/3O/Tj40JkvwwQSIi6gM3ahtxPK8C/zvlMTg7Dsx/Wv272ZMtr7zOrgpEtnW/hrU69mGTnYH5W0xE1M8c+KEUZgAv/8/A2pzd1kgPFyidFA/ch3S72YSfdHq7PMEGAO5tltjauqU3wkkhYKjS/mbV5IoJEhFRL90xteBA9jXMGu+FxzxcpA7nkTk4CBivVT/wJFtxpR7NLWa7PMEG3Dul1mmJrf42PFWD7K4ulJwxQSIi6qX/u1yJyvrbA6py9v1M0KqRV17XqTinRX6F/Z5gA4AhSic4CF3vQWIfNnlhgkRE1EtJWVfhM3QwnvHzkjqUXvPTqlFjuGPdU9NR3s16OCsculVtW44UDgLcuqiFxD5s8sMEiYioF67e0uP7wipETB0JR8XA/yfV/+7S2f32IeWV12OctyucZPBZH5W7i1PnPUjswyY79vsTTkTUB77IvgaFg4B5Ux+TOpQ+8bCTbPZ8gs3CUzWo3QyS2WxGVYORS2wywwSJiOgR3W42IeXMdcye4A3t
0MFSh9Mn3FXO0AwZ1OUMUo3eiIq623Z7gs3CXeXUrh9bXVMzjKYWeHEGSVaYIBERPaJvcstRrTfKYnN2W37aIV2eZLMkTfZ6gs3CQ+WMW21mkCxFIj1ZA0lWutVqpD8LCQmRbGypm/Z++umndj2+lN9/fHy8ZGNT/5GUdQ0jPVwwY9wwqUPpUxO0anxafAvNppZ2+6rsuQdbW+4uzqgxGGE2myEIwr0+bJxBkhXOIBERPYLCinpkl1Qj8pcj4eAgr9o3flo1jKYWlFTp2z2eV14PdxcneNl5Q1YPlTNMLWbUNTUDYJsRuRrwM0hERGJKzynDlox8lNU2AgBcH9DUdaBqe5Ltcc292aK88nr4a4fYfTFES7uRGr0RQ5VOTJBkijNIRETdlJ5Thpi0C9bkCADWH72M9JwyCaPqe77eKigchHYn2VpazCioqIefnS+vAfca1lr2IVXV34YgtB7/J/lggkRE1E1bMvLReMfU7rHGOyZsyciXKCLbGOSogK+XCvltTrKV1hhgMJowwc5PsAGAh8u9GSQAqNIb4eHiLIs6WHQPryYRUTfdaDNz1J3HBzI/7RBcbnOSzXLb3k+wAfeW2KoN92aQuLwmP0yQiIi6abibskePD2T+WjXKahtR19Ra7ye/vB6CAIzXuEocmfTa7kEC2IdNrpggERF107IwPyidFO0eUzopsCzMT6KIbMdSLbvg7jJbXnkdRnm4wEWGm9J7ysVZAWdHh3szSOzDJktMkIiIuum3QSOwce4kjHBTQgAwwk2JjXMn4bdBI6QOrc/5+7TvyZZ/9wQbAYIgwMPFGdV36x+xD5s88b8CREQ98NugEbJMiDoaPnQw1IMdkVdeh0ajCSW39HjxyeFSh9VvuKtai0U2Gk3QG01cYpMhziAREVEngiDAX6tGfnk9CivrYTaDJ9ja8FA5oVpvZA0kGWOCREREXfLTqpFXXo/LN+vu3ucSm4WHahBqDHegsyZInEGSGyZIRETUJX/tENQ3NeNEng5KJwVGerhIHVK/4eHihFsNt9mHTcYeugcpJSUFhw8ftt7Pzc1FTk6OTYMiIiLpWU6yHc+rxAQfNRQy6znXG+4qZ9Q1NaO8rgkAEyQ5emiCFB4ejvDwcABAdnY2vv76a5sHRURE0ht/N0Eymlp4gq0DSy2koorWU36eXGKTnR4tsW3fvh2LFy+2VSxERNSPHL9cCcXdxrTfXCyXXc+53rAkSAUVDVAPdsQgR8VDXkEDTbcTpPPnz8PHxwdeXl62jIeIiPoBS2Nek9kMAPi58Q5i0i4wSbrL0o+tsLIBXlxek6VuJ0ipqamYM2eOLWMhIqJ+wl4a8z4q97szSFUsEilb3U6QsrKyEBQUZMtYiIion7CnxryPwrLEBoBFImWqW5W0KyoqoFKp4OzMHwIiksaGDRtw7tw5CIKAVatW4YknngDQ+u9TdHS09XmlpaV455138PzzzyM2NhalpaVobm7G8uXLMWXKFKnCH3CGuylR1kUyJMfGvI/CzcXJept92OSpWwmSTqeDh4eHrWMhIupSdnY2rl69iuTkZBQVFSEmJgYpKSkAAI1Gg/379wMAmpubERUVhWeffRb/+c9/oFQq8cUXX6CwsBAxMTFITU2V8mMMKMvC/BCTdqHdMptcG/M+ikGOCqgHOaL+djOX2GSqWwlSYGAgdu/ebetYiIi6lJmZidmzZwMAxo0bh7q6OjQ0NMDV1bXd8w4dOoSwsDCoVCq89NJL+M1vfgMA8PDwQG1trehxD2SWfnNbMvJxo7YRw92UWBbmZxd96LrLXeXcmiBxiU2W2KyWiPq9qqoqBAQEWO97enpCp9N1SpBSUlKwd+9eAICT070lkM8++8yaLFH32Utj3kflrnLGtWoDZ5Bkiq1GiKjfM989at72viC0r+qck5ODsWPHdkqakpKScPHiRbz55ps2j5PsR3pOGfLu9qiLPcTyB3LEBImI+j2NRoOqqirr/crKSgwbNqzdc06e
PIlp06a1eywlJQXHjx/HJ5980m5Giag3LDWibje3AACqGoysESVDTJCIqN8LDg5GRkYGAODSpUvw9vbuNFN04cIF+Pv7W++XlpbiwIED2LZtGwYN4hII9R3WiLIP3INERP3e5MmTERAQgIiICAiCgDVr1iAtLQ1qtRqhoaEAWk/benp6Wl+TkpKC2tpa/OlPf7I+tmfPHpYroV5jjSj7wASJiAaEtrWOALSbLQKAI0eOtLu/dOlSLF261OZxkf1hjSj7wCU2IiKiHlgW5gelU/vmtKwRJT+cQSIiIuoB1oiyD0yQiIiIeog1ouSvVwmSydS6i7+8vLxPghloHBykXaF0dLTv/FbK71/qn/mOdYHEZPnslt9/IiI56tVfWJ1OBwB45ZVX+iSYgabtiRl7HN+eRUZGSh2C5HQ6HUaNGiV1GERENtGrBCkwMBBJSUnw8vKCQqF4+AuIaMAzmUzQ6XQIDAyUOhQiIpvpVYI0ePBgTJkypa9iIaIBgjNHRCR3POZPRERE1IGkCdKGDRswb948RERE4Pz586KPX1BQgNmzZ+Of//yn6GMnJCRg3rx5+N3vfodjx46JOnZjYyP++te/Yv78+QgPD8eJEydEHR8AmpqaEBISgrS0NFHHzc3NxcyZMxEVFYWoqCjEx8eLOj4AHD58GC+99BLmzp2LU6dOiTp2SkqK9bNHRUUhKChI1PGJiAYKyY5BZWdn4+rVq0hOTkZRURFiYmKQkpIi2vgGgwHx8fGdmluK4fTp0ygsLERycjJqamowZ84cPPfcc6KNf+LECQQGBuKNN95AWVkZ/vjHP+KZZ54RbXwA2LFjB9zc3EQdE2i97mFhYYiNjRV9bACoqanB9u3b8e9//xsGgwGJiYmYNWuWaOOHh4cjPDwcQOvv4Ndffy3a2EREA4lkCVJmZiZmz54NABg3bhzq6urQ0NDQqQGlrTg7O2PXrl3YtWuXKOO1NXXqVDzxxBMAgKFDh6KxsREmk0m0je6//vWvrbdv3rwJjUYjyrgWxcXFKCoqwtNPPy3quACg1+tFH7OtzMxMTJs2Da6urnB1dZVkBsti+/bt2Lp1q2TjExH1Z5ItsVVVVcHd3d1639PT01o2QAyOjo4YPHiwaOO1pVAo4OLiAqB1yWPmzJmSnAKMiIhAdHQ0Vq1aJeq4mzdvxsqVK0Ud08JgMODs2bN4/fXX8corr+D06dOijn/9+nWYzWa8/fbbiIyMRGZmpqjjW5w/fx4+Pj7w8vKSZHwiov5OshmkjoXuzGYzBEGQKBppfPvtt0hNTcXevXslGf/AgQO4fPkyli1bhsOHD4vy/aenp+Opp57CY489ZvOxuuLv748333wTISEhKCkpwWuvvYZjx46J2uG9oqIC27Ztw40bN/Dqq6/ixIkTov/sp6amYs6cOaKO2R/Ze7FbInvTk0K3kiVIGo0GVVVV1vuVlZUYNmyYVOGI7vvvv8c//vEP7N69G2q1WtSxc3Nz4enpCR8fH0yYMAEmkwnV1dWiFJ48efIkSktLcfLkSZSXl8PZ2RlarRbTp0+3+dgA4OvrC19fXwDAmDFjMGzYMFRUVIiWsHl6eiIoKAiOjo4YOXIkVCqVaN99W1lZWYiLixN1zP7I3ovdEtmr7hS6lSxBCg4ORmJiIiIiInDp0iV4e3uLtv9IavX19UhISMC+ffsk2ah85swZlJWVITY2FlVVVTAYDO2WO23po48+st5OTEzEiBEjREuOgNaZE4PBgFdffRU6nQ63bt0SdQ/WjBkzsHLlSrzxxhuora0V9bu3qKiogEqlEnXWrL9isVsi+9KTQreSJUiTJ09GQEAAIiIiIAgC1qxZI+r4ubm52Lx5M8rKyuDo6IiMjAwkJiaKkrAcPXoUNTU1ePvtt62Pbd68GcOHD7f52EDr3qPY2FhERkaiqakJ7777ruR95cQSGhqK6OhoZGRkwGg04r333hM1UdBoNAgLC8OCBQvQ2NiIuLg40b97nU4HDw8P
Ucfsr1jslsj+dLfQrWCWsuslERERUT9kH9MGRERERD0g2RIbEdFAtWHDBpw7dw6CIGDVqlXWumZyk5ubi8WLF1uXJMaPH4/Vq1dLHFXfKygowOLFi/GHP/wB8+fPx82bN7F8+XKYTCZ4eXlhy5Ytstmz1/GzxsfHIycnByqVCgCwcOFCSWrU2UJCQgLOnj2L5uZm/PnPf8akSZN6dF2ZIBER9YDUXQDEJHXleTF01VXh73//OyIjI/H8888jISEBqampiIyMlDDKvtHVZzUYDFi/fj0mTJggYWR9r6uOFdOmTevRdeUSGxFRD9yvC4AcSV15XgyWrgre3t7Wx7KyshASEgIACAkJkayga1/r6rPK9RpPnToVH3/8MYB7HSt6el2ZIBER9YDUXQDEJHXleTF01VWhsbHRuvTi5eUlm+vb1WfV6/XYtm0boqKiEB0djdraWomi61tddazo6XVlgkRE1AP21AXAUnl+9+7dWLduHVauXAmj0Sh1WDbX9nrK/aC3peXU/v374evri8TERKlD6lOWjhXvvvtuj68rEyQioh6wpy4Avr6+1iWJtpXn5U6pVKKpqQlAa2HVtktSchMaGooxY8ZYb+fn50scUd+xdKzYtWsX1Gp1j68rEyQioh4IDg5GRkYGAMi+C0Bqaio+//xzAJCk8rxUpk+fbr3Gx44dw69+9SuJI7KdRYsW4caNGwBa9149/vjjEkfUNywdK3bu3GktAN3T68pCkUREPbR161acOXPG2gXA399f6pBs4ueff0Z0dDQMBgOMRiPeeustzJo1S+qw+lTHrgoajQZbt27FypUrcfv2bQwfPhwbN26Ek5OT1KH2Wlef9eWXX8aePXvg4uICpVKJjRs3it4b0haSk5ORmJhonR0DgE2bNiEuLq7b15UJEhEREVEHXGIjIiIi6oAJEhEREVEHTJCIiIiIOmCCRERERNQBEyQiIiKiDpggERFRv3b9+nXMnTvXJu9dXV2NF154Ae+//75N3t/iL3/5i03fn/oeEyQiIrJbxcXFGDVqFN555x2bjrNjxw6bvj/1PdZBIiKiPpeWloazZ8+iuroaJSUlWLhwIcLDw/Hss8/iyJEjUKlU2Lx5s7Vy8w8//ICamhoUFhbib3/7G7788ksUFxdj69at8PT0xOLFixEQEICffvoJfn5+WLt2LSoqKhAXFwej0QiFQoF169Zh+PDheO655zBx4kQEBwcjPDzcGtPRo0exb98+KBQKBAQEIC4uDnPnzsWNGzcQHh7eLklKT0/Hnj17oNVqoVKpMHPmTABAYWEhVqxYAb1ejxdffBHHjx/HmTNn8MEHH8DR0RE+Pj6Ij49HTk4O9u7dC4PBgBUrVmDhwoXIyspCUVER1q5dC0EQoFKpsGnTJiiVSixbtgw6nQ5GoxFLliyxjkfScZQ6ACIikqeCggIcOHAAV65cwdKlS9slKx1duXIFX3zxBVJSUrBz506kp6cjLS0NX375JRYsWIDi4mLs3LkTWq0Wv//975Gfn4/PPvsMr732GqZPn45Tp07hk08+wbp161BaWort27e3a5uh1+vx4YcfIj09HSqVCosWLcLp06exYsUKJCUltUuOzGYzPvroI6SlpUGtVmPu3LkPTFjWrVuHffv2wc3NDQkJCfjmm2+g0WhQUFCAjIwMawd5AIiPj8fatWsxevRoJCUlISkpCTNnzkRNTQ2SkpJQV1eHU6dO9fKbp77ABImIiGziqaeegkKhgFarRX19/QOfGxgYCEEQ4OXlBT8/PygUCgwbNgw//vgjAGDkyJHw8fEBAEyaNAklJSXIyclBSUkJduzYAZPJBA8PDwCtzWY79hS7cuUKRo0aBZVKBQCYPHkyLl++jIkTJ3aKpaamBq6urtb3CwoKum/cVVVVuHr1KpYsWQIAMBgMcHd3h0ajgZ+fX7vkCADOnz+P1atXAwCMRiMmTZqEsWPHQq/XY9myZQgNDcULL7zwwO+KxMEEiYiIbMLR
8cF/Yu7cudPlc9vetuwCEQSh3WsFQYCTkxM+/vjjTl3Zu+qvJQgC2u4oMZvNnd6z4/MfFENzc7N1LG9vb+zfv7/d67OysjolR0Br8vb55593GvvgwYP48ccfcejQIZw4cQIbN268b2wkDm7SJiIi0bi6ukKn08FkMuHcuXPdft21a9dQWVkJs9mMCxcuwNfXF08++SS+/fZbAEBmZiaOHDly39ePHj0aV69eRUNDAwAgOzsbgYGBXT7X3d0ddXV1qK2tRXNzM/773/9aY6+srAQAnD17FgAwdOhQAEBRUREAYP/+/cjLy7tvHP7+/vjuu+8AAF999RUyMzNx8eJFHDlyBFOmTMF7772H4uLibn8vZDucQSIiItHMnz8fixYtwpgxYzBu3Lhuv87f3x8ffvghioqKEBQUhHHjxuGtt97CqlWr8NVXX0EQhAfOuri4uGD58uV4/fXX4eDggF/84heYMmUKsrKyOj1XEAQsWbIEUVFR0Gg0GD16NABg2rRp2LFjB6KiojBr1izrLND69esRExNjnU2aN28ecnJyuowjNjYWq1evxq5duzBo0CC8//77EAQBH3zwAZKTk6FQKLBw4cJufy9kOzzFRkRE9ACW03a2qsVE/ROX2IiIiIg64AwSERERUQecQSIiIiLqgAkSERERUQdMkIiIiIg6YIJERERE1AETJCIiIqIO/h+3y7jAuZZxswAAAABJRU5ErkJggg==\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Which digit is this?\n", + "2\n" + ] + } + ], + "source": [ + "accuracy_scores = [learner.score(X_test, y_test)]\n", + "\n", + "for i in range(n_queries):\n", + " display.clear_output(wait=True)\n", + " query_idx, query_inst = learner.query(X_pool)\n", + " with plt.style.context('seaborn-white'):\n", + " plt.figure(figsize=(10, 5))\n", + " plt.subplot(1, 2, 1)\n", + " plt.title('Digit to label')\n", + " plt.imshow(query_inst.reshape(8, 8))\n", + " plt.subplot(1, 2, 2)\n", + " plt.title('Accuracy of your model')\n", + " plt.plot(range(i+1), accuracy_scores)\n", + " plt.scatter(range(i+1), accuracy_scores)\n", + " plt.xlabel('number of queries')\n", + " plt.ylabel('accuracy')\n", + " display.display(plt.gcf())\n", + " plt.close('all')\n", + " \n", + " print(\"Which digit is this?\")\n", + " y_new = np.array([int(input())], dtype=int)\n", + " learner.teach(query_inst.reshape(1, -1), y_new)\n", + " X_pool, y_pool = np.delete(X_pool, query_idx, axis=0), np.delete(y_pool, query_idx, axis=0)\n", + " accuracy_scores.append(learner.score(X_test, y_test))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Finally, we can visualize the accuracy during the training." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAmEAAAFICAYAAAAYvikoAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMi4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvhp/UCwAAIABJREFUeJzs3Xtc0/X3B/DXNjZu435VVEDloiCKDDWhTATBS369Fmle0rL6WuYlTKxfVpqZllam5bdMu5C3JCuvaWqlGRdFFPCKIJchF7mMO2x7//7ALa46dHfO8/Hokfts++xsg3H2fp/3eXMYYwyEEEIIIUSruLoOgBBCCCGkK6IkjBBCCCFEBygJI4QQQgjRAUrCCCGEEEJ0gJIwQgghhBAdoCSMEEIIIUQHKAkjXVZ0dDQmTJig6zA0KjU1FSNGjMCLL77Y5rq//voLYrEYALBp0ya88cYbGokhLCwMycnJaj3n999/j48//hgAcPz4cYSGhmLlypUtjqvLnDlzEB8f36n7HDt2DLGxsWqLQVvv1b0cOnQIVVVVAIBly5bhxIkTD33OvLw89O/f/6HPo4qLFy9i3rx5WnksQlRlousACNGF69evw8rKCra2tkhJSUFgYKCuQ9KI06dPY8iQIVi/fn2b63bs2IGXXnoJ3bt310FkD+eZZ55R/vvEiROYOnUqFi1apMOIWoqIiEBERITazqcP79Wnn36KwYMHQygUYt26dTqL40EFBARg27Ztug6DkBZoJIx0SfHx8YiKisL48eOxf//+Ftf99NNPiIyMRGRkJGJiYtDQ0NDh8YSEhBZ/bJtf3rRpE958801MnToVO3bsgFwuxzvvvIPIyEiEhYUhJiYGjY2NAIDS0lK8+OKLGDVqFJ544gmcPn0aJ0+exPjx41vENnnyZBw/frzN8/n2228xduxYREVF4aWXXkJpaSmOHDmCb7/9FidPnsTzzz/f4vYff/wx/vnnH8TExODQoUMAgIaGBixZsgRhYWF48sknUVhYCAAoLCzEiy++qHzuf/zxR7uvaW5uLmbMmIGIiAhMmTIF6enpbW6zd+9ejBkzBqNHj8aMGTOQn5+vfIzZs2dj7NixCA8Px8aNG+95XDEa9M033+Do0aPYtWsX3nzzzRajRB3FnZeXh9DQUKxZs6ZFMtf8eUybNg3h4eFYunQpZDKZ8n7NR22aX46Pj8fLL7+M2bNnY926dYiPj8ecOXMAAMuXL8enn36KZ599FiNHjsSzzz6L2tpaAE0jXJGRkRg7dix2796NwYMHIy8vT+Pv1c2bN/H0009jzJgxiIiIwIEDB5TX/fXXXxg3bhwiIyPxwgsvoLy8HLGxscjKysLMmTORnJyMmTNn4ueff8bChQuxfft25X0zMjIQGhoKuVyOc+fOYcqUKYiIiMCTTz6J3NzcdmNpbvPmzYiMjMTIkSOxevVq5Wt/r3h9fHywdetWREZGQiaTISwsDLt27cLUqVMRGhqKtWvXAmj7u/nuu+9iwYIFGDVqFKZOnYqioiIAQHp6Op544glERkbis88+wxNPPIGEhIT7xk7IA2GEdDFSqZSNGjWKVVZWspqaGvb444+z+vp6xhhjubm5bNiwYez27dtMLpezBQsWsC+//LLD4//88w8LDw9Xnrv55U8//ZSFhoayO3fuMMYYO3LkCBs/fjxraGhgdXV1bMyYMWz//v2MMcZWrFjB1q1bxxhjLD09nQ0ZMoTV19ezIUOGsMuXLzPGGBOLxSwoKEgZq0JKSgp77LHHWElJCWOMsXfffZetWLFCGYPi362NHDmSJSUlKW8XEhLC8vLyGGOMvfDCC+yzzz5jjDE2f/58tnHjRsYYY9n
Z2WzIkCGstLS0zflmz57N4uLiGGOMHTt2jI0dO7bF45SUlDB/f39WUFDAGGNs+fLlytjWrl3LNm3axBhjrKamhi1evJgVFhZ2eLz583r99dfZ5s2b2zzfjuLOzc1lfn5+LD4+vt3XZeHChWzDhg2MMcZSU1NZ//792b59+1hubi7r16+f8nbNL+/bt48NGjSIZWVlKS/Pnj1bGd+YMWNYWVkZa2xsZBMmTGA///wzk0qlLCQkhJ06dUr5Gvj6+rLc3FyNv1cvvPAC27p1K2OMscTERBYQEMAaGhpYdXU1Cw4OZlevXmWMMbZ69Wr29ttvM8YY8/b2Vr53zzzzDNu/fz87ePAgmzFjhvK8n3zyCVu1ahWrqqpiISEh7PTp04wxxn799Vc2adKkNnE0fw0PHz7Mxo0bxyQSCWtsbGTz589n33333T3jVcT1+eeft3itlixZwqRSKbt9+zbz8/NjBQUFbX43H3nkEZaXl8fkcjmbP38+27JlC2OMsUmTJil/jrdv3878/f3ZP//80yZ2QtSBRsJIl3P69GkMGDAAQqEQ5ubmGDJkCE6ePAkAOHPmDAIDA+Hi4gIOh4OPPvoIc+bM6fD4/QwcOBD29vYAgMjISOzbtw98Ph+mpqYYMGCAcnTgjz/+UI569e/fH7///jsEAgEiIyNx8OBBAE11RqNGjYJAIGjxGKdOnUJkZCQcHBwAANOmTcOZM2c6/boEBQXBzc0NAODr64vCwkLU1NTgjz/+wPTp0wEA7u7uCAoKajPCUl9fj4SEBOVzGDVqFPbs2dPiNg4ODjh37hxcXV0BACKRSPn8HRwccPr0aSQnJ0MgEGDDhg1wdnbu8Pj93C/uxsbGDqcLk5OTMWbMGABNU1i9e/e+/4sHwMPDAx4eHu1eN2LECNja2sLExATe3t4oKChAdnY26uvrMWLECADAzJkzIZfLVXqsh3mvAGDLli3K+qigoCDU19ejuLgY58+fR7du3eDt7Q0AiImJuWdt2+OPP4709HSUl5cDaPoZjYqKQnJyMiwtLRESEgIAGD9+PHJycpR1be05fPgwnnjiCVhZWcHExATTpk3Db7/9ds94m8fR3BNPPAEejwcXFxc4ODigoKCgzeOJRCK4ubmBw+GgX79+KCgoQF1dHdLT05U/xzNmzACjnf2IBlFNGOly4uPj8eeff0IkEgEAZDIZKioqEBkZibKyMlhbWytva2pqCgAdHr8fGxsb5b9LS0uxatUqZGRkgMPhoKSkBLNnzwYAlJeXw8rKSnlboVAIABg3bhxiY2OxdOlSHD9+vN3C4tLS0haJibW1Ne7cuaNSfM0pHhMAeDweZDIZKisrwRjDrFmzlNfV1NRg2LBhLe5bXl4OuVyufA4cDgeWlpYtbiOTybBp0yb8/vvvkMlkqK6uhqenJ4Cm4nfFdG1RURFmzJiBV155pcPj93O/uHk8Xovn21xFRUWL65q/7/fS/L1urfl7q3htKyoqWhxXJblUeJj3Cmiacvz8889RVlYGDocDxhjkcnmbn/PWCX9rFhYWGD58OE6dOoWgoCBIJBIEBQXhwIEDKCwsRFRUVItzlZaWdljXVllZie+++w4//fQTgKafF8UXmI7iVbC1tb3v69NaR+8J8O97zufzlV9uCNEESsJIlyKRSJCYmIiEhATlHxipVIoRI0agtLQUdnZ2SElJUd6+qqoKdXV1HR5v/QGv+BBvz8aNG2FiYoJff/0VAoEAS5cuVV5na2uLsrIy9OjRA0BTvZGLiwuCg4MhlUpx8uRJXL9+HcOHD29zXkdHR+VIBNCUEDk6Oj7Aq9OWg4MDeDwe9u3b1yapas7Ozg4cDgdlZWWwt7cHYww5OTno1auX8jaHDh3C77//ju+//x729vbYs2cPfv31VwCAiYkJ5s+fj/nz5yMrKwvPP/88goKCEBIS0u7xh4m7dc1Va9bW1spVgEBTkgs0/aGWy+VgjIHD4dzzvVaFUChEdXW18nJJSclDnU/
V96qxsRGLFi3Cxx9/jBEjRqChoQEBAQEAmt7HsrIy5W1ra2tRUVGhHL1sT2RkJI4fP46ysjJERkaCw+HA2dkZvXv37tSqUmdnZ4SFhbWp07tXvOqmSN6qqqogFAohlUqV7z8hmkDTkaRLOXDgAIYNG9biG76JiQlCQ0Nx4MABjBgxAufPn0deXh4YY1i5ciV+/PHHDo87OTmhuLgYd+7cgUwma1Ew3NqdO3fg5eUFgUCAK1euICUlRflHOCwsTDkCcOPGDUyePBkymQxcLhdjx47FqlWrEBYWBj6f3+a8jz/+OI4dO6b847lr1y7lFNe9mJiYoLKy8r63eeyxx7Br1y4ATX+UY2Nj20zvCAQChISEKJ/DX3/9hfnz54PD4bR4/m5ubso/9IcOHVI+/7feeks5hdqrVy84OjqCw+F0eFyV56ZK3O0ZNGgQjh07BgA4f/48cnJyADQlKDweD1evXgUA/Pzzz/c91714eHhALpcri7537tzZ4XNT53tVW1uLmpoa5aKCb775Bnw+H9XV1QgKCkJxcTEuXrwIoGkacPPmzcrzSySSNo8bFhaGlJQUHD9+XDmNO3DgQBQXFyM1NRVA02KHmJiYe07thYWF4eeff1YuWti1axd++umne8arbpaWlujTp49yGnT37t0q/bwR8qAoCSNdyv79+xEeHt7meEREBPbv3w9XV1e8++67mD17NiIjIwEAzz77bIfH3d3dMWXKFEycOBHTp09vd+pHYe7cudi1axdGjx6NuLg4vP7669i9ezcOHz6MmJgY3L59G2FhYVi8eDE+/PBDmJmZAWiakszPz8fYsWPbPW9AQADmz5+PGTNmICoqCpWVlVi8ePF9X4vIyEgsXry4xeq29rzzzjtISkpCVFQUJk2ahJ49e6Jbt25tbvfee+/h5MmTGDVqFD7++GN8+OGHLa4fP348ysvLMXLkSCxduhSLFy/G7du3sXr1akRHR2Pjxo2IiorC2LFjERgYiEceeaTD46pQNe7WYmJicPLkSYSHhyMuLk45+mhmZoZXXnkFzz33HCZPnox+/fqpFEdHBAIB3n77bcTGxuI///kPPD09weVy2/2jr873ytraGs899xyeeOIJTJw4Eb169UJ4eDiee+45MMawadMmxMTEIDIyElevXlX+LEVFRSE6Olq5QlNBKBTCz88PYrEYgwYNAtD0Wn366adYtWoVxowZgwULFiAqKuqeCU1ERARGjhyJSZMmISoqCidOnEBoaOg9462pqVHpte6MlStX4osvvsC4ceNQXV2trAMlRBM4jKoOCdFrJSUlmDRpEk6dOgUej6frcIiG1NTUIDAwEMnJyS3qlYj2KaacAWDYsGHYsWMHfH19dRwVMUY0EkaInvv000/x9NNPUwJmhKZMmaIcWTp06BD69OlDCZiOLVy4EF9++SUA4OzZs2CMdbjqlZCHRSNhhOipkpISPPXUU/Dx8cGGDRuU05PEeCQnJ+Pdd99FfX09LC0t8fbbb2us6JyoJjMzE7GxsaioqACfz0dMTIxKNZaEPAhKwgghhBBCdMCgpiOlUiny8vIglUp1HQohhBBCyD3dL28xqD5h+fn5ypVl9+pbQwghhBCia7dv38aMGTPw22+/wd3dvc31BpWEKbapmDFjho4jIYQQQghRTXFxseEnYU5OTgBAI2GEEEII0XuKkTBF/tKaQSVhiiX6rq6uyu1dCCGEEEL0WUcthgyqMJ8QQgghxFhQEkYIIYQQogOUhBFCCCGE6AAlYYQQQgghOkBJGCGEEEKIDlASRgghhBCiA5SEEUIIIYToACVhhBBCCCE6QEkYIYQQQogOGFTHfEIIIYQYn/0p+Vh/9CrE5bXobmuOmEgfTAx003VYGkdJGCGEEEJ0Zn9KPmLjL6G2UQYAyC+vRWz8JQAw+kSMpiMJIYQQojPrj15VJmAKtY0yrD96VUcRaQ8lYYQQQgjRGXF5baeOGxNKwgghhBCiM91tzTt13JhQEkY
IIYQQnYmJ9IEJl9PimDmfh5hIHx1FpD2UhBFCCCFEZyYGusG3mxUUeZiNOR/vTx5g9EX5ACVhhBBCCNGxsupGjA/oDjdbc4T0degSCRhASRghhBBCdKi0ugH55bXwd7OGyMMOydllYIzpOiytoCSMEEIIITqTLq4AAPh3t4HI3Q5FlfXIKzP+lZEANWslhBBCiA6l5UsAAP27W8POUgAASMouRU97C12GpRU0EkYIIYQQnUkTV6CHnTlsLQTwdrGClakJkm+V6TosrdBoErZmzRo89dRTiI6OxsWLF1tcFxcXh6eeegpPP/003nvvvRbXlZSUIDg4GAkJCZoMjxBCCCE6liGWwL+7DQCAx+Ug0N0OydmlOo5KOzSWhCUmJuLWrVvYvXs3Vq9ejVWrVimvq6qqwrZt2xAXF4edO3ciMzMTFy5cUF6/bt069OzZU1OhEUIIIUQPVNY1IqukGv5u1spjwe52uFZYhYqaRh1Gph0aS8LOnj2L8PBwAEDfvn0hkUhQVVUFAODz+eDz+aipqYFUKkVtbS1sbGyU97O0tIS3t7emQiOEEEKIHsgQN9WD+d0dCQOAIA87AMD5HOOfktRYElZSUgI7OzvlZQcHBxQXFwMATE1NsWDBAoSHhyMsLAyDBg2Cp6cnGhoasHnzZixevFhTYRFCCCFET6QpkrBmI2GDetqCx+UgqQtMSWpsdWTrHh+MMXA4Te1wq6qqsHXrVhw5cgRCoRCzZ8/GlStXcPz4cUybNg3W1tbtnZIQQgghRiRdXAFnK1M4W5kpj1kITODf3bpLFOdrLAlzcXFBSUmJ8nJRUREcHR0BAJmZmejZsyfs7e0BACKRCGlpaTh9+jTkcjni4uKQk5ODixcv4pNPPoGXl5emwiSEEEKIjqTnS+DvZtPmeJC7PeISbqFBKofAxHgbOWjsmYWEhODo0aMAgIyMDDg7O0MoFAIA3NzckJmZibq6OjDGkJaWBg8PD+zatQt79uzBnj178Pjjj2PlypWUgBFCCCFGqLZBhutFlfDr3nb2S+Rhh3qpHGl3G7kaK42NhA0ePBh+fn6Ijo4Gh8PBypUrER8fDysrK0RERGDevHmYNWsWeDweAgMDIRKJNBUKIYQQQvTMldsSyFnLonwFkXtTTfm57DIM7mXX5npjodGO+a+99lqLy76+vsp/R0dHIzo6usP7rl27VmNxEUIIIUS30u8W5TdvT6HgbG2GXvYWSL5ViufRW9uhaY3xTrQSQgghRG+liytga8GHm615u9eL3I1/M29KwgghhBCidWn5TZ3yFZ0TWhN52ONOdQOy79RoOTLtoSSMEEIIIVrVIJXj6u32i/IVRHebthrzFkaUhBFCCCFEq64XVaJBJodfO+0pFPo6CWFtZoLkbOPtF0ZJGCGEEEK0SlmUf4+RMC6XA5GHPZJv0UgYIYQQQohapOdXwFLAg4eD5T1vF+Ruh8ziapRWN2gpMu2iJIwQQgghWpUmlqB/d2twue0X5Sso+4UZ6RZGlIQRQgghRGtkcoYMsaTdJq2tDexpCz6PY7RTkpSEEUIIIURrskqqUdsoa3fPyNbM+Dz4u9ngnJEW51MSRgghhBCtSb+7H2R7nfLbI3K3w8W8CtQ1yjQZlk5QEkYIIYQQrUnLr4DAhIs+TkKVbi/ysEeDTI60fOPbzJuSMEIIIYRoTVq+BP1crcDnqZaCBN0tzk82wuJ8SsIIIYQQohWMMaSLK+7ZpLU1R6EpPB0tjbJzPiVhhBBCCNGKvLJaSOqk8FdhZWRzInc7nLtlfJt5UxJGCCGEEK1Q1HWpWpSvIPKwQ1lNIzKLqzURls5QEkYIIYQQrUgTV4DH5cDbxapT9xN52AMwvs28KQkjhBBCiFakiyXwchbCjM/r1P16O1rC3lJgdMX5lIQRQgghROMYY0jLr1CpSWtrHA4Hg3vZGd32RZSEEUIIIUTjiirrUVLVAP/unasHUwj2sENWSTWKK+vVHJnuUBJGCCGEEI1
TFOV3pj1FcyIP49vMm5IwQgghhGhcWr4EHA7Qr9uDjYT5u9lAYMLFOSPazJuSMEIIIYRoXLq4Ap6OlhCamjzQ/U1NeBjYwwZJRrSZNyVhhBBCCNG4dLGk001aWwtyt0e62Hg286YkjBBCCCEaVVrdgPzyWvg9YFG+gsjdDo0yhtTccjVFpluUhBFCCCFEo9LFik75DzsSZlybeVMSRgghhBCNShdLAOChR8LsLAXo6yw0ms75lIQRQgghRKPS8ivQw84cthaChz6XYjNvudzwN/OmJIwQQgghGqWOonwFkYc9JHVSXC+qUsv5dOnB1omqaM2aNUhNTQWHw8GKFSsQEBCgvC4uLg6//PILuFwu/P398cYbb0AqleKNN95Abm4upFIpli1bBpFIpMkQCSGEEKJBlXWNyCqpxuRAN7WcT6SsCyuFj2vnNgLXNxpLwhITE3Hr1i3s3r0bN27cQGxsLPbu3QsAqKqqwrZt2/Dbb7/BxMQEc+fOxYULF5CZmQlzc3P88MMPuH79OmJjY/Hjjz9qKkRCCCGEaNjlgkoAD1+Ur+DuYAFHoQDnssswY6i7Ws6pKxpLws6ePYvw8HAAQN++fSGRSFBVVQWhUAg+nw8+n4+amhpYWFigtrYWNjY2mDBhAsaPHw8AsLe3R3m5cSxBJYQQQrqqf7creriifAUOhwORuz2SjKBzvsZqwkpKSmBnZ6e87ODggOLiYgCAqakpFixYgPDwcISFhWHQoEHw9PQEn8+HqakpAOCbb75RJmSEEEIIMUxp4go4W5nC2cpMbecUedght7QWRZI6tZ1TFzSWhDHG2lzmcDgAmqYjt27diiNHjuD48eO4cOECrly5orxtXFwc0tPTsWDBAk2FRwghhBAtSM+XPHRritaMpV+YxpIwFxcXlJSUKC8XFRXB0dERAJCZmYmePXvC3t4eAoEAIpEIaWlpAIC9e/fixIkT2LJlC/h8vqbCI4QQQoiG1TXKcKO4Sm31YAp+3W1gxuciycD7hWksCQsJCcHRo0cBABkZGXB2doZQKAQAuLm5ITMzE3V1dWCMIS0tDR4eHsjNzcWuXbvw2WefKaclCSGEEGKYrtyuhEzO4Kem9hQKAhMuBvawxTkDHwnTWGH+4MGD4efnh+joaHA4HKxcuRLx8fGwsrJCREQE5s2bh1mzZoHH4yEwMBAikQgbNmxAeXk55s+frzzPtm3bIBA8fHM3QgghhGiXoijfX01F+c2JPOzwxR83UdMghYVAox23NEajUb/22mstLvv6+ir/HR0djejo6BbXL1myBEuWLNFkSIQQQgjRknRxBWzM+XCzNVf7uUUe9pCdzMSFnHIM7+uo9vNrA3XMJ4QQQohGpOVL4O9mrVyYp06De9mBwzHs4nxKwgghhBCido0yOa7erlTbdkWt2Zjz4e1sRUkYIYQQQkhz1wur0CCTw0/NKyObE3nY4fytMsgMdDNvSsIIIYQQonZp4rud8tXcI6w5kYcdquqluHq7UmOPoUmUhBFCCCFE7dLzK2Ap4MHTwVJjjyFytwcAnDPQLYwoCSOEEEKI2qWLJejf3RpcrvqL8hV62JnDxdoUSdmGWRdGSRghhBBC1EomZ8gokKi9SWtris28DbVpKyVhhBBCCFGrrJJq1DTI1L5dUXtEHnbIL6+FuLxW44+lbpSEEUIIIUSt0rVQlK+gqAszxFYVlIQRQgghRK3S8isgMOGir7NQ44/Vr5sVLAQ8nDPAzbwpCSOEEEKIWqWLJejnagU+T/NphgmPi8BetgZZnE9JGCGEEELUhjGGtPwKjTZpbS3I3R5XbktQVS/V2mOqAyVhhBBCCFGbvLJaSOqkWqkHUxC520HOgJQcwxoNoySMEEIIIWqTlt9UlK+pPSPbE9jLFlwODG5KkpIwQgghhKhNulgCHpcDH1crrT2mlRkfvq7WBtc5n5IwQgghhKhNmrgCXs5CmPF5Wn1ckYcdUnLKIZXJtfq4D4OSMEIIIYSohbI
oX4tTkQoiD3vUNMhwucBwNvOmJIwQQgghalFUWY+Sqgb4u2mvKF9B5G4HAEg2oClJSsIIIYQQohaKTvna2K6ote625uhuY2ZQnfMpCSOEEEKIWqTlS8DhAP26aX8kDGiakkzOLgVjTCeP31mUhBFCCCFELdLyK+DpaAmhqYlOHl/kYYdCST3yygxjM29KwgghhBCiFuliiU6K8hWC7taFnTOQKUlKwgghhBDy0MqqG5BfXgt/LXbKb83X1RpCUxMkGchm3pSEEUIIIeShpYslAHRTlK/A43IQ2MuWRsIIIYQQ0nWk3V0Zqc09I9sjcrfH1cJKVNQ26jQOVVASRgghhJCHlpZfATdbc9haCHQaR7CHHRgDzhvAZt6UhBFCCCHkoaWLJTpp0traoF624HE5OGcAm3lTEkYIIYSQh1JZ14iskmr463BlpIKFwAR+3a0NonM+JWGEEEIIeSiK/Rp1WZTfXJC7HS7klqNRzzfz1mg3tTVr1iA1NRUcDgcrVqxAQECA8rq4uDj88ssv4HK58Pf3xxtvvIHGxkYsX74cYrEYPB4P77//Pnr27KnJELuE/Sn5WH/0KsTltehua46YSB9MDHTTdViEEEKMRFq+fhTlK4jc7bH9TDbSxRIM6mmr63A6pLGRsMTERNy6dQu7d+/G6tWrsWrVKuV1VVVV2LZtG+Li4rBz505kZmbiwoULOHDgAKytrbFz5048//zz+OijjzQVXpexPyUfsfGXkF9eCwYgv7wWsfGXsD8lX9ehEUIIMRJp4go4WZnC2dpM16EAaOqcDwDJet4vTGNJ2NmzZxEeHg4A6Nu3LyQSCaqqqgAAfD4ffD4fNTU1kEqlqK2thY2NDc6ePYuIiAgAQGhoKM6dO6ep8LqM9UevorZR1uJYbaMM649e1VFEhBBCjE2GWKLTJq2tuViboae9OZL1vDhfY0lYSUkJ7OzslJcdHBxQXFwMADA1NcWCBQsQHh6OsLAwDBo0CJ6enigpKYG9vT0AgMfjgcvloqGhQVMhdgni8vb3z+roOCGEENIZdY0yXC+q0pt6MAWRuz2Sb5Xp9WbeGkvCWj9pxhg4HA6ApunIrVu34siRIzh+/DguXLiAK1eu3PM+5MF0tzXv1HFCupr9KfkIWXsCnssPImTtCZqqJ6STrtyuhEzOdLpnZHtEHnYoqapHTmmNrkPpkMaSMBcXF5SUlCgvFxUVwdHREQCQmZmJnj17wt7eHgKBACKRCGlPaiZnAAAgAElEQVRpaXBxcVGOljU2NoIxBj6fr6kQu4SYSB9wW+WxZiZcxET66CYgQvQI1UwS8vD0rShfQeTeNLOWpMdTkhpLwkJCQnD06FEAQEZGBpydnSEUCgEAbm5uyMzMRF1dHRhjSEtLg4eHB0JCQnDkyBEAwMmTJzF06FBNhddlTBjYHeZ8HiwEPOWxKUE9aHUkIaCaSULUIV0sgY05Hz3s9GuGxctZCGszE5zT435hGmtRMXjwYPj5+SE6OhocDgcrV65EfHw8rKysEBERgXnz5mHWrFng8XgIDAyESCSCTCbD33//jaeffhoCgQBr167VVHhdRmZxFaobZFg3NQBTB/fAxC1ncPJKEeoaZTDj8+5/AkKMGNVMEvLw0sUV8Hez1rvyIS6XgyB3O70uztdon7DXXnutxWVfX1/lv6OjoxEdHd3iekVvMKI+CVlN3wCGetqDy+Vg+RhfTP8yATv+zsaLI/roODpCdMvVxgwFFXVtjlPNpPGj/onq0SiT40pBJZ4N8dB1KO0Sedjj5NWrKK9p0Pmelu2hjvlGLjGrFC7WpuhlbwEAGN7HEWG+zth88gbKqmnlKem6ZHIGB8u2H8rmfB7VTBo5qgVUn+uFVWiQydFfz+rBFILcm7o0nLuln6NhlIQZMcYYErNKEexh32KYePkYX1TXS7HpxA0dRkeIbr138DLSxBJMGeyGbjZNDSatTE3w/uQBNCJi5KgWUH3SxE1F+frWnkJhYA9b8Hk
cJFMSRrQtr6wWtyV1GOpp3+K4t4sVnhT1xHf/ZCPnjv4u3SVEU775Oxtfn8nC3BBPfPTkIJyNHQV3Bws86u1ICVgXkE+1gGqTIZbAUsCDp4OlrkNpl7mAB7/uNnrbOZ+SMCOmqAcb4unQ5rrFEd7gcTlY/xt98yNdy++XC/HOr+mI6O+CN8b1Ux73crbCtcIqHUZGtEEuZ7AUtL8oiWoBOy8tvwL9u1uD27oXkh4RudshNa8C9VLZ/W+sZZSEGbHErDuwMefDy1nY5joXazM8/2hv/JoqRmpuuQ6iI0T70vIr8MrOFPh1t8En0YPAa/aHw8tFiOySajRI5TqMkGiSXM7wfz+nobpBBpNWSYOAR/0TO0smZ8gokOhdk9bWRB72aJDKkZYv0XUobVASZsSSsssQ7GHf4TeU+Y/1hoOlAGsOXdbrbR0IUYeCilrM+yYJtuZ8bJstgoWg5eJwbxchpHKG7DvVOoqQaJJczvDmz2mIS8jBS4/3wfqpAXC7O/LF5zV9RvZxavuFlXQsq6QaNQ0yvWvS2pqiOF8fpyQpCTNSRZI6ZJVUt6kHa87KjI9F4V5IyCrFiStFWoyOEO2qqpfi2e1JqK6X4etng+FsbdbmNl7OVgCAa4WV2g6PaJhczvDG/jT8kJCD/z7eB8sifTBpcA+cWR6G7LXjcGZ5GJysTDH3m6QO68VIW+l6XpSv4GRlCg8HC70szqckzEglZivqwTpOwgAgekgveDpaYu3hK5DKaBqGGB+pTI4FcedxvagKW2YMhq9r+9/a+zoLweWA6sKMTFMCdgk7E3Pw8si+iIn0adNU1NnKDNufDUZdgwxztyehsq5RR9EalnSxBAITLvq2U/Kib0Qe9jinh5t5UxJmpBKzSmEh4N13mJjP4+L1KB9cL6rC3nN5WoqOEO1gjGHlL+n441oxVk/0x2PeTh3e1ozPQy97C1ynkTCjIZczrPjpEnYm5uKVsL5YOtq7w67u3i5W+PyZIGQWV+G/cefRSF9K7ystvwL9XK3A5+l/KiFyt0NpdQNuluhXuYH+v3LkgSRmlSLI3Q4mKvxyRPq5IsjdDhuOXUNNg1QL0RGiHdtOZyEuIQcvjOiNp4f0uu/tvVysaDrSSMjlDLHxl7ArKRcLw/piSUTHCZhCqJcj3pvkj7+ul+Ctn9P0btREnzDG7q6M1O+pSAWRx92mrXq2hZFKSdgHH3yA9PR0TcdC1KS8pgFXCysxxOPeU5EKHA4HK8b6oriyHl/9laXh6AjRjiNpt/HeocsYO8AVr0f63v8OaCrOz75To5dL2Ynq5HKG5fEXsTs5FwtHeWGxCgmYwlPBvfDfx/tgZ2Iutv55U8ORGq68slpI6qTwd9PvonyFPk5C2Fnwkaxnm3mrlIT169cPX375JaZNm4bNmzcjJydH03GRh5CcXQbGgOD71IM1F+Rujyg/V2z9IxPFlfUajI4QzbuQW45Fu1MwqKctNjw5SOUeRl7OVpDJGbL0bMqCqE4uZ3h930XsSc7Dq6O8VBoBa+210T4YH9ANaw9fwaFLBRqK1LApi/INZCSMw9HPzbxV2sB7woQJmDBhAhobG3H27FksXboUXC4X0dHRmDhxot7tnN7VJWWXQsDjYlBP207db1mUD45dLsQnv1/D6okDNBQdIZqVW1qD575JgpOVKb6cJYIZv/3GnO3xcmkqML5WWNVhAb++og2pm/pWvb7vIn48l4dF4V5YFO79QOfhcjn4cNpAFFTUYfHuC3C1McPgXnZqjtawpeVLwONy4ONqpetQVGbG5+FmSTU8lh+Em578jqhcE3bhwgWsW7cOH330EQYOHIhly5YhLy8PixYt0mR85AEkZJViYE+bTv3xAYDeTkJMH9ILOxNzkVlMK8SI4amobcTcHUlokMqxfU4wHIWmnbp/H6emFZI3DKwujDakbkrAlv3YlIAtDvd+4ARMwYzPw/9mBjU1tv4mmbZ4ayVNXAEvZ2Gn/87oyv6UfPyWXqi
8rC+/IyolYZGRkdi+fTseeeQR7Nu3D2+++SaCgoLwyiuvoK6uTtMxkk6orpciLb/ivq0pOvJquBfMTLhYd+SKmiMjRLMapHL8N+4csu9U44uZQejr3Plv6GZ8HtwdLA2uTUVX35BaJmeI2ZuKfefzsCTCG6+Ge6nlvA5CU2x/NhhSOcOzOxJRUUOtKxTS8vW/U35z649eRUOrFa/68DuiUhK2e/duzJs3D2FhYTAxMcHZs2eVq0a2bt2q0QBJ56TklEMqZwhWsSi/NUehKV4c0QdH0wv1srswIe1hjOHN/Zdw5sYdvD85AMP7OD7wubychbhWZFgjYR1tPN0VGo8qErD4lHwsjfDGwlHqScAU+jgJsXVmEHJKa/Di9+doWys0NQMvqao3mKJ8oOPfEV1v2q5SErZmzRr89ttvystJSUlYvny5xoIiDy4xuxRczr/bNDyIeY96wtnKlLYzIgZjy6lM7EnOw8Kwvpga1OOhzuXtYoVbBrZCsqONp024HJzTs9Vg6iSTM7x2NwF7bbQ3XlFzAqYwrLcD1k0NwNmbdxAbf6nLfy6mGUin/OY6+h3R9abtKiVhYrEYr732mvLywoULIRaLNRYUeXCJWXfg190GVmb8Bz6HhcAESyK8cT6nHEfSbqsxOkLU75dUMdYfvYr/DOqOxREPVwcENBXny+QMN4sNZ4VkTKQPzExafpwLeFxYmZlg6hdnsfpABmobDCepVIVMzrB0zwX8lJKPmEgfvBymmQRMYVJgDywK98K+83n47MQNjT6WvkvLl4DDAfp1M5yRsJhIH5i3ql8z5/N0vmm7SkkYh8PBqVOnUFFRgbKyMhw+fBgmJiotrCRaVC+VISWn/IGnIpubGtQD3i5CfHDkCnWOJnorObsUr+1NRbCHHdZNDVDLSm1vF8PbQ3JioBuWj/m3F5qbrTnWTQ3AX6+HYcbQXvjqdBbGfvoXkoykxEAqk2PJngvYf0GMZVE+WDCyr1Ye99VRXpgc6IaPjl3Dzxe6zqKH1tLyK+DpYAmhqeHkARMD3fD+5AFwszUHB02/I+9PHqDz1ZEqvYIffPABNm7ciPXr14PL5SIgIABr167VdGykk9LyK1AvlT9wUX5zJjwulo/xxdwdyfghIQezh3s8fICEqFF2STWe/zYZbrbm+N9MEUxN1LNKy9PRElwOcN3AivN7OVgAAPa++EiLL2KrJw7AWP9uWLbvIp7cehZzhntgWaQvzAWGsaqttaYELBW/pIrxepQvXnq8j9Yem8Ph4P0pA5BfXouYvRfRzcZcLZ+3hiZdLMHghyh50ZWJgW46T7paU2kkrHv37li/fj0OHjyIX3/9FW+//TY2bdqk6dhIJyVkNX3LDfZQzy/HSB9nDOttj09+v04b2hK9UlbdgGd3JAEAts8Jhp2lQG3nNuPz4OFgiesGVpyfIZYAAHzb6ds0vK8jji56DDOHuWP7mWxEffInEm7e0XaID00qk2Px3QRs+RjtJmAKpiY8bJ0ZhB725pj/XTJudrF2PmXVDcgvr4X/ffYlJqpRKQnbu3cvHn30Ufj7+yMoKAjBwcGoqupaP3iGIDGrFF7OQjh0sjdSR5q2M+qH0uoGbP2Dtu8g+qFeKsML351Dflkt/jdLBA9HS7U/hpeL0OBGwjIKJHB3sOiwHtTS1ATv/scfO58fBsaAp/73D97+Jd1g9ouVyuRYtPsCfk0VI3aML14cof0ETMHWQoDtc4LB5XAwd0cSSqsbdBaLtqXfTfYNqShfn6ncouL48eMIDAzEuXPn8NFHHyEwMFDTsZFOkMkZzmWXdWqrIlUE9LDFhIHd8dXpm7hdQT3hiG4xxvD6jxeRmF2K9dMC1FL/2B5vFytk36lGXaPhFLOniyXwU2F04pE+Djiy6FHMGe6BHX9nI+rjv3A2U79HxaQyOV7dfQEHLhZgxVhfvKDDBEzB3cESX84KgriiDvO/TTaon5WHoVgZqcrPGrk/lZIwU1NTmJqaorGxEXK
5HKNGjcLx48c1HRvphMsFElTWSzFUA/UJMZE+kMkZNhzrGo0fif76+Ph17L8gxmujvfGfQZqr7fBysYKcwWBWSFbWNeLWnRr0V3G1moXABG9P8MPu+cPA4QBPf/kP3vo5DdX1+jcq1iiT49VdF3DwYgHeHNcP8x/TfQKmEORujw1PDkTyrTIs+/Ei5HLjb12RLpbAzdYcthbqKwHoylQqzB8wYAC+//57hIaGYvbs2XB1daVO+XomUVkPpv4krKe9BWY94oHtZ7IwN9TT4PbUI4ar+X6IthZ8lNU0YmpQD42vhvO+u4fk9aJK9DeAb/xXbjfVr3U21qG9HXDk1cew/uhVbP87CyeuFGHdlAAM7/vgzW4fVvP3vJuNGZysTJGaV4E3x/XDc4/21llcHRkf0B05pTVYd+Qq3B0ssHS0blseaFp6foVBNWnVdyolYXPnzoWtrS0EAgGGDh2KsrIyDB8+XNOxkU5IzCpFDztzjTWee3lkX+xJzsUHh69g+7NDNPIYhDSn2A9RsR1PWU0juBxgmKe9WlpR3IunoyV4XI7BtKlQFOX379b5Oh1zAQ9vPdEfYwa4YtmPFzH9qwQ8M6wXlo/pp/UWBK3fc3FFHcQVdZg4qLteJmAKL43og5w7Ndh04gZ62lvgSVFPXYekEZV1jbhZUo1JerbC0JCpNB25ZMkSCARNQ4/BwcEYPXo0hEKhRgMjqmOMISm7VKNLpe0sBXh5ZF+cvFqMv2+UaOxxCFFobz9EOQM2Hr+u8cc2NeHB3cHCYPaQzBBLYG8pgIv1gy/KCfawx6GFj+K5UE/EJeQgcuOfOKPl3/X23nMASMou02ocncXhcLBqoj9C+zpiRfwlo/2MvFzQ9KXEj0bC1EalrzlOTk6Ijo7GgAEDwOf/u/Jm2bJlGguMqC6zuBp3qhs0Ug/W3OzhHvj27C2sOXwZvywIBZer2dEI0rXpeq83b2crXDWQkbD0ggr4dbd+6BFCcwEPb45vGhWL2XsRM75KwPShvRA7xvehduFoT3W9FDeKqnCtsFL5/472utT1/n6q4PO42PLMYEz9/G+88P05/PTf4Q+0ibw+S8u/u12RAW3cre9USsIee+yxNsdU+WVfs2YNUlNTm1odrFiBgIAAAEBhYWGLbZByc3OxdOlSDBkyBCtWrEBDQwPkcjliY2Ph7++v6nPpshT1YEM8HTT6OGZ8HpaO9saSPan49aJYo4XRhNiY81Fe27Y/nbb2evN2EeK3jNuoa5TBjK+/jU0bZXJcu12FZ0M81HbOIHd7HHr1UWw8dg1f/nUTp64UYe2UADzm7dTpc1W1k2xdL6xqkXAJeFz0drKEOZ/X7kiYrvf3U5W1GR9fzwnGxM1/Y872JPz03xA4WamnZZA+SBdL4GRlCmdrM12HYjRUnvDv7DesxMRE3Lp1C7t378aNGzcQGxuLvXv3AgBcXFzw3XffAQCkUilmzpyJsLAwfPbZZ4iIiEB0dDTOnz+PjRs3Ytu2bZ163K4oMesOHIWm8LjbMVuTJg5yw1d/ZWHdkauI8ndVW5dyQhSkMjlWH7yM8tqmGrDmC860udebYoVkZnEV/PT4m39mcRUaZHK1LyAw4/MQO7YfIv1dEbM3FbO+TsSw3va4dacGtyvq0N3WHDGRPsoO5JV1jbheVIUbhVW4XlSJa4VVuFHUKtky4aKPkxBB7nZ4ekhP9HW2greLEL3sLWDC47apCQP0Y3+/zuhhZ4Fts0V46n9nMeXzv9Eok7f7ehmidHEFNWlVM5WSsGvXrin/LZVKkZqaCi8vL0ycOLHD+5w9exbh4eEAgL59+0IikaCqqqpNLdlPP/2EyMhIWFpaws7ODuXl5QAAiUQCOzvD2xZBF5KyyzBUC8XKAMDlNjVwfWZbAr79+xaef0x/i2WJ4amobcTLP5zHX9dLMC/UE/27WWPDsWsQl9dq/Y+YYg/J64X6nYT9W5SvmT+Og3vZ4eDCR7Eg7jx+v1KkPJ5fXoule1P
xxR+ZqKhtREGzPoICEy76Ogkh8rDDdJde6OsshLeLFXramcOE13EpsuK9VayONNTEZWBPW0wf0gtfn8lWHssvr0Vs/CUAMLjnAwB1jTJcL6pCeD8XXYdiVFRKwl5//fUWl2UyGRYuXHjP+5SUlMDPz0952cHBAcXFxW2SsL179+Lrr78GAMyZMwdTp07F/v37UVVVhZ07d6r0JLqyvLIa5JfXYr4Wk6FQL0c85u2ETSeuY5qoB/WLIWpxs7gKz32bjNzSGnwwZQCeCu4FAJgS1EMn8Xg6WsLEAFZIZoglMDXhwlMDOwcomPF5yjYYzcnkDJnFVRg3oBu8XKzgpUi27C3Ae8CaUX3c3+9BHE0vbHOstlGG9UevGuTzu3K7EjI5o/YUaqbS6sja2toW/+Xn5+PmzXtvY8MYa3O59UhNSkoKevfurUzMvvrqK4wZMwZHjhzBqlWr8MEHH3TmuXRJmuwPdi/Lo3xRWS/FllOZWn1cYpxOXy/BxM1nUF7TiO/nDVUmYLokMOHCw9FS71dIposl8O1mfc8RJnXoqDheKmP4ODoQC0b2xWg/V3jcbe/R1el6YYm6pSs75evvqLAhUmkkbNy4ceBwOMpEysrKCnPnzr3nfVxcXFBS8u8y3aKiIjg6tmwAeOrUKTzyyCPKy+fPn8eiRYsAACEhIXjnnXdUfiJdVVJ2KazNTODTzqa9mtS/uzUmB/bAjjPZmDnMHT3tNV+PRozTt2ez8c6vGejjZIlts4P16mfJy1mIjAKJrsPoEGMMGQUSjB3QTeOP1d3WvN3Vi4ZSNK9txvR67U/Jx3sHLwMAntp6FsuifA1yNE8fqfTV6cSJEzh06BBOnDiB33//Hd999x2mTZt2z/uEhITg6NGjAICMjAw4Ozu3mYq8dOkSfH19lZfd3d2RmpoKALh48SLc3d079WS6ooSsUgR72Ovkm+fS0d7gcICPfqPtjEjnNcrkeHP/Jbz1czoe93bCvpeG61UCBjQV5+eU1ujtvoDiijpU1DZqpat/TKQPzFutEjW0onltau/14nKAJRHeOorowSgWS9Q0/NtANzb+Evan5Os4MuOgUhL27bff4tVXX1VejomJwbfffnvP+wwePBh+fn6Ijo7GqlWrsHLlSsTHx+PYsWPK2xQXF8PB4d+2Ci+88AJOnTqFmTNn4pNPPkFsbGxnn0+XUlxZj5vF1Rpt0nov3W3NMTfUE/sviJX9YwhRRVl1A2ZtS8T3/+TgxRF98L9ZIrX3oVIHbxchGANuFOnnlKSmi/KbmxjohvcnD4CbrTk4ANxszfH+5AE0ItKB1q+XrQUfcgbcumMY+5EqtNdAV1HbRh6eStORhw4dwg8//KC8/Pnnn+Ppp5/GrFmz7nm/5r3AALQY9QKAX3/9tcVlZ2dnfPnll6qERNA0FQkAwTpKwgDgpcf7YFdiDtYcuoy454ZqZYUmMWw3iiox75tkFJTX4aNpA3VWeK8K5QrJokr4u+lfLUy6uAIcDtCvm3bKEYylaF5bWr9eMXtTsenkDYg87B+o55ouGHIDXUOg0kiYVCqFRPJvXURxcbHGAiKqS8wqhTmfp9PuxdZmfLwS5oW/M+9AtPo4PJcfRMjaEzRUTdp16moRJm3+G9X1UuycP0yvEzAA8HBQrJDU35EwT0dLWAi0u8cjeTDv/scf3s5WWLz7Agoldfe/g47VNcog6GDBhyHWtukjlX5zFy9ejKeeegqmpqaQy+WQy+V46623NB0buY/ErFIMdreFwESzq6Lux9rMBBwAd6obABh+PxyifowxfH0mG+8dzICPqzW+mi2CmwF8iAvutn64rqdtKjIKJBjU01bXYRAVmQt42DxjMCZ8dhqv/JCCH54fqvFVrQ9KLmdY9uNFNMjk4PM4aJT92/GAagHVR6UkLCQkBL/88guqq6vB5XLB4/FgZWVce2IZmoraRly+LcGiUbov8tx4/DpYq2OG3A+HqFeDVI7/25+G3cm5iPRzwYYnB8H
S1HBGbrxdrHBJD2seK2obkVdWi+lDdd/Og6iur7MQ703yx+Ldqdhw7BqWRfne/046sPH4NfySKsayKB90tzE3+Aa6+kqlT8JvvvkGZ8+exRdffAEAePHFFzF8+PD71oQRzTl/qwyMAcGeut9VwNj64RD1uVNVj5e+P4/E7FIsDOuLReHeBrfxe19nIQ6lFaC2QQZzgf5s03W5QHtF+US9JgX2QMLNUmw5lYlgT3uM9HHWdUgt7EnOxaYTNxAd3BMvjegDDodDSZeGqDQOevjwYWzZskV5+fPPP8ehQ4c0FhS5v4SsUvB5HAT21H0S1lFtANUMdG1Xbkvwn81nkJpXjk+fDsSS0T4Gl4ABTSNh7O4ekvok/e7KSGqeaZjenuAHX1crLNl9Qa++sP59owQr4i8htK8jVk30p8VWGkaF+QYqMesOAnrY6sU38/b64fA4HLw2WvdTpUQ3jmcUYsqWv9EglWPPC49gwsDuug7pgXm7NPU31LftizLEEjhZmcLJylTXoZAHYMbnYcuMwWiQyvHKzhQ0yuS6Dgk3iirxwvfn0NvJElueGQy+ntarGROVpiOXLFmiLMxnjEEmk2HevHmajo10oLZBhot5FXjuUf3YPLv1prtWZiaQ1ElR26j7DxWiXYwxbP3zJj44cgX+3W3w5SwRXG3MdB3WQ/FwtASfp38rJDMKJDQVaeB6Ownx/pQALNyZgg9/u4rYMf10FktxZT3mbE+CqQkPX88JhrUe9u0zRiolYVZWVvDx8UFmZia4XC6sra3xxRdfYMqUKZqOj7QjJbcMUjnDUB32B2uteT8cuZxh9vZEvP1rOgb2tKHpki6irlGGFfGXEJ+Sj/EB3bB+6kC9GKl9WHye/q2QbJDKcaOoEo/7GEavKdKxCQO7I+HmHWz94yaGeNhjVD8XrcdQ2yDDc98mo6SqHrvnP4Iedvq1c4UxU2mscfXq1Zg+fTosLCzw/vvvY+jQoVixYoWmYyMdSMwqBYcDBHnovh6sPVwuBx8/NQh2FnwsiDuPyrpGXYdENGB/Sj5C1p6A5/KDGLbmd0R+/CfiU/KxJMIbm54ONIoETMHLxQrXivQnCbteVIlGGYOfFrYrIpr3f+P7o383ayzdm9phc1RNkcsZluy5gIt55fgkOhADqeWJVqmUhJmZmWHYsGHg8/nw9/fH4sWL8f3332s6NtKBxKxS9O9mrdfDxQ5CU2x6ejByy2qxPP4SGGvdxIIYMsV+cvnltWAAbkvqcOtODZ4d7oGFo7yMrpjX29kKuaW1qGmQ6joUAP8W5dN0pHFQ1IdJZQwv/3AeDVLtlXJ8cOQKDqfdxhtj+yHSz1Vrj0uaqJSEmZub4/fff0ePHj2wYcMG7N27FwUFBZqOjbSjQSrH+ZwyBHvoz1RkR4Z42mPpaG8cvFiA7/+5petwiBq1t58cAPyWUaiDaDTP625xvr7sIZkhlsBCwIO7g6WuQyFq4uFoiQ+mBCAlpxzrjlzRymPGJdzC1j9vYuYwd8wL9dTKY5KWVErCPvzwQ/Tp0wdvvfUWBAIBrl69ig8++EDTsZF2pIkrUNco16t6sHt58bE+GOnjhFUHLtMm30agqLIO357N7nL7ySlWSF7Xk+L8jAIJfF2twDPAlh+kY+MCumHWI+746nQWfku/rdHHOnW1CG/9nI6RPk5Y+UR/oxu9NhQqFeYLhUIIhU0fQi+//LJGAyL3lpil+027O4PL5eCjJwdh3Kd/4b9x53FgYaheT6OStookdTicdhsHLxUgKbsUjAEmXA6k8rZTzMbaG87d4e4KST2oC2OM4bJYgv8EGm7bD9KxN8b1Q0pOOV7bm4qD3azR0179RfKXCyR4+YcU+LhYYdP0wXq7dVJXQK+8gUnMKkVvJ0s4Cg2nN5C9pQCfTQ+EuLwWr/94kerDDEChpA47zmThyS/OYuj7v2PlL+koq27AwjAv/Lb4MXw4bWCb3nDGvJ8cn8dFb0ehXoyE5ZbWorJeSquOjZS
pCQ+bpw8GAzRSH1YoqcPcHUkQmprg6znBEBrQFmLGiF59AyKTMyRll2J8QDddh9JpQe72WBblgzWHruCbv7MxJ4TqD/TN7Yo6HE4rwKFLBUi+uy2Wt4sQr47ywrgB3eDl8u9+sd53/92V9pPzchHiQm65rsNARkHTtD4V5RuvXg4WWD81AC9+f9w0mh0AACAASURBVB7vH76MlU/4qeW81fVSzN2RBEltI/a8+IjB9/AzBpSEGZCrtytRWSfFEAOZimztudDeSLhZivcOXUZgLztaCq0HCipqcfjSbWXiBQA+LlZYNMob4wJc0dfZqsP7Nu8N1xV4u1jhwMUCVNdLdboBeYZYAi4H8HHt+L0hhi/KvxvmDPfA9jPZGOppjyj/h/vyLZMzvLorBZcLJNg2O5hGUvUEJWEGJDHrDgBgiKeDjiN5ME31YQMx7tPTWPDDeRx85VHYWFB9mLaJy2tx6FLTiNf5nKaRHV9XKyyJ8MbYAd3Q11mo4wj1k3ezFZK6/AKRUSBBHychzPjG04eNtG/F2H5IySlDzI8X0b+bDXo5PHh92KoDGTh+uQir/uOHkb76tWF4V0ZJmAFJyi6Dm6053Ay4+NnWQoBN0wPx5BdnEfNjKrbODKJVORqwPyW/xVThc6GekDGGg5cKkNIs8Voa4Y2xAd3Qx4kSr/tRTMdeK6zUbRImlhjMwhzycAQmXHw2fTDGffoXFvxwHj++9AhMTTqffG8/k4Udf2djXqgnZj7iof5AyQOjJMxAMMaQkFWKR70cdR3KQxvcyw7Lx/hi9cHL+PpMNvWnUTNFI1VFH6/88lq8cyADQFMdUUykD8b4u6I3JV6d4m5vAQGPq9NeYWXVDRBX1FGn/C6kp70FPpw2EPO/O4f3Dl7Gu//x79T9j2UU4t0DGRjd3wUrxupub0rSPkrCDERWSTVKquoNth6stXmhnkjMKsX7hy5jcC9bBPbSzy2YDNG6I1fabaTqYmWKQ68+qoOIjIMJj4veTpa4psM9JDMKFJ3yqZ6nKxnt54rnQj3x1eksDPG0x/gA1dqTXMqrwMKdKRjgZoOPowdRXzk9RC0qDERS9t3+YAbQKV8VHA4H66cOhKuNGV7+IQXlNQ26Dsko5JXVQFxR1+51RZX1Wo7G+Hi5WOGaDttUZNzdrqhfNyrK72peH+OLwF62WL7vErJLqu97+/zyWsz9Jgn2lgJ8NVsECwGNuegjSsIMREJWKRwsBejjZDzblNhY8LF5+mAUVdbhtb2p1D/sITDGEJdwC5Eb/0RH33WNtZGqNnk7C5FfXovqet3sIZlRIIGrtRkcDKhPIFEPPq+pPozH5eC/cedR185ot0JlXSPm7UhCXYMMX88JhrMVtaLQV5SEGYjErFIM8bQ3uiL2gT1tsWJsPxy/XIQv/7qp63AMUm5pDWZ8lYA3fkrDoF62eHN8vy7VSFWbFMX513VUF5YhllA9WBfmZmuODU8OREaBBKvu1nm21iiTY8EPKbhRVIUtzwymViZ6jpIwAyAur0VeWa3R1IO1Nme4B8b4u+KDI1dx7laprsMxGHI5w3dnsxH58Z+4mFeBNZMG4Pt5QzEvtDfenzwAbrbm4KDpg/v9yQO6VE8vTVG0qdBFXVhdoww3iqvQn5KwLm1UPxe88FhvxCXk4OcL+S2uY4xh5S/p+PNaMVZP9MejXk46ipKoiiaJDYCx1YO1xuFw8MHUAKR/ehqv/JCCgwsfhZ2lQNdh6bWcOzVYti8V/9xsWjG7dkpAi9YlXa2Rqrb0sreAwISL6zpIwq4VVkImZ9Qpn+C1SB8k3yrDivhL8HezUbaY+d+fN/FDQg5eerwPoof00nGURBWUhBmAhKxSWJmaoJ8Rf/ham/GxZcZgTN7yN5bsuYBts4PBpZU8bcjlDN/9cwtrD1+BCZeDD6YMwJOinkY3Ta2vTHhc9Ha01Ml0pKIon0bCSFN9WCDGfvIXZn6VAADKBTmDetoiZjSVHhgKmo40AIlZpRB
52Bn98mJ/Nxv83/h+OHm1GFv/pPqw1m7dqUb0/7d352FR1nv/wN83w86M7DMguLMl4gJYmqWmkk+7Ghx5TvI79djplC3nZ6lF2bFTHi217TLzabPFY78MQovsCU+LLc8h1BERQREUDRiEYR12mOH+/YGMjiyCMtyzvF/X1XV13/csn+F2Zj7zvT/fz/e937DuqzxcP84HGStnY+n00UzAhlmYSiHJQt755TrIXRwxyvvqu6aT7Qj0dENC7Cho6ltNZkSfLNfhqxyNhJHRYDAJs3DVjW0oqmy0mw7Zy2aMwR1Rgdiyv8B4GdbedXaK+PB/i/Efb/yCE+U6bIqfjI8emM7ZjhIJU3XNkGwc5hmS+RodJgaO4AgxGe07Vt5jX6u+E5szCiSIhq4GkzALd+hs16LKN9hJEiYIAl6+NwqjvN3w2KdHUN1o372tiquakPjub/h7ej5mjPfB/pWzeflRYsYZksNYF9bZKeJEuY6XIsmEpq5lUPvJ8pg1CduwYQOWLl2KxMREHDt2zLi/oqICSUlJxv/mzp2L9PR0AMAHH3yAe+65B/feey9yc3PNGZ5VOFhcAxdHB0QFSbdW3XBTuDph233RqG3uwMrPc9DZaX/9wwydIt7/5Qxue/NnnDyvw6sJU7Dj/ukI9OTol9TCjEnY8F2SPFfTjKZ2A4vyyURfo+EcJbceZivMP3jwIM6dO4fdu3ejqKgIycnJSElJAQCoVCrs3LkTAKDX65GUlIR58+ahsLAQ+/btwxdffIGCggJ8//33iIqKMleIVuHg2WpEj/aGs6N9DVpGjvTEursm4rk9x7H9p9N49JYQqUMaNme0jVidegzqc7WYH6HEhiVRUI1gs0VLMdrHHS6ODsPapoJF+dSb1QvDTdaJBdgT0NqYLQnLzMzEggULAAAhISHQ6XRobGyEXG66aPCePXuwcOFCeHh44Mcff8Rtt90GR0dHREZGIjIy0lzhWYWG1g7ka3R4bF6o1KFI4o/Xj0bWmRq8ur8AMWO8MWO8r9QhmZWhU8SOX4uxZX8BXJ1keH3pFCyaGsRLjxZG5iBggr8cp4ZxhmR+eT0cHQSEqrjoOl3U3YZmc0YBNHUtGOnlhtULw9mexoqYLQmrqqoySaJ8fX2h1Wp7JGEpKSnYsWMHAKCsrAzu7u549NFH0djYiOTkZERERJgrRIunPleLTtF+6sEuJwgCNiyJwvGyrkVo9z1xM/wVtrlcS1FlI1an5iD79zosuE6FDYsnQcnRL4sVqpLjUPHwTRzJ0+gQopTDxVF25RuTXWFPQOtmtiTs8nUARVHs8Ys+Ozsb48ePNyZmoiiipaUFb731FtRqNZ577jl88cUX5grR4h0sroGjg4Bpo+2nHuxychdHbLsvGou2/S/ue/83NLbqUV7fatW/+PZmlxl/uQZ6uiJ2rA++zTsPd2cZ3kycirunjOTol4ULUynw5VENGlo7oHB1Mvvz5Wt0uCnUz+zPQ0TDy2xJmEqlQlVVlXG7srISfn6mHyIHDhzAzJkzjdt+fn4YP348BEFAbGwsyspMl2SwN4fO1mBSkCfcne27p+51gSOwaGoQdh8uMe4rq2tBclrXxA1rSsT2ZpeZ1HBo6lvxVY4Gk4M88f79sVxo10qEKrt+OBZWNiJ6tLdZn0vb0IbKhjYW5RPZILNVe8+aNQsZGRkAgPz8fCiVyh6XInNzc00uN86ePRu//PILAOD06dMIDAw0V3gWr7XDgJySeru9FHm5Xwq1Pfa1dBisrh/O5owCkyLablWNbUzArEjYMLapOFHOonwiW2W2IZbo6GhERkYiMTERgiBg3bp1SEtLg0KhQFxcHABAq9XC1/disfXUqVPxyy+/ICkpCe3t7fjb3/5mrvAs3tGSOrQbOm120e7BKr+kI/SlrK0fTl/x9vX6yDKNMs6QNH9xfv6FJCwy0NPsz0VEw8us17lWrVplsn15kX13b7BLPf744+YMyWocLK6BIACxY5iEAV19b8p
6SWCsrR+OrbwOeydzEBCilA9Lm4o8jQ5BXm7wdDd/7RkRDS/7aj5lRQ6drUG4SsEP3gtWLwyHm5PpzDBr7IezemE4Ll91xhpfBw3fGpL5mnpeiiSyUUzCLFCHoRPqc7WsB7vEomlB2LgkCkEXRoxkDgI2LJ5kVUX5ADB1lBc6RWCEqyMEAEFebti4JMrqXgcBIUo5zutaoWvtMNtzNLfrcaaqiUX5RDbKvqfdWag8jQ7N7QZcP862m5MOVnc/nC/UpXgqJQdB3u5ShzRoXxwphYMA7F85BwGeLMS3ZpcuXxQzxjwzJAvON0AUgUiOhBHZJI6EWaDuJpDTx5l36ru1ui0qAB7OMqSqS658YwvS2SniC3Upbg71ZwJmA8IudK835wzJfM6MJLJpTMIsUFZxDcb5ebBlQR/cnR1xe1Qg9h0rR3O7XupwBizzTDU09a2IjwmWOhQaAqO83eHqZN4ZknkaHUa4OhovwxORbWESZmE6O0UcOluD68eyHqw/CbGj0NRuwP/knpc6lAFLOVyCEa6OiJuokjoUGgIOF2ZIFlaacSRMo8PEkSO4ggKRjWISZmFOVTagvqWD/cGuYPpYb4zxdUequlTqUAZE19qBb/PO4+6pI+HqxPX/bEWYUmG2NhWGThEnz+swkf3BiGwWkzAL010PxiSsf4IgID46GJlnqlFS0yx1OFe071g5Wjs6kRAzSupQaAiFqhSo0LWhvmXoZ0gWVzWhtaOTRflENoxJmIXJKq5BoKcrgr1ZA3IlS2KCIQhdMw4tXaq6FKFKOSYHc1TDlnSvIVlkhkuSeZp6ACzKJ7JlTMIsiCiKOFhcg+vH+bAGZACCvNwwa4IfUtWl6OwUpQ6nT6e1jVCfq0V8TDDPq43pblNhjuL8/HIdnGUOmOAvv/KNicgqMQmzIL/XNKOyoY2XIgchPiYYpbUtyLpwGdcSfaEuhcxBwGI2ZLU5wd5ucHOSmaUuLF+jQ6hKDmdHfkwT2Sq+uy1IdyLBmZEDtzAyAAoXR4st0Dd0ikg7UoY5Yf5QjmDLEVtjnCE5xCNhoih2zYxkp3wim8YkzIIcLK6Bj4czQpS8/DBQbs4y3DklEN/klqOxzfJ6hv1aVIXzulYksDeYzQpVDf1C3tqGNlQ3tbMon8jGMQmzIAeLazB9rDfrhgYpPiYYLR0GfJNbLnUoPaSqS+Hl7oR51ymlDoXMJEylQGVDG+qbh26GZJ6mu1M+J3IQ2TImYRbifH0rfq9pxnReihy06NHeGO/ngdTDlnVJsr65Axl557FoahBcHNkbzFZ1L190aghnSHYvVxQRqBiyxyQiy8MkzEIcPNtVD3YDF+0eNEEQcG9MMA6ercHZqiapwzFKP6ZBu76TyxTZuFDlxYW8h0q+RofRPu4Y4eo0ZI9JRJaHSZiFOFhcDbmLI67jL9+rcm90MBwEIM2CeoalqEsREaBgXY+NC/Ia+hmS+eU6/rshsgNMwizEoeJaxIzxhqOMp+RqBHi64qZQf3xxpMwieoYVVjQgp6SOvcHsgIODgFDV0K0h2dimx9nqJs6MJLID/Ma3ALVN7SioaGB/sGuUEBOMsroWZJ6pljoUpKpL4eggYBF7g9mFUKViyBq2nizXQRTZKZ/IHjAJswCHznK9yKEQN1EFhasjUg6XSBqH3tCJtOwy3BKhhJ/cRdJYaHiEqeTQNrShrrn9mh+ruyifSRiR7WMSJrG92WV48vMcAMBf/1829maXSRyR9XJ1kuHuKSPxbd556FqHfkHlgfq5UAttQxsL8u3IUC5flK/RwdvdCQFs7ktk85iESWhvdhmS03KNTUY19a1ITstlInYNEmJHobWjE/uOSdczLFVdCl8PZ8yLYG8wexHa3aZiCIrzu4ryPVlLSGQHmIRJaHNGAVo6DCb7WjoM2JxRIFFE1m9KsCdClHLJljGqbWrHd/mVuGdqEJw4ycJuBHm5wcNZhqLKaxsJ6zB04uT5Bl6KJLIT/JaQUFldS6/7NX3
spysTBAEJMcFQn6vFGe3Qruc3EF/laNBuYG8weyMIXWtIXutI2BltE9r1nZwZSWQnmIRJRBRFuDv33kV9pJfbMEdjWxZPC4LMQZBkNCxVXYrIkSM4kmGHQlXXPkMyv7weAIvyiewFkzCJvPvzGTS3G+DoYFr34eYkw+qF4RJFZRuUI1wxJ8wfaUfKYBjGnmEnz+uQW1bPUTA7FaaSo6qxDbVNVz9DMl+jg4ujA8b7eQxhZERkqZiESeCb3HJs/J+TuHNyIDbFT0aQlxsEdNWVbFwSxd5SQyA+Jhjnda34tahq2J4z9XApnGQC7pnK82ePQo0zJK/+kmR+uQ4RAQo2bSayE45SB2Bvjvxei5W7jyJmjDe2JEyBq5MMS6I5cjLU5l+nhJe7E1LVpZgT5m/25+swdGLv0TLMj1DBx8PZ7M9HlsfYpqKyETeMH/wasKIoIk+jw22TAoY6NCKyUGb9ubVhwwYsXboUiYmJOHbsmHF/RUUFkpKSjP/NnTsX6enpxuNVVVWYPn06srKyzBnesCupacafPz6MAE9XvJsUA1en3mvC6Nq5OMpwz5SRyMg7j/pm8/cMO1CgRVVjOxJimVDbq5GerpC7OKLwKkfCyutbUdfcwaJ8IjtitpGwgwcP4ty5c9i9ezeKioqQnJyMlJQUAIBKpcLOnTsBAHq9HklJSZg3b57xvps2bcKoUaPMFZok6ps7cP+HB6HvFLHj/unwZSd1s4uPGYWPM88h/ZgGy2aMMetzpapL4Cd3wexhGHUjy9Q9Q7LwKovz8zXslE9kb8w2EpaZmYkFCxYAAEJCQqDT6dDY2PPDac+ePVi4cCE8PDyM9/Pw8EBYWJi5Qht27fpOPPxPNX6vacY7STGY4C+XOiS7MCloBCICFGafJVnd2IbvT1Ri8bSR7A1m50KVV7+Qd365DoIARAQwCSOyF2b7xqiqqoK3t7dx29fXF1qttsftUlJSEB8fDwBob2/Htm3bsHLlSnOFNexEUURyWi4yz1RjU/xkzLiKWhG6OoIgID4mGEdL6lB0lV+MA/HlUQ30nSLiY2xr9JYGL0ylQFVjO2quYoZkvkaHcb4e8HBhqS6RvTBbEiaKYo/ty5fhyM7Oxvjx4yGXd40Mvfvuu0hISMCIEbbzS/CtH4rwxZFS/N8FoVg8jfVCw23RtCA4OghIMeNoWIq6FJODPREeoDDbc5B1uJbli/LK63EdL0US2RWzJWEqlQpVVRfbA1RWVsLPz8/kNgcOHMDMmTON27/++it27dqFP/zhDzhw4AD+/ve/o7Cw0Fwhmt2XR8vw6r9OYcm0IPx1fqjU4dglP7kL5oYrkXakDHpD55A/fp6mHifKdUhgbzDCxRmSgy3Or2/pQElNC4vyieyM2ZKwWbNmISMjAwCQn58PpVJpHPHqlpubi4iICOP2Z599hs8//xyff/455s6di3Xr1iE01DqTl4PFNVidcgw3jPPBxnujuBivhBJig6FtaMMvhUPfMyzlcCmcZQ64a8rIIX9ssj6Bnq5QuDgOunP+yXIW5RPZI7MVH0RHRyMyMhKJiYkQBAHr1q1DWloaFAoF4uLiAABarRa+vrZXI1Vc1YSHdh5GsI8b3kmKgYsjW1FI6ZZwJXw8nJGiLsEtEcohe9x2fSe+PFqGuIkqeLmzNxhdmCGpGvwakvkXkrBIJmFEdsWsFaCrVq0y2b501AuASW+wy7388stmicncapra8cCHB+EgCPjw/un8crYAzo4OWDQ1CP/87Rxqm9rhPUTNVH84WYHa5g7EszcYXSJMqcB3JyoGdZ98jQ5+chcoFa5mioqILBHn0w+h1g4DHvrkMDT1rXjv/8RgjC/Xf7MU8THBaDd0Iv2YZsgeM1VdCqXCBTeH+F35xmQ3QlVyVDe1o7qxbcD3ydPoeCmSyA4xCRsioihiTeoxHD5Xi9f+MAUxY3ykDokuMXHkCEwMHIGUw0MzS1Lb0IYfC7RYEh3Mdf7IxMU1JAdWF9au70R
hZQOL8onsEL89hshr/zqFr3I0WPMf4bhzMou0LVFCbDByy+px8rzumh9rb3YZDJ0i4jkrki4TdqFNxUCbthZVNqLDILIejMgOMQkbAimHS7D1hyIsjR2FR+ZMkDoc6sM9U4PgJBOQeo2jYaIoIlVdimmjvRCi5OoHZCpgRPcMyYElYfmcGUlkt5iEXaN/F1UhOS0XN4X4Yf3iSWxFYcF8PJwxP0KFvUfL0HENPcNyy+pRUNHAUTDqlSAICFXJB3w5Mk9TDzcnGcayhpTI7jAJuwZFlQ34yz/VGOfngbeXRXPdQCsQHxOMqsZ2/FTQcwmtgUpVl8LF0YGXnalPYSoFCisaeqwc0pt8jQ4RgQrIHPgDjsjeMGu4StqGNtz/4SG4OMrw4QPTMcLVSeqQaADmhPvDT+6CFHXJVd2/tcOAL49qsDAyAJ5uPOfUu1CVArXNHai+whqSoigiv1zHonwiO8Uk7Cq0tBvw4CeHUdXYhg/+FItgb3epQ6IBcpI5YPG0kfj+ROWgWgh0+/5EJepbOngpkvoVqhzYGpKltS1oaNUjcqTncIRFRBaGSdggdXaKePLzozhWWoc3E6dhyigvqUOiQYqPGQV9p4gvjw6+Z1iqugSBnq6Yxd5g1I+La0j2XxfGonwi+8YkbJBe+fYk/uf4eTx3+3VYGBkgdTh0FcIDFJgc7IlU9eBmSVboWvHTKS2WRAexfof6pRrhAoXrlWdI5ml0cBCA8AtJGxHZFyZhg7Ar6xze+fkMkmaMwfKbxkkdDl2D+Jhg5JfrkKepH/B99mSXoVPsGkkj6o8gCBeK868wEqbRYby/HG7OXF+WyB4xCRugAwWV+NuXebgl3B/r7prIVhRW7u4pI+EscxjwaJgoikg5XILYMd4Y58dWAnRlYSo5TlX2P0PyRLmOTVqJ7BiTsH7szS7DrJd/wNhn9uGBDw9BNcIFW/8YzWVqbICXuzPiJqrw5VEN2vVX7hl2tKQOp7VNLMinAQtVKlDX3AFtHxNA6prbUVbXwpmRRHaM2UQf9maXITktF2V1LQAAEUBNYzu+y6+QNjAaMvGxwahpascPJyuveNsUdSlcnRxwx+TAYYiMbEF3cX5RH5ckWZRPREzC+rA5owAtHQaTfa36TmzOKJAoIhpqN4f4QalwueIlydYOA9JzNLhtUiAU7AdHA9S9hmRfxfn5mq4k7DqOhBHZLSZhfdBcGAEb6H6yPo4yByyJDsaPBZXQNvTdMywj7zwaWvVI4KVIGgR/hQtGuDriVGUfI2EaHVQjXOAndxnmyIjIUjAJ68NIL7dB7SfrFB8TBEOniC+PlvV5m1R1KYK83DBjvO8wRkbW7uIMyT5Gwsp1bNJKZOeYhPVh9cJwuDmZTht3c5Jh9cJwiSIicwhRKjB1lBdSDpf2OoutvL4FvxZV4d6YYDiwNxgNUqhKgVMVjT3+bbV2GFBU2ciifCI7xySsD4umBWHjkigEeblBABDk5YaNS6KwaFqQ1KHREEuIDUZBRQOOl+l6HEs7UgZRBO6N5nmnwQtTyVHf0tHjcndhRSP0nSKL8onsnKPUAViyRdOCmHTZgTsnj8SL6flIVZcgKvji5SFRFJGqLsX143wwxpe9wWjwumdInqpohHKEq3F/fnlXk2COhBHZN46Ekd3zdHPCwsgAfJmjQZv+4oxY9blaFFc1sSCfrlrohRmShZWmdWH5Gh3kLo4Y7eMuRVhEZCGYhBGhaxmjuuYOfH/iYs+wVHUp3J1luD2KvcHo6vjLXeDl7oRTl/UKyy/X4bpABesMiewckzAiALNC/BDo6YqUwyUAgOZ2Pb4+Vo7bowLh4cKr9nR1BEFAqFJuMkOys1PEifIGXookIiZhRAAgcxCwJDoIP53SolLXioy882hs03OZIrpmXTMkL64h+XtNMxrb9CzKJyImYUTd4mNGoVME0rLLkKouxWgfd1w/1kfqsMjKhSnl0LXqUXlhhqRxuaJA9ggjsndMwoguGOfngXF
+Htj8bQH+t6gatc3t+CpHI3VYZOUuzpDsuiSZr9HB0UEwFu0Tkf1iEkZ0wd7sMpTWNsNw4bJRQ6seyWm52Jvddzd9oisJvaRNBdA1EhailMP1smbQRGR/mIQRXbA5owAdBtPO5i0dBi7aTtfET+4Mb3cnFFVeHAljUT4RAUzCiIy4aDuZgyAIxuWLqhvbcF7XyqJ8IgJg5o75GzZsQE5ODgRBwLPPPovJkycDACoqKrBq1Srj7UpKSvDUU0/htttuw3PPPYeSkhLo9XqsWbMGsbGx5gyRyGiklxvKekm4uGg7XatQpRxf5WiQp+kuymcSRkRmTMIOHjyIc+fOYffu3SgqKkJycjJSUlIAACqVCjt37gQA6PV6JCUlYd68efjyyy/h5uaGTz/9FIWFhUhOTkZqaqq5QiQysXphOJLTctHScbFrPhdtp6EQplKgoVWPHwu6mgFzJIyIADMmYZmZmViwYAEAICQkBDqdDo2NjZDLTWcE7dmzBwsXLoSHhwfuvvtu3HnnnQAAHx8f1NXVmSs8oh661wndnFEATV0LRnq5YfXCcK4fSteseyZkek45grzc4OXuLHFERGQJzJaEVVVVITIy0rjt6+sLrVbbIwlLSUnBjh07AABOTk7G/R9//LExISMaLly0ncyhu01FVWMbFlynkjgaIrIUZivM7+4Ofem2IJiuk5adnY3x48f3SMx27dqFvLw8PProo+YKj4ho2PxaWIXuZSKzzlSz7QkRATBjEqZSqVBVVWXcrqyshJ+fn8ltDhw4gJkzZ5rsS0lJwQ8//IC3337bZGSMiMga7c0uQ3JaLjov/C5taGP/OSLqYrYkbNasWcjIyAAA5OfnQ6lU9hjxys3NRUREhHG7pKQEn332Gd566y24uLiYKzQiomGzOaPAZLIHwP5zRNTFbDVh0dHRiIyMRGJiIgRBwLp165CWlgaFQoG4uDgAgFarha+vr/E+KSkpqKurw0MPPWTc98EHH8DZmUWsRGSd2H+OiPpi1j5hl/YCA2Ay6gUA6enpJttPPvkknnzySXOGREQ0rNh/joj6wo75RERmtHphONwuWyeS/eeICDDzSBgRkb1j/zki6guTMCIiM2P/OSLqDS9HJvvmPAAADZpJREFUEhEREUmASRgRERGRBJiEEREREUmASRgRERGRBJiEEREREUmASRgRERGRBJiEEREREUmASRgRERGRBJiEEREREUnAqjrmGwwGAMD58+cljoSIiIiof935Snf+cjmrSsK0Wi0A4L777pM4EiIiIqKB0Wq1GDNmTI/9giiKogTxXJXW1lYcP34c/v7+kMlkUodDRERE1CeDwQCtVotJkybB1dW1x3GrSsKIiIiIbAUL84mIiIgkwCSMiIiISAJWVZhvbhs2bEBOTg4EQcCzzz6LyZMnG4/9+9//xmuvvQaZTIbZs2fj0UcflTDSobdp0yao1Wro9Xr85S9/wa233mo8tmjRIigUCuP2li1boFKppAhzSB0/fhwrVqwwFkuGhYXh+eefNx635XOekpKCr776yrh9/PhxZGdnG7dvuukmjBs3zrj90UcfWX0d5qlTp7BixQrcf//9WLZsGcrLy7FmzRoYDAb4+/tj8+bNcHZ2NrlPf58J1qS3156cnAy9Xg9HR0ds3rwZ/v7+xttf6b1hLS5/3S+99BKys7Ph4eEBAFi+fDnmzp1rch9bPedPPPEEamtrAQB1dXWYOnUqXnrpJePtMzIysGXLFgQEBAAAbrzxRjzyyCOSxH6tLv8+i4qKstz3ukiiKIpiVlaW+NBDD4miKIqFhYVifHy8yfHbbrtN1Gg0osFgEJcuXSoWFhZKEaZZZGZmig8++KAoiqJYU1Mjzpkzx+T4PffcI0FU5peVlSWuX7++z+O2fM4vlZWVJb7wwgvG7c7OTnHx4sUSRjT0mpqaxGXLlolr164Vd+7cKYqiKD7zzDPiN998I4qiKL7yyivirl27TO5zpc8Ea9Hba1+zZo2
4b98+URRF8Z///Kf4yiuvmNznSu8Na9DXOc/Pz+/zPrZ8zi/1zDPPiDk5OSb70tLSxA8//HCYIjSf3r7PLPm9zsuRF2RmZmLBggUAgJCQEOh0OjQ2NgIASkpK4OnpicDAQDg4OGDOnDnIzMyUMtwhNX36dLz55psAAE9PT7S0tJj0NGlqapIqNLPq73XZ+jm/1LZt27BixQrjdnNzc589bayVs7Mz3nvvPSiVSuO+rKwszJ8/HwAwf/78Hue3v88Ea9Lba1+3bh0WLlwIAPD29kZdXZ3JfWzhPd/b677S67Llc97tzJkzaGho6DHSYwvnHOj9+8yS3+tMwi6oqqqCt7e3cdvX19fYl0yr1cLHx8d4zM/Pz3jMFshkMri7uwPoukw1e/Zsk0tPdXV1eOqpp5CYmIjXX38doo1MqG1uboZarcaDDz6I++67D7/99pvxmK2f827Hjh1DYGCgyaWo5uZmVFdX44knnkBiYiI++eQTCSMcGo6Ojj2mh7e0tBgvSfj7+/c4v/19JliT3l67u7s7ZDIZDAYDPv30U9x1110mx/t7b1iL3l53U1MT3nrrLSQlJWHVqlU9kk9bPufdPvnkEyxbtqzH/ubmZvzrX//Cf/3Xf+GBBx7AyZMnzR2mWfT2fWbJ73XWhF1weWIhiiIEQej1GADjMVvy3XffITU1FTt27DDZv3LlStx9991wcXHBihUrsH//fuOvaGsWERGBRx99FPPnz0dxcTEeeOAB7N+/H87OznZzzlNTU7F48WKTfW5ubvjrX/+Ke+65Bx0dHVi2bBmio6MxadIkiaI0j0vPZ2/nu7/PBFtgMBiwZs0azJgxAzNnzjQ51t97w5olJiYiJCQE48aNw/bt27F161aTWjdbP+ft7e1Qq9V44YUXehybMWMGJk+ejBkzZuDw4cNYvXo10tPThz/IIXLp99ml31eW9l7nSNgFKpUKVVVVxu3Kykr4+fn1eqyiosJk5MAW/PLLL/jv//5vvPfeeyZF+ADwxz/+EXK5HE5OTpg7dy4KCgokinJoTZgwwThEPW7cOPj5+aGiogKAfZxzoOuS3LRp00z2yeVyJCQkwNnZGR4eHpg5c6bNnPNLubm5obW1FUDX+b380k1/nwm2IDk5GWPGjMFjjz3W41h/7w1rFhcXZ5xwEhcX1+Pfta2f80OHDvVZcN6dgAFAbGwsampqrLYs4fLvM0t+rzMJu2DWrFnIyMgAAOTn50OpVEIulwMAgoOD0djYiNLSUuj1evz444+YNWuWlOEOqYaGBmzatAnvvPMOvLy8TI7V1NTgz3/+Mzo6OgB0vYlDQ0OlCHPIpaamGi+1abVaVFdXG2d92vo5B7o+jDw8PHqMbhQUFODpp5+GKIrQ6/U4cuSIzZzzS914443G9/z+/ftx8803mxzv7zPB2n311VdwcnLCE0880evx/t4b1uzhhx+GRqMB0PUD5PJ/17Z8zgEgNzcXERERvR7btm2b8bWfOnUKPj4+VjkjurfvM0t+r7Nj/iW2bNmCw4cPQxAErFu3Dvn5+VAoFIiLi8OhQ4ewZcsWAMCtt96K5cuXSxzt0Nm9eze2bt1q0pLghhtuQHh4OOLi4vD+++/jm2++gbOzMyZOnIi1a9fCwcH68/f6+nqsWrUKzc3NaG9vx2OPPYbq6mq7OOdAVxuCN954A++//z4A4N1338X06dMxbdo0bNy4EWq1Gg4ODrjlllusdqp6t+PHj+OVV15BWVkZHB0doVKpsGXLFjzzzDNoa2vDyJEjsXHjRjg5OWHlypXYuHEjXF1de3wm9PUFZsl6e+3V1dVwcXExftFMmDABL7zwgvG1t7W19XhvzJkzR+JXMji9ve7//M//xAcffAB3d3e4ublh48aN8PX1tYtzvnXrVmzduhUxMTG4/fbbjbd95JFHsH37dpSWliI5Odn448ta23P09n328ssvY+3atRb5XmcSRkRERCQB6x/OICIiIrJCTMKIiIiIJMAkjIi
IiEgCTMKIiIiIJMAkjIiIiEgCTMKIyOqUlpZiyZIlZnnsmpoa3HHHHXj11VfN8vjdrL3tBxFdOyZhRESXOH36NMaMGYOnnnrKrM+zfft2sz4+EVk+9gkjomGRlpYGtVqNmpoaFBcXY/ny5UhISMC8efOQnp4ODw8PvPLKK8Yu5ocOHUJtbS0KCwuxcuVKfP311zh9+jS2bNkCX19frFixApGRkThz5gzCw8Px4osvoqKiAmvXrkV7eztkMhnWr1+PkSNH4tZbb8XEiRMxa9YsJCQkGGP65ptv8NFHH0EmkyEyMhJr167FkiVLoNFokJCQYJKI7d27Fx988AECAgLg4eGB2bNnAwAKCwvx9NNPo6mpCXfddRd++OEHHD58GK+99hocHR0RGBiIl156CdnZ2dixYweam5vx9NNPY/ny5cjKykJRURFefPFFCIIADw8PvPzyy3Bzc8Pq1auh1WrR3t6Oxx9/3Ph8RGQ7uIA3EQ2bU6dO4bPPPsPZs2fx5JNPmiRElzt79iw+/fRTpKSk4J133sHevXuRlpaGr7/+Gn/6059w+vRpvPPOOwgICEB8fDwKCgrw8ccf44EHHsCNN96In376CW+//TbWr1+PkpISbNu2zWSZmqamJrz++uvYu3cvPDw88PDDD+O3337D008/jV27dpkkYKIo4o033kBaWhoUCgWWLFnSb1K0fv16fPTRR/Dy8sKmTZvw7bffQqVS4dSpU8jIyDBZKuqll17Ciy++iLFjx2LXrl3YtWsXZs+ejdraWuzatQs6nQ4//fTTNf7licgSMQkjomEzdepUyGQyBAQEoKGhod/bTpo0CYIgwN/fH+Hh4ZDJZPDz88ORI0cAAKNHj0ZgYCAAICoqCsXFxcjOzkZxcTG2b98Og8EAHx8fAF2LdV++TuDZs2cxZswYeHh4AACio6Nx4sQJTJw4sUcstbW1kMvlxse7fNHzS1VVVeHcuXN4/PHHAQDNzc3w9vaGSqVCeHh4j7U6jx07hueffx4A0N7ejqioKIwfPx5NTU1YvXo14uLicMcdd/T7tyIi68QkjIiGjaNj/x853QvFX37bS/+/u4JCEAST+wqCACcnJ7z55ptQKpUmx5ycnHo8lyAIuLQaQxTFHo95+e37i0Gv1xufS6lUYufOnSb3z8rK6pGAAV0J4ieffNLjuT///HMcOXIEe/bswY8//oiNGzf2GRsRWScW5hORpORyObRaLQwGA3JycgZ8v99//x2VlZUQRRG5ubmYMGECpkyZgu+++w4AkJmZifT09D7vP3bsWJw7dw6NjY0AgIMHD2LSpEm93tbb2xs6nQ51dXXQ6/U4evSoMfbKykoAgFqtBgB4enoCAIqKigAAO3fuxMmTJ/uMIyIiAj///DMAYN++fcjMzEReXh7S09MRGxuLF154AadPnx7w34WIrAdHwohIUsuWLcPDDz+McePGISQkZMD3i4iIwOuvv46ioiJMmzYNISEheOyxx/Dss89i3759EASh39Ejd3d3rFmzBg8++CAcHBwQExOD2NhYZGVl9bitIAh4/PHHkZSUBJVKhbFjxwIAZs6cie3btyMpKQlz5swxjmb94x//QHJysnFUbOnSpcjOzu41jueeew7PP/883nvvPbi4uODVV1+FIAh47bXXsHv3bshkMixfvnzAfxcish6cHUlENEjdszjN1auMiOwDL0cSERERSYAjYUREREQS4EgYERERkQSYhBERERFJgEkYERERkQSYhBERERFJgEkYERERkQT+P+BXrvWeG9vvAAAAAElFTkSuQmCC\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "with plt.style.context('seaborn-white'):\n", + " plt.figure(figsize=(10, 5))\n", + " plt.title('Accuracy of the classifier during the active learning')\n", + " plt.plot(range(n_queries+1), accuracy_scores)\n", + " plt.scatter(range(n_queries+1), accuracy_scores)\n", + " plt.xlabel('number of queries')\n", + " plt.ylabel('accuracy')\n", + " plt.show()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.5" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/docs/source/index.rst b/docs/source/index.rst index 508677d..7a67391 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -43,6 +43,7 @@ modAL is an active learning framework for Python3, designed with *modularity, fl :maxdepth: 1 :caption: Examples + content/examples/interactive_labeling content/examples/pool-based_sampling content/examples/ranked_batch_mode content/examples/stream-based_sampling From b9c833f384c55ea07ebd0ec72bc33786108d1bcb Mon Sep 17 00:00:00 2001 From: cosmic-cortex Date: Sat, 20 Oct 2018 14:24:30 +0200 Subject: [PATCH 009/182] fix: math typo fixed --- .../Acquisition-functions.rst | 2 +- .../query_strategies/Uncertainty-sampling.rst | 103 ------------------ examples/multilabel_svm.py | 0 modAL/multilabel.py | 0 4 files changed, 1 insertion(+), 104 deletions(-) delete mode 100644 docs/source/content/query_strategies/Uncertainty-sampling.rst create mode 100644 examples/multilabel_svm.py create mode 100644 modAL/multilabel.py diff --git a/docs/source/content/query_strategies/Acquisition-functions.rst b/docs/source/content/query_strategies/Acquisition-functions.rst index 
5986222..df28c39 100644 --- a/docs/source/content/query_strategies/Acquisition-functions.rst +++ b/docs/source/content/query_strategies/Acquisition-functions.rst @@ -14,7 +14,7 @@ The probability of improvement is defined by PI(x) = \psi\Big( \frac{\mu(x) - f(x^+) - \xi}{\sigma(x)} \Big), -where :math:`\mu(x)` and :math:`\sigma(x)` are the mean and variance of the regressor at :math:`x`, :math:`f` is the function to be optimized with estimated maximum at :math:`x^+ `, :math:`\xi` is a parameter controlling the degree of exploration and :math:`\psi(z)` denotes the cumulative distribution function of a standard Gaussian distribution. +where :math:`\mu(x)` and :math:`\sigma(x)` are the mean and standard deviation of the regressor at :math:`x`, :math:`f` is the function to be optimized with estimated maximum at :math:`x^+`, :math:`\xi` is a parameter controlling the degree of exploration and :math:`\psi(z)` denotes the cumulative distribution function of a standard Gaussian distribution. .. image:: img/bo-PI.png :align: center diff --git a/docs/source/content/query_strategies/Uncertainty-sampling.rst b/docs/source/content/query_strategies/Uncertainty-sampling.rst deleted file mode 100644 index 835c259..0000000 --- a/docs/source/content/query_strategies/Uncertainty-sampling.rst +++ /dev/null @@ -1,103 +0,0 @@ -.. _Uncertainty-sampling: - -Uncertainty sampling -==================== - -When you present unlabelled examples to an active learner, it finds you the most *useful* example and presents it for you to be labelled. This is done by first calculating the *usefulness* of prediction (whatever it means) for each example and select an instance based on the usefulness. The thing is, there are several ways to measure this. They are based upon the classification uncertainty, hence they are called *uncertainty measures*. In modAL, currently you can select from three built-in measures: *classification uncertainty*, *classification margin* and *classification entropy*. 
In this quick tutorial, we are going to review them. For more details, see Section 2.3 of the awesome book `Active learning by Burr Settles `__. - -Classification uncertainty --------------------------- - -The simplest measure is the uncertainty of classification defined by - -.. math:: - - U(x)=1-P(\hat{x}|x) - -where :math:`x` is the instance to be predicted and :math:`\hat{x}` is the most likely prediction. - -For example, if you have classes ``[0, 1, 2]`` and classification probabilities ``[0.1, 0.2, 0.7]``, the most likely class according to the classifier is ``2`` with uncertainty 0.3. If you have three instances with class probabilities - -.. code:: python - - >>> proba - ... [[0.1 , 0.85, 0.05] - ... [0.6 , 0.3 , 0.1 ] - ... [0.39, 0.61, 0.0 ]] - -the corresponding uncertainties are - -.. code:: python - - >>> uncertainty - ... [0.15, 0.4, 0.39] - -In the above example, the most uncertain sample is the second one. When -querying for labels based on this measure, the strategy selects the -sample with the highest uncertainty. - -For this ternary classification problem, given the first two -probabilities, the classification uncertainty looks like the following. - -.. image:: img/unc-uncertainty.png - :align: center - -Classification margin ---------------------- - -Classification margin is the difference in probability of the first and second most likely prediction, that is, it is defined by - -.. math:: - - M(x)=P(\hat{x_1}|x)-P(\hat{x_2}|x) - -where :math:`\hat{x_1}` and :math:`\hat{x_2}` are the first and second most likely classes. Using the same example we used for classification uncertainty, if the class probabilities are - -.. code:: python - - >>> proba - ... [[0.1 , 0.85, 0.05] - ... [0.6 , 0.3 , 0.1 ] - ... [0.39, 0.61, 0.0 ]] - -the corresponding margins are - -.. code:: python - - >>> margin - ... 
[0.75, 0.3, 0.22] - -When querying for labels, the strategy selects the sample with the *smallest* margin, since the smaller the decision margin is, the more unsure the decision. In this case, it would be the third sample. For this ternary classification problem, the classifier margin plotted against the first two probabilities are the following. - -.. image:: img/unc-margin.png - :align: center - -Classification entropy ----------------------- - -The third built-in uncertainty measure is the classification entropy, which is defined by - -.. math:: - - H(x)=-\sum_{k}p_k\log(p_k) - -where :math:`p_k` is the probability of the sample belonging to the *k*-th class. Heuristically, the entropy is proportional to the average number of guesses one has to make to find the true class. In our usual example - -.. code:: python - - >>> proba - ... [[0.1 , 0.85, 0.05] - ... [0.6 , 0.3 , 0.1 ] - ... [0.39, 0.61, 0.0 ]] - -the corresponding entropies are approximately - -.. code:: python - - >>> entropy - ... [0.5181, 0.8979, 0.6687] - -The closer the distribution to uniform, the larger the entropy. Again, if we plot the entropy against the first two probabilities of a ternary classification problem, we obtain the following. - -.. 
image:: img/unc-entropy.png - :align: center diff --git a/examples/multilabel_svm.py b/examples/multilabel_svm.py new file mode 100644 index 0000000..e69de29 diff --git a/modAL/multilabel.py b/modAL/multilabel.py new file mode 100644 index 0000000..e69de29 From 20000b830cbca023c42db3de9991c3df1639cbaf Mon Sep 17 00:00:00 2001 From: cosmic-cortex Date: Wed, 24 Oct 2018 18:35:46 +0200 Subject: [PATCH 010/182] add: multilabel SVM binary maximum strategy added --- examples/multilabel_svm.py | 35 +++++++++++++++++++++++++++ modAL/__init__.py | 3 +++ modAL/multilabel.py | 25 +++++++++++++++++++ tests/example_tests/multilabel_svm.py | 28 +++++++++++++++++++++ 4 files changed, 91 insertions(+) create mode 100644 tests/example_tests/multilabel_svm.py diff --git a/examples/multilabel_svm.py b/examples/multilabel_svm.py index e69de29..6aa249d 100644 --- a/examples/multilabel_svm.py +++ b/examples/multilabel_svm.py @@ -0,0 +1,35 @@ +import numpy as np +import matplotlib.pyplot as plt + +from modAL.models import ActiveLearner +from modAL.multilabel import SVM_binary_minimum + +from sklearn.multiclass import OneVsRestClassifier +from sklearn.svm import LinearSVC + +n_samples = 500 +X = np.random.normal(size=(n_samples, 2)) +y = np.array([[int(x1 > 0), int(x2 > 0)] for x1, x2 in X]) + +n_initial = 10 +initial_idx = np.random.choice(range(len(X)), size=n_initial, replace=False) +X_initial, y_initial = X[initial_idx], y[initial_idx] +X_pool, y_pool = np.delete(X, initial_idx, axis=0), np.delete(y, initial_idx, axis=0) + +with plt.style.context('seaborn-white'): + plt.figure(figsize=(10, 10)) + plt.scatter(X[:, 0], X[:, 1], c='k', s=20) + plt.scatter(X[y[:, 0] == 1, 0], X[y[:, 0] == 1, 1], + facecolors='none', edgecolors='b', s=50, linewidths=2, label='class 1') + plt.scatter(X[y[:, 1] == 1, 0], X[y[:, 1] == 1, 1], + facecolors='none', edgecolors='r', s=100, linewidths=2, label='class 2') + plt.legend() + plt.show() + +learner = ActiveLearner( + 
estimator=OneVsRestClassifier(LinearSVC()), + query_strategy=SVM_binary_minimum, + X_training=X_initial, y_training=y_initial +) + +learner.query(X_pool) \ No newline at end of file diff --git a/modAL/__init__.py b/modAL/__init__.py index e69de29..b34800b 100644 --- a/modAL/__init__.py +++ b/modAL/__init__.py @@ -0,0 +1,3 @@ +from .models import ActiveLearner, Committee, CommitteeRegressor + +__all__ = ['ActiveLearner', 'Committee', 'CommitteeRegressor'] \ No newline at end of file diff --git a/modAL/multilabel.py b/modAL/multilabel.py index e69de29..e08341f 100644 --- a/modAL/multilabel.py +++ b/modAL/multilabel.py @@ -0,0 +1,25 @@ +import numpy as np + +from sklearn.base import BaseEstimator + +from modAL.utils.data import modALinput +from typing import Tuple + + +def SVM_binary_minimum(classifier: BaseEstimator, X_pool: modALinput) -> Tuple[np.ndarray, modALinput]: + """ + SVM binary minimum multilabel active learning strategy. For details see the paper + Klaus Brinker, On Active Learning in Multi-label Classification + (https://link.springer.com/chapter/10.1007%2F3-540-31314-1_24) + + Args: + classifier: The multilabel classifier for which the labels are to be queried. Must be an SVM model + such as the ones from sklearn.svm. + X_pool: The pool of samples to query from. + + Returns: + The index of the instance from X_pool chosen to be labelled; the instance from X_pool chosen to be labelled. 
+ """ + min_abs_dist = np.min(np.abs(classifier.estimator.decision_function(X_pool)), axis=1) + query_idx = np.argmin(min_abs_dist) + return query_idx, X_pool[query_idx] \ No newline at end of file diff --git a/tests/example_tests/multilabel_svm.py b/tests/example_tests/multilabel_svm.py new file mode 100644 index 0000000..ea91dae --- /dev/null +++ b/tests/example_tests/multilabel_svm.py @@ -0,0 +1,28 @@ +import numpy as np + +from modAL.models import ActiveLearner +from modAL.multilabel import SVM_binary_minimum + +from sklearn.multiclass import OneVsRestClassifier +from sklearn.svm import LinearSVC + +n_samples = 500 +X = np.random.normal(size=(n_samples, 2)) +y = np.array([[int(x1 > 0), int(x2 > 0)] for x1, x2 in X]) + +n_initial = 10 +initial_idx = np.random.choice(range(len(X)), size=n_initial, replace=False) +X_initial, y_initial = X[initial_idx], y[initial_idx] +X_pool, y_pool = np.delete(X, initial_idx, axis=0), np.delete(y, initial_idx, axis=0) + +learner = ActiveLearner( + estimator=OneVsRestClassifier(LinearSVC()), + query_strategy=SVM_binary_minimum, + X_training=X_initial, y_training=y_initial +) + +n_queries = 10 +for idx in range(n_queries): + query_idx, query_inst = learner.query(X_pool) + learner.teach(X_pool[query_idx].reshape(1, -1), y_pool[query_idx].reshape(1, -1)) + X_pool, y_pool = np.delete(X_pool, query_idx, axis=0), np.delete(y_pool, query_idx, axis=0) \ No newline at end of file From 37c68bb4555d49e868b3f0a182878005e393fd70 Mon Sep 17 00:00:00 2001 From: cosmic-cortex Date: Thu, 25 Oct 2018 11:05:56 +0200 Subject: [PATCH 011/182] add: _SVM_loss utility function for MeanMaxLoss and MaxLoss multilabel strategies added --- examples/multilabel_svm.py | 6 ++--- modAL/multilabel.py | 55 +++++++++++++++++++++++++++++++++++--- tests/core_tests.py | 19 +++++++++++++ 3 files changed, 73 insertions(+), 7 deletions(-) diff --git a/examples/multilabel_svm.py b/examples/multilabel_svm.py index 6aa249d..b74d686 100644 --- a/examples/multilabel_svm.py 
+++ b/examples/multilabel_svm.py @@ -5,7 +5,7 @@ from modAL.multilabel import SVM_binary_minimum from sklearn.multiclass import OneVsRestClassifier -from sklearn.svm import LinearSVC +from sklearn.svm import SVC n_samples = 500 X = np.random.normal(size=(n_samples, 2)) @@ -24,10 +24,10 @@ plt.scatter(X[y[:, 1] == 1, 0], X[y[:, 1] == 1, 1], facecolors='none', edgecolors='r', s=100, linewidths=2, label='class 2') plt.legend() - plt.show() + #plt.show() learner = ActiveLearner( - estimator=OneVsRestClassifier(LinearSVC()), + estimator=OneVsRestClassifier(SVC(probability=True)), query_strategy=SVM_binary_minimum, X_training=X_initial, y_training=y_initial ) diff --git a/modAL/multilabel.py b/modAL/multilabel.py index e08341f..c421f6c 100644 --- a/modAL/multilabel.py +++ b/modAL/multilabel.py @@ -1,12 +1,47 @@ import numpy as np from sklearn.base import BaseEstimator +from sklearn.multiclass import OneVsRestClassifier from modAL.utils.data import modALinput -from typing import Tuple +from typing import Tuple, Optional -def SVM_binary_minimum(classifier: BaseEstimator, X_pool: modALinput) -> Tuple[np.ndarray, modALinput]: +def _SVM_loss(multiclass_classifier: OneVsRestClassifier, + X: modALinput, + most_certain_classes: Optional[int] = None) -> np.ndarray: + """ + Utility function for max_loss and mean_max_loss strategies. + + Args: + multiclass_classifier: sklearn.multiclass.OneVsRestClassifier instance for which the loss + is to be calculated. + X: The pool of samples to query from. + most_certain_classes: optional, indexes of most certainly predicted class for each instance. + If None, loss is calculated for all classes. + + Returns: + np.ndarray of shape (n_instances, ), losses for the instances in X. 
+ + """ + predictions = 2*multiclass_classifier.predict(X)-1 + n_classes = len(multiclass_classifier.classes_) + + if most_certain_classes is None: + cls_mtx = 2*np.eye(n_classes, n_classes) - 1 + loss_mtx = np.maximum(1-np.dot(predictions, cls_mtx), 0) + return loss_mtx.mean(axis=0) + else: + cls_mtx = -np.ones(shape=(len(X), n_classes)) + for inst_idx, most_certain_class in enumerate(most_certain_classes): + cls_mtx[inst_idx, most_certain_class] = 1 + + cls_loss = np.maximum(1 - np.multiply(cls_mtx, predictions), 0).sum(axis=1) + return cls_loss + + +def SVM_binary_minimum(classifier: BaseEstimator, + X_pool: modALinput) -> Tuple[np.ndarray, modALinput]: """ SVM binary minimum multilabel active learning strategy. For details see the paper Klaus Brinker, On Active Learning in Multi-label Classification @@ -14,7 +49,7 @@ def SVM_binary_minimum(classifier: BaseEstimator, X_pool: modALinput) -> Tuple[n Args: classifier: The multilabel classifier for which the labels are to be queried. Must be an SVM model - such as the ones from sklearn.svm. + such as the ones from sklearn.svm. X: The pool of samples to query from. 
Returns: @@ -22,4 +57,16 @@ def SVM_binary_minimum(classifier: BaseEstimator, X_pool: modALinput) -> Tuple[n """ min_abs_dist = np.min(np.abs(classifier.estimator.decision_function(X_pool)), axis=1) query_idx = np.argmin(min_abs_dist) - return query_idx, X_pool[query_idx] \ No newline at end of file + return query_idx, X_pool[query_idx] + + +def max_loss(classifier: BaseEstimator, + X_pool: modALinput, + n_instances: int = 1) -> Tuple[np.ndarray, modALinput]: + pass + + +def mean_max_loss(classifier: BaseEstimator, + X_pool: modALinput, + n_instances: int = 1) -> Tuple[np.ndarray, modALinput]: + pass \ No newline at end of file diff --git a/tests/core_tests.py b/tests/core_tests.py index d74b59f..b7b3dbb 100644 --- a/tests/core_tests.py +++ b/tests/core_tests.py @@ -11,13 +11,17 @@ import modAL.utils.selection import modAL.utils.validation import modAL.utils.combination +import modAL.multilabel from copy import deepcopy from itertools import chain, product from collections import namedtuple + from sklearn.ensemble import RandomForestClassifier from sklearn.exceptions import NotFittedError from sklearn.metrics import confusion_matrix +from sklearn.svm import SVC +from sklearn.multiclass import OneVsRestClassifier from scipy.stats import entropy, norm from scipy.special import ndtr from scipy import sparse as sp @@ -922,6 +926,21 @@ def test_vote(self): ) +class TestMultilabel(unittest.TestCase): + def test_SVM_loss(self): + for n_classes in range(3, 10): + for n_instances in range(5, 10): + X_training = np.random.rand(n_instances, 5) + y_training = np.random.randint(0, 2, size=(n_instances, n_classes)) + X_pool = np.random.rand(n_instances, 5) + y_pool = np.random.randint(0, 2, size=(n_instances, n_classes)) + classifier = OneVsRestClassifier(SVC()) + classifier.fit(X_training, y_training) + loss = modAL.multilabel._SVM_loss(classifier, X_pool) + loss = modAL.multilabel._SVM_loss(classifier, X_pool, + most_certain_classes=np.random.randint(0, n_classes, 
size=(n_instances))) + + class TestExamples(unittest.TestCase): def test_examples(self): From 21b29f11e4f9e0eee5b49c8ee707fd05a30a4728 Mon Sep 17 00:00:00 2001 From: cosmic-cortex Date: Thu, 25 Oct 2018 12:17:15 +0200 Subject: [PATCH 012/182] add: __getattr__ for BaseLearner implemented --- modAL/models/base.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/modAL/models/base.py b/modAL/models/base.py index c4703a1..79111a8 100644 --- a/modAL/models/base.py +++ b/modAL/models/base.py @@ -59,6 +59,12 @@ def __init__(self, if X_training is not None: self._fit_to_known(bootstrap=bootstrap_init, **fit_kwargs) + def __getattr__(self, item): + try: + return getattr(self, item) + except AttributeError: + return getattr(self.estimator, item) + def _add_training_data(self, X: modALinput, y: modALinput) -> None: """ Adds the new data and label to the known data, but does not retrain the model. From f2cf52efd79f617f06833dc3769a8c122c3416fd Mon Sep 17 00:00:00 2001 From: cosmic-cortex Date: Thu, 25 Oct 2018 15:14:39 +0200 Subject: [PATCH 013/182] add: Mean Max Loss and Max Loss multilabel SVM query strategies from Li et al. 
added --- examples/multilabel_svm.py | 2 +- modAL/multilabel.py | 77 ++++++++++++++++++++++++++++++++++++-- tests/core_tests.py | 36 +++++++++++++++--- 3 files changed, 106 insertions(+), 9 deletions(-) diff --git a/examples/multilabel_svm.py b/examples/multilabel_svm.py index b74d686..8f112f6 100644 --- a/examples/multilabel_svm.py +++ b/examples/multilabel_svm.py @@ -24,7 +24,7 @@ plt.scatter(X[y[:, 1] == 1, 0], X[y[:, 1] == 1, 1], facecolors='none', edgecolors='r', s=100, linewidths=2, label='class 2') plt.legend() - #plt.show() + plt.show() learner = ActiveLearner( estimator=OneVsRestClassifier(SVC(probability=True)), diff --git a/modAL/multilabel.py b/modAL/multilabel.py index c421f6c..7a82d61 100644 --- a/modAL/multilabel.py +++ b/modAL/multilabel.py @@ -5,6 +5,7 @@ from modAL.utils.data import modALinput from typing import Tuple, Optional +from itertools import combinations def _SVM_loss(multiclass_classifier: OneVsRestClassifier, @@ -30,7 +31,7 @@ def _SVM_loss(multiclass_classifier: OneVsRestClassifier, if most_certain_classes is None: cls_mtx = 2*np.eye(n_classes, n_classes) - 1 loss_mtx = np.maximum(1-np.dot(predictions, cls_mtx), 0) - return loss_mtx.mean(axis=0) + return loss_mtx.mean(axis=1) else: cls_mtx = -np.ones(shape=(len(X), n_classes)) for inst_idx, most_certain_class in enumerate(most_certain_classes): @@ -63,10 +64,80 @@ def SVM_binary_minimum(classifier: BaseEstimator, def max_loss(classifier: BaseEstimator, X_pool: modALinput, n_instances: int = 1) -> Tuple[np.ndarray, modALinput]: - pass + + """ + Max Loss query strategy for SVM multilabel classification. + + For more details on this query strategy, see + Li et al., Multilabel SVM active learning for image classification + (http://dx.doi.org/10.1109/ICIP.2004.1421535) + + Args: + classifier: The multilabel classifier for which the labels are to be queried. Should be an SVM model + such as the ones from sklearn.svm. 
Although the function will execute for other models as well, + the mathematical calculations in Li et al. work only for SVM-s. + X: The pool of samples to query from. + + Returns: + The index of the instance from X chosen to be labelled; the instance from X chosen to be labelled. + """ + + most_certain_classes = classifier.predict_proba(X_pool).argmax(axis=1) + loss = _SVM_loss(classifier, X_pool, most_certain_classes=most_certain_classes) + + assert len(X_pool) >= n_instances, 'n_instances cannot be larger than len(X_pool)' + + if n_instances == 1: + query_idx = np.argmax(loss) + return query_idx, X_pool[query_idx] + else: + max_val = -np.inf + max_idx = None + for subset_idx in combinations(range(len(X_pool)), n_instances): + subset_sum = loss[list(subset_idx)].sum() + if subset_sum > max_val: + max_val = subset_sum + max_idx = subset_idx + + query_idx = np.array(max_idx) + return query_idx, X_pool[query_idx] def mean_max_loss(classifier: BaseEstimator, X_pool: modALinput, n_instances: int = 1) -> Tuple[np.ndarray, modALinput]: - pass \ No newline at end of file + """ + Mean Max Loss query strategy for SVM multilabel classification. + + For more details on this query strategy, see + Li et al., Multilabel SVM active learning for image classification + (http://dx.doi.org/10.1109/ICIP.2004.1421535) + + Args: + classifier: The multilabel classifier for which the labels are to be queried. Should be an SVM model + such as the ones from sklearn.svm. Although the function will execute for other models as well, + the mathematical calculations in Li et al. work only for SVM-s. + X: The pool of samples to query from. + + Returns: + The index of the instance from X chosen to be labelled; the instance from X chosen to be labelled. 
+ """ + + loss = _SVM_loss(classifier, X_pool) + + assert len(X_pool) >= n_instances, 'n_instances cannot be larger than len(X_pool)' + + if n_instances == 1: + query_idx = np.argmax(loss) + return query_idx, X_pool[query_idx] + else: + max_val = -np.inf + max_idx = None + for subset_idx in combinations(range(len(X_pool)), n_instances): + subset_sum = loss[list(subset_idx)].sum() + if subset_sum > max_val: + max_val = subset_sum + max_idx = subset_idx + + query_idx = np.array(max_idx) + return query_idx, X_pool[query_idx] \ No newline at end of file diff --git a/tests/core_tests.py b/tests/core_tests.py index b7b3dbb..009e0ef 100644 --- a/tests/core_tests.py +++ b/tests/core_tests.py @@ -928,17 +928,43 @@ def test_vote(self): class TestMultilabel(unittest.TestCase): def test_SVM_loss(self): - for n_classes in range(3, 10): - for n_instances in range(5, 10): + for n_classes in range(2, 10): + for n_instances in range(1, 10): X_training = np.random.rand(n_instances, 5) y_training = np.random.randint(0, 2, size=(n_instances, n_classes)) X_pool = np.random.rand(n_instances, 5) y_pool = np.random.randint(0, 2, size=(n_instances, n_classes)) classifier = OneVsRestClassifier(SVC()) classifier.fit(X_training, y_training) - loss = modAL.multilabel._SVM_loss(classifier, X_pool) - loss = modAL.multilabel._SVM_loss(classifier, X_pool, - most_certain_classes=np.random.randint(0, n_classes, size=(n_instances))) + avg_loss = modAL.multilabel._SVM_loss(classifier, X_pool) + mcc_loss = modAL.multilabel._SVM_loss(classifier, X_pool, + most_certain_classes=np.random.randint(0, n_classes, size=(n_instances))) + self.assertEqual(avg_loss.shape, (len(X_pool), )) + self.assertEqual(mcc_loss.shape, (len(X_pool),)) + + def test_mean_max_loss(self): + for n_classes in range(2, 10): + for n_pool_instances in range(1, 10): + for n_query_instances in range(1, min(n_pool_instances, 3)): + X_training = np.random.rand(n_pool_instances, 5) + y_training = np.random.randint(0, 2, 
size=(n_pool_instances, n_classes)) + X_pool = np.random.rand(n_pool_instances, 5) + y_pool = np.random.randint(0, 2, size=(n_pool_instances, n_classes)) + classifier = OneVsRestClassifier(SVC()) + classifier.fit(X_training, y_training) + query_idx, query_inst = modAL.multilabel.mean_max_loss(classifier, X_pool, n_query_instances) + + def test_max_loss(self): + for n_classes in range(2, 10): + for n_pool_instances in range(1, 10): + for n_query_instances in range(1, min(n_pool_instances, 3)): + X_training = np.random.rand(n_pool_instances, 5) + y_training = np.random.randint(0, 2, size=(n_pool_instances, n_classes)) + X_pool = np.random.rand(n_pool_instances, 5) + y_pool = np.random.randint(0, 2, size=(n_pool_instances, n_classes)) + classifier = OneVsRestClassifier(SVC(probability=True)) + classifier.fit(X_training, y_training) + query_idx, query_inst = modAL.multilabel.max_loss(classifier, X_pool, n_query_instances) class TestExamples(unittest.TestCase): From c6f8d7b1b1126296209391f9c3f9d0c439d267d6 Mon Sep 17 00:00:00 2001 From: cosmic-cortex Date: Mon, 29 Oct 2018 08:39:51 +0100 Subject: [PATCH 014/182] refactor: max_loss and mean_max_loss multilabel strategies refactored --- modAL/multilabel.py | 40 ++++++++-------------------------------- 1 file changed, 8 insertions(+), 32 deletions(-) diff --git a/modAL/multilabel.py b/modAL/multilabel.py index 7a82d61..c1ed990 100644 --- a/modAL/multilabel.py +++ b/modAL/multilabel.py @@ -4,6 +4,7 @@ from sklearn.multiclass import OneVsRestClassifier from modAL.utils.data import modALinput +from modAL.utils.selection import multi_argmax from typing import Tuple, Optional from itertools import combinations @@ -82,25 +83,13 @@ def max_loss(classifier: BaseEstimator, The index of the instance from X chosen to be labelled; the instance from X chosen to be labelled. 
""" - most_certain_classes = classifier.predict_proba(X_pool).argmax(axis=1) - loss = _SVM_loss(classifier, X_pool, most_certain_classes=most_certain_classes) - assert len(X_pool) >= n_instances, 'n_instances cannot be larger than len(X_pool)' - if n_instances == 1: - query_idx = np.argmax(loss) - return query_idx, X_pool[query_idx] - else: - max_val = -np.inf - max_idx = None - for subset_idx in combinations(range(len(X_pool)), n_instances): - subset_sum = loss[list(subset_idx)].sum() - if subset_sum > max_val: - max_val = subset_sum - max_idx = subset_idx + most_certain_classes = classifier.predict_proba(X_pool).argmax(axis=1) + loss = _SVM_loss(classifier, X_pool, most_certain_classes=most_certain_classes) - query_idx = np.array(max_idx) - return query_idx, X_pool[query_idx] + query_idx = multi_argmax(loss, n_instances) + return query_idx, X_pool[query_idx] def mean_max_loss(classifier: BaseEstimator, @@ -123,21 +112,8 @@ def mean_max_loss(classifier: BaseEstimator, The index of the instance from X chosen to be labelled; the instance from X chosen to be labelled. 
""" - loss = _SVM_loss(classifier, X_pool) - assert len(X_pool) >= n_instances, 'n_instances cannot be larger than len(X_pool)' + loss = _SVM_loss(classifier, X_pool) - if n_instances == 1: - query_idx = np.argmax(loss) - return query_idx, X_pool[query_idx] - else: - max_val = -np.inf - max_idx = None - for subset_idx in combinations(range(len(X_pool)), n_instances): - subset_sum = loss[list(subset_idx)].sum() - if subset_sum > max_val: - max_val = subset_sum - max_idx = subset_idx - - query_idx = np.array(max_idx) - return query_idx, X_pool[query_idx] \ No newline at end of file + query_idx = multi_argmax(loss, n_instances) + return query_idx, X_pool[query_idx] From c2cf0ffc556bcb54af4d0ed577fa8936fee02a2f Mon Sep 17 00:00:00 2001 From: cosmic-cortex Date: Mon, 5 Nov 2018 14:51:36 +0100 Subject: [PATCH 015/182] add: multilabel strategies from Esuli-Sebastiani paper added --- examples/multilabel_svm.py | 4 ++-- modAL/multilabel.py | 44 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 46 insertions(+), 2 deletions(-) diff --git a/examples/multilabel_svm.py b/examples/multilabel_svm.py index 8f112f6..3da9155 100644 --- a/examples/multilabel_svm.py +++ b/examples/multilabel_svm.py @@ -2,7 +2,7 @@ import matplotlib.pyplot as plt from modAL.models import ActiveLearner -from modAL.multilabel import SVM_binary_minimum +from modAL.multilabel import * from sklearn.multiclass import OneVsRestClassifier from sklearn.svm import SVC @@ -28,7 +28,7 @@ learner = ActiveLearner( estimator=OneVsRestClassifier(SVC(probability=True)), - query_strategy=SVM_binary_minimum, + query_strategy=mean_score, X_training=X_initial, y_training=y_initial ) diff --git a/modAL/multilabel.py b/modAL/multilabel.py index c1ed990..f369ece 100644 --- a/modAL/multilabel.py +++ b/modAL/multilabel.py @@ -117,3 +117,47 @@ def mean_max_loss(classifier: BaseEstimator, query_idx = multi_argmax(loss, n_instances) return query_idx, X_pool[query_idx] + + +def max_uncertainty(classifier: BaseEstimator, + 
X_pool: modALinput, + n_instances: int = 1) -> Tuple[np.ndarray, modALinput]: + classwise_uncertainty = classifier.predict_proba(X_pool) + classwise_max = np.max(classwise_uncertainty, axis=1) + query_idx = multi_argmax(classwise_max, n_instances) + + return query_idx, X_pool[query_idx] + + +def mean_uncertainty(classifier: BaseEstimator, + X_pool: modALinput, + n_instances: int = 1) -> Tuple[np.ndarray, modALinput]: + classwise_uncertainty = classifier.predict_proba(X_pool) + classwise_mean = np.mean(classwise_uncertainty, axis=1) + query_idx = multi_argmax(classwise_mean, n_instances) + + return query_idx, X_pool[query_idx] + + +def max_score(classifier: BaseEstimator, + X_pool: modALinput, + n_instances: int = 1) -> Tuple[np.ndarray, modALinput]: + classwise_uncertainty = classifier.predict_proba(X_pool) + classwise_predictions = classifier.predict(X_pool) + classwise_scores = classwise_uncertainty*(classwise_predictions - 1/2) + classwise_max = np.max(classwise_scores, axis=1) + query_idx = multi_argmax(classwise_max, n_instances) + + return query_idx, X_pool[query_idx] + + +def mean_score(classifier: BaseEstimator, + X_pool: modALinput, + n_instances: int = 1) -> Tuple[np.ndarray, modALinput]: + classwise_uncertainty = classifier.predict_proba(X_pool) + classwise_predictions = classifier.predict(X_pool) + classwise_scores = classwise_uncertainty*(classwise_predictions-1/2) + classwise_mean = np.mean(classwise_scores, axis=1) + query_idx = multi_argmax(classwise_mean, n_instances) + + return query_idx, X_pool[query_idx] From fe6bba054a3e9a58e30ebeadfd3699b9233d0c5a Mon Sep 17 00:00:00 2001 From: cosmic-cortex Date: Mon, 5 Nov 2018 14:51:57 +0100 Subject: [PATCH 016/182] fix: BaseLearner.__getattr__ removed --- modAL/models/base.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/modAL/models/base.py b/modAL/models/base.py index 79111a8..c4703a1 100644 --- a/modAL/models/base.py +++ b/modAL/models/base.py @@ -59,12 +59,6 @@ def __init__(self, if X_training 
is not None: self._fit_to_known(bootstrap=bootstrap_init, **fit_kwargs) - def __getattr__(self, item): - try: - return getattr(self, item) - except AttributeError: - return getattr(self.estimator, item) - def _add_training_data(self, X: modALinput, y: modALinput) -> None: """ Adds the new data and label to the known data, but does not retrain the model. From af6bb328395ee20ff63fb039d239349124bd160b Mon Sep 17 00:00:00 2001 From: cosmic-cortex Date: Mon, 5 Nov 2018 16:52:16 +0100 Subject: [PATCH 017/182] add: docstrings for multilabel strategies added --- examples/multilabel_svm.py | 2 +- modAL/multilabel.py | 96 +++++++++++++++++++++++++++++++------- 2 files changed, 79 insertions(+), 19 deletions(-) diff --git a/examples/multilabel_svm.py b/examples/multilabel_svm.py index 3da9155..551c23c 100644 --- a/examples/multilabel_svm.py +++ b/examples/multilabel_svm.py @@ -28,7 +28,7 @@ learner = ActiveLearner( estimator=OneVsRestClassifier(SVC(probability=True)), - query_strategy=mean_score, + query_strategy=avg_score, X_training=X_initial, y_training=y_initial ) diff --git a/modAL/multilabel.py b/modAL/multilabel.py index f369ece..725742e 100644 --- a/modAL/multilabel.py +++ b/modAL/multilabel.py @@ -119,21 +119,51 @@ def mean_max_loss(classifier: BaseEstimator, return query_idx, X_pool[query_idx] -def max_uncertainty(classifier: BaseEstimator, - X_pool: modALinput, - n_instances: int = 1) -> Tuple[np.ndarray, modALinput]: - classwise_uncertainty = classifier.predict_proba(X_pool) - classwise_max = np.max(classwise_uncertainty, axis=1) - query_idx = multi_argmax(classwise_max, n_instances) +def min_confidence(classifier: BaseEstimator, + X_pool: modALinput, + n_instances: int = 1) -> Tuple[np.ndarray, modALinput]: + """ + MinConfidence query strategy for multilabel classification. 
+ + For more details on this query strategy, see + Esuli and Sebastiani., Active Learning Strategies for Multi-Label Text Classification + (http://dx.doi.org/10.1007/978-3-642-00958-7_12) + + Args: + classifier: The multilabel classifier for which the labels are to be queried. + X: The pool of samples to query from. + + Returns: + The index of the instance from X chosen to be labelled; the instance from X chosen to be labelled. + """ + + classwise_confidence = classifier.predict_proba(X_pool) + classwise_min = np.min(classwise_confidence, axis=1) + query_idx = multi_argmax((-1)*classwise_min, n_instances) return query_idx, X_pool[query_idx] -def mean_uncertainty(classifier: BaseEstimator, - X_pool: modALinput, - n_instances: int = 1) -> Tuple[np.ndarray, modALinput]: - classwise_uncertainty = classifier.predict_proba(X_pool) - classwise_mean = np.mean(classwise_uncertainty, axis=1) +def avg_confidence(classifier: BaseEstimator, + X_pool: modALinput, + n_instances: int = 1) -> Tuple[np.ndarray, modALinput]: + """ + AvgConfidence query strategy for multilabel classification. + + For more details on this query strategy, see + Esuli and Sebastiani., Active Learning Strategies for Multi-Label Text Classification + (http://dx.doi.org/10.1007/978-3-642-00958-7_12) + + Args: + classifier: The multilabel classifier for which the labels are to be queried. + X: The pool of samples to query from. + + Returns: + The index of the instance from X chosen to be labelled; the instance from X chosen to be labelled. 
+ """ + + classwise_confidence = classifier.predict_proba(X_pool) + classwise_mean = np.mean(classwise_confidence, axis=1) query_idx = multi_argmax(classwise_mean, n_instances) return query_idx, X_pool[query_idx] @@ -142,21 +172,51 @@ def mean_uncertainty(classifier: BaseEstimator, def max_score(classifier: BaseEstimator, X_pool: modALinput, n_instances: int = 1) -> Tuple[np.ndarray, modALinput]: - classwise_uncertainty = classifier.predict_proba(X_pool) + """ + MaxScore query strategy for multilabel classification. + + For more details on this query strategy, see + Esuli and Sebastiani., Active Learning Strategies for Multi-Label Text Classification + (http://dx.doi.org/10.1007/978-3-642-00958-7_12) + + Args: + classifier: The multilabel classifier for which the labels are to be queried. + X: The pool of samples to query from. + + Returns: + The index of the instance from X chosen to be labelled; the instance from X chosen to be labelled. + """ + + classwise_confidence = classifier.predict_proba(X_pool) classwise_predictions = classifier.predict(X_pool) - classwise_scores = classwise_uncertainty*(classwise_predictions - 1/2) + classwise_scores = classwise_confidence*(classwise_predictions - 1/2) classwise_max = np.max(classwise_scores, axis=1) query_idx = multi_argmax(classwise_max, n_instances) return query_idx, X_pool[query_idx] -def mean_score(classifier: BaseEstimator, - X_pool: modALinput, - n_instances: int = 1) -> Tuple[np.ndarray, modALinput]: - classwise_uncertainty = classifier.predict_proba(X_pool) +def avg_score(classifier: BaseEstimator, + X_pool: modALinput, + n_instances: int = 1) -> Tuple[np.ndarray, modALinput]: + """ + AvgScore query strategy for multilabel classification. + + For more details on this query strategy, see + Esuli and Sebastiani., Active Learning Strategies for Multi-Label Text Classification + (http://dx.doi.org/10.1007/978-3-642-00958-7_12) + + Args: + classifier: The multilabel classifier for which the labels are to be queried. 
+ X: The pool of samples to query from. + + Returns: + The index of the instance from X chosen to be labelled; the instance from X chosen to be labelled. + """ + + classwise_confidence = classifier.predict_proba(X_pool) classwise_predictions = classifier.predict(X_pool) - classwise_scores = classwise_uncertainty*(classwise_predictions-1/2) + classwise_scores = classwise_confidence*(classwise_predictions-1/2) classwise_mean = np.mean(classwise_scores, axis=1) query_idx = multi_argmax(classwise_mean, n_instances) From 7cb213b20e75b0f2b8b756690c505f972322f4bf Mon Sep 17 00:00:00 2001 From: cosmic-cortex Date: Mon, 5 Nov 2018 16:54:38 +0100 Subject: [PATCH 018/182] add: tests for multilabel strategies added --- tests/core_tests.py | 22 +++++++--------------- 1 file changed, 7 insertions(+), 15 deletions(-) diff --git a/tests/core_tests.py b/tests/core_tests.py index 009e0ef..a34ef92 100644 --- a/tests/core_tests.py +++ b/tests/core_tests.py @@ -942,29 +942,21 @@ def test_SVM_loss(self): self.assertEqual(avg_loss.shape, (len(X_pool), )) self.assertEqual(mcc_loss.shape, (len(X_pool),)) - def test_mean_max_loss(self): + def test_strategies(self): for n_classes in range(2, 10): for n_pool_instances in range(1, 10): for n_query_instances in range(1, min(n_pool_instances, 3)): X_training = np.random.rand(n_pool_instances, 5) y_training = np.random.randint(0, 2, size=(n_pool_instances, n_classes)) X_pool = np.random.rand(n_pool_instances, 5) - y_pool = np.random.randint(0, 2, size=(n_pool_instances, n_classes)) - classifier = OneVsRestClassifier(SVC()) - classifier.fit(X_training, y_training) - query_idx, query_inst = modAL.multilabel.mean_max_loss(classifier, X_pool, n_query_instances) - - def test_max_loss(self): - for n_classes in range(2, 10): - for n_pool_instances in range(1, 10): - for n_query_instances in range(1, min(n_pool_instances, 3)): - X_training = np.random.rand(n_pool_instances, 5) - y_training = np.random.randint(0, 2, size=(n_pool_instances, n_classes)) - 
X_pool = np.random.rand(n_pool_instances, 5) - y_pool = np.random.randint(0, 2, size=(n_pool_instances, n_classes)) classifier = OneVsRestClassifier(SVC(probability=True)) classifier.fit(X_training, y_training) - query_idx, query_inst = modAL.multilabel.max_loss(classifier, X_pool, n_query_instances) + modAL.multilabel.mean_max_loss(classifier, X_pool, n_query_instances) + modAL.multilabel.max_loss(classifier, X_pool, n_query_instances) + modAL.multilabel.min_confidence(classifier, X_pool, n_query_instances) + modAL.multilabel.avg_confidence(classifier, X_pool, n_query_instances) + modAL.multilabel.max_score(classifier, X_pool, n_query_instances) + modAL.multilabel.avg_score(classifier, X_pool, n_query_instances) class TestExamples(unittest.TestCase): From d42fc29a46535a1a2266b3e155a5a2696661c55a Mon Sep 17 00:00:00 2001 From: cosmic-cortex Date: Mon, 5 Nov 2018 16:58:27 +0100 Subject: [PATCH 019/182] add: multilabel API reference added to docs --- docs/source/content/apireference/multilabel.rst | 5 +++++ docs/source/index.rst | 1 + 2 files changed, 6 insertions(+) create mode 100644 docs/source/content/apireference/multilabel.rst diff --git a/docs/source/content/apireference/multilabel.rst b/docs/source/content/apireference/multilabel.rst new file mode 100644 index 0000000..b469d62 --- /dev/null +++ b/docs/source/content/apireference/multilabel.rst @@ -0,0 +1,5 @@ +modAL.multilabel +============= + +.. 
automodule:: modAL.multilabel + :members: diff --git a/docs/source/index.rst b/docs/source/index.rst index 7a67391..2a0697e 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -62,6 +62,7 @@ modAL is an active learning framework for Python3, designed with *modularity, fl content/apireference/models.rst content/apireference/uncertainty.rst content/apireference/disagreement.rst + content/apireference/multilabel.rst content/apireference/acquisition.rst content/apireference/batch.rst content/apireference/density.rst From 4c19e275ef5ae0136ac9cdb560e8a63cef20e4e5 Mon Sep 17 00:00:00 2001 From: cosmic-cortex Date: Mon, 26 Nov 2018 12:42:55 +0100 Subject: [PATCH 020/182] fix: some parameters are set explicitly to silence warnings --- examples/multilabel_svm.py | 5 +++-- tests/core_tests.py | 6 +++--- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/examples/multilabel_svm.py b/examples/multilabel_svm.py index 551c23c..ab4eb3c 100644 --- a/examples/multilabel_svm.py +++ b/examples/multilabel_svm.py @@ -27,9 +27,10 @@ plt.show() learner = ActiveLearner( - estimator=OneVsRestClassifier(SVC(probability=True)), + estimator=OneVsRestClassifier(SVC(probability=True, gamma='auto')), query_strategy=avg_score, X_training=X_initial, y_training=y_initial ) -learner.query(X_pool) \ No newline at end of file +query_idx, query_inst = learner.query(X_pool) +learner.teach(X_pool[query_idx], y_pool[query_idx]) \ No newline at end of file diff --git a/tests/core_tests.py b/tests/core_tests.py index a34ef92..31d87d5 100644 --- a/tests/core_tests.py +++ b/tests/core_tests.py @@ -642,7 +642,7 @@ def test_keras(self): def test_sklearn(self): learner = modAL.models.learners.ActiveLearner( - estimator=RandomForestClassifier(), + estimator=RandomForestClassifier(n_estimators=10), X_training=np.random.rand(10, 10), y_training=np.random.randint(0, 2, size=(10,)) ) @@ -667,7 +667,7 @@ def test_sparse_matrices(self): initial_idx = np.random.choice(range(n_samples), size=5, 
replace=False) learner = modAL.models.learners.ActiveLearner( - estimator=RandomForestClassifier(), query_strategy=query_strategy, + estimator=RandomForestClassifier(n_estimators=10), query_strategy=query_strategy, X_training=X_pool[initial_idx], y_training=y_pool[initial_idx] ) query_idx, query_inst = learner.query(X_pool) @@ -949,7 +949,7 @@ def test_strategies(self): X_training = np.random.rand(n_pool_instances, 5) y_training = np.random.randint(0, 2, size=(n_pool_instances, n_classes)) X_pool = np.random.rand(n_pool_instances, 5) - classifier = OneVsRestClassifier(SVC(probability=True)) + classifier = OneVsRestClassifier(SVC(probability=True, gamma='auto')) classifier.fit(X_training, y_training) modAL.multilabel.mean_max_loss(classifier, X_pool, n_query_instances) modAL.multilabel.max_loss(classifier, X_pool, n_query_instances) From 41f28f4ec5f371531ba6513406ff09fcaa36c80e Mon Sep 17 00:00:00 2001 From: cosmic-cortex Date: Mon, 26 Nov 2018 15:21:17 +0100 Subject: [PATCH 021/182] fix: SVM_binary_minimum fixed and tested --- modAL/multilabel.py | 23 ++++++++++++++--------- tests/core_tests.py | 7 ++++++- 2 files changed, 20 insertions(+), 10 deletions(-) diff --git a/modAL/multilabel.py b/modAL/multilabel.py index 725742e..db907aa 100644 --- a/modAL/multilabel.py +++ b/modAL/multilabel.py @@ -3,13 +3,14 @@ from sklearn.base import BaseEstimator from sklearn.multiclass import OneVsRestClassifier +from modAL.models import ActiveLearner from modAL.utils.data import modALinput from modAL.utils.selection import multi_argmax from typing import Tuple, Optional from itertools import combinations -def _SVM_loss(multiclass_classifier: OneVsRestClassifier, +def _SVM_loss(multiclass_classifier: ActiveLearner, X: modALinput, most_certain_classes: Optional[int] = None) -> np.ndarray: """ @@ -42,7 +43,7 @@ def _SVM_loss(multiclass_classifier: OneVsRestClassifier, return cls_loss -def SVM_binary_minimum(classifier: BaseEstimator, +def SVM_binary_minimum(classifier: 
ActiveLearner, X_pool: modALinput) -> Tuple[np.ndarray, modALinput]: """ SVM binary minimum multilabel active learning strategy. For details see the paper @@ -57,12 +58,16 @@ def SVM_binary_minimum(classifier: BaseEstimator, Returns: The index of the instance from X chosen to be labelled; the instance from X chosen to be labelled. """ - min_abs_dist = np.min(np.abs(classifier.estimator.decision_function(X_pool)), axis=1) + + decision_function = np.array([svm.decision_function(X_pool) + for svm in classifier.estimator.estimators_]).T + + min_abs_dist = np.min(np.abs(decision_function), axis=1) query_idx = np.argmin(min_abs_dist) return query_idx, X_pool[query_idx] -def max_loss(classifier: BaseEstimator, +def max_loss(classifier: OneVsRestClassifier, X_pool: modALinput, n_instances: int = 1) -> Tuple[np.ndarray, modALinput]: @@ -92,7 +97,7 @@ def max_loss(classifier: BaseEstimator, return query_idx, X_pool[query_idx] -def mean_max_loss(classifier: BaseEstimator, +def mean_max_loss(classifier: OneVsRestClassifier, X_pool: modALinput, n_instances: int = 1) -> Tuple[np.ndarray, modALinput]: """ @@ -119,7 +124,7 @@ def mean_max_loss(classifier: BaseEstimator, return query_idx, X_pool[query_idx] -def min_confidence(classifier: BaseEstimator, +def min_confidence(classifier: OneVsRestClassifier, X_pool: modALinput, n_instances: int = 1) -> Tuple[np.ndarray, modALinput]: """ @@ -144,7 +149,7 @@ def min_confidence(classifier: BaseEstimator, return query_idx, X_pool[query_idx] -def avg_confidence(classifier: BaseEstimator, +def avg_confidence(classifier: OneVsRestClassifier, X_pool: modALinput, n_instances: int = 1) -> Tuple[np.ndarray, modALinput]: """ @@ -169,7 +174,7 @@ def avg_confidence(classifier: BaseEstimator, return query_idx, X_pool[query_idx] -def max_score(classifier: BaseEstimator, +def max_score(classifier: OneVsRestClassifier, X_pool: modALinput, n_instances: int = 1) -> Tuple[np.ndarray, modALinput]: """ @@ -196,7 +201,7 @@ def max_score(classifier: 
BaseEstimator, return query_idx, X_pool[query_idx] -def avg_score(classifier: BaseEstimator, +def avg_score(classifier: OneVsRestClassifier, X_pool: modALinput, n_instances: int = 1) -> Tuple[np.ndarray, modALinput]: """ diff --git a/tests/core_tests.py b/tests/core_tests.py index 31d87d5..7b61582 100644 --- a/tests/core_tests.py +++ b/tests/core_tests.py @@ -943,7 +943,7 @@ def test_SVM_loss(self): self.assertEqual(mcc_loss.shape, (len(X_pool),)) def test_strategies(self): - for n_classes in range(2, 10): + for n_classes in range(3, 10): for n_pool_instances in range(1, 10): for n_query_instances in range(1, min(n_pool_instances, 3)): X_training = np.random.rand(n_pool_instances, 5) @@ -951,6 +951,10 @@ def test_strategies(self): X_pool = np.random.rand(n_pool_instances, 5) classifier = OneVsRestClassifier(SVC(probability=True, gamma='auto')) classifier.fit(X_training, y_training) + + active_learner = modAL.models.ActiveLearner(classifier) + modAL.multilabel.SVM_binary_minimum(active_learner, X_pool) + modAL.multilabel.mean_max_loss(classifier, X_pool, n_query_instances) modAL.multilabel.max_loss(classifier, X_pool, n_query_instances) modAL.multilabel.min_confidence(classifier, X_pool, n_query_instances) @@ -959,6 +963,7 @@ def test_strategies(self): modAL.multilabel.avg_score(classifier, X_pool, n_query_instances) + class TestExamples(unittest.TestCase): def test_examples(self): From 365b6b41991c6c13fbc8b60849017ff875246cd7 Mon Sep 17 00:00:00 2001 From: cosmic-cortex Date: Mon, 26 Nov 2018 16:12:31 +0100 Subject: [PATCH 022/182] add: summary of implemented algorithms --- docs/source/index.rst | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/docs/source/index.rst b/docs/source/index.rst index 2a0697e..aeeafc9 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -12,6 +12,18 @@ Welcome to the documentation for modAL! modAL is an active learning framework for Python3, designed with *modularity, flexibility* and *extensibility* in mind. 
Built on top of scikit-learn, it allows you to rapidly create active learning workflows with nearly complete freedom. What is more, you can easily replace parts with your custom built solutions, allowing you to design novel algorithms with ease. +Currently supported active learning strategies are + +- **uncertainty-based sampling:** *least confident* (`Lewis and Catlett `_), *max margin* and *max entropy* +- **committee-based algorithms:** *vote entropy*, *consensus entropy* and *max disagreement* (`Cohn et al. `_) +- **multilabel strategies:** *SVM binary minimum* (`Brinker `_), *max loss*, *mean max loss*, (`Li et al. `_) *MinConfidence*, *MeanConfidence*, *MinScore*, *MeanScore* (`Esuli and Sebastiani `_) +- **Bayesian optimization:** *probability of improvement*, *expected improvement* and *upper confidence bound* (`Snoek et al. `_) +- **batch active learning:** *ranked batch-mode sampling* (`Cardoso et al. `_) +- **information density framework** (`McCallum and Nigam `_) +- **stream-based sampling** (`Atlas et al. `_) +- **active regression** with *max standard deviance* sampling for Gaussian processes or ensemble regressors + + .. 
toctree:: :maxdepth: 1 :caption: Overview From 32c3e4285ca5d349ce0d4e52d3efa5fd1dd26694 Mon Sep 17 00:00:00 2001 From: cosmic-cortex Date: Mon, 26 Nov 2018 16:16:12 +0100 Subject: [PATCH 023/182] setup script updated --- setup.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index b288b83..f4978cc 100644 --- a/setup.py +++ b/setup.py @@ -2,12 +2,12 @@ setup( name='modAL', - version='0.3.1', + version='0.3.2', author='Tivadar Danka', author_email='85a5187a@opayq.com', description='A modular active learning framework for Python3', license='MIT', - url='https://cosmic-cortex.github.io/modAL', + url='https://modAL-python.github.io/', packages=['modAL', 'modAL.models', 'modAL.utils'], classifiers=['Development Status :: 4 - Beta'], install_requires=['numpy>=1.13', 'scikit-learn>=0.18', 'scipy>=0.18'], From d623d932073c433e1fab62b4d13f1b3c7e075842 Mon Sep 17 00:00:00 2001 From: cosmic-cortex Date: Mon, 26 Nov 2018 16:41:43 +0100 Subject: [PATCH 024/182] add: contributing guidelines added --- docs/source/content/overview/Contributing.rst | 38 +++++++++++++++++++ docs/source/index.rst | 1 + 2 files changed, 39 insertions(+) create mode 100644 docs/source/content/overview/Contributing.rst diff --git a/docs/source/content/overview/Contributing.rst b/docs/source/content/overview/Contributing.rst new file mode 100644 index 0000000..4499739 --- /dev/null +++ b/docs/source/content/overview/Contributing.rst @@ -0,0 +1,38 @@ +Contributing +============ + +Contributions to modAL are very much welcome! If you would like to help in general, visit the Issues page, where you'll find bugs to be fixed, features to be implemented. If you have a concrete feature in mind, you can proceed as follows. + +1. Open a new issue. This helps us to discuss your idea and makes sure that you are not working in parallel with other contributors. + +2. Fork the modAL repository and clone your fork to your local machine: + +.. 
code:: bash + + $ git clone git@github.com:username/modAL.git + + +3. Create a feature branch for the changes from the dev branch: + +.. code:: bash + + $ git checkout -b new-feature dev + + +Make sure that you create your branch from ``dev``. + +4. After you have finished implementing the feature, make sure that all the tests pass. The tests can be run as + +.. code:: bash + + $ python3 path-to-modAL-repo/tests/core_tests.py + +5. Commit and push the changes. + +.. code:: bash + + $ git add modified_files + $ git commit -m 'commit message explaning the changes briefly' + $ git push origin new-feature3 + +6. Create a pull request from your fork **to the dev branch**. After the code is reviewed and possible issues are cleared, the pull request is merged to ``dev``. diff --git a/docs/source/index.rst b/docs/source/index.rst index aeeafc9..a75fbf6 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -31,6 +31,7 @@ Currently supported active learning strategies are content/overview/modAL-in-a-nutshell content/overview/Installation content/overview/Extending-modAL + content/overview/Contributing .. toctree:: :maxdepth: 1 From d61b50066e25872434c039db03c28f7bd99d0b4d Mon Sep 17 00:00:00 2001 From: cosmic-cortex Date: Tue, 27 Nov 2018 22:17:43 +0100 Subject: [PATCH 025/182] add: expected error reduction almost implemented --- modAL/expected_error_reduction.py | 37 +++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) create mode 100644 modAL/expected_error_reduction.py diff --git a/modAL/expected_error_reduction.py b/modAL/expected_error_reduction.py new file mode 100644 index 0000000..f9c2990 --- /dev/null +++ b/modAL/expected_error_reduction.py @@ -0,0 +1,37 @@ +""" +Expected error reduction framework for active learning. 
+""" + +from typing import Tuple + +import numpy as np + +from scipy.stats import entropy +from sklearn.base import clone + +from modAL.models import ActiveLearner +from modAL.utils.data import modALinput, data_vstack +from modAL.utils.selection import multi_argmax + + +def expected_error_reduction(classifier: ActiveLearner, X: modALinput, + p_subsample=1.0: np.float, n_instances=1: int) -> Tuple[np.ndarray, modALinput]: + + expected_error = np.full(shape=(len(X), ), fill_value=-np.nan) + possible_labels = np.unique(classifier.y_training) + + for x_idx, x in enumerate(X): + # subsample the data if needed + if np.random.rand() <= p_subsample: + # estimate the expected error + for y in possible_labels: + X_new = data_vstack((classifier.X_training, x)) + y_new = None + + refitted_estimator = clone(classifier.estimator).fit() + + + query_idx = multi_argmax(expected_error, n_instances) + + return query_idx, X[query_idx] + From e17b1bbc94582e775304709d88b3026c60502342 Mon Sep 17 00:00:00 2001 From: cosmic-cortex Date: Wed, 28 Nov 2018 12:16:15 +0100 Subject: [PATCH 026/182] add: expected_error_reduction implementation finished, tests added --- modAL/expected_error.py | 70 ++++++++++ modAL/expected_error_reduction.py | 37 ----- modAL/utils/data.py | 4 +- tests/core_tests.py | 224 ++++++++++++++++-------------- 4 files changed, 191 insertions(+), 144 deletions(-) create mode 100644 modAL/expected_error.py delete mode 100644 modAL/expected_error_reduction.py diff --git a/modAL/expected_error.py b/modAL/expected_error.py new file mode 100644 index 0000000..737506d --- /dev/null +++ b/modAL/expected_error.py @@ -0,0 +1,70 @@ +""" +Expected error reduction framework for active learning. 
+""" + +from typing import Tuple + +import numpy as np + +from scipy.stats import entropy + +from sklearn.base import clone +from sklearn.exceptions import NotFittedError + +from modAL.models import ActiveLearner +from modAL.utils.data import modALinput, data_vstack +from modAL.utils.selection import multi_argmax + + +def expected_error_reduction(learner: ActiveLearner, X: modALinput, + p_subsample: np.float = 1.0, n_instances: int = 1) -> Tuple[np.ndarray, modALinput]: + """ + Expected error reduction query strategy. + + References: + Roy and McCallum, 2001 (http://groups.csail.mit.edu/rrg/papers/icml01.pdf) + + Args: + learner: The ActiveLearner object for which the expected error is to be estimated. + X: The samples. + p_subsample: Probability of keeping a sample from the pool when calculating expected error. + Significantly improves runtime for large sample pools. + n_instances: The number of instances to be sampled. + + + Returns: + The indices of the instances from X chosen to be labelled; the instances from X chosen to be labelled. 
+ """ + + assert 0.0 <= p_subsample <= 1.0, 'p_subsample subsampling keep ratio must be between 0.0 and 1.0' + + #expected_error = np.full(shape=(len(X), ), fill_value=-np.nan) + expected_error = np.zeros(shape=(len(X), )) + possible_labels = np.unique(learner.y_training) + + try: + X_proba = learner.predict_proba(X) + except NotFittedError: + # TODO: implement a proper cold-start + return 0, X[0] + + for x_idx, x in enumerate(X): + # subsample the data if needed + if np.random.rand() <= p_subsample: + # estimate the expected error + for y_idx, y in enumerate(possible_labels): + X_new = data_vstack((learner.X_training, x.reshape(1, -1))) + y_new = data_vstack((learner.y_training, np.array(y).reshape(1, ))) + + refitted_estimator = clone(learner.estimator).fit(X_new, y_new) + uncertainty = 1 - np.max(refitted_estimator.predict_proba(X), axis=1) + + expected_error[x_idx] += np.sum(uncertainty)*X_proba[x_idx, y_idx] + + else: + expected_error[x_idx] -np.nan + + query_idx = multi_argmax(expected_error, n_instances) + + return query_idx, X[query_idx] + diff --git a/modAL/expected_error_reduction.py b/modAL/expected_error_reduction.py deleted file mode 100644 index f9c2990..0000000 --- a/modAL/expected_error_reduction.py +++ /dev/null @@ -1,37 +0,0 @@ -""" -Expected error reduction framework for active learning. 
-""" - -from typing import Tuple - -import numpy as np - -from scipy.stats import entropy -from sklearn.base import clone - -from modAL.models import ActiveLearner -from modAL.utils.data import modALinput, data_vstack -from modAL.utils.selection import multi_argmax - - -def expected_error_reduction(classifier: ActiveLearner, X: modALinput, - p_subsample=1.0: np.float, n_instances=1: int) -> Tuple[np.ndarray, modALinput]: - - expected_error = np.full(shape=(len(X), ), fill_value=-np.nan) - possible_labels = np.unique(classifier.y_training) - - for x_idx, x in enumerate(X): - # subsample the data if needed - if np.random.rand() <= p_subsample: - # estimate the expected error - for y in possible_labels: - X_new = data_vstack((classifier.X_training, x)) - y_new = None - - refitted_estimator = clone(classifier.estimator).fit() - - - query_idx = multi_argmax(expected_error, n_instances) - - return query_idx, X[query_idx] - diff --git a/modAL/utils/data.py b/modAL/utils/data.py index 27637f7..32976e4 100644 --- a/modAL/utils/data.py +++ b/modAL/utils/data.py @@ -1,4 +1,4 @@ -from typing import Union +from typing import Union, Container from itertools import chain import numpy as np @@ -8,7 +8,7 @@ modALinput = Union[list, np.ndarray, sp.csr_matrix] -def data_vstack(blocks: modALinput) -> modALinput: +def data_vstack(blocks: Container) -> modALinput: """ Stack vertically both sparse and dense arrays. 
diff --git a/tests/core_tests.py b/tests/core_tests.py index 7b61582..368f9cb 100644 --- a/tests/core_tests.py +++ b/tests/core_tests.py @@ -5,13 +5,16 @@ import mock import modAL.models.base import modAL.models.learners -import modAL.uncertainty -import modAL.disagreement -import modAL.density import modAL.utils.selection import modAL.utils.validation import modAL.utils.combination +import modAL.acquisition +import modAL.batch +import modAL.density +import modAL.disagreement +import modAL.expected_error import modAL.multilabel +import modAL.uncertainty from copy import deepcopy from itertools import chain, product @@ -280,6 +283,119 @@ def test_selection(self): modAL.acquisition.max_UCB(optimizer, X, beta=np.random.rand(), n_instances=n_instances) +class TestDensity(unittest.TestCase): + + def test_similarize_distance(self): + from scipy.spatial.distance import cosine + sim = modAL.density.similarize_distance(cosine) + for _ in range(100): + for n_dim in range(1, 10): + X_1, X_2 = np.random.rand(n_dim), np.random.rand(n_dim) + np.testing.assert_almost_equal( + sim(X_1, X_2), + 1/(1 + cosine(X_1, X_2)) + ) + + def test_information_density(self): + for n_samples in range(1, 10): + for n_dim in range(1, 10): + X_pool = np.random.rand(n_samples, n_dim) + similarities = modAL.density.information_density(X_pool) + np.testing.assert_equal(len(similarities), n_samples) + + +class TestDisagreements(unittest.TestCase): + + def test_vote_entropy(self): + for n_samples in range(1, 10): + for n_classes in range(1, 10): + for true_query_idx in range(n_samples): + # 1. 
fitted committee + vote_return = np.zeros(shape=(n_samples, n_classes), dtype=np.int16) + vote_return[true_query_idx] = np.asarray(range(n_classes), dtype=np.int16) + committee = mock.MockCommittee(classes_=np.asarray(range(n_classes)), vote_return=vote_return) + vote_entr = modAL.disagreement.vote_entropy( + committee, np.random.rand(n_samples, n_classes) + ) + true_entropy = np.zeros(shape=(n_samples, )) + true_entropy[true_query_idx] = entropy(np.ones(n_classes)/n_classes) + np.testing.assert_array_almost_equal(vote_entr, true_entropy) + + # 2. unfitted committee + committee = mock.MockCommittee(fitted=False) + true_entropy = np.zeros(shape=(n_samples,)) + vote_entr = modAL.disagreement.vote_entropy( + committee, np.random.rand(n_samples, n_classes) + ) + np.testing.assert_almost_equal(vote_entr, true_entropy) + + def test_consensus_entropy(self): + for n_samples in range(1, 10): + for n_classes in range(2, 10): + for true_query_idx in range(n_samples): + # 1. fitted committee + proba = np.zeros(shape=(n_samples, n_classes)) + proba[:, 0] = 1.0 + proba[true_query_idx] = np.ones(n_classes)/n_classes + committee = mock.MockCommittee(predict_proba_return=proba) + consensus_entropy = modAL.disagreement.consensus_entropy( + committee, np.random.rand(n_samples, n_classes) + ) + true_entropy = np.zeros(shape=(n_samples,)) + true_entropy[true_query_idx] = entropy(np.ones(n_classes) / n_classes) + np.testing.assert_array_almost_equal(consensus_entropy, true_entropy) + + # 2. unfitted committee + committee = mock.MockCommittee(fitted=False) + true_entropy = np.zeros(shape=(n_samples,)) + consensus_entropy = modAL.disagreement.consensus_entropy( + committee, np.random.rand(n_samples, n_classes) + ) + np.testing.assert_almost_equal(consensus_entropy, true_entropy) + + def test_KL_max_disagreement(self): + for n_samples in range(1, 10): + for n_classes in range(2, 10): + for n_learners in range (2, 10): + # 1. 
fitted committee + vote_proba = np.zeros(shape=(n_samples, n_learners, n_classes)) + vote_proba[:, :, 0] = 1.0 + committee = mock.MockCommittee( + n_learners=n_learners, classes_=range(n_classes), + vote_proba_return=vote_proba + ) + + true_KL_disagreement = np.zeros(shape=(n_samples, )) + + try: + np.testing.assert_array_almost_equal( + true_KL_disagreement, + modAL.disagreement.KL_max_disagreement(committee, np.random.rand(n_samples, 1)) + ) + except: + modAL.disagreement.KL_max_disagreement(committee, np.random.rand(n_samples, 1)) + + # 2. unfitted committee + committee = mock.MockCommittee(fitted=False) + true_KL_disagreement = np.zeros(shape=(n_samples,)) + returned_KL_disagreement = modAL.disagreement.KL_max_disagreement( + committee, np.random.rand(n_samples, n_classes) + ) + np.testing.assert_almost_equal(returned_KL_disagreement, true_KL_disagreement) + + +class TestEER(unittest.TestCase): + def test_eer(self): + for n_pool, n_features, n_classes in product(range(1, 10), range(1, 5), range(2, 5)): + X_training, y_training = np.random.rand(10, n_features), np.random.randint(0, n_classes, size=10) + X_pool, y_pool = np.random.rand(n_pool, n_features), np.random.randint(0, n_classes+1, size=n_pool) + + learner = modAL.models.ActiveLearner(RandomForestClassifier(n_estimators=2), + X_training=X_training, y_training=y_training) + + modAL.expected_error.expected_error_reduction(learner, X_pool) + + class TestUncertainties(unittest.TestCase): def test_classifier_uncertainty(self): @@ -383,107 +499,6 @@ def test_entropy_sampling(self): np.testing.assert_array_equal(query_idx, true_query_idx) -class TestDensity(unittest.TestCase): - - def test_similarize_distance(self): - from scipy.spatial.distance import cosine - sim = modAL.density.similarize_distance(cosine) - for _ in range(100): - for n_dim in range(1, 10): - X_1, X_2 = np.random.rand(n_dim), np.random.rand(n_dim) - np.testing.assert_almost_equal( - sim(X_1, X_2), - 1/(1 + cosine(X_1, X_2)) - ) - - def 
test_information_density(self): - for n_samples in range(1, 10): - for n_dim in range(1, 10): - X_pool = np.random.rand(n_samples, n_dim) - similarities = modAL.density.information_density(X_pool) - np.testing.assert_equal(len(similarities), n_samples) - - -class TestDisagreements(unittest.TestCase): - - def test_vote_entropy(self): - for n_samples in range(1, 10): - for n_classes in range(1, 10): - for true_query_idx in range(n_samples): - # 1. fitted committee - vote_return = np.zeros(shape=(n_samples, n_classes), dtype=np.int16) - vote_return[true_query_idx] = np.asarray(range(n_classes), dtype=np.int16) - committee = mock.MockCommittee(classes_=np.asarray(range(n_classes)), vote_return=vote_return) - vote_entr = modAL.disagreement.vote_entropy( - committee, np.random.rand(n_samples, n_classes) - ) - true_entropy = np.zeros(shape=(n_samples, )) - true_entropy[true_query_idx] = entropy(np.ones(n_classes)/n_classes) - np.testing.assert_array_almost_equal(vote_entr, true_entropy) - - # 2. unfitted committee - committee = mock.MockCommittee(fitted=False) - true_entropy = np.zeros(shape=(n_samples,)) - vote_entr = modAL.disagreement.vote_entropy( - committee, np.random.rand(n_samples, n_classes) - ) - np.testing.assert_almost_equal(vote_entr, true_entropy) - - def test_consensus_entropy(self): - for n_samples in range(1, 10): - for n_classes in range(2, 10): - for true_query_idx in range(n_samples): - # 1. fitted committee - proba = np.zeros(shape=(n_samples, n_classes)) - proba[:, 0] = 1.0 - proba[true_query_idx] = np.ones(n_classes)/n_classes - committee = mock.MockCommittee(predict_proba_return=proba) - consensus_entropy = modAL.disagreement.consensus_entropy( - committee, np.random.rand(n_samples, n_classes) - ) - true_entropy = np.zeros(shape=(n_samples,)) - true_entropy[true_query_idx] = entropy(np.ones(n_classes) / n_classes) - np.testing.assert_array_almost_equal(consensus_entropy, true_entropy) - - # 2. 
unfitted committee - committee = mock.MockCommittee(fitted=False) - true_entropy = np.zeros(shape=(n_samples,)) - consensus_entropy = modAL.disagreement.consensus_entropy( - committee, np.random.rand(n_samples, n_classes) - ) - np.testing.assert_almost_equal(consensus_entropy, true_entropy) - - def test_KL_max_disagreement(self): - for n_samples in range(1, 10): - for n_classes in range(2, 10): - for n_learners in range (2, 10): - # 1. fitted committee - vote_proba = np.zeros(shape=(n_samples, n_learners, n_classes)) - vote_proba[:, :, 0] = 1.0 - committee = mock.MockCommittee( - n_learners=n_learners, classes_=range(n_classes), - vote_proba_return=vote_proba - ) - - true_KL_disagreement = np.zeros(shape=(n_samples, )) - - try: - np.testing.assert_array_almost_equal( - true_KL_disagreement, - modAL.disagreement.KL_max_disagreement(committee, np.random.rand(n_samples, 1)) - ) - except: - modAL.disagreement.KL_max_disagreement(committee, np.random.rand(n_samples, 1)) - - # 2. unfitted committee - committee = mock.MockCommittee(fitted=False) - true_KL_disagreement = np.zeros(shape=(n_samples,)) - returned_KL_disagreement = modAL.disagreement.KL_max_disagreement( - committee, np.random.rand(n_samples, n_classes) - ) - np.testing.assert_almost_equal(returned_KL_disagreement, true_KL_disagreement) - - class TestQueries(unittest.TestCase): def test_multi_argmax(self): @@ -963,7 +978,6 @@ def test_strategies(self): modAL.multilabel.avg_score(classifier, X_pool, n_query_instances) - class TestExamples(unittest.TestCase): def test_examples(self): From bdd741586b4102e60ebd190c13f8f7c6eeec9b19 Mon Sep 17 00:00:00 2001 From: cosmic-cortex Date: Wed, 28 Nov 2018 12:20:47 +0100 Subject: [PATCH 027/182] fix: SVC() warnings silenced --- tests/core_tests.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/core_tests.py b/tests/core_tests.py index 368f9cb..2771b98 100644 --- a/tests/core_tests.py +++ b/tests/core_tests.py @@ -949,7 +949,7 @@ def 
test_SVM_loss(self): y_training = np.random.randint(0, 2, size=(n_instances, n_classes)) X_pool = np.random.rand(n_instances, 5) y_pool = np.random.randint(0, 2, size=(n_instances, n_classes)) - classifier = OneVsRestClassifier(SVC()) + classifier = OneVsRestClassifier(SVC(probability=True, gamma='auto')) classifier.fit(X_training, y_training) avg_loss = modAL.multilabel._SVM_loss(classifier, X_pool) mcc_loss = modAL.multilabel._SVM_loss(classifier, X_pool, From 3f6bbcb3ab89fea17e973dccc7039e93f901b247 Mon Sep 17 00:00:00 2001 From: cosmic-cortex Date: Wed, 28 Nov 2018 14:33:02 +0100 Subject: [PATCH 028/182] add: expected log loss reduction --- modAL/expected_error.py | 53 ++++++++++++++++++++++++++++++++++++++++- tests/core_tests.py | 3 ++- 2 files changed, 54 insertions(+), 2 deletions(-) diff --git a/modAL/expected_error.py b/modAL/expected_error.py index 737506d..bc9889e 100644 --- a/modAL/expected_error.py +++ b/modAL/expected_error.py @@ -38,7 +38,6 @@ def expected_error_reduction(learner: ActiveLearner, X: modALinput, assert 0.0 <= p_subsample <= 1.0, 'p_subsample subsampling keep ratio must be between 0.0 and 1.0' - #expected_error = np.full(shape=(len(X), ), fill_value=-np.nan) expected_error = np.zeros(shape=(len(X), )) possible_labels = np.unique(learner.y_training) @@ -68,3 +67,55 @@ def expected_error_reduction(learner: ActiveLearner, X: modALinput, return query_idx, X[query_idx] + +def expected_log_loss_reduction(learner: ActiveLearner, X: modALinput, + p_subsample: np.float = 1.0, n_instances: int = 1) -> Tuple[np.ndarray, modALinput]: + """ + Expected log loss reduction query strategy. + + References: + Roy and McCallum, 2001 (http://groups.csail.mit.edu/rrg/papers/icml01.pdf) + + Args: + learner: The ActiveLearner object for which the expected log loss is to be estimated. + X: The samples. + p_subsample: Probability of keeping a sample from the pool when calculating expected log loss. + Significantly improves runtime for large sample pools. 
+ n_instances: The number of instances to be sampled. + + + Returns: + The indices of the instances from X chosen to be labelled; the instances from X chosen to be labelled. + """ + + assert 0.0 <= p_subsample <= 1.0, 'p_subsample subsampling keep ratio must be between 0.0 and 1.0' + + expected_log_loss = np.zeros(shape=(len(X), )) + possible_labels = np.unique(learner.y_training) + + try: + X_proba = learner.predict_proba(X) + except NotFittedError: + # TODO: implement a proper cold-start + return 0, X[0] + + for x_idx, x in enumerate(X): + # subsample the data if needed + if np.random.rand() <= p_subsample: + # estimate the expected error + for y_idx, y in enumerate(possible_labels): + X_new = data_vstack((learner.X_training, x.reshape(1, -1))) + y_new = data_vstack((learner.y_training, np.array(y).reshape(1, ))) + + refitted_estimator = clone(learner.estimator).fit(X_new, y_new) + refitted_proba = refitted_estimator.predict_proba(X) + entr = np.transpose(entropy(np.transpose(refitted_proba))) + + expected_log_loss[x_idx] += np.sum(entr)*X_proba[x_idx, y_idx] + + else: + expected_log_loss[x_idx] -np.nan + + query_idx = multi_argmax(expected_log_loss, n_instances) + + return query_idx, X[query_idx] diff --git a/tests/core_tests.py b/tests/core_tests.py index 2771b98..10109ad 100644 --- a/tests/core_tests.py +++ b/tests/core_tests.py @@ -386,13 +386,14 @@ def test_KL_max_disagreement(self): class TestEER(unittest.TestCase): def test_eer(self): - for n_pool, n_features, n_classes in product(range(1, 10), range(1, 5), range(2, 5)): + for n_pool, n_features, n_classes in product(range(5, 10), range(1, 5), range(2, 5)): X_training, y_training = np.random.rand(10, n_features), np.random.randint(0, n_classes, size=10) X_pool, y_pool = np.random.rand(n_pool, n_features), np.random.randint(0, n_classes+1, size=n_pool) learner = modAL.models.ActiveLearner(RandomForestClassifier(n_estimators=2), X_training=X_training, y_training=y_training) + 
modAL.expected_error.expected_log_loss_reduction(learner, X_pool) modAL.expected_error.expected_error_reduction(learner, X_pool) From a489b6c51a74cf725c0df6f6a087a212e0ca187e Mon Sep 17 00:00:00 2001 From: cosmic-cortex Date: Wed, 28 Nov 2018 14:45:39 +0100 Subject: [PATCH 029/182] fix: bugfix for subsampled eer --- modAL/expected_error.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/modAL/expected_error.py b/modAL/expected_error.py index bc9889e..0b02317 100644 --- a/modAL/expected_error.py +++ b/modAL/expected_error.py @@ -61,7 +61,7 @@ def expected_error_reduction(learner: ActiveLearner, X: modALinput, expected_error[x_idx] += np.sum(uncertainty)*X_proba[x_idx, y_idx] else: - expected_error[x_idx] -np.nan + expected_error[x_idx] = np.inf query_idx = multi_argmax(expected_error, n_instances) @@ -114,7 +114,7 @@ def expected_log_loss_reduction(learner: ActiveLearner, X: modALinput, expected_log_loss[x_idx] += np.sum(entr)*X_proba[x_idx, y_idx] else: - expected_log_loss[x_idx] -np.nan + expected_log_loss[x_idx] = np.inf query_idx = multi_argmax(expected_log_loss, n_instances) From ee484996988fcaa918a085ae6d1e7bccdf43642e Mon Sep 17 00:00:00 2001 From: cosmic-cortex Date: Wed, 28 Nov 2018 14:45:57 +0100 Subject: [PATCH 030/182] add: test cases now cover subsampling --- tests/core_tests.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/tests/core_tests.py b/tests/core_tests.py index 10109ad..5155b4a 100644 --- a/tests/core_tests.py +++ b/tests/core_tests.py @@ -395,6 +395,14 @@ def test_eer(self): modAL.expected_error.expected_log_loss_reduction(learner, X_pool) modAL.expected_error.expected_error_reduction(learner, X_pool) + modAL.expected_error.expected_log_loss_reduction(learner, X_pool, p_subsample=0.1) + modAL.expected_error.expected_error_reduction(learner, X_pool, p_subsample=0.1) + modAL.expected_error.expected_log_loss_reduction(learner, X_pool) + modAL.expected_error.expected_error_reduction(learner, X_pool) + 
self.assertRaises(AssertionError, modAL.expected_error.expected_error_reduction, + learner, X_pool, p_subsample=1.5) + self.assertRaises(AssertionError, modAL.expected_error.expected_log_loss_reduction, + learner, X_pool, p_subsample=1.5) class TestUncertainties(unittest.TestCase): From 47fd12bc62c634bd916389f6266069dfae80d6b3 Mon Sep 17 00:00:00 2001 From: cosmic-cortex Date: Wed, 28 Nov 2018 16:46:21 +0100 Subject: [PATCH 031/182] add: functions for calculating utility measures directly from classification probabilities --- modAL/uncertainty.py | 48 ++++++++++++++++++++++++++++++++++++++++++++ tests/core_tests.py | 18 +++++++++++++++++ 2 files changed, 66 insertions(+) diff --git a/modAL/uncertainty.py b/modAL/uncertainty.py index 64a353a..de3915d 100644 --- a/modAL/uncertainty.py +++ b/modAL/uncertainty.py @@ -12,6 +12,54 @@ from modAL.utils.data import modALinput +def _proba_uncertainty(proba: np.ndarray) -> np.ndarray: + """ + Calculates the uncertainty of the prediction probabilities. + + Args: + proba: Prediction probabilities. + + Returns: + Uncertainty of the prediction probabilities. + """ + + return 1 - np.max(proba, axis=1) + + +def _proba_margin(proba: np.ndarray) -> np.ndarray: + """ + Calculates the margin of the prediction probabilities. + + Args: + proba: Prediction probabilities. + + Returns: + Margin of the prediction probabilities. + """ + + if proba.shape[1] == 1: + return np.zeros(shape=len(proba)) + + part = np.partition(-proba, 1, axis=1) + margin = - part[:, 0] + part[:, 1] + + return margin + + +def _proba_entropy(proba: np.ndarray) -> np.ndarray: + """ + Calculates the entropy of the prediction probabilities. + + Args: + proba: Prediction probabilities. + + Returns: + Uncertainty of the prediction probabilities. 
+ """ + + return np.transpose(entropy(np.transpose(proba))) + + def classifier_uncertainty(classifier: BaseEstimator, X: modALinput, **predict_proba_kwargs) -> np.ndarray: """ Classification uncertainty of the classifier for the provided samples. diff --git a/tests/core_tests.py b/tests/core_tests.py index 5155b4a..130bc63 100644 --- a/tests/core_tests.py +++ b/tests/core_tests.py @@ -411,6 +411,12 @@ def test_classifier_uncertainty(self): test_cases = (Test(p * np.ones(shape=(k, l)), (1 - p) * np.ones(shape=(k, ))) for k in range(1, 100) for l in range(1, 10) for p in np.linspace(0, 1, 11)) for case in test_cases: + # testing _proba_uncertainty + np.testing.assert_almost_equal( + modAL.uncertainty._proba_uncertainty(case.input), + case.output + ) + # fitted estimator fitted_estimator = mock.MockEstimator(predict_proba_return=case.input) np.testing.assert_almost_equal( @@ -432,6 +438,12 @@ def test_classifier_margin(self): p * np.ones(shape=(l, ))*int(k!=1)) for k in range(1, 10) for l in range(1, 100) for p in np.linspace(0, 1, 11)) for case in chain(test_cases_1, test_cases_2): + # _proba_margin + np.testing.assert_almost_equal( + modAL.uncertainty._proba_margin(case.input), + case.output + ) + # fitted estimator fitted_estimator = mock.MockEstimator(predict_proba_return=case.input) np.testing.assert_almost_equal( @@ -453,6 +465,12 @@ def test_classifier_entropy(self): for sample_idx in range(n_samples): proba[sample_idx, np.random.choice(range(n_classes))] = 1.0 + # _proba_entropy + np.testing.assert_almost_equal( + modAL.uncertainty._proba_entropy(proba), + np.zeros(shape=(n_samples,)) + ) + # fitted estimator fitted_estimator = mock.MockEstimator(predict_proba_return=proba) np.testing.assert_equal( From 1a19391fcb6509e4b21a0eca6463960a48cea96b Mon Sep 17 00:00:00 2001 From: cosmic-cortex Date: Wed, 28 Nov 2018 19:11:54 +0100 Subject: [PATCH 032/182] refactor: expected error reduction and log loss reduction merged --- modAL/expected_error.py | 79 
++++++++++------------------------------- tests/core_tests.py | 10 +++--- 2 files changed, 22 insertions(+), 67 deletions(-) diff --git a/modAL/expected_error.py b/modAL/expected_error.py index 0b02317..e6f5942 100644 --- a/modAL/expected_error.py +++ b/modAL/expected_error.py @@ -2,7 +2,7 @@ Expected error reduction framework for active learning. """ -from typing import Tuple +from typing import Tuple, Callable import numpy as np @@ -14,9 +14,10 @@ from modAL.models import ActiveLearner from modAL.utils.data import modALinput, data_vstack from modAL.utils.selection import multi_argmax +from modAL.uncertainty import _proba_uncertainty, _proba_entropy -def expected_error_reduction(learner: ActiveLearner, X: modALinput, +def expected_error_reduction(learner: ActiveLearner, X: modALinput, loss: str = 'binary', p_subsample: np.float = 1.0, n_instances: int = 1) -> Tuple[np.ndarray, modALinput]: """ Expected error reduction query strategy. @@ -25,18 +26,23 @@ def expected_error_reduction(learner: ActiveLearner, X: modALinput, Roy and McCallum, 2001 (http://groups.csail.mit.edu/rrg/papers/icml01.pdf) Args: - learner: The ActiveLearner object for which the expected error is to be estimated. + learner: The ActiveLearner object for which the expected error + is to be estimated. X: The samples. - p_subsample: Probability of keeping a sample from the pool when calculating expected error. - Significantly improves runtime for large sample pools. + loss: The loss function to be used. Can be 'binary' or 'log'. + p_subsample: Probability of keeping a sample from the pool when + calculating expected error. Significantly improves runtime + for large sample pools. n_instances: The number of instances to be sampled. Returns: - The indices of the instances from X chosen to be labelled; the instances from X chosen to be labelled. + The indices of the instances from X chosen to be labelled; + the instances from X chosen to be labelled. 
""" assert 0.0 <= p_subsample <= 1.0, 'p_subsample subsampling keep ratio must be between 0.0 and 1.0' + assert loss in ['binary', 'log'], 'loss must be \'binary\' or \'log\'' expected_error = np.zeros(shape=(len(X), )) possible_labels = np.unique(learner.y_training) @@ -56,9 +62,13 @@ def expected_error_reduction(learner: ActiveLearner, X: modALinput, y_new = data_vstack((learner.y_training, np.array(y).reshape(1, ))) refitted_estimator = clone(learner.estimator).fit(X_new, y_new) - uncertainty = 1 - np.max(refitted_estimator.predict_proba(X), axis=1) + refitted_proba = refitted_estimator.predict_proba(X) + if loss is 'binary': + loss = _proba_uncertainty(refitted_proba) + elif loss is 'log': + loss = _proba_entropy(refitted_proba) - expected_error[x_idx] += np.sum(uncertainty)*X_proba[x_idx, y_idx] + expected_error[x_idx] += np.sum(loss)*X_proba[x_idx, y_idx] else: expected_error[x_idx] = np.inf @@ -66,56 +76,3 @@ def expected_error_reduction(learner: ActiveLearner, X: modALinput, query_idx = multi_argmax(expected_error, n_instances) return query_idx, X[query_idx] - - -def expected_log_loss_reduction(learner: ActiveLearner, X: modALinput, - p_subsample: np.float = 1.0, n_instances: int = 1) -> Tuple[np.ndarray, modALinput]: - """ - Expected log loss reduction query strategy. - - References: - Roy and McCallum, 2001 (http://groups.csail.mit.edu/rrg/papers/icml01.pdf) - - Args: - learner: The ActiveLearner object for which the expected log loss is to be estimated. - X: The samples. - p_subsample: Probability of keeping a sample from the pool when calculating expected log loss. - Significantly improves runtime for large sample pools. - n_instances: The number of instances to be sampled. - - - Returns: - The indices of the instances from X chosen to be labelled; the instances from X chosen to be labelled. 
- """ - - assert 0.0 <= p_subsample <= 1.0, 'p_subsample subsampling keep ratio must be between 0.0 and 1.0' - - expected_log_loss = np.zeros(shape=(len(X), )) - possible_labels = np.unique(learner.y_training) - - try: - X_proba = learner.predict_proba(X) - except NotFittedError: - # TODO: implement a proper cold-start - return 0, X[0] - - for x_idx, x in enumerate(X): - # subsample the data if needed - if np.random.rand() <= p_subsample: - # estimate the expected error - for y_idx, y in enumerate(possible_labels): - X_new = data_vstack((learner.X_training, x.reshape(1, -1))) - y_new = data_vstack((learner.y_training, np.array(y).reshape(1, ))) - - refitted_estimator = clone(learner.estimator).fit(X_new, y_new) - refitted_proba = refitted_estimator.predict_proba(X) - entr = np.transpose(entropy(np.transpose(refitted_proba))) - - expected_log_loss[x_idx] += np.sum(entr)*X_proba[x_idx, y_idx] - - else: - expected_log_loss[x_idx] = np.inf - - query_idx = multi_argmax(expected_log_loss, n_instances) - - return query_idx, X[query_idx] diff --git a/tests/core_tests.py b/tests/core_tests.py index 130bc63..2d06db9 100644 --- a/tests/core_tests.py +++ b/tests/core_tests.py @@ -393,16 +393,14 @@ def test_eer(self): learner = modAL.models.ActiveLearner(RandomForestClassifier(n_estimators=2), X_training=X_training, y_training=y_training) - modAL.expected_error.expected_log_loss_reduction(learner, X_pool) modAL.expected_error.expected_error_reduction(learner, X_pool) - modAL.expected_error.expected_log_loss_reduction(learner, X_pool, p_subsample=0.1) modAL.expected_error.expected_error_reduction(learner, X_pool, p_subsample=0.1) - modAL.expected_error.expected_log_loss_reduction(learner, X_pool) - modAL.expected_error.expected_error_reduction(learner, X_pool) + modAL.expected_error.expected_error_reduction(learner, X_pool, loss='binary') + modAL.expected_error.expected_error_reduction(learner, X_pool, p_subsample=0.1, loss='log') self.assertRaises(AssertionError, 
modAL.expected_error.expected_error_reduction, learner, X_pool, p_subsample=1.5) - self.assertRaises(AssertionError, modAL.expected_error.expected_log_loss_reduction, - learner, X_pool, p_subsample=1.5) + self.assertRaises(AssertionError, modAL.expected_error.expected_error_reduction, + learner, X_pool, loss=42) class TestUncertainties(unittest.TestCase): From 1a829fbad902380648dd295650483c958c4d2dff Mon Sep 17 00:00:00 2001 From: cosmic-cortex Date: Thu, 29 Nov 2018 16:07:54 +0100 Subject: [PATCH 033/182] fix: unnecessary imports removed --- modAL/expected_error.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/modAL/expected_error.py b/modAL/expected_error.py index e6f5942..a242964 100644 --- a/modAL/expected_error.py +++ b/modAL/expected_error.py @@ -2,12 +2,10 @@ Expected error reduction framework for active learning. """ -from typing import Tuple, Callable +from typing import Tuple import numpy as np -from scipy.stats import entropy - from sklearn.base import clone from sklearn.exceptions import NotFittedError From 92597b0d43d6f720ff788e826a20daa9cc172890 Mon Sep 17 00:00:00 2001 From: cosmic-cortex Date: Fri, 30 Nov 2018 09:08:19 +0100 Subject: [PATCH 034/182] fix: links to examples fixed --- README.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index a6e5db4..4652ea1 100644 --- a/README.md +++ b/README.md @@ -148,13 +148,13 @@ After a few queries, we can see that the prediction is much improved. 
## Additional examples Including this, many examples are available: - [Pool-based sampling](https://modal-python.readthedocs.io/en/latest/content/examples/pool-based_sampling.html) -- [Stream-based sampling](https://modal-python.readthedocs.io/en/latest/content/examples/Stream-based-sampling.html) +- [Stream-based sampling](https://modal-python.readthedocs.io/en/latest/content/examples/stream-based_sampling.html) - [Active regression](https://modal-python.readthedocs.io/en/latest/content/examples/active_regression.html) - [Ensemble regression](https://modal-python.readthedocs.io/en/latest/content/examples/ensemble_regression.html) - [Bayesian optimization](https://modal-python.readthedocs.io/en/latest/content/examples/bayesian_optimization.html) -- [Query by committee](https://modal-python.readthedocs.io/en/latest/content/examples/Query-by-committee.html) -- [Bootstrapping and bagging](https://modal-python.readthedocs.io/en/latest/content/examples/Bootstrapping-and-bagging.html) -- [Keras integration](https://modal-python.readthedocs.io/en/latest/content/examples/Keras-integration.html) +- [Query by committee](https://modal-python.readthedocs.io/en/latest/content/examples/query_by_committee.html) +- [Bootstrapping and bagging](https://modal-python.readthedocs.io/en/latest/content/examples/bootstrapping_and_bagging.html) +- [Keras integration](https://modal-python.readthedocs.io/en/latest/content/examples/Keras_integration.html) # Installation modAL requires From fa6e14e9c98c15daa4b9dd1b1b1d977ccfba3ec7 Mon Sep 17 00:00:00 2001 From: cosmic-cortex Date: Fri, 30 Nov 2018 10:39:38 +0100 Subject: [PATCH 035/182] version number updated --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index f4978cc..ab7252e 100644 --- a/setup.py +++ b/setup.py @@ -2,7 +2,7 @@ setup( name='modAL', - version='0.3.2', + version='0.3.3', author='Tivadar Danka', author_email='85a5187a@opayq.com', description='A modular active learning 
framework for Python3', From 9626af357ce076fba0610e3b9eb71c9c1ced33ad Mon Sep 17 00:00:00 2001 From: cosmic-cortex Date: Fri, 30 Nov 2018 10:41:11 +0100 Subject: [PATCH 036/182] add: expected_error added to docs --- docs/source/content/apireference/expected_error.rst | 5 +++++ 1 file changed, 5 insertions(+) create mode 100644 docs/source/content/apireference/expected_error.rst diff --git a/docs/source/content/apireference/expected_error.rst b/docs/source/content/apireference/expected_error.rst new file mode 100644 index 0000000..99f0622 --- /dev/null +++ b/docs/source/content/apireference/expected_error.rst @@ -0,0 +1,5 @@ +modAL.expected_error +==================== + +.. automodule:: modAL.expected_error + :members: From 55424a51e639a83d059e5d786c813949e0527d49 Mon Sep 17 00:00:00 2001 From: cosmic-cortex Date: Fri, 30 Nov 2018 10:42:20 +0100 Subject: [PATCH 037/182] add: link to expected error docs added to index --- docs/source/index.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/source/index.rst b/docs/source/index.rst index a75fbf6..34ded7d 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -76,6 +76,7 @@ Currently supported active learning strategies are content/apireference/uncertainty.rst content/apireference/disagreement.rst content/apireference/multilabel.rst + content/apireference/expected_error.rst content/apireference/acquisition.rst content/apireference/batch.rst content/apireference/density.rst From 2927cfee30239273c0098370b3be54725f10563a Mon Sep 17 00:00:00 2001 From: cosmic-cortex Date: Fri, 30 Nov 2018 15:57:54 +0100 Subject: [PATCH 038/182] fix: expected_error_reduction query strategy runtime improved --- modAL/expected_error.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/modAL/expected_error.py b/modAL/expected_error.py index a242964..65f00d9 100644 --- a/modAL/expected_error.py +++ b/modAL/expected_error.py @@ -51,6 +51,8 @@ def expected_error_reduction(learner: ActiveLearner, X: 
modALinput, loss: str = # TODO: implement a proper cold-start return 0, X[0] + cloned_estimator = clone(learner.estimator) + for x_idx, x in enumerate(X): # subsample the data if needed if np.random.rand() <= p_subsample: @@ -59,8 +61,8 @@ def expected_error_reduction(learner: ActiveLearner, X: modALinput, loss: str = X_new = data_vstack((learner.X_training, x.reshape(1, -1))) y_new = data_vstack((learner.y_training, np.array(y).reshape(1, ))) - refitted_estimator = clone(learner.estimator).fit(X_new, y_new) - refitted_proba = refitted_estimator.predict_proba(X) + cloned_estimator.fit(X_new, y_new) + refitted_proba = cloned_estimator.predict_proba(X) if loss is 'binary': loss = _proba_uncertainty(refitted_proba) elif loss is 'log': From cdda99c9866312418ad85f0f34ac5b1d2787697e Mon Sep 17 00:00:00 2001 From: cosmic-cortex Date: Fri, 30 Nov 2018 16:47:23 +0100 Subject: [PATCH 039/182] add: list of implemented algorithms updated --- docs/source/index.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/source/index.rst b/docs/source/index.rst index 34ded7d..d3adafb 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -17,6 +17,7 @@ Currently supported active learning strategies are - **uncertainty-based sampling:** *least confident* (`Lewis and Catlett `_), *max margin* and *max entropy* - **committee-based algorithms:** *vote entropy*, *consensus entropy* and *max disagreement* (`Cohn et al. `_) - **multilabel strategies:** *SVM binary minimum* (`Brinker `_), *max loss*, *mean max loss*, (`Li et al. `_) *MinConfidence*, *MeanConfidence*, *MinScore*, *MeanScore* (`Esuli and Sebastiani `_) +- **expected error reduction:** *binary* and *log loss* (`Roy and McCallum `_) - **Bayesian optimization:** *probability of improvement*, *expected improvement* and *upper confidence bound* (`Snoek et al. `_) - **batch active learning:** *ranked batch-mode sampling* (`Cardoso et al. 
`_) - **information density framework** (`McCallum and Nigam `_) From 8f856fd716715e73f29b0e4df0cecfadbd660653 Mon Sep 17 00:00:00 2001 From: cosmic-cortex Date: Sat, 1 Dec 2018 13:04:55 +0100 Subject: [PATCH 040/182] add: runtime comparison script across libraries added --- examples/runtime_comparison.py | 218 +++++++++++++++++++++++++++++++++ 1 file changed, 218 insertions(+) create mode 100644 examples/runtime_comparison.py diff --git a/examples/runtime_comparison.py b/examples/runtime_comparison.py new file mode 100644 index 0000000..f6fdf13 --- /dev/null +++ b/examples/runtime_comparison.py @@ -0,0 +1,218 @@ +import numpy as np + +from time import time + +from sklearn.datasets import load_iris + +from acton.acton import main as acton_main + +from alp.active_learning.active_learning import ActiveLearner as ActiveLearnerALP + +from libact.base.dataset import Dataset +from libact.labelers import IdealLabeler +from libact.query_strategies import UncertaintySampling, QueryByCommittee +from libact.query_strategies.multiclass.expected_error_reduction import EER +from libact.models.logistic_regression import LogisticRegression as LogisticRegressionLibact + +from modAL.models import ActiveLearner, Committee +from modAL.expected_error import expected_error_reduction + +from sklearn.linear_model import LogisticRegression + + +runtime = {} + + +def timeit(n_reps=10): + + def timer(func): + + def timed_func(*args, **kwargs): + start = time() + for _ in range(n_reps): + result = func(*args, **kwargs) + end = time() + print("%s has been executed in %f s avg for %d reps" % (func.__name__, (end - start)/n_reps, n_reps)) + runtime[func.__name__] = (end - start)/n_reps + return result + + return timed_func + + return timer + + +@timeit() +def libact_uncertainty(X, y, n_queries): + y_train = np.array([None for _ in range(len(y))]) + y_train[0], y_train[50], y_train[100] = 0, 1, 2 + libact_train_dataset = Dataset(X, y_train) + libact_full_dataset = Dataset(X, y) + libact_learner = 
LogisticRegressionLibact(solver='liblinear', n_jobs=1, multi_class='ovr') #SVM(gamma='auto', probability=True) + libact_qs = UncertaintySampling(libact_train_dataset, model=libact_learner, method='lc') + libact_labeler = IdealLabeler(libact_full_dataset) + libact_learner.train(libact_train_dataset) + + for _ in range(n_queries): + query_idx = libact_qs.make_query() + query_label = libact_labeler.label(X[query_idx]) + libact_train_dataset.update(query_idx, query_label) + libact_learner.train(libact_train_dataset) + + +@timeit() +def libact_EER(X, y, n_queries): + y_train = np.array([None for _ in range(len(y))]) + y_train[0], y_train[50], y_train[100] = 0, 1, 2 + libact_train_dataset = Dataset(X, y_train) + libact_full_dataset = Dataset(X, y) + libact_learner = LogisticRegressionLibact(solver='liblinear', n_jobs=1, multi_class='ovr') #SVM(gamma='auto', probability=True) + libact_qs = EER(libact_train_dataset, model=libact_learner, loss='01') + libact_labeler = IdealLabeler(libact_full_dataset) + libact_learner.train(libact_train_dataset) + + for _ in range(n_queries): + query_idx = libact_qs.make_query() + query_label = libact_labeler.label(X[query_idx]) + libact_train_dataset.update(query_idx, query_label) + libact_learner.train(libact_train_dataset) + + +@timeit() +def libact_QBC(X, y, n_queries): + y_train = np.array([None for _ in range(len(y))]) + y_train[0], y_train[50], y_train[100] = 0, 1, 2 + libact_train_dataset = Dataset(X, y_train) + libact_full_dataset = Dataset(X, y) + libact_learner_list = [LogisticRegressionLibact(solver='liblinear', n_jobs=1, multi_class='ovr'), + LogisticRegressionLibact(solver='liblinear', n_jobs=1, multi_class='ovr')] + libact_qs = QueryByCommittee(libact_train_dataset, models=libact_learner_list, + method='lc') + libact_labeler = IdealLabeler(libact_full_dataset) + for libact_learner in libact_learner_list: + libact_learner.train(libact_train_dataset) + + for _ in range(n_queries): + query_idx = libact_qs.make_query() + 
query_label = libact_labeler.label(X[query_idx]) + libact_train_dataset.update(query_idx, query_label) + for libact_learner in libact_learner_list: + libact_learner.train(libact_train_dataset) + + +@timeit() +def modAL_uncertainty(X, y, n_queries): + modAL_learner = ActiveLearner(LogisticRegression(solver='liblinear', n_jobs=1, multi_class='ovr'), + X_training=X[[0, 50, 100]], y_training=y[[0, 50, 100]]) + + for _ in range(n_queries): + query_idx, query_inst = modAL_learner.query(X) + modAL_learner.teach(X[query_idx], y[query_idx]) + + +@timeit() +def modAL_QBC(X, y, n_queries): + learner_list = [ActiveLearner(LogisticRegression(solver='liblinear', n_jobs=1, multi_class='ovr'), + X_training=X[[0, 50, 100]], y_training=y[[0, 50, 100]]), + ActiveLearner(LogisticRegression(solver='liblinear', n_jobs=1, multi_class='ovr'), + X_training=X[[0, 50, 100]], y_training=y[[0, 50, 100]])] + + modAL_learner = Committee(learner_list) + + for _ in range(n_queries): + query_idx, query_inst = modAL_learner.query(X) + modAL_learner.teach(X[query_idx], y[query_idx]) + + +@timeit() +def modAL_EER(X, y, n_queries): + modAL_learner = ActiveLearner(LogisticRegression(solver='liblinear', n_jobs=1, multi_class='ovr'), + query_strategy=expected_error_reduction, + X_training=X[[0, 50, 100]], y_training=y[[0, 50, 100]]) + + for _ in range(n_queries): + query_idx, query_inst = modAL_learner.query(X) + modAL_learner.teach(X[query_idx], y[query_idx]) + + +@timeit() +# acton requires a txt format for data +def acton_uncertainty(data_path, n_queries): + # acton has no SVM support, so the LogisticRegression model is used + acton_main( + data_path=data_path, + feature_cols=['feat01', 'feat02', 'feat03', 'feat04'], + label_col='label', + output_path='out.csv', + n_epochs=n_queries, + initial_count=3, + recommender='UncertaintyRecommender', + predictor='LogisticRegression') + + +@timeit() +# acton requires a txt format for data +def acton_QBC(data_path, n_queries): + # acton has no SVM support, so the 
LogisticRegression model is used + acton_main( + data_path=data_path, + feature_cols=['feat01', 'feat02', 'feat03', 'feat04'], + label_col='label', + output_path='out.csv', + n_epochs=n_queries, + initial_count=3, + recommender='QBCRecommender', + predictor='LogisticRegressionCommittee') + + +@timeit() +def alp_uncertainty(X, y, n_queries): + X_labeled, y_labeled = X[[0, 50, 100]], y[[0, 50, 100]] + estimator = LogisticRegression(solver='liblinear', n_jobs=1, multi_class='ovr') + estimator.fit(X_labeled, y_labeled) + learner = ActiveLearnerALP(strategy='least_confident') + + for _ in range(n_queries): + query_idx = learner.rank(estimator, X, num_queries=1) + X_labeled = np.concatenate((X_labeled, X[query_idx]), axis=0) + y_labeled = np.concatenate((y_labeled, y[query_idx]), axis=0) + estimator.fit(X_labeled, y_labeled) + + +@timeit() +def alp_QBC(X, y, n_queries): + X_labeled, y_labeled = X[[0, 50, 100]], y[[0, 50, 100]] + estimators = [LogisticRegression(solver='liblinear', n_jobs=1, multi_class='ovr'), + LogisticRegression(solver='liblinear', n_jobs=1, multi_class='ovr')] + + for estimator in estimators: + estimator.fit(X_labeled, y_labeled) + + learner = ActiveLearnerALP(strategy='vote_entropy') + + for _ in range(n_queries): + query_idx = learner.rank(estimators, X, num_queries=1) + X_labeled = np.concatenate((X_labeled, X[query_idx]), axis=0) + y_labeled = np.concatenate((y_labeled, y[query_idx]), axis=0) + for estimator in estimators: + estimator.fit(X_labeled, y_labeled) + + +def comparisons(n_queries=10): + # loading the data + X, y = load_iris(return_X_y=True) + + libact_uncertainty(X, y, n_queries) + libact_QBC(X, y, n_queries) + libact_EER(X, y, n_queries) + acton_uncertainty('iris.csv', n_queries) + acton_QBC('iris.csv', n_queries) + alp_uncertainty(X, y, n_queries) + alp_QBC(X, y, n_queries) + modAL_uncertainty(X, y, n_queries) + modAL_QBC(X, y, n_queries) + modAL_EER(X, y, n_queries) + + +if __name__ == '__main__': + comparisons() + print(runtime) 
From 61ad80d3b2540d4e478c84914545ed384b837822 Mon Sep 17 00:00:00 2001 From: cosmic-cortex Date: Tue, 4 Dec 2018 13:36:50 +0100 Subject: [PATCH 041/182] add: shuffled_argmax added --- modAL/utils/selection.py | 33 ++++++++++++++++++++++++++++++++- 1 file changed, 32 insertions(+), 1 deletion(-) diff --git a/modAL/utils/selection.py b/modAL/utils/selection.py index 977bf33..73700b8 100644 --- a/modAL/utils/selection.py +++ b/modAL/utils/selection.py @@ -5,6 +5,37 @@ import numpy as np +def shuffled_argmax(values: np.ndarray, n_instances: int = 1) -> np.ndarray: + """ + Shuffles the values and sorts them afterwards. This can be used to break + the tie when the highest utility score is not unique. The shuffle randomizes + order, which is preserved by the mergesort algorithm. + + Args: + values: + n_instances: + + Args: + values: Contains the values to be selected from. + n_instances: Specifies how many indices to return. + + Returns: + The indices of the n_instances largest values. + """ + + # shuffling indices and corresponding values + shuffled_idx = np.random.permutation(len(values)) + shuffled_values = values[shuffled_idx] + + # getting the n_instances best instance + # since mergesort is used, the shuffled order is preserved + sorted_query_idx = np.argsort(shuffled_values, kind='mergesort')[:n_instances] + + # inverting the shuffle + query_idx = shuffled_idx[sorted_query_idx] + return query_idx + + def multi_argmax(values: np.ndarray, n_instances: int = 1) -> np.ndarray: """ Selects the indices of the n_instances highest values. @@ -14,7 +45,7 @@ def multi_argmax(values: np.ndarray, n_instances: int = 1) -> np.ndarray: n_instances: Specifies how many indices to return. Returns: - Contains the indices of the n_instances largest values. + The indices of the n_instances largest values. 
""" assert n_instances <= values.shape[0], 'n_instances must be less or equal than the size of utility' From a58f317eecc12826b1dd0a0df4733978cc8d0046 Mon Sep 17 00:00:00 2001 From: cosmic-cortex Date: Tue, 4 Dec 2018 13:37:11 +0100 Subject: [PATCH 042/182] add: tests for shuffled_argmax --- tests/core_tests.py | 58 ++++++++++++++++++++++++++------------------- 1 file changed, 34 insertions(+), 24 deletions(-) diff --git a/tests/core_tests.py b/tests/core_tests.py index 2d06db9..583569b 100644 --- a/tests/core_tests.py +++ b/tests/core_tests.py @@ -159,6 +159,40 @@ def test_data_vstack(self): # not supported formats self.assertRaises(TypeError, modAL.utils.data.data_vstack, (1, 1)) + # functions from modAL.utils.selection + + def test_multi_argmax(self): + for n_pool in range(2, 100): + for n_instances in range(1, n_pool): + utility = np.zeros(n_pool) + max_idx = np.random.choice(range(n_pool), size=n_instances, replace=False) + utility[max_idx] = 1e-10 + np.random.rand(n_instances, ) + np.testing.assert_equal( + np.sort(modAL.utils.selection.multi_argmax(utility, n_instances)), + np.sort(max_idx) + ) + + def test_shuffled_argmax(self): + for n_pool in range(1, 100): + for n_instances in range(1, n_pool+1): + values = np.random.permutation(n_pool) + true_query_idx = np.argsort(values)[:n_instances] + + np.testing.assert_equal( + true_query_idx, + modAL.utils.selection.shuffled_argmax(values, n_instances) + ) + + def test_weighted_random(self): + for n_pool in range(2, 100): + for n_instances in range(1, n_pool): + utility = np.ones(n_pool) + query_idx = modAL.utils.selection.weighted_random(utility, n_instances) + # testing for correct number of returned indices + np.testing.assert_equal(len(query_idx), n_instances) + # testing for uniqueness of each query index + np.testing.assert_equal(len(query_idx), len(np.unique(query_idx))) + class TestAcquisitionFunctions(unittest.TestCase): def test_acquisition_functions(self): @@ -524,30 +558,6 @@ def 
test_entropy_sampling(self): np.testing.assert_array_equal(query_idx, true_query_idx) -class TestQueries(unittest.TestCase): - - def test_multi_argmax(self): - for n_pool in range(2, 100): - for n_instances in range(1, n_pool): - utility = np.zeros(n_pool) - max_idx = np.random.choice(range(n_pool), size=n_instances, replace=False) - utility[max_idx] = 1e-10 + np.random.rand(n_instances, ) - np.testing.assert_equal( - np.sort(modAL.utils.selection.multi_argmax(utility, n_instances)), - np.sort(max_idx) - ) - - def test_weighted_random(self): - for n_pool in range(2, 100): - for n_instances in range(1, n_pool): - utility = np.ones(n_pool) - query_idx = modAL.utils.selection.weighted_random(utility, n_instances) - # testing for correct number of returned indices - np.testing.assert_equal(len(query_idx), n_instances) - # testing for uniqueness of each query index - np.testing.assert_equal(len(query_idx), len(np.unique(query_idx))) - - class TestActiveLearner(unittest.TestCase): def test_add_training_data(self): From 9b0f1197d278812a0b66f525fb1d2577e0657513 Mon Sep 17 00:00:00 2001 From: cosmic-cortex Date: Tue, 4 Dec 2018 14:58:18 +0100 Subject: [PATCH 043/182] add: random tie break implemented for uncertainty sampling methods --- modAL/uncertainty.py | 50 +++++++++++++++++++++++++++++++------------- tests/core_tests.py | 12 +++++++++++ 2 files changed, 48 insertions(+), 14 deletions(-) diff --git a/modAL/uncertainty.py b/modAL/uncertainty.py index de3915d..2659dc3 100644 --- a/modAL/uncertainty.py +++ b/modAL/uncertainty.py @@ -8,8 +8,8 @@ from sklearn.exceptions import NotFittedError from sklearn.base import BaseEstimator -from modAL.utils.selection import multi_argmax from modAL.utils.data import modALinput +from modAL.utils.selection import multi_argmax, shuffled_argmax def _proba_uncertainty(proba: np.ndarray) -> np.ndarray: @@ -131,7 +131,8 @@ def classifier_entropy(classifier: BaseEstimator, X: modALinput, **predict_proba def uncertainty_sampling(classifier: 
BaseEstimator, X: modALinput, - n_instances: int = 1, **uncertainty_measure_kwargs) -> Tuple[np.ndarray, modALinput]: + n_instances: int = 1, random_tie_break: bool = False, + **uncertainty_measure_kwargs) -> Tuple[np.ndarray, modALinput]: """ Uncertainty sampling query strategy. Selects the least sure instances for labelling. @@ -139,53 +140,74 @@ def uncertainty_sampling(classifier: BaseEstimator, X: modALinput, classifier: The classifier for which the labels are to be queried. X: The pool of samples to query from. n_instances: Number of samples to be queried. - **uncertainty_measure_kwargs: Keyword arguments to be passed for the uncertainty measure function. + random_tie_break: If True, shuffles utility scores to randomize the order. This + can be used to break the tie when the highest utility score is not unique. + **uncertainty_measure_kwargs: Keyword arguments to be passed for the uncertainty + measure function. Returns: - The indices of the instances from X chosen to be labelled; the instances from X chosen to be labelled. + The indices of the instances from X chosen to be labelled; + the instances from X chosen to be labelled. """ uncertainty = classifier_uncertainty(classifier, X, **uncertainty_measure_kwargs) - query_idx = multi_argmax(uncertainty, n_instances=n_instances) + + if not random_tie_break: + query_idx = multi_argmax(uncertainty, n_instances=n_instances) + else: + query_idx = shuffled_argmax(uncertainty, n_instances=n_instances) return query_idx, X[query_idx] def margin_sampling(classifier: BaseEstimator, X: modALinput, - n_instances: int = 1, **uncertainty_measure_kwargs) -> Tuple[np.ndarray, modALinput]: + n_instances: int = 1, random_tie_break: bool = False, + **uncertainty_measure_kwargs) -> Tuple[np.ndarray, modALinput]: """ Margin sampling query strategy. Selects the instances where the difference between the first most likely and second most likely classes are the smallest. 
- Args: classifier: The classifier for which the labels are to be queried. X: The pool of samples to query from. n_instances: Number of samples to be queried. **uncertainty_measure_kwargs: Keyword arguments to be passed for the uncertainty measure function. - Returns: The indices of the instances from X chosen to be labelled; the instances from X chosen to be labelled. """ margin = classifier_margin(classifier, X, **uncertainty_measure_kwargs) - query_idx = multi_argmax(-margin, n_instances=n_instances) + + if not random_tie_break: + query_idx = multi_argmax(-margin, n_instances=n_instances) + else: + query_idx = shuffled_argmax(-margin, n_instances=n_instances) return query_idx, X[query_idx] def entropy_sampling(classifier: BaseEstimator, X: modALinput, - n_instances: int = 1, **uncertainty_measure_kwargs) -> Tuple[np.ndarray, modALinput]: + n_instances: int = 1, random_tie_break: bool = False, + **uncertainty_measure_kwargs) -> Tuple[np.ndarray, modALinput]: """ - Entropy sampling query strategy. Selects the instances where the class probabilities have the largest entropy. + Entropy sampling query strategy. Selects the instances where the class probabilities + have the largest entropy. Args: classifier: The classifier for which the labels are to be queried. X: The pool of samples to query from. n_instances: Number of samples to be queried. - **uncertainty_measure_kwargs: Keyword arguments to be passed for the uncertainty measure function. + random_tie_break: If True, shuffles utility scores to randomize the order. This + can be used to break the tie when the highest utility score is not unique. + **uncertainty_measure_kwargs: Keyword arguments to be passed for the uncertainty + measure function. Returns: - The indices of the instances from X chosen to be labelled; the instances from X chosen to be labelled. + The indices of the instances from X chosen to be labelled; + the instances from X chosen to be labelled. 
""" entropy = classifier_entropy(classifier, X, **uncertainty_measure_kwargs) - query_idx = multi_argmax(entropy, n_instances=n_instances) + + if not random_tie_break: + query_idx = multi_argmax(entropy, n_instances=n_instances) + else: + query_idx = shuffled_argmax(entropy, n_instances=n_instances) return query_idx, X[query_idx] diff --git a/tests/core_tests.py b/tests/core_tests.py index 583569b..3e7fb39 100644 --- a/tests/core_tests.py +++ b/tests/core_tests.py @@ -528,6 +528,10 @@ def test_uncertainty_sampling(self): query_idx, query_instance = modAL.uncertainty.uncertainty_sampling( classifier, np.random.rand(n_samples, n_classes) ) + shuffled_query_idx, shuffled_query_instance = modAL.uncertainty.uncertainty_sampling( + classifier, np.random.rand(n_samples, n_classes), + random_tie_break=True + ) np.testing.assert_array_equal(query_idx, true_query_idx) def test_margin_sampling(self): @@ -541,6 +545,10 @@ def test_margin_sampling(self): query_idx, query_instance = modAL.uncertainty.margin_sampling( classifier, np.random.rand(n_samples, n_classes) ) + shuffled_query_idx, shuffled_query_instance = modAL.uncertainty.margin_sampling( + classifier, np.random.rand(n_samples, n_classes), + random_tie_break=True + ) np.testing.assert_array_equal(query_idx, true_query_idx) def test_entropy_sampling(self): @@ -555,6 +563,10 @@ def test_entropy_sampling(self): query_idx, query_instance = modAL.uncertainty.entropy_sampling( classifier, np.random.rand(n_samples, n_classes) ) + shuffled_query_idx, shuffled_query_instance = modAL.uncertainty.entropy_sampling( + classifier, np.random.rand(n_samples, n_classes), + random_tie_break=True + ) np.testing.assert_array_equal(query_idx, true_query_idx) From b2eeede5735e833568843ef66f0eea98f5f6c119 Mon Sep 17 00:00:00 2001 From: cosmic-cortex Date: Wed, 5 Dec 2018 12:17:07 +0100 Subject: [PATCH 044/182] fix: minor docstring fixes --- modAL/uncertainty.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git 
a/modAL/uncertainty.py b/modAL/uncertainty.py index 2659dc3..c11de43 100644 --- a/modAL/uncertainty.py +++ b/modAL/uncertainty.py @@ -163,15 +163,19 @@ def margin_sampling(classifier: BaseEstimator, X: modALinput, n_instances: int = 1, random_tie_break: bool = False, **uncertainty_measure_kwargs) -> Tuple[np.ndarray, modALinput]: """ - Margin sampling query strategy. Selects the instances where the difference between the first most likely and second - most likely classes are the smallest. + Margin sampling query strategy. Selects the instances where the difference between + the first most likely and second most likely classes are the smallest. Args: classifier: The classifier for which the labels are to be queried. X: The pool of samples to query from. n_instances: Number of samples to be queried. - **uncertainty_measure_kwargs: Keyword arguments to be passed for the uncertainty measure function. + random_tie_break: If True, shuffles utility scores to randomize the order. This + can be used to break the tie when the highest utility score is not unique. + **uncertainty_measure_kwargs: Keyword arguments to be passed for the uncertainty + measure function. Returns: - The indices of the instances from X chosen to be labelled; the instances from X chosen to be labelled. + The indices of the instances from X chosen to be labelled; + the instances from X chosen to be labelled. 
""" margin = classifier_margin(classifier, X, **uncertainty_measure_kwargs) From e0af35f35b634b0246e1b5dd30b1ccb055885757 Mon Sep 17 00:00:00 2001 From: cosmic-cortex Date: Wed, 5 Dec 2018 12:18:59 +0100 Subject: [PATCH 045/182] add: random tie break for disagreement sampling --- modAL/disagreement.py | 68 +++++++++++++++++++++++++++++++++---------- 1 file changed, 52 insertions(+), 16 deletions(-) diff --git a/modAL/disagreement.py b/modAL/disagreement.py index 3b4bcda..3d92ad5 100644 --- a/modAL/disagreement.py +++ b/modAL/disagreement.py @@ -10,7 +10,7 @@ from sklearn.base import BaseEstimator from modAL.utils.data import modALinput -from modAL.utils.selection import multi_argmax +from modAL.utils.selection import multi_argmax, shuffled_argmax from modAL.models.base import BaseCommittee @@ -103,7 +103,8 @@ def KL_max_disagreement(committee: BaseCommittee, X: modALinput, **predict_proba def vote_entropy_sampling(committee: BaseCommittee, X: modALinput, - n_instances: int = 1,**disagreement_measure_kwargs) -> Tuple[np.ndarray, modALinput]: + n_instances: int = 1, random_tie_break=False, + **disagreement_measure_kwargs) -> Tuple[np.ndarray, modALinput]: """ Vote entropy sampling strategy. @@ -111,19 +112,28 @@ def vote_entropy_sampling(committee: BaseCommittee, X: modALinput, committee: The committee for which the labels are to be queried. X: The pool of samples to query from. n_instances: Number of samples to be queried. - **disagreement_measure_kwargs: Keyword arguments to be passed for the disagreement measure function. + random_tie_break: If True, shuffles utility scores to randomize the order. This + can be used to break the tie when the highest utility score is not unique. + **disagreement_measure_kwargs: Keyword arguments to be passed for the disagreement + measure function. Returns: - The indices of the instances from X chosen to be labelled; the instances from X chosen to be labelled. 
+ The indices of the instances from X chosen to be labelled; + the instances from X chosen to be labelled. """ disagreement = vote_entropy(committee, X, **disagreement_measure_kwargs) - query_idx = multi_argmax(disagreement, n_instances=n_instances) + + if not random_tie_break: + query_idx = multi_argmax(disagreement, n_instances=n_instances) + else: + query_idx = shuffled_argmax(disagreement, n_instances=n_instances) return query_idx, X[query_idx] def consensus_entropy_sampling(committee: BaseCommittee, X: modALinput, - n_instances: int = 1,**disagreement_measure_kwargs) -> Tuple[np.ndarray, modALinput]: + n_instances: int = 1, random_tie_break=False, + **disagreement_measure_kwargs) -> Tuple[np.ndarray, modALinput]: """ Consensus entropy sampling strategy. @@ -131,19 +141,28 @@ def consensus_entropy_sampling(committee: BaseCommittee, X: modALinput, committee: The committee for which the labels are to be queried. X: The pool of samples to query from. n_instances: Number of samples to be queried. - **disagreement_measure_kwargs: Keyword arguments to be passed for the disagreement measure function. + random_tie_break: If True, shuffles utility scores to randomize the order. This + can be used to break the tie when the highest utility score is not unique. + **disagreement_measure_kwargs: Keyword arguments to be passed for the disagreement + measure function. Returns: - The indices of the instances from X chosen to be labelled; the instances from X chosen to be labelled. + The indices of the instances from X chosen to be labelled; + the instances from X chosen to be labelled. 
""" disagreement = consensus_entropy(committee, X, **disagreement_measure_kwargs) - query_idx = multi_argmax(disagreement, n_instances=n_instances) + + if not random_tie_break: + query_idx = multi_argmax(disagreement, n_instances=n_instances) + else: + query_idx = shuffled_argmax(disagreement, n_instances=n_instances) return query_idx, X[query_idx] def max_disagreement_sampling(committee: BaseCommittee, X: modALinput, - n_instances: int = 1,**disagreement_measure_kwargs) -> Tuple[np.ndarray, modALinput]: + n_instances: int = 1, random_tie_break=False, + **disagreement_measure_kwargs) -> Tuple[np.ndarray, modALinput]: """ Maximum disagreement sampling strategy. @@ -151,19 +170,28 @@ def max_disagreement_sampling(committee: BaseCommittee, X: modALinput, committee: The committee for which the labels are to be queried. X: The pool of samples to query from. n_instances: Number of samples to be queried. - **disagreement_measure_kwargs: Keyword arguments to be passed for the disagreement measure function. + random_tie_break: If True, shuffles utility scores to randomize the order. This + can be used to break the tie when the highest utility score is not unique. + **disagreement_measure_kwargs: Keyword arguments to be passed for the disagreement + measure function. Returns: - The indices of the instances from X chosen to be labelled; the instances from X chosen to be labelled. + The indices of the instances from X chosen to be labelled; + the instances from X chosen to be labelled. 
""" disagreement = KL_max_disagreement(committee, X, **disagreement_measure_kwargs) - query_idx = multi_argmax(disagreement, n_instances=n_instances) + + if not random_tie_break: + query_idx = multi_argmax(disagreement, n_instances=n_instances) + else: + query_idx = shuffled_argmax(disagreement, n_instances=n_instances) return query_idx, X[query_idx] def max_std_sampling(regressor: BaseEstimator, X: modALinput, - n_instances: int = 1, **predict_kwargs) -> Tuple[np.ndarray, modALinput]: + n_instances: int = 1, random_tie_break=False, + **predict_kwargs) -> Tuple[np.ndarray, modALinput]: """ Regressor standard deviation sampling strategy. @@ -171,12 +199,20 @@ def max_std_sampling(regressor: BaseEstimator, X: modALinput, regressor: The regressor for which the labels are to be queried. X: The pool of samples to query from. n_instances: Number of samples to be queried. + random_tie_break: If True, shuffles utility scores to randomize the order. This + can be used to break the tie when the highest utility score is not unique. **predict_kwargs: Keyword arguments to be passed to :meth:`predict` of the CommiteeRegressor. Returns: - The indices of the instances from X chosen to be labelled; the instances from X chosen to be labelled. + The indices of the instances from X chosen to be labelled; + the instances from X chosen to be labelled. 
""" _, std = regressor.predict(X, return_std=True, **predict_kwargs) std = std.reshape(X.shape[0], ) - query_idx = multi_argmax(std, n_instances=n_instances) + + if not random_tie_break: + query_idx = multi_argmax(std, n_instances=n_instances) + else: + query_idx = shuffled_argmax(std, n_instances=n_instances) + return query_idx, X[query_idx] \ No newline at end of file From bb3b579f6abfbb97aa12ca1cbaf777b4b848056b Mon Sep 17 00:00:00 2001 From: cosmic-cortex Date: Wed, 5 Dec 2018 13:14:56 +0100 Subject: [PATCH 046/182] add: tests for disagreement sampling functions added --- modAL/disagreement.py | 4 ++-- tests/core_tests.py | 34 ++++++++++++++++++++++++++++++++++ 2 files changed, 36 insertions(+), 2 deletions(-) diff --git a/modAL/disagreement.py b/modAL/disagreement.py index 3d92ad5..04e1f12 100644 --- a/modAL/disagreement.py +++ b/modAL/disagreement.py @@ -214,5 +214,5 @@ def max_std_sampling(regressor: BaseEstimator, X: modALinput, query_idx = multi_argmax(std, n_instances=n_instances) else: query_idx = shuffled_argmax(std, n_instances=n_instances) - - return query_idx, X[query_idx] \ No newline at end of file + + return query_idx, X[query_idx] diff --git a/tests/core_tests.py b/tests/core_tests.py index 3e7fb39..af3766d 100644 --- a/tests/core_tests.py +++ b/tests/core_tests.py @@ -21,6 +21,7 @@ from collections import namedtuple from sklearn.ensemble import RandomForestClassifier +from sklearn.gaussian_process import GaussianProcessRegressor from sklearn.exceptions import NotFittedError from sklearn.metrics import confusion_matrix from sklearn.svm import SVC @@ -417,6 +418,39 @@ def test_KL_max_disagreement(self): ) np.testing.assert_almost_equal(returned_KL_disagreement, true_KL_disagreement) + def test_vote_entropy_sampling(self): + for n_samples, n_features, n_classes in product(range(1, 10), range(1, 10), range(1, 10)): + committee = mock.MockCommittee(classes_=np.asarray(range(n_classes)), + vote_return=np.zeros(shape=(n_samples, n_classes), 
dtype=np.int16)) + modAL.disagreement.vote_entropy_sampling(committee, np.random.rand(n_samples, n_features)) + modAL.disagreement.vote_entropy_sampling(committee, np.random.rand(n_samples, n_features), + random_tie_break=True) + + def test_consensus_entropy_sampling(self): + for n_samples, n_features, n_classes in product(range(1, 10), range(1, 10), range(1, 10)): + committee = mock.MockCommittee(predict_proba_return=np.random.rand(n_samples, n_classes)) + modAL.disagreement.consensus_entropy_sampling(committee, np.random.rand(n_samples, n_features)) + modAL.disagreement.consensus_entropy_sampling(committee, np.random.rand(n_samples, n_features), + random_tie_break=True) + + def test_max_disagreement_sampling(self): + for n_samples, n_features, n_classes, n_learners in product(range(1, 10), range(1, 10), range(1, 10), range(2, 5)): + committee = mock.MockCommittee( + n_learners=n_learners, classes_=range(n_classes), + vote_proba_return=np.zeros(shape=(n_samples, n_learners, n_classes)) + ) + modAL.disagreement.max_disagreement_sampling(committee, np.random.rand(n_samples, n_features)) + modAL.disagreement.max_disagreement_sampling(committee, np.random.rand(n_samples, n_features), + random_tie_break=True) + + def test_max_std_sampling(self): + for n_samples, n_features in product(range(1, 10), range(1, 10)): + regressor = GaussianProcessRegressor() + regressor.fit(np.random.rand(n_samples, n_features), np.random.rand(n_samples)) + modAL.disagreement.max_std_sampling(regressor, np.random.rand(n_samples, n_features)) + modAL.disagreement.max_std_sampling(regressor, np.random.rand(n_samples, n_features), + random_tie_break=True) + class TestEER(unittest.TestCase): def test_eer(self): From b7815d8a8b0db0309e4c296934430230909e57d3 Mon Sep 17 00:00:00 2001 From: cosmic-cortex Date: Wed, 5 Dec 2018 13:21:31 +0100 Subject: [PATCH 047/182] add: random tie break for multilabel classification --- modAL/multilabel.py | 131 ++++++++++++------- tests/core_tests.py | 10 +- 
tests/example_tests/ensemble.py | 2 +- tests/example_tests/query_by_committee.py | 2 +- tests/example_tests/shape_learning.py | 2 +- tests/example_tests/stream_based_sampling.py | 2 +- 6 files changed, 100 insertions(+), 49 deletions(-) diff --git a/modAL/multilabel.py b/modAL/multilabel.py index db907aa..28a7254 100644 --- a/modAL/multilabel.py +++ b/modAL/multilabel.py @@ -5,14 +5,13 @@ from modAL.models import ActiveLearner from modAL.utils.data import modALinput -from modAL.utils.selection import multi_argmax +from modAL.utils.selection import multi_argmax, shuffled_argmax from typing import Tuple, Optional from itertools import combinations def _SVM_loss(multiclass_classifier: ActiveLearner, - X: modALinput, - most_certain_classes: Optional[int] = None) -> np.ndarray: + X: modALinput, most_certain_classes: Optional[int] = None) -> np.ndarray: """ Utility function for max_loss and mean_max_loss strategies. @@ -43,8 +42,8 @@ def _SVM_loss(multiclass_classifier: ActiveLearner, return cls_loss -def SVM_binary_minimum(classifier: ActiveLearner, - X_pool: modALinput) -> Tuple[np.ndarray, modALinput]: +def SVM_binary_minimum(classifier: ActiveLearner, X_pool: modALinput, + random_tie_break: bool = False) -> Tuple[np.ndarray, modALinput]: """ SVM binary minimum multilabel active learning strategy. For details see the paper Klaus Brinker, On Active Learning in Multi-label Classification @@ -53,23 +52,30 @@ def SVM_binary_minimum(classifier: ActiveLearner, Args: classifier: The multilabel classifier for which the labels are to be queried. Must be an SVM model such as the ones from sklearn.svm. - X: The pool of samples to query from. + X_pool: The pool of samples to query from. + random_tie_break: If True, shuffles utility scores to randomize the order. This + can be used to break the tie when the highest utility score is not unique. Returns: - The index of the instance from X chosen to be labelled; the instance from X chosen to be labelled. 
+ The index of the instance from X_pool chosen to be labelled; + the instance from X_pool chosen to be labelled. """ decision_function = np.array([svm.decision_function(X_pool) for svm in classifier.estimator.estimators_]).T min_abs_dist = np.min(np.abs(decision_function), axis=1) - query_idx = np.argmin(min_abs_dist) + + if not random_tie_break: + query_idx = np.argmin(min_abs_dist) + else: + query_idx = shuffled_argmax(min_abs_dist) + return query_idx, X_pool[query_idx] -def max_loss(classifier: OneVsRestClassifier, - X_pool: modALinput, - n_instances: int = 1) -> Tuple[np.ndarray, modALinput]: +def max_loss(classifier: OneVsRestClassifier, X_pool: modALinput, + n_instances: int = 1, random_tie_break: bool = False) -> Tuple[np.ndarray, modALinput]: """ Max Loss query strategy for SVM multilabel classification. @@ -82,10 +88,13 @@ def max_loss(classifier: OneVsRestClassifier, classifier: The multilabel classifier for which the labels are to be queried. Should be an SVM model such as the ones from sklearn.svm. Although the function will execute for other models as well, the mathematical calculations in Li et al. work only for SVM-s. - X: The pool of samples to query from. + X_pool: The pool of samples to query from. + random_tie_break: If True, shuffles utility scores to randomize the order. This + can be used to break the tie when the highest utility score is not unique. Returns: - The index of the instance from X chosen to be labelled; the instance from X chosen to be labelled. + The index of the instance from X_pool chosen to be labelled; + the instance from X_pool chosen to be labelled. 
""" assert len(X_pool) >= n_instances, 'n_instances cannot be larger than len(X_pool)' @@ -93,13 +102,16 @@ def max_loss(classifier: OneVsRestClassifier, most_certain_classes = classifier.predict_proba(X_pool).argmax(axis=1) loss = _SVM_loss(classifier, X_pool, most_certain_classes=most_certain_classes) - query_idx = multi_argmax(loss, n_instances) + if not random_tie_break: + query_idx = multi_argmax(loss, n_instances) + else: + query_idx = shuffled_argmax(loss, n_instances) + return query_idx, X_pool[query_idx] -def mean_max_loss(classifier: OneVsRestClassifier, - X_pool: modALinput, - n_instances: int = 1) -> Tuple[np.ndarray, modALinput]: +def mean_max_loss(classifier: OneVsRestClassifier, X_pool: modALinput, + n_instances: int = 1, random_tie_break: bool = False) -> Tuple[np.ndarray, modALinput]: """ Mean Max Loss query strategy for SVM multilabel classification. @@ -111,22 +123,28 @@ def mean_max_loss(classifier: OneVsRestClassifier, classifier: The multilabel classifier for which the labels are to be queried. Should be an SVM model such as the ones from sklearn.svm. Although the function will execute for other models as well, the mathematical calculations in Li et al. work only for SVM-s. - X: The pool of samples to query from. + X_pool: The pool of samples to query from. + random_tie_break: If True, shuffles utility scores to randomize the order. This + can be used to break the tie when the highest utility score is not unique. Returns: - The index of the instance from X chosen to be labelled; the instance from X chosen to be labelled. + The index of the instance from X_pool chosen to be labelled; + the instance from X_pool chosen to be labelled. 
""" assert len(X_pool) >= n_instances, 'n_instances cannot be larger than len(X_pool)' loss = _SVM_loss(classifier, X_pool) - query_idx = multi_argmax(loss, n_instances) + if not random_tie_break: + query_idx = multi_argmax(loss, n_instances) + else: + query_idx = shuffled_argmax(loss, n_instances) + return query_idx, X_pool[query_idx] -def min_confidence(classifier: OneVsRestClassifier, - X_pool: modALinput, - n_instances: int = 1) -> Tuple[np.ndarray, modALinput]: +def min_confidence(classifier: OneVsRestClassifier, X_pool: modALinput, + n_instances: int = 1, random_tie_break: bool = False) -> Tuple[np.ndarray, modALinput]: """ MinConfidence query strategy for multilabel classification. @@ -136,22 +154,28 @@ def min_confidence(classifier: OneVsRestClassifier, Args: classifier: The multilabel classifier for which the labels are to be queried. - X: The pool of samples to query from. + X_pool: The pool of samples to query from. + random_tie_break: If True, shuffles utility scores to randomize the order. This + can be used to break the tie when the highest utility score is not unique. Returns: - The index of the instance from X chosen to be labelled; the instance from X chosen to be labelled. + The index of the instance from X_pool chosen to be labelled; + the instance from X_pool chosen to be labelled. 
""" classwise_confidence = classifier.predict_proba(X_pool) classwise_min = np.min(classwise_confidence, axis=1) - query_idx = multi_argmax((-1)*classwise_min, n_instances) + + if not random_tie_break: + query_idx = multi_argmax(-classwise_min, n_instances) + else: + query_idx = shuffled_argmax(-classwise_min, n_instances) return query_idx, X_pool[query_idx] -def avg_confidence(classifier: OneVsRestClassifier, - X_pool: modALinput, - n_instances: int = 1) -> Tuple[np.ndarray, modALinput]: +def avg_confidence(classifier: OneVsRestClassifier, X_pool: modALinput, + n_instances: int = 1, random_tie_break: bool = False) -> Tuple[np.ndarray, modALinput]: """ AvgConfidence query strategy for multilabel classification. @@ -161,22 +185,28 @@ def avg_confidence(classifier: OneVsRestClassifier, Args: classifier: The multilabel classifier for which the labels are to be queried. - X: The pool of samples to query from. + X_pool: The pool of samples to query from. + random_tie_break: If True, shuffles utility scores to randomize the order. This + can be used to break the tie when the highest utility score is not unique. Returns: - The index of the instance from X chosen to be labelled; the instance from X chosen to be labelled. + The index of the instance from X_pool chosen to be labelled; + the instance from X_pool chosen to be labelled. 
""" classwise_confidence = classifier.predict_proba(X_pool) classwise_mean = np.mean(classwise_confidence, axis=1) - query_idx = multi_argmax(classwise_mean, n_instances) + + if not random_tie_break: + query_idx = multi_argmax(classwise_mean, n_instances) + else: + query_idx = shuffled_argmax(classwise_mean, n_instances) return query_idx, X_pool[query_idx] -def max_score(classifier: OneVsRestClassifier, - X_pool: modALinput, - n_instances: int = 1) -> Tuple[np.ndarray, modALinput]: +def max_score(classifier: OneVsRestClassifier, X_pool: modALinput, + n_instances: int = 1, random_tie_break: bool = 1) -> Tuple[np.ndarray, modALinput]: """ MaxScore query strategy for multilabel classification. @@ -186,24 +216,30 @@ def max_score(classifier: OneVsRestClassifier, Args: classifier: The multilabel classifier for which the labels are to be queried. - X: The pool of samples to query from. + X_pool: The pool of samples to query from. + random_tie_break: If True, shuffles utility scores to randomize the order. This + can be used to break the tie when the highest utility score is not unique. Returns: - The index of the instance from X chosen to be labelled; the instance from X chosen to be labelled. + The index of the instance from X_pool chosen to be labelled; + the instance from X_pool chosen to be labelled. 
""" classwise_confidence = classifier.predict_proba(X_pool) classwise_predictions = classifier.predict(X_pool) classwise_scores = classwise_confidence*(classwise_predictions - 1/2) classwise_max = np.max(classwise_scores, axis=1) - query_idx = multi_argmax(classwise_max, n_instances) + + if not random_tie_break: + query_idx = multi_argmax(classwise_max, n_instances) + else: + query_idx = shuffled_argmax(classwise_max, n_instances) return query_idx, X_pool[query_idx] -def avg_score(classifier: OneVsRestClassifier, - X_pool: modALinput, - n_instances: int = 1) -> Tuple[np.ndarray, modALinput]: +def avg_score(classifier: OneVsRestClassifier, X_pool: modALinput, + n_instances: int = 1, random_tie_break: bool = False) -> Tuple[np.ndarray, modALinput]: """ AvgScore query strategy for multilabel classification. @@ -213,16 +249,23 @@ def avg_score(classifier: OneVsRestClassifier, Args: classifier: The multilabel classifier for which the labels are to be queried. - X: The pool of samples to query from. + X_pool: The pool of samples to query from. + random_tie_break: If True, shuffles utility scores to randomize the order. This + can be used to break the tie when the highest utility score is not unique. Returns: - The index of the instance from X chosen to be labelled; the instance from X chosen to be labelled. + The index of the instance from X_pool chosen to be labelled; + the instance from X_pool chosen to be labelled. 
""" classwise_confidence = classifier.predict_proba(X_pool) classwise_predictions = classifier.predict(X_pool) classwise_scores = classwise_confidence*(classwise_predictions-1/2) classwise_mean = np.mean(classwise_scores, axis=1) - query_idx = multi_argmax(classwise_mean, n_instances) + + if not random_tie_break: + query_idx = multi_argmax(classwise_mean, n_instances) + else: + query_idx = shuffled_argmax(classwise_mean, n_instances) return query_idx, X_pool[query_idx] diff --git a/tests/core_tests.py b/tests/core_tests.py index af3766d..a84ebaf 100644 --- a/tests/core_tests.py +++ b/tests/core_tests.py @@ -1049,14 +1049,22 @@ def test_strategies(self): classifier.fit(X_training, y_training) active_learner = modAL.models.ActiveLearner(classifier) + # no random tie break modAL.multilabel.SVM_binary_minimum(active_learner, X_pool) - modAL.multilabel.mean_max_loss(classifier, X_pool, n_query_instances) modAL.multilabel.max_loss(classifier, X_pool, n_query_instances) modAL.multilabel.min_confidence(classifier, X_pool, n_query_instances) modAL.multilabel.avg_confidence(classifier, X_pool, n_query_instances) modAL.multilabel.max_score(classifier, X_pool, n_query_instances) modAL.multilabel.avg_score(classifier, X_pool, n_query_instances) + # random tie break + modAL.multilabel.SVM_binary_minimum(active_learner, X_pool, random_tie_break=True) + modAL.multilabel.mean_max_loss(classifier, X_pool, n_query_instances, random_tie_break=True) + modAL.multilabel.max_loss(classifier, X_pool, n_query_instances, random_tie_break=True) + modAL.multilabel.min_confidence(classifier, X_pool, n_query_instances, random_tie_break=True) + modAL.multilabel.avg_confidence(classifier, X_pool, n_query_instances, random_tie_break=True) + modAL.multilabel.max_score(classifier, X_pool, n_query_instances, random_tie_break=True) + modAL.multilabel.avg_score(classifier, X_pool, n_query_instances, random_tie_break=True) class TestExamples(unittest.TestCase): diff --git 
a/tests/example_tests/ensemble.py b/tests/example_tests/ensemble.py index bb06cd3..35c36df 100644 --- a/tests/example_tests/ensemble.py +++ b/tests/example_tests/ensemble.py @@ -32,7 +32,7 @@ learner_list = [] for _ in range(n_learners): learner = ActiveLearner( - estimator=RandomForestClassifier(), + estimator=RandomForestClassifier(n_estimators=10), X_training=X_pool[initial_idx], y_training=y_pool[initial_idx], bootstrap_init=True ) diff --git a/tests/example_tests/query_by_committee.py b/tests/example_tests/query_by_committee.py index a5b94fd..b974483 100644 --- a/tests/example_tests/query_by_committee.py +++ b/tests/example_tests/query_by_committee.py @@ -30,7 +30,7 @@ # initializing learner learner = ActiveLearner( - estimator=RandomForestClassifier(), + estimator=RandomForestClassifier(n_estimators=10), X_training=X_train, y_training=y_train ) learner_list.append(learner) diff --git a/tests/example_tests/shape_learning.py b/tests/example_tests/shape_learning.py index 11dd75a..f76a07a 100644 --- a/tests/example_tests/shape_learning.py +++ b/tests/example_tests/shape_learning.py @@ -34,7 +34,7 @@ # create an ActiveLearner instance learner = ActiveLearner( - estimator=RandomForestClassifier(), + estimator=RandomForestClassifier(n_estimators=10), X_training=X_train, y_training=y_train ) initial_prediction = learner.predict_proba(X_full)[:, 1].reshape(im_height, im_width) diff --git a/tests/example_tests/stream_based_sampling.py b/tests/example_tests/stream_based_sampling.py index 16854f5..d306f61 100644 --- a/tests/example_tests/stream_based_sampling.py +++ b/tests/example_tests/stream_based_sampling.py @@ -30,7 +30,7 @@ # initialize the learner learner = ActiveLearner( - estimator=RandomForestClassifier(), + estimator=RandomForestClassifier(n_estimators=10), X_training=X_train, y_training=y_train ) From fad91e80324e34d2a04647a95e94a2e106487c0f Mon Sep 17 00:00:00 2001 From: cosmic-cortex Date: Wed, 5 Dec 2018 13:30:48 +0100 Subject: [PATCH 048/182] add: random 
tie break for expected error reduction --- modAL/expected_error.py | 12 +++++++++--- tests/core_tests.py | 1 + 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/modAL/expected_error.py b/modAL/expected_error.py index 65f00d9..df596f5 100644 --- a/modAL/expected_error.py +++ b/modAL/expected_error.py @@ -11,12 +11,13 @@ from modAL.models import ActiveLearner from modAL.utils.data import modALinput, data_vstack -from modAL.utils.selection import multi_argmax +from modAL.utils.selection import multi_argmax, shuffled_argmax from modAL.uncertainty import _proba_uncertainty, _proba_entropy def expected_error_reduction(learner: ActiveLearner, X: modALinput, loss: str = 'binary', - p_subsample: np.float = 1.0, n_instances: int = 1) -> Tuple[np.ndarray, modALinput]: + p_subsample: np.float = 1.0, n_instances: int = 1, + random_tie_break: bool = False) -> Tuple[np.ndarray, modALinput]: """ Expected error reduction query strategy. @@ -32,6 +33,8 @@ def expected_error_reduction(learner: ActiveLearner, X: modALinput, loss: str = calculating expected error. Significantly improves runtime for large sample pools. n_instances: The number of instances to be sampled. + random_tie_break: If True, shuffles utility scores to randomize the order. This + can be used to break the tie when the highest utility score is not unique. 
Returns: @@ -73,6 +76,9 @@ def expected_error_reduction(learner: ActiveLearner, X: modALinput, loss: str = else: expected_error[x_idx] = np.inf - query_idx = multi_argmax(expected_error, n_instances) + if not random_tie_break: + query_idx = multi_argmax(expected_error, n_instances) + else: + query_idx = shuffled_argmax(expected_error, n_instances) return query_idx, X[query_idx] diff --git a/tests/core_tests.py b/tests/core_tests.py index a84ebaf..1374ab1 100644 --- a/tests/core_tests.py +++ b/tests/core_tests.py @@ -462,6 +462,7 @@ def test_eer(self): X_training=X_training, y_training=y_training) modAL.expected_error.expected_error_reduction(learner, X_pool) + modAL.expected_error.expected_error_reduction(learner, X_pool, random_tie_break=True) modAL.expected_error.expected_error_reduction(learner, X_pool, p_subsample=0.1) modAL.expected_error.expected_error_reduction(learner, X_pool, loss='binary') modAL.expected_error.expected_error_reduction(learner, X_pool, p_subsample=0.1, loss='log') From 452898fc181b6d4ae6399dfdcb311ceb952c8486 Mon Sep 17 00:00:00 2001 From: cosmic-cortex Date: Wed, 5 Dec 2018 13:42:20 +0100 Subject: [PATCH 049/182] update: version number bumped --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index ab7252e..be59e7b 100644 --- a/setup.py +++ b/setup.py @@ -2,7 +2,7 @@ setup( name='modAL', - version='0.3.3', + version='0.3.4', author='Tivadar Danka', author_email='85a5187a@opayq.com', description='A modular active learning framework for Python3', From f8778f545b083a9eee63b9c97479e9e296e6d753 Mon Sep 17 00:00:00 2001 From: Yu Zhang Date: Fri, 11 Jan 2019 17:50:46 +0000 Subject: [PATCH 050/182] fix: correct best_instance_index in batch.select_instance --- modAL/batch.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/modAL/batch.py b/modAL/batch.py index be931bc..4c00d5c 100644 --- a/modAL/batch.py +++ b/modAL/batch.py @@ -101,7 +101,10 @@ def select_instance( scores = 
alpha * (1 - similarity_scores) + (1 - alpha) * X_uncertainty[mask] # Isolate and return our best instance for labeling as the one with the largest score. - best_instance_index = np.argmax(scores) + best_instance_index_in_unlabeled = np.argmax(scores) + n_pool, _ = X_pool.shape + unlabeled_indices = [i for i in range(n_pool) if mask[i]] + best_instance_index = unlabeled_indices[best_instance_index_in_unlabeled] mask[best_instance_index] = 0 return best_instance_index, X_pool[best_instance_index].reshape(1, -1), mask From e973a320b920774bce11e1337289f7bf8cb85fb9 Mon Sep 17 00:00:00 2001 From: Yu Zhang Date: Mon, 14 Jan 2019 21:43:20 +0000 Subject: [PATCH 051/182] fix: store cold start instance in batch.ranked_batch --- modAL/batch.py | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/modAL/batch.py b/modAL/batch.py index 4c00d5c..b0fc14c 100644 --- a/modAL/batch.py +++ b/modAL/batch.py @@ -15,7 +15,7 @@ def select_cold_start_instance(X: modALinput, metric: Union[str, Callable], - n_jobs: Union[int, None]) -> modALinput: + n_jobs: Union[int, None]) -> Tuple[int, modALinput]: """ Define what to do if our batch-mode sampling doesn't have any labeled data -- a cold start. @@ -35,7 +35,8 @@ def select_cold_start_instance(X: modALinput, n_jobs: This parameter is passed to :func:`~sklearn.metrics.pairwise.pairwise_distances`. Returns: - Best instance for cold-start. + Index of the best cold-start instance from `X` chosen to be labelled; record of the best cold-start instance + from `X` chosen to be labelled. """ # Compute all pairwise distances in our unlabeled data and obtain the row-wise average for each of our records in X. n_jobs = n_jobs if n_jobs else 1 @@ -43,7 +44,7 @@ def select_cold_start_instance(X: modALinput, # Isolate and return our best instance for labeling as the record with the least average distance. 
best_coldstart_instance_index = np.argmin(average_distances) - return X[best_coldstart_instance_index].reshape(1, -1) + return best_coldstart_instance_index, X[best_coldstart_instance_index].reshape(1, -1) def select_instance( @@ -133,14 +134,16 @@ def ranked_batch(classifier: Union[BaseLearner, BaseCommittee], The indices of the top n_instances ranked unlabelled samples. """ # Make a local copy of our classifier's training data. + # Define our record container and record the best cold start instance in the case of cold start. if classifier.X_training is None: - labeled = select_cold_start_instance(X=unlabeled, metric=metric, n_jobs=n_jobs) + best_coldstart_instance_index, labeled = select_cold_start_instance(X=unlabeled, metric=metric, n_jobs=n_jobs) + instance_index_ranking = [best_coldstart_instance_index] elif classifier.X_training.shape[0] > 0: labeled = classifier.X_training[:] - - # Define our record container and the maximum number of records to sample. - instance_index_ranking = [] - ceiling = np.minimum(unlabeled.shape[0], n_instances) + instance_index_ranking = [] + + # The maximum number of records to sample. + ceiling = np.minimum(unlabeled.shape[0], n_instances) - len(instance_index_ranking) # mask for unlabeled initialized as transparent mask = np.ones(unlabeled.shape[0], np.bool) From 257a6989f2631ccbf15a81a331938d0ab1ddd6b7 Mon Sep 17 00:00:00 2001 From: Yu Zhang Date: Sun, 27 Jan 2019 14:48:26 +0000 Subject: [PATCH 052/182] fix: selection.shuffled_argmax and its test case fixed --- modAL/utils/selection.py | 3 ++- tests/core_tests.py | 4 ++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/modAL/utils/selection.py b/modAL/utils/selection.py index 73700b8..65aae76 100644 --- a/modAL/utils/selection.py +++ b/modAL/utils/selection.py @@ -22,6 +22,7 @@ def shuffled_argmax(values: np.ndarray, n_instances: int = 1) -> np.ndarray: Returns: The indices of the n_instances largest values. 
""" + assert n_instances <= values.shape[0], 'n_instances must be less or equal than the size of utility' # shuffling indices and corresponding values shuffled_idx = np.random.permutation(len(values)) @@ -29,7 +30,7 @@ def shuffled_argmax(values: np.ndarray, n_instances: int = 1) -> np.ndarray: # getting the n_instances best instance # since mergesort is used, the shuffled order is preserved - sorted_query_idx = np.argsort(shuffled_values, kind='mergesort')[:n_instances] + sorted_query_idx = np.argsort(shuffled_values, kind='mergesort')[len(shuffled_values)-n_instances:] # inverting the shuffle query_idx = shuffled_idx[sorted_query_idx] diff --git a/tests/core_tests.py b/tests/core_tests.py index 1374ab1..e7de509 100644 --- a/tests/core_tests.py +++ b/tests/core_tests.py @@ -177,8 +177,8 @@ def test_shuffled_argmax(self): for n_pool in range(1, 100): for n_instances in range(1, n_pool+1): values = np.random.permutation(n_pool) - true_query_idx = np.argsort(values)[:n_instances] - + true_query_idx = np.argsort(values)[len(values)-n_instances:] + np.testing.assert_equal( true_query_idx, modAL.utils.selection.shuffled_argmax(values, n_instances) From a2537595058cb84a89233b6e89e59c28442e880a Mon Sep 17 00:00:00 2001 From: cosmic-cortex Date: Mon, 28 Jan 2019 14:15:10 +0100 Subject: [PATCH 053/182] fix: unnecessary lines were removed from docstring --- modAL/utils/selection.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/modAL/utils/selection.py b/modAL/utils/selection.py index 65aae76..3741a66 100644 --- a/modAL/utils/selection.py +++ b/modAL/utils/selection.py @@ -11,10 +11,6 @@ def shuffled_argmax(values: np.ndarray, n_instances: int = 1) -> np.ndarray: the tie when the highest utility score is not unique. The shuffle randomizes order, which is preserved by the mergesort algorithm. - Args: - values: - n_instances: - Args: values: Contains the values to be selected from. n_instances: Specifies how many indices to return. 
From 4029dfd4e5f68509a409d509ed706f544472bf25 Mon Sep 17 00:00:00 2001 From: cosmic-cortex Date: Thu, 21 Feb 2019 19:08:41 +0100 Subject: [PATCH 054/182] citation updated --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 4652ea1..46137da 100644 --- a/README.md +++ b/README.md @@ -181,7 +181,7 @@ If you use modAL in your projects, you can cite it as @article{modAL2018, title={mod{AL}: {A} modular active learning framework for {P}ython}, author={Tivadar Danka and Peter Horvath}, - url={https://github.com/cosmic-cortex/modAL}, + url={https://github.com/modAL-python/modAL}, note={available on arXiv at \url{https://arxiv.org/abs/1805.00979}} } ``` From c4d62d6f9cefc5b2ca6ff0f8c5d87fca666fb042 Mon Sep 17 00:00:00 2001 From: cosmic-cortex Date: Fri, 15 Mar 2019 08:10:04 +0100 Subject: [PATCH 055/182] multi_argmax and max_std_sampling test case coverage increased --- tests/core_tests.py | 3 ++- tests/example_tests/active_regression.py | 10 ++-------- 2 files changed, 4 insertions(+), 9 deletions(-) diff --git a/tests/core_tests.py b/tests/core_tests.py index e7de509..f8d2761 100644 --- a/tests/core_tests.py +++ b/tests/core_tests.py @@ -164,7 +164,7 @@ def test_data_vstack(self): def test_multi_argmax(self): for n_pool in range(2, 100): - for n_instances in range(1, n_pool): + for n_instances in range(1, n_pool+1): utility = np.zeros(n_pool) max_idx = np.random.choice(range(n_pool), size=n_instances, replace=False) utility[max_idx] = 1e-10 + np.random.rand(n_instances, ) @@ -1087,3 +1087,4 @@ def test_examples(self): if __name__ == '__main__': unittest.main(verbosity=2) +0 \ No newline at end of file diff --git a/tests/example_tests/active_regression.py b/tests/example_tests/active_regression.py index 331a103..72d43f3 100644 --- a/tests/example_tests/active_regression.py +++ b/tests/example_tests/active_regression.py @@ -6,16 +6,10 @@ from sklearn.gaussian_process import GaussianProcessRegressor from 
sklearn.gaussian_process.kernels import WhiteKernel, RBF from modAL.models import ActiveLearner +from modAL.disagreement import max_std_sampling np.random.seed(0) - -# query strategy for regression -def GP_regression_std(regressor, X): - _, std = regressor.predict(X, return_std=True) - query_idx = np.argmax(std) - return query_idx, X[query_idx] - # generating the data X = np.random.choice(np.linspace(0, 20, 10000), size=200, replace=False).reshape(-1, 1) y = np.sin(X) + np.random.normal(scale=0.3, size=X.shape) @@ -32,7 +26,7 @@ def GP_regression_std(regressor, X): # initializing the active learner regressor = ActiveLearner( estimator=GaussianProcessRegressor(kernel=kernel), - query_strategy=GP_regression_std, + query_strategy=max_std_sampling, X_training=X_initial.reshape(-1, 1), y_training=y_initial.reshape(-1, 1) ) From a8eca52816c30f9a01a0e8bf6cf4730953e3dedf Mon Sep 17 00:00:00 2001 From: cosmic-cortex Date: Thu, 9 May 2019 15:48:22 +0200 Subject: [PATCH 056/182] batch uncertainty sampling fixed for higher dimensional datasets --- modAL/batch.py | 19 ++++++---- tests/core_tests.py | 1 + tests/example_tests/multidimensional_data.py | 38 ++++++++++++++++++++ 3 files changed, 52 insertions(+), 6 deletions(-) create mode 100644 tests/example_tests/multidimensional_data.py diff --git a/modAL/batch.py b/modAL/batch.py index b0fc14c..ffe7a53 100644 --- a/modAL/batch.py +++ b/modAL/batch.py @@ -79,9 +79,11 @@ def select_instance( Index of the best index from X chosen to be labelled; a single record from our unlabeled set that is considered the most optimal incremental record for including in our query set. """ + X_pool_masked = X_pool[mask] + # Extract the number of labeled and unlabeled records. - n_labeled_records, _ = X_training.shape - n_unlabeled, _ = X_pool[mask].shape + n_labeled_records, *rest = X_training.shape + n_unlabeled, *rest = X_pool_masked.shape # Determine our alpha parameter as |U| / (|U| + |D|). 
Note that because we # append to X_training and remove from X_pool within `ranked_batch`, @@ -90,10 +92,15 @@ def select_instance( # Compute pairwise distance (and then similarity) scores from every unlabeled record # to every record in X_training. The result is an array of shape (n_samples, ). + if n_jobs == 1 or n_jobs is None: - _, distance_scores = pairwise_distances_argmin_min(X_pool[mask], X_training, metric=metric) + _, distance_scores = pairwise_distances_argmin_min(X_pool_masked.reshape(n_unlabeled, -1), + X_training.reshape(n_labeled_records, -1), + metric=metric) else: - distance_scores = pairwise_distances(X_pool[mask], X_training, metric=metric, n_jobs=n_jobs).min(axis=1) + distance_scores = pairwise_distances(X_pool_masked.reshape(n_unlabeled, -1), + X_training.reshape(n_labeled_records, -1), + metric=metric, n_jobs=n_jobs).min(axis=1) similarity_scores = 1 / (1 + distance_scores) @@ -103,11 +110,11 @@ def select_instance( # Isolate and return our best instance for labeling as the one with the largest score. 
best_instance_index_in_unlabeled = np.argmax(scores) - n_pool, _ = X_pool.shape + n_pool, *rest = X_pool.shape unlabeled_indices = [i for i in range(n_pool) if mask[i]] best_instance_index = unlabeled_indices[best_instance_index_in_unlabeled] mask[best_instance_index] = 0 - return best_instance_index, X_pool[best_instance_index].reshape(1, -1), mask + return best_instance_index, np.expand_dims(X_pool[best_instance_index], axis=0), mask def ranked_batch(classifier: Union[BaseLearner, BaseCommittee], diff --git a/tests/core_tests.py b/tests/core_tests.py index f8d2761..d1060c6 100644 --- a/tests/core_tests.py +++ b/tests/core_tests.py @@ -1071,6 +1071,7 @@ def test_strategies(self): class TestExamples(unittest.TestCase): def test_examples(self): + import example_tests.multidimensional_data import example_tests.active_regression import example_tests.bagging import example_tests.ensemble diff --git a/tests/example_tests/multidimensional_data.py b/tests/example_tests/multidimensional_data.py new file mode 100644 index 0000000..78c4994 --- /dev/null +++ b/tests/example_tests/multidimensional_data.py @@ -0,0 +1,38 @@ +import numpy as np +from modAL.models import ActiveLearner +from modAL.uncertainty import margin_sampling, entropy_sampling +from modAL.batch import uncertainty_batch_sampling +from modAL.expected_error import expected_error_reduction + + +class MockClassifier: + def __init__(self, n_classes=2): + self.n_classes = n_classes + + def fit(self, X, y): + return self + + def predict(self, X): + return np.random.randint(0, self.n_classes, shape=(len(X), 1)) + + def predict_proba(self, X): + return np.ones(shape=(len(X), self.n_classes))/self.n_classes + + +if __name__ == '__main__': + X_train = np.random.rand(10, 5, 5) + y_train = np.random.rand(10, 1) + X_pool = np.random.rand(10, 5, 5) + y_pool = np.random.rand(10, 1) + + strategies = [margin_sampling, entropy_sampling, uncertainty_batch_sampling] + + for query_strategy in strategies: + print("testing %s..." 
% query_strategy.__name__) + # max margin sampling + learner = ActiveLearner( + estimator=MockClassifier(), query_strategy=query_strategy, + X_training=X_train, y_training=y_train + ) + learner.query(X_pool) + learner.teach(X_pool, y_pool) From fc8e2e09aec7a2fbef6302cd7a0fade88d798db8 Mon Sep 17 00:00:00 2001 From: cosmic-cortex Date: Fri, 10 May 2019 11:47:46 +0200 Subject: [PATCH 057/182] expected_error_reduction fixed for multidimensional data --- modAL/expected_error.py | 5 +++-- tests/example_tests/multidimensional_data.py | 10 ++++++---- 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/modAL/expected_error.py b/modAL/expected_error.py index df596f5..29267a0 100644 --- a/modAL/expected_error.py +++ b/modAL/expected_error.py @@ -61,8 +61,8 @@ def expected_error_reduction(learner: ActiveLearner, X: modALinput, loss: str = if np.random.rand() <= p_subsample: # estimate the expected error for y_idx, y in enumerate(possible_labels): - X_new = data_vstack((learner.X_training, x.reshape(1, -1))) - y_new = data_vstack((learner.y_training, np.array(y).reshape(1, ))) + X_new = data_vstack((learner.X_training, np.expand_dims(x, axis=0))) + y_new = data_vstack((learner.y_training, np.array(y).reshape(1,))) cloned_estimator.fit(X_new, y_new) refitted_proba = cloned_estimator.predict_proba(X) @@ -73,6 +73,7 @@ def expected_error_reduction(learner: ActiveLearner, X: modALinput, loss: str = expected_error[x_idx] += np.sum(loss)*X_proba[x_idx, y_idx] + else: expected_error[x_idx] = np.inf diff --git a/tests/example_tests/multidimensional_data.py b/tests/example_tests/multidimensional_data.py index 78c4994..e87e319 100644 --- a/tests/example_tests/multidimensional_data.py +++ b/tests/example_tests/multidimensional_data.py @@ -1,11 +1,13 @@ import numpy as np +from sklearn.base import BaseEstimator + from modAL.models import ActiveLearner from modAL.uncertainty import margin_sampling, entropy_sampling from modAL.batch import uncertainty_batch_sampling from 
modAL.expected_error import expected_error_reduction -class MockClassifier: +class MockClassifier(BaseEstimator): def __init__(self, n_classes=2): self.n_classes = n_classes @@ -21,11 +23,11 @@ def predict_proba(self, X): if __name__ == '__main__': X_train = np.random.rand(10, 5, 5) - y_train = np.random.rand(10, 1) + y_train = np.random.randint(0, 2, size=10) X_pool = np.random.rand(10, 5, 5) - y_pool = np.random.rand(10, 1) + y_pool = np.random.randint(0, 2, size=10) - strategies = [margin_sampling, entropy_sampling, uncertainty_batch_sampling] + strategies = [margin_sampling, entropy_sampling, uncertainty_batch_sampling, expected_error_reduction] for query_strategy in strategies: print("testing %s..." % query_strategy.__name__) From e82487f10f8b7368432dd7f285d9effd8b320b56 Mon Sep 17 00:00:00 2001 From: damienlancry Date: Fri, 31 May 2019 12:11:04 +0800 Subject: [PATCH 058/182] runnable example --- examples/pytorch_integration.py | 96 +++++++++++++++++++++++++++++++++ 1 file changed, 96 insertions(+) create mode 100644 examples/pytorch_integration.py diff --git a/examples/pytorch_integration.py b/examples/pytorch_integration.py new file mode 100644 index 0000000..55adcf0 --- /dev/null +++ b/examples/pytorch_integration.py @@ -0,0 +1,96 @@ +""" +This example demonstrates how to use the active learning interface with Keras. +The example uses the scikit-learn wrappers of Keras. 
For more info, see https://keras.io/scikit-learn-api/ +""" + +import torch +import keras +import numpy as np +from keras.datasets import mnist +from torch import nn +from skorch import NeuralNetClassifier +from modAL.models import ActiveLearner +from tqdm import tqdm + +# build class for the skorch API +class Torch_Model(nn.Module): + def __init__(self,): + super(Torch_Model, self).__init__() + self.convs = nn.Sequential( + nn.Conv2d(1,32,3), + nn.ReLU(), + nn.Conv2d(32,64,3), + nn.ReLU(), + nn.MaxPool2d(2), + nn.Dropout(0.25) + ) + self.fcs = nn.Sequential( + nn.Linear(12*12*64,128), + nn.ReLU(), + nn.Dropout(0.5), + nn.Linear(128,10), + ) + + def forward(self, x): + out = x + out = self.convs(out) + out = out.view(-1,12*12*64) + out = self.fcs(out) + return out + +# create the classifier +classifier = NeuralNetClassifier(Torch_Model, + # max_epochs=100, + criterion=nn.CrossEntropyLoss, + optimizer=torch.optim.Adam, + train_split=None, + verbose=1, + device="cuda") + +""" +Data wrangling +1. Reading data from Keras +2. Assembling initial training data for ActiveLearner +3. Generating the pool +""" + +# read training data +(X_train, y_train), (X_test, y_test) = mnist.load_data() +X_train = X_train.reshape(60000, 1, 28, 28).astype('float32') / 255. +X_test = X_test.reshape(10000, 1, 28, 28).astype('float32') / 255. 
+y_train = y_train.astype('long') +y_test = y_test.astype('long') + +# assemble initial data +n_initial = 1000 +initial_idx = np.random.choice(range(len(X_train)), size=n_initial, replace=False) +X_initial = X_train[initial_idx] +y_initial = y_train[initial_idx] + +# generate the pool +# remove the initial data from the training dataset +X_pool = np.delete(X_train, initial_idx, axis=0) +y_pool = np.delete(y_train, initial_idx, axis=0) + +""" +Training the ActiveLearner +""" + +# initialize ActiveLearner +learner = ActiveLearner( + estimator=classifier, + X_training=X_initial, y_training=y_initial, +) + +# the active learning loop +n_queries = 10 +for idx in tqdm(range(n_queries)): + query_idx, query_instance = learner.query(X_pool, n_instances=100) + print(query_idx) + learner.teach(X_pool[query_idx], y_pool[query_idx], only_new=True) + # remove queried instance from pool + X_pool = np.delete(X_pool, query_idx, axis=0) + y_pool = np.delete(y_pool, query_idx, axis=0) + +# the final accuracy score +print(learner.score(X_test, y_test)) From e5530b845eb24f30f5110cbf36c8bb5b7a9b03ae Mon Sep 17 00:00:00 2001 From: damienlancry Date: Fri, 31 May 2019 12:18:59 +0800 Subject: [PATCH 059/182] updated comments removed tqdm --- examples/pytorch_integration.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/examples/pytorch_integration.py b/examples/pytorch_integration.py index 55adcf0..fba6c6f 100644 --- a/examples/pytorch_integration.py +++ b/examples/pytorch_integration.py @@ -1,16 +1,15 @@ """ -This example demonstrates how to use the active learning interface with Keras. -The example uses the scikit-learn wrappers of Keras. For more info, see https://keras.io/scikit-learn-api/ +This example demonstrates how to use the active learning interface with Pytorch. +The example uses Skorch, a scikit learn wrapper of Pytorch. 
+For more info, see https://skorch.readthedocs.io/en/stable/ """ import torch -import keras import numpy as np from keras.datasets import mnist from torch import nn from skorch import NeuralNetClassifier from modAL.models import ActiveLearner -from tqdm import tqdm # build class for the skorch API class Torch_Model(nn.Module): @@ -84,7 +83,7 @@ def forward(self, x): # the active learning loop n_queries = 10 -for idx in tqdm(range(n_queries)): +for idx in range(n_queries): query_idx, query_instance = learner.query(X_pool, n_instances=100) print(query_idx) learner.teach(X_pool[query_idx], y_pool[query_idx], only_new=True) From e79b171604c2c95defaf6fe93ecb1f8fdf44394e Mon Sep 17 00:00:00 2001 From: cosmic-cortex Date: Sun, 2 Jun 2019 14:24:31 +0200 Subject: [PATCH 060/182] numpy.concatenate fallback to data_vstack added --- modAL/utils/data.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/modAL/utils/data.py b/modAL/utils/data.py index 32976e4..48409b6 100644 --- a/modAL/utils/data.py +++ b/modAL/utils/data.py @@ -25,4 +25,7 @@ def data_vstack(blocks: Container) -> modALinput: elif sp.issparse(blocks[0]): return sp.vstack(blocks) else: - raise TypeError('%s datatype is not supported' % type(blocks[0])) + try: + return np.concatenate(blocks) + except: + raise TypeError('%s datatype is not supported' % type(blocks[0])) From caec2c73aad40c0e632e978964ec5c3cf4773c9e Mon Sep 17 00:00:00 2001 From: cosmic-cortex Date: Sun, 2 Jun 2019 14:24:43 +0200 Subject: [PATCH 061/182] keras mnist replaced with torchvision mnist --- examples/pytorch_integration.py | 28 +++++++++++++++++++--------- 1 file changed, 19 insertions(+), 9 deletions(-) diff --git a/examples/pytorch_integration.py b/examples/pytorch_integration.py index fba6c6f..ca33b6f 100644 --- a/examples/pytorch_integration.py +++ b/examples/pytorch_integration.py @@ -6,11 +6,16 @@ import torch import numpy as np -from keras.datasets import mnist + from torch import nn +from torch.utils.data import 
DataLoader +from torchvision.transforms import ToTensor +from torchvision.datasets import MNIST from skorch import NeuralNetClassifier + from modAL.models import ActiveLearner + # build class for the skorch API class Torch_Model(nn.Module): def __init__(self,): @@ -37,28 +42,34 @@ def forward(self, x): out = self.fcs(out) return out + # create the classifier +device = "cuda" if torch.cuda.is_available() else "cpu" classifier = NeuralNetClassifier(Torch_Model, # max_epochs=100, criterion=nn.CrossEntropyLoss, optimizer=torch.optim.Adam, train_split=None, verbose=1, - device="cuda") + device=device) """ Data wrangling -1. Reading data from Keras +1. Reading data from torchvision 2. Assembling initial training data for ActiveLearner 3. Generating the pool """ +mnist_data = MNIST('.', download=True, transform=ToTensor()) +dataloader = DataLoader(mnist_data, shuffle=True, batch_size=60000) +X, y = next(iter(dataloader)) + # read training data -(X_train, y_train), (X_test, y_test) = mnist.load_data() -X_train = X_train.reshape(60000, 1, 28, 28).astype('float32') / 255. -X_test = X_test.reshape(10000, 1, 28, 28).astype('float32') / 255. 
-y_train = y_train.astype('long') -y_test = y_test.astype('long') +X_train, X_test, y_train, y_test = X[:50000], X[50000:], y[:50000], y[50000:] +X_train = X_train.reshape(50000, 1, 28, 28) +X_test = X_test.reshape(10000, 1, 28, 28) +y_train = y_train +y_test = y_test # assemble initial data n_initial = 1000 @@ -85,7 +96,6 @@ def forward(self, x): n_queries = 10 for idx in range(n_queries): query_idx, query_instance = learner.query(X_pool, n_instances=100) - print(query_idx) learner.teach(X_pool[query_idx], y_pool[query_idx], only_new=True) # remove queried instance from pool X_pool = np.delete(X_pool, query_idx, axis=0) From 574afa3fd68dc107a0d616f511f85e059ac5c1a0 Mon Sep 17 00:00:00 2001 From: damienlancry Date: Fri, 7 Jun 2019 12:53:31 +0800 Subject: [PATCH 062/182] jupyter notebook --- .../examples/Pytorch_integration.ipynb | 277 ++++++++++++++++++ 1 file changed, 277 insertions(+) create mode 100644 docs/source/content/examples/Pytorch_integration.ipynb diff --git a/docs/source/content/examples/Pytorch_integration.ipynb b/docs/source/content/examples/Pytorch_integration.ipynb new file mode 100644 index 0000000..bacb824 --- /dev/null +++ b/docs/source/content/examples/Pytorch_integration.ipynb @@ -0,0 +1,277 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Pytorch models in modAL workflows\n", + "=============================\n", + "\n", + "Thanks to Skorch API, you can seamlessly integrate Pytorch models into your modAL workflow. In this tutorial, we shall quickly introduce how to use Skorch API of Keras and we are going to see how to do active learning with it. More details on the Keras scikit-learn API [can be found here](https://skorch.readthedocs.io/en/stable/).\n", + "\n", + "The executable script for this example can be [found here](https://github.com/cosmic-cortex/modAL/blob/master/examples/pytorch_integration.py)!" 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Skorch API\n", + "-----------------------\n", + "\n", + "By default, a Pytorch model's interface differs from what is used for scikit-learn estimators. However, with the use of Skorch wrapper, it is possible to adapt your model." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import torch\n", + "from torch import nn\n", + "from skorch import NeuralNetClassifier\n", + "\n", + "# build class for the skorch API\n", + "class Torch_Model(nn.Module):\n", + " def __init__(self,):\n", + " super(Torch_Model, self).__init__()\n", + " self.convs = nn.Sequential(\n", + " nn.Conv2d(1,32,3),\n", + " nn.ReLU(),\n", + " nn.Conv2d(32,64,3),\n", + " nn.ReLU(),\n", + " nn.MaxPool2d(2),\n", + " nn.Dropout(0.25)\n", + " )\n", + " self.fcs = nn.Sequential(\n", + " nn.Linear(12*12*64,128),\n", + " nn.ReLU(),\n", + " nn.Dropout(0.5),\n", + " nn.Linear(128,10),\n", + " )\n", + "\n", + " def forward(self, x):\n", + " out = x\n", + " out = self.convs(out)\n", + " out = out.view(-1,12*12*64)\n", + " out = self.fcs(out)\n", + " return out" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "For our purposes, the ``classifier`` which we will initialize now acts just like any scikit-learn estimator." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "# create the classifier\n", + "device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n", + "classifier = NeuralNetClassifier(Torch_Model,\n", + " criterion=nn.CrossEntropyLoss,\n", + " optimizer=torch.optim.Adam,\n", + " train_split=None,\n", + " verbose=1,\n", + " device=device)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Active learning with Pytorch\n", + "---------------------------------------\n", + "\n", + "In this example, we are going to use the famous MNIST dataset, which is available as a built-in for PyTorch." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\r", + "0it [00:00, ?it/s]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz to ./MNIST/raw/train-images-idx3-ubyte.gz\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 97%|█████████▋| 9584640/9912422 [00:15<00:00, 1777143.52it/s]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Extracting ./MNIST/raw/train-images-idx3-ubyte.gz\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + "0it [00:00, ?it/s]\u001b[A" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz to ./MNIST/raw/train-labels-idx1-ubyte.gz\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + " 0%| | 0/28881 [00:00 Date: Fri, 7 Jun 2019 10:49:46 +0200 Subject: [PATCH 063/182] unnecessary lines removed --- examples/pytorch_integration.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/examples/pytorch_integration.py b/examples/pytorch_integration.py index ca33b6f..b601caa 
100644 --- a/examples/pytorch_integration.py +++ b/examples/pytorch_integration.py @@ -68,8 +68,6 @@ def forward(self, x): X_train, X_test, y_train, y_test = X[:50000], X[50000:], y[:50000], y[50000:] X_train = X_train.reshape(50000, 1, 28, 28) X_test = X_test.reshape(10000, 1, 28, 28) -y_train = y_train -y_test = y_test # assemble initial data n_initial = 1000 From 57ce832833baf265f3f51ca88286f7ac8235e22d Mon Sep 17 00:00:00 2001 From: cosmic-cortex Date: Fri, 7 Jun 2019 11:07:09 +0200 Subject: [PATCH 064/182] Pytorch integration notebook added to docs pages --- docs/source/index.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/source/index.rst b/docs/source/index.rst index d3adafb..2963c01 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -67,6 +67,7 @@ Currently supported active learning strategies are content/examples/query_by_committee content/examples/bootstrapping_and_bagging content/examples/Keras_integration + content/examples/Pytorch_integration .. toctree:: :glob: From b126921d429f7d9c5ab5fea7d1c7202848326537 Mon Sep 17 00:00:00 2001 From: cosmic-cortex Date: Fri, 7 Jun 2019 11:48:42 +0200 Subject: [PATCH 065/182] expected error reduction bugs fixed --- modAL/expected_error.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/modAL/expected_error.py b/modAL/expected_error.py index 29267a0..b5dcd6d 100644 --- a/modAL/expected_error.py +++ b/modAL/expected_error.py @@ -67,19 +67,18 @@ def expected_error_reduction(learner: ActiveLearner, X: modALinput, loss: str = cloned_estimator.fit(X_new, y_new) refitted_proba = cloned_estimator.predict_proba(X) if loss is 'binary': - loss = _proba_uncertainty(refitted_proba) + nloss = _proba_uncertainty(refitted_proba) elif loss is 'log': - loss = _proba_entropy(refitted_proba) - - expected_error[x_idx] += np.sum(loss)*X_proba[x_idx, y_idx] + nloss = _proba_entropy(refitted_proba) + expected_error[x_idx] += np.sum(nloss)*X_proba[x_idx, y_idx] else: 
expected_error[x_idx] = np.inf if not random_tie_break: - query_idx = multi_argmax(expected_error, n_instances) + query_idx = multi_argmax(-expected_error, n_instances) else: - query_idx = shuffled_argmax(expected_error, n_instances) + query_idx = shuffled_argmax(-expected_error, n_instances) return query_idx, X[query_idx] From 3c01821f84d701c71b65fea85185a00c3276a2b9 Mon Sep 17 00:00:00 2001 From: cosmic-cortex Date: Fri, 7 Jun 2019 22:05:30 +0200 Subject: [PATCH 066/182] queried instance removed from the loss calculation --- modAL/expected_error.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/modAL/expected_error.py b/modAL/expected_error.py index b5dcd6d..31296d1 100644 --- a/modAL/expected_error.py +++ b/modAL/expected_error.py @@ -59,13 +59,14 @@ def expected_error_reduction(learner: ActiveLearner, X: modALinput, loss: str = for x_idx, x in enumerate(X): # subsample the data if needed if np.random.rand() <= p_subsample: + X_reduced = np.delete(X, x_idx, axis=0) # estimate the expected error for y_idx, y in enumerate(possible_labels): X_new = data_vstack((learner.X_training, np.expand_dims(x, axis=0))) y_new = data_vstack((learner.y_training, np.array(y).reshape(1,))) cloned_estimator.fit(X_new, y_new) - refitted_proba = cloned_estimator.predict_proba(X) + refitted_proba = cloned_estimator.predict_proba(X_reduced) if loss is 'binary': nloss = _proba_uncertainty(refitted_proba) elif loss is 'log': From 85445aef9e07f31b9e3ecae9c76703655e07e633 Mon Sep 17 00:00:00 2001 From: damienlancry Date: Mon, 17 Jun 2019 13:43:12 +0800 Subject: [PATCH 067/182] DBAL with Image Data implementation using modAL --- examples/deep_bayesian_active_learning.py | 101 ++++++++++++++++++++++ 1 file changed, 101 insertions(+) create mode 100644 examples/deep_bayesian_active_learning.py diff --git a/examples/deep_bayesian_active_learning.py b/examples/deep_bayesian_active_learning.py new file mode 100644 index 0000000..75f59f4 --- /dev/null +++ 
b/examples/deep_bayesian_active_learning.py @@ -0,0 +1,101 @@ +import keras +import numpy as np +from keras import backend as K +from keras.datasets import mnist +from keras.models import Sequential +from keras.layers import Dense, Dropout, Activation, Flatten, Conv2D, MaxPooling2D +from keras.regularizers import l2 +from keras.wrappers.scikit_learn import KerasClassifier +from modAL.models import ActiveLearner + + +def create_keras_model(): + model = Sequential() + model.add(Conv2D(32, (3, 3), activation='relu')) + model.add(Conv2D(32, (3, 3), activation='relu')) + model.add(MaxPooling2D(pool_size=(2, 2))) + model.add(Dropout(0.25)) + c = 3.5 + weight_decay = c / float(X_train.shape[0]) + model.add(Flatten()) + model.add(Dense(128, activation='relu', kernel_regularizer=l2(weight_decay))) + model.add(Dropout(0.5)) + model.add(Dense(10, activation='softmax')) + model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=["accuracy"]) + return model + + +# create the classifier +classifier = KerasClassifier(create_keras_model) + +# read training data +(X_train, y_train), (X_test, y_test) = mnist.load_data() + + +# assemble initial data +initial_idx = np.array([],dtype=np.int) +for i in range(10): + idx = np.random.choice(np.where(y_train==i)[0], size=2, replace=False) + initial_idx = np.concatenate((initial_idx, idx)) + +# Preprocessing +X_train = X_train.reshape(60000, 28, 28, 1).astype('float32') / 255. +X_test = X_test.reshape(10000, 28, 28, 1).astype('float32') / 255. 
+y_train = keras.utils.to_categorical(y_train, 10) +y_test = keras.utils.to_categorical(y_test, 10) + +X_initial = X_train[initial_idx] +y_initial = y_train[initial_idx] + +# generate the pool +# remove the initial data from the training dataset +X_pool = np.delete(X_train, initial_idx, axis=0) +y_pool = np.delete(y_train, initial_idx, axis=0) + +""" +Query Strategy +""" + +def max_entropy(learner, X, n_instances=1, T=100): + subset = X[np.random.choice(range(len(X)), size=2000, replace=False)] + MC_output = K.function([learner.estimator.model.layers[0].input, K.learning_phase()], + [learner.estimator.model.layers[-1].output]) + learning_phase = True + MC_samples = [MC_output([subset, learning_phase])[0] for _ in range(T)] + MC_samples = np.array(MC_samples) # [#samples x batch size x #classes] + acquisition = - np.mean(np.sum(MC_samples * np.log(MC_samples + 1e-10), axis=-1), axis=0) # [batch size] + query_idx = (-acquisition).argsort()[:n_instances] + return query_idx, X[query_idx] + +def uniform(learner, X, n_instances=1): + query_idx = np.random.choice(range(len(X)), size=n_instances, replace=False) + return query_idx, X[query_idx] + +""" +Training the ActiveLearner +""" + +# initialize ActiveLearner +learner = ActiveLearner( + estimator=classifier, + X_training=X_initial, + y_training=y_initial, + query_strategy=max_entropy, + verbose=0 +) + +# the active learning loop +n_queries = 100 +perf_hist = [learner.score(X_test, y_test, verbose=0)] +for index in range(n_queries): + query_idx, query_instance = learner.query(X_pool, n_instances=10) + learner.teach(X_pool[query_idx], y_pool[query_idx], epochs=50, batch_size=128, verbose=0) + # remove queried instance from pool + X_pool = np.delete(X_pool, query_idx, axis=0) + y_pool = np.delete(y_pool, query_idx, axis=0) + model_accuracy = learner.score(X_test, y_test, verbose=0) + print('Accuracy after query {n}: {acc:0.4f}'.format(n=index + 1, acc=model_accuracy)) + perf_hist = [model_accuracy] + 
+np.save('/home/damien/Results/keras_modal_entropy.npy', perf_hist) +print("saving to /home/damien/Results/keras_modal_entropy.npy") From eb2df2f9ae60be5dbd5d18273d6b65c09661e671 Mon Sep 17 00:00:00 2001 From: damienlancry Date: Mon, 17 Jun 2019 14:02:24 +0800 Subject: [PATCH 068/182] fixed max_entropy acquisition function --- examples/deep_bayesian_active_learning.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/examples/deep_bayesian_active_learning.py b/examples/deep_bayesian_active_learning.py index 75f59f4..b705fc3 100644 --- a/examples/deep_bayesian_active_learning.py +++ b/examples/deep_bayesian_active_learning.py @@ -63,7 +63,8 @@ def max_entropy(learner, X, n_instances=1, T=100): learning_phase = True MC_samples = [MC_output([subset, learning_phase])[0] for _ in range(T)] MC_samples = np.array(MC_samples) # [#samples x batch size x #classes] - acquisition = - np.mean(np.sum(MC_samples * np.log(MC_samples + 1e-10), axis=-1), axis=0) # [batch size] + expected_p = np.mean(MC_samples, axis=0) + acquisition = - np.sum(expected_p * np.log(expected_p + 1e-10), axis=-1) # [batch size] query_idx = (-acquisition).argsort()[:n_instances] return query_idx, X[query_idx] From 0d8486e3ec8128630563fea3d60ccc0a1884a7da Mon Sep 17 00:00:00 2001 From: damienlancry Date: Mon, 17 Jun 2019 15:27:00 +0800 Subject: [PATCH 069/182] removed weight decay --- examples/deep_bayesian_active_learning.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/examples/deep_bayesian_active_learning.py b/examples/deep_bayesian_active_learning.py index b705fc3..8c1c87e 100644 --- a/examples/deep_bayesian_active_learning.py +++ b/examples/deep_bayesian_active_learning.py @@ -15,8 +15,6 @@ def create_keras_model(): model.add(Conv2D(32, (3, 3), activation='relu')) model.add(MaxPooling2D(pool_size=(2, 2))) model.add(Dropout(0.25)) - c = 3.5 - weight_decay = c / float(X_train.shape[0]) model.add(Flatten()) model.add(Dense(128, activation='relu', 
kernel_regularizer=l2(weight_decay))) model.add(Dropout(0.5)) From d1488fc683c35372de74c73638aeebadcb990cf3 Mon Sep 17 00:00:00 2001 From: damienlancry Date: Mon, 17 Jun 2019 15:27:26 +0800 Subject: [PATCH 070/182] removed weight decay --- examples/deep_bayesian_active_learning.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/deep_bayesian_active_learning.py b/examples/deep_bayesian_active_learning.py index 8c1c87e..72b1716 100644 --- a/examples/deep_bayesian_active_learning.py +++ b/examples/deep_bayesian_active_learning.py @@ -16,7 +16,7 @@ def create_keras_model(): model.add(MaxPooling2D(pool_size=(2, 2))) model.add(Dropout(0.25)) model.add(Flatten()) - model.add(Dense(128, activation='relu', kernel_regularizer=l2(weight_decay))) + model.add(Dense(128, activation='relu')) model.add(Dropout(0.5)) model.add(Dense(10, activation='softmax')) model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=["accuracy"]) From d02803f3c3e049abf695931a1a22be56eea38c20 Mon Sep 17 00:00:00 2001 From: damienlancry Date: Mon, 17 Jun 2019 18:43:04 +0800 Subject: [PATCH 071/182] fixed pool size --- examples/deep_bayesian_active_learning.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/examples/deep_bayesian_active_learning.py b/examples/deep_bayesian_active_learning.py index 72b1716..fd0955b 100644 --- a/examples/deep_bayesian_active_learning.py +++ b/examples/deep_bayesian_active_learning.py @@ -13,7 +13,7 @@ def create_keras_model(): model = Sequential() model.add(Conv2D(32, (3, 3), activation='relu')) model.add(Conv2D(32, (3, 3), activation='relu')) - model.add(MaxPooling2D(pool_size=(2, 2))) + model.add(MaxPooling2D(pool_size=(5, 5))) model.add(Dropout(0.25)) model.add(Flatten()) model.add(Dense(128, activation='relu')) @@ -96,5 +96,6 @@ def uniform(learner, X, n_instances=1): print('Accuracy after query {n}: {acc:0.4f}'.format(n=index + 1, acc=model_accuracy)) perf_hist = [model_accuracy] 
-np.save('/home/damien/Results/keras_modal_entropy.npy', perf_hist) -print("saving to /home/damien/Results/keras_modal_entropy.npy") +save_path = "/home/damien/Results/keras_modal_riashat_entropy.npy" +np.save(save_path, perf_hist) +print(save_path) From f5f9753744bfd4320fbba45a97c3c9c1fce3fa42 Mon Sep 17 00:00:00 2001 From: damienlancry Date: Tue, 18 Jun 2019 11:45:09 +0800 Subject: [PATCH 072/182] append --- examples/deep_bayesian_active_learning.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/deep_bayesian_active_learning.py b/examples/deep_bayesian_active_learning.py index fd0955b..9f7d2e7 100644 --- a/examples/deep_bayesian_active_learning.py +++ b/examples/deep_bayesian_active_learning.py @@ -94,7 +94,7 @@ def uniform(learner, X, n_instances=1): y_pool = np.delete(y_pool, query_idx, axis=0) model_accuracy = learner.score(X_test, y_test, verbose=0) print('Accuracy after query {n}: {acc:0.4f}'.format(n=index + 1, acc=model_accuracy)) - perf_hist = [model_accuracy] + perf_hist.append(model_accuracy) save_path = "/home/damien/Results/keras_modal_riashat_entropy.npy" np.save(save_path, perf_hist) From f0c6924c3e028bdd0f5b3db2a6c6c1a28b74e46f Mon Sep 17 00:00:00 2001 From: cosmic-cortex Date: Wed, 19 Jun 2019 07:00:50 +0200 Subject: [PATCH 073/182] forced data conversion removed by passing dtype=None argument for check_X_y calls --- modAL/models/base.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/modAL/models/base.py b/modAL/models/base.py index c4703a1..822fdbc 100644 --- a/modAL/models/base.py +++ b/modAL/models/base.py @@ -71,7 +71,7 @@ def _add_training_data(self, X: modALinput, y: modALinput) -> None: If the classifier has been fitted, the features in X have to agree with the training samples which the classifier has seen. 
""" - check_X_y(X, y, accept_sparse=True, ensure_2d=False, allow_nd=True, multi_output=True) + check_X_y(X, y, accept_sparse=True, ensure_2d=False, allow_nd=True, multi_output=True, dtype=None) if self.X_training is None: self.X_training = X @@ -117,7 +117,7 @@ def _fit_on_new(self, X: modALinput, y: modALinput, bootstrap: bool = False, **f Returns: self """ - check_X_y(X, y, accept_sparse=True, ensure_2d=False, allow_nd=True, multi_output=True) + check_X_y(X, y, accept_sparse=True, ensure_2d=False, allow_nd=True, multi_output=True, dtype=None) if not bootstrap: self.estimator.fit(X, y, **fit_kwargs) @@ -146,7 +146,7 @@ def fit(self, X: modALinput, y: modALinput, bootstrap: bool = False, **fit_kwarg Returns: self """ - check_X_y(X, y, accept_sparse=True, ensure_2d=False, allow_nd=True, multi_output=True) + check_X_y(X, y, accept_sparse=True, ensure_2d=False, allow_nd=True, multi_output=True, dtype=None) self.X_training, self.y_training = X, y return self._fit_to_known(bootstrap=bootstrap, **fit_kwargs) From 274f974a0690ba5de50dedb3109836ba6f276b5a Mon Sep 17 00:00:00 2001 From: damienlancry Date: Wed, 19 Jun 2019 15:56:41 +0800 Subject: [PATCH 074/182] made the network closer to the one described in the paper --- examples/deep_bayesian_active_learning.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/examples/deep_bayesian_active_learning.py b/examples/deep_bayesian_active_learning.py index 9f7d2e7..d74966e 100644 --- a/examples/deep_bayesian_active_learning.py +++ b/examples/deep_bayesian_active_learning.py @@ -8,12 +8,11 @@ from keras.wrappers.scikit_learn import KerasClassifier from modAL.models import ActiveLearner - def create_keras_model(): model = Sequential() - model.add(Conv2D(32, (3, 3), activation='relu')) - model.add(Conv2D(32, (3, 3), activation='relu')) - model.add(MaxPooling2D(pool_size=(5, 5))) + model.add(Conv2D(32, (4, 4), activation='relu')) + model.add(Conv2D(32, (4, 4), activation='relu')) + 
model.add(MaxPooling2D(pool_size=(2, 2))) model.add(Dropout(0.25)) model.add(Flatten()) model.add(Dense(128, activation='relu')) From bdf72524a2417daa00524ec9327d125eebbe349f Mon Sep 17 00:00:00 2001 From: damienlancry Date: Wed, 19 Jun 2019 15:57:28 +0800 Subject: [PATCH 075/182] fixed max_entropy function --- examples/deep_bayesian_active_learning.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/examples/deep_bayesian_active_learning.py b/examples/deep_bayesian_active_learning.py index d74966e..71c321b 100644 --- a/examples/deep_bayesian_active_learning.py +++ b/examples/deep_bayesian_active_learning.py @@ -54,15 +54,16 @@ def create_keras_model(): """ def max_entropy(learner, X, n_instances=1, T=100): - subset = X[np.random.choice(range(len(X)), size=2000, replace=False)] + random_subset = np.random.choice(X.shape[0], 2000, replace=False) MC_output = K.function([learner.estimator.model.layers[0].input, K.learning_phase()], [learner.estimator.model.layers[-1].output]) learning_phase = True - MC_samples = [MC_output([subset, learning_phase])[0] for _ in range(T)] + MC_samples = [MC_output([X[random_subset], learning_phase])[0] for _ in range(T)] MC_samples = np.array(MC_samples) # [#samples x batch size x #classes] expected_p = np.mean(MC_samples, axis=0) acquisition = - np.sum(expected_p * np.log(expected_p + 1e-10), axis=-1) # [batch size] - query_idx = (-acquisition).argsort()[:n_instances] + idx = (-acquisition).argsort()[:n_instances] + query_idx = random_subset[idx] return query_idx, X[query_idx] def uniform(learner, X, n_instances=1): From 300a51846b82edbb33200379840198724d573984 Mon Sep 17 00:00:00 2001 From: damienlancry Date: Wed, 19 Jun 2019 15:59:22 +0800 Subject: [PATCH 076/182] removed hard coded path --- examples/deep_bayesian_active_learning.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/examples/deep_bayesian_active_learning.py b/examples/deep_bayesian_active_learning.py index 71c321b..6ae5c4a 
100644 --- a/examples/deep_bayesian_active_learning.py +++ b/examples/deep_bayesian_active_learning.py @@ -44,8 +44,7 @@ def create_keras_model(): X_initial = X_train[initial_idx] y_initial = y_train[initial_idx] -# generate the pool -# remove the initial data from the training dataset +# remove the initial data from the pool of unlabelled examples X_pool = np.delete(X_train, initial_idx, axis=0) y_pool = np.delete(y_train, initial_idx, axis=0) @@ -95,7 +94,3 @@ def uniform(learner, X, n_instances=1): model_accuracy = learner.score(X_test, y_test, verbose=0) print('Accuracy after query {n}: {acc:0.4f}'.format(n=index + 1, acc=model_accuracy)) perf_hist.append(model_accuracy) - -save_path = "/home/damien/Results/keras_modal_riashat_entropy.npy" -np.save(save_path, perf_hist) -print(save_path) From 1bd7fe7f775c0c8123834e4ed1223ad6da096fbc Mon Sep 17 00:00:00 2001 From: Vikas Kumar Yadav Date: Mon, 24 Jun 2019 18:20:25 +0530 Subject: [PATCH 077/182] Update pool-based_sampling.ipynb --- docs/source/content/examples/pool-based_sampling.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/content/examples/pool-based_sampling.ipynb b/docs/source/content/examples/pool-based_sampling.ipynb index 14a1746..4f18408 100644 --- a/docs/source/content/examples/pool-based_sampling.ipynb +++ b/docs/source/content/examples/pool-based_sampling.ipynb @@ -8,7 +8,7 @@ "\n", "## Overview\n", "\n", - "In this example, the we apply an `ActiveLearner` onto the iris dataset using pool-based sampling. In this setting, we assume a small set of labeled data $\\mathcal{L}$ and a large set of unlabeled data $\\mathcal{U}$ such that $\\left| \\mathcal{L} \\right| \\ll \\left| \\mathcal{U} \\right|$. In his review of the active learning literature, Settles covers a high-level overview of the general pool-based sampling algorithm:\n", + "In this example, we apply an `ActiveLearner` onto the iris dataset using pool-based sampling. 
In this setting, we assume a small set of labeled data $\\mathcal{L}$ and a large set of unlabeled data $\\mathcal{U}$ such that $\\left| \\mathcal{L} \\right| \\ll \\left| \\mathcal{U} \\right|$. In his review of the active learning literature, Settles covers a high-level overview of the general pool-based sampling algorithm:\n", "\n", "> Queries are selectively drawn from the pool, which is usually assumed to be closed (i.e., static or non-changing), although this is not strictly necessary. Typically, instances are queried in a greedy fashion, according to an informativeness measure used to evaluate all instances in the pool (or, perhaps if $\\mathcal{U}$ is very large, some subsample thereof).\n", "\n", From 39336f21cd872974cf2f34c1c79012ca30a96819 Mon Sep 17 00:00:00 2001 From: cosmic-cortex Date: Thu, 27 Jun 2019 21:27:24 +0200 Subject: [PATCH 078/182] Bayesian optimization fixed for multidimensional functions --- examples/bayesian_optimization_multidim.py | 26 ++++++++++++++++++++++ modAL/acquisition.py | 6 ++--- tests/core_tests.py | 12 +++++----- 3 files changed, 35 insertions(+), 9 deletions(-) create mode 100644 examples/bayesian_optimization_multidim.py diff --git a/examples/bayesian_optimization_multidim.py b/examples/bayesian_optimization_multidim.py new file mode 100644 index 0000000..26499dc --- /dev/null +++ b/examples/bayesian_optimization_multidim.py @@ -0,0 +1,26 @@ +import numpy as np +from sklearn.gaussian_process import GaussianProcessRegressor +from sklearn.gaussian_process.kernels import Matern +from modAL.models import BayesianOptimizer +from modAL.acquisition import max_EI + + +# generating the data +x1, x2 = np.linspace(0, 10, 11).reshape(-1, 1), np.linspace(0, 10, 11).reshape(-1, 1) +x1, x2 = np.meshgrid(x1, x2) +X = np.concatenate((x1.reshape(-1, 1), x2.reshape(-1, 1)), axis=1) + +y = np.sin(np.linalg.norm(X, axis=1))/2 - ((10 - np.linalg.norm(X, axis=1))**2)/50 + 2 + +# assembling initial training set +X_initial, y_initial = X[:10], 
y[:10] + +# defining the kernel for the Gaussian process +kernel = Matern(length_scale=1.0) + +optimizer = BayesianOptimizer(estimator=GaussianProcessRegressor(kernel=kernel), + X_training=X_initial, y_training=y_initial, + query_strategy=max_EI) + +query_idx, query_inst = optimizer.query(X) +optimizer.teach(X[query_idx].reshape(1, -1), y[query_idx]) diff --git a/modAL/acquisition.py b/modAL/acquisition.py index 54792d3..8a97526 100644 --- a/modAL/acquisition.py +++ b/modAL/acquisition.py @@ -47,7 +47,7 @@ def optimizer_PI(optimizer: BaseLearner, X: modALinput, tradeoff: float = 0) -> """ try: mean, std = optimizer.predict(X, return_std=True) - std = std.reshape(-1, 1) + mean, std = mean.reshape(-1, ), std.reshape(-1, ) except NotFittedError: mean, std = np.zeros(shape=(X.shape[0], 1)), np.ones(shape=(X.shape[0], 1)) @@ -68,7 +68,7 @@ def optimizer_EI(optimizer: BaseLearner, X: modALinput, tradeoff: float = 0) -> """ try: mean, std = optimizer.predict(X, return_std=True) - std = std.reshape(-1, 1) + mean, std = mean.reshape(-1, ), std.reshape(-1, ) except NotFittedError: mean, std = np.zeros(shape=(X.shape[0], 1)), np.ones(shape=(X.shape[0], 1)) @@ -89,7 +89,7 @@ def optimizer_UCB(optimizer: BaseLearner, X: modALinput, beta: float = 1) -> np. 
""" try: mean, std = optimizer.predict(X, return_std=True) - std = std.reshape(-1, 1) + mean, std = mean.reshape(-1, ), std.reshape(-1, ) except NotFittedError: mean, std = np.zeros(shape=(X.shape[0], 1)), np.ones(shape=(X.shape[0], 1)) diff --git a/tests/core_tests.py b/tests/core_tests.py index d1060c6..bddbab7 100644 --- a/tests/core_tests.py +++ b/tests/core_tests.py @@ -210,8 +210,8 @@ def test_acquisition_functions(self): def test_optimizer_PI(self): for n_samples in range(1, 100): - mean = np.random.rand(n_samples, 1) - std = np.random.rand(n_samples, 1) + mean = np.random.rand(n_samples, ) + std = np.random.rand(n_samples, ) tradeoff = np.random.rand() max_val = np.random.rand() @@ -239,8 +239,8 @@ def test_optimizer_PI(self): def test_optimizer_EI(self): for n_samples in range(1, 100): - mean = np.random.rand(n_samples, 1) - std = np.random.rand(n_samples, 1) + mean = np.random.rand(n_samples, ) + std = np.random.rand(n_samples, ) tradeoff = np.random.rand() max_val = np.random.rand() @@ -272,8 +272,8 @@ def test_optimizer_EI(self): def test_optimizer_UCB(self): for n_samples in range(1, 100): - mean = np.random.rand(n_samples, 1) - std = np.random.rand(n_samples, 1) + mean = np.random.rand(n_samples, ) + std = np.random.rand(n_samples, ) beta = np.random.rand() # 1. fitted estimator From 271d885160094354f861fa6f344f5d27a728a9bf Mon Sep 17 00:00:00 2001 From: chkoar Date: Wed, 7 Aug 2019 15:28:57 +0300 Subject: [PATCH 079/182] Fix stream-based sampling example --- examples/stream-based_sampling.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/stream-based_sampling.py b/examples/stream-based_sampling.py index 95a8082..481a09c 100644 --- a/examples/stream-based_sampling.py +++ b/examples/stream-based_sampling.py @@ -1,5 +1,5 @@ """ -In this example the use of ActiveLearner is demonstrated in a pool-based sampling setting. +In this example the use of ActiveLearner is demonstrated in a stream-based sampling setting. 
""" import numpy as np @@ -67,5 +67,5 @@ plt.figure(figsize=(7, 7)) prediction = learner.predict_proba(X_full)[:, 1] plt.imshow(prediction.reshape(im_width, im_height)) - plt.title('Initial prediction accuracy: %f' % learner.score(X_full, y_full)) + plt.title('Final prediction accuracy: %f' % learner.score(X_full, y_full)) plt.show() From fdd2aa79640f4c65a1c462aa24c12f939b06eb63 Mon Sep 17 00:00:00 2001 From: cosmic-cortex Date: Mon, 11 Nov 2019 11:35:52 +0100 Subject: [PATCH 080/182] force_all_finite support added --- modAL/models/base.py | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/modAL/models/base.py b/modAL/models/base.py index 822fdbc..dc0d35d 100644 --- a/modAL/models/base.py +++ b/modAL/models/base.py @@ -30,6 +30,8 @@ class BaseLearner(ABC, BaseEstimator): for instance, modAL.uncertainty.uncertainty_sampling. X_training: Initial training samples, if available. y_training: Initial training labels corresponding to initial training samples. + force_all_finite: When True, forces all values of the data finite. + When False, accepts np.nan and np.inf values. bootstrap_init: If initial training data is available, bootstrapping can be done during the first training. Useful when building Committee models with bagging. **fit_kwargs: keyword arguments. @@ -47,6 +49,7 @@ def __init__(self, X_training: Optional[modALinput] = None, y_training: Optional[modALinput] = None, bootstrap_init: bool = False, + force_all_finite: bool = True, **fit_kwargs ) -> None: assert callable(query_strategy), 'query_strategy must be callable' @@ -59,6 +62,9 @@ def __init__(self, if X_training is not None: self._fit_to_known(bootstrap=bootstrap_init, **fit_kwargs) + assert isinstance(force_all_finite, bool), 'force_all_finite must be a bool' + self.force_all_finite = force_all_finite + def _add_training_data(self, X: modALinput, y: modALinput) -> None: """ Adds the new data and label to the known data, but does not retrain the model. 
@@ -71,7 +77,8 @@ def _add_training_data(self, X: modALinput, y: modALinput) -> None: If the classifier has been fitted, the features in X have to agree with the training samples which the classifier has seen. """ - check_X_y(X, y, accept_sparse=True, ensure_2d=False, allow_nd=True, multi_output=True, dtype=None) + check_X_y(X, y, accept_sparse=True, ensure_2d=False, allow_nd=True, multi_output=True, dtype=None, + force_all_finite=self.force_all_finite) if self.X_training is None: self.X_training = X @@ -117,7 +124,8 @@ def _fit_on_new(self, X: modALinput, y: modALinput, bootstrap: bool = False, **f Returns: self """ - check_X_y(X, y, accept_sparse=True, ensure_2d=False, allow_nd=True, multi_output=True, dtype=None) + check_X_y(X, y, accept_sparse=True, ensure_2d=False, allow_nd=True, multi_output=True, dtype=None, + force_all_finite=self.force_all_finite) if not bootstrap: self.estimator.fit(X, y, **fit_kwargs) @@ -146,7 +154,8 @@ def fit(self, X: modALinput, y: modALinput, bootstrap: bool = False, **fit_kwarg Returns: self """ - check_X_y(X, y, accept_sparse=True, ensure_2d=False, allow_nd=True, multi_output=True, dtype=None) + check_X_y(X, y, accept_sparse=True, ensure_2d=False, allow_nd=True, multi_output=True, dtype=None, + force_all_finite=self.force_all_finite) self.X_training, self.y_training = X, y return self._fit_to_known(bootstrap=bootstrap, **fit_kwargs) From 93c78041ecb2b7ed5893080c36cab1ec5116341e Mon Sep 17 00:00:00 2001 From: cosmic-cortex Date: Mon, 11 Nov 2019 11:36:16 +0100 Subject: [PATCH 081/182] tests added for force_all_finite mode --- tests/core_tests.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/tests/core_tests.py b/tests/core_tests.py index bddbab7..1c80d7b 100644 --- a/tests/core_tests.py +++ b/tests/core_tests.py @@ -734,6 +734,25 @@ def test_teach(self): learner.teach(X, y, bootstrap=bootstrap, only_new=only_new) + def test_nan(self): + X_training_nan = np.ones(shape=(10, 2)) * np.nan + X_training_inf = 
np.ones(shape=(10, 2)) * np.inf + y_training = np.random.randint(0, 2, size=10) + + learner = modAL.models.learners.ActiveLearner( + X_training=X_training_nan, y_training=y_training, + estimator=mock.MockEstimator(), + force_all_finite=False + ) + learner.teach(X_training_nan, y_training) + + learner = modAL.models.learners.ActiveLearner( + X_training=X_training_inf, y_training=y_training, + estimator=mock.MockEstimator(), + force_all_finite=False + ) + learner.teach(X_training_inf, y_training) + def test_keras(self): pass From f7a2d33443dc7cae7ac77ea637d9118dbcccc9ba Mon Sep 17 00:00:00 2001 From: cosmic-cortex Date: Mon, 11 Nov 2019 11:40:37 +0100 Subject: [PATCH 082/182] version number bumped --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index be59e7b..d9fb4a8 100644 --- a/setup.py +++ b/setup.py @@ -2,7 +2,7 @@ setup( name='modAL', - version='0.3.4', + version='0.3.5', author='Tivadar Danka', author_email='85a5187a@opayq.com', description='A modular active learning framework for Python3', From 5dda5bfc84f1fdb67d30194c25255807f99f83e1 Mon Sep 17 00:00:00 2001 From: philipjhj Date: Mon, 25 Nov 2019 14:33:24 +0100 Subject: [PATCH 083/182] correctly updates known classes after teaching estimators --- modAL/models/learners.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/modAL/models/learners.py b/modAL/models/learners.py index 7fdcf2c..b4ea394 100644 --- a/modAL/models/learners.py +++ b/modAL/models/learners.py @@ -311,6 +311,20 @@ def _set_classes(self): def _add_training_data(self, X: modALinput, y: modALinput): super()._add_training_data(X, y) + + def teach(self, X: modALinput, y: modALinput, bootstrap: bool = False, only_new: bool = False, **fit_kwargs) -> None: + """ + Adds X and y to the known training data for each learner and retrains learners with the augmented dataset. + + Args: + X: The new samples for which the labels are supplied by the expert. 
+ y: Labels corresponding to the new instances in X. + bootstrap: If True, trains each learner on a bootstrapped set. Useful when building the ensemble by bagging. + only_new: If True, the model is retrained using only X and y, ignoring the previously provided examples. + **fit_kwargs: Keyword arguments to be passed to the fit method of the predictor. + """ + + super().teach(X, y, bootstrap=bootstrap, only_new=only_new, **fit_kwargs) self._set_classes() def predict(self, X: modALinput, **predict_proba_kwargs) -> Any: From d95ecffd51a9b8168ca229ce7c64ddfe6a437f9d Mon Sep 17 00:00:00 2001 From: charles Date: Mon, 16 Mar 2020 10:49:10 -0700 Subject: [PATCH 084/182] Fix random sampling error Other option would be to use `np.random.choice(range(len(X))` --- examples/shape_learning.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/shape_learning.py b/examples/shape_learning.py index c190e24..8487ce8 100644 --- a/examples/shape_learning.py +++ b/examples/shape_learning.py @@ -57,7 +57,7 @@ def random_sampling(classsifier, X): - query_idx = np.random.rand(range(len(X))) + query_idx = np.random.randint(len(X)) return query_idx, X[query_idx] @@ -100,4 +100,4 @@ def random_sampling(classsifier, X): plt.plot(list(range(len(uncertainty_sampling_accuracy))), uncertainty_sampling_accuracy, label="uncertainty sampling") plt.plot(list(range(len(random_sampling_accuracy))), random_sampling_accuracy, label="random sampling") plt.legend() - plt.show() \ No newline at end of file + plt.show() From 00ea8b16952211d024f226861746b1b7c7ef6571 Mon Sep 17 00:00:00 2001 From: kunal mehta <36035718+kunakl07@users.noreply.github.com> Date: Sun, 22 Mar 2020 11:25:33 +0530 Subject: [PATCH 085/182] Some typos fixed I have fixed some typos --- .../query_strategies/Disagreement-sampling.rst | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/docs/source/content/query_strategies/Disagreement-sampling.rst 
b/docs/source/content/query_strategies/Disagreement-sampling.rst index b8deeb8..6786212 100644 --- a/docs/source/content/query_strategies/Disagreement-sampling.rst +++ b/docs/source/content/query_strategies/Disagreement-sampling.rst @@ -3,7 +3,7 @@ Disagreement sampling ===================== -When you have several hypothesis about your data, selecting the next instances to label can be done by measuring the disagreement between the hypotheses. Naturally, there are many ways to do that. In modAL, there are three built-in disagreement measures and query strategies: *vote entropy*, *consensus entropy* and *maximum disagreement*. In this quick tutorial, we are going to review them. For more details, see Section 3.4 of the awesome book `Active learning by Burr Settles `__. +When you have several hypotheses about your data, selecting the next instances to label can be done by measuring the disagreement between the hypotheses. Naturally, there are many ways to do that. In modAL, there are three built-in disagreement measures and query strategies: *vote entropy*, *consensus entropy* and *maximum disagreement*. In this quick tutorial, we are going to review them. For more details, see Section 3.4 of the awesome book `Active learning by Burr Settles `__. Disagreement sampling for classifiers ------------------------------------- @@ -52,7 +52,7 @@ Instead of calculating the distribution of the votes, the *consensus entropy* disagreement measure first calculates the average of the class probabilities of each classifier. This is called the consensus probability. Then the entropy of the consensus probability is calculated -and the instance with largest consensus entropy is selected. +and the instance with the largest consensus entropy is selected. For an example, let's suppose that we continue the previous example with three classifiers, classes ``[0, 1, 2]`` and five instances to classify. 
@@ -100,7 +100,7 @@ Even though the votes for the second instance are ``[1, 1, 2]``, since the class Max disagreement ^^^^^^^^^^^^^^^^ -The disagreement measures so far take the actual *disagreement* into account in a weak way. Instead of this, it is possible to to measure each learner's disagreement with the consensus probabilities and query the instance where the disagreement is largest for some learner. This is called *max disagreement sampling*. Continuing our example, if the vote probabilities for each learner and the consensus probabilities are given, we can calculate the `Kullback-Leibler divergence `__ of each learner to the consensus prediction and then for each instance, select the largest value. +The disagreement measures so far take the actual *disagreement* into account in a weak way. Instead of this, it is possible to measure each learner's disagreement with the consensus probabilities and query the instance where the disagreement is largest for some learner. This is called *max disagreement sampling*. Continuing our example, if the vote probabilities for each learner and the consensus probabilities are given, we can calculate the `Kullback-Leibler divergence `__ of each learner to the consensus prediction and then for each instance, select the largest value. .. code:: python @@ -123,7 +123,7 @@ In this case, one of the learner highly disagrees with the others in the class o Disagreement sampling for regressors ------------------------------------ -Since regressors in general don't provide a way to calculate prediction probabilities, disagreement measures for classifiers may not work with regressors. Despite of this, ensemble regression models can be always used in an active learning scenario, because the standard deviation of the predictions at a given point can be thought of as a measure of disagreement. 
+Since regressors, in general, don't provide a way to calculate prediction probabilities, disagreement measures for classifiers may not work with regressors. Despite this, ensemble regression models can be always used in an active learning scenario, because the standard deviation of the predictions at a given point can be thought of as a measure of disagreement. Standard deviation sampling ^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -131,7 +131,7 @@ Standard deviation sampling .. figure:: img/er-initial.png :align: center -When a committee of regressors is available, uncertainty of predictions can be estimated by calculating the standard deviation of predictions. This is done by the ``modAL.disagreement.max_std_sampling`` function. +When a committee of regressors is available, the uncertainty of predictions can be estimated by calculating the standard deviation of predictions. This is done by the ``modAL.disagreement.max_std_sampling`` function. Disagreement measures in action ------------------------------- @@ -151,7 +151,7 @@ The consensus predictions of these learners are .. figure:: img/dis-consensus.png :align: center -In this case, the disagreement measures from left to right are vote entropy, consensus entropy and max disagreement. +In this case, the disagreement measures from left to right are vote entropy, consensus entropy, and max disagreement. .. 
figure:: img/dis-measures.png :align: center From 0353aee3a68e91788bdc381a39f40bbbc2d4b144 Mon Sep 17 00:00:00 2001 From: kunal mehta <36035718+kunakl07@users.noreply.github.com> Date: Sun, 22 Mar 2020 11:27:39 +0530 Subject: [PATCH 086/182] Fixed some typos --- docs/source/content/query_strategies/Acquisition-functions.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/content/query_strategies/Acquisition-functions.rst b/docs/source/content/query_strategies/Acquisition-functions.rst index df28c39..c3a5725 100644 --- a/docs/source/content/query_strategies/Acquisition-functions.rst +++ b/docs/source/content/query_strategies/Acquisition-functions.rst @@ -3,7 +3,7 @@ Acquisition functions ===================== -In Bayesian optimization, a so-called *acquisition funciton* is used instead of the uncertainty based utility measures of active learning. In modAL, Bayesian optimization algorithms are implemented in the ``modAL.models.BayesianOptimizer`` class. Currently, there are three available acquisition funcions: probability of improvement, expected improvement and upper confidence bound. +In Bayesian optimization, a so-called *acquisition funciton* is used instead of the uncertainty based utility measures of active learning. In modAL, Bayesian optimization algorithms are implemented in the ``modAL.models.BayesianOptimizer`` class. Currently, there are three available acquisition functions: probability of improvement, expected improvement and upper confidence bound. Probability of improvement -------------------------- From f62c752746477fa7fc22793a50cd1b159149dcda Mon Sep 17 00:00:00 2001 From: Parthe Pandit Date: Tue, 28 Jul 2020 17:34:38 -0700 Subject: [PATCH 087/182] Updated doc for ActiveLearner When X_training is provided a call to the method fit was not expected. 
This happens in the __init__() of the BaseLearner --- modAL/models/learners.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/modAL/models/learners.py b/modAL/models/learners.py index b4ea394..f9936fb 100644 --- a/modAL/models/learners.py +++ b/modAL/models/learners.py @@ -36,7 +36,7 @@ class ActiveLearner(BaseLearner): estimator: The estimator to be used in the active learning loop. query_strategy: Function providing the query strategy for the active learning loop. X_training: If the model hasn't been fitted yet it is None, otherwise it contains the samples - which the model has been trained on. + which the model has been trained on. If provided, the method fit() of estimator is called during __init__() y_training: The labels corresponding to X_training. Examples: @@ -517,4 +517,4 @@ def vote(self, X: modALinput, **predict_kwargs): for learner_idx, learner in enumerate(self.learner_list): prediction[:, learner_idx] = learner.predict(X, **predict_kwargs).reshape(-1, ) - return prediction \ No newline at end of file + return prediction From c95fb1d3be63176bb1c226ac6de5cb4ddce87dd6 Mon Sep 17 00:00:00 2001 From: cosmic-cortex Date: Fri, 21 Aug 2020 08:11:02 +0200 Subject: [PATCH 088/182] version number bumped to 0.3.6 --- .gitignore | 2 ++ setup.py | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 999cc9e..0134267 100644 --- a/.gitignore +++ b/.gitignore @@ -5,3 +5,5 @@ *# __pycache__ build/ +dist/ +modAL.egg-info/ diff --git a/setup.py b/setup.py index d9fb4a8..b48bfd8 100644 --- a/setup.py +++ b/setup.py @@ -2,7 +2,7 @@ setup( name='modAL', - version='0.3.5', + version='0.3.6', author='Tivadar Danka', author_email='85a5187a@opayq.com', description='A modular active learning framework for Python3', From e5bddadc6f7cfda2e8f2af36be875ba9095a4e13 Mon Sep 17 00:00:00 2001 From: Oskar Liew Date: Tue, 8 Sep 2020 17:52:06 +0200 Subject: [PATCH 089/182] Now sets classes when fitting committee --- 
modAL/models/learners.py | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/modAL/models/learners.py b/modAL/models/learners.py index f9936fb..7179d6c 100644 --- a/modAL/models/learners.py +++ b/modAL/models/learners.py @@ -312,6 +312,22 @@ def _set_classes(self): def _add_training_data(self, X: modALinput, y: modALinput): super()._add_training_data(X, y) + def fit(self, X: modALinput, y: modALinput, **fit_kwargs) -> 'BaseCommittee': + """ + Fits every learner to a subset sampled with replacement from X. Calling this method makes the learner forget the + data it has seen up until this point and replaces it with X! If you would like to perform bootstrapping on each + learner using the data it has seen, use the method .rebag()! + + Calling this method makes the learner forget the data it has seen up until this point and replaces it with X! + + Args: + X: The samples to be fitted on. + y: The corresponding labels. + **fit_kwargs: Keyword arguments to be passed to the fit method of the predictor. + """ + super().fit(X, y, **fit_kwargs) + self._set_classes() + def teach(self, X: modALinput, y: modALinput, bootstrap: bool = False, only_new: bool = False, **fit_kwargs) -> None: """ Adds X and y to the known training data for each learner and retrains learners with the augmented dataset. @@ -323,7 +339,6 @@ def teach(self, X: modALinput, y: modALinput, bootstrap: bool = False, only_new: only_new: If True, the model is retrained using only X and y, ignoring the previously provided examples. **fit_kwargs: Keyword arguments to be passed to the fit method of the predictor. 
""" - super().teach(X, y, bootstrap=bootstrap, only_new=only_new, **fit_kwargs) self._set_classes() From 8e0cb25029e4f1443168adb9313b3f49ae13ea0e Mon Sep 17 00:00:00 2001 From: Boyan Hristov Date: Thu, 24 Sep 2020 18:12:59 +0200 Subject: [PATCH 090/182] resolves #20, #104 - added pandas support and option for transforming data in learner --- README.md | 5 +- .../content/examples/active_regression.ipynb | 7 +- .../content/overview/Extending-modAL.ipynb | 12 +-- .../content/overview/modAL-in-a-nutshell.rst | 6 +- examples/active_regression.py | 3 +- examples/custom_query_strategies.py | 13 ++- examples/deep_bayesian_active_learning.py | 6 +- examples/shape_learning.py | 3 +- modAL/acquisition.py | 18 ++-- modAL/batch.py | 15 ++-- modAL/disagreement.py | 32 +++---- modAL/expected_error.py | 18 ++-- modAL/models/base.py | 90 +++++++++++++++++-- modAL/models/learners.py | 11 ++- modAL/multilabel.py | 56 +++++------- modAL/uncertainty.py | 24 ++--- modAL/utils/combination.py | 3 +- modAL/utils/data.py | 31 ++++++- rtd_requirements.txt | 1 + setup.py | 2 +- tests/core_tests.py | 18 ++-- .../example_tests/custom_query_strategies.py | 3 +- 22 files changed, 217 insertions(+), 160 deletions(-) diff --git a/README.md b/README.md index 46137da..0e9aec5 100644 --- a/README.md +++ b/README.md @@ -100,12 +100,11 @@ import numpy as np X = np.random.choice(np.linspace(0, 20, 10000), size=200, replace=False).reshape(-1, 1) y = np.sin(X) + np.random.normal(scale=0.3, size=X.shape) ``` -For active learning, we shall define a custom query strategy tailored to Gaussian processes. In a nutshell, a *query stategy* in modAL is a function taking (at least) two arguments (an estimator object and a pool of examples), outputting the index of the queried instance and the instance itself. In our case, the arguments are ```regressor``` and ```X```. +For active learning, we shall define a custom query strategy tailored to Gaussian processes. 
In a nutshell, a *query stategy* in modAL is a function taking (at least) two arguments (an estimator object and a pool of examples), outputting the index of the queried instance. In our case, the arguments are ```regressor``` and ```X```. ```python def GP_regression_std(regressor, X): _, std = regressor.predict(X, return_std=True) - query_idx = np.argmax(std) - return query_idx, X[query_idx] + return np.argmax(std) ``` After setting up the query strategy and the data, the active learner can be initialized. ```python diff --git a/docs/source/content/examples/active_regression.ipynb b/docs/source/content/examples/active_regression.ipynb index aa2f21e..15c0bc9 100644 --- a/docs/source/content/examples/active_regression.ipynb +++ b/docs/source/content/examples/active_regression.ipynb @@ -70,7 +70,7 @@ "metadata": {}, "source": [ "## Uncertainty measure and query strategy for Gaussian processes\n", - "For active learning, we shall define a custom query strategy tailored to Gaussian processes. More information on how to write your custom query strategies can be found at the page [Extending modAL](https://cosmic-cortex.github.io/modAL/Extending-modAL). In a nutshell, a *query stategy* in modAL is a function taking (at least) two arguments (an estimator object and a pool of examples), outputting the index of the queried instance and the instance itself. In our case, the arguments are ```regressor``` and ```X```." + "For active learning, we shall define a custom query strategy tailored to Gaussian processes. More information on how to write your custom query strategies can be found at the page [Extending modAL](https://cosmic-cortex.github.io/modAL/Extending-modAL). In a nutshell, a *query stategy* in modAL is a function taking (at least) two arguments (an estimator object and a pool of examples), outputting the index of the queried instance. In our case, the arguments are ```regressor``` and ```X```." 
] }, { @@ -81,8 +81,7 @@ "source": [ "def GP_regression_std(regressor, X):\n", " _, std = regressor.predict(X, return_std=True)\n", - " query_idx = np.argmax(std)\n", - " return query_idx, X[query_idx]" + " return np.argmax(std)" ] }, { @@ -234,4 +233,4 @@ }, "nbformat": 4, "nbformat_minor": 2 -} +} \ No newline at end of file diff --git a/docs/source/content/overview/Extending-modAL.ipynb b/docs/source/content/overview/Extending-modAL.ipynb index bd2f794..641ca98 100644 --- a/docs/source/content/overview/Extending-modAL.ipynb +++ b/docs/source/content/overview/Extending-modAL.ipynb @@ -27,11 +27,8 @@ " # measure the utility of each instance in the pool\n", " utility = utility_measure(classifier, X)\n", "\n", - " # select the indices of the instances to be queried\n", - " query_idx = select_instances(utility)\n", - "\n", - " # return the indices and the instances\n", - " return query_idx, X[query_idx]" + " # select and return the indices of the instances to be queried\n", + " return select_instances(utility)" ] }, { @@ -213,8 +210,7 @@ "# classifier uncertainty and classifier margin\n", "def custom_query_strategy(classifier, X, n_instances=1):\n", " utility = linear_combination(classifier, X)\n", - " query_idx = multi_argmax(utility, n_instances=n_instances)\n", - " return query_idx, X[query_idx]\n", + " return multi_argmax(utility, n_instances=n_instances)\n", "\n", "custom_query_learner = ActiveLearner(\n", " estimator=GaussianProcessClassifier(1.0 * RBF(1.0)),\n", @@ -299,4 +295,4 @@ }, "nbformat": 4, "nbformat_minor": 2 -} +} \ No newline at end of file diff --git a/docs/source/content/overview/modAL-in-a-nutshell.rst b/docs/source/content/overview/modAL-in-a-nutshell.rst index d0435e9..99dda7b 100644 --- a/docs/source/content/overview/modAL-in-a-nutshell.rst +++ b/docs/source/content/overview/modAL-in-a-nutshell.rst @@ -118,15 +118,13 @@ the *noisy sine* function: For active learning, we shall define a custom query strategy tailored to Gaussian processes. 
In a nutshell, a *query stategy* in modAL is a function taking (at least) two arguments (an estimator object and a pool -of examples), outputting the index of the queried instance and the -instance itself. In our case, the arguments are ``regressor`` and ``X``. +of examples), outputting the index of the queried instance. In our case, the arguments are ``regressor`` and ``X``. .. code:: python def GP_regression_std(regressor, X): _, std = regressor.predict(X, return_std=True) - query_idx = np.argmax(std) - return query_idx, X[query_idx] + return np.argmax(std) After setting up the query strategy and the data, the active learner can be initialized. diff --git a/examples/active_regression.py b/examples/active_regression.py index f4d1b4a..5426e5f 100644 --- a/examples/active_regression.py +++ b/examples/active_regression.py @@ -12,8 +12,7 @@ # query strategy for regression def GP_regression_std(regressor, X): _, std = regressor.predict(X, return_std=True) - query_idx = np.argmax(std) - return query_idx, X[query_idx] + return np.argmax(std) # generating the data diff --git a/examples/custom_query_strategies.py b/examples/custom_query_strategies.py index ec65d9b..6680457 100644 --- a/examples/custom_query_strategies.py +++ b/examples/custom_query_strategies.py @@ -5,18 +5,16 @@ The first two arguments of a query strategy function is always the estimator and the pool of instances to be queried from. Additional arguments are accepted as keyword arguments. -A valid query strategy function always returns a tuple of the indices of the queried -instances and the instances themselves. +A valid query strategy function always returns indices of the queried +instances. 
def custom_query_strategy(classifier, X, a_keyword_argument=42): # measure the utility of each instance in the pool utility = utility_measure(classifier, X) - # select the indices of the instances to be queried - query_idx = select_instances(utility) + # select and return the indices of the instances to be queried + return select_instances(utility) - # return the indices and the instances - return query_idx, X[query_idx] This function can be used in the active learning workflow. @@ -97,8 +95,7 @@ def custom_query_strategy(classifier, X, a_keyword_argument=42): # classifier uncertainty and classifier margin def custom_query_strategy(classifier, X, n_instances=1): utility = linear_combination(classifier, X) - query_idx = multi_argmax(utility, n_instances=n_instances) - return query_idx, X[query_idx] + return multi_argmax(utility, n_instances=n_instances) custom_query_learner = ActiveLearner( estimator=GaussianProcessClassifier(1.0 * RBF(1.0)), diff --git a/examples/deep_bayesian_active_learning.py b/examples/deep_bayesian_active_learning.py index 6ae5c4a..265c37d 100644 --- a/examples/deep_bayesian_active_learning.py +++ b/examples/deep_bayesian_active_learning.py @@ -62,12 +62,10 @@ def max_entropy(learner, X, n_instances=1, T=100): expected_p = np.mean(MC_samples, axis=0) acquisition = - np.sum(expected_p * np.log(expected_p + 1e-10), axis=-1) # [batch size] idx = (-acquisition).argsort()[:n_instances] - query_idx = random_subset[idx] - return query_idx, X[query_idx] + return random_subset[idx] def uniform(learner, X, n_instances=1): - query_idx = np.random.choice(range(len(X)), size=n_instances, replace=False) - return query_idx, X[query_idx] + return np.random.choice(range(len(X)), size=n_instances, replace=False) """ Training the ActiveLearner diff --git a/examples/shape_learning.py b/examples/shape_learning.py index 8487ce8..d856673 100644 --- a/examples/shape_learning.py +++ b/examples/shape_learning.py @@ -57,8 +57,7 @@ def random_sampling(classsifier, X): - 
query_idx = np.random.randint(len(X)) - return query_idx, X[query_idx] + return np.random.randint(len(X)) X_pool = deepcopy(X_full) diff --git a/modAL/acquisition.py b/modAL/acquisition.py index 8a97526..54fd0ab 100644 --- a/modAL/acquisition.py +++ b/modAL/acquisition.py @@ -104,7 +104,7 @@ def optimizer_UCB(optimizer: BaseLearner, X: modALinput, beta: float = 1) -> np. def max_PI(optimizer: BaseLearner, X: modALinput, tradeoff: float = 0, - n_instances: int = 1) -> Tuple[np.ndarray, modALinput]: + n_instances: int = 1) -> np.ndarray: """ Maximum PI query strategy. Selects the instance with highest probability of improvement. @@ -118,13 +118,11 @@ def max_PI(optimizer: BaseLearner, X: modALinput, tradeoff: float = 0, The indices of the instances from X chosen to be labelled; the instances from X chosen to be labelled. """ pi = optimizer_PI(optimizer, X, tradeoff=tradeoff) - query_idx = multi_argmax(pi, n_instances=n_instances) - - return query_idx, X[query_idx] + return multi_argmax(pi, n_instances=n_instances) def max_EI(optimizer: BaseLearner, X: modALinput, tradeoff: float = 0, - n_instances: int = 1) -> Tuple[np.ndarray, modALinput]: + n_instances: int = 1) -> np.ndarray: """ Maximum EI query strategy. Selects the instance with highest expected improvement. @@ -138,13 +136,11 @@ def max_EI(optimizer: BaseLearner, X: modALinput, tradeoff: float = 0, The indices of the instances from X chosen to be labelled; the instances from X chosen to be labelled. """ ei = optimizer_EI(optimizer, X, tradeoff=tradeoff) - query_idx = multi_argmax(ei, n_instances=n_instances) - - return query_idx, X[query_idx] + return multi_argmax(ei, n_instances=n_instances) def max_UCB(optimizer: BaseLearner, X: modALinput, beta: float = 1, - n_instances: int = 1) -> Tuple[np.ndarray, modALinput]: + n_instances: int = 1) -> np.ndarray: """ Maximum UCB query strategy. Selects the instance with highest upper confidence bound. 
@@ -158,6 +154,4 @@ def max_UCB(optimizer: BaseLearner, X: modALinput, beta: float = 1, The indices of the instances from X chosen to be labelled; the instances from X chosen to be labelled. """ ucb = optimizer_UCB(optimizer, X, beta=beta) - query_idx = multi_argmax(ucb, n_instances=n_instances) - - return query_idx, X[query_idx] + return multi_argmax(ucb, n_instances=n_instances) diff --git a/modAL/batch.py b/modAL/batch.py index ffe7a53..b8fab1a 100644 --- a/modAL/batch.py +++ b/modAL/batch.py @@ -114,7 +114,7 @@ def select_instance( unlabeled_indices = [i for i in range(n_pool) if mask[i]] best_instance_index = unlabeled_indices[best_instance_index_in_unlabeled] mask[best_instance_index] = 0 - return best_instance_index, np.expand_dims(X_pool[best_instance_index], axis=0), mask + return best_instance_index, X_pool[[best_instance_index]], mask def ranked_batch(classifier: Union[BaseLearner, BaseCommittee], @@ -142,11 +142,16 @@ def ranked_batch(classifier: Union[BaseLearner, BaseCommittee], """ # Make a local copy of our classifier's training data. # Define our record container and record the best cold start instance in the case of cold start. + + # transform unlabeled data if needed + if classifier.on_transformed: + unlabeled = classifier.transform_without_estimating(unlabeled) + if classifier.X_training is None: best_coldstart_instance_index, labeled = select_cold_start_instance(X=unlabeled, metric=metric, n_jobs=n_jobs) instance_index_ranking = [best_coldstart_instance_index] elif classifier.X_training.shape[0] > 0: - labeled = classifier.X_training[:] + labeled = classifier.Xt_training[:] if classifier.on_transformed else classifier.X_training[:] instance_index_ranking = [] # The maximum number of records to sample. 
@@ -180,7 +185,7 @@ def uncertainty_batch_sampling(classifier: Union[BaseLearner, BaseCommittee], metric: Union[str, Callable] = 'euclidean', n_jobs: Optional[int] = None, **uncertainty_measure_kwargs - ) -> Tuple[np.ndarray, Union[np.ndarray, sp.csr_matrix]]: + ) -> np.ndarray: """ Batch sampling query strategy. Selects the least sure instances for labelling. @@ -206,6 +211,6 @@ def uncertainty_batch_sampling(classifier: Union[BaseLearner, BaseCommittee], Indices of the instances from `X` chosen to be labelled; records from `X` chosen to be labelled. """ uncertainty = classifier_uncertainty(classifier, X, **uncertainty_measure_kwargs) - query_indices = ranked_batch(classifier, unlabeled=X, uncertainty_scores=uncertainty, + return ranked_batch(classifier, unlabeled=X, uncertainty_scores=uncertainty, n_instances=n_instances, metric=metric, n_jobs=n_jobs) - return query_indices, X[query_indices] + diff --git a/modAL/disagreement.py b/modAL/disagreement.py index 04e1f12..2d5e224 100644 --- a/modAL/disagreement.py +++ b/modAL/disagreement.py @@ -104,7 +104,7 @@ def KL_max_disagreement(committee: BaseCommittee, X: modALinput, **predict_proba def vote_entropy_sampling(committee: BaseCommittee, X: modALinput, n_instances: int = 1, random_tie_break=False, - **disagreement_measure_kwargs) -> Tuple[np.ndarray, modALinput]: + **disagreement_measure_kwargs) -> np.ndarray: """ Vote entropy sampling strategy. 
@@ -124,16 +124,14 @@ def vote_entropy_sampling(committee: BaseCommittee, X: modALinput, disagreement = vote_entropy(committee, X, **disagreement_measure_kwargs) if not random_tie_break: - query_idx = multi_argmax(disagreement, n_instances=n_instances) - else: - query_idx = shuffled_argmax(disagreement, n_instances=n_instances) + return multi_argmax(disagreement, n_instances=n_instances) - return query_idx, X[query_idx] + return shuffled_argmax(disagreement, n_instances=n_instances) def consensus_entropy_sampling(committee: BaseCommittee, X: modALinput, n_instances: int = 1, random_tie_break=False, - **disagreement_measure_kwargs) -> Tuple[np.ndarray, modALinput]: + **disagreement_measure_kwargs) -> np.ndarray: """ Consensus entropy sampling strategy. @@ -153,16 +151,14 @@ def consensus_entropy_sampling(committee: BaseCommittee, X: modALinput, disagreement = consensus_entropy(committee, X, **disagreement_measure_kwargs) if not random_tie_break: - query_idx = multi_argmax(disagreement, n_instances=n_instances) - else: - query_idx = shuffled_argmax(disagreement, n_instances=n_instances) + return multi_argmax(disagreement, n_instances=n_instances) - return query_idx, X[query_idx] + return shuffled_argmax(disagreement, n_instances=n_instances) def max_disagreement_sampling(committee: BaseCommittee, X: modALinput, n_instances: int = 1, random_tie_break=False, - **disagreement_measure_kwargs) -> Tuple[np.ndarray, modALinput]: + **disagreement_measure_kwargs) -> np.ndarray: """ Maximum disagreement sampling strategy. 
@@ -182,16 +178,14 @@ def max_disagreement_sampling(committee: BaseCommittee, X: modALinput, disagreement = KL_max_disagreement(committee, X, **disagreement_measure_kwargs) if not random_tie_break: - query_idx = multi_argmax(disagreement, n_instances=n_instances) - else: - query_idx = shuffled_argmax(disagreement, n_instances=n_instances) + return multi_argmax(disagreement, n_instances=n_instances) - return query_idx, X[query_idx] + return shuffled_argmax(disagreement, n_instances=n_instances) def max_std_sampling(regressor: BaseEstimator, X: modALinput, n_instances: int = 1, random_tie_break=False, - **predict_kwargs) -> Tuple[np.ndarray, modALinput]: + **predict_kwargs) -> np.ndarray: """ Regressor standard deviation sampling strategy. @@ -211,8 +205,6 @@ def max_std_sampling(regressor: BaseEstimator, X: modALinput, std = std.reshape(X.shape[0], ) if not random_tie_break: - query_idx = multi_argmax(std, n_instances=n_instances) - else: - query_idx = shuffled_argmax(std, n_instances=n_instances) + return multi_argmax(std, n_instances=n_instances) - return query_idx, X[query_idx] + return shuffled_argmax(std, n_instances=n_instances) diff --git a/modAL/expected_error.py b/modAL/expected_error.py index 31296d1..947e58c 100644 --- a/modAL/expected_error.py +++ b/modAL/expected_error.py @@ -10,14 +10,14 @@ from sklearn.exceptions import NotFittedError from modAL.models import ActiveLearner -from modAL.utils.data import modALinput, data_vstack +from modAL.utils.data import modALinput, data_vstack, enumerate_data, drop_rows from modAL.utils.selection import multi_argmax, shuffled_argmax from modAL.uncertainty import _proba_uncertainty, _proba_entropy def expected_error_reduction(learner: ActiveLearner, X: modALinput, loss: str = 'binary', p_subsample: np.float = 1.0, n_instances: int = 1, - random_tie_break: bool = False) -> Tuple[np.ndarray, modALinput]: + random_tie_break: bool = False) -> np.ndarray: """ Expected error reduction query strategy. 
@@ -52,17 +52,17 @@ def expected_error_reduction(learner: ActiveLearner, X: modALinput, loss: str = X_proba = learner.predict_proba(X) except NotFittedError: # TODO: implement a proper cold-start - return 0, X[0] + return np.array([0]) cloned_estimator = clone(learner.estimator) - for x_idx, x in enumerate(X): + for x_idx, x in enumerate_data(X): # subsample the data if needed if np.random.rand() <= p_subsample: - X_reduced = np.delete(X, x_idx, axis=0) + X_reduced = drop_rows(X, x_idx) # estimate the expected error for y_idx, y in enumerate(possible_labels): - X_new = data_vstack((learner.X_training, np.expand_dims(x, axis=0))) + X_new = data_vstack((learner.X_training, [x])) y_new = data_vstack((learner.y_training, np.array(y).reshape(1,))) cloned_estimator.fit(X_new, y_new) @@ -78,8 +78,6 @@ def expected_error_reduction(learner: ActiveLearner, X: modALinput, loss: str = expected_error[x_idx] = np.inf if not random_tie_break: - query_idx = multi_argmax(-expected_error, n_instances) - else: - query_idx = shuffled_argmax(-expected_error, n_instances) + return multi_argmax(-expected_error, n_instances) - return query_idx, X[query_idx] + return shuffled_argmax(-expected_error, n_instances) diff --git a/modAL/models/base.py b/modAL/models/base.py index dc0d35d..8871fea 100644 --- a/modAL/models/base.py +++ b/modAL/models/base.py @@ -5,14 +5,18 @@ import abc import sys +import warnings from typing import Union, Callable, Optional, Tuple, List, Iterator, Any import numpy as np from sklearn.base import BaseEstimator +from sklearn.ensemble._base import _BaseHeterogeneousEnsemble +from sklearn.pipeline import Pipeline from sklearn.utils import check_X_y -from modAL.utils.data import data_vstack, modALinput +import scipy.sparse as sp +from modAL.utils.data import data_vstack, modALinput, retrieve_rows if sys.version_info >= (3, 4): ABC = abc.ABC @@ -34,6 +38,8 @@ class BaseLearner(ABC, BaseEstimator): When False, accepts np.nan and np.inf values. 
bootstrap_init: If initial training data is available, bootstrapping can be done during the first training. Useful when building Committee models with bagging. + on_transformed: Whether to transform samples with the pipeline defined by the estimator + when applying the query strategy. **fit_kwargs: keyword arguments. Attributes: @@ -49,6 +55,7 @@ def __init__(self, X_training: Optional[modALinput] = None, y_training: Optional[modALinput] = None, bootstrap_init: bool = False, + on_transformed: bool = False, force_all_finite: bool = True, **fit_kwargs ) -> None: @@ -56,11 +63,14 @@ def __init__(self, self.estimator = estimator self.query_strategy = query_strategy + self.on_transformed = on_transformed self.X_training = X_training + self.Xt_training = None self.y_training = y_training if X_training is not None: self._fit_to_known(bootstrap=bootstrap_init, **fit_kwargs) + self.Xt_training = self.transform_without_estimating(self.X_training) if self.on_transformed else None assert isinstance(force_all_finite, bool), 'force_all_finite must be a bool' self.force_all_finite = force_all_finite @@ -82,15 +92,65 @@ def _add_training_data(self, X: modALinput, y: modALinput) -> None: if self.X_training is None: self.X_training = X + self.Xt_training = self.transform_without_estimating(self.X_training) if self.on_transformed else None self.y_training = y else: try: self.X_training = data_vstack((self.X_training, X)) + self.Xt_training = data_vstack(( + self.Xt_training, + self.transform_without_estimating(X) + )) if self.on_transformed else None self.y_training = data_vstack((self.y_training, y)) except ValueError: raise ValueError('the dimensions of the new training data and label must' 'agree with the training data and labels provided so far') + def transform_without_estimating(self, X: modALinput) -> Union[np.ndarray, sp.csr_matrix]: + """ + Transforms the data as supplied to the estimator. 
+ + * In case the estimator is an skearn pipeline, it applies all pipeline components but the last one. + * In case the estimator is an ensemble, it concatenates the transformations for each classfier + (pipeline) in the ensemble. + * Otherwise returns the non-transformed dataset X + Args: + X: dataset to be transformed + + Returns: + Transformed data set + """ + Xt = [] + pipes = [self.estimator] + + if isinstance(self.estimator, _BaseHeterogeneousEnsemble): + pipes = self.estimator.estimators_ + + ################################ + # transform data with pipelines used by estimator + for pipe in pipes: + if isinstance(pipe, Pipeline): + # NOTE: The used pipeline class might be an extension to sklearn's! + # Create a new instance of the used pipeline class with all + # components but the final estimator. + transformation_pipe = pipe.__class__(steps=pipe.steps[:-1]) + Xt.append(transformation_pipe.transform(X)) + + # in case no transformation pipelines are used by the estimator, + # return the original, non-transfored data + if not Xt: + return X + + ################################ + # concatenate all transformations and return + # TODO: maybe use a newly implemented data_hstack() instead + + # use sparse representation if any of the pipelines do + if any([isinstance(Xti, sp.csr_matrix) for Xti in Xt]): + return sp.hstack([sp.csc_matrix(Xti) for Xti in Xt]) + + return np.hstack(Xt) + def _fit_to_known(self, bootstrap: bool = False, **fit_kwargs) -> 'BaseLearner': """ Fits self.estimator to the training data and labels provided to it so far. @@ -185,11 +245,12 @@ def predict_proba(self, X: modALinput, **predict_proba_kwargs) -> Any: """ return self.estimator.predict_proba(X, **predict_proba_kwargs) - def query(self, *query_args, **query_kwargs) -> Union[Tuple, modALinput]: + def query(self, X_pool, *query_args, **query_kwargs) -> Union[Tuple, modALinput]: """ Finds the n_instances most informative point in the data provided by calling the query_strategy function. 
Args: + X_pool: Pool of unlabeled instances to retrieve most informative instances from *query_args: The arguments for the query strategy. For instance, in the case of :func:`~modAL.uncertainty.uncertainty_sampling`, it is the pool of samples from which the query strategy should choose instances to request labels. @@ -200,8 +261,15 @@ def query(self, *query_args, **query_kwargs) -> Union[Tuple, modALinput]: labelled and the instances themselves. Can be different in other cases, for instance only the instance to be labelled upon query synthesis. """ - query_result = self.query_strategy(self, *query_args, **query_kwargs) - return query_result + query_result = self.query_strategy(self, X_pool, *query_args, **query_kwargs) + + if isinstance(query_result, tuple): + warnings.warn("Query strategies should no longer return the selected instances, " + "this is now handled by the query method. " + "Please return only the indices of the selected instances.", DeprecationWarning) + return query_result + + return query_result, retrieve_rows(X_pool, query_result) def score(self, X: modALinput, y: modALinput, **score_kwargs) -> Any: """ @@ -301,11 +369,12 @@ def fit(self, X: modALinput, y: modALinput, **fit_kwargs) -> 'BaseCommittee': return self - def query(self, *query_args, **query_kwargs) -> Union[Tuple, modALinput]: + def query(self, X_pool, *query_args, **query_kwargs) -> Union[Tuple, modALinput]: """ Finds the n_instances most informative point in the data provided by calling the query_strategy function. Args: + X_pool: Pool of unlabeled instances to retrieve most informative instances from *query_args: The arguments for the query strategy. For instance, in the case of :func:`~modAL.disagreement.max_disagreement_sampling`, it is the pool of samples from which the query. strategy should choose instances to request labels. @@ -316,8 +385,15 @@ def query(self, *query_args, **query_kwargs) -> Union[Tuple, modALinput]: be labelled and the instances themselves. 
Can be different in other cases, for instance only the instance to be labelled upon query synthesis. """ - query_result = self.query_strategy(self, *query_args, **query_kwargs) - return query_result + query_result = self.query_strategy(self, X_pool, *query_args, **query_kwargs) + + if isinstance(query_result, tuple): + warnings.warn("Query strategies should no longer return the selected instances, " + "this is now handled by the query method. " + "Please return only the indices of the selected instances", DeprecationWarning) + return query_result + + return query_result, retrieve_rows(X_pool, query_result) def rebag(self, **fit_kwargs) -> None: """ diff --git a/modAL/models/learners.py b/modAL/models/learners.py index 7179d6c..4c0a812 100644 --- a/modAL/models/learners.py +++ b/modAL/models/learners.py @@ -30,6 +30,8 @@ class ActiveLearner(BaseLearner): y_training: Initial training labels corresponding to initial training samples. bootstrap_init: If initial training data is available, bootstrapping can be done during the first training. Useful when building Committee models with bagging. + on_transformed: Whether to transform samples with the pipeline defined by the estimator + when applying the query strategy. **fit_kwargs: keyword arguments. 
Attributes: @@ -73,10 +75,11 @@ def __init__(self, X_training: Optional[modALinput] = None, y_training: Optional[modALinput] = None, bootstrap_init: bool = False, + on_transformed: bool = False, **fit_kwargs ) -> None: super().__init__(estimator, query_strategy, - X_training, y_training, bootstrap_init, **fit_kwargs) + X_training, y_training, bootstrap_init, on_transformed, **fit_kwargs) def teach(self, X: modALinput, y: modALinput, bootstrap: bool = False, only_new: bool = False, **fit_kwargs) -> None: """ @@ -177,9 +180,10 @@ def __init__(self, X_training: Optional[modALinput] = None, y_training: Optional[modALinput] = None, bootstrap_init: bool = False, + on_transformed: bool = False, **fit_kwargs) -> None: super(BayesianOptimizer, self).__init__(estimator, query_strategy, - X_training, y_training, bootstrap_init, **fit_kwargs) + X_training, y_training, bootstrap_init, on_transformed, **fit_kwargs) # setting the maximum value if self.y_training is not None: max_idx = np.argmax(self.y_training) @@ -481,8 +485,7 @@ class CommitteeRegressor(BaseCommittee): >>> # query strategy for regression >>> def ensemble_regression_std(regressor, X): ... _, std = regressor.predict(X, return_std=True) - ... query_idx = np.argmax(std) - ... return query_idx, X[query_idx] + ... return np.argmax(std) >>> >>> # initializing the CommitteeRegressor >>> committee = CommitteeRegressor( diff --git a/modAL/multilabel.py b/modAL/multilabel.py index 28a7254..3a81ba1 100644 --- a/modAL/multilabel.py +++ b/modAL/multilabel.py @@ -43,7 +43,7 @@ def _SVM_loss(multiclass_classifier: ActiveLearner, def SVM_binary_minimum(classifier: ActiveLearner, X_pool: modALinput, - random_tie_break: bool = False) -> Tuple[np.ndarray, modALinput]: + random_tie_break: bool = False) -> np.ndarray: """ SVM binary minimum multilabel active learning strategy. 
For details see the paper Klaus Brinker, On Active Learning in Multi-label Classification @@ -67,15 +67,13 @@ def SVM_binary_minimum(classifier: ActiveLearner, X_pool: modALinput, min_abs_dist = np.min(np.abs(decision_function), axis=1) if not random_tie_break: - query_idx = np.argmin(min_abs_dist) - else: - query_idx = shuffled_argmax(min_abs_dist) + return np.argmin(min_abs_dist) - return query_idx, X_pool[query_idx] + return shuffled_argmax(min_abs_dist) def max_loss(classifier: OneVsRestClassifier, X_pool: modALinput, - n_instances: int = 1, random_tie_break: bool = False) -> Tuple[np.ndarray, modALinput]: + n_instances: int = 1, random_tie_break: bool = False) -> np.ndarray: """ Max Loss query strategy for SVM multilabel classification. @@ -103,15 +101,13 @@ def max_loss(classifier: OneVsRestClassifier, X_pool: modALinput, loss = _SVM_loss(classifier, X_pool, most_certain_classes=most_certain_classes) if not random_tie_break: - query_idx = multi_argmax(loss, n_instances) - else: - query_idx = shuffled_argmax(loss, n_instances) + return multi_argmax(loss, n_instances) - return query_idx, X_pool[query_idx] + return shuffled_argmax(loss, n_instances) def mean_max_loss(classifier: OneVsRestClassifier, X_pool: modALinput, - n_instances: int = 1, random_tie_break: bool = False) -> Tuple[np.ndarray, modALinput]: + n_instances: int = 1, random_tie_break: bool = False) -> np.ndarray: """ Mean Max Loss query strategy for SVM multilabel classification. 
@@ -136,15 +132,13 @@ def mean_max_loss(classifier: OneVsRestClassifier, X_pool: modALinput, loss = _SVM_loss(classifier, X_pool) if not random_tie_break: - query_idx = multi_argmax(loss, n_instances) - else: - query_idx = shuffled_argmax(loss, n_instances) + return multi_argmax(loss, n_instances) - return query_idx, X_pool[query_idx] + return shuffled_argmax(loss, n_instances) def min_confidence(classifier: OneVsRestClassifier, X_pool: modALinput, - n_instances: int = 1, random_tie_break: bool = False) -> Tuple[np.ndarray, modALinput]: + n_instances: int = 1, random_tie_break: bool = False) -> np.ndarray: """ MinConfidence query strategy for multilabel classification. @@ -167,15 +161,13 @@ def min_confidence(classifier: OneVsRestClassifier, X_pool: modALinput, classwise_min = np.min(classwise_confidence, axis=1) if not random_tie_break: - query_idx = multi_argmax(-classwise_min, n_instances) - else: - query_idx = shuffled_argmax(-classwise_min, n_instances) + return multi_argmax(-classwise_min, n_instances) - return query_idx, X_pool[query_idx] + return shuffled_argmax(-classwise_min, n_instances) def avg_confidence(classifier: OneVsRestClassifier, X_pool: modALinput, - n_instances: int = 1, random_tie_break: bool = False) -> Tuple[np.ndarray, modALinput]: + n_instances: int = 1, random_tie_break: bool = False) -> np.ndarray: """ AvgConfidence query strategy for multilabel classification. 
@@ -198,15 +190,13 @@ def avg_confidence(classifier: OneVsRestClassifier, X_pool: modALinput, classwise_mean = np.mean(classwise_confidence, axis=1) if not random_tie_break: - query_idx = multi_argmax(classwise_mean, n_instances) - else: - query_idx = shuffled_argmax(classwise_mean, n_instances) + return multi_argmax(classwise_mean, n_instances) - return query_idx, X_pool[query_idx] + return shuffled_argmax(classwise_mean, n_instances) def max_score(classifier: OneVsRestClassifier, X_pool: modALinput, - n_instances: int = 1, random_tie_break: bool = 1) -> Tuple[np.ndarray, modALinput]: + n_instances: int = 1, random_tie_break: bool = 1) -> np.ndarray: """ MaxScore query strategy for multilabel classification. @@ -231,15 +221,13 @@ def max_score(classifier: OneVsRestClassifier, X_pool: modALinput, classwise_max = np.max(classwise_scores, axis=1) if not random_tie_break: - query_idx = multi_argmax(classwise_max, n_instances) - else: - query_idx = shuffled_argmax(classwise_max, n_instances) + return multi_argmax(classwise_max, n_instances) - return query_idx, X_pool[query_idx] + return shuffled_argmax(classwise_max, n_instances) def avg_score(classifier: OneVsRestClassifier, X_pool: modALinput, - n_instances: int = 1, random_tie_break: bool = False) -> Tuple[np.ndarray, modALinput]: + n_instances: int = 1, random_tie_break: bool = False) -> np.ndarray: """ AvgScore query strategy for multilabel classification. 
@@ -264,8 +252,6 @@ def avg_score(classifier: OneVsRestClassifier, X_pool: modALinput, classwise_mean = np.mean(classwise_scores, axis=1) if not random_tie_break: - query_idx = multi_argmax(classwise_mean, n_instances) - else: - query_idx = shuffled_argmax(classwise_mean, n_instances) + return multi_argmax(classwise_mean, n_instances) - return query_idx, X_pool[query_idx] + return shuffled_argmax(classwise_mean, n_instances) diff --git a/modAL/uncertainty.py b/modAL/uncertainty.py index c11de43..e00d564 100644 --- a/modAL/uncertainty.py +++ b/modAL/uncertainty.py @@ -132,7 +132,7 @@ def classifier_entropy(classifier: BaseEstimator, X: modALinput, **predict_proba def uncertainty_sampling(classifier: BaseEstimator, X: modALinput, n_instances: int = 1, random_tie_break: bool = False, - **uncertainty_measure_kwargs) -> Tuple[np.ndarray, modALinput]: + **uncertainty_measure_kwargs) -> np.ndarray: """ Uncertainty sampling query strategy. Selects the least sure instances for labelling. @@ -152,16 +152,14 @@ def uncertainty_sampling(classifier: BaseEstimator, X: modALinput, uncertainty = classifier_uncertainty(classifier, X, **uncertainty_measure_kwargs) if not random_tie_break: - query_idx = multi_argmax(uncertainty, n_instances=n_instances) - else: - query_idx = shuffled_argmax(uncertainty, n_instances=n_instances) + return multi_argmax(uncertainty, n_instances=n_instances) - return query_idx, X[query_idx] + return shuffled_argmax(uncertainty, n_instances=n_instances) def margin_sampling(classifier: BaseEstimator, X: modALinput, n_instances: int = 1, random_tie_break: bool = False, - **uncertainty_measure_kwargs) -> Tuple[np.ndarray, modALinput]: + **uncertainty_measure_kwargs) -> np.ndarray: """ Margin sampling query strategy. Selects the instances where the difference between the first most likely and second most likely classes are the smallest. 
@@ -180,16 +178,14 @@ def margin_sampling(classifier: BaseEstimator, X: modALinput, margin = classifier_margin(classifier, X, **uncertainty_measure_kwargs) if not random_tie_break: - query_idx = multi_argmax(-margin, n_instances=n_instances) - else: - query_idx = shuffled_argmax(-margin, n_instances=n_instances) + return multi_argmax(-margin, n_instances=n_instances) - return query_idx, X[query_idx] + return shuffled_argmax(-margin, n_instances=n_instances) def entropy_sampling(classifier: BaseEstimator, X: modALinput, n_instances: int = 1, random_tie_break: bool = False, - **uncertainty_measure_kwargs) -> Tuple[np.ndarray, modALinput]: + **uncertainty_measure_kwargs) -> np.ndarray: """ Entropy sampling query strategy. Selects the instances where the class probabilities have the largest entropy. @@ -210,8 +206,6 @@ def entropy_sampling(classifier: BaseEstimator, X: modALinput, entropy = classifier_entropy(classifier, X, **uncertainty_measure_kwargs) if not random_tie_break: - query_idx = multi_argmax(entropy, n_instances=n_instances) - else: - query_idx = shuffled_argmax(entropy, n_instances=n_instances) + return multi_argmax(entropy, n_instances=n_instances) - return query_idx, X[query_idx] + return shuffled_argmax(entropy, n_instances=n_instances) diff --git a/modAL/utils/combination.py b/modAL/utils/combination.py index 98974ca..45eee2f 100644 --- a/modAL/utils/combination.py +++ b/modAL/utils/combination.py @@ -78,7 +78,6 @@ def make_query_strategy(utility_measure: Callable, selector: Callable) -> Callab """ def query_strategy(classifier: BaseEstimator, X: modALinput) -> Tuple: utility = utility_measure(classifier, X) - query_idx = selector(utility) - return query_idx, X[query_idx] + return selector(utility) return query_strategy diff --git a/modAL/utils/data.py b/modAL/utils/data.py index 48409b6..64cdeb0 100644 --- a/modAL/utils/data.py +++ b/modAL/utils/data.py @@ -1,11 +1,12 @@ -from typing import Union, Container +from typing import Union, Container, List 
from itertools import chain import numpy as np +import pandas as pd import scipy.sparse as sp -modALinput = Union[list, np.ndarray, sp.csr_matrix] +modALinput = Union[list, np.ndarray, sp.csr_matrix, pd.DataFrame] def data_vstack(blocks: Container) -> modALinput: @@ -24,8 +25,34 @@ def data_vstack(blocks: Container) -> modALinput: return list(chain(blocks)) elif sp.issparse(blocks[0]): return sp.vstack(blocks) + elif isinstance(blocks[0], pd.DataFrame): + return blocks[0].append(blocks[1]) else: try: return np.concatenate(blocks) except: raise TypeError('%s datatype is not supported' % type(blocks[0])) + + +def retrieve_rows(X: modALinput, + I: Union[int, List[int], np.ndarray]) -> Union[sp.csc_matrix, np.ndarray, pd.DataFrame]: + """ + Returns the rows I from the data set X + """ + if isinstance(X, pd.DataFrame): + return X.iloc[I] + + return X[I] + +def drop_rows(X: modALinput, + I: Union[int, List[int], np.ndarray]) -> Union[sp.csc_matrix, np.ndarray, pd.DataFrame]: + if isinstance(X, pd.DataFrame): + return X.drop(I, axis=0) + + return np.delete(X, I, axis=0) + +def enumerate_data(X: modALinput): + if isinstance(X, pd.DataFrame): + return X.iterrows() + + return enumerate(X) diff --git a/rtd_requirements.txt b/rtd_requirements.txt index 7d9fe89..76d0d38 100644 --- a/rtd_requirements.txt +++ b/rtd_requirements.txt @@ -3,3 +3,4 @@ scipy scikit-learn ipykernel nbsphinx +pandas diff --git a/setup.py b/setup.py index b48bfd8..c58ed7c 100644 --- a/setup.py +++ b/setup.py @@ -10,5 +10,5 @@ url='https://modAL-python.github.io/', packages=['modAL', 'modAL.models', 'modAL.utils'], classifiers=['Development Status :: 4 - Beta'], - install_requires=['numpy>=1.13', 'scikit-learn>=0.18', 'scipy>=0.18'], + install_requires=['numpy>=1.13', 'scikit-learn>=0.18', 'scipy>=0.18', 'pandas>=1.1.0'], ) diff --git a/tests/core_tests.py b/tests/core_tests.py index 1c80d7b..c4356e4 100644 --- a/tests/core_tests.py +++ b/tests/core_tests.py @@ -140,8 +140,7 @@ def 
test_make_query_strategy(self): query_1 = query_strategy(learner, X) query_2 = modAL.uncertainty.uncertainty_sampling(learner, X) - np.testing.assert_equal(query_1[0], query_2[0]) - np.testing.assert_almost_equal(query_1[1], query_2[1]) + np.testing.assert_equal(query_1, query_2) def test_data_vstack(self): for n_samples, n_features in product(range(1, 10), range(1, 10)): @@ -560,10 +559,10 @@ def test_uncertainty_sampling(self): predict_proba = np.random.rand(n_samples, n_classes) predict_proba[true_query_idx] = max_proba classifier = mock.MockEstimator(predict_proba_return=predict_proba) - query_idx, query_instance = modAL.uncertainty.uncertainty_sampling( + query_idx = modAL.uncertainty.uncertainty_sampling( classifier, np.random.rand(n_samples, n_classes) ) - shuffled_query_idx, shuffled_query_instance = modAL.uncertainty.uncertainty_sampling( + shuffled_query_idx = modAL.uncertainty.uncertainty_sampling( classifier, np.random.rand(n_samples, n_classes), random_tie_break=True ) @@ -577,10 +576,10 @@ def test_margin_sampling(self): predict_proba[:, 0] = 1.0 predict_proba[true_query_idx, 0] = 0.0 classifier = mock.MockEstimator(predict_proba_return=predict_proba) - query_idx, query_instance = modAL.uncertainty.margin_sampling( + query_idx = modAL.uncertainty.margin_sampling( classifier, np.random.rand(n_samples, n_classes) ) - shuffled_query_idx, shuffled_query_instance = modAL.uncertainty.margin_sampling( + shuffled_query_idx = modAL.uncertainty.margin_sampling( classifier, np.random.rand(n_samples, n_classes), random_tie_break=True ) @@ -595,10 +594,10 @@ def test_entropy_sampling(self): predict_proba[:, 0] = 1.0 predict_proba[true_query_idx] = max_proba classifier = mock.MockEstimator(predict_proba_return=predict_proba) - query_idx, query_instance = modAL.uncertainty.entropy_sampling( + query_idx = modAL.uncertainty.entropy_sampling( classifier, np.random.rand(n_samples, n_classes) ) - shuffled_query_idx, shuffled_query_instance = 
modAL.uncertainty.entropy_sampling( + shuffled_query_idx = modAL.uncertainty.entropy_sampling( classifier, np.random.rand(n_samples, n_classes), random_tie_break=True ) @@ -698,7 +697,7 @@ def test_query(self): for n_features in range(1, 10): X = np.random.rand(n_samples, n_features) query_idx = np.random.randint(0, n_samples) - mock_query = mock.MockFunction(return_val=(query_idx, X[query_idx])) + mock_query = mock.MockFunction(return_val=query_idx) learner = modAL.models.learners.ActiveLearner( estimator=None, query_strategy=mock_query @@ -1107,4 +1106,3 @@ def test_examples(self): if __name__ == '__main__': unittest.main(verbosity=2) -0 \ No newline at end of file diff --git a/tests/example_tests/custom_query_strategies.py b/tests/example_tests/custom_query_strategies.py index 1181bab..441814a 100644 --- a/tests/example_tests/custom_query_strategies.py +++ b/tests/example_tests/custom_query_strategies.py @@ -42,8 +42,7 @@ # classifier uncertainty and classifier margin def custom_query_strategy(classifier, X, n_instances=1): utility = linear_combination(classifier, X) - query_idx = multi_argmax(utility, n_instances=n_instances) - return query_idx, X[query_idx] + return multi_argmax(utility, n_instances=n_instances) custom_query_learner = ActiveLearner( estimator=GaussianProcessClassifier(1.0 * RBF(1.0)), From 1ad79fecb074233a4fbe999c5a60e704fdf5f0a1 Mon Sep 17 00:00:00 2001 From: Boyan Hristov Date: Fri, 25 Sep 2020 11:24:11 +0200 Subject: [PATCH 091/182] #104 - added on_transformed support to BaseCommittee --- modAL/models/base.py | 26 +++++++++++++++++--------- modAL/utils/data.py | 26 +++++++++++++++++++++++--- 2 files changed, 40 insertions(+), 12 deletions(-) diff --git a/modAL/models/base.py b/modAL/models/base.py index 8871fea..7787eed 100644 --- a/modAL/models/base.py +++ b/modAL/models/base.py @@ -16,7 +16,7 @@ import scipy.sparse as sp -from modAL.utils.data import data_vstack, modALinput, retrieve_rows +from modAL.utils.data import data_vstack, 
data_hstack, modALinput, retrieve_rows if sys.version_info >= (3, 4): ABC = abc.ABC @@ -143,13 +143,7 @@ def transform_without_estimating(self, X: modALinput) -> Union[np.ndarray, sp.cs ################################ # concatenate all transformations and return - # TODO: maybe use a newly implemented data_hstack() instead - - # use sparse representation if any of the pipelines do - if any([isinstance(Xti, sp.csr_matrix) for Xti in Xt]): - return sp.hstack([sp.csc_matrix(Xti) for Xti in Xt]) - - return np.hstack(Xt) + return data_hstack(Xt) def _fit_to_known(self, bootstrap: bool = False, **fit_kwargs) -> 'BaseLearner': """ @@ -297,12 +291,15 @@ class BaseCommittee(ABC, BaseEstimator): Args: learner_list: List of ActiveLearner objects to form committee. query_strategy: Function to query labels. + on_transformed: Whether to transform samples with the pipeline defined by each learner's estimator + when applying the query strategy. """ - def __init__(self, learner_list: List[BaseLearner], query_strategy: Callable) -> None: + def __init__(self, learner_list: List[BaseLearner], query_strategy: Callable, on_transformed: bool = False) -> None: assert type(learner_list) == list, 'learners must be supplied in a list' self.learner_list = learner_list self.query_strategy = query_strategy + self.on_transformed = on_transformed def __iter__(self) -> Iterator[BaseLearner]: for learner in self.learner_list: @@ -369,6 +366,17 @@ def fit(self, X: modALinput, y: modALinput, **fit_kwargs) -> 'BaseCommittee': return self + def transform_without_estimating(self, X: modALinput) -> Union[np.ndarray, sp.csr_matrix]: + """ + Transforms the data as supplied to each learner's estimator and concatenates transformations. 
+ Args: + X: dataset to be transformed + + Returns: + Transformed data set + """ + return data_hstack([learner.transform_without_estimating(X) for learner in self.learner_list]) + def query(self, X_pool, *query_args, **query_kwargs) -> Union[Tuple, modALinput]: """ Finds the n_instances most informative point in the data provided by calling the query_strategy function. diff --git a/modAL/utils/data.py b/modAL/utils/data.py index 64cdeb0..77c0530 100644 --- a/modAL/utils/data.py +++ b/modAL/utils/data.py @@ -1,4 +1,4 @@ -from typing import Union, Container, List +from typing import Union, List, Sequence from itertools import chain import numpy as np @@ -9,9 +9,9 @@ modALinput = Union[list, np.ndarray, sp.csr_matrix, pd.DataFrame] -def data_vstack(blocks: Container) -> modALinput: +def data_vstack(blocks: Sequence[modALinput]) -> modALinput: """ - Stack vertically both sparse and dense arrays. + Stack vertically sparse/dense arrays and pandas data frames. Args: blocks: Sequence of modALinput objects. @@ -34,6 +34,26 @@ def data_vstack(blocks: Container) -> modALinput: raise TypeError('%s datatype is not supported' % type(blocks[0])) +def data_hstack(blocks: Sequence[modALinput]) -> modALinput: + """ + Stack horizontally both sparse and dense arrays + + Args: + blocks: Sequence of modALinput objects. + + Returns: + New sequence of horizontally stacked elements. 
+ """ + # use sparse representation if any of the blocks do + if any([sp.issparse(b) for b in blocks]): + return sp.hstack(blocks) + + try: + return np.hstack(blocks) + except: + raise TypeError('%s datatype is not supported' % type(blocks[0])) + + def retrieve_rows(X: modALinput, I: Union[int, List[int], np.ndarray]) -> Union[sp.csc_matrix, np.ndarray, pd.DataFrame]: """ From 171e2e956d5107b1136cf18fb177e2dd7743d9e6 Mon Sep 17 00:00:00 2001 From: Boyan Hristov Date: Mon, 28 Sep 2020 15:48:32 +0200 Subject: [PATCH 092/182] #20, #104 - added tests for on_transformed functionality and pandas support; small fixes --- modAL/models/base.py | 2 + modAL/models/learners.py | 20 +++-- tests/core_tests.py | 173 +++++++++++++++++++++++++++++++++++++++ 3 files changed, 188 insertions(+), 7 deletions(-) diff --git a/modAL/models/base.py b/modAL/models/base.py index 7787eed..710fed3 100644 --- a/modAL/models/base.py +++ b/modAL/models/base.py @@ -300,6 +300,8 @@ def __init__(self, learner_list: List[BaseLearner], query_strategy: Callable, on self.learner_list = learner_list self.query_strategy = query_strategy self.on_transformed = on_transformed + # TODO: update training data when using fit() and teach() methods + self.X_training = None def __iter__(self) -> Iterator[BaseLearner]: for learner in self.learner_list: diff --git a/modAL/models/learners.py b/modAL/models/learners.py index 4c0a812..9af43a2 100644 --- a/modAL/models/learners.py +++ b/modAL/models/learners.py @@ -7,7 +7,7 @@ from modAL.models.base import BaseLearner, BaseCommittee from modAL.utils.validation import check_class_labels, check_class_proba -from modAL.utils.data import modALinput +from modAL.utils.data import modALinput, retrieve_rows from modAL.uncertainty import uncertainty_sampling from modAL.disagreement import vote_entropy_sampling, max_std_sampling from modAL.acquisition import max_EI @@ -187,7 +187,7 @@ def __init__(self, # setting the maximum value if self.y_training is not None: max_idx = 
np.argmax(self.y_training) - self.X_max = self.X_training[max_idx] + self.X_max = retrieve_rows(self.X_training, max_idx) self.y_max = self.y_training[max_idx] else: self.X_max = None @@ -198,7 +198,7 @@ def _set_max(self, X: modALinput, y: modALinput) -> None: y_max = y[max_idx] if y_max > self.y_max: self.y_max = y_max - self.X_max = X[max_idx] + self.X_max = retrieve_rows(X, max_idx) def get_max(self) -> Tuple: """ @@ -248,6 +248,8 @@ class Committee(BaseCommittee): learner_list: A list of ActiveLearners forming the Committee. query_strategy: Query strategy function. Committee supports disagreement-based query strategies from :mod:`modAL.disagreement`, but uncertainty-based ones from :mod:`modAL.uncertainty` are also supported. + on_transformed: Whether to transform samples with the pipeline defined by each learner's estimator + when applying the query strategy. Attributes: classes_: Class labels known by the Committee. @@ -288,8 +290,9 @@ class Committee(BaseCommittee): ... y=iris['target'][query_idx].reshape(1, ) ... ) """ - def __init__(self, learner_list: List[ActiveLearner], query_strategy: Callable = vote_entropy_sampling) -> None: - super().__init__(learner_list, query_strategy) + def __init__(self, learner_list: List[ActiveLearner], query_strategy: Callable = vote_entropy_sampling, + on_transformed: bool = False) -> None: + super().__init__(learner_list, query_strategy, on_transformed) self._set_classes() def _set_classes(self): @@ -456,6 +459,8 @@ class CommitteeRegressor(BaseCommittee): Args: learner_list: A list of ActiveLearners forming the CommitteeRegressor. query_strategy: Query strategy function. + on_transformed: Whether to transform samples with the pipeline defined by each learner's estimator + when applying the query strategy. Examples: @@ -499,8 +504,9 @@ class CommitteeRegressor(BaseCommittee): ... query_idx, query_instance = committee.query(X.reshape(-1, 1)) ... 
committee.teach(X[query_idx].reshape(-1, 1), y[query_idx].reshape(-1, 1)) """ - def __init__(self, learner_list: List[ActiveLearner], query_strategy: Callable = max_std_sampling) -> None: - super().__init__(learner_list, query_strategy) + def __init__(self, learner_list: List[ActiveLearner], query_strategy: Callable = max_std_sampling, + on_transformed: bool = False) -> None: + super().__init__(learner_list, query_strategy, on_transformed) def predict(self, X: modALinput, return_std: bool = False, **predict_kwargs) -> Any: """ diff --git a/tests/core_tests.py b/tests/core_tests.py index c4356e4..ba83cee 100644 --- a/tests/core_tests.py +++ b/tests/core_tests.py @@ -1,6 +1,7 @@ import random import unittest import numpy as np +import pandas as pd import mock import modAL.models.base @@ -26,6 +27,8 @@ from sklearn.metrics import confusion_matrix from sklearn.svm import SVC from sklearn.multiclass import OneVsRestClassifier +from sklearn.pipeline import make_pipeline +from sklearn.preprocessing import FunctionTransformer from scipy.stats import entropy, norm from scipy.special import ndtr from scipy import sparse as sp @@ -788,6 +791,68 @@ def test_sparse_matrices(self): query_idx, query_inst = learner.query(X_pool) learner.teach(X_pool[query_idx], y_pool[query_idx]) + def test_on_transformed(self): + n_samples = 10 + n_features = 5 + query_strategies = [ + modAL.batch.uncertainty_batch_sampling + # add further strategies which work with instance representations + # no further ones as of 25.09.2020 + ] + X_pool = np.random.rand(n_samples, n_features) + + # use pandas data frame as X_pool, which will be transformed back to numpy with sklearn pipeline + X_pool = pd.DataFrame(X_pool) + + y_pool = np.random.randint(0, 2, size=(n_samples,)) + train_idx = np.random.choice(range(n_samples), size=2, replace=False) + + for query_strategy in query_strategies: + learner = modAL.models.learners.ActiveLearner( + estimator=make_pipeline( + 
FunctionTransformer(func=pd.DataFrame.to_numpy), + RandomForestClassifier(n_estimators=10) + ), + query_strategy=query_strategy, + X_training=X_pool.iloc[train_idx], + y_training=y_pool[train_idx], + on_transformed=True + ) + query_idx, query_inst = learner.query(X_pool) + learner.teach(X_pool.iloc[query_idx], y_pool[query_idx]) + + def test_old_query_strategy_interface(self): + n_samples = 10 + n_features = 5 + X_pool = np.random.rand(n_samples, n_features) + y_pool = np.random.randint(0, 2, size=(n_samples,)) + + # defining a custom query strategy also returning the selected instance + # make sure even if a query strategy works in some funny way + # (e.g. instance not matching instance index), + # the old interface remains unchanged + query_idx_ = np.random.choice(n_samples, 2) + query_instance_ = X_pool[(query_idx_ + 1) % len(X_pool)] + + def custom_query_strategy(classifier, X): + return query_idx_, query_instance_ + + + train_idx = np.random.choice(range(n_samples), size=2, replace=False) + custom_query_learner = modAL.models.learners.ActiveLearner( + estimator=RandomForestClassifier(n_estimators=10), + query_strategy=custom_query_strategy, + X_training=X_pool[train_idx], y_training=y_pool[train_idx] + ) + + query_idx, query_instance = custom_query_learner.query(X_pool) + custom_query_learner.teach( + X=X_pool[query_idx], + y=y_pool[query_idx] + ) + np.testing.assert_equal(query_idx, query_idx_) + np.testing.assert_equal(query_instance, query_instance_) + class TestBayesianOptimizer(unittest.TestCase): def test_set_max(self): @@ -897,6 +962,39 @@ def test_teach(self): ) learner.teach(X, y, bootstrap=bootstrap, only_new=only_new) + def test_on_transformed(self): + n_samples = 10 + n_features = 5 + query_strategies = [ + # TODO remove, added just to make sure on_transformed doesn't break anything + # but it has no influence on this strategy, nothing special tested here + mock.MockFunction(return_val=[np.random.randint(0, n_samples)]) + + # add further strategies 
which work with instance representations + # no further ones as of 25.09.2020 + ] + X_pool = np.random.rand(n_samples, n_features) + + # use pandas data frame as X_pool, which will be transformed back to numpy with sklearn pipeline + X_pool = pd.DataFrame(X_pool) + + y_pool = np.random.rand(n_samples) + train_idx = np.random.choice(range(n_samples), size=2, replace=False) + + for query_strategy in query_strategies: + learner = modAL.models.learners.BayesianOptimizer( + estimator=make_pipeline( + FunctionTransformer(func=pd.DataFrame.to_numpy), + GaussianProcessRegressor() + ), + query_strategy=query_strategy, + X_training=X_pool.iloc[train_idx], + y_training=y_pool[train_idx], + on_transformed=True + ) + query_idx, query_inst = learner.query(X_pool) + learner.teach(X_pool.iloc[query_idx], y_pool[query_idx]) + class TestCommittee(unittest.TestCase): @@ -1007,6 +1105,42 @@ def test_teach(self): committee.teach(X, y, bootstrap=bootstrap, only_new=only_new) + def test_on_transformed(self): + n_samples = 10 + n_features = 5 + query_strategies = [ + modAL.batch.uncertainty_batch_sampling + # add further strategies which work with instance representations + # no further ones as of 25.09.2020 + ] + X_pool = np.random.rand(n_samples, n_features) + + # use pandas data frame as X_pool, which will be transformed back to numpy with sklearn pipeline + X_pool = pd.DataFrame(X_pool) + + y_pool = np.random.randint(0, 2, size=(n_samples,)) + train_idx = np.random.choice(range(n_samples), size=5, replace=False) + + learner_list = [modAL.models.learners.ActiveLearner( + estimator=make_pipeline( + FunctionTransformer(func=pd.DataFrame.to_numpy), + RandomForestClassifier(n_estimators=10) + ), + # committee learners can contain different amounts of + # different instances + X_training=X_pool.iloc[train_idx[(np.arange(i + 1) + i) % len(train_idx)]], + y_training=y_pool[train_idx[(np.arange(i + 1) + i) % len(train_idx)]], + ) for i in range(3)] + + for query_strategy in query_strategies: + 
committee = modAL.models.learners.Committee( + learner_list=learner_list, + query_strategy=query_strategy, + on_transformed=True + ) + query_idx, query_inst = committee.query(X_pool) + committee.teach(X_pool.iloc[query_idx], y_pool[query_idx]) + class TestCommitteeRegressor(unittest.TestCase): @@ -1040,6 +1174,45 @@ def test_vote(self): vote_output ) + def test_on_transformed(self): + n_samples = 10 + n_features = 5 + query_strategies = [ + # TODO remove, added just to make sure on_transformed doesn't break anything + # but it has no influence on this strategy, nothing special tested here + mock.MockFunction(return_val=[np.random.randint(0, n_samples)]) + + # add further strategies which work with instance representations + # no further ones as of 25.09.2020 + ] + X_pool = np.random.rand(n_samples, n_features) + + # use pandas data frame as X_pool, which will be transformed back to numpy with sklearn pipeline + X_pool = pd.DataFrame(X_pool) + + y_pool = np.random.rand(n_samples) + train_idx = np.random.choice(range(n_samples), size=2, replace=False) + + learner_list = [modAL.models.learners.ActiveLearner( + estimator=make_pipeline( + FunctionTransformer(func=pd.DataFrame.to_numpy), + GaussianProcessRegressor() + ), + # committee learners can contain different amounts of + # different instances + X_training=X_pool.iloc[train_idx[(np.arange(i + 1) + i) % len(train_idx)]], + y_training=y_pool[train_idx[(np.arange(i + 1) + i) % len(train_idx)]], + ) for i in range(3)] + + for query_strategy in query_strategies: + committee = modAL.models.learners.CommitteeRegressor( + learner_list=learner_list, + query_strategy=query_strategy, + on_transformed=True + ) + query_idx, query_inst = committee.query(X_pool) + committee.teach(X_pool.iloc[query_idx], y_pool[query_idx]) + class TestMultilabel(unittest.TestCase): def test_SVM_loss(self): From 942d045638e0797f9fe0b86e5ee5e5ae6d1e846f Mon Sep 17 00:00:00 2001 From: Boyan Hristov Date: Tue, 29 Sep 2020 15:39:28 +0200 Subject: 
[PATCH 093/182] #104 - FIXED: now saving transformed data in fit() of base learner --- modAL/models/base.py | 1 + 1 file changed, 1 insertion(+) diff --git a/modAL/models/base.py b/modAL/models/base.py index 710fed3..9e9949f 100644 --- a/modAL/models/base.py +++ b/modAL/models/base.py @@ -211,6 +211,7 @@ def fit(self, X: modALinput, y: modALinput, bootstrap: bool = False, **fit_kwarg check_X_y(X, y, accept_sparse=True, ensure_2d=False, allow_nd=True, multi_output=True, dtype=None, force_all_finite=self.force_all_finite) self.X_training, self.y_training = X, y + self.Xt_training = self.transform_without_estimating(self.X_training) if self.on_transformed else None return self._fit_to_known(bootstrap=bootstrap, **fit_kwargs) def predict(self, X: modALinput, **predict_kwargs) -> Any: From 68f8878146ad3b92d4d22642c2acc0ebd7b2f0f0 Mon Sep 17 00:00:00 2001 From: Boyan Hristov Date: Thu, 8 Oct 2020 22:21:26 +0200 Subject: [PATCH 094/182] #104 - added an empty pipe at the end of the transformation pipeline to prevent weird handling of last transformation pipe, which is usually expected to be an estimator --- modAL/models/base.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/modAL/models/base.py b/modAL/models/base.py index 9e9949f..4474a0e 100644 --- a/modAL/models/base.py +++ b/modAL/models/base.py @@ -132,8 +132,10 @@ def transform_without_estimating(self, X: modALinput) -> Union[np.ndarray, sp.cs if isinstance(pipe, Pipeline): # NOTE: The used pipeline class might be an extension to sklearn's! # Create a new instance of the used pipeline class with all - # components but the final estimator. - transformation_pipe = pipe.__class__(steps=pipe.steps[:-1]) + # components but the final estimator, which is replaced by an empty (passthrough) component. + # This prevents any special handling of the final transformation pipe, which is usually + # expected to be an estimator. 
+ transformation_pipe = pipe.__class__(steps=[*pipe.steps[:-1], ('passthrough', 'passthrough')]) Xt.append(transformation_pipe.transform(X)) # in case no transformation pipelines are used by the estimator, From 143067c9b8ffe094efe610e6353208b55ea040a3 Mon Sep 17 00:00:00 2001 From: Boyan Hristov Date: Fri, 16 Oct 2020 21:01:04 +0200 Subject: [PATCH 095/182] #20 - fixed scipy.sparse support in expected_error, fixed issues from code review --- modAL/expected_error.py | 9 ++- modAL/utils/data.py | 126 ++++++++++++++++++++++++++++++++-------- tests/core_tests.py | 33 ++++++----- 3 files changed, 123 insertions(+), 45 deletions(-) diff --git a/modAL/expected_error.py b/modAL/expected_error.py index 947e58c..f362ecd 100644 --- a/modAL/expected_error.py +++ b/modAL/expected_error.py @@ -10,7 +10,7 @@ from sklearn.exceptions import NotFittedError from modAL.models import ActiveLearner -from modAL.utils.data import modALinput, data_vstack, enumerate_data, drop_rows +from modAL.utils.data import modALinput, data_vstack, enumerate_data, drop_rows, data_shape, add_row from modAL.utils.selection import multi_argmax, shuffled_argmax from modAL.uncertainty import _proba_uncertainty, _proba_entropy @@ -38,14 +38,13 @@ def expected_error_reduction(learner: ActiveLearner, X: modALinput, loss: str = Returns: - The indices of the instances from X chosen to be labelled; - the instances from X chosen to be labelled. + The indices of the instances from X chosen to be labelled. 
""" assert 0.0 <= p_subsample <= 1.0, 'p_subsample subsampling keep ratio must be between 0.0 and 1.0' assert loss in ['binary', 'log'], 'loss must be \'binary\' or \'log\'' - expected_error = np.zeros(shape=(len(X), )) + expected_error = np.zeros(shape=(data_shape(X)[0],)) possible_labels = np.unique(learner.y_training) try: @@ -62,7 +61,7 @@ def expected_error_reduction(learner: ActiveLearner, X: modALinput, loss: str = X_reduced = drop_rows(X, x_idx) # estimate the expected error for y_idx, y in enumerate(possible_labels): - X_new = data_vstack((learner.X_training, [x])) + X_new = add_row(learner.X_training, x) y_new = data_vstack((learner.y_training, np.array(y).reshape(1,))) cloned_estimator.fit(X_new, y_new) diff --git a/modAL/utils/data.py b/modAL/utils/data.py index 77c0530..a24e1c7 100644 --- a/modAL/utils/data.py +++ b/modAL/utils/data.py @@ -1,12 +1,11 @@ from typing import Union, List, Sequence -from itertools import chain import numpy as np import pandas as pd import scipy.sparse as sp -modALinput = Union[list, np.ndarray, sp.csr_matrix, pd.DataFrame] +modALinput = Union[sp.csr_matrix, pd.DataFrame, np.ndarray, list] def data_vstack(blocks: Sequence[modALinput]) -> modALinput: @@ -19,24 +18,21 @@ def data_vstack(blocks: Sequence[modALinput]) -> modALinput: Returns: New sequence of vertically stacked elements. 
""" - if isinstance(blocks[0], np.ndarray): - return np.concatenate(blocks) - elif isinstance(blocks[0], list): - return list(chain(blocks)) - elif sp.issparse(blocks[0]): + if any([sp.issparse(b) for b in blocks]): return sp.vstack(blocks) elif isinstance(blocks[0], pd.DataFrame): - return blocks[0].append(blocks[1]) - else: - try: - return np.concatenate(blocks) - except: - raise TypeError('%s datatype is not supported' % type(blocks[0])) + return blocks[0].append(blocks[1:]) + elif isinstance(blocks[0], np.ndarray): + return np.concatenate(blocks) + elif isinstance(blocks[0], list): + return np.concatenate(blocks).tolist() + + raise TypeError('%s datatype is not supported' % type(blocks[0])) def data_hstack(blocks: Sequence[modALinput]) -> modALinput: """ - Stack horizontally both sparse and dense arrays + Stack horizontally sparse/dense arrays and pandas data frames. Args: blocks: Sequence of modALinput objects. @@ -44,35 +40,115 @@ def data_hstack(blocks: Sequence[modALinput]) -> modALinput: Returns: New sequence of horizontally stacked elements. 
""" - # use sparse representation if any of the blocks do if any([sp.issparse(b) for b in blocks]): return sp.hstack(blocks) - - try: + elif isinstance(blocks[0], pd.DataFrame): + pd.concat(blocks, axis=1) + elif isinstance(blocks[0], np.ndarray): return np.hstack(blocks) - except: - raise TypeError('%s datatype is not supported' % type(blocks[0])) + elif isinstance(blocks[0], list): + return np.hstack(blocks).tolist() + + TypeError('%s datatype is not supported' % type(blocks[0])) + + +def add_row(X:modALinput, row: modALinput): + """ + Returns X' = + + [X + + row] + """ + if isinstance(X, np.ndarray): + return np.vstack((X, row)) + elif isinstance(X, list): + return np.vstack((X, row)).tolist() + + # data_vstack readily supports stacking of matrix as first argument + # and row as second for the other data types + return data_vstack([X, row]) def retrieve_rows(X: modALinput, I: Union[int, List[int], np.ndarray]) -> Union[sp.csc_matrix, np.ndarray, pd.DataFrame]: """ Returns the rows I from the data set X + + For a single index, the result is as follows: + * 1xM matrix in case of scipy sparse NxM matrix X + * pandas series in case of a pandas data frame + * row in case of list or numpy format """ - if isinstance(X, pd.DataFrame): + if sp.issparse(X): + # Out of the sparse matrix formats (sp.csc_matrix, sp.csr_matrix, sp.bsr_matrix, + # sp.lil_matrix, sp.dok_matrix, sp.coo_matrix, sp.dia_matrix), only sp.bsr_matrix, sp.coo_matrix + # and sp.dia_matrix don't support indexing and need to be converted to a sparse format + # that does support indexing. It seems conversion to CSR is currently most efficient. 
+ + try: + return X[I] + except: + sp_format = X.getformat() + return X.tocsr()[I].asformat(sp_format) + elif isinstance(X, pd.DataFrame): return X.iloc[I] + elif isinstance(X, np.ndarray): + return X[I] + elif isinstance(X, list): + return np.array(X)[I].tolist() + + raise TypeError('%s datatype is not supported' % type(X)) - return X[I] def drop_rows(X: modALinput, I: Union[int, List[int], np.ndarray]) -> Union[sp.csc_matrix, np.ndarray, pd.DataFrame]: - if isinstance(X, pd.DataFrame): + """ + Returns X without the row(s) at index/indices I + """ + if sp.issparse(X): + mask = np.ones(X.shape[0], dtype=bool) + mask[I] = False + return retrieve_rows(X, mask) + elif isinstance(X, pd.DataFrame): return X.drop(I, axis=0) + elif isinstance(X, np.ndarray): + return np.delete(X, I, axis=0) + elif isinstance(X, list): + return np.delete(X, I, axis=0).tolist() + + raise TypeError('%s datatype is not supported' % type(X)) - return np.delete(X, I, axis=0) def enumerate_data(X: modALinput): - if isinstance(X, pd.DataFrame): + """ + for i, x in enumerate_data(X): + + Depending on the data type of X, returns: + + * A 1xM matrix in case of scipy sparse NxM matrix X + * pandas series in case of a pandas data frame X + * row in case of list or numpy format + """ + if sp.issparse(X): + return enumerate(X.tocsr()) + elif isinstance(X, pd.DataFrame): return X.iterrows() + elif isinstance(X, np.ndarray) or isinstance(X, list): + # numpy arrays and lists can readily be enumerated + return enumerate(X) + + raise TypeError('%s datatype is not supported' % type(X)) + + +def data_shape(X: modALinput): + """ + Returns the shape of the data set X + """ + if sp.issparse(X) or isinstance(X, pd.DataFrame) or isinstance(X, np.ndarray): + # scipy.sparse, pandas and numpy all support .shape + return X.shape + elif isinstance(X, list): + return np.array(X).shape - return enumerate(X) + raise TypeError('%s datatype is not supported' % type(X)) diff --git a/tests/core_tests.py b/tests/core_tests.py 
index ba83cee..499b49b 100644 --- a/tests/core_tests.py +++ b/tests/core_tests.py @@ -457,21 +457,24 @@ def test_max_std_sampling(self): class TestEER(unittest.TestCase): def test_eer(self): for n_pool, n_features, n_classes in product(range(5, 10), range(1, 5), range(2, 5)): - X_training, y_training = np.random.rand(10, n_features), np.random.randint(0, n_classes, size=10) - X_pool, y_pool = np.random.rand(n_pool, n_features), np.random.randint(0, n_classes+1, size=n_pool) - - learner = modAL.models.ActiveLearner(RandomForestClassifier(n_estimators=2), - X_training=X_training, y_training=y_training) - - modAL.expected_error.expected_error_reduction(learner, X_pool) - modAL.expected_error.expected_error_reduction(learner, X_pool, random_tie_break=True) - modAL.expected_error.expected_error_reduction(learner, X_pool, p_subsample=0.1) - modAL.expected_error.expected_error_reduction(learner, X_pool, loss='binary') - modAL.expected_error.expected_error_reduction(learner, X_pool, p_subsample=0.1, loss='log') - self.assertRaises(AssertionError, modAL.expected_error.expected_error_reduction, - learner, X_pool, p_subsample=1.5) - self.assertRaises(AssertionError, modAL.expected_error.expected_error_reduction, - learner, X_pool, loss=42) + X_training_, y_training = np.random.rand(10, n_features).tolist(), np.random.randint(0, n_classes, size=10) + X_pool_, y_pool = np.random.rand(n_pool, n_features).tolist(), np.random.randint(0, n_classes+1, size=n_pool) + + for data_type in (sp.csr_matrix, pd.DataFrame, np.array, list): + X_training, X_pool = data_type(X_training_), data_type(X_pool_) + + learner = modAL.models.ActiveLearner(RandomForestClassifier(n_estimators=2), + X_training=X_training, y_training=y_training) + + modAL.expected_error.expected_error_reduction(learner, X_pool) + modAL.expected_error.expected_error_reduction(learner, X_pool, random_tie_break=True) + modAL.expected_error.expected_error_reduction(learner, X_pool, p_subsample=0.1) + 
modAL.expected_error.expected_error_reduction(learner, X_pool, loss='binary') + modAL.expected_error.expected_error_reduction(learner, X_pool, p_subsample=0.1, loss='log') + self.assertRaises(AssertionError, modAL.expected_error.expected_error_reduction, + learner, X_pool, p_subsample=1.5) + self.assertRaises(AssertionError, modAL.expected_error.expected_error_reduction, + learner, X_pool, loss=42) class TestUncertainties(unittest.TestCase): From a0e5f31e9f3ebb05ae3227073ff4797efb5b1127 Mon Sep 17 00:00:00 2001 From: Damien Lancry <31038265+damienlancry@users.noreply.github.com> Date: Wed, 21 Oct 2020 17:41:28 +0800 Subject: [PATCH 096/182] typo in the formula of alpha there is either a mistake at this line or later in the code ``` alpha = len(X_training)/len(X_raw) ``` --- docs/source/content/query_strategies/ranked_batch_mode.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/content/query_strategies/ranked_batch_mode.ipynb b/docs/source/content/query_strategies/ranked_batch_mode.ipynb index c8fe902..19f3be3 100644 --- a/docs/source/content/query_strategies/ranked_batch_mode.ipynb +++ b/docs/source/content/query_strategies/ranked_batch_mode.ipynb @@ -18,7 +18,7 @@ "\n", "$$score = \\alpha(1 - \\Phi(x, X_{labeled})) + (1 - \\alpha) U(x),$$\n", "\n", - "where $\\alpha = \\frac{|X_{unlabeled}|}{|X_{unlabeled}| + |X_{labeled}|}$, $X_{labeled}$ is the labeled dataset, $U(x)$ is the uncertainty of predictions for $x$, and $\\Phi$ is a so-called similarity function, for instance [cosine similarity](https://en.wikipedia.org/wiki/Cosine_similarity). This latter function measures how well the feature space is explored near $x$. 
(The lower the better.)\n", + "where $\\alpha = \\frac{|X_{labeled}|}{|X_{unlabeled}| + |X_{labeled}|}$, $X_{labeled}$ is the labeled dataset, $U(x)$ is the uncertainty of predictions for $x$, and $\\Phi$ is a so-called similarity function, for instance [cosine similarity](https://en.wikipedia.org/wiki/Cosine_similarity). This latter function measures how well the feature space is explored near $x$. (The lower the better.)\n", "\n", "After scoring, the highest scored instance is put at the top of a list. The instance is removed from the pool and the score is recalculated until the desired amount of instances are selected." ] From fab3e964ba62e8bc70dde867f3c2a1bb3a09f4e4 Mon Sep 17 00:00:00 2001 From: cosmic-cortex Date: Sat, 31 Oct 2020 11:04:48 +0100 Subject: [PATCH 097/182] fix: alpha --- docs/source/content/query_strategies/ranked_batch_mode.ipynb | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/source/content/query_strategies/ranked_batch_mode.ipynb b/docs/source/content/query_strategies/ranked_batch_mode.ipynb index 19f3be3..f3633b0 100644 --- a/docs/source/content/query_strategies/ranked_batch_mode.ipynb +++ b/docs/source/content/query_strategies/ranked_batch_mode.ipynb @@ -18,7 +18,7 @@ "\n", "$$score = \\alpha(1 - \\Phi(x, X_{labeled})) + (1 - \\alpha) U(x),$$\n", "\n", - "where $\\alpha = \\frac{|X_{labeled}|}{|X_{unlabeled}| + |X_{labeled}|}$, $X_{labeled}$ is the labeled dataset, $U(x)$ is the uncertainty of predictions for $x$, and $\\Phi$ is a so-called similarity function, for instance [cosine similarity](https://en.wikipedia.org/wiki/Cosine_similarity). This latter function measures how well the feature space is explored near $x$. 
(The lower the better.)\n", + "where $\\alpha = \\frac{|X_{unlabeled}|}{|X_{unlabeled}| + |X_{labeled}|}$, $X_{labeled}$ is the labeled dataset, $U(x)$ is the uncertainty of predictions for $x$, and $\\Phi$ is a so-called similarity function, for instance [cosine similarity](https://en.wikipedia.org/wiki/Cosine_similarity). This latter function measures how well the feature space is explored near $x$. (The lower the better.)\n", "\n", "After scoring, the highest scored instance is put at the top of a list. The instance is removed from the pool and the score is recalculated until the desired amount of instances are selected." ] @@ -130,7 +130,7 @@ "distance_scores = pairwise_distances(X_pool, X_training, metric='euclidean').min(axis=1)\n", "similarity_scores = 1 / (1 + distance_scores)\n", "\n", - "alpha = len(X_training)/len(X_raw)\n", + "alpha = len(X_pool)/len(X_raw)\n", "\n", "scores = alpha * (1 - similarity_scores) + (1 - alpha) * uncertainty" ] From 8cbaea99942aacee4ce09dd5215963cc7bdb1139 Mon Sep 17 00:00:00 2001 From: cosmic-cortex Date: Sun, 1 Nov 2020 09:39:01 +0100 Subject: [PATCH 098/182] version number bumped --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index c58ed7c..8daf94e 100644 --- a/setup.py +++ b/setup.py @@ -2,7 +2,7 @@ setup( name='modAL', - version='0.3.6', + version='0.4.0', author='Tivadar Danka', author_email='85a5187a@opayq.com', description='A modular active learning framework for Python3', From 82c31586ccb8f7c279cacd570b3f7cc9ac07ae24 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=98yvind=20Samuelsen?= Date: Sat, 28 Nov 2020 12:55:11 +0100 Subject: [PATCH 099/182] compute committee entropy on whole array, instead of once every row --- modAL/disagreement.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/modAL/disagreement.py b/modAL/disagreement.py index 2d5e224..66fad5d 100644 --- a/modAL/disagreement.py +++ b/modAL/disagreement.py @@ -35,7 +35,6 @@ def 
vote_entropy(committee: BaseCommittee, X: modALinput, **predict_proba_kwargs return np.zeros(shape=(X.shape[0],)) p_vote = np.zeros(shape=(X.shape[0], len(committee.classes_))) - entr = np.zeros(shape=(X.shape[0],)) for vote_idx, vote in enumerate(votes): vote_counter = Counter(vote) @@ -43,8 +42,7 @@ def vote_entropy(committee: BaseCommittee, X: modALinput, **predict_proba_kwargs for class_idx, class_label in enumerate(committee.classes_): p_vote[vote_idx, class_idx] = vote_counter[class_label]/n_learners - entr[vote_idx] = entropy(p_vote[vote_idx]) - + entr = entropy(p_vote, axis=1) return entr From 47bc726ffde4385620f6e7be1f62de19e49241f8 Mon Sep 17 00:00:00 2001 From: Boyan Hristov Date: Wed, 9 Dec 2020 16:45:06 +0100 Subject: [PATCH 100/182] FIX #108 - no longer storing transformed training data for on_transformed strategies --- modAL/batch.py | 8 +++++--- modAL/models/base.py | 8 -------- tests/core_tests.py | 40 ++++++++++++++++++++++++++++++++++++++++ 3 files changed, 45 insertions(+), 11 deletions(-) diff --git a/modAL/batch.py b/modAL/batch.py index b8fab1a..38fa732 100644 --- a/modAL/batch.py +++ b/modAL/batch.py @@ -8,7 +8,7 @@ import scipy.sparse as sp from sklearn.metrics.pairwise import pairwise_distances, pairwise_distances_argmin_min -from modAL.utils.data import data_vstack, modALinput +from modAL.utils.data import data_vstack, modALinput, data_shape from modAL.models.base import BaseCommittee, BaseLearner from modAL.uncertainty import classifier_uncertainty @@ -150,8 +150,10 @@ def ranked_batch(classifier: Union[BaseLearner, BaseCommittee], if classifier.X_training is None: best_coldstart_instance_index, labeled = select_cold_start_instance(X=unlabeled, metric=metric, n_jobs=n_jobs) instance_index_ranking = [best_coldstart_instance_index] - elif classifier.X_training.shape[0] > 0: - labeled = classifier.Xt_training[:] if classifier.on_transformed else classifier.X_training[:] + elif data_shape(classifier.X_training)[0] > 0: + labeled = 
classifier.transform_without_estimating( + classifier.X_training + ) if classifier.on_transformed else classifier.X_training[:] instance_index_ranking = [] # The maximum number of records to sample. diff --git a/modAL/models/base.py b/modAL/models/base.py index 4474a0e..3d9dadc 100644 --- a/modAL/models/base.py +++ b/modAL/models/base.py @@ -66,11 +66,9 @@ def __init__(self, self.on_transformed = on_transformed self.X_training = X_training - self.Xt_training = None self.y_training = y_training if X_training is not None: self._fit_to_known(bootstrap=bootstrap_init, **fit_kwargs) - self.Xt_training = self.transform_without_estimating(self.X_training) if self.on_transformed else None assert isinstance(force_all_finite, bool), 'force_all_finite must be a bool' self.force_all_finite = force_all_finite @@ -92,15 +90,10 @@ def _add_training_data(self, X: modALinput, y: modALinput) -> None: if self.X_training is None: self.X_training = X - self.Xt_training = self.transform_without_estimating(self.X_training) if self.on_transformed else None self.y_training = y else: try: self.X_training = data_vstack((self.X_training, X)) - self.Xt_training = data_vstack(( - self.Xt_training, - self.transform_without_estimating(X) - )) if self.on_transformed else None self.y_training = data_vstack((self.y_training, y)) except ValueError: raise ValueError('the dimensions of the new training data and label must' @@ -213,7 +206,6 @@ def fit(self, X: modALinput, y: modALinput, bootstrap: bool = False, **fit_kwarg check_X_y(X, y, accept_sparse=True, ensure_2d=False, allow_nd=True, multi_output=True, dtype=None, force_all_finite=self.force_all_finite) self.X_training, self.y_training = X, y - self.Xt_training = self.transform_without_estimating(self.X_training) if self.on_transformed else None return self._fit_to_known(bootstrap=bootstrap, **fit_kwargs) def predict(self, X: modALinput, **predict_kwargs) -> Any: diff --git a/tests/core_tests.py b/tests/core_tests.py index 499b49b..1ed4f95 100644 
--- a/tests/core_tests.py +++ b/tests/core_tests.py @@ -29,6 +29,7 @@ from sklearn.multiclass import OneVsRestClassifier from sklearn.pipeline import make_pipeline from sklearn.preprocessing import FunctionTransformer +from sklearn.feature_extraction.text import CountVectorizer from scipy.stats import entropy, norm from scipy.special import ndtr from scipy import sparse as sp @@ -824,6 +825,45 @@ def test_on_transformed(self): query_idx, query_inst = learner.query(X_pool) learner.teach(X_pool.iloc[query_idx], y_pool[query_idx]) + def test_on_transformed_with_variable_transformation(self): + """ + Learnable transformations naturally change after a model is retrained. Make sure this is handled + properly for on_transformed=True query strategies. + """ + query_strategies = [ + modAL.batch.uncertainty_batch_sampling + # add further strategies which work with instance representations + # no further ones as of 09.12.2020 + ] + + X_labeled = ['Dog', 'Cat', 'Tree'] + + # contains unseen in labeled words, training model on those + # will alter CountVectorizer transformations + X_pool = ['Airplane', 'House'] + + y = [0, 1, 1, 0, 1] # irrelevant for test + + for query_strategy in query_strategies: + learner = modAL.models.learners.ActiveLearner( + estimator=make_pipeline( + CountVectorizer(), + RandomForestClassifier(n_estimators=10) + ), + query_strategy=query_strategy, + X_training=X_labeled, y_training=y[:len(X_labeled)], + on_transformed=True, + ) + + for _ in range(len(X_pool)): + query_idx, query_instance = learner.query(X_pool, n_instances=1) + i = query_idx[0] + + learner.teach( + X=[X_pool[i]], + y=[y[i]] + ) + def test_old_query_strategy_interface(self): n_samples = 10 n_features = 5 From 99d4947f69f0b2247f9916a6ef44463f7af06c55 Mon Sep 17 00:00:00 2001 From: cosmic-cortex Date: Thu, 7 Jan 2021 10:40:40 +0100 Subject: [PATCH 101/182] version number bumped --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 
8daf94e..c3f2b60 100644 --- a/setup.py +++ b/setup.py @@ -2,7 +2,7 @@ setup( name='modAL', - version='0.4.0', + version='0.4.1', author='Tivadar Danka', author_email='85a5187a@opayq.com', description='A modular active learning framework for Python3', From 3e3d0e9a4f9e214336c54d77b48e0b845a34edaf Mon Sep 17 00:00:00 2001 From: maxkeller321 Date: Sun, 17 Jan 2021 15:05:20 +0100 Subject: [PATCH 102/182] With this adaption teaching works now for complex NN's but the scoring does still not work --- modAL/models/base.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/modAL/models/base.py b/modAL/models/base.py index 3d9dadc..b25eca1 100644 --- a/modAL/models/base.py +++ b/modAL/models/base.py @@ -40,6 +40,9 @@ class BaseLearner(ABC, BaseEstimator): Useful when building Committee models with bagging. on_transformed: Whether to transform samples with the pipeline defined by the estimator when applying the query strategy. + accept_different_dim : bool + if True: the dimensions of X and Y inputs for the teaching/ predict/ fit part + do not have to match (needed for complex models e.g. Transformers) **fit_kwargs: keyword arguments. Attributes: @@ -57,6 +60,7 @@ def __init__(self, bootstrap_init: bool = False, on_transformed: bool = False, force_all_finite: bool = True, + accept_different_dim: bool = False, **fit_kwargs ) -> None: assert callable(query_strategy), 'query_strategy must be callable' @@ -64,6 +68,7 @@ def __init__(self, self.estimator = estimator self.query_strategy = query_strategy self.on_transformed = on_transformed + self.accept_different_dim = accept_different_dim self.X_training = X_training self.y_training = y_training @@ -85,8 +90,9 @@ def _add_training_data(self, X: modALinput, y: modALinput) -> None: If the classifier has been fitted, the features in X have to agree with the training samples which the classifier has seen. 
""" - check_X_y(X, y, accept_sparse=True, ensure_2d=False, allow_nd=True, multi_output=True, dtype=None, - force_all_finite=self.force_all_finite) + if not self.accept_different_dim: + check_X_y(X, y, accept_sparse=True, ensure_2d=False, allow_nd=True, multi_output=True, dtype=None, + force_all_finite=self.force_all_finite) if self.X_training is None: self.X_training = X From 06b8b8296de5ec536a579062393556651a5b32af Mon Sep 17 00:00:00 2001 From: maxkeller321 Date: Mon, 18 Jan 2021 13:03:42 +0100 Subject: [PATCH 103/182] score works now with complex NN's but the query part is still problematic --- modAL/models/base.py | 29 ++++++++++++++++++++++------- 1 file changed, 22 insertions(+), 7 deletions(-) diff --git a/modAL/models/base.py b/modAL/models/base.py index b25eca1..2cd07da 100644 --- a/modAL/models/base.py +++ b/modAL/models/base.py @@ -65,7 +65,7 @@ def __init__(self, ) -> None: assert callable(query_strategy), 'query_strategy must be callable' - self.estimator = estimator + self.estimator = estimator #MultiOutputClassifier(estimator, n_jobs=-1) self.query_strategy = query_strategy self.on_transformed = on_transformed self.accept_different_dim = accept_different_dim @@ -179,8 +179,9 @@ def _fit_on_new(self, X: modALinput, y: modALinput, bootstrap: bool = False, **f Returns: self """ - check_X_y(X, y, accept_sparse=True, ensure_2d=False, allow_nd=True, multi_output=True, dtype=None, - force_all_finite=self.force_all_finite) + if not self.accept_different_dim: + check_X_y(X, y, accept_sparse=True, ensure_2d=False, allow_nd=True, multi_output=True, dtype=None, + force_all_finite=self.force_all_finite) if not bootstrap: self.estimator.fit(X, y, **fit_kwargs) @@ -209,8 +210,10 @@ def fit(self, X: modALinput, y: modALinput, bootstrap: bool = False, **fit_kwarg Returns: self """ - check_X_y(X, y, accept_sparse=True, ensure_2d=False, allow_nd=True, multi_output=True, dtype=None, - force_all_finite=self.force_all_finite) + if not self.accept_different_dim: + 
check_X_y(X, y, accept_sparse=True, ensure_2d=False, allow_nd=True, multi_output=True, dtype=None, + force_all_finite=self.force_all_finite) + self.X_training, self.y_training = X, y return self._fit_to_known(bootstrap=bootstrap, **fit_kwargs) @@ -264,7 +267,7 @@ def query(self, X_pool, *query_args, **query_kwargs) -> Union[Tuple, modALinput] "Please return only the indices of the selected instances.", DeprecationWarning) return query_result - return query_result, retrieve_rows(X_pool, query_result) + return query_result, retrieve_rows(X_pool, query_result, accept_different_dim=self.accept_different_dim) def score(self, X: modALinput, y: modALinput, **score_kwargs) -> Any: """ @@ -278,6 +281,18 @@ def score(self, X: modALinput, y: modALinput, **score_kwargs) -> Any: Returns: The score of the predictor. """ + + """ + sklearn does only accept tensors of different dim for X and Y, if we use + Multilabel classifiaction. If we do not want to do this but we still want + to go with tensors of different size (e.g. Transformers) we have to use this + workaround. 
+ """ + if self.accept_different_dim: + prediction = self.estimator.infer(X) + criterion = self.estimator.criterion() + return criterion(prediction, y).item() + return self.estimator.score(X, y, **score_kwargs) @abc.abstractmethod @@ -404,7 +419,7 @@ def query(self, X_pool, *query_args, **query_kwargs) -> Union[Tuple, modALinput] "Please return only the indices of the selected instances", DeprecationWarning) return query_result - return query_result, retrieve_rows(X_pool, query_result) + return query_result, retrieve_rows(X_pool, query_result, accept_different_dim=self.accept_different_dim) def rebag(self, **fit_kwargs) -> None: """ From 169c772cd960ab8b83ceec0410d44f9f5de6f16f Mon Sep 17 00:00:00 2001 From: maxkeller321 Date: Sat, 23 Jan 2021 18:22:36 +0100 Subject: [PATCH 104/182] revert change --- modAL/models/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modAL/models/base.py b/modAL/models/base.py index 2cd07da..5913eb4 100644 --- a/modAL/models/base.py +++ b/modAL/models/base.py @@ -267,7 +267,7 @@ def query(self, X_pool, *query_args, **query_kwargs) -> Union[Tuple, modALinput] "Please return only the indices of the selected instances.", DeprecationWarning) return query_result - return query_result, retrieve_rows(X_pool, query_result, accept_different_dim=self.accept_different_dim) + return query_result, retrieve_rows(X_pool, query_result) def score(self, X: modALinput, y: modALinput, **score_kwargs) -> Any: """ From c2763d0565b3c7ee360db0548aa37a8d2dd16924 Mon Sep 17 00:00:00 2001 From: maxkeller321 Date: Sun, 24 Jan 2021 14:53:37 +0100 Subject: [PATCH 105/182] move mc-Dropout to the right position &adapt dependencies --- modAL/dropout.py | 135 +++++++++++++++++++++++++++++++++++++++++++ rtd_requirements.txt | 1 + setup.py | 2 +- 3 files changed, 137 insertions(+), 1 deletion(-) create mode 100644 modAL/dropout.py diff --git a/modAL/dropout.py b/modAL/dropout.py new file mode 100644 index 0000000..0887225 --- /dev/null +++ 
b/modAL/dropout.py @@ -0,0 +1,135 @@ +import numpy as np +import logging +import sys + +from sklearn.base import BaseEstimator +from scipy.special import entr + +from modAL.utils.data import modALinput +from modAL.utils.selection import multi_argmax, shuffled_argmax + +from skorch.utils import to_numpy + + +logging.basicConfig(stream=sys.stdout, level=logging.DEBUG) + + +def mc_dropout(classifier: BaseEstimator, X: modALinput, n_instances: int = 1, + random_tie_break: bool = False, dropout_layer_indexes: list = [], + num_cycles : int = 50, **mc_dropout_kwargs) -> np.ndarray: + """ + Mc-Dropout query strategy. Selects the instance with the largest change in their + values by multiple forward passes with enabled dropout. Change/ Disagrement is + the calculated BALD (Bayesian Active Learning by Disagreement) score. + + Based on the work of: + Deep Bayesian Active Learning with Image Data. + (Yarin Gal, Riashat Islam, and Zoubin Ghahramani. 2017.) + Dropout as a Bayesian Approximation: Representing Model Uncer- tainty in Deep Learning. + (Yarin Gal and Zoubin Ghahramani. 2016.) + Bayesian Active Learning for Classification and Preference Learning. + (NeilHoulsby,FerencHusza ́r,ZoubinGhahramani,andMa ́te ́Lengyel. 2011.) + + Args: + classifier: The classifier for which the labels are to be queried. + X: The pool of samples to query from. + n_instances: Number of samples to be queried. + random_tie_break: If True, shuffles utility scores to randomize the order. This + can be used to break the tie when the highest utility score is not unique. + dropout_layer_indexes: Indexes of the dropout layers which should be activated + Choose indices from : list(torch_model.modules()) + num_cycles: Number of forward passes with activated dropout + **uncertainty_measure_kwargs: Keyword arguments to be passed for the uncertainty + measure function. 
+ + Returns: + The indices of the instances from X chosen to be labelled; + """ + + # set dropout layers to train mode + set_dropout_mode(classifier.estimator.module_, dropout_layer_indexes, train_mode=True) + + predictions = [] + + #for each batch run num_cycles forward passes + for i in range(num_cycles): + logging.getLogger().info("Dropout: start prediction forward pass") + #call Skorch infer function to perform model forward pass + #In comparison to: predict(), predict_proba() the infer() + # does not change train/eval mode of other layers + prediction = classifier.estimator.infer(X) + predictions.append(to_numpy(prediction)) + + # set dropout layers to eval + set_dropout_mode(classifier.estimator.module_, dropout_layer_indexes, train_mode=False) + + #calculate BALD (Bayesian active learning divergence)) + bald_scores = _bald_divergence(predictions) + + if not random_tie_break: + return multi_argmax(bald_scores, n_instances=n_instances) + + return shuffled_argmax(bald_scores, n_instances=n_instances) + +def entropy_sum(values, axis=-1): + #sum Scipy basic entropy function: entr() + return np.sum(entr(values), axis=axis) + +def _bald_divergence(proba) -> np.ndarray: + accumulated_score = np.zeros(shape=proba[0].shape) + accumulated_entropy = np.zeros(shape=(proba[0].shape[0])) + + #create 3D or 4D array from prediction dim: (drop_cycles, proba.shape[0], proba.shape[1], opt:proba.shape[2]) + proba_stacked = np.stack(proba, axis=len(proba[0].shape)) + + #entropy along dropout cycles + accumulated_entropy = entropy_sum(proba_stacked, axis=-1) + f_x = accumulated_entropy/len(proba) + + #score sums along dropout cycles + accumulated_score = np.sum(proba_stacked, axis=-1) + average_score = accumulated_score / len(proba) + #expand dimension w/o data for entropy calculation + average_score = np.expand_dims(average_score, axis=-1) + + #entropy over average prediction score + g_x = entropy_sum(average_score, axis=-1) + + #entropy differences + diff = g_x - f_x + + #sum 
all dimensions of diff besides first dim (instances) + shaped = np.reshape(diff, (diff.shape[0], -1)) + bald = np.sum(shaped, axis=-1) + + return bald + +def set_dropout_mode(model, dropout_layer_indexes: list, train_mode: bool): + """ + Function to enable the dropout layers by setting them to user specified mode (bool: train_mode) + TODO: Reduce maybe complexity + TODO: Keras support + """ + + modules = list(model.modules()) # list of all modules in the network. + + if len(dropout_layer_indexes) != 0: + for index in dropout_layer_indexes: + layer = modules[index] + if layer.__class__.__name__.startswith('Dropout'): + if True == train_mode: + layer.train() + elif False == train_mode: + layer.eval() + else: + raise KeyError("The passed index: {} is not a Dropout layer".format(index)) + + else: + for module in modules: + if module.__class__.__name__.startswith('Dropout'): + if True == train_mode: + module.train() + logging.getLogger().info("Dropout: set mode of " + str(module.__class__.__name__) + " to train") + elif False == train_mode: + module.eval() + logging.getLogger().info("Dropout: set mode of " + str(module.__class__.__name__) + " to eval") diff --git a/rtd_requirements.txt b/rtd_requirements.txt index 76d0d38..8c13855 100644 --- a/rtd_requirements.txt +++ b/rtd_requirements.txt @@ -4,3 +4,4 @@ scikit-learn ipykernel nbsphinx pandas +skorch diff --git a/setup.py b/setup.py index c3f2b60..78d36a1 100644 --- a/setup.py +++ b/setup.py @@ -10,5 +10,5 @@ url='https://modAL-python.github.io/', packages=['modAL', 'modAL.models', 'modAL.utils'], classifiers=['Development Status :: 4 - Beta'], - install_requires=['numpy>=1.13', 'scikit-learn>=0.18', 'scipy>=0.18', 'pandas>=1.1.0'], + install_requires=['numpy>=1.13', 'scikit-learn>=0.18', 'scipy>=0.18', 'pandas>=1.1.0', 'skorch>=0.9.0'], ) From 9849d362d94d76990ef84911abafabb816bee6ff Mon Sep 17 00:00:00 2001 From: maxkeller321 Date: Thu, 28 Jan 2021 22:11:09 +0100 Subject: [PATCH 106/182] Metric uppassing --- 
modAL/batch.py | 2 +- modAL/expected_error.py | 4 ++-- modAL/models/base.py | 8 ++++---- modAL/multilabel.py | 4 ++-- modAL/uncertainty.py | 4 ++-- modAL/utils/selection.py | 30 +++++++++++++++++++----------- 6 files changed, 30 insertions(+), 22 deletions(-) diff --git a/modAL/batch.py b/modAL/batch.py index 38fa732..d0121a7 100644 --- a/modAL/batch.py +++ b/modAL/batch.py @@ -178,7 +178,7 @@ def ranked_batch(classifier: Union[BaseLearner, BaseCommittee], instance_index_ranking.append(instance_index) # Return numpy array, not a list. - return np.array(instance_index_ranking) + return np.array(instance_index_ranking), uncertainty_scores[np.array(instance_index_ranking)] def uncertainty_batch_sampling(classifier: Union[BaseLearner, BaseCommittee], diff --git a/modAL/expected_error.py b/modAL/expected_error.py index f362ecd..74469f9 100644 --- a/modAL/expected_error.py +++ b/modAL/expected_error.py @@ -77,6 +77,6 @@ def expected_error_reduction(learner: ActiveLearner, X: modALinput, loss: str = expected_error[x_idx] = np.inf if not random_tie_break: - return multi_argmax(-expected_error, n_instances) + return multi_argmax(-expected_error, n_instances, return_negative=True) - return shuffled_argmax(-expected_error, n_instances) + return shuffled_argmax(-expected_error, n_instances, return_negative=True) diff --git a/modAL/models/base.py b/modAL/models/base.py index 5913eb4..a250705 100644 --- a/modAL/models/base.py +++ b/modAL/models/base.py @@ -259,7 +259,7 @@ def query(self, X_pool, *query_args, **query_kwargs) -> Union[Tuple, modALinput] labelled and the instances themselves. Can be different in other cases, for instance only the instance to be labelled upon query synthesis. 
""" - query_result = self.query_strategy(self, X_pool, *query_args, **query_kwargs) + query_result, query_metrics = self.query_strategy(self, X_pool, *query_args, **query_kwargs) if isinstance(query_result, tuple): warnings.warn("Query strategies should no longer return the selected instances, " @@ -267,7 +267,7 @@ def query(self, X_pool, *query_args, **query_kwargs) -> Union[Tuple, modALinput] "Please return only the indices of the selected instances.", DeprecationWarning) return query_result - return query_result, retrieve_rows(X_pool, query_result) + return query_result, retrieve_rows(X_pool, query_result), query_metrics def score(self, X: modALinput, y: modALinput, **score_kwargs) -> Any: """ @@ -411,7 +411,7 @@ def query(self, X_pool, *query_args, **query_kwargs) -> Union[Tuple, modALinput] be labelled and the instances themselves. Can be different in other cases, for instance only the instance to be labelled upon query synthesis. """ - query_result = self.query_strategy(self, X_pool, *query_args, **query_kwargs) + query_result, query_metrics = self.query_strategy(self, X_pool, *query_args, **query_kwargs) if isinstance(query_result, tuple): warnings.warn("Query strategies should no longer return the selected instances, " @@ -419,7 +419,7 @@ def query(self, X_pool, *query_args, **query_kwargs) -> Union[Tuple, modALinput] "Please return only the indices of the selected instances", DeprecationWarning) return query_result - return query_result, retrieve_rows(X_pool, query_result, accept_different_dim=self.accept_different_dim) + return query_result, retrieve_rows(X_pool, query_result), query_metrics def rebag(self, **fit_kwargs) -> None: """ diff --git a/modAL/multilabel.py b/modAL/multilabel.py index 3a81ba1..b483d0d 100644 --- a/modAL/multilabel.py +++ b/modAL/multilabel.py @@ -161,9 +161,9 @@ def min_confidence(classifier: OneVsRestClassifier, X_pool: modALinput, classwise_min = np.min(classwise_confidence, axis=1) if not random_tie_break: - return 
multi_argmax(-classwise_min, n_instances) + return multi_argmax(-classwise_min, n_instances, return_negative=True) - return shuffled_argmax(-classwise_min, n_instances) + return shuffled_argmax(-classwise_min, n_instances, return_negative=True) def avg_confidence(classifier: OneVsRestClassifier, X_pool: modALinput, diff --git a/modAL/uncertainty.py b/modAL/uncertainty.py index e00d564..88a8c9f 100644 --- a/modAL/uncertainty.py +++ b/modAL/uncertainty.py @@ -178,9 +178,9 @@ def margin_sampling(classifier: BaseEstimator, X: modALinput, margin = classifier_margin(classifier, X, **uncertainty_measure_kwargs) if not random_tie_break: - return multi_argmax(-margin, n_instances=n_instances) + return multi_argmax(-margin, n_instances=n_instances, return_negative=True) - return shuffled_argmax(-margin, n_instances=n_instances) + return shuffled_argmax(-margin, n_instances=n_instances, return_negative=True) def entropy_sampling(classifier: BaseEstimator, X: modALinput, diff --git a/modAL/utils/selection.py b/modAL/utils/selection.py index 3741a66..01f5d83 100644 --- a/modAL/utils/selection.py +++ b/modAL/utils/selection.py @@ -5,7 +5,7 @@ import numpy as np -def shuffled_argmax(values: np.ndarray, n_instances: int = 1) -> np.ndarray: +def shuffled_argmax(values: np.ndarray, n_instances: int = 1, return_negative=False) -> np.ndarray: """ Shuffles the values and sorts them afterwards. This can be used to break the tie when the highest utility score is not unique. The shuffle randomizes @@ -13,10 +13,10 @@ def shuffled_argmax(values: np.ndarray, n_instances: int = 1) -> np.ndarray: Args: values: Contains the values to be selected from. - n_instances: Specifies how many indices to return. - + n_instances: Specifies how many indices and values to return. + return_negative: if true: returns negative values Returns: - The indices of the n_instances largest values. + The indices and values of the n_instances largest values. 
""" assert n_instances <= values.shape[0], 'n_instances must be less or equal than the size of utility' @@ -30,24 +30,32 @@ def shuffled_argmax(values: np.ndarray, n_instances: int = 1) -> np.ndarray: # inverting the shuffle query_idx = shuffled_idx[sorted_query_idx] - return query_idx + + if return_negative == True: + values = -values + + return query_idx, values[max_idx] -def multi_argmax(values: np.ndarray, n_instances: int = 1) -> np.ndarray: +def multi_argmax(values: np.ndarray, n_instances: int = 1, return_negative=False) -> np.ndarray: """ - Selects the indices of the n_instances highest values. + return the indices and values of the n_instances highest values. Args: values: Contains the values to be selected from. - n_instances: Specifies how many indices to return. - + n_instances: Specifies how many indices and values to return. + return_negative: if true: returns negative values Returns: - The indices of the n_instances largest values. + The indices and values of the n_instances largest values. 
""" assert n_instances <= values.shape[0], 'n_instances must be less or equal than the size of utility' max_idx = np.argpartition(-values, n_instances-1, axis=0)[:n_instances] - return max_idx + + if return_negative == True: + values = -values + + return max_idx, values[max_idx] def weighted_random(weights: np.ndarray, n_instances: int = 1) -> np.ndarray: From 59dd7f3185ed9cdc1bb1bc325361352a8dd04bc1 Mon Sep 17 00:00:00 2001 From: Stefan Ott Date: Sun, 31 Jan 2021 15:21:34 +0100 Subject: [PATCH 107/182] Structure for DeepLearning design decision --- modAL/models/base.py | 1 + modAL/models/learners.py | 58 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 59 insertions(+) diff --git a/modAL/models/base.py b/modAL/models/base.py index a250705..dfa44c7 100644 --- a/modAL/models/base.py +++ b/modAL/models/base.py @@ -24,6 +24,7 @@ ABC = abc.ABCMeta('ABC', (), {}) +#TODO: Adapt BaseLearner to be common class for ML&DL class BaseLearner(ABC, BaseEstimator): """ Core abstraction in modAL. diff --git a/modAL/models/learners.py b/modAL/models/learners.py index 9af43a2..89d0c76 100644 --- a/modAL/models/learners.py +++ b/modAL/models/learners.py @@ -11,6 +11,7 @@ from modAL.uncertainty import uncertainty_sampling from modAL.disagreement import vote_entropy_sampling, max_std_sampling from modAL.acquisition import max_EI +from modAL.dropout import mc_dropout """ Classes for active learning algorithms @@ -101,6 +102,63 @@ def teach(self, X: modALinput, y: modALinput, bootstrap: bool = False, only_new: else: self._fit_on_new(X, y, bootstrap=bootstrap, **fit_kwargs) +#TODO: Adapt DeepACtiveLearner +class DeepActiveLearner(BaseLearner): + """ + This class is an abstract model of a general active learning algorithm. + + Args: + estimator: The estimator to be used in the active learning loop. + query_strategy: Function providing the query strategy for the active learning loop, + for instance, modAL.uncertainty.uncertainty_sampling. 
+ X_training: Initial training samples, if available. + y_training: Initial training labels corresponding to initial training samples. + bootstrap_init: If initial training data is available, bootstrapping can be done during the first training. + Useful when building Committee models with bagging. + on_transformed: Whether to transform samples with the pipeline defined by the estimator + when applying the query strategy. + **fit_kwargs: keyword arguments. + + Attributes: + estimator: The estimator to be used in the active learning loop. + query_strategy: Function providing the query strategy for the active learning loop. + X_training: If the model hasn't been fitted yet it is None, otherwise it contains the samples + which the model has been trained on. If provided, the method fit() of estimator is called during __init__() + y_training: The labels corresponding to X_training. + """ + + def __init__(self, + estimator: BaseEstimator, + query_strategy: Callable = uncertainty_sampling, + X_training: Optional[modALinput] = None, + y_training: Optional[modALinput] = None, + bootstrap_init: bool = False, + on_transformed: bool = False, + **fit_kwargs + ) -> None: + #TODO: Check if given query strategy works for Deep Learning + super().__init__(estimator, query_strategy, + X_training, y_training, bootstrap_init, on_transformed, **fit_kwargs) + + def teach(self, X: modALinput, y: modALinput, bootstrap: bool = False, only_new: bool = False, **fit_kwargs) -> None: + """ + Adds X and y to the known training data and retrains the predictor with the augmented dataset. + + Args: + X: The new samples for which the labels are supplied by the expert. + y: Labels corresponding to the new instances in X. + bootstrap: If True, training is done on a bootstrapped dataset. Useful for building Committee models + with bagging. + only_new: If True, the model is retrained using only X and y, ignoring the previously provided examples. 
+ Useful when working with models where the .fit() method doesn't retrain the model from scratch (e. g. in + tensorflow or keras). + **fit_kwargs: Keyword arguments to be passed to the fit method of the predictor. + """ + ##self._add_training_data(X, y) + if not only_new: + self._fit_to_known(bootstrap=bootstrap, **fit_kwargs) + else: + self._fit_on_new(X, y, bootstrap=bootstrap, **fit_kwargs) """ Classes for Bayesian optimization From 8f1d7fefc3d4695d5ebfe0dcd9d57ee9f5e45b68 Mon Sep 17 00:00:00 2001 From: maxkeller321 Date: Sun, 31 Jan 2021 16:01:25 +0100 Subject: [PATCH 108/182] First adaptions for KL-divergenz .. work in progress --- modAL/dropout.py | 70 +++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 60 insertions(+), 10 deletions(-) diff --git a/modAL/dropout.py b/modAL/dropout.py index 0887225..fb0da65 100644 --- a/modAL/dropout.py +++ b/modAL/dropout.py @@ -3,6 +3,8 @@ import sys from sklearn.base import BaseEstimator +from sklearn.preprocessing import normalize + from scipy.special import entr from modAL.utils.data import modALinput @@ -13,6 +15,27 @@ logging.basicConfig(stream=sys.stdout, level=logging.DEBUG) +def KL_divergence(classifier: BaseEstimator, X: modALinput, n_instances: int = 1, + random_tie_break: bool = False, dropout_layer_indexes: list = [], + num_cycles : int = 50, **mc_dropout_kwargs) -> np.ndarray: + """ + TODO: Work in progress + """ + # set dropout layers to train mode + set_dropout_mode(classifier.estimator.module_, dropout_layer_indexes, train_mode=True) + + predictions = get_predictions(classifier, X, num_cycles) + + # set dropout layers to eval + set_dropout_mode(classifier.estimator.module_, dropout_layer_indexes, train_mode=False) + + #KL_divergence = _KL_divergence(predictions) + + if not random_tie_break: + return multi_argmax(KL_divergence, n_instances=n_instances) + + return shuffled_argmax(KL_divergence, n_instances=n_instances) + def mc_dropout(classifier: BaseEstimator, X: modALinput, n_instances: int = 
1, random_tie_break: bool = False, dropout_layer_indexes: list = [], @@ -49,16 +72,7 @@ def mc_dropout(classifier: BaseEstimator, X: modALinput, n_instances: int = 1, # set dropout layers to train mode set_dropout_mode(classifier.estimator.module_, dropout_layer_indexes, train_mode=True) - predictions = [] - - #for each batch run num_cycles forward passes - for i in range(num_cycles): - logging.getLogger().info("Dropout: start prediction forward pass") - #call Skorch infer function to perform model forward pass - #In comparison to: predict(), predict_proba() the infer() - # does not change train/eval mode of other layers - prediction = classifier.estimator.infer(X) - predictions.append(to_numpy(prediction)) + predictions = get_predictions(classifier, X, num_cycles) # set dropout layers to eval set_dropout_mode(classifier.estimator.module_, dropout_layer_indexes, train_mode=False) @@ -71,6 +85,30 @@ def mc_dropout(classifier: BaseEstimator, X: modALinput, n_instances: int = 1, return shuffled_argmax(bald_scores, n_instances=n_instances) +def get_predictions(classifier: BaseEstimator, X: modALinput, num_predictions: int = 50): + """ + Runs num_predictions times the prediction of the classifier on the input X + and puts the predictions in a list. + + Args: + classifier: The classifier for which the labels are to be queried. + X: The pool of samples to query from. 
+ num_predictions: Number of predictions which should be made + Return: + prediction: list with all predictions + """ + + predictions = [] + for i in range(num_predictions): + logging.getLogger().info("Dropout: start prediction forward pass") + #call Skorch infer function to perform model forward pass + #In comparison to: predict(), predict_proba() the infer() + # does not change train/eval mode of other layers + prediction = classifier.estimator.infer(X) + predictions.append(to_numpy(prediction)) + return predictions + + def entropy_sum(values, axis=-1): #sum Scipy basic entropy function: entr() return np.sum(entr(values), axis=axis) @@ -104,6 +142,18 @@ def _bald_divergence(proba) -> np.ndarray: return bald +def _KL_divergence(proba) -> np.ndarray: + + #create 3D or 4D array from prediction dim: (drop_cycles, proba.shape[0], proba.shape[1], opt:proba.shape[2]) + proba_stacked = np.stack(proba, axis=len(proba[0].shape)) + # TODO work in progress + # TODO add dimensionality adaption + #number_of_dimensions = proba_stacked.ndim + #if proba_stacked.ndim > 2: + + normalized_proba = normalize(proba_stacked, axis=0) + + def set_dropout_mode(model, dropout_layer_indexes: list, train_mode: bool): """ Function to enable the dropout layers by setting them to user specified mode (bool: train_mode) From 519d2b1a1be23840a6e2a1bbc6518637cca190be Mon Sep 17 00:00:00 2001 From: maxkeller321 Date: Sun, 31 Jan 2021 16:47:32 +0100 Subject: [PATCH 109/182] Separate DeepActiveLearner and ActiveLearner, commite adaption still missing --- modAL/models/base.py | 121 +------------------------------ modAL/models/learners.py | 152 +++++++++++++++++++++++++++++++++++---- 2 files changed, 140 insertions(+), 133 deletions(-) diff --git a/modAL/models/base.py b/modAL/models/base.py index dfa44c7..6b3800b 100644 --- a/modAL/models/base.py +++ b/modAL/models/base.py @@ -24,7 +24,6 @@ ABC = abc.ABCMeta('ABC', (), {}) -#TODO: Adapt BaseLearner to be common class for ML&DL class BaseLearner(ABC, 
BaseEstimator): """ Core abstraction in modAL. @@ -33,79 +32,35 @@ class BaseLearner(ABC, BaseEstimator): estimator: The estimator to be used in the active learning loop. query_strategy: Function providing the query strategy for the active learning loop, for instance, modAL.uncertainty.uncertainty_sampling. - X_training: Initial training samples, if available. - y_training: Initial training labels corresponding to initial training samples. force_all_finite: When True, forces all values of the data finite. When False, accepts np.nan and np.inf values. bootstrap_init: If initial training data is available, bootstrapping can be done during the first training. Useful when building Committee models with bagging. on_transformed: Whether to transform samples with the pipeline defined by the estimator when applying the query strategy. - accept_different_dim : bool - if True: the dimensions of X and Y inputs for the teaching/ predict/ fit part - do not have to match (needed for complex models e.g. Transformers) **fit_kwargs: keyword arguments. Attributes: estimator: The estimator to be used in the active learning loop. query_strategy: Function providing the query strategy for the active learning loop. - X_training: If the model hasn't been fitted yet it is None, otherwise it contains the samples - which the model has been trained on. - y_training: The labels corresponding to X_training. 
""" def __init__(self, estimator: BaseEstimator, query_strategy: Callable, - X_training: Optional[modALinput] = None, - y_training: Optional[modALinput] = None, bootstrap_init: bool = False, on_transformed: bool = False, force_all_finite: bool = True, - accept_different_dim: bool = False, **fit_kwargs ) -> None: assert callable(query_strategy), 'query_strategy must be callable' - self.estimator = estimator #MultiOutputClassifier(estimator, n_jobs=-1) + self.estimator = estimator self.query_strategy = query_strategy self.on_transformed = on_transformed - self.accept_different_dim = accept_different_dim - - self.X_training = X_training - self.y_training = y_training - if X_training is not None: - self._fit_to_known(bootstrap=bootstrap_init, **fit_kwargs) assert isinstance(force_all_finite, bool), 'force_all_finite must be a bool' self.force_all_finite = force_all_finite - def _add_training_data(self, X: modALinput, y: modALinput) -> None: - """ - Adds the new data and label to the known data, but does not retrain the model. - - Args: - X: The new samples for which the labels are supplied by the expert. - y: Labels corresponding to the new instances in X. - - Note: - If the classifier has been fitted, the features in X have to agree with the training samples which the - classifier has seen. - """ - if not self.accept_different_dim: - check_X_y(X, y, accept_sparse=True, ensure_2d=False, allow_nd=True, multi_output=True, dtype=None, - force_all_finite=self.force_all_finite) - - if self.X_training is None: - self.X_training = X - self.y_training = y - else: - try: - self.X_training = data_vstack((self.X_training, X)) - self.y_training = data_vstack((self.y_training, y)) - except ValueError: - raise ValueError('the dimensions of the new training data and label must' - 'agree with the training data and labels provided so far') - def transform_without_estimating(self, X: modALinput) -> Union[np.ndarray, sp.csr_matrix]: """ Transforms the data as supplied to the estimator. 
@@ -147,26 +102,6 @@ def transform_without_estimating(self, X: modALinput) -> Union[np.ndarray, sp.cs # concatenate all transformations and return return data_hstack(Xt) - def _fit_to_known(self, bootstrap: bool = False, **fit_kwargs) -> 'BaseLearner': - """ - Fits self.estimator to the training data and labels provided to it so far. - - Args: - bootstrap: If True, the method trains the model on a set bootstrapped from the known training instances. - **fit_kwargs: Keyword arguments to be passed to the fit method of the predictor. - - Returns: - self - """ - if not bootstrap: - self.estimator.fit(self.X_training, self.y_training, **fit_kwargs) - else: - n_instances = self.X_training.shape[0] - bootstrap_idx = np.random.choice(range(n_instances), n_instances, replace=True) - self.estimator.fit(self.X_training[bootstrap_idx], self.y_training[bootstrap_idx], **fit_kwargs) - - return self - def _fit_on_new(self, X: modALinput, y: modALinput, bootstrap: bool = False, **fit_kwargs) -> 'BaseLearner': """ Fits self.estimator to the given data and labels. @@ -180,9 +115,6 @@ def _fit_on_new(self, X: modALinput, y: modALinput, bootstrap: bool = False, **f Returns: self """ - if not self.accept_different_dim: - check_X_y(X, y, accept_sparse=True, ensure_2d=False, allow_nd=True, multi_output=True, dtype=None, - force_all_finite=self.force_all_finite) if not bootstrap: self.estimator.fit(X, y, **fit_kwargs) @@ -192,32 +124,6 @@ def _fit_on_new(self, X: modALinput, y: modALinput, bootstrap: bool = False, **f return self - def fit(self, X: modALinput, y: modALinput, bootstrap: bool = False, **fit_kwargs) -> 'BaseLearner': - """ - Interface for the fit method of the predictor. Fits the predictor to the supplied data, then stores it - internally for the active learning loop. - - Args: - X: The samples to be fitted. - y: The corresponding labels. - bootstrap: If true, trains the estimator on a set bootstrapped from X. - Useful for building Committee models with bagging. 
- **fit_kwargs: Keyword arguments to be passed to the fit method of the predictor. - - Note: - When using scikit-learn estimators, calling this method will make the ActiveLearner forget all training data - it has seen! - - Returns: - self - """ - if not self.accept_different_dim: - check_X_y(X, y, accept_sparse=True, ensure_2d=False, allow_nd=True, multi_output=True, dtype=None, - force_all_finite=self.force_all_finite) - - self.X_training, self.y_training = X, y - return self._fit_to_known(bootstrap=bootstrap, **fit_kwargs) - def predict(self, X: modALinput, **predict_kwargs) -> Any: """ Estimator predictions for X. Interface with the predict method of the estimator. @@ -270,31 +176,6 @@ def query(self, X_pool, *query_args, **query_kwargs) -> Union[Tuple, modALinput] return query_result, retrieve_rows(X_pool, query_result), query_metrics - def score(self, X: modALinput, y: modALinput, **score_kwargs) -> Any: - """ - Interface for the score method of the predictor. - - Args: - X: The samples for which prediction accuracy is to be calculated. - y: Ground truth labels for X. - **score_kwargs: Keyword arguments to be passed to the .score() method of the predictor. - - Returns: - The score of the predictor. - """ - - """ - sklearn does only accept tensors of different dim for X and Y, if we use - Multilabel classifiaction. If we do not want to do this but we still want - to go with tensors of different size (e.g. Transformers) we have to use this - workaround. 
- """ - if self.accept_different_dim: - prediction = self.estimator.infer(X) - criterion = self.estimator.criterion() - return criterion(prediction, y).item() - - return self.estimator.score(X, y, **score_kwargs) @abc.abstractmethod def teach(self, *args, **kwargs) -> None: diff --git a/modAL/models/learners.py b/modAL/models/learners.py index 89d0c76..1015fa7 100644 --- a/modAL/models/learners.py +++ b/modAL/models/learners.py @@ -80,7 +80,106 @@ def __init__(self, **fit_kwargs ) -> None: super().__init__(estimator, query_strategy, - X_training, y_training, bootstrap_init, on_transformed, **fit_kwargs) + bootstrap_init, on_transformed, **fit_kwargs) + + self.X_training = X_training + self.y_training = y_training + + def _add_training_data(self, X: modALinput, y: modALinput) -> None: + """ + Adds the new data and label to the known data, but does not retrain the model. + + Args: + X: The new samples for which the labels are supplied by the expert. + y: Labels corresponding to the new instances in X. + + Note: + If the classifier has been fitted, the features in X have to agree with the training samples which the + classifier has seen. + """ + check_X_y(X, y, accept_sparse=True, ensure_2d=False, allow_nd=True, multi_output=True, dtype=None, + force_all_finite=self.force_all_finite) + + if self.X_training is None: + self.X_training = X + self.y_training = y + else: + try: + self.X_training = data_vstack((self.X_training, X)) + self.y_training = data_vstack((self.y_training, y)) + except ValueError: + raise ValueError('the dimensions of the new training data and label must' + 'agree with the training data and labels provided so far') + + def _fit_to_known(self, bootstrap: bool = False, **fit_kwargs) -> 'BaseLearner': + """ + Fits self.estimator to the training data and labels provided to it so far. + + Args: + bootstrap: If True, the method trains the model on a set bootstrapped from the known training instances. 
+ **fit_kwargs: Keyword arguments to be passed to the fit method of the predictor. + + Returns: + self + """ + if not bootstrap: + self.estimator.fit(self.X_training, self.y_training, **fit_kwargs) + else: + n_instances = self.X_training.shape[0] + bootstrap_idx = np.random.choice(range(n_instances), n_instances, replace=True) + self.estimator.fit(self.X_training[bootstrap_idx], self.y_training[bootstrap_idx], **fit_kwargs) + + return self + + def fit(self, X: modALinput, y: modALinput, bootstrap: bool = False, **fit_kwargs) -> 'BaseLearner': + """ + Interface for the fit method of the predictor. Fits the predictor to the supplied data, then stores it + internally for the active learning loop. + + Args: + X: The samples to be fitted. + y: The corresponding labels. + bootstrap: If true, trains the estimator on a set bootstrapped from X. + Useful for building Committee models with bagging. + **fit_kwargs: Keyword arguments to be passed to the fit method of the predictor. + + Note: + When using scikit-learn estimators, calling this method will make the ActiveLearner forget all training data + it has seen! + + Returns: + self + """ + check_X_y(X, y, accept_sparse=True, ensure_2d=False, allow_nd=True, multi_output=True, dtype=None, + force_all_finite=self.force_all_finite) + self.X_training, self.y_training = X, y + return self._fit_to_known(bootstrap=bootstrap, **fit_kwargs) + + def score(self, X: modALinput, y: modALinput, **score_kwargs) -> Any: + """ + Interface for the score method of the predictor. + + Args: + X: The samples for which prediction accuracy is to be calculated. + y: Ground truth labels for X. + **score_kwargs: Keyword arguments to be passed to the .score() method of the predictor. + + Returns: + The score of the predictor. + """ + + """ + sklearn does only accept tensors of different dim for X and Y, if we use + Multilabel classifiaction. If we do not want to do this but we still want + to go with tensors of different size (e.g. 
Transformers) we have to use this + workaround. + """ + if self.accept_different_dim: + prediction = self.estimator.infer(X) + criterion = self.estimator.criterion() + return criterion(prediction, y).item() + + return self.estimator.score(X, y, **score_kwargs) def teach(self, X: modALinput, y: modALinput, bootstrap: bool = False, only_new: bool = False, **fit_kwargs) -> None: """ @@ -96,13 +195,14 @@ def teach(self, X: modALinput, y: modALinput, bootstrap: bool = False, only_new: tensorflow or keras). **fit_kwargs: Keyword arguments to be passed to the fit method of the predictor. """ - self._add_training_data(X, y) if not only_new: + self._add_training_data(X, y) self._fit_to_known(bootstrap=bootstrap, **fit_kwargs) else: + check_X_y(X, y, accept_sparse=True, ensure_2d=False, allow_nd=True, multi_output=True, dtype=None, + force_all_finite=self.force_all_finite) self._fit_on_new(X, y, bootstrap=bootstrap, **fit_kwargs) -#TODO: Adapt DeepACtiveLearner class DeepActiveLearner(BaseLearner): """ This class is an abstract model of a general active learning algorithm. @@ -138,9 +238,42 @@ def __init__(self, ) -> None: #TODO: Check if given query strategy works for Deep Learning super().__init__(estimator, query_strategy, - X_training, y_training, bootstrap_init, on_transformed, **fit_kwargs) + bootstrap_init, on_transformed, **fit_kwargs) - def teach(self, X: modALinput, y: modALinput, bootstrap: bool = False, only_new: bool = False, **fit_kwargs) -> None: + def fit(self, X: modALinput, y: modALinput, bootstrap: bool = False, **fit_kwargs) -> 'BaseLearner': + """ + Interface for the fit method of the predictor. Fits the predictor to the supplied data, then stores it + internally for the active learning loop. + + Args: + X: The samples to be fitted. + y: The corresponding labels. + bootstrap: If true, trains the estimator on a set bootstrapped from X. + Useful for building Committee models with bagging. 
+ **fit_kwargs: Keyword arguments to be passed to the fit method of the predictor. + + Returns: + self + """ + return self._fit_on_new(X, y, bootstrap=bootstrap, **fit_kwargs) + + def score(self, X: modALinput, y: modALinput) -> Any: + """ + Interface for the score method of the predictor. + + Args: + X: The samples for which prediction accuracy is to be calculated. + y: Ground truth labels for X. + + Returns: + The score of the predictor. + """ + + prediction = self.estimator.infer(X) + criterion = self.estimator.criterion() + return criterion(prediction, y).item() + + def teach(self, X: modALinput, y: modALinput, bootstrap: bool = False, **fit_kwargs) -> None: """ Adds X and y to the known training data and retrains the predictor with the augmented dataset. @@ -149,16 +282,9 @@ def teach(self, X: modALinput, y: modALinput, bootstrap: bool = False, only_new: y: Labels corresponding to the new instances in X. bootstrap: If True, training is done on a bootstrapped dataset. Useful for building Committee models with bagging. - only_new: If True, the model is retrained using only X and y, ignoring the previously provided examples. - Useful when working with models where the .fit() method doesn't retrain the model from scratch (e. g. in - tensorflow or keras). **fit_kwargs: Keyword arguments to be passed to the fit method of the predictor. 
""" - ##self._add_training_data(X, y) - if not only_new: - self._fit_to_known(bootstrap=bootstrap, **fit_kwargs) - else: - self._fit_on_new(X, y, bootstrap=bootstrap, **fit_kwargs) + self._fit_on_new(X, y, bootstrap=bootstrap, **fit_kwargs) """ Classes for Bayesian optimization From 8d05b57d4bb068f01bbb6bb9b7438f160e628ecf Mon Sep 17 00:00:00 2001 From: maxkeller321 Date: Sun, 31 Jan 2021 19:51:17 +0100 Subject: [PATCH 110/182] DeepCommitee adaption --- modAL/models/__init__.py | 6 +- modAL/models/base.py | 96 +++----------- modAL/models/learners.py | 274 ++++++++++++++++++++++++++++++++++----- 3 files changed, 266 insertions(+), 110 deletions(-) diff --git a/modAL/models/__init__.py b/modAL/models/__init__.py index f96b37f..e178fe8 100644 --- a/modAL/models/__init__.py +++ b/modAL/models/__init__.py @@ -1,6 +1,6 @@ -from .learners import ActiveLearner, BayesianOptimizer, Committee, CommitteeRegressor +from .learners import ActiveLearner, DeepActiveLearner, BayesianOptimizer, Committee, DeepCommittee, CommitteeRegressor __all__ = [ - 'ActiveLearner', 'BayesianOptimizer', - 'Committee', 'CommitteeRegressor' + 'ActiveLearner', 'DeepActiveLearner', 'BayesianOptimizer', + 'Committee', 'DeepCommittee', 'CommitteeRegressor' ] \ No newline at end of file diff --git a/modAL/models/base.py b/modAL/models/base.py index 6b3800b..78e8860 100644 --- a/modAL/models/base.py +++ b/modAL/models/base.py @@ -12,7 +12,6 @@ from sklearn.base import BaseEstimator from sklearn.ensemble._base import _BaseHeterogeneousEnsemble from sklearn.pipeline import Pipeline -from sklearn.utils import check_X_y import scipy.sparse as sp @@ -198,8 +197,7 @@ def __init__(self, learner_list: List[BaseLearner], query_strategy: Callable, on self.learner_list = learner_list self.query_strategy = query_strategy self.on_transformed = on_transformed - # TODO: update training data when using fit() and teach() methods - self.X_training = None + def __iter__(self) -> Iterator[BaseLearner]: for learner in 
self.learner_list: @@ -208,33 +206,6 @@ def __iter__(self) -> Iterator[BaseLearner]: def __len__(self) -> int: return len(self.learner_list) - def _add_training_data(self, X: modALinput, y: modALinput) -> None: - """ - Adds the new data and label to the known data for each learner, but does not retrain the model. - - Args: - X: The new samples for which the labels are supplied by the expert. - y: Labels corresponding to the new instances in X. - - Note: - If the learners have been fitted, the features in X have to agree with the training samples which the - classifier has seen. - """ - for learner in self.learner_list: - learner._add_training_data(X, y) - - def _fit_to_known(self, bootstrap: bool = False, **fit_kwargs) -> None: - """ - Fits all learners to the training data and labels provided to it so far. - - Args: - bootstrap: If True, each estimator is trained on a bootstrapped dataset. Useful when - using bagging to build the ensemble. - **fit_kwargs: Keyword arguments to be passed to the fit method of the predictor. - """ - for learner in self.learner_list: - learner._fit_to_known(bootstrap=bootstrap, **fit_kwargs) - def _fit_on_new(self, X: modALinput, y: modALinput, bootstrap: bool = False, **fit_kwargs) -> None: """ Fits all learners to the given data and labels. @@ -248,24 +219,6 @@ def _fit_on_new(self, X: modALinput, y: modALinput, bootstrap: bool = False, **f for learner in self.learner_list: learner._fit_on_new(X, y, bootstrap=bootstrap, **fit_kwargs) - def fit(self, X: modALinput, y: modALinput, **fit_kwargs) -> 'BaseCommittee': - """ - Fits every learner to a subset sampled with replacement from X. Calling this method makes the learner forget the - data it has seen up until this point and replaces it with X! If you would like to perform bootstrapping on each - learner using the data it has seen, use the method .rebag()! - - Calling this method makes the learner forget the data it has seen up until this point and replaces it with X! 
- - Args: - X: The samples to be fitted on. - y: The corresponding labels. - **fit_kwargs: Keyword arguments to be passed to the fit method of the predictor. - """ - for learner in self.learner_list: - learner.fit(X, y, **fit_kwargs) - - return self - def transform_without_estimating(self, X: modALinput) -> Union[np.ndarray, sp.csr_matrix]: """ Transforms the data as supplied to each learner's estimator and concatenates transformations. @@ -303,35 +256,26 @@ def query(self, X_pool, *query_args, **query_kwargs) -> Union[Tuple, modALinput] return query_result, retrieve_rows(X_pool, query_result), query_metrics - def rebag(self, **fit_kwargs) -> None: - """ - Refits every learner with a dataset bootstrapped from its training instances. Contrary to .bag(), it bootstraps - the training data for each learner based on its own examples. - - Todo: - Where is .bag()? - - Args: - **fit_kwargs: Keyword arguments to be passed to the fit method of the predictor. - """ - self._fit_to_known(bootstrap=True, **fit_kwargs) - - def teach(self, X: modALinput, y: modALinput, bootstrap: bool = False, only_new: bool = False, **fit_kwargs) -> None: + def _set_classes(self): """ - Adds X and y to the known training data for each learner and retrains learners with the augmented dataset. - - Args: - X: The new samples for which the labels are supplied by the expert. - y: Labels corresponding to the new instances in X. - bootstrap: If True, trains each learner on a bootstrapped set. Useful when building the ensemble by bagging. - only_new: If True, the model is retrained using only X and y, ignoring the previously provided examples. - **fit_kwargs: Keyword arguments to be passed to the fit method of the predictor. + Checks the known class labels by each learner, merges the labels and returns a mapping which maps the learner's + classes to the complete label list. 
""" - self._add_training_data(X, y) - if not only_new: - self._fit_to_known(bootstrap=bootstrap, **fit_kwargs) - else: - self._fit_on_new(X, y, bootstrap=bootstrap, **fit_kwargs) + # assemble the list of known classes from each learner + try: + # if estimators are fitted + known_classes = tuple(learner.estimator.classes_ for learner in self.learner_list) + except AttributeError: + # handle unfitted estimators + self.classes_ = None + self.n_classes_ = 0 + return + + self.classes_ = np.unique( + np.concatenate(known_classes, axis=0), + axis=0 + ) + self.n_classes_ = len(self.classes_) @abc.abstractmethod def predict(self, X: modALinput) -> Any: @@ -339,4 +283,4 @@ def predict(self, X: modALinput) -> Any: @abc.abstractmethod def vote(self, X: modALinput) -> Any: # TODO: clarify typing - pass \ No newline at end of file + pass diff --git a/modAL/models/learners.py b/modAL/models/learners.py index 1015fa7..3acdad5 100644 --- a/modAL/models/learners.py +++ b/modAL/models/learners.py @@ -5,13 +5,13 @@ from sklearn.base import BaseEstimator from sklearn.metrics import accuracy_score -from modAL.models.base import BaseLearner, BaseCommittee +from sklearn.utils import check_X_y +from modAL.models.base import BaseLearner from modAL.utils.validation import check_class_labels, check_class_proba from modAL.utils.data import modALinput, retrieve_rows from modAL.uncertainty import uncertainty_sampling from modAL.disagreement import vote_entropy_sampling, max_std_sampling from modAL.acquisition import max_EI -from modAL.dropout import mc_dropout """ Classes for active learning algorithms @@ -174,11 +174,6 @@ def score(self, X: modALinput, y: modALinput, **score_kwargs) -> Any: to go with tensors of different size (e.g. Transformers) we have to use this workaround. 
""" - if self.accept_different_dim: - prediction = self.estimator.infer(X) - criterion = self.estimator.criterion() - return criterion(prediction, y).item() - return self.estimator.score(X, y, **score_kwargs) def teach(self, X: modALinput, y: modALinput, bootstrap: bool = False, only_new: bool = False, **fit_kwargs) -> None: @@ -292,7 +287,7 @@ def teach(self, X: modALinput, y: modALinput, bootstrap: bool = False, **fit_kwa """ -class BayesianOptimizer(BaseLearner): +class BayesianOptimizer(ActiveLearner): """ This class is an abstract model of a Bayesian optimizer algorithm. @@ -478,32 +473,246 @@ def __init__(self, learner_list: List[ActiveLearner], query_strategy: Callable = on_transformed: bool = False) -> None: super().__init__(learner_list, query_strategy, on_transformed) self._set_classes() + # TODO: update training data when using fit() and teach() methods + self.X_training = None + + def _add_training_data(self, X: modALinput, y: modALinput) -> None: + """ + Adds the new data and label to the known data for each learner, but does not retrain the model. + + Args: + X: The new samples for which the labels are supplied by the expert. + y: Labels corresponding to the new instances in X. + + Note: + If the learners have been fitted, the features in X have to agree with the training samples which the + classifier has seen. + """ + for learner in self.learner_list: + learner._add_training_data(X, y) + + def _fit_to_known(self, bootstrap: bool = False, **fit_kwargs) -> None: + """ + Fits all learners to the training data and labels provided to it so far. + + Args: + bootstrap: If True, each estimator is trained on a bootstrapped dataset. Useful when + using bagging to build the ensemble. + **fit_kwargs: Keyword arguments to be passed to the fit method of the predictor. 
+ """ + for learner in self.learner_list: + learner._fit_to_known(bootstrap=bootstrap, **fit_kwargs) + + def fit(self, X: modALinput, y: modALinput, **fit_kwargs) -> None: + """ + Fits every learner to a subset sampled with replacement from X. Calling this method makes the learner forget the + data it has seen up until this point and replaces it with X! If you would like to perform bootstrapping on each + learner using the data it has seen, use the method .rebag()! + + Calling this method makes the learner forget the data it has seen up until this point and replaces it with X! + + Args: + X: The samples to be fitted on. + y: The corresponding labels. + **fit_kwargs: Keyword arguments to be passed to the fit method of the predictor. + """ + for learner in self.learner_list: + learner.fit(X, y, **fit_kwargs) + + self._set_classes() + + def rebag(self, **fit_kwargs) -> None: + """ + Refits every learner with a dataset bootstrapped from its training instances. Contrary to .bag(), it bootstraps + the training data for each learner based on its own examples. + + Todo: + Where is .bag()? + + Args: + **fit_kwargs: Keyword arguments to be passed to the fit method of the predictor. + """ + self._fit_to_known(bootstrap=True, **fit_kwargs) + + def teach(self, X: modALinput, y: modALinput, bootstrap: bool = False, only_new: bool = False, **fit_kwargs) -> None: + """ + Adds X and y to the known training data for each learner and retrains learners with the augmented dataset. + + Args: + X: The new samples for which the labels are supplied by the expert. + y: Labels corresponding to the new instances in X. + bootstrap: If True, trains each learner on a bootstrapped set. Useful when building the ensemble by bagging. + only_new: If True, the model is retrained using only X and y, ignoring the previously provided examples. + **fit_kwargs: Keyword arguments to be passed to the fit method of the predictor. 
+ """ + self._add_training_data(X, y) + if not only_new: + self._fit_to_known(bootstrap=bootstrap, **fit_kwargs) + else: + self._fit_on_new(X, y, bootstrap=bootstrap, **fit_kwargs) + self._set_classes() + + def predict(self, X: modALinput, **predict_proba_kwargs) -> Any: + """ + Predicts the class of the samples by picking the consensus prediction. + + Args: + X: The samples to be predicted. + **predict_proba_kwargs: Keyword arguments to be passed to the :meth:`predict_proba` of the Committee. + + Returns: + The predicted class labels for X. + """ + # getting average certainties + proba = self.predict_proba(X, **predict_proba_kwargs) + # finding the sample-wise max probability + max_proba_idx = np.argmax(proba, axis=1) + # translating label indices to labels + return self.classes_[max_proba_idx] + + def predict_proba(self, X: modALinput, **predict_proba_kwargs) -> Any: + """ + Consensus probabilities of the Committee. + + Args: + X: The samples for which the class probabilities are to be predicted. + **predict_proba_kwargs: Keyword arguments to be passed to the :meth:`predict_proba` of the Committee. + + Returns: + Class probabilities for X. + """ + return np.mean(self.vote_proba(X, **predict_proba_kwargs), axis=1) + + def score(self, X: modALinput, y: modALinput, sample_weight: List[float] = None) -> Any: + """ + Returns the mean accuracy on the given test data and labels. + + Todo: + Why accuracy? + + Args: + X: The samples to score. + y: Ground truth labels corresponding to X. + sample_weight: Sample weights. + + Returns: + Mean accuracy of the classifiers. + """ + y_pred = self.predict(X) + return accuracy_score(y, y_pred, sample_weight=sample_weight) - def _set_classes(self): + def vote(self, X: modALinput, **predict_kwargs) -> Any: """ - Checks the known class labels by each learner, merges the labels and returns a mapping which maps the learner's - classes to the complete label list. 
+ Predicts the labels for the supplied data for each learner in the Committee. + + Args: + X: The samples to cast votes. + **predict_kwargs: Keyword arguments to be passed to the :meth:`predict` of the learners. + + Returns: + The predicted class for each learner in the Committee and each sample in X. """ - # assemble the list of known classes from each learner - try: - # if estimators are fitted - known_classes = tuple(learner.estimator.classes_ for learner in self.learner_list) - except AttributeError: - # handle unfitted estimators - self.classes_ = None - self.n_classes_ = 0 - return + prediction = np.zeros(shape=(X.shape[0], len(self.learner_list))) + + for learner_idx, learner in enumerate(self.learner_list): + prediction[:, learner_idx] = learner.predict(X, **predict_kwargs) + + return prediction + + def vote_proba(self, X: modALinput, **predict_proba_kwargs) -> Any: + """ + Predicts the probabilities of the classes for each sample and each learner. + + Args: + X: The samples for which class probabilities are to be calculated. + **predict_proba_kwargs: Keyword arguments for the :meth:`predict_proba` of the learners. + + Returns: + Probabilities of each class for each learner and each instance. 
+ """ + + # get dimensions + n_samples = X.shape[0] + n_learners = len(self.learner_list) + proba = np.zeros(shape=(n_samples, n_learners, self.n_classes_)) + + # checking if the learners in the Committee know the same set of class labels + if check_class_labels(*[learner.estimator for learner in self.learner_list]): + # known class labels are the same for each learner + # probability prediction is straightforward + + for learner_idx, learner in enumerate(self.learner_list): + proba[:, learner_idx, :] = learner.predict_proba(X, **predict_proba_kwargs) + + else: + for learner_idx, learner in enumerate(self.learner_list): + proba[:, learner_idx, :] = check_class_proba( + proba=learner.predict_proba(X, **predict_proba_kwargs), + known_labels=learner.estimator.classes_, + all_labels=self.classes_ + ) + + return proba + + +class DeepCommittee(BaseCommittee): + """ + This class is an abstract model of a committee-based active learning algorithm. + + Args: + learner_list: A list of ActiveLearners forming the Committee. + query_strategy: Query strategy function. Committee supports disagreement-based query strategies from + :mod:`modAL.disagreement`, but uncertainty-based ones from :mod:`modAL.uncertainty` are also supported. + on_transformed: Whether to transform samples with the pipeline defined by each learner's estimator + when applying the query strategy. - self.classes_ = np.unique( - np.concatenate(known_classes, axis=0), - axis=0 - ) - self.n_classes_ = len(self.classes_) + Attributes: + classes_: Class labels known by the Committee. + n_classes_: Number of classes known by the Committee. 
- def _add_training_data(self, X: modALinput, y: modALinput): - super()._add_training_data(X, y) + Examples: - def fit(self, X: modALinput, y: modALinput, **fit_kwargs) -> 'BaseCommittee': + >>> from sklearn.datasets import load_iris + >>> from sklearn.neighbors import KNeighborsClassifier + >>> from sklearn.ensemble import RandomForestClassifier + >>> from modAL.models import ActiveLearner, Committee + >>> + >>> iris = load_iris() + >>> + >>> # initialize ActiveLearners + >>> learner_1 = ActiveLearner( + ... estimator=RandomForestClassifier(), + ... X_training=iris['data'][[0, 50, 100]], y_training=iris['target'][[0, 50, 100]] + ... ) + >>> learner_2 = ActiveLearner( + ... estimator=KNeighborsClassifier(n_neighbors=3), + ... X_training=iris['data'][[1, 51, 101]], y_training=iris['target'][[1, 51, 101]] + ... ) + >>> + >>> # initialize the Committee + >>> committee = Committee( + ... learner_list=[learner_1, learner_2] + ... ) + >>> + >>> # querying for labels + >>> query_idx, query_sample = committee.query(iris['data']) + >>> + >>> # ...obtaining new labels from the Oracle... + >>> + >>> # teaching newly labelled examples + >>> committee.teach( + ... X=iris['data'][query_idx].reshape(1, -1), + ... y=iris['target'][query_idx].reshape(1, ) + ... ) + """ + def __init__(self, learner_list: List[DeepActiveLearner], query_strategy: Callable = vote_entropy_sampling, + on_transformed: bool = False) -> None: + super().__init__(learner_list, query_strategy, on_transformed) + self._set_classes() + # TODO: update training data when using fit() and teach() methods + self.X_training = None + + def fit(self, X: modALinput, y: modALinput, **fit_kwargs) -> None: """ Fits every learner to a subset sampled with replacement from X. Calling this method makes the learner forget the data it has seen up until this point and replaces it with X! 
If you would like to perform bootstrapping on each @@ -516,10 +725,12 @@ def fit(self, X: modALinput, y: modALinput, **fit_kwargs) -> 'BaseCommittee': y: The corresponding labels. **fit_kwargs: Keyword arguments to be passed to the fit method of the predictor. """ - super().fit(X, y, **fit_kwargs) + for learner in self.learner_list: + learner.fit(X, y, **fit_kwargs) + self._set_classes() - def teach(self, X: modALinput, y: modALinput, bootstrap: bool = False, only_new: bool = False, **fit_kwargs) -> None: + def teach(self, X: modALinput, y: modALinput, bootstrap: bool = False, **fit_kwargs) -> None: """ Adds X and y to the known training data for each learner and retrains learners with the augmented dataset. @@ -530,7 +741,7 @@ def teach(self, X: modALinput, y: modALinput, bootstrap: bool = False, only_new: only_new: If True, the model is retrained using only X and y, ignoring the previously provided examples. **fit_kwargs: Keyword arguments to be passed to the fit method of the predictor. """ - super().teach(X, y, bootstrap=bootstrap, only_new=only_new, **fit_kwargs) + self._fit_on_new(X, y, bootstrap=bootstrap, **fit_kwargs) self._set_classes() def predict(self, X: modALinput, **predict_proba_kwargs) -> Any: @@ -566,6 +777,7 @@ def predict_proba(self, X: modALinput, **predict_proba_kwargs) -> Any: def score(self, X: modALinput, y: modALinput, sample_weight: List[float] = None) -> Any: """ + TODO test with deep learning ... I think it needs still adaption Returns the mean accuracy on the given test data and labels. 
Todo: From 43e980a9883961eaab59346d0f84f2b708ab5f20 Mon Sep 17 00:00:00 2001 From: maxkeller321 Date: Sun, 31 Jan 2021 20:32:20 +0100 Subject: [PATCH 111/182] small fixes, score does somehow not work anymore --- modAL/models/base.py | 2 +- modAL/models/learners.py | 10 +++++++--- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/modAL/models/base.py b/modAL/models/base.py index 78e8860..902e64a 100644 --- a/modAL/models/base.py +++ b/modAL/models/base.py @@ -15,7 +15,7 @@ import scipy.sparse as sp -from modAL.utils.data import data_vstack, data_hstack, modALinput, retrieve_rows +from modAL.utils.data import data_hstack, modALinput, retrieve_rows if sys.version_info >= (3, 4): ABC = abc.ABC diff --git a/modAL/models/learners.py b/modAL/models/learners.py index 3acdad5..44f9658 100644 --- a/modAL/models/learners.py +++ b/modAL/models/learners.py @@ -6,13 +6,16 @@ from sklearn.metrics import accuracy_score from sklearn.utils import check_X_y -from modAL.models.base import BaseLearner +from modAL.models.base import BaseLearner, BaseCommittee from modAL.utils.validation import check_class_labels, check_class_proba -from modAL.utils.data import modALinput, retrieve_rows +from modAL.utils.data import modALinput, retrieve_rows, data_vstack from modAL.uncertainty import uncertainty_sampling from modAL.disagreement import vote_entropy_sampling, max_std_sampling from modAL.acquisition import max_EI +from skorch.utils import to_numpy + + """ Classes for active learning algorithms -------------------------------------- @@ -234,6 +237,7 @@ def __init__(self, #TODO: Check if given query strategy works for Deep Learning super().__init__(estimator, query_strategy, bootstrap_init, on_transformed, **fit_kwargs) + self.estimator.initialize() def fit(self, X: modALinput, y: modALinput, bootstrap: bool = False, **fit_kwargs) -> 'BaseLearner': """ @@ -491,7 +495,7 @@ def _add_training_data(self, X: modALinput, y: modALinput) -> None: for learner in self.learner_list: 
learner._add_training_data(X, y) - def _fit_to_known(self, bootstrap: bool = False, **fit_kwargs) -> None: + def _fit_to_known(self, bootstrap: bool = False, **fit_kwargs) -> None: """ Fits all learners to the training data and labels provided to it so far. From 1245cb79cd3496650f0a76b6dedfa47c2c692bf4 Mon Sep 17 00:00:00 2001 From: maxkeller321 Date: Tue, 2 Feb 2021 16:00:35 +0100 Subject: [PATCH 112/182] Pytorch support (Drop_row) needs still an adaption --- modAL/utils/data.py | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/modAL/utils/data.py b/modAL/utils/data.py index a24e1c7..4b4f697 100644 --- a/modAL/utils/data.py +++ b/modAL/utils/data.py @@ -1,6 +1,7 @@ from typing import Union, List, Sequence import numpy as np +import torch import pandas as pd import scipy.sparse as sp @@ -26,6 +27,8 @@ def data_vstack(blocks: Sequence[modALinput]) -> modALinput: return np.concatenate(blocks) elif isinstance(blocks[0], list): return np.concatenate(blocks).tolist() + elif torch.is_tensor(blocks[0]): + return torch.cat(blocks) raise TypeError('%s datatype is not supported' % type(blocks[0])) @@ -48,6 +51,8 @@ def data_hstack(blocks: Sequence[modALinput]) -> modALinput: return np.hstack(blocks) elif isinstance(blocks[0], list): return np.hstack(blocks).tolist() + elif torch.is_tensor(blocks[0]): + return torch.cat(blocks, dim=1) TypeError('%s datatype is not supported' % type(blocks[0])) @@ -62,6 +67,8 @@ def add_row(X:modALinput, row: modALinput): """ if isinstance(X, np.ndarray): return np.vstack((X, row)) + elif torch.is_tensor(X): + return torch.cat((X, row)) elif isinstance(X, list): return np.vstack((X, row)).tolist() @@ -93,10 +100,12 @@ def retrieve_rows(X: modALinput, return X.tocsr()[I].asformat(sp_format) elif isinstance(X, pd.DataFrame): return X.iloc[I] - elif isinstance(X, np.ndarray): - return X[I] elif isinstance(X, list): return np.array(X)[I].tolist() + elif isinstance(X, np.ndarray): + return X[I] + elif 
torch.is_tensor(X): + return X[I] raise TypeError('%s datatype is not supported' % type(X)) @@ -104,6 +113,7 @@ def retrieve_rows(X: modALinput, def drop_rows(X: modALinput, I: Union[int, List[int], np.ndarray]) -> Union[sp.csc_matrix, np.ndarray, pd.DataFrame]: """ + TODO: Add pytorch support Returns X without the row(s) at index/indices I """ if sp.issparse(X): @@ -134,8 +144,8 @@ def enumerate_data(X: modALinput): return enumerate(X.tocsr()) elif isinstance(X, pd.DataFrame): return X.iterrows() - elif isinstance(X, np.ndarray) or isinstance(X, list): - # numpy arrays and lists can readily be enumerated + elif isinstance(X, np.ndarray) or isinstance(X, list) or torch.is_tensor(X): + # numpy arrays, torch tensors and lists can readily be enumerated return enumerate(X) raise TypeError('%s datatype is not supported' % type(X)) @@ -150,5 +160,7 @@ def data_shape(X: modALinput): return X.shape elif isinstance(X, list): return np.array(X).shape + elif torch.is_tensor(X): + return tuple(X.size()) raise TypeError('%s datatype is not supported' % type(X)) From 23d0eb8dc58437942cdbd8fbc6724a20de8822e4 Mon Sep 17 00:00:00 2001 From: maxkeller321 Date: Tue, 2 Feb 2021 16:22:19 +0100 Subject: [PATCH 113/182] Initial teaching problem fix --- modAL/models/learners.py | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/modAL/models/learners.py b/modAL/models/learners.py index 44f9658..7449e59 100644 --- a/modAL/models/learners.py +++ b/modAL/models/learners.py @@ -24,7 +24,7 @@ class ActiveLearner(BaseLearner): """ - This class is an abstract model of a general active learning algorithm. + This class is an abstract model of a general classic active learning algorithm. Args: estimator: The estimator to be used in the active learning loop. 
@@ -84,10 +84,13 @@ def __init__(self, ) -> None: super().__init__(estimator, query_strategy, bootstrap_init, on_transformed, **fit_kwargs) - + self.X_training = X_training self.y_training = y_training + if X_training is not None: + self._fit_to_known(bootstrap=bootstrap_init, **fit_kwargs) + def _add_training_data(self, X: modALinput, y: modALinput) -> None: """ Adds the new data and label to the known data, but does not retrain the model. @@ -209,8 +212,6 @@ class DeepActiveLearner(BaseLearner): estimator: The estimator to be used in the active learning loop. query_strategy: Function providing the query strategy for the active learning loop, for instance, modAL.uncertainty.uncertainty_sampling. - X_training: Initial training samples, if available. - y_training: Initial training labels corresponding to initial training samples. bootstrap_init: If initial training data is available, bootstrapping can be done during the first training. Useful when building Committee models with bagging. on_transformed: Whether to transform samples with the pipeline defined by the estimator @@ -220,16 +221,11 @@ class DeepActiveLearner(BaseLearner): Attributes: estimator: The estimator to be used in the active learning loop. query_strategy: Function providing the query strategy for the active learning loop. - X_training: If the model hasn't been fitted yet it is None, otherwise it contains the samples - which the model has been trained on. If provided, the method fit() of estimator is called during __init__() - y_training: The labels corresponding to X_training. 
""" def __init__(self, estimator: BaseEstimator, query_strategy: Callable = uncertainty_sampling, - X_training: Optional[modALinput] = None, - y_training: Optional[modALinput] = None, bootstrap_init: bool = False, on_transformed: bool = False, **fit_kwargs @@ -237,7 +233,8 @@ def __init__(self, #TODO: Check if given query strategy works for Deep Learning super().__init__(estimator, query_strategy, bootstrap_init, on_transformed, **fit_kwargs) - self.estimator.initialize() + + self.estimator.initialize() # does maybe just work with pytorch def fit(self, X: modALinput, y: modALinput, bootstrap: bool = False, **fit_kwargs) -> 'BaseLearner': """ From 85530e79f8e960c1d8c490f590ca200bbf83bd08 Mon Sep 17 00:00:00 2001 From: maxkeller321 Date: Tue, 2 Feb 2021 16:49:48 +0100 Subject: [PATCH 114/182] Small polishing of Dropout --- modAL/dropout.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/modAL/dropout.py b/modAL/dropout.py index fb0da65..bca15be 100644 --- a/modAL/dropout.py +++ b/modAL/dropout.py @@ -114,9 +114,6 @@ def entropy_sum(values, axis=-1): return np.sum(entr(values), axis=axis) def _bald_divergence(proba) -> np.ndarray: - accumulated_score = np.zeros(shape=proba[0].shape) - accumulated_entropy = np.zeros(shape=(proba[0].shape[0])) - #create 3D or 4D array from prediction dim: (drop_cycles, proba.shape[0], proba.shape[1], opt:proba.shape[2]) proba_stacked = np.stack(proba, axis=len(proba[0].shape)) @@ -134,7 +131,7 @@ def _bald_divergence(proba) -> np.ndarray: g_x = entropy_sum(average_score, axis=-1) #entropy differences - diff = g_x - f_x + diff = np.subtract(g_x, f_x) #sum all dimensions of diff besides first dim (instances) shaped = np.reshape(diff, (diff.shape[0], -1)) From aaf236f4b639cdd7d806981ef7d599ae9872f3fe Mon Sep 17 00:00:00 2001 From: maxkeller321 Date: Tue, 9 Feb 2021 09:05:20 +0100 Subject: [PATCH 115/182] Metric uppassig documentation adaption --- modAL/acquisition.py | 12 +++++++++--- modAL/batch.py | 7 ++++++- 
modAL/disagreement.py | 20 ++++++++++++-------- modAL/dropout.py | 1 + modAL/expected_error.py | 1 + modAL/multilabel.py | 32 ++++++++++++++++++++------------ modAL/uncertainty.py | 12 ++++++------ 7 files changed, 55 insertions(+), 30 deletions(-) diff --git a/modAL/acquisition.py b/modAL/acquisition.py index 54fd0ab..4d2be85 100644 --- a/modAL/acquisition.py +++ b/modAL/acquisition.py @@ -115,7 +115,9 @@ def max_PI(optimizer: BaseLearner, X: modALinput, tradeoff: float = 0, n_instances: Number of samples to be queried. Returns: - The indices of the instances from X chosen to be labelled; the instances from X chosen to be labelled. + The indices of the instances from X chosen to be labelled. + The pi metric of the chosen instances. + """ pi = optimizer_PI(optimizer, X, tradeoff=tradeoff) return multi_argmax(pi, n_instances=n_instances) @@ -133,7 +135,9 @@ def max_EI(optimizer: BaseLearner, X: modALinput, tradeoff: float = 0, n_instances: Number of samples to be queried. Returns: - The indices of the instances from X chosen to be labelled; the instances from X chosen to be labelled. + The indices of the instances from X chosen to be labelled. + The ei metric of the chosen instances. + """ ei = optimizer_EI(optimizer, X, tradeoff=tradeoff) return multi_argmax(ei, n_instances=n_instances) @@ -151,7 +155,9 @@ def max_UCB(optimizer: BaseLearner, X: modALinput, beta: float = 1, n_instances: Number of samples to be queried. Returns: - The indices of the instances from X chosen to be labelled; the instances from X chosen to be labelled. + The indices of the instances from X chosen to be labelled. + The ucb metric of the chosen instances. 
+ """ ucb = optimizer_UCB(optimizer, X, beta=beta) return multi_argmax(ucb, n_instances=n_instances) diff --git a/modAL/batch.py b/modAL/batch.py index d0121a7..39488c4 100644 --- a/modAL/batch.py +++ b/modAL/batch.py @@ -139,6 +139,8 @@ def ranked_batch(classifier: Union[BaseLearner, BaseCommittee], Returns: The indices of the top n_instances ranked unlabelled samples. + The uncertainty scores of the chosen instances. + """ # Make a local copy of our classifier's training data. # Define our record container and record the best cold start instance in the case of cold start. @@ -210,7 +212,10 @@ def uncertainty_batch_sampling(classifier: Union[BaseLearner, BaseCommittee], **uncertainty_measure_kwargs: Keyword arguments to be passed for the :meth:`predict_proba` of the classifier. Returns: - Indices of the instances from `X` chosen to be labelled; records from `X` chosen to be labelled. + Indices of the instances from `X` chosen to be labelled + Records from `X` chosen to be labelled. + The uncertainty scores of the chosen instances. + """ uncertainty = classifier_uncertainty(classifier, X, **uncertainty_measure_kwargs) return ranked_batch(classifier, unlabeled=X, uncertainty_scores=uncertainty, diff --git a/modAL/disagreement.py b/modAL/disagreement.py index 66fad5d..7789135 100644 --- a/modAL/disagreement.py +++ b/modAL/disagreement.py @@ -116,8 +116,9 @@ def vote_entropy_sampling(committee: BaseCommittee, X: modALinput, measure function. Returns: - The indices of the instances from X chosen to be labelled; - the instances from X chosen to be labelled. + The indices of the instances from X chosen to be labelled. + The disagrerment metric of the chosen instances. + """ disagreement = vote_entropy(committee, X, **disagreement_measure_kwargs) @@ -143,8 +144,9 @@ def consensus_entropy_sampling(committee: BaseCommittee, X: modALinput, measure function. Returns: - The indices of the instances from X chosen to be labelled; - the instances from X chosen to be labelled. 
+ The indices of the instances from X chosen to be labelled. + The disagrerment metric of the chosen instances. + """ disagreement = consensus_entropy(committee, X, **disagreement_measure_kwargs) @@ -170,8 +172,9 @@ def max_disagreement_sampling(committee: BaseCommittee, X: modALinput, measure function. Returns: - The indices of the instances from X chosen to be labelled; - the instances from X chosen to be labelled. + The indices of the instances from X chosen to be labelled. + The disagrerment metric of the chosen instances. + """ disagreement = KL_max_disagreement(committee, X, **disagreement_measure_kwargs) @@ -196,8 +199,9 @@ def max_std_sampling(regressor: BaseEstimator, X: modALinput, **predict_kwargs: Keyword arguments to be passed to :meth:`predict` of the CommiteeRegressor. Returns: - The indices of the instances from X chosen to be labelled; - the instances from X chosen to be labelled. + The indices of the instances from X chosen to be labelled. + The standard deviation of the chosen instances. + """ _, std = regressor.predict(X, return_std=True, **predict_kwargs) std = std.reshape(X.shape[0], ) diff --git a/modAL/dropout.py b/modAL/dropout.py index bca15be..37c3056 100644 --- a/modAL/dropout.py +++ b/modAL/dropout.py @@ -67,6 +67,7 @@ def mc_dropout(classifier: BaseEstimator, X: modALinput, n_instances: int = 1, Returns: The indices of the instances from X chosen to be labelled; + The mc-dropout metric of the chosen instances; """ # set dropout layers to train mode diff --git a/modAL/expected_error.py b/modAL/expected_error.py index 74469f9..d6ebc3c 100644 --- a/modAL/expected_error.py +++ b/modAL/expected_error.py @@ -39,6 +39,7 @@ def expected_error_reduction(learner: ActiveLearner, X: modALinput, loss: str = Returns: The indices of the instances from X chosen to be labelled. 
+ The expected error metric of the chosen instances; """ assert 0.0 <= p_subsample <= 1.0, 'p_subsample subsampling keep ratio must be between 0.0 and 1.0' diff --git a/modAL/multilabel.py b/modAL/multilabel.py index b483d0d..a0f009d 100644 --- a/modAL/multilabel.py +++ b/modAL/multilabel.py @@ -58,7 +58,8 @@ def SVM_binary_minimum(classifier: ActiveLearner, X_pool: modALinput, Returns: The index of the instance from X_pool chosen to be labelled; - the instance from X_pool chosen to be labelled. + The instance from X_pool chosen to be labelled. + The Minimum absolute distance metric of the chosen instance; """ decision_function = np.array([svm.decision_function(X_pool) @@ -92,7 +93,9 @@ def max_loss(classifier: OneVsRestClassifier, X_pool: modALinput, Returns: The index of the instance from X_pool chosen to be labelled; - the instance from X_pool chosen to be labelled. + The instance from X_pool chosen to be labelled. + The SVM-loss-max metric of the chosen instances; + """ assert len(X_pool) >= n_instances, 'n_instances cannot be larger than len(X_pool)' @@ -124,8 +127,9 @@ def mean_max_loss(classifier: OneVsRestClassifier, X_pool: modALinput, can be used to break the tie when the highest utility score is not unique. Returns: - The index of the instance from X_pool chosen to be labelled; - the instance from X_pool chosen to be labelled. + The index of the instance from X_pool chosen to be labelled. + The SVM-loss metric of the chosen instances. + """ assert len(X_pool) >= n_instances, 'n_instances cannot be larger than len(X_pool)' @@ -153,8 +157,9 @@ def min_confidence(classifier: OneVsRestClassifier, X_pool: modALinput, can be used to break the tie when the highest utility score is not unique. Returns: - The index of the instance from X_pool chosen to be labelled; - the instance from X_pool chosen to be labelled. + The index of the instance from X_pool chosen to be labelled. + The minimal confidence metric of the chosen instance. 
+ """ classwise_confidence = classifier.predict_proba(X_pool) @@ -182,8 +187,9 @@ def avg_confidence(classifier: OneVsRestClassifier, X_pool: modALinput, can be used to break the tie when the highest utility score is not unique. Returns: - The index of the instance from X_pool chosen to be labelled; - the instance from X_pool chosen to be labelled. + The index of the instance from X_pool chosen to be labelled. + The average confidence metric of the chosen instances. + """ classwise_confidence = classifier.predict_proba(X_pool) @@ -211,8 +217,9 @@ def max_score(classifier: OneVsRestClassifier, X_pool: modALinput, can be used to break the tie when the highest utility score is not unique. Returns: - The index of the instance from X_pool chosen to be labelled; - the instance from X_pool chosen to be labelled. + The index of the instance from X_pool chosen to be labelled. + The classwise maximum metric of the chosen instances. + """ classwise_confidence = classifier.predict_proba(X_pool) @@ -242,8 +249,9 @@ def avg_score(classifier: OneVsRestClassifier, X_pool: modALinput, can be used to break the tie when the highest utility score is not unique. Returns: - The index of the instance from X_pool chosen to be labelled; - the instance from X_pool chosen to be labelled. + The index of the instance from X_pool chosen to be labelled. + The classwise mean metric of the chosen instances. + """ classwise_confidence = classifier.predict_proba(X_pool) diff --git a/modAL/uncertainty.py b/modAL/uncertainty.py index 88a8c9f..815fba8 100644 --- a/modAL/uncertainty.py +++ b/modAL/uncertainty.py @@ -146,8 +146,8 @@ def uncertainty_sampling(classifier: BaseEstimator, X: modALinput, measure function. Returns: - The indices of the instances from X chosen to be labelled; - the instances from X chosen to be labelled. + The indices of the instances from X chosen to be labelled. + The uncertainty metric of the chosen instances. 
""" uncertainty = classifier_uncertainty(classifier, X, **uncertainty_measure_kwargs) @@ -172,8 +172,8 @@ def margin_sampling(classifier: BaseEstimator, X: modALinput, **uncertainty_measure_kwargs: Keyword arguments to be passed for the uncertainty measure function. Returns: - The indices of the instances from X chosen to be labelled; - the instances from X chosen to be labelled. + The indices of the instances from X chosen to be labelled. + The margin metric of the chosen instances. """ margin = classifier_margin(classifier, X, **uncertainty_measure_kwargs) @@ -200,8 +200,8 @@ def entropy_sampling(classifier: BaseEstimator, X: modALinput, measure function. Returns: - The indices of the instances from X chosen to be labelled; - the instances from X chosen to be labelled. + The indices of the instances from X chosen to be labelled. + The entropy metric of the chosen instances. """ entropy = classifier_entropy(classifier, X, **uncertainty_measure_kwargs) From 20906ba9f0d685d5832d6c58fb1c4f7bca0a1c44 Mon Sep 17 00:00:00 2001 From: maxkeller321 Date: Tue, 9 Feb 2021 09:50:10 +0100 Subject: [PATCH 116/182] Issue #2 fix --- modAL/utils/selection.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modAL/utils/selection.py b/modAL/utils/selection.py index 01f5d83..5345915 100644 --- a/modAL/utils/selection.py +++ b/modAL/utils/selection.py @@ -34,7 +34,7 @@ def shuffled_argmax(values: np.ndarray, n_instances: int = 1, return_negative=Fa if return_negative == True: values = -values - return query_idx, values[max_idx] + return query_idx, values[query_idx] def multi_argmax(values: np.ndarray, n_instances: int = 1, return_negative=False) -> np.ndarray: From 1d5288d630cd6361a08b287fc772f2dce562099d Mon Sep 17 00:00:00 2001 From: Joe Cummings Date: Wed, 10 Feb 2021 13:33:50 -0500 Subject: [PATCH 117/182] Update Installation.rst `modAL/models/base.py` imports `_BaseHeterogeneousEnsemble` from `sklearn.ensemble._base`. 
The first stable release in which the change from `sklearn.ensemble.base` to the protected `sklearn.ensemble._base` occurred appears to be `v0.22`. See https://scikit-learn.org/stable/whats_new/v0.22.html#clear-definition-of-the-public-api. --- docs/source/content/overview/Installation.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/content/overview/Installation.rst b/docs/source/content/overview/Installation.rst index 16a209a..c44093a 100644 --- a/docs/source/content/overview/Installation.rst +++ b/docs/source/content/overview/Installation.rst @@ -5,7 +5,7 @@ modAL requires * Python >= 3.5 * NumPy >= 1.13 * SciPy >= 0.18 - * scikit-learn >= 0.18 + * scikit-learn >= 0.22 You can install modAL directly with pip: From 83cb7269be6c4ee4c93bb2a255b3f5f5c25e4157 Mon Sep 17 00:00:00 2001 From: maxkeller321 Date: Fri, 12 Feb 2021 17:29:18 +0100 Subject: [PATCH 118/182] Documentation adaption and mean standard deviation query strategy --- .gitignore | 1 + modAL/dropout.py | 91 ++++++++++++++++++++++++++++++++++++++++++++---- 2 files changed, 86 insertions(+), 6 deletions(-) diff --git a/.gitignore b/.gitignore index 0134267..ab971ff 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,5 @@ *pyc +*.vscode .idea .ipynb_checkpoints *~ diff --git a/modAL/dropout.py b/modAL/dropout.py index 37c3056..4b292d6 100644 --- a/modAL/dropout.py +++ b/modAL/dropout.py @@ -37,13 +37,13 @@ def KL_divergence(classifier: BaseEstimator, X: modALinput, n_instances: int = 1 return shuffled_argmax(KL_divergence, n_instances=n_instances) -def mc_dropout(classifier: BaseEstimator, X: modALinput, n_instances: int = 1, +def mc_dropout_bald(classifier: BaseEstimator, X: modALinput, n_instances: int = 1, random_tie_break: bool = False, dropout_layer_indexes: list = [], num_cycles : int = 50, **mc_dropout_kwargs) -> np.ndarray: """ - Mc-Dropout query strategy. Selects the instance with the largest change in their - values by multiple forward passes with enabled dropout. 
Change/ Disagrement is - the calculated BALD (Bayesian Active Learning by Disagreement) score. + Mc-Dropout bald query strategy. Returns the indexes of the instances with the largest BALD + (Bayesian Active Learning by Disagreement) score calculated through the dropout cycles + and the corresponding bald score. Based on the work of: Deep Bayesian Active Learning with Image Data. @@ -86,6 +86,51 @@ def mc_dropout(classifier: BaseEstimator, X: modALinput, n_instances: int = 1, return shuffled_argmax(bald_scores, n_instances=n_instances) + +def mc_dropout_mean_st(classifier: BaseEstimator, X: modALinput, n_instances: int = 1, + random_tie_break: bool = False, dropout_layer_indexes: list = [], + num_cycles : int = 50, **mc_dropout_kwargs) -> np.ndarray: + """ + Mc-Dropout mean standard deviation query strategy. Returns the indexes of the instances + with the largest mean of the per class calculated standard deviations over multiple dropout cycles + and the corresponding metric. + + Based on the equations of: + Deep Bayesian Active Learning with Image Data. + (Yarin Gal, Riashat Islam, and Zoubin Ghahramani. 2017.) + + Args: + classifier: The classifier for which the labels are to be queried. + X: The pool of samples to query from. + n_instances: Number of samples to be queried. + random_tie_break: If True, shuffles utility scores to randomize the order. This + can be used to break the tie when the highest utility score is not unique. + dropout_layer_indexes: Indexes of the dropout layers which should be activated + Choose indices from : list(torch_model.modules()) + num_cycles: Number of forward passes with activated dropout + **uncertainty_measure_kwargs: Keyword arguments to be passed for the uncertainty + measure function. 
+ + Returns: + The indices of the instances from X chosen to be labelled; + The mc-dropout metric of the chosen instances; + """ + + # set dropout layers to train mode + set_dropout_mode(classifier.estimator.module_, dropout_layer_indexes, train_mode=True) + + predictions = get_predictions(classifier, X, num_cycles) + + # set dropout layers to eval + set_dropout_mode(classifier.estimator.module_, dropout_layer_indexes, train_mode=False) + + mean_standard_deviations = _mean_standard_deviation(predictions) + + if not random_tie_break: + return multi_argmax(mean_standard_deviations, n_instances=n_instances) + + return shuffled_argmax(mean_standard_deviations, n_instances=n_instances) + def get_predictions(classifier: BaseEstimator, X: modALinput, num_predictions: int = 50): """ Runs num_predictions times the prediction of the classifier on the input X @@ -114,8 +159,42 @@ def entropy_sum(values, axis=-1): #sum Scipy basic entropy function: entr() return np.sum(entr(values), axis=axis) -def _bald_divergence(proba) -> np.ndarray: - #create 3D or 4D array from prediction dim: (drop_cycles, proba.shape[0], proba.shape[1], opt:proba.shape[2]) +def _mean_standard_deviation(proba: list) -> np.ndarray: + """ + Calculates the mean of the per class calculated standard deviations. + + As it is explicitly formulated in: + Deep Bayesian Active Learning with Image Data. + (Yarin Gal, Riashat Islam, and Zoubin Ghahramani. 2017.) + + Args: + proba: list with the predictions over the dropout cycles + Return: + Returns the mean standard deviation of the dropout cycles over all classes. 
+ """ + + proba_stacked = np.stack(proba, axis=len(proba[0].shape)) + mean_squared = np.mean(proba_stacked, axis=-1)**2 + squared_mean = np.mean(proba_stacked**2, axis=-1) + standard_deviation_class_vise = np.sqrt(squared_mean - mean_squared) + mean_standard_deviation = np.mean(standard_deviation_class_vise, axis=-1) + + return mean_standard_deviation + + +def _bald_divergence(proba: list) -> np.ndarray: + """ + Calculates the bald divergence for each instance + + As it is explicitly formulated in: + Deep Bayesian Active Learning with Image Data. + (Yarin Gal, Riashat Islam, and Zoubin Ghahramani. 2017.) + + Args: + proba: list with the predictions over the dropout cycles + Return: + Returns the mean standard deviation of the dropout cycles over all classes. + """ proba_stacked = np.stack(proba, axis=len(proba[0].shape)) #entropy along dropout cycles From 881496d7a7b8942baee4d4a17598b3c34cc7ef0b Mon Sep 17 00:00:00 2001 From: Stefan Ott Date: Tue, 16 Feb 2021 15:36:19 +0100 Subject: [PATCH 119/182] Updated deep learning code comments --- modAL/models/learners.py | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/modAL/models/learners.py b/modAL/models/learners.py index 7449e59..2e0215f 100644 --- a/modAL/models/learners.py +++ b/modAL/models/learners.py @@ -24,7 +24,7 @@ class ActiveLearner(BaseLearner): """ - This class is an abstract model of a general classic active learning algorithm. + This class is an model of a general classic (machine learning) active learning algorithm. Args: estimator: The estimator to be used in the active learning loop. @@ -173,13 +173,6 @@ def score(self, X: modALinput, y: modALinput, **score_kwargs) -> Any: Returns: The score of the predictor. """ - - """ - sklearn does only accept tensors of different dim for X and Y, if we use - Multilabel classifiaction. If we do not want to do this but we still want - to go with tensors of different size (e.g. Transformers) we have to use this - workaround. 
- """ return self.estimator.score(X, y, **score_kwargs) def teach(self, X: modALinput, y: modALinput, bootstrap: bool = False, only_new: bool = False, **fit_kwargs) -> None: @@ -206,7 +199,11 @@ def teach(self, X: modALinput, y: modALinput, bootstrap: bool = False, only_new: class DeepActiveLearner(BaseLearner): """ - This class is an abstract model of a general active learning algorithm. + This class is an model of a general deep active learning algorithm. + Differences to the classical ActiveLearner are: + - Data is no member variable of the DeepActiveLearner class + - Misses the initial add/train data methods, therefore always trains on new data + - Uses different interfaces to sklearn in some functions Args: estimator: The estimator to be used in the active learning loop. @@ -265,6 +262,11 @@ def score(self, X: modALinput, y: modALinput) -> Any: The score of the predictor. """ + """ + sklearn does only accept tensors of different dim for X and Y, if we use + Multilabel classifiaction. Using tensors of different sizes for more complex models (e.g. Transformers) + requires to bypass the sklearn checks by directly calling the NeuralNets infer() function. + """ prediction = self.estimator.infer(X) criterion = self.estimator.criterion() return criterion(prediction, y).item() @@ -658,7 +660,7 @@ def vote_proba(self, X: modALinput, **predict_proba_kwargs) -> Any: class DeepCommittee(BaseCommittee): """ - This class is an abstract model of a committee-based active learning algorithm. + This class is for committee-based deep active learner algorithms. Args: learner_list: A list of ActiveLearners forming the Committee. 
From 33b33e54a2901ea6ecb063f19d0d94eae9228dac Mon Sep 17 00:00:00 2001 From: maxkeller321 Date: Sat, 20 Feb 2021 10:26:43 +0100 Subject: [PATCH 120/182] Bootstrap init removed from base.py leand DeepAktive learner --- modAL/models/base.py | 3 --- modAL/models/learners.py | 9 ++------- 2 files changed, 2 insertions(+), 10 deletions(-) diff --git a/modAL/models/base.py b/modAL/models/base.py index 902e64a..b925608 100644 --- a/modAL/models/base.py +++ b/modAL/models/base.py @@ -33,8 +33,6 @@ class BaseLearner(ABC, BaseEstimator): for instance, modAL.uncertainty.uncertainty_sampling. force_all_finite: When True, forces all values of the data finite. When False, accepts np.nan and np.inf values. - bootstrap_init: If initial training data is available, bootstrapping can be done during the first training. - Useful when building Committee models with bagging. on_transformed: Whether to transform samples with the pipeline defined by the estimator when applying the query strategy. **fit_kwargs: keyword arguments. @@ -46,7 +44,6 @@ class BaseLearner(ABC, BaseEstimator): def __init__(self, estimator: BaseEstimator, query_strategy: Callable, - bootstrap_init: bool = False, on_transformed: bool = False, force_all_finite: bool = True, **fit_kwargs diff --git a/modAL/models/learners.py b/modAL/models/learners.py index 2e0215f..26a51a9 100644 --- a/modAL/models/learners.py +++ b/modAL/models/learners.py @@ -82,8 +82,7 @@ def __init__(self, on_transformed: bool = False, **fit_kwargs ) -> None: - super().__init__(estimator, query_strategy, - bootstrap_init, on_transformed, **fit_kwargs) + super().__init__(estimator, query_strategy, on_transformed, **fit_kwargs) self.X_training = X_training self.y_training = y_training @@ -209,8 +208,6 @@ class DeepActiveLearner(BaseLearner): estimator: The estimator to be used in the active learning loop. query_strategy: Function providing the query strategy for the active learning loop, for instance, modAL.uncertainty.uncertainty_sampling. 
- bootstrap_init: If initial training data is available, bootstrapping can be done during the first training. - Useful when building Committee models with bagging. on_transformed: Whether to transform samples with the pipeline defined by the estimator when applying the query strategy. **fit_kwargs: keyword arguments. @@ -223,13 +220,11 @@ class DeepActiveLearner(BaseLearner): def __init__(self, estimator: BaseEstimator, query_strategy: Callable = uncertainty_sampling, - bootstrap_init: bool = False, on_transformed: bool = False, **fit_kwargs ) -> None: #TODO: Check if given query strategy works for Deep Learning - super().__init__(estimator, query_strategy, - bootstrap_init, on_transformed, **fit_kwargs) + super().__init__(estimator, query_strategy, on_transformed, **fit_kwargs) self.estimator.initialize() # does maybe just work with pytorch From 4f3341990ee836cae7b2cc84a982cdf9244f3706 Mon Sep 17 00:00:00 2001 From: maxkeller321 Date: Sat, 20 Feb 2021 10:58:07 +0100 Subject: [PATCH 121/182] multiargmin and multiargmax adaption instead of return_negative --- modAL/expected_error.py | 6 +++--- modAL/multilabel.py | 6 +++--- modAL/uncertainty.py | 6 +++--- modAL/utils/selection.py | 43 +++++++++++++++++++++++++++++++--------- 4 files changed, 43 insertions(+), 18 deletions(-) diff --git a/modAL/expected_error.py b/modAL/expected_error.py index d6ebc3c..01b139c 100644 --- a/modAL/expected_error.py +++ b/modAL/expected_error.py @@ -11,7 +11,7 @@ from modAL.models import ActiveLearner from modAL.utils.data import modALinput, data_vstack, enumerate_data, drop_rows, data_shape, add_row -from modAL.utils.selection import multi_argmax, shuffled_argmax +from modAL.utils.selection import multi_argmax, multi_argmin, shuffled_argmax, shuffled_argmin from modAL.uncertainty import _proba_uncertainty, _proba_entropy @@ -78,6 +78,6 @@ def expected_error_reduction(learner: ActiveLearner, X: modALinput, loss: str = expected_error[x_idx] = np.inf if not random_tie_break: - return 
multi_argmax(-expected_error, n_instances, return_negative=True) + return multi_argmin(expected_error, n_instances) - return shuffled_argmax(-expected_error, n_instances, return_negative=True) + return shuffled_argmin(expected_error, n_instances) diff --git a/modAL/multilabel.py b/modAL/multilabel.py index a0f009d..186ed08 100644 --- a/modAL/multilabel.py +++ b/modAL/multilabel.py @@ -5,7 +5,7 @@ from modAL.models import ActiveLearner from modAL.utils.data import modALinput -from modAL.utils.selection import multi_argmax, shuffled_argmax +from modAL.utils.selection import multi_argmax, multi_argmin, shuffled_argmax, shuffled_argmin from typing import Tuple, Optional from itertools import combinations @@ -166,9 +166,9 @@ def min_confidence(classifier: OneVsRestClassifier, X_pool: modALinput, classwise_min = np.min(classwise_confidence, axis=1) if not random_tie_break: - return multi_argmax(-classwise_min, n_instances, return_negative=True) + return multi_argmin(classwise_min, n_instances) - return shuffled_argmax(-classwise_min, n_instances, return_negative=True) + return shuffled_argmin(classwise_min, n_instances) def avg_confidence(classifier: OneVsRestClassifier, X_pool: modALinput, diff --git a/modAL/uncertainty.py b/modAL/uncertainty.py index 815fba8..f41206c 100644 --- a/modAL/uncertainty.py +++ b/modAL/uncertainty.py @@ -9,7 +9,7 @@ from sklearn.base import BaseEstimator from modAL.utils.data import modALinput -from modAL.utils.selection import multi_argmax, shuffled_argmax +from modAL.utils.selection import multi_argmax, multi_argmin, shuffled_argmax, shuffled_argmin def _proba_uncertainty(proba: np.ndarray) -> np.ndarray: @@ -178,9 +178,9 @@ def margin_sampling(classifier: BaseEstimator, X: modALinput, margin = classifier_margin(classifier, X, **uncertainty_measure_kwargs) if not random_tie_break: - return multi_argmax(-margin, n_instances=n_instances, return_negative=True) + return multi_argmin(margin, n_instances=n_instances) - return 
shuffled_argmax(-margin, n_instances=n_instances, return_negative=True) + return shuffled_argmin(margin, n_instances=n_instances) def entropy_sampling(classifier: BaseEstimator, X: modALinput, diff --git a/modAL/utils/selection.py b/modAL/utils/selection.py index 5345915..537c082 100644 --- a/modAL/utils/selection.py +++ b/modAL/utils/selection.py @@ -5,7 +5,7 @@ import numpy as np -def shuffled_argmax(values: np.ndarray, n_instances: int = 1, return_negative=False) -> np.ndarray: +def shuffled_argmax(values: np.ndarray, n_instances: int = 1) -> np.ndarray: """ Shuffles the values and sorts them afterwards. This can be used to break the tie when the highest utility score is not unique. The shuffle randomizes @@ -31,20 +31,34 @@ def shuffled_argmax(values: np.ndarray, n_instances: int = 1, return_negative=Fa # inverting the shuffle query_idx = shuffled_idx[sorted_query_idx] - if return_negative == True: - values = -values - return query_idx, values[query_idx] -def multi_argmax(values: np.ndarray, n_instances: int = 1, return_negative=False) -> np.ndarray: +def shuffled_argmin(values: np.ndarray, n_instances: int = 1) -> np.ndarray: + """ + Shuffles the values and sorts them afterwards. This can be used to break + the tie when the highest utility score is not unique. The shuffle randomizes + order, which is preserved by the mergesort algorithm. + + Args: + values: Contains the values to be selected from. + n_instances: Specifies how many indices and values to return. + Returns: + The indices and values of the n_instances smallest values. + """ + + indexes, index_values = shuffled_argmax(-values, n_instances) + + return indexes, -index_values + + +def multi_argmax(values: np.ndarray, n_instances: int = 1) -> np.ndarray: """ return the indices and values of the n_instances highest values. Args: values: Contains the values to be selected from. n_instances: Specifies how many indices and values to return. 
- return_negative: if true: returns negative values Returns: The indices and values of the n_instances largest values. """ @@ -52,12 +66,23 @@ def multi_argmax(values: np.ndarray, n_instances: int = 1, return_negative=False max_idx = np.argpartition(-values, n_instances-1, axis=0)[:n_instances] - if return_negative == True: - values = -values - return max_idx, values[max_idx] +def multi_argmin(values: np.ndarray, n_instances: int = 1) -> np.ndarray: + """ + return the indices and values of the n_instances smallest values. + + Args: + values: Contains the values to be selected from. + n_instances: Specifies how many indices and values to return. + Returns: + The indices and values of the n_instances smallest values. + """ + indexes, index_values = multi_argmax(-values, n_instances) + return indexes, -index_values + + def weighted_random(weights: np.ndarray, n_instances: int = 1) -> np.ndarray: """ Returns n_instances indices based on the weights. From 351080a2afdce046c46af0cd0cecd2447b80af60 Mon Sep 17 00:00:00 2001 From: maxkeller321 Date: Sun, 21 Feb 2021 15:33:11 +0100 Subject: [PATCH 122/182] Softmax Dropout support --- modAL/dropout.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/modAL/dropout.py b/modAL/dropout.py index 4b292d6..334e99c 100644 --- a/modAL/dropout.py +++ b/modAL/dropout.py @@ -13,6 +13,7 @@ from skorch.utils import to_numpy + logging.basicConfig(stream=sys.stdout, level=logging.DEBUG) def KL_divergence(classifier: BaseEstimator, X: modALinput, n_instances: int = 1, @@ -151,7 +152,8 @@ def get_predictions(classifier: BaseEstimator, X: modALinput, num_predictions: i #In comparison to: predict(), predict_proba() the infer() # does not change train/eval mode of other layers prediction = classifier.estimator.infer(X) - predictions.append(to_numpy(prediction)) + prediction_proba = to_numpy(prediction.softmax(1)) + predictions.append(prediction_proba) return predictions From 8beed50bdc4cbce389eebde253f244878c7b3a9a Mon Sep 17 
00:00:00 2001 From: Stefan Ott Date: Sun, 21 Feb 2021 18:10:56 +0100 Subject: [PATCH 123/182] Added max entropy query strategy --- modAL/dropout.py | 63 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 63 insertions(+) diff --git a/modAL/dropout.py b/modAL/dropout.py index 4b292d6..2b0f704 100644 --- a/modAL/dropout.py +++ b/modAL/dropout.py @@ -131,6 +131,51 @@ def mc_dropout_mean_st(classifier: BaseEstimator, X: modALinput, n_instances: in return shuffled_argmax(mean_standard_deviations, n_instances=n_instances) +def mc_dropout_max_entropy(classifier: BaseEstimator, X: modALinput, n_instances: int = 1, + random_tie_break: bool = False, dropout_layer_indexes: list = [], + num_cycles : int = 50, **mc_dropout_kwargs) -> np.ndarray: + """ + Mc-Dropout maximum entropy query strategy. Returns the indexes of the instances + with the largest entropy of the per class calculated entropies over multiple dropout cycles + and the corresponding metric. + + Based on the equations of: + Deep Bayesian Active Learning with Image Data. + (Yarin Gal, Riashat Islam, and Zoubin Ghahramani. 2017.) + + Args: + classifier: The classifier for which the labels are to be queried. + X: The pool of samples to query from. + n_instances: Number of samples to be queried. + random_tie_break: If True, shuffles utility scores to randomize the order. This + can be used to break the tie when the highest utility score is not unique. + dropout_layer_indexes: Indexes of the dropout layers which should be activated + Choose indices from : list(torch_model.modules()) + num_cycles: Number of forward passes with activated dropout + **uncertainty_measure_kwargs: Keyword arguments to be passed for the uncertainty + measure function. 
+ + Returns: + The indices of the instances from X chosen to be labelled; + The mc-dropout metric of the chosen instances; + """ + + # set dropout layers to train mode + set_dropout_mode(classifier.estimator.module_, dropout_layer_indexes, train_mode=True) + + predictions = get_predictions(classifier, X, num_cycles) + + # set dropout layers to eval + set_dropout_mode(classifier.estimator.module_, dropout_layer_indexes, train_mode=False) + + #get entropy values for predictions + entropy = _class_entropy(predictions) + + if not random_tie_break: + return multi_argmax(entropy, n_instances=n_instances) + + return shuffled_argmax(entropy, n_instances=n_instances) + def get_predictions(classifier: BaseEstimator, X: modALinput, num_predictions: int = 50): """ Runs num_predictions times the prediction of the classifier on the input X @@ -181,6 +226,24 @@ def _mean_standard_deviation(proba: list) -> np.ndarray: return mean_standard_deviation +def _class_entropy(proba: list) -> np.ndarray: + """ + Calculates the entropy per class over dropout cycles + + As it is explicitly formulated in: + Deep Bayesian Active Learning with Image Data. + (Yarin Gal, Riashat Islam, and Zoubin Ghahramani. 2017.) + + Args: + proba: list with the predictions over the dropout cycles + Return: + Returns the entropy of the dropout cycles over all classes. 
+ """ + + proba_stacked = np.stack(proba, axis=len(proba[0].shape)) + #calculate entropy and sum along dropout cycles + entropy = entropy_sum(proba_stacked, axis=-1) + return entropy def _bald_divergence(proba: list) -> np.ndarray: """ From df70b44996f4daf9c2786860917934073c493fc1 Mon Sep 17 00:00:00 2001 From: Stefan Ott Date: Sun, 21 Feb 2021 18:13:44 +0100 Subject: [PATCH 124/182] Remove debug logging from dropout file --- modAL/dropout.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/modAL/dropout.py b/modAL/dropout.py index 2b0f704..a0447d1 100644 --- a/modAL/dropout.py +++ b/modAL/dropout.py @@ -1,5 +1,4 @@ import numpy as np -import logging import sys from sklearn.base import BaseEstimator @@ -12,9 +11,6 @@ from skorch.utils import to_numpy - -logging.basicConfig(stream=sys.stdout, level=logging.DEBUG) - def KL_divergence(classifier: BaseEstimator, X: modALinput, n_instances: int = 1, random_tie_break: bool = False, dropout_layer_indexes: list = [], num_cycles : int = 50, **mc_dropout_kwargs) -> np.ndarray: @@ -191,7 +187,6 @@ def get_predictions(classifier: BaseEstimator, X: modALinput, num_predictions: i predictions = [] for i in range(num_predictions): - logging.getLogger().info("Dropout: start prediction forward pass") #call Skorch infer function to perform model forward pass #In comparison to: predict(), predict_proba() the infer() # does not change train/eval mode of other layers @@ -319,7 +314,5 @@ def set_dropout_mode(model, dropout_layer_indexes: list, train_mode: bool): if module.__class__.__name__.startswith('Dropout'): if True == train_mode: module.train() - logging.getLogger().info("Dropout: set mode of " + str(module.__class__.__name__) + " to train") elif False == train_mode: module.eval() - logging.getLogger().info("Dropout: set mode of " + str(module.__class__.__name__) + " to eval") From 397902e3c37bbbc106f8dc4a7a1dcdefa9cbfc42 Mon Sep 17 00:00:00 2001 From: Stefan Ott Date: Mon, 22 Feb 2021 09:11:37 +0100 Subject: [PATCH 125/182] 
Max entropy query strategy fix dimensions --- modAL/dropout.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/modAL/dropout.py b/modAL/dropout.py index 8b50dea..b877a88 100644 --- a/modAL/dropout.py +++ b/modAL/dropout.py @@ -165,7 +165,7 @@ def mc_dropout_max_entropy(classifier: BaseEstimator, X: modALinput, n_instances set_dropout_mode(classifier.estimator.module_, dropout_layer_indexes, train_mode=False) #get entropy values for predictions - entropy = _class_entropy(predictions) + entropy = _entropy(predictions) if not random_tie_break: return multi_argmax(entropy, n_instances=n_instances) @@ -222,7 +222,7 @@ def _mean_standard_deviation(proba: list) -> np.ndarray: return mean_standard_deviation -def _class_entropy(proba: list) -> np.ndarray: +def _entropy(proba: list) -> np.ndarray: """ Calculates the entropy per class over dropout cycles @@ -237,8 +237,9 @@ def _class_entropy(proba: list) -> np.ndarray: """ proba_stacked = np.stack(proba, axis=len(proba[0].shape)) - #calculate entropy and sum along dropout cycles - entropy = entropy_sum(proba_stacked, axis=-1) + #calculate entropy per class and sum along dropout cycles + entropy_classes = entropy_sum(proba_stacked, axis=-1) + entropy = np.mean(entropy_classes, axis=-1) return entropy def _bald_divergence(proba: list) -> np.ndarray: From ed07ef98d3629152cf33ff8a8a3286630eb13ac9 Mon Sep 17 00:00:00 2001 From: Stefan Ott Date: Mon, 22 Feb 2021 09:31:15 +0100 Subject: [PATCH 126/182] Restructure dropout get_predictions --- modAL/dropout.py | 35 +++++++++++------------------------ 1 file changed, 11 insertions(+), 24 deletions(-) diff --git a/modAL/dropout.py b/modAL/dropout.py index b877a88..123ab44 100644 --- a/modAL/dropout.py +++ b/modAL/dropout.py @@ -65,14 +65,7 @@ def mc_dropout_bald(classifier: BaseEstimator, X: modALinput, n_instances: int = The indices of the instances from X chosen to be labelled; The mc-dropout metric of the chosen instances; """ - - # set dropout layers to 
train mode - set_dropout_mode(classifier.estimator.module_, dropout_layer_indexes, train_mode=True) - - predictions = get_predictions(classifier, X, num_cycles) - - # set dropout layers to eval - set_dropout_mode(classifier.estimator.module_, dropout_layer_indexes, train_mode=False) + predictions = get_predictions(classifier, X, dropout_layer_indexes, num_cycles) #calculate BALD (Bayesian active learning divergence)) bald_scores = _bald_divergence(predictions) @@ -82,7 +75,6 @@ def mc_dropout_bald(classifier: BaseEstimator, X: modALinput, n_instances: int = return shuffled_argmax(bald_scores, n_instances=n_instances) - def mc_dropout_mean_st(classifier: BaseEstimator, X: modALinput, n_instances: int = 1, random_tie_break: bool = False, dropout_layer_indexes: list = [], num_cycles : int = 50, **mc_dropout_kwargs) -> np.ndarray: @@ -113,12 +105,7 @@ def mc_dropout_mean_st(classifier: BaseEstimator, X: modALinput, n_instances: in """ # set dropout layers to train mode - set_dropout_mode(classifier.estimator.module_, dropout_layer_indexes, train_mode=True) - - predictions = get_predictions(classifier, X, num_cycles) - - # set dropout layers to eval - set_dropout_mode(classifier.estimator.module_, dropout_layer_indexes, train_mode=False) + predictions = get_predictions(classifier, X, dropout_layer_indexes, num_cycles) mean_standard_deviations = _mean_standard_deviation(predictions) @@ -155,14 +142,7 @@ def mc_dropout_max_entropy(classifier: BaseEstimator, X: modALinput, n_instances The indices of the instances from X chosen to be labelled; The mc-dropout metric of the chosen instances; """ - - # set dropout layers to train mode - set_dropout_mode(classifier.estimator.module_, dropout_layer_indexes, train_mode=True) - - predictions = get_predictions(classifier, X, num_cycles) - - # set dropout layers to eval - set_dropout_mode(classifier.estimator.module_, dropout_layer_indexes, train_mode=False) + predictions = get_predictions(classifier, X, dropout_layer_indexes, 
num_cycles) #get entropy values for predictions entropy = _entropy(predictions) @@ -172,7 +152,7 @@ def mc_dropout_max_entropy(classifier: BaseEstimator, X: modALinput, n_instances return shuffled_argmax(entropy, n_instances=n_instances) -def get_predictions(classifier: BaseEstimator, X: modALinput, num_predictions: int = 50): +def get_predictions(classifier: BaseEstimator, X: modALinput, dropout_layer_indexes: list, num_predictions: int = 50): """ Runs num_predictions times the prediction of the classifier on the input X and puts the predictions in a list. @@ -186,6 +166,9 @@ def get_predictions(classifier: BaseEstimator, X: modALinput, num_predictions: i """ predictions = [] + # set dropout layers to train mode + set_dropout_mode(classifier.estimator.module_, dropout_layer_indexes, train_mode=True) + for i in range(num_predictions): #call Skorch infer function to perform model forward pass #In comparison to: predict(), predict_proba() the infer() @@ -193,6 +176,10 @@ def get_predictions(classifier: BaseEstimator, X: modALinput, num_predictions: i prediction = classifier.estimator.infer(X) prediction_proba = to_numpy(prediction.softmax(1)) predictions.append(prediction_proba) + + # set dropout layers to eval + set_dropout_mode(classifier.estimator.module_, dropout_layer_indexes, train_mode=False) + return predictions From 339403355ec68fe98412f3bb90e4357a5dd7c8b9 Mon Sep 17 00:00:00 2001 From: maxkeller321 Date: Mon, 22 Feb 2021 09:48:12 +0100 Subject: [PATCH 127/182] Documentation adaption --- modAL/dropout.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/modAL/dropout.py b/modAL/dropout.py index 123ab44..6bac250 100644 --- a/modAL/dropout.py +++ b/modAL/dropout.py @@ -160,6 +160,8 @@ def get_predictions(classifier: BaseEstimator, X: modALinput, dropout_layer_inde Args: classifier: The classifier for which the labels are to be queried. X: The pool of samples to query from. 
+ dropout_layer_indexes: Indexes of the dropout layers which should be activated + Choose indices from : list(torch_model.modules()) num_predictions: Number of predictions which should be made Return: prediction: list with all predictions From 03e79db94952422336e56a3f3ae32b5da930eead Mon Sep 17 00:00:00 2001 From: maxkeller321 Date: Mon, 22 Feb 2021 17:20:50 +0100 Subject: [PATCH 128/182] Warm start adaption --- modAL/models/learners.py | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/modAL/models/learners.py b/modAL/models/learners.py index 26a51a9..4da0457 100644 --- a/modAL/models/learners.py +++ b/modAL/models/learners.py @@ -256,7 +256,6 @@ def score(self, X: modALinput, y: modALinput) -> Any: Returns: The score of the predictor. """ - """ sklearn does only accept tensors of different dim for X and Y, if we use Multilabel classifiaction. Using tensors of different sizes for more complex models (e.g. Transformers) @@ -266,18 +265,28 @@ def score(self, X: modALinput, y: modALinput) -> Any: criterion = self.estimator.criterion() return criterion(prediction, y).item() - def teach(self, X: modALinput, y: modALinput, bootstrap: bool = False, **fit_kwargs) -> None: + def teach(self, X: modALinput, y: modALinput, warm_start: bool = True, bootstrap: bool = False, **fit_kwargs) -> None: """ Adds X and y to the known training data and retrains the predictor with the augmented dataset. Args: X: The new samples for which the labels are supplied by the expert. y: Labels corresponding to the new instances in X. + warm_start: If False, the model parameters are resetted and the training starts from zero, + otherwise the pre trained model is kept and further trained. bootstrap: If True, training is done on a bootstrapped dataset. Useful for building Committee models with bagging. **fit_kwargs: Keyword arguments to be passed to the fit method of the predictor. 
""" - self._fit_on_new(X, y, bootstrap=bootstrap, **fit_kwargs) + + if warm_start: + if not bootstrap: + self.estimator.partial_fit(X, y, **fit_kwargs) + else: + bootstrap_idx = np.random.choice(range(X.shape[0]), X.shape[0], replace=True) + self.estimator.partial_fit(X[bootstrap_idx], y[bootstrap_idx], **fit_kwargs) + else: + self._fit_on_new(X, y, bootstrap=bootstrap, **fit_kwargs) """ Classes for Bayesian optimization From 87e4ec92ed17bb3eedc31fa1c1a8c0453540e8ff Mon Sep 17 00:00:00 2001 From: Stefan Ott Date: Mon, 22 Feb 2021 23:30:48 +0100 Subject: [PATCH 129/182] Add abstract methods for learner and committee base classes --- modAL/models/base.py | 35 +++++++++++++++++++++++++++++++++-- 1 file changed, 33 insertions(+), 2 deletions(-) diff --git a/modAL/models/base.py b/modAL/models/base.py index b925608..19776ff 100644 --- a/modAL/models/base.py +++ b/modAL/models/base.py @@ -120,6 +120,10 @@ def _fit_on_new(self, X: modALinput, y: modALinput, bootstrap: bool = False, **f return self + @abc.abstractmethod + def fit(self, *args, **kwargs) -> None: + pass + def predict(self, X: modALinput, **predict_kwargs) -> Any: """ Estimator predictions for X. Interface with the predict method of the estimator. 
@@ -172,6 +176,9 @@ def query(self, X_pool, *query_args, **query_kwargs) -> Union[Tuple, modALinput] return query_result, retrieve_rows(X_pool, query_result), query_metrics + @abc.abstractmethod + def score(self, *args, **kwargs) -> None: + pass @abc.abstractmethod def teach(self, *args, **kwargs) -> None: @@ -216,6 +223,26 @@ def _fit_on_new(self, X: modALinput, y: modALinput, bootstrap: bool = False, **f for learner in self.learner_list: learner._fit_on_new(X, y, bootstrap=bootstrap, **fit_kwargs) + @abc.abstractmethod + def fit(self, X: modALinput, y: modALinput, **fit_kwargs) -> Any: + pass + + @abc.abstractmethod + def predict(self, X: modALinput) -> Any: + pass + + @abc.abstractmethod + def predict_proba(self, X: modALinput, **predict_proba_kwargs) -> Any: + pass + + @abc.abstractmethod + def score(self, X: modALinput, y: modALinput, sample_weight: List[float] = None) -> Any: + pass + + @abc.abstractmethod + def teach(self, X: modALinput, y: modALinput, bootstrap: bool = False, **fit_kwargs) -> Any: + pass + def transform_without_estimating(self, X: modALinput) -> Union[np.ndarray, sp.csr_matrix]: """ Transforms the data as supplied to each learner's estimator and concatenates transformations. 
@@ -274,10 +301,14 @@ def _set_classes(self): ) self.n_classes_ = len(self.classes_) - @abc.abstractmethod - def predict(self, X: modALinput) -> Any: + pass @abc.abstractmethod def vote(self, X: modALinput) -> Any: # TODO: clarify typing pass + + @abc.abstractmethod + def vote_proba(self, X: modALinput, **predict_proba_kwargs) -> Any: + pass + From 7564806d72ece26a8b35a4c7dbfdb852f087b6c3 Mon Sep 17 00:00:00 2001 From: maxkeller321 Date: Sun, 28 Feb 2021 11:08:53 +0100 Subject: [PATCH 130/182] Input type dict support --- modAL/utils/data.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/modAL/utils/data.py b/modAL/utils/data.py index 4b4f697..a7a466d 100644 --- a/modAL/utils/data.py +++ b/modAL/utils/data.py @@ -102,6 +102,10 @@ def retrieve_rows(X: modALinput, return X.iloc[I] elif isinstance(X, list): return np.array(X)[I].tolist() + elif isinstance(X, dict): + for key, value in X.items(): + X[key] = retrieve_rows(value, I) + return X elif isinstance(X, np.ndarray): return X[I] elif torch.is_tensor(X): From 70c3e9529e494867fdded47d8eb60ca58d08c1eb Mon Sep 17 00:00:00 2001 From: Michael <34004884+Schweinebaermann@users.noreply.github.com> Date: Mon, 8 Mar 2021 00:17:18 +0100 Subject: [PATCH 131/182] Added max variation ratios query strategy --- modAL/dropout.py | 61 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 61 insertions(+) diff --git a/modAL/dropout.py b/modAL/dropout.py index 6bac250..8d0a69d 100644 --- a/modAL/dropout.py +++ b/modAL/dropout.py @@ -152,6 +152,44 @@ def mc_dropout_max_entropy(classifier: BaseEstimator, X: modALinput, n_instances return shuffled_argmax(entropy, n_instances=n_instances) +def mc_dropout_max_variationRatios(classifier: BaseEstimator, X: modALinput, n_instances: int = 1, + random_tie_break: bool = False, dropout_layer_indexes: list = [], + num_cycles : int = 50, **mc_dropout_kwargs) -> np.ndarray: + """ + Mc-Dropout maximum variation ratios query strategy. 
Returns the indexes of the instances + with the largest variation ratios over multiple dropout cycles + and the corresponding metric. + + Based on the equations of: + Deep Bayesian Active Learning with Image Data. + (Yarin Gal, Riashat Islam, and Zoubin Ghahramani. 2017.) + + Args: + classifier: The classifier for which the labels are to be queried. + X: The pool of samples to query from. + n_instances: Number of samples to be queried. + random_tie_break: If True, shuffles utility scores to randomize the order. This + can be used to break the tie when the highest utility score is not unique. + dropout_layer_indexes: Indexes of the dropout layers which should be activated + Choose indices from : list(torch_model.modules()) + num_cycles: Number of forward passes with activated dropout + **uncertainty_measure_kwargs: Keyword arguments to be passed for the uncertainty + measure function. + + Returns: + The indices of the instances from X chosen to be labelled; + The mc-dropout metric of the chosen instances; + """ + predictions = get_predictions(classifier, X, dropout_layer_indexes, num_cycles) + + #get variation ratios values for predictions + variationRatios = _variation_ratios(predictions) + + if not random_tie_break: + return multi_argmax(variationRatios, n_instances=n_instances) + + return shuffled_argmax(variationRatios, n_instances=n_instances) + def get_predictions(classifier: BaseEstimator, X: modALinput, dropout_layer_indexes: list, num_predictions: int = 50): """ Runs num_predictions times the prediction of the classifier on the input X @@ -189,6 +227,11 @@ def entropy_sum(values, axis=-1): #sum Scipy basic entropy function: entr() return np.sum(entr(values), axis=axis) +def variationRatios(values, axis=-1): + #Mean over Dropout Cycles + valuesDCMean = np.mean(values, axis=axis) + return 1 - np.amax(valuesDCMean, axis=axis) + def _mean_standard_deviation(proba: list) -> np.ndarray: """ Calculates the mean of the per class calculated standard deviations. 
@@ -231,6 +274,24 @@ def _entropy(proba: list) -> np.ndarray: entropy = np.mean(entropy_classes, axis=-1) return entropy +def _variation_ratios(proba: list) -> np.ndarray: + """ + Calculates the variation ratios over dropout cycles + + As it is explicitly formulated in: + Deep Bayesian Active Learning with Image Data. + (Yarin Gal, Riashat Islam, and Zoubin Ghahramani. 2017.) + + Args: + proba: list with the predictions over the dropout cycles + Return: + Returns the variation ratios of the dropout cycles. + """ + proba_stacked = np.stack(proba, axis=len(proba[0].shape)) + #Calculate the variation ratios over the mean of dropout cycles + variation_ratios = variationRatios(proba_stacked, axis=-1) + return variation_ratios + def _bald_divergence(proba: list) -> np.ndarray: """ Calculates the bald divergence for each instance From eb531dc87bb768df9f794af92f6ead3101e6f6a8 Mon Sep 17 00:00:00 2001 From: Stefan Ott Date: Mon, 8 Mar 2021 00:32:44 +0100 Subject: [PATCH 132/182] Add query strategy for multiple metrics --- modAL/dropout.py | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/modAL/dropout.py b/modAL/dropout.py index 6bac250..da5df3f 100644 --- a/modAL/dropout.py +++ b/modAL/dropout.py @@ -32,6 +32,25 @@ def KL_divergence(classifier: BaseEstimator, X: modALinput, n_instances: int = 1 return shuffled_argmax(KL_divergence, n_instances=n_instances) +def mc_dropout_multi(classifier: BaseEstimator, X: modALinput, query_strategies: list = ["bald", "mean_st", "max_entropy"], + n_instances: int = 1, random_tie_break: bool = False, dropout_layer_indexes: list = [], + num_cycles : int = 50, **mc_dropout_kwargs) -> np.ndarray: + """ + Mc-Dropout bald query strategy. Returns the indexes of the instances with the largest BALD + (Bayesian Active Learning by Disagreement) score calculated through the dropout cycles + and the corresponding bald score. 
+ """ + predictions = get_predictions(classifier, X, dropout_layer_indexes, num_cycles) + + metrics_dict = {} + if "bald" in query_strategies: + metrics_dict["bald"] = _bald_divergence(predictions) + if "mean_st" in query_strategies: + metrics_dict["mean_st"] = _mean_standard_deviation(predictions) + if "max_entropy" in query_strategies: + metrics_dict["max_entropy"] = _entropy(predictions) + + return None, metrics_dict def mc_dropout_bald(classifier: BaseEstimator, X: modALinput, n_instances: int = 1, random_tie_break: bool = False, dropout_layer_indexes: list = [], @@ -184,7 +203,6 @@ def get_predictions(classifier: BaseEstimator, X: modALinput, dropout_layer_inde return predictions - def entropy_sum(values, axis=-1): #sum Scipy basic entropy function: entr() return np.sum(entr(values), axis=axis) From 4a1d39a89e5fe47b809f449a14882aa8908e5b81 Mon Sep 17 00:00:00 2001 From: Stefan Ott Date: Mon, 8 Mar 2021 11:53:48 +0100 Subject: [PATCH 133/182] Added max variation to dropout_multi --- modAL/dropout.py | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/modAL/dropout.py b/modAL/dropout.py index 77550c0..467addf 100644 --- a/modAL/dropout.py +++ b/modAL/dropout.py @@ -32,13 +32,20 @@ def KL_divergence(classifier: BaseEstimator, X: modALinput, n_instances: int = 1 return shuffled_argmax(KL_divergence, n_instances=n_instances) -def mc_dropout_multi(classifier: BaseEstimator, X: modALinput, query_strategies: list = ["bald", "mean_st", "max_entropy"], +def mc_dropout_multi(classifier: BaseEstimator, X: modALinput, query_strategies: list = ["bald", "mean_st", "max_entropy", "max_var"], n_instances: int = 1, random_tie_break: bool = False, dropout_layer_indexes: list = [], num_cycles : int = 50, **mc_dropout_kwargs) -> np.ndarray: """ - Mc-Dropout bald query strategy. 
Returns the indexes of the instances with the largest BALD - (Bayesian Active Learning by Disagreement) score calculated through the dropout cycles - and the corresponding bald score. + Multi metric dropout query strategy. Returns the specified metrics for given input data. + Selection of query strategies are: + - bald: BALD query strategy + - mean_st: Mean Standard deviation + - max_entropy: maximum entropy + - max_var: maximum variation + By default all query strategies are selected + + Function returns dictionary of metrics with their name as key. + The indices of the n-best samples (n_instances) is not used in this function. """ predictions = get_predictions(classifier, X, dropout_layer_indexes, num_cycles) @@ -49,6 +56,8 @@ def mc_dropout_multi(classifier: BaseEstimator, X: modALinput, query_strategies: metrics_dict["mean_st"] = _mean_standard_deviation(predictions) if "max_entropy" in query_strategies: metrics_dict["max_entropy"] = _entropy(predictions) + if "max_var" in query_strategies: + metrics_dict["max_var"] = _variation_ratios(predictions) return None, metrics_dict From cbee563b6b5b8c792b9ea84c13b301656a24fde7 Mon Sep 17 00:00:00 2001 From: maxkeller321 Date: Mon, 8 Mar 2021 12:31:28 +0100 Subject: [PATCH 134/182] Number of epochs setter and getter support --- modAL/models/learners.py | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/modAL/models/learners.py b/modAL/models/learners.py index 4da0457..60deb1b 100644 --- a/modAL/models/learners.py +++ b/modAL/models/learners.py @@ -287,6 +287,28 @@ def teach(self, X: modALinput, y: modALinput, warm_start: bool = True, bootstrap self.estimator.partial_fit(X[bootstrap_idx], y[bootstrap_idx], **fit_kwargs) else: self._fit_on_new(X, y, bootstrap=bootstrap, **fit_kwargs) + + @property + def num_epochs(self): + """ + Returns the number of epochs of a single fit cycle. 
+ """ + return self.estimator.max_epochs + + @num_epochs.setter + def num_epochs(self, value): + """ + Sets the number of epochs of a single fit cycle. The number of epochs + can be changed at any time, even after the model was pretrained. + """ + if isinstance(value, int): + if 0 <= value <= 100: + self.estimator.max_epochs = value + else: + raise ValueError("num_epochs must be in range 0 <= x <= 100") + else: + raise TypeError("num_epochs must be of type integer!") + """ Classes for Bayesian optimization From 4614d92169bef25a0428eb04e86d31a04ebb2838 Mon Sep 17 00:00:00 2001 From: maxkeller321 Date: Mon, 8 Mar 2021 12:40:14 +0100 Subject: [PATCH 135/182] Batch size setter and getter --- modAL/models/learners.py | 27 ++++++++++++++++++++++++--- 1 file changed, 24 insertions(+), 3 deletions(-) diff --git a/modAL/models/learners.py b/modAL/models/learners.py index 60deb1b..244f456 100644 --- a/modAL/models/learners.py +++ b/modAL/models/learners.py @@ -299,16 +299,37 @@ def num_epochs(self): def num_epochs(self, value): """ Sets the number of epochs of a single fit cycle. The number of epochs - can be changed at any time, even after the model was pretrained. + can be changed at any time, even after the model was trained. """ if isinstance(value, int): - if 0 <= value <= 100: + if 0 < value <= 100: self.estimator.max_epochs = value else: - raise ValueError("num_epochs must be in range 0 <= x <= 100") + raise ValueError("num_epochs must be in range 0 < x <= 100") else: raise TypeError("num_epochs must be of type integer!") + @property + def batch_size(self): + """ + Returns the batch size of a single forward pass. + """ + return self.estimator.batch_size + + @batch_size.setter + def batch_size(self, value): + """ + Sets the batch size of a single forward pass. The batch size + can be changed at any time, even after the model was trained. 
+ """ + if isinstance(value, int): + if 0 < value: + self.estimator.batch_size = value + else: + raise ValueError("batch size must be larger than 0") + else: + raise TypeError("batch size must be of type integer!") + """ Classes for Bayesian optimization From 72bdf18ab423bfe362d3f4725a22fa8e475ef241 Mon Sep 17 00:00:00 2001 From: maxkeller321 Date: Mon, 8 Mar 2021 14:09:39 +0100 Subject: [PATCH 136/182] Mean_std Bugfix --- modAL/dropout.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/modAL/dropout.py b/modAL/dropout.py index 467addf..3ff5e86 100644 --- a/modAL/dropout.py +++ b/modAL/dropout.py @@ -274,9 +274,7 @@ def _mean_standard_deviation(proba: list) -> np.ndarray: """ proba_stacked = np.stack(proba, axis=len(proba[0].shape)) - mean_squared = np.mean(proba_stacked, axis=-1)**2 - squared_mean = np.mean(proba_stacked**2, axis=-1) - standard_deviation_class_vise = np.sqrt(squared_mean - mean_squared) + standard_deviation_class_vise = np.std(proba_stacked, axis=-1) mean_standard_deviation = np.mean(standard_deviation_class_vise, axis=-1) return mean_standard_deviation From ba1b729d0138f1ff8e971168ffca70946d806780 Mon Sep 17 00:00:00 2001 From: maxkeller321 Date: Mon, 8 Mar 2021 14:21:58 +0100 Subject: [PATCH 137/182] Metric calculation consistency adaption --- modAL/dropout.py | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/modAL/dropout.py b/modAL/dropout.py index 3ff5e86..85139a0 100644 --- a/modAL/dropout.py +++ b/modAL/dropout.py @@ -254,11 +254,6 @@ def entropy_sum(values, axis=-1): #sum Scipy basic entropy function: entr() return np.sum(entr(values), axis=axis) -def variationRatios(values, axis=-1): - #Mean over Dropout Cycles - valuesDCMean = np.mean(values, axis=axis) - return 1 - np.amax(valuesDCMean, axis=axis) - def _mean_standard_deviation(proba: list) -> np.ndarray: """ Calculates the mean of the per class calculated standard deviations. 
@@ -314,8 +309,8 @@ def _variation_ratios(proba: list) -> np.ndarray: """ proba_stacked = np.stack(proba, axis=len(proba[0].shape)) #Calculate the variation ratios over the mean of dropout cycles - variation_ratios = variationRatios(proba_stacked, axis=-1) - return variation_ratios + valuesDCMean = np.mean(proba_stacked, axis=-1) + return 1 - np.amax(valuesDCMean, axis=-1) def _bald_divergence(proba: list) -> np.ndarray: """ From 481d07e78c0fe4bdd7ac912888269fdeca3c0747 Mon Sep 17 00:00:00 2001 From: maxkeller321 Date: Wed, 10 Mar 2021 16:10:36 +0100 Subject: [PATCH 138/182] Ram adaption --- modAL/dropout.py | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/modAL/dropout.py b/modAL/dropout.py index 85139a0..9df5f8b 100644 --- a/modAL/dropout.py +++ b/modAL/dropout.py @@ -1,5 +1,6 @@ import numpy as np import sys +import torch from sklearn.base import BaseEstimator from sklearn.preprocessing import normalize @@ -241,9 +242,20 @@ def get_predictions(classifier: BaseEstimator, X: modALinput, dropout_layer_inde #call Skorch infer function to perform model forward pass #In comparison to: predict(), predict_proba() the infer() # does not change train/eval mode of other layers - prediction = classifier.estimator.infer(X) - prediction_proba = to_numpy(prediction.softmax(1)) - predictions.append(prediction_proba) + X.detach() + + probas = [] + for X_split in torch.split(X, 5000): + prediction = classifier.estimator.infer(X_split) + prediction_proba = to_numpy(prediction.softmax(1)) + + if type(probas) != list: + probas = np.vstack((probas, prediction_proba)) + else: + probas = prediction_proba + + + predictions.append(probas) # set dropout layers to eval set_dropout_mode(classifier.estimator.module_, dropout_layer_indexes, train_mode=False) From d0f0783f37cf88d6bd49448bf5949f2770ae4bf2 Mon Sep 17 00:00:00 2001 From: maxkeller321 Date: Wed, 17 Mar 2021 09:53:22 +0100 Subject: [PATCH 139/182] Number of samples per forward pass adaption --- 
modAL/dropout.py | 232 +++++++++++++++++++++++++---------------------- 1 file changed, 124 insertions(+), 108 deletions(-) diff --git a/modAL/dropout.py b/modAL/dropout.py index 9df5f8b..d0d51ed 100644 --- a/modAL/dropout.py +++ b/modAL/dropout.py @@ -35,7 +35,7 @@ def KL_divergence(classifier: BaseEstimator, X: modALinput, n_instances: int = 1 def mc_dropout_multi(classifier: BaseEstimator, X: modALinput, query_strategies: list = ["bald", "mean_st", "max_entropy", "max_var"], n_instances: int = 1, random_tie_break: bool = False, dropout_layer_indexes: list = [], - num_cycles : int = 50, **mc_dropout_kwargs) -> np.ndarray: + num_cycles : int = 50, sample_per_forward_pass: int = 1000, **mc_dropout_kwargs) -> np.ndarray: """ Multi metric dropout query strategy. Returns the specified metrics for given input data. Selection of query strategies are: @@ -48,7 +48,7 @@ def mc_dropout_multi(classifier: BaseEstimator, X: modALinput, query_strategies: Function returns dictionary of metrics with their name as key. The indices of the n-best samples (n_instances) is not used in this function. """ - predictions = get_predictions(classifier, X, dropout_layer_indexes, num_cycles) + predictions = get_predictions(classifier, X, dropout_layer_indexes, num_cycles, sample_per_forward_pass) metrics_dict = {} if "bald" in query_strategies: @@ -64,37 +64,40 @@ def mc_dropout_multi(classifier: BaseEstimator, X: modALinput, query_strategies: def mc_dropout_bald(classifier: BaseEstimator, X: modALinput, n_instances: int = 1, random_tie_break: bool = False, dropout_layer_indexes: list = [], - num_cycles : int = 50, **mc_dropout_kwargs) -> np.ndarray: + num_cycles : int = 50, sample_per_forward_pass: int = 1000, **mc_dropout_kwargs) -> np.ndarray: """ - Mc-Dropout bald query strategy. Returns the indexes of the instances with the largest BALD - (Bayesian Active Learning by Disagreement) score calculated through the dropout cycles - and the corresponding bald score. 
- - Based on the work of: - Deep Bayesian Active Learning with Image Data. - (Yarin Gal, Riashat Islam, and Zoubin Ghahramani. 2017.) - Dropout as a Bayesian Approximation: Representing Model Uncer- tainty in Deep Learning. - (Yarin Gal and Zoubin Ghahramani. 2016.) - Bayesian Active Learning for Classification and Preference Learning. - (NeilHoulsby,FerencHusza ́r,ZoubinGhahramani,andMa ́te ́Lengyel. 2011.) - - Args: - classifier: The classifier for which the labels are to be queried. - X: The pool of samples to query from. - n_instances: Number of samples to be queried. - random_tie_break: If True, shuffles utility scores to randomize the order. This - can be used to break the tie when the highest utility score is not unique. - dropout_layer_indexes: Indexes of the dropout layers which should be activated - Choose indices from : list(torch_model.modules()) - num_cycles: Number of forward passes with activated dropout - **uncertainty_measure_kwargs: Keyword arguments to be passed for the uncertainty - measure function. - - Returns: - The indices of the instances from X chosen to be labelled; - The mc-dropout metric of the chosen instances; + Mc-Dropout bald query strategy. Returns the indexes of the instances with the largest BALD + (Bayesian Active Learning by Disagreement) score calculated through the dropout cycles + and the corresponding bald score. + + Based on the work of: + Deep Bayesian Active Learning with Image Data. + (Yarin Gal, Riashat Islam, and Zoubin Ghahramani. 2017.) + Dropout as a Bayesian Approximation: Representing Model Uncer- tainty in Deep Learning. + (Yarin Gal and Zoubin Ghahramani. 2016.) + Bayesian Active Learning for Classification and Preference Learning. + (NeilHoulsby,FerencHusza ́r,ZoubinGhahramani,andMa ́te ́Lengyel. 2011.) + + Args: + classifier: The classifier for which the labels are to be queried. + X: The pool of samples to query from. + n_instances: Number of samples to be queried. 
+ random_tie_break: If True, shuffles utility scores to randomize the order. This + can be used to break the tie when the highest utility score is not unique. + dropout_layer_indexes: Indexes of the dropout layers which should be activated + Choose indices from : list(torch_model.modules()) + num_cycles: Number of forward passes with activated dropout + sample_per_forward_pass: max. sample number for each forward pass. + The allocated RAM does mainly depend on this. + Small number --> small RAM allocation + **uncertainty_measure_kwargs: Keyword arguments to be passed for the uncertainty + measure function. + + Returns: + The indices of the instances from X chosen to be labelled; + The mc-dropout metric of the chosen instances; """ - predictions = get_predictions(classifier, X, dropout_layer_indexes, num_cycles) + predictions = get_predictions(classifier, X, dropout_layer_indexes, num_cycles, sample_per_forward_pass) #calculate BALD (Bayesian active learning divergence)) bald_scores = _bald_divergence(predictions) @@ -106,35 +109,38 @@ def mc_dropout_bald(classifier: BaseEstimator, X: modALinput, n_instances: int = def mc_dropout_mean_st(classifier: BaseEstimator, X: modALinput, n_instances: int = 1, random_tie_break: bool = False, dropout_layer_indexes: list = [], - num_cycles : int = 50, **mc_dropout_kwargs) -> np.ndarray: + num_cycles : int = 50, sample_per_forward_pass: int = 1000, **mc_dropout_kwargs) -> np.ndarray: """ - Mc-Dropout mean standard deviation query strategy. Returns the indexes of the instances - with the largest mean of the per class calculated standard deviations over multiple dropout cycles - and the corresponding metric. - - Based on the equations of: - Deep Bayesian Active Learning with Image Data. - (Yarin Gal, Riashat Islam, and Zoubin Ghahramani. 2017.) - - Args: - classifier: The classifier for which the labels are to be queried. - X: The pool of samples to query from. - n_instances: Number of samples to be queried. 
- random_tie_break: If True, shuffles utility scores to randomize the order. This - can be used to break the tie when the highest utility score is not unique. - dropout_layer_indexes: Indexes of the dropout layers which should be activated - Choose indices from : list(torch_model.modules()) - num_cycles: Number of forward passes with activated dropout - **uncertainty_measure_kwargs: Keyword arguments to be passed for the uncertainty - measure function. - - Returns: - The indices of the instances from X chosen to be labelled; - The mc-dropout metric of the chosen instances; + Mc-Dropout mean standard deviation query strategy. Returns the indexes of the instances + with the largest mean of the per class calculated standard deviations over multiple dropout cycles + and the corresponding metric. + + Based on the equations of: + Deep Bayesian Active Learning with Image Data. + (Yarin Gal, Riashat Islam, and Zoubin Ghahramani. 2017.) + + Args: + classifier: The classifier for which the labels are to be queried. + X: The pool of samples to query from. + n_instances: Number of samples to be queried. + random_tie_break: If True, shuffles utility scores to randomize the order. This + can be used to break the tie when the highest utility score is not unique. + dropout_layer_indexes: Indexes of the dropout layers which should be activated + Choose indices from : list(torch_model.modules()) + num_cycles: Number of forward passes with activated dropout + sample_per_forward_pass: max. sample number for each forward pass. + The allocated RAM does mainly depend on this. + Small number --> small RAM allocation + **uncertainty_measure_kwargs: Keyword arguments to be passed for the uncertainty + measure function. 
+ + Returns: + The indices of the instances from X chosen to be labelled; + The mc-dropout metric of the chosen instances; """ # set dropout layers to train mode - predictions = get_predictions(classifier, X, dropout_layer_indexes, num_cycles) + predictions = get_predictions(classifier, X, dropout_layer_indexes, num_cycles, sample_per_forward_pass) mean_standard_deviations = _mean_standard_deviation(predictions) @@ -145,33 +151,36 @@ def mc_dropout_mean_st(classifier: BaseEstimator, X: modALinput, n_instances: in def mc_dropout_max_entropy(classifier: BaseEstimator, X: modALinput, n_instances: int = 1, random_tie_break: bool = False, dropout_layer_indexes: list = [], - num_cycles : int = 50, **mc_dropout_kwargs) -> np.ndarray: + num_cycles : int = 50, sample_per_forward_pass: int = 1000, **mc_dropout_kwargs) -> np.ndarray: """ - Mc-Dropout maximum entropy query strategy. Returns the indexes of the instances - with the largest entropy of the per class calculated entropies over multiple dropout cycles - and the corresponding metric. - - Based on the equations of: - Deep Bayesian Active Learning with Image Data. - (Yarin Gal, Riashat Islam, and Zoubin Ghahramani. 2017.) - - Args: - classifier: The classifier for which the labels are to be queried. - X: The pool of samples to query from. - n_instances: Number of samples to be queried. - random_tie_break: If True, shuffles utility scores to randomize the order. This - can be used to break the tie when the highest utility score is not unique. - dropout_layer_indexes: Indexes of the dropout layers which should be activated - Choose indices from : list(torch_model.modules()) - num_cycles: Number of forward passes with activated dropout - **uncertainty_measure_kwargs: Keyword arguments to be passed for the uncertainty - measure function. - - Returns: - The indices of the instances from X chosen to be labelled; - The mc-dropout metric of the chosen instances; + Mc-Dropout maximum entropy query strategy. 
Returns the indexes of the instances + with the largest entropy of the per class calculated entropies over multiple dropout cycles + and the corresponding metric. + + Based on the equations of: + Deep Bayesian Active Learning with Image Data. + (Yarin Gal, Riashat Islam, and Zoubin Ghahramani. 2017.) + + Args: + classifier: The classifier for which the labels are to be queried. + X: The pool of samples to query from. + n_instances: Number of samples to be queried. + random_tie_break: If True, shuffles utility scores to randomize the order. This + can be used to break the tie when the highest utility score is not unique. + dropout_layer_indexes: Indexes of the dropout layers which should be activated + Choose indices from : list(torch_model.modules()) + num_cycles: Number of forward passes with activated dropout + sample_per_forward_pass: max. sample number for each forward pass. + The allocated RAM does mainly depend on this. + Small number --> small RAM allocation + **uncertainty_measure_kwargs: Keyword arguments to be passed for the uncertainty + measure function. + + Returns: + The indices of the instances from X chosen to be labelled; + The mc-dropout metric of the chosen instances; """ - predictions = get_predictions(classifier, X, dropout_layer_indexes, num_cycles) + predictions = get_predictions(classifier, X, dropout_layer_indexes, num_cycles, sample_per_forward_pass) #get entropy values for predictions entropy = _entropy(predictions) @@ -183,33 +192,36 @@ def mc_dropout_max_entropy(classifier: BaseEstimator, X: modALinput, n_instances def mc_dropout_max_variationRatios(classifier: BaseEstimator, X: modALinput, n_instances: int = 1, random_tie_break: bool = False, dropout_layer_indexes: list = [], - num_cycles : int = 50, **mc_dropout_kwargs) -> np.ndarray: + num_cycles : int = 50, sample_per_forward_pass: int = 1000, **mc_dropout_kwargs) -> np.ndarray: """ - Mc-Dropout maximum variation ratios query strategy. 
Returns the indexes of the instances - with the largest variation ratios over multiple dropout cycles - and the corresponding metric. - - Based on the equations of: - Deep Bayesian Active Learning with Image Data. - (Yarin Gal, Riashat Islam, and Zoubin Ghahramani. 2017.) - - Args: - classifier: The classifier for which the labels are to be queried. - X: The pool of samples to query from. - n_instances: Number of samples to be queried. - random_tie_break: If True, shuffles utility scores to randomize the order. This - can be used to break the tie when the highest utility score is not unique. - dropout_layer_indexes: Indexes of the dropout layers which should be activated - Choose indices from : list(torch_model.modules()) - num_cycles: Number of forward passes with activated dropout - **uncertainty_measure_kwargs: Keyword arguments to be passed for the uncertainty - measure function. - - Returns: - The indices of the instances from X chosen to be labelled; - The mc-dropout metric of the chosen instances; + Mc-Dropout maximum variation ratios query strategy. Returns the indexes of the instances + with the largest variation ratios over multiple dropout cycles + and the corresponding metric. + + Based on the equations of: + Deep Bayesian Active Learning with Image Data. + (Yarin Gal, Riashat Islam, and Zoubin Ghahramani. 2017.) + + Args: + classifier: The classifier for which the labels are to be queried. + X: The pool of samples to query from. + n_instances: Number of samples to be queried. + random_tie_break: If True, shuffles utility scores to randomize the order. This + can be used to break the tie when the highest utility score is not unique. + dropout_layer_indexes: Indexes of the dropout layers which should be activated + Choose indices from : list(torch_model.modules()) + num_cycles: Number of forward passes with activated dropout + sample_per_forward_pass: max. sample number for each forward pass. + The allocated RAM does mainly depend on this. 
+ Small number --> small RAM allocation + **uncertainty_measure_kwargs: Keyword arguments to be passed for the uncertainty + measure function. + + Returns: + The indices of the instances from X chosen to be labelled; + The mc-dropout metric of the chosen instances; """ - predictions = get_predictions(classifier, X, dropout_layer_indexes, num_cycles) + predictions = get_predictions(classifier, X, dropout_layer_indexes, num_cycles, sample_per_forward_pass) #get variation ratios values for predictions variationRatios = _variation_ratios(predictions) @@ -219,7 +231,8 @@ def mc_dropout_max_variationRatios(classifier: BaseEstimator, X: modALinput, n_i return shuffled_argmax(variationRatios, n_instances=n_instances) -def get_predictions(classifier: BaseEstimator, X: modALinput, dropout_layer_indexes: list, num_predictions: int = 50): +def get_predictions(classifier: BaseEstimator, X: modALinput, dropout_layer_indexes: list, + num_predictions: int = 50, sample_per_forward_pass: int = 1000): """ Runs num_predictions times the prediction of the classifier on the input X and puts the predictions in a list. @@ -230,6 +243,9 @@ def get_predictions(classifier: BaseEstimator, X: modALinput, dropout_layer_inde dropout_layer_indexes: Indexes of the dropout layers which should be activated Choose indices from : list(torch_model.modules()) num_predictions: Number of predictions which should be made + sample_per_forward_pass: max. sample number for each forward pass. + The allocated RAM does mainly depend on this. 
+ Small number --> small RAM allocation Return: prediction: list with all predictions """ @@ -245,7 +261,7 @@ def get_predictions(classifier: BaseEstimator, X: modALinput, dropout_layer_inde X.detach() probas = [] - for X_split in torch.split(X, 5000): + for X_split in torch.split(X, sample_per_forward_pass): prediction = classifier.estimator.infer(X_split) prediction_proba = to_numpy(prediction.softmax(1)) From c26b0d3ccdc91810b55155505ac7325625977413 Mon Sep 17 00:00:00 2001 From: maxkeller321 Date: Mon, 22 Mar 2021 14:32:11 +0100 Subject: [PATCH 140/182] Padding/ mask support in the query strategies --- modAL/dropout.py | 38 +++++++++++++++++++++++++++----------- rtd_requirements.txt | 2 +- 2 files changed, 28 insertions(+), 12 deletions(-) diff --git a/modAL/dropout.py b/modAL/dropout.py index d0d51ed..5d0f500 100644 --- a/modAL/dropout.py +++ b/modAL/dropout.py @@ -278,11 +278,13 @@ def get_predictions(classifier: BaseEstimator, X: modALinput, dropout_layer_inde return predictions -def entropy_sum(values, axis=-1): +def entropy_sum(values: np.array, mask: np.ndarray = None, axis: int =-1): + if mask is None: + mask = np.ones(values.shape, dtype=bool) #sum Scipy basic entropy function: entr() - return np.sum(entr(values), axis=axis) + return np.sum(entr(values), where=mask, axis=axis) -def _mean_standard_deviation(proba: list) -> np.ndarray: +def _mean_standard_deviation(proba: list, mask: np.ndarray = None) -> np.ndarray: """ Calculates the mean of the per class calculated standard deviations. @@ -292,17 +294,21 @@ def _mean_standard_deviation(proba: list) -> np.ndarray: Args: proba: list with the predictions over the dropout cycles + mask: mask to detect the padded classes (must be of same shape as elements in proba) Return: Returns the mean standard deviation of the dropout cycles over all classes. 
""" proba_stacked = np.stack(proba, axis=len(proba[0].shape)) + if mask is None: + mask = np.ones(proba[0].shape, dtype=bool) + standard_deviation_class_vise = np.std(proba_stacked, axis=-1) - mean_standard_deviation = np.mean(standard_deviation_class_vise, axis=-1) + mean_standard_deviation = np.mean(standard_deviation_class_vise, where=mask, axis=-1) return mean_standard_deviation -def _entropy(proba: list) -> np.ndarray: +def _entropy(proba: list, mask: np.ndarray = None) -> np.ndarray: """ Calculates the entropy per class over dropout cycles @@ -312,17 +318,21 @@ def _entropy(proba: list) -> np.ndarray: Args: proba: list with the predictions over the dropout cycles + mask: mask to detect the padded classes (must be of same shape as elements in proba) Return: Returns the entropy of the dropout cycles over all classes. """ proba_stacked = np.stack(proba, axis=len(proba[0].shape)) + if mask is None: + mask = np.ones(proba[0].shape, dtype=bool) + #calculate entropy per class and sum along dropout cycles entropy_classes = entropy_sum(proba_stacked, axis=-1) - entropy = np.mean(entropy_classes, axis=-1) + entropy = np.mean(entropy_classes, where=mask, axis=-1) return entropy -def _variation_ratios(proba: list) -> np.ndarray: +def _variation_ratios(proba: list, mask: np.ndarray = None) -> np.ndarray: """ Calculates the variation ratios over dropout cycles @@ -332,15 +342,18 @@ def _variation_ratios(proba: list) -> np.ndarray: Args: proba: list with the predictions over the dropout cycles + mask: mask to detect the padded classes (must be of same shape as elements in proba) Return: Returns the variation ratios of the dropout cycles. 
""" proba_stacked = np.stack(proba, axis=len(proba[0].shape)) + if mask is None: + mask = np.ones(proba[0].shape, dtype=bool) #Calculate the variation ratios over the mean of dropout cycles valuesDCMean = np.mean(proba_stacked, axis=-1) - return 1 - np.amax(valuesDCMean, axis=-1) + return 1 - np.amax(valuesDCMean, initial=0, where=mask, axis=-1) -def _bald_divergence(proba: list) -> np.ndarray: +def _bald_divergence(proba: list, mask: np.ndarray = None) -> np.ndarray: """ Calculates the bald divergence for each instance @@ -350,10 +363,13 @@ def _bald_divergence(proba: list) -> np.ndarray: Args: proba: list with the predictions over the dropout cycles + mask: mask to detect the padded classes (must be of same shape as elements in proba) Return: Returns the mean standard deviation of the dropout cycles over all classes. """ proba_stacked = np.stack(proba, axis=len(proba[0].shape)) + if mask is None: + mask = np.ones(proba[0].shape, dtype=bool) #entropy along dropout cycles accumulated_entropy = entropy_sum(proba_stacked, axis=-1) @@ -361,7 +377,7 @@ def _bald_divergence(proba: list) -> np.ndarray: #score sums along dropout cycles accumulated_score = np.sum(proba_stacked, axis=-1) - average_score = accumulated_score / len(proba) + average_score = accumulated_score/len(proba) #expand dimension w/o data for entropy calculation average_score = np.expand_dims(average_score, axis=-1) @@ -373,7 +389,7 @@ def _bald_divergence(proba: list) -> np.ndarray: #sum all dimensions of diff besides first dim (instances) shaped = np.reshape(diff, (diff.shape[0], -1)) - bald = np.sum(shaped, axis=-1) + bald = np.sum(shaped, where=mask, axis=-1) return bald diff --git a/rtd_requirements.txt b/rtd_requirements.txt index 8c13855..db0bd81 100644 --- a/rtd_requirements.txt +++ b/rtd_requirements.txt @@ -1,4 +1,4 @@ -numpy +numpy==1.20.0 scipy scikit-learn ipykernel From e76e8c2620d6b9578e19a2c8950f02474b19bde8 Mon Sep 17 00:00:00 2001 From: Stefan Ott Date: Sat, 10 Apr 2021 19:41:18 +0200 
Subject: [PATCH 141/182] Fix dropout dictionary support --- modAL/dropout.py | 57 ++++++++++++++++++++++++++++++++++-------------- 1 file changed, 41 insertions(+), 16 deletions(-) diff --git a/modAL/dropout.py b/modAL/dropout.py index 5d0f500..58d4efe 100644 --- a/modAL/dropout.py +++ b/modAL/dropout.py @@ -1,6 +1,7 @@ import numpy as np import sys import torch +from collections.abc import Mapping from sklearn.base import BaseEstimator from sklearn.preprocessing import normalize @@ -250,26 +251,49 @@ def get_predictions(classifier: BaseEstimator, X: modALinput, dropout_layer_inde prediction: list with all predictions """ + #dbg + sample_per_forward_pass = 2 + predictions = [] # set dropout layers to train mode set_dropout_mode(classifier.estimator.module_, dropout_layer_indexes, train_mode=True) - for i in range(num_predictions): - #call Skorch infer function to perform model forward pass - #In comparison to: predict(), predict_proba() the infer() - # does not change train/eval mode of other layers + if isinstance(X, Mapping): #check for dict + for k, v in X.items(): + v.detach() + elif torch.is_tensor(X): #check for tensor X.detach() - - probas = [] - for X_split in torch.split(X, sample_per_forward_pass): - prediction = classifier.estimator.infer(X_split) - prediction_proba = to_numpy(prediction.softmax(1)) + else: + raise RuntimeError("Error in model data type, only dict or tensors supported") - if type(probas) != list: - probas = np.vstack((probas, prediction_proba)) - else: - probas = prediction_proba + for i in range(num_predictions): + split_args = [] + + if isinstance(X, Mapping): #check for dict + for k, v in X.items(): + v.detach() + split_v = torch.split(v, sample_per_forward_pass) + #create sub-dictionary split for each forward pass with same keys&values + for split_idx, split in enumerate(split_v): + if len(split_args)<=split_idx: + split_args.append({}) + split_args[split_idx][k] = split + + elif torch.is_tensor(X): #check for tensor + X.detach() + 
split_args = torch.split(X, sample_per_forward_pass) + else: + raise RuntimeError("Error in model data type, only dict or tensors supported") + + probas = None + for samples in split_args: + #call Skorch infer function to perform model forward pass + #In comparison to: predict(), predict_proba() the infer() + # does not change train/eval mode of other layers + prediction = classifier.estimator.infer(samples) + prediction_proba = to_numpy(prediction.softmax(1)) + probas = prediction_proba if probas is None else np.vstack((probas, prediction_proba)) predictions.append(probas) @@ -368,8 +392,6 @@ def _bald_divergence(proba: list, mask: np.ndarray = None) -> np.ndarray: Returns the mean standard deviation of the dropout cycles over all classes. """ proba_stacked = np.stack(proba, axis=len(proba[0].shape)) - if mask is None: - mask = np.ones(proba[0].shape, dtype=bool) #entropy along dropout cycles accumulated_entropy = entropy_sum(proba_stacked, axis=-1) @@ -389,8 +411,11 @@ def _bald_divergence(proba: list, mask: np.ndarray = None) -> np.ndarray: #sum all dimensions of diff besides first dim (instances) shaped = np.reshape(diff, (diff.shape[0], -1)) - bald = np.sum(shaped, where=mask, axis=-1) + if mask is None: + mask = np.ones(shaped.shape, dtype=bool) + + bald = np.sum(shaped, where=mask, axis=-1) return bald def _KL_divergence(proba) -> np.ndarray: From 4e408b28001260fb79bc19f2b83199f91d89c2f6 Mon Sep 17 00:00:00 2001 From: Stefan Ott Date: Thu, 15 Apr 2021 18:21:55 +0200 Subject: [PATCH 142/182] Remove mask parameter & change to ignore NaN's --- modAL/dropout.py | 33 ++++++++++++--------------------- 1 file changed, 12 insertions(+), 21 deletions(-) diff --git a/modAL/dropout.py b/modAL/dropout.py index 58d4efe..e49cbf3 100644 --- a/modAL/dropout.py +++ b/modAL/dropout.py @@ -302,13 +302,12 @@ def get_predictions(classifier: BaseEstimator, X: modALinput, dropout_layer_inde return predictions -def entropy_sum(values: np.array, mask: np.ndarray = None, axis: int 
=-1): - if mask is None: - mask = np.ones(values.shape, dtype=bool) +def entropy_sum(values: np.array, axis: int =-1): #sum Scipy basic entropy function: entr() - return np.sum(entr(values), where=mask, axis=axis) + entropy = entr(values) + return np.sum(entropy, where=~np.isnan(entropy), axis=axis) -def _mean_standard_deviation(proba: list, mask: np.ndarray = None) -> np.ndarray: +def _mean_standard_deviation(proba: list) -> np.ndarray: """ Calculates the mean of the per class calculated standard deviations. @@ -324,15 +323,13 @@ def _mean_standard_deviation(proba: list, mask: np.ndarray = None) -> np.ndarray """ proba_stacked = np.stack(proba, axis=len(proba[0].shape)) - if mask is None: - mask = np.ones(proba[0].shape, dtype=bool) standard_deviation_class_vise = np.std(proba_stacked, axis=-1) - mean_standard_deviation = np.mean(standard_deviation_class_vise, where=mask, axis=-1) + mean_standard_deviation = np.mean(standard_deviation_class_vise, where=~np.isnan(standard_deviation_class_vise), axis=-1) return mean_standard_deviation -def _entropy(proba: list, mask: np.ndarray = None) -> np.ndarray: +def _entropy(proba: list) -> np.ndarray: """ Calculates the entropy per class over dropout cycles @@ -348,15 +345,13 @@ def _entropy(proba: list, mask: np.ndarray = None) -> np.ndarray: """ proba_stacked = np.stack(proba, axis=len(proba[0].shape)) - if mask is None: - mask = np.ones(proba[0].shape, dtype=bool) #calculate entropy per class and sum along dropout cycles entropy_classes = entropy_sum(proba_stacked, axis=-1) - entropy = np.mean(entropy_classes, where=mask, axis=-1) + entropy = np.mean(entropy_classes, where=~np.isnan(entropy_classes), axis=-1) return entropy -def _variation_ratios(proba: list, mask: np.ndarray = None) -> np.ndarray: +def _variation_ratios(proba: list) -> np.ndarray: """ Calculates the variation ratios over dropout cycles @@ -371,13 +366,12 @@ def _variation_ratios(proba: list, mask: np.ndarray = None) -> np.ndarray: Returns the variation 
ratios of the dropout cycles. """ proba_stacked = np.stack(proba, axis=len(proba[0].shape)) - if mask is None: - mask = np.ones(proba[0].shape, dtype=bool) + #Calculate the variation ratios over the mean of dropout cycles valuesDCMean = np.mean(proba_stacked, axis=-1) - return 1 - np.amax(valuesDCMean, initial=0, where=mask, axis=-1) + return 1 - np.amax(valuesDCMean, initial=0, where=~np.isnan(valuesDCMean), axis=-1) -def _bald_divergence(proba: list, mask: np.ndarray = None) -> np.ndarray: +def _bald_divergence(proba: list) -> np.ndarray: """ Calculates the bald divergence for each instance @@ -412,10 +406,7 @@ def _bald_divergence(proba: list, mask: np.ndarray = None) -> np.ndarray: #sum all dimensions of diff besides first dim (instances) shaped = np.reshape(diff, (diff.shape[0], -1)) - if mask is None: - mask = np.ones(shaped.shape, dtype=bool) - - bald = np.sum(shaped, where=mask, axis=-1) + bald = np.sum(shaped, where=~np.isnan(shaped), axis=-1) return bald def _KL_divergence(proba) -> np.ndarray: From 30abf74efbf40828b520d7c830f951cac46d994b Mon Sep 17 00:00:00 2001 From: Stefan Ott Date: Thu, 15 Apr 2021 20:32:13 +0200 Subject: [PATCH 143/182] Dropout softmax handle NaN's --- modAL/dropout.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/modAL/dropout.py b/modAL/dropout.py index e49cbf3..bf7ca10 100644 --- a/modAL/dropout.py +++ b/modAL/dropout.py @@ -292,8 +292,10 @@ def get_predictions(classifier: BaseEstimator, X: modALinput, dropout_layer_inde #In comparison to: predict(), predict_proba() the infer() # does not change train/eval mode of other layers prediction = classifier.estimator.infer(samples) - prediction_proba = to_numpy(prediction.softmax(1)) - probas = prediction_proba if probas is None else np.vstack((probas, prediction_proba)) + mask = ~prediction.isnan() + prediction[mask] = prediction[mask].unsqueeze(0).softmax(1) + prediction = to_numpy(prediction) + probas = prediction if probas is None else np.vstack((probas, 
prediction)) predictions.append(probas) From cce7933fdc53197adf84fe2173d619cde7f92a14 Mon Sep 17 00:00:00 2001 From: maxkeller321 Date: Tue, 20 Apr 2021 08:08:40 +0200 Subject: [PATCH 144/182] logits_adapter_function --- modAL/dropout.py | 61 ++++++++++++++++++++++++++++++++---------------- 1 file changed, 41 insertions(+), 20 deletions(-) diff --git a/modAL/dropout.py b/modAL/dropout.py index bf7ca10..b4f3576 100644 --- a/modAL/dropout.py +++ b/modAL/dropout.py @@ -2,6 +2,7 @@ import sys import torch from collections.abc import Mapping +from typing import Callable from sklearn.base import BaseEstimator from sklearn.preprocessing import normalize @@ -13,6 +14,10 @@ from skorch.utils import to_numpy +def default_logits_adaptor(input_tensor: torch.tensor, samples: modALinput): + # default Callable parameter for get_predictions + return input_tensor + def KL_divergence(classifier: BaseEstimator, X: modALinput, n_instances: int = 1, random_tie_break: bool = False, dropout_layer_indexes: list = [], num_cycles : int = 50, **mc_dropout_kwargs) -> np.ndarray: @@ -36,7 +41,9 @@ def KL_divergence(classifier: BaseEstimator, X: modALinput, n_instances: int = 1 def mc_dropout_multi(classifier: BaseEstimator, X: modALinput, query_strategies: list = ["bald", "mean_st", "max_entropy", "max_var"], n_instances: int = 1, random_tie_break: bool = False, dropout_layer_indexes: list = [], - num_cycles : int = 50, sample_per_forward_pass: int = 1000, **mc_dropout_kwargs) -> np.ndarray: + num_cycles : int = 50, sample_per_forward_pass: int = 1000, + logits_adaptor: Callable[[torch.tensor, modALinput], torch.tensor] = default_logits_adaptor, + **mc_dropout_kwargs) -> np.ndarray: """ Multi metric dropout query strategy. Returns the specified metrics for given input data. Selection of query strategies are: @@ -49,7 +56,7 @@ def mc_dropout_multi(classifier: BaseEstimator, X: modALinput, query_strategies: Function returns dictionary of metrics with their name as key. 
The indices of the n-best samples (n_instances) is not used in this function. """ - predictions = get_predictions(classifier, X, dropout_layer_indexes, num_cycles, sample_per_forward_pass) + predictions = get_predictions(classifier, X, dropout_layer_indexes, num_cycles, sample_per_forward_pass, logits_adaptor) metrics_dict = {} if "bald" in query_strategies: @@ -65,7 +72,9 @@ def mc_dropout_multi(classifier: BaseEstimator, X: modALinput, query_strategies: def mc_dropout_bald(classifier: BaseEstimator, X: modALinput, n_instances: int = 1, random_tie_break: bool = False, dropout_layer_indexes: list = [], - num_cycles : int = 50, sample_per_forward_pass: int = 1000, **mc_dropout_kwargs) -> np.ndarray: + num_cycles : int = 50, sample_per_forward_pass: int = 1000, + logits_adaptor: Callable[[torch.tensor, modALinput], torch.tensor] = default_logits_adaptor, + **mc_dropout_kwargs,) -> np.ndarray: """ Mc-Dropout bald query strategy. Returns the indexes of the instances with the largest BALD (Bayesian Active Learning by Disagreement) score calculated through the dropout cycles @@ -91,6 +100,8 @@ def mc_dropout_bald(classifier: BaseEstimator, X: modALinput, n_instances: int = sample_per_forward_pass: max. sample number for each forward pass. The allocated RAM does mainly depend on this. Small number --> small RAM allocation + logits_adaptor: Callable which can be used to adapt the output of a forward pass + to the required vector format for the vectorised metric functions **uncertainty_measure_kwargs: Keyword arguments to be passed for the uncertainty measure function. 
@@ -98,7 +109,7 @@ def mc_dropout_bald(classifier: BaseEstimator, X: modALinput, n_instances: int = The indices of the instances from X chosen to be labelled; The mc-dropout metric of the chosen instances; """ - predictions = get_predictions(classifier, X, dropout_layer_indexes, num_cycles, sample_per_forward_pass) + predictions = get_predictions(classifier, X, dropout_layer_indexes, num_cycles, sample_per_forward_pass, logits_adaptor) #calculate BALD (Bayesian active learning divergence)) bald_scores = _bald_divergence(predictions) @@ -110,7 +121,9 @@ def mc_dropout_bald(classifier: BaseEstimator, X: modALinput, n_instances: int = def mc_dropout_mean_st(classifier: BaseEstimator, X: modALinput, n_instances: int = 1, random_tie_break: bool = False, dropout_layer_indexes: list = [], - num_cycles : int = 50, sample_per_forward_pass: int = 1000, **mc_dropout_kwargs) -> np.ndarray: + num_cycles : int = 50, sample_per_forward_pass: int = 1000, + logits_adaptor: Callable[[torch.tensor, modALinput], torch.tensor] = default_logits_adaptor, + **mc_dropout_kwargs) -> np.ndarray: """ Mc-Dropout mean standard deviation query strategy. Returns the indexes of the instances with the largest mean of the per class calculated standard deviations over multiple dropout cycles @@ -132,6 +145,8 @@ def mc_dropout_mean_st(classifier: BaseEstimator, X: modALinput, n_instances: in sample_per_forward_pass: max. sample number for each forward pass. The allocated RAM does mainly depend on this. Small number --> small RAM allocation + logits_adaptor: Callable which can be used to adapt the output of a forward pass + to the required vector format for the vectorised metric functions **uncertainty_measure_kwargs: Keyword arguments to be passed for the uncertainty measure function. 
@@ -141,7 +156,7 @@ def mc_dropout_mean_st(classifier: BaseEstimator, X: modALinput, n_instances: in """ # set dropout layers to train mode - predictions = get_predictions(classifier, X, dropout_layer_indexes, num_cycles, sample_per_forward_pass) + predictions = get_predictions(classifier, X, dropout_layer_indexes, num_cycles, sample_per_forward_pass, logits_adaptor) mean_standard_deviations = _mean_standard_deviation(predictions) @@ -152,7 +167,9 @@ def mc_dropout_mean_st(classifier: BaseEstimator, X: modALinput, n_instances: in def mc_dropout_max_entropy(classifier: BaseEstimator, X: modALinput, n_instances: int = 1, random_tie_break: bool = False, dropout_layer_indexes: list = [], - num_cycles : int = 50, sample_per_forward_pass: int = 1000, **mc_dropout_kwargs) -> np.ndarray: + num_cycles : int = 50, sample_per_forward_pass: int = 1000, + logits_adaptor: Callable[[torch.tensor, modALinput], torch.tensor] = default_logits_adaptor, + **mc_dropout_kwargs) -> np.ndarray: """ Mc-Dropout maximum entropy query strategy. Returns the indexes of the instances with the largest entropy of the per class calculated entropies over multiple dropout cycles @@ -174,6 +191,8 @@ def mc_dropout_max_entropy(classifier: BaseEstimator, X: modALinput, n_instances sample_per_forward_pass: max. sample number for each forward pass. The allocated RAM does mainly depend on this. Small number --> small RAM allocation + logits_adaptor: Callable which can be used to adapt the output of a forward pass + to the required vector format for the vectorised metric functions **uncertainty_measure_kwargs: Keyword arguments to be passed for the uncertainty measure function. 
@@ -181,7 +200,7 @@ def mc_dropout_max_entropy(classifier: BaseEstimator, X: modALinput, n_instances The indices of the instances from X chosen to be labelled; The mc-dropout metric of the chosen instances; """ - predictions = get_predictions(classifier, X, dropout_layer_indexes, num_cycles, sample_per_forward_pass) + predictions = get_predictions(classifier, X, dropout_layer_indexes, num_cycles, sample_per_forward_pass, logits_adaptor) #get entropy values for predictions entropy = _entropy(predictions) @@ -193,7 +212,9 @@ def mc_dropout_max_entropy(classifier: BaseEstimator, X: modALinput, n_instances def mc_dropout_max_variationRatios(classifier: BaseEstimator, X: modALinput, n_instances: int = 1, random_tie_break: bool = False, dropout_layer_indexes: list = [], - num_cycles : int = 50, sample_per_forward_pass: int = 1000, **mc_dropout_kwargs) -> np.ndarray: + num_cycles : int = 50, sample_per_forward_pass: int = 1000, + logits_adaptor: Callable[[torch.tensor, modALinput], torch.tensor] = default_logits_adaptor, + **mc_dropout_kwargs) -> np.ndarray: """ Mc-Dropout maximum variation ratios query strategy. Returns the indexes of the instances with the largest variation ratios over multiple dropout cycles @@ -215,6 +236,8 @@ def mc_dropout_max_variationRatios(classifier: BaseEstimator, X: modALinput, n_i sample_per_forward_pass: max. sample number for each forward pass. The allocated RAM does mainly depend on this. Small number --> small RAM allocation + logits_adaptor: Callable which can be used to adapt the output of a forward pass + to the required vector format for the vectorised metric functions **uncertainty_measure_kwargs: Keyword arguments to be passed for the uncertainty measure function. 
@@ -222,7 +245,7 @@ def mc_dropout_max_variationRatios(classifier: BaseEstimator, X: modALinput, n_i The indices of the instances from X chosen to be labelled; The mc-dropout metric of the chosen instances; """ - predictions = get_predictions(classifier, X, dropout_layer_indexes, num_cycles, sample_per_forward_pass) + predictions = get_predictions(classifier, X, dropout_layer_indexes, num_cycles, sample_per_forward_pass, logits_adaptor) #get variation ratios values for predictions variationRatios = _variation_ratios(predictions) @@ -233,7 +256,8 @@ def mc_dropout_max_variationRatios(classifier: BaseEstimator, X: modALinput, n_i return shuffled_argmax(variationRatios, n_instances=n_instances) def get_predictions(classifier: BaseEstimator, X: modALinput, dropout_layer_indexes: list, - num_predictions: int = 50, sample_per_forward_pass: int = 1000): + num_predictions: int = 50, sample_per_forward_pass: int = 1000, + logits_adaptor: Callable[[torch.tensor, modALinput], torch.tensor] = default_logits_adaptor): """ Runs num_predictions times the prediction of the classifier on the input X and puts the predictions in a list. @@ -247,6 +271,8 @@ def get_predictions(classifier: BaseEstimator, X: modALinput, dropout_layer_inde sample_per_forward_pass: max. sample number for each forward pass. The allocated RAM does mainly depend on this. 
Small number --> small RAM allocation + logits_adaptor: Callable which can be used to adapt the output of a forward pass + to the required vector format for the vectorised metric functions Return: prediction: list with all predictions """ @@ -258,14 +284,6 @@ def get_predictions(classifier: BaseEstimator, X: modALinput, dropout_layer_inde # set dropout layers to train mode set_dropout_mode(classifier.estimator.module_, dropout_layer_indexes, train_mode=True) - if isinstance(X, Mapping): #check for dict - for k, v in X.items(): - v.detach() - elif torch.is_tensor(X): #check for tensor - X.detach() - else: - raise RuntimeError("Error in model data type, only dict or tensors supported") - for i in range(num_predictions): split_args = [] @@ -287,16 +305,19 @@ def get_predictions(classifier: BaseEstimator, X: modALinput, dropout_layer_inde probas = None + for samples in split_args: #call Skorch infer function to perform model forward pass #In comparison to: predict(), predict_proba() the infer() # does not change train/eval mode of other layers - prediction = classifier.estimator.infer(samples) + logits = classifier.estimator.infer(samples) + prediction = logits_adaptor(logits, samples) mask = ~prediction.isnan() prediction[mask] = prediction[mask].unsqueeze(0).softmax(1) prediction = to_numpy(prediction) probas = prediction if probas is None else np.vstack((probas, prediction)) + predictions.append(probas) # set dropout layers to eval From f04db99271184e8ee718535ad95ea93c86be937e Mon Sep 17 00:00:00 2001 From: maxkeller321 Date: Tue, 20 Apr 2021 12:29:12 +0200 Subject: [PATCH 145/182] dict modAl update and documentation --- modAL/dropout.py | 5 +---- modAL/utils/data.py | 5 +++-- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/modAL/dropout.py b/modAL/dropout.py index b4f3576..2fd798a 100644 --- a/modAL/dropout.py +++ b/modAL/dropout.py @@ -276,10 +276,7 @@ def get_predictions(classifier: BaseEstimator, X: modALinput, dropout_layer_inde Return: 
prediction: list with all predictions """ - - #dbg - sample_per_forward_pass = 2 - + predictions = [] # set dropout layers to train mode set_dropout_mode(classifier.estimator.module_, dropout_layer_indexes, train_mode=True) diff --git a/modAL/utils/data.py b/modAL/utils/data.py index a7a466d..10ddb2e 100644 --- a/modAL/utils/data.py +++ b/modAL/utils/data.py @@ -103,9 +103,10 @@ def retrieve_rows(X: modALinput, elif isinstance(X, list): return np.array(X)[I].tolist() elif isinstance(X, dict): + X_return = {} for key, value in X.items(): - X[key] = retrieve_rows(value, I) - return X + X_return[key] = retrieve_rows(value, I) + return X_return elif isinstance(X, np.ndarray): return X[I] elif torch.is_tensor(X): From a4750f06a3426199ae5e81b7a82ddd41383fa416 Mon Sep 17 00:00:00 2001 From: maxkeller321 Date: Tue, 20 Apr 2021 13:45:05 +0200 Subject: [PATCH 146/182] remove time --- modAL/dropout.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/modAL/dropout.py b/modAL/dropout.py index 2fd798a..5ae9c6a 100644 --- a/modAL/dropout.py +++ b/modAL/dropout.py @@ -4,6 +4,7 @@ from collections.abc import Mapping from typing import Callable + from sklearn.base import BaseEstimator from sklearn.preprocessing import normalize @@ -110,8 +111,8 @@ def mc_dropout_bald(classifier: BaseEstimator, X: modALinput, n_instances: int = The mc-dropout metric of the chosen instances; """ predictions = get_predictions(classifier, X, dropout_layer_indexes, num_cycles, sample_per_forward_pass, logits_adaptor) - #calculate BALD (Bayesian active learning divergence)) + bald_scores = _bald_divergence(predictions) if not random_tie_break: @@ -276,7 +277,7 @@ def get_predictions(classifier: BaseEstimator, X: modALinput, dropout_layer_inde Return: prediction: list with all predictions """ - + predictions = [] # set dropout layers to train mode set_dropout_mode(classifier.estimator.module_, dropout_layer_indexes, train_mode=True) @@ -308,6 +309,7 @@ def get_predictions(classifier: 
BaseEstimator, X: modALinput, dropout_layer_inde #In comparison to: predict(), predict_proba() the infer() # does not change train/eval mode of other layers logits = classifier.estimator.infer(samples) + prediction = logits_adaptor(logits, samples) mask = ~prediction.isnan() prediction[mask] = prediction[mask].unsqueeze(0).softmax(1) From 83d92cb449574a1ccebbdab46a237e834b6120cc Mon Sep 17 00:00:00 2001 From: maxkeller321 Date: Thu, 22 Apr 2021 09:01:24 +0200 Subject: [PATCH 147/182] split_args replacement --- modAL/dropout.py | 38 +++++++++++++++++++------------------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/modAL/dropout.py b/modAL/dropout.py index 5ae9c6a..a4a83f2 100644 --- a/modAL/dropout.py +++ b/modAL/dropout.py @@ -282,26 +282,26 @@ def get_predictions(classifier: BaseEstimator, X: modALinput, dropout_layer_inde # set dropout layers to train mode set_dropout_mode(classifier.estimator.module_, dropout_layer_indexes, train_mode=True) - for i in range(num_predictions): - split_args = [] - - if isinstance(X, Mapping): #check for dict - for k, v in X.items(): - v.detach() - split_v = torch.split(v, sample_per_forward_pass) - #create sub-dictionary split for each forward pass with same keys&values - for split_idx, split in enumerate(split_v): - if len(split_args)<=split_idx: - split_args.append({}) - split_args[split_idx][k] = split - - elif torch.is_tensor(X): #check for tensor - X.detach() - split_args = torch.split(X, sample_per_forward_pass) - else: - raise RuntimeError("Error in model data type, only dict or tensors supported") - + split_args = [] + + if isinstance(X, Mapping): #check for dict + for k, v in X.items(): + v.detach() + split_v = torch.split(v, sample_per_forward_pass) + #create sub-dictionary split for each forward pass with same keys&values + for split_idx, split in enumerate(split_v): + if len(split_args)<=split_idx: + split_args.append({}) + split_args[split_idx][k] = split + elif torch.is_tensor(X): #check for tensor 
+ X.detach() + split_args = torch.split(X, sample_per_forward_pass) + else: + raise RuntimeError("Error in model data type, only dict or tensors supported") + + for i in range(num_predictions): + probas = None for samples in split_args: From 2ae3a6599789abd9afe8e212295bdb57e2e8d81d Mon Sep 17 00:00:00 2001 From: maxkeller321 Date: Fri, 23 Apr 2021 09:20:37 +0200 Subject: [PATCH 148/182] runtime_improvement, removement of np.vstack --- modAL/dropout.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/modAL/dropout.py b/modAL/dropout.py index a4a83f2..877453c 100644 --- a/modAL/dropout.py +++ b/modAL/dropout.py @@ -284,8 +284,12 @@ def get_predictions(classifier: BaseEstimator, X: modALinput, dropout_layer_inde split_args = [] + number_of_samples = 0 + if isinstance(X, Mapping): #check for dict for k, v in X.items(): + number_of_samples = v.size(0) + v.detach() split_v = torch.split(v, sample_per_forward_pass) #create sub-dictionary split for each forward pass with same keys&values @@ -295,16 +299,18 @@ def get_predictions(classifier: BaseEstimator, X: modALinput, dropout_layer_inde split_args[split_idx][k] = split elif torch.is_tensor(X): #check for tensor + number_of_samples = X.size(0) X.detach() split_args = torch.split(X, sample_per_forward_pass) else: raise RuntimeError("Error in model data type, only dict or tensors supported") + for i in range(num_predictions): probas = None - for samples in split_args: + for index, samples in enumerate(split_args): #call Skorch infer function to perform model forward pass #In comparison to: predict(), predict_proba() the infer() # does not change train/eval mode of other layers @@ -313,10 +319,12 @@ def get_predictions(classifier: BaseEstimator, X: modALinput, dropout_layer_inde prediction = logits_adaptor(logits, samples) mask = ~prediction.isnan() prediction[mask] = prediction[mask].unsqueeze(0).softmax(1) - prediction = to_numpy(prediction) - probas = prediction if probas is None else 
np.vstack((probas, prediction)) + if probas is None: probas = torch.empty((number_of_samples, prediction.shape[-1])) + + probas[range(sample_per_forward_pass*index, sample_per_forward_pass*(index+1)), :] = prediction + probas = to_numpy(prediction) predictions.append(probas) # set dropout layers to eval From ad081bb11b2c02c5be0b70a9607a175eb7151b0e Mon Sep 17 00:00:00 2001 From: maxkeller321 Date: Fri, 23 Apr 2021 09:25:54 +0200 Subject: [PATCH 149/182] change position of softmax in get_predictions --- modAL/dropout.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/modAL/dropout.py b/modAL/dropout.py index 877453c..3072786 100644 --- a/modAL/dropout.py +++ b/modAL/dropout.py @@ -315,16 +315,15 @@ def get_predictions(classifier: BaseEstimator, X: modALinput, dropout_layer_inde #In comparison to: predict(), predict_proba() the infer() # does not change train/eval mode of other layers logits = classifier.estimator.infer(samples) - prediction = logits_adaptor(logits, samples) - mask = ~prediction.isnan() - prediction[mask] = prediction[mask].unsqueeze(0).softmax(1) if probas is None: probas = torch.empty((number_of_samples, prediction.shape[-1])) - probas[range(sample_per_forward_pass*index, sample_per_forward_pass*(index+1)), :] = prediction - probas = to_numpy(prediction) + + mask = ~probas.isnan() + probas[mask] = probas[mask].unsqueeze(0).softmax(1) + probas = to_numpy(probas) predictions.append(probas) # set dropout layers to eval From af49d6de4ae83a8c21992363ef4eadabc72e9689 Mon Sep 17 00:00:00 2001 From: maxkeller321 Date: Fri, 23 Apr 2021 09:41:38 +0200 Subject: [PATCH 150/182] device tensor adaption --- modAL/dropout.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modAL/dropout.py b/modAL/dropout.py index 3072786..d83f724 100644 --- a/modAL/dropout.py +++ b/modAL/dropout.py @@ -317,7 +317,7 @@ def get_predictions(classifier: BaseEstimator, X: modALinput, dropout_layer_inde logits = 
classifier.estimator.infer(samples) prediction = logits_adaptor(logits, samples) - if probas is None: probas = torch.empty((number_of_samples, prediction.shape[-1])) + if probas is None: probas = torch.empty((number_of_samples, prediction.shape[-1]), device=prediction.device) probas[range(sample_per_forward_pass*index, sample_per_forward_pass*(index+1)), :] = prediction From 4f42d20855ffd4f3ebb4e69aeb53ee99e89c0e2b Mon Sep 17 00:00:00 2001 From: maxkeller321 Date: Fri, 23 Apr 2021 10:09:24 +0200 Subject: [PATCH 151/182] try_to_allocate_tensor_on_cpu --- modAL/dropout.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/modAL/dropout.py b/modAL/dropout.py index d83f724..ff6d17a 100644 --- a/modAL/dropout.py +++ b/modAL/dropout.py @@ -317,12 +317,11 @@ def get_predictions(classifier: BaseEstimator, X: modALinput, dropout_layer_inde logits = classifier.estimator.infer(samples) prediction = logits_adaptor(logits, samples) - if probas is None: probas = torch.empty((number_of_samples, prediction.shape[-1]), device=prediction.device) - probas[range(sample_per_forward_pass*index, sample_per_forward_pass*(index+1)), :] = prediction + mask = ~prediction.isnan() + prediction[mask] = prediction[mask].unsqueeze(0).softmax(1) + if probas is None: probas = torch.empty((number_of_samples, prediction.shape[-1]), device='cpu') + probas[range(sample_per_forward_pass*index, sample_per_forward_pass*(index+1)), :] = prediction.cpu() - - mask = ~probas.isnan() - probas[mask] = probas[mask].unsqueeze(0).softmax(1) probas = to_numpy(probas) predictions.append(probas) From a72386a84720158baee6a158e391e70641647f06 Mon Sep 17 00:00:00 2001 From: maxkeller321 Date: Fri, 23 Apr 2021 13:56:41 +0200 Subject: [PATCH 152/182] torch no grad in get_predictions --- modAL/dropout.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/modAL/dropout.py b/modAL/dropout.py index ff6d17a..fc7eab5 100644 --- a/modAL/dropout.py +++ b/modAL/dropout.py @@ 
-314,13 +314,14 @@ def get_predictions(classifier: BaseEstimator, X: modALinput, dropout_layer_inde #call Skorch infer function to perform model forward pass #In comparison to: predict(), predict_proba() the infer() # does not change train/eval mode of other layers - logits = classifier.estimator.infer(samples) - prediction = logits_adaptor(logits, samples) - - mask = ~prediction.isnan() - prediction[mask] = prediction[mask].unsqueeze(0).softmax(1) - if probas is None: probas = torch.empty((number_of_samples, prediction.shape[-1]), device='cpu') - probas[range(sample_per_forward_pass*index, sample_per_forward_pass*(index+1)), :] = prediction.cpu() + with torch.no_grad: + logits = classifier.estimator.infer(samples) + prediction = logits_adaptor(logits, samples) + + mask = ~prediction.isnan() + prediction[mask] = prediction[mask].unsqueeze(0).softmax(1) + if probas is None: probas = torch.empty((number_of_samples, prediction.shape[-1]), device='cpu') + probas[range(sample_per_forward_pass*index, sample_per_forward_pass*(index+1)), :] = prediction.cpu() probas = to_numpy(probas) predictions.append(probas) From 53e5484865f975dce404f15a64e00f755598af97 Mon Sep 17 00:00:00 2001 From: maxkeller321 Date: Fri, 23 Apr 2021 14:37:04 +0200 Subject: [PATCH 153/182] no grad function call --- modAL/dropout.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modAL/dropout.py b/modAL/dropout.py index fc7eab5..1da2361 100644 --- a/modAL/dropout.py +++ b/modAL/dropout.py @@ -314,7 +314,7 @@ def get_predictions(classifier: BaseEstimator, X: modALinput, dropout_layer_inde #call Skorch infer function to perform model forward pass #In comparison to: predict(), predict_proba() the infer() # does not change train/eval mode of other layers - with torch.no_grad: + with torch.no_grad(): logits = classifier.estimator.infer(samples) prediction = logits_adaptor(logits, samples) From 61442b68ee2deb9c3fa723ae21d1c0533729d519 Mon Sep 17 00:00:00 2001 From: maxkeller321 Date: Fri, 
23 Apr 2021 16:03:22 +0200 Subject: [PATCH 154/182] get_predictions with torch cat --- modAL/dropout.py | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/modAL/dropout.py b/modAL/dropout.py index 1da2361..6b5cd1e 100644 --- a/modAL/dropout.py +++ b/modAL/dropout.py @@ -308,7 +308,7 @@ def get_predictions(classifier: BaseEstimator, X: modALinput, dropout_layer_inde for i in range(num_predictions): - probas = None + probas = [] for index, samples in enumerate(split_args): #call Skorch infer function to perform model forward pass @@ -317,14 +317,12 @@ def get_predictions(classifier: BaseEstimator, X: modALinput, dropout_layer_inde with torch.no_grad(): logits = classifier.estimator.infer(samples) prediction = logits_adaptor(logits, samples) - mask = ~prediction.isnan() prediction[mask] = prediction[mask].unsqueeze(0).softmax(1) - if probas is None: probas = torch.empty((number_of_samples, prediction.shape[-1]), device='cpu') - probas[range(sample_per_forward_pass*index, sample_per_forward_pass*(index+1)), :] = prediction.cpu() - - probas = to_numpy(probas) - predictions.append(probas) + probas.append(prediction) + + probas = torch.cat(probas) + predictions.append(to_numpy(probas)) # set dropout layers to eval set_dropout_mode(classifier.estimator.module_, dropout_layer_indexes, train_mode=False) From c5160062363c449bb30ab5297b7517ff9ce0962f Mon Sep 17 00:00:00 2001 From: maxkeller321 Date: Sat, 24 Apr 2021 10:02:15 +0200 Subject: [PATCH 155/182] remove of unsqueece --- modAL/dropout.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modAL/dropout.py b/modAL/dropout.py index 6b5cd1e..e046a6d 100644 --- a/modAL/dropout.py +++ b/modAL/dropout.py @@ -318,7 +318,7 @@ def get_predictions(classifier: BaseEstimator, X: modALinput, dropout_layer_inde logits = classifier.estimator.infer(samples) prediction = logits_adaptor(logits, samples) mask = ~prediction.isnan() - prediction[mask] = prediction[mask].unsqueeze(0).softmax(1) + 
prediction[mask] = prediction[mask].softmax(1) probas.append(prediction) probas = torch.cat(probas) From 26585e1e73f8ca0ce8d7d59cb71feb6e9ec89f91 Mon Sep 17 00:00:00 2001 From: maxkeller321 Date: Tue, 4 May 2021 11:22:54 +0200 Subject: [PATCH 156/182] Put score back into base class (Accounting for arbitrary deep learning models is not possible) --- modAL/models/base.py | 16 +++++++++++++--- modAL/models/learners.py | 34 ---------------------------------- 2 files changed, 13 insertions(+), 37 deletions(-) diff --git a/modAL/models/base.py b/modAL/models/base.py index 19776ff..a165be1 100644 --- a/modAL/models/base.py +++ b/modAL/models/base.py @@ -176,9 +176,19 @@ def query(self, X_pool, *query_args, **query_kwargs) -> Union[Tuple, modALinput] return query_result, retrieve_rows(X_pool, query_result), query_metrics - @abc.abstractmethod - def score(self, *args, **kwargs) -> None: - pass + def score(self, X: modALinput, y: modALinput, **score_kwargs) -> Any: + """ + Interface for the score method of the predictor. + + Args: + X: The samples for which prediction accuracy is to be calculated. + y: Ground truth labels for X. + **score_kwargs: Keyword arguments to be passed to the .score() method of the predictor. + + Returns: + The score of the predictor. + """ + return self.estimator.score(X, y, **score_kwargs) @abc.abstractmethod def teach(self, *args, **kwargs) -> None: diff --git a/modAL/models/learners.py b/modAL/models/learners.py index 244f456..7e7ceba 100644 --- a/modAL/models/learners.py +++ b/modAL/models/learners.py @@ -160,20 +160,6 @@ def fit(self, X: modALinput, y: modALinput, bootstrap: bool = False, **fit_kwarg self.X_training, self.y_training = X, y return self._fit_to_known(bootstrap=bootstrap, **fit_kwargs) - def score(self, X: modALinput, y: modALinput, **score_kwargs) -> Any: - """ - Interface for the score method of the predictor. - - Args: - X: The samples for which prediction accuracy is to be calculated. - y: Ground truth labels for X. 
- **score_kwargs: Keyword arguments to be passed to the .score() method of the predictor. - - Returns: - The score of the predictor. - """ - return self.estimator.score(X, y, **score_kwargs) - def teach(self, X: modALinput, y: modALinput, bootstrap: bool = False, only_new: bool = False, **fit_kwargs) -> None: """ Adds X and y to the known training data and retrains the predictor with the augmented dataset. @@ -245,26 +231,6 @@ def fit(self, X: modALinput, y: modALinput, bootstrap: bool = False, **fit_kwarg """ return self._fit_on_new(X, y, bootstrap=bootstrap, **fit_kwargs) - def score(self, X: modALinput, y: modALinput) -> Any: - """ - Interface for the score method of the predictor. - - Args: - X: The samples for which prediction accuracy is to be calculated. - y: Ground truth labels for X. - - Returns: - The score of the predictor. - """ - """ - sklearn does only accept tensors of different dim for X and Y, if we use - Multilabel classifiaction. Using tensors of different sizes for more complex models (e.g. Transformers) - requires to bypass the sklearn checks by directly calling the NeuralNets infer() function. - """ - prediction = self.estimator.infer(X) - criterion = self.estimator.criterion() - return criterion(prediction, y).item() - def teach(self, X: modALinput, y: modALinput, warm_start: bool = True, bootstrap: bool = False, **fit_kwargs) -> None: """ Adds X and y to the known training data and retrains the predictor with the augmented dataset. 
From 72f9d92c2c8c60d294737694c651cced865ba40d Mon Sep 17 00:00:00 2001 From: Stefan Ott Date: Wed, 5 May 2021 13:20:05 +0200 Subject: [PATCH 157/182] Fix softmax dimension arg for general models --- modAL/dropout.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modAL/dropout.py b/modAL/dropout.py index e046a6d..92a6a0e 100644 --- a/modAL/dropout.py +++ b/modAL/dropout.py @@ -318,7 +318,7 @@ def get_predictions(classifier: BaseEstimator, X: modALinput, dropout_layer_inde logits = classifier.estimator.infer(samples) prediction = logits_adaptor(logits, samples) mask = ~prediction.isnan() - prediction[mask] = prediction[mask].softmax(1) + prediction[mask] = prediction[mask].softmax(-1) probas.append(prediction) probas = torch.cat(probas) From 8b71d2483570b7691f2ba595be7e1f9c623c5bb5 Mon Sep 17 00:00:00 2001 From: maxkeller321 Date: Wed, 26 May 2021 13:26:36 +0200 Subject: [PATCH 158/182] Remove not finished KL-Divergence & non used import & not more relevant TODO's --- modAL/dropout.py | 35 ----------------------------------- 1 file changed, 35 deletions(-) diff --git a/modAL/dropout.py b/modAL/dropout.py index e046a6d..f5762e7 100644 --- a/modAL/dropout.py +++ b/modAL/dropout.py @@ -1,5 +1,4 @@ import numpy as np -import sys import torch from collections.abc import Mapping from typing import Callable @@ -19,27 +18,6 @@ def default_logits_adaptor(input_tensor: torch.tensor, samples: modALinput): # default Callable parameter for get_predictions return input_tensor -def KL_divergence(classifier: BaseEstimator, X: modALinput, n_instances: int = 1, - random_tie_break: bool = False, dropout_layer_indexes: list = [], - num_cycles : int = 50, **mc_dropout_kwargs) -> np.ndarray: - """ - TODO: Work in progress - """ - # set dropout layers to train mode - set_dropout_mode(classifier.estimator.module_, dropout_layer_indexes, train_mode=True) - - predictions = get_predictions(classifier, X, num_cycles) - - # set dropout layers to eval - 
set_dropout_mode(classifier.estimator.module_, dropout_layer_indexes, train_mode=False) - - #KL_divergence = _KL_divergence(predictions) - - if not random_tie_break: - return multi_argmax(KL_divergence, n_instances=n_instances) - - return shuffled_argmax(KL_divergence, n_instances=n_instances) - def mc_dropout_multi(classifier: BaseEstimator, X: modALinput, query_strategies: list = ["bald", "mean_st", "max_entropy", "max_var"], n_instances: int = 1, random_tie_break: bool = False, dropout_layer_indexes: list = [], num_cycles : int = 50, sample_per_forward_pass: int = 1000, @@ -436,23 +414,10 @@ def _bald_divergence(proba: list) -> np.ndarray: bald = np.sum(shaped, where=~np.isnan(shaped), axis=-1) return bald -def _KL_divergence(proba) -> np.ndarray: - - #create 3D or 4D array from prediction dim: (drop_cycles, proba.shape[0], proba.shape[1], opt:proba.shape[2]) - proba_stacked = np.stack(proba, axis=len(proba[0].shape)) - # TODO work in progress - # TODO add dimensionality adaption - #number_of_dimensions = proba_stacked.ndim - #if proba_stacked.ndim > 2: - - normalized_proba = normalize(proba_stacked, axis=0) - def set_dropout_mode(model, dropout_layer_indexes: list, train_mode: bool): """ Function to enable the dropout layers by setting them to user specified mode (bool: train_mode) - TODO: Reduce maybe complexity - TODO: Keras support """ modules = list(model.modules()) # list of all modules in the network. 
From 1484bf00ae928a59cb69595984ba93977279caa5 Mon Sep 17 00:00:00 2001 From: maxkeller321 Date: Mon, 7 Jun 2021 10:20:40 +0200 Subject: [PATCH 159/182] PyTorch as explicit requirement --- rtd_requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/rtd_requirements.txt b/rtd_requirements.txt index db0bd81..685089f 100644 --- a/rtd_requirements.txt +++ b/rtd_requirements.txt @@ -5,3 +5,4 @@ ipykernel nbsphinx pandas skorch +torch From b33829e348b4be04b5ad44e31c0ce435f8f74ba6 Mon Sep 17 00:00:00 2001 From: maxkeller321 Date: Tue, 8 Jun 2021 09:09:43 +0200 Subject: [PATCH 160/182] modAL package dependencies adaption --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 78d36a1..8b2bd5a 100644 --- a/setup.py +++ b/setup.py @@ -10,5 +10,5 @@ url='https://modAL-python.github.io/', packages=['modAL', 'modAL.models', 'modAL.utils'], classifiers=['Development Status :: 4 - Beta'], - install_requires=['numpy>=1.13', 'scikit-learn>=0.18', 'scipy>=0.18', 'pandas>=1.1.0', 'skorch>=0.9.0'], + install_requires=['numpy==1.20.0', 'scikit-learn>=0.18', 'scipy>=0.18', 'pandas>=1.1.0', 'skorch==0.9.0'], ) From 1d77cdef0f3012d44eba9480be8f7bed93ae080f Mon Sep 17 00:00:00 2001 From: maxkeller321 Date: Tue, 8 Jun 2021 09:13:41 +0200 Subject: [PATCH 161/182] ad PyTorch to setup.py --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 8b2bd5a..6d80a37 100644 --- a/setup.py +++ b/setup.py @@ -10,5 +10,5 @@ url='https://modAL-python.github.io/', packages=['modAL', 'modAL.models', 'modAL.utils'], classifiers=['Development Status :: 4 - Beta'], - install_requires=['numpy==1.20.0', 'scikit-learn>=0.18', 'scipy>=0.18', 'pandas>=1.1.0', 'skorch==0.9.0'], + install_requires=['numpy==1.20.0', 'scikit-learn>=0.18', 'scipy>=0.18', 'pandas>=1.1.0', 'skorch==0.9.0', 'torch>=1.8.1'], ) From fd4506d2125246e3e88a5a1b294e1865ff0b2f7d Mon Sep 17 00:00:00 2001 From: Max Keller Date: Sun, 8 Aug 2021 
13:58:38 +0200 Subject: [PATCH 162/182] Add PR-adaptions & Do refactoring --- .travis.yml | 2 +- modAL/dropout.py | 187 ++++++++++++++++++++++----------------- modAL/models/base.py | 71 +++++++++------ modAL/models/learners.py | 78 +++++++++------- modAL/utils/selection.py | 7 +- setup.py | 3 +- 6 files changed, 206 insertions(+), 142 deletions(-) diff --git a/.travis.yml b/.travis.yml index 9c9ae7e..a84f0a9 100644 --- a/.travis.yml +++ b/.travis.yml @@ -22,7 +22,7 @@ after_success: matrix: include: install: - - pip install numpy==1.13 scikit-learn==0.18 scipy==0.18 + - pip install numpy==1.20 scikit-learn==0.18 scipy==0.18 - pip install codecov - pip install coverage - pip install . diff --git a/modAL/dropout.py b/modAL/dropout.py index 20c9f5f..6533c1d 100644 --- a/modAL/dropout.py +++ b/modAL/dropout.py @@ -1,5 +1,5 @@ import numpy as np -import torch +import torch from collections.abc import Mapping from typing import Callable @@ -14,15 +14,18 @@ from skorch.utils import to_numpy -def default_logits_adaptor(input_tensor: torch.tensor, samples: modALinput): + +def default_logits_adaptor(input_tensor: torch.tensor, samples: modALinput): # default Callable parameter for get_predictions return input_tensor -def mc_dropout_multi(classifier: BaseEstimator, X: modALinput, query_strategies: list = ["bald", "mean_st", "max_entropy", "max_var"], - n_instances: int = 1, random_tie_break: bool = False, dropout_layer_indexes: list = [], - num_cycles : int = 50, sample_per_forward_pass: int = 1000, - logits_adaptor: Callable[[torch.tensor, modALinput], torch.tensor] = default_logits_adaptor, - **mc_dropout_kwargs) -> np.ndarray: + +def mc_dropout_multi(classifier: BaseEstimator, X: modALinput, query_strategies: list = ["bald", "mean_st", "max_entropy", "max_var"], + n_instances: int = 1, random_tie_break: bool = False, dropout_layer_indexes: list = [], + num_cycles: int = 50, sample_per_forward_pass: int = 1000, + logits_adaptor: Callable[[ + torch.tensor, modALinput], 
torch.tensor] = default_logits_adaptor, + **mc_dropout_kwargs) -> np.ndarray: """ Multi metric dropout query strategy. Returns the specified metrics for given input data. Selection of query strategies are: @@ -35,7 +38,8 @@ def mc_dropout_multi(classifier: BaseEstimator, X: modALinput, query_strategies: Function returns dictionary of metrics with their name as key. The indices of the n-best samples (n_instances) is not used in this function. """ - predictions = get_predictions(classifier, X, dropout_layer_indexes, num_cycles, sample_per_forward_pass, logits_adaptor) + predictions = get_predictions( + classifier, X, dropout_layer_indexes, num_cycles, sample_per_forward_pass, logits_adaptor) metrics_dict = {} if "bald" in query_strategies: @@ -49,11 +53,13 @@ def mc_dropout_multi(classifier: BaseEstimator, X: modALinput, query_strategies: return None, metrics_dict + def mc_dropout_bald(classifier: BaseEstimator, X: modALinput, n_instances: int = 1, - random_tie_break: bool = False, dropout_layer_indexes: list = [], - num_cycles : int = 50, sample_per_forward_pass: int = 1000, - logits_adaptor: Callable[[torch.tensor, modALinput], torch.tensor] = default_logits_adaptor, - **mc_dropout_kwargs,) -> np.ndarray: + random_tie_break: bool = False, dropout_layer_indexes: list = [], + num_cycles: int = 50, sample_per_forward_pass: int = 1000, + logits_adaptor: Callable[[ + torch.tensor, modALinput], torch.tensor] = default_logits_adaptor, + **mc_dropout_kwargs,) -> np.ndarray: """ Mc-Dropout bald query strategy. 
Returns the indexes of the instances with the largest BALD (Bayesian Active Learning by Disagreement) score calculated through the dropout cycles @@ -88,8 +94,9 @@ def mc_dropout_bald(classifier: BaseEstimator, X: modALinput, n_instances: int = The indices of the instances from X chosen to be labelled; The mc-dropout metric of the chosen instances; """ - predictions = get_predictions(classifier, X, dropout_layer_indexes, num_cycles, sample_per_forward_pass, logits_adaptor) - #calculate BALD (Bayesian active learning divergence)) + predictions = get_predictions( + classifier, X, dropout_layer_indexes, num_cycles, sample_per_forward_pass, logits_adaptor) + # calculate BALD (Bayesian active learning divergence)) bald_scores = _bald_divergence(predictions) @@ -98,11 +105,13 @@ def mc_dropout_bald(classifier: BaseEstimator, X: modALinput, n_instances: int = return shuffled_argmax(bald_scores, n_instances=n_instances) + def mc_dropout_mean_st(classifier: BaseEstimator, X: modALinput, n_instances: int = 1, - random_tie_break: bool = False, dropout_layer_indexes: list = [], - num_cycles : int = 50, sample_per_forward_pass: int = 1000, - logits_adaptor: Callable[[torch.tensor, modALinput], torch.tensor] = default_logits_adaptor, - **mc_dropout_kwargs) -> np.ndarray: + random_tie_break: bool = False, dropout_layer_indexes: list = [], + num_cycles: int = 50, sample_per_forward_pass: int = 1000, + logits_adaptor: Callable[[ + torch.tensor, modALinput], torch.tensor] = default_logits_adaptor, + **mc_dropout_kwargs) -> np.ndarray: """ Mc-Dropout mean standard deviation query strategy. 
Returns the indexes of the instances with the largest mean of the per class calculated standard deviations over multiple dropout cycles @@ -135,7 +144,8 @@ def mc_dropout_mean_st(classifier: BaseEstimator, X: modALinput, n_instances: in """ # set dropout layers to train mode - predictions = get_predictions(classifier, X, dropout_layer_indexes, num_cycles, sample_per_forward_pass, logits_adaptor) + predictions = get_predictions( + classifier, X, dropout_layer_indexes, num_cycles, sample_per_forward_pass, logits_adaptor) mean_standard_deviations = _mean_standard_deviation(predictions) @@ -144,11 +154,13 @@ def mc_dropout_mean_st(classifier: BaseEstimator, X: modALinput, n_instances: in return shuffled_argmax(mean_standard_deviations, n_instances=n_instances) + def mc_dropout_max_entropy(classifier: BaseEstimator, X: modALinput, n_instances: int = 1, - random_tie_break: bool = False, dropout_layer_indexes: list = [], - num_cycles : int = 50, sample_per_forward_pass: int = 1000, - logits_adaptor: Callable[[torch.tensor, modALinput], torch.tensor] = default_logits_adaptor, - **mc_dropout_kwargs) -> np.ndarray: + random_tie_break: bool = False, dropout_layer_indexes: list = [], + num_cycles: int = 50, sample_per_forward_pass: int = 1000, + logits_adaptor: Callable[[ + torch.tensor, modALinput], torch.tensor] = default_logits_adaptor, + **mc_dropout_kwargs) -> np.ndarray: """ Mc-Dropout maximum entropy query strategy. 
Returns the indexes of the instances with the largest entropy of the per class calculated entropies over multiple dropout cycles @@ -179,9 +191,10 @@ def mc_dropout_max_entropy(classifier: BaseEstimator, X: modALinput, n_instances The indices of the instances from X chosen to be labelled; The mc-dropout metric of the chosen instances; """ - predictions = get_predictions(classifier, X, dropout_layer_indexes, num_cycles, sample_per_forward_pass, logits_adaptor) + predictions = get_predictions( + classifier, X, dropout_layer_indexes, num_cycles, sample_per_forward_pass, logits_adaptor) - #get entropy values for predictions + # get entropy values for predictions entropy = _entropy(predictions) if not random_tie_break: @@ -189,11 +202,13 @@ def mc_dropout_max_entropy(classifier: BaseEstimator, X: modALinput, n_instances return shuffled_argmax(entropy, n_instances=n_instances) + def mc_dropout_max_variationRatios(classifier: BaseEstimator, X: modALinput, n_instances: int = 1, - random_tie_break: bool = False, dropout_layer_indexes: list = [], - num_cycles : int = 50, sample_per_forward_pass: int = 1000, - logits_adaptor: Callable[[torch.tensor, modALinput], torch.tensor] = default_logits_adaptor, - **mc_dropout_kwargs) -> np.ndarray: + random_tie_break: bool = False, dropout_layer_indexes: list = [], + num_cycles: int = 50, sample_per_forward_pass: int = 1000, + logits_adaptor: Callable[[ + torch.tensor, modALinput], torch.tensor] = default_logits_adaptor, + **mc_dropout_kwargs) -> np.ndarray: """ Mc-Dropout maximum variation ratios query strategy. 
Returns the indexes of the instances with the largest variation ratios over multiple dropout cycles @@ -224,9 +239,10 @@ def mc_dropout_max_variationRatios(classifier: BaseEstimator, X: modALinput, n_i The indices of the instances from X chosen to be labelled; The mc-dropout metric of the chosen instances; """ - predictions = get_predictions(classifier, X, dropout_layer_indexes, num_cycles, sample_per_forward_pass, logits_adaptor) + predictions = get_predictions( + classifier, X, dropout_layer_indexes, num_cycles, sample_per_forward_pass, logits_adaptor) - #get variation ratios values for predictions + # get variation ratios values for predictions variationRatios = _variation_ratios(predictions) if not random_tie_break: @@ -234,9 +250,10 @@ def mc_dropout_max_variationRatios(classifier: BaseEstimator, X: modALinput, n_i return shuffled_argmax(variationRatios, n_instances=n_instances) + def get_predictions(classifier: BaseEstimator, X: modALinput, dropout_layer_indexes: list, - num_predictions: int = 50, sample_per_forward_pass: int = 1000, - logits_adaptor: Callable[[torch.tensor, modALinput], torch.tensor] = default_logits_adaptor): + num_predictions: int = 50, sample_per_forward_pass: int = 1000, + logits_adaptor: Callable[[torch.tensor, modALinput], torch.tensor] = default_logits_adaptor): """ Runs num_predictions times the prediction of the classifier on the input X and puts the predictions in a list. 
@@ -258,61 +275,61 @@ def get_predictions(classifier: BaseEstimator, X: modALinput, dropout_layer_inde predictions = [] # set dropout layers to train mode - set_dropout_mode(classifier.estimator.module_, dropout_layer_indexes, train_mode=True) + set_dropout_mode(classifier.estimator.module_, + dropout_layer_indexes, train_mode=True) split_args = [] - number_of_samples = 0 - - if isinstance(X, Mapping): #check for dict + if isinstance(X, Mapping): # check for dict for k, v in X.items(): - number_of_samples = v.size(0) v.detach() split_v = torch.split(v, sample_per_forward_pass) - #create sub-dictionary split for each forward pass with same keys&values + # create sub-dictionary split for each forward pass with same keys&values for split_idx, split in enumerate(split_v): - if len(split_args)<=split_idx: + if len(split_args) <= split_idx: split_args.append({}) split_args[split_idx][k] = split - - elif torch.is_tensor(X): #check for tensor - number_of_samples = X.size(0) + + elif torch.is_tensor(X): # check for tensor X.detach() split_args = torch.split(X, sample_per_forward_pass) else: - raise RuntimeError("Error in model data type, only dict or tensors supported") - + raise RuntimeError( + "Error in model data type, only dict or tensors supported") for i in range(num_predictions): probas = [] - for index, samples in enumerate(split_args): - #call Skorch infer function to perform model forward pass - #In comparison to: predict(), predict_proba() the infer() - # does not change train/eval mode of other layers - with torch.no_grad(): + for samples in split_args: + # call Skorch infer function to perform model forward pass + # In comparison to: predict(), predict_proba() the infer() + # does not change train/eval mode of other layers + with torch.no_grad(): logits = classifier.estimator.infer(samples) prediction = logits_adaptor(logits, samples) mask = ~prediction.isnan() prediction[mask] = prediction[mask].softmax(-1) probas.append(prediction) - + probas = 
torch.cat(probas) predictions.append(to_numpy(probas)) # set dropout layers to eval - set_dropout_mode(classifier.estimator.module_, dropout_layer_indexes, train_mode=False) + set_dropout_mode(classifier.estimator.module_, + dropout_layer_indexes, train_mode=False) return predictions -def entropy_sum(values: np.array, axis: int =-1): - #sum Scipy basic entropy function: entr() + +def entropy_sum(values: np.array, axis: int = -1): + # sum Scipy basic entropy function: entr() entropy = entr(values) return np.sum(entropy, where=~np.isnan(entropy), axis=axis) -def _mean_standard_deviation(proba: list) -> np.ndarray: + +def _mean_standard_deviation(proba: list) -> np.ndarray: """ Calculates the mean of the per class calculated standard deviations. @@ -327,14 +344,16 @@ def _mean_standard_deviation(proba: list) -> np.ndarray: Returns the mean standard deviation of the dropout cycles over all classes. """ - proba_stacked = np.stack(proba, axis=len(proba[0].shape)) + proba_stacked = np.stack(proba, axis=len(proba[0].shape)) standard_deviation_class_vise = np.std(proba_stacked, axis=-1) - mean_standard_deviation = np.mean(standard_deviation_class_vise, where=~np.isnan(standard_deviation_class_vise), axis=-1) + mean_standard_deviation = np.mean(standard_deviation_class_vise, where=~np.isnan( + standard_deviation_class_vise), axis=-1) return mean_standard_deviation -def _entropy(proba: list) -> np.ndarray: + +def _entropy(proba: list) -> np.ndarray: """ Calculates the entropy per class over dropout cycles @@ -349,14 +368,16 @@ def _entropy(proba: list) -> np.ndarray: Returns the entropy of the dropout cycles over all classes. 
""" - proba_stacked = np.stack(proba, axis=len(proba[0].shape)) + proba_stacked = np.stack(proba, axis=len(proba[0].shape)) - #calculate entropy per class and sum along dropout cycles + # calculate entropy per class and sum along dropout cycles entropy_classes = entropy_sum(proba_stacked, axis=-1) - entropy = np.mean(entropy_classes, where=~np.isnan(entropy_classes), axis=-1) + entropy = np.mean(entropy_classes, where=~ + np.isnan(entropy_classes), axis=-1) return entropy -def _variation_ratios(proba: list) -> np.ndarray: + +def _variation_ratios(proba: list) -> np.ndarray: """ Calculates the variation ratios over dropout cycles @@ -370,12 +391,13 @@ def _variation_ratios(proba: list) -> np.ndarray: Return: Returns the variation ratios of the dropout cycles. """ - proba_stacked = np.stack(proba, axis=len(proba[0].shape)) + proba_stacked = np.stack(proba, axis=len(proba[0].shape)) - #Calculate the variation ratios over the mean of dropout cycles + # Calculate the variation ratios over the mean of dropout cycles valuesDCMean = np.mean(proba_stacked, axis=-1) return 1 - np.amax(valuesDCMean, initial=0, where=~np.isnan(valuesDCMean), axis=-1) + def _bald_divergence(proba: list) -> np.ndarray: """ Calculates the bald divergence for each instance @@ -392,23 +414,23 @@ def _bald_divergence(proba: list) -> np.ndarray: """ proba_stacked = np.stack(proba, axis=len(proba[0].shape)) - #entropy along dropout cycles + # entropy along dropout cycles accumulated_entropy = entropy_sum(proba_stacked, axis=-1) f_x = accumulated_entropy/len(proba) - #score sums along dropout cycles + # score sums along dropout cycles accumulated_score = np.sum(proba_stacked, axis=-1) average_score = accumulated_score/len(proba) - #expand dimension w/o data for entropy calculation + # expand dimension w/o data for entropy calculation average_score = np.expand_dims(average_score, axis=-1) - #entropy over average prediction score + # entropy over average prediction score g_x = entropy_sum(average_score, 
axis=-1) - #entropy differences + # entropy differences diff = np.subtract(g_x, f_x) - #sum all dimensions of diff besides first dim (instances) + # sum all dimensions of diff besides first dim (instances) shaped = np.reshape(diff, (diff.shape[0], -1)) bald = np.sum(shaped, where=~np.isnan(shaped), axis=-1) @@ -417,23 +439,30 @@ def _bald_divergence(proba: list) -> np.ndarray: def set_dropout_mode(model, dropout_layer_indexes: list, train_mode: bool): """ - Function to enable the dropout layers by setting them to user specified mode (bool: train_mode) + Function to change the mode of the dropout layers (bool: train_mode -> train or evaluation) + + Args: + model: Pytorch model + dropout_layer_indexes: Indexes of the dropout layers which should be activated + Choose indices from : list(torch_model.modules()) + train_mode: boolean, true <=> train_mode, false <=> evaluation_mode """ - modules = list(model.modules()) # list of all modules in the network. - - if len(dropout_layer_indexes) != 0: - for index in dropout_layer_indexes: + modules = list(model.modules()) # list of all modules in the network. + + if len(dropout_layer_indexes) != 0: + for index in dropout_layer_indexes: layer = modules[index] - if layer.__class__.__name__.startswith('Dropout'): + if layer.__class__.__name__.startswith('Dropout'): if True == train_mode: layer.train() elif False == train_mode: layer.eval() - else: - raise KeyError("The passed index: {} is not a Dropout layer".format(index)) + else: + raise KeyError( + "The passed index: {} is not a Dropout layer".format(index)) - else: + else: for module in modules: if module.__class__.__name__.startswith('Dropout'): if True == train_mode: diff --git a/modAL/models/base.py b/modAL/models/base.py index a165be1..63f5409 100644 --- a/modAL/models/base.py +++ b/modAL/models/base.py @@ -41,6 +41,7 @@ class BaseLearner(ABC, BaseEstimator): estimator: The estimator to be used in the active learning loop. 
query_strategy: Function providing the query strategy for the active learning loop. """ + def __init__(self, estimator: BaseEstimator, query_strategy: Callable, @@ -54,7 +55,8 @@ def __init__(self, self.query_strategy = query_strategy self.on_transformed = on_transformed - assert isinstance(force_all_finite, bool), 'force_all_finite must be a bool' + assert isinstance(force_all_finite, + bool), 'force_all_finite must be a bool' self.force_all_finite = force_all_finite def transform_without_estimating(self, X: modALinput) -> Union[np.ndarray, sp.csr_matrix]: @@ -86,7 +88,8 @@ def transform_without_estimating(self, X: modALinput) -> Union[np.ndarray, sp.cs # components but the final estimator, which is replaced by an empty (passthrough) component. # This prevents any special handling of the final transformation pipe, which is usually # expected to be an estimator. - transformation_pipe = pipe.__class__(steps=[*pipe.steps[:-1], ('passthrough', 'passthrough')]) + transformation_pipe = pipe.__class__( + steps=[*pipe.steps[:-1], ('passthrough', 'passthrough')]) Xt.append(transformation_pipe.transform(X)) # in case no transformation pipelines are used by the estimator, @@ -115,7 +118,8 @@ def _fit_on_new(self, X: modALinput, y: modALinput, bootstrap: bool = False, **f if not bootstrap: self.estimator.fit(X, y, **fit_kwargs) else: - bootstrap_idx = np.random.choice(range(X.shape[0]), X.shape[0], replace=True) + bootstrap_idx = np.random.choice( + range(X.shape[0]), X.shape[0], replace=True) self.estimator.fit(X[bootstrap_idx], y[bootstrap_idx]) return self @@ -150,12 +154,13 @@ def predict_proba(self, X: modALinput, **predict_proba_kwargs) -> Any: """ return self.estimator.predict_proba(X, **predict_proba_kwargs) - def query(self, X_pool, *query_args, **query_kwargs) -> Union[Tuple, modALinput]: + def query(self, X_pool, return_metrics: bool = False, *query_args, **query_kwargs) -> Union[Tuple, modALinput]: """ Finds the n_instances most informative point in the data 
provided by calling the query_strategy function. Args: X_pool: Pool of unlabeled instances to retrieve most informative instances from + return_metrics: boolean to indicate, if the corresponding query metrics should be (not) returned *query_args: The arguments for the query strategy. For instance, in the case of :func:`~modAL.uncertainty.uncertainty_sampling`, it is the pool of samples from which the query strategy should choose instances to request labels. @@ -165,16 +170,24 @@ def query(self, X_pool, *query_args, **query_kwargs) -> Union[Tuple, modALinput] Value of the query_strategy function. Should be the indices of the instances from the pool chosen to be labelled and the instances themselves. Can be different in other cases, for instance only the instance to be labelled upon query synthesis. + query_metrics: returns also the corresponding metrics, if return_metrics == True """ - query_result, query_metrics = self.query_strategy(self, X_pool, *query_args, **query_kwargs) - - if isinstance(query_result, tuple): - warnings.warn("Query strategies should no longer return the selected instances, " - "this is now handled by the query method. 
" - "Please return only the indices of the selected instances.", DeprecationWarning) - return query_result - return query_result, retrieve_rows(X_pool, query_result), query_metrics + try: + query_result, query_metrics = self.query_strategy( + self, X_pool, *query_args, **query_kwargs) + + except TypeError: + query_metrics = None + query_result = self.query_strategy( + self, X_pool, *query_args, **query_kwargs) + warnings.warn( + "The selected query strategy doesn't support return_metrics") + + if return_metrics: + return query_result, retrieve_rows(X_pool, query_result), query_metrics + else: + return query_result, retrieve_rows(X_pool, query_result) def score(self, X: modALinput, y: modALinput, **score_kwargs) -> Any: """ @@ -205,6 +218,7 @@ class BaseCommittee(ABC, BaseEstimator): on_transformed: Whether to transform samples with the pipeline defined by each learner's estimator when applying the query strategy. """ + def __init__(self, learner_list: List[BaseLearner], query_strategy: Callable, on_transformed: bool = False) -> None: assert type(learner_list) == list, 'learners must be supplied in a list' @@ -212,7 +226,6 @@ def __init__(self, learner_list: List[BaseLearner], query_strategy: Callable, on self.query_strategy = query_strategy self.on_transformed = on_transformed - def __iter__(self) -> Iterator[BaseLearner]: for learner in self.learner_list: yield learner @@ -264,12 +277,13 @@ def transform_without_estimating(self, X: modALinput) -> Union[np.ndarray, sp.cs """ return data_hstack([learner.transform_without_estimating(X) for learner in self.learner_list]) - def query(self, X_pool, *query_args, **query_kwargs) -> Union[Tuple, modALinput]: + def query(self, X_pool, return_metrics: bool = False, *query_args, **query_kwargs) -> Union[Tuple, modALinput]: """ Finds the n_instances most informative point in the data provided by calling the query_strategy function. 
Args: X_pool: Pool of unlabeled instances to retrieve most informative instances from + return_metrics: boolean to indicate, if the corresponding query metrics should be (not) returned *query_args: The arguments for the query strategy. For instance, in the case of :func:`~modAL.disagreement.max_disagreement_sampling`, it is the pool of samples from which the query. strategy should choose instances to request labels. @@ -279,16 +293,24 @@ def query(self, X_pool, *query_args, **query_kwargs) -> Union[Tuple, modALinput] Return value of the query_strategy function. Should be the indices of the instances from the pool chosen to be labelled and the instances themselves. Can be different in other cases, for instance only the instance to be labelled upon query synthesis. + query_metrics: returns also the corresponding metrics, if return_metrics == True """ - query_result, query_metrics = self.query_strategy(self, X_pool, *query_args, **query_kwargs) - if isinstance(query_result, tuple): - warnings.warn("Query strategies should no longer return the selected instances, " - "this is now handled by the query method. 
" - "Please return only the indices of the selected instances", DeprecationWarning) - return query_result - - return query_result, retrieve_rows(X_pool, query_result), query_metrics + try: + query_result, query_metrics = self.query_strategy( + self, X_pool, *query_args, **query_kwargs) + + except TypeError: + query_metrics = None + query_result = self.query_strategy( + self, X_pool, *query_args, **query_kwargs) + warnings.warn( + "The selected query strategy doesn't support return_metrics") + + if return_metrics: + return query_result, retrieve_rows(X_pool, query_result), query_metrics + else: + return query_result, retrieve_rows(X_pool, query_result) def _set_classes(self): """ @@ -298,7 +320,8 @@ def _set_classes(self): # assemble the list of known classes from each learner try: # if estimators are fitted - known_classes = tuple(learner.estimator.classes_ for learner in self.learner_list) + known_classes = tuple( + learner.estimator.classes_ for learner in self.learner_list) except AttributeError: # handle unfitted estimators self.classes_ = None @@ -311,7 +334,6 @@ def _set_classes(self): ) self.n_classes_ = len(self.classes_) - pass @abc.abstractmethod @@ -321,4 +343,3 @@ def vote(self, X: modALinput) -> Any: # TODO: clarify typing @abc.abstractmethod def vote_proba(self, X: modALinput, **predict_proba_kwargs) -> Any: pass - diff --git a/modAL/models/learners.py b/modAL/models/learners.py index 7e7ceba..1ca6379 100644 --- a/modAL/models/learners.py +++ b/modAL/models/learners.py @@ -83,7 +83,7 @@ def __init__(self, **fit_kwargs ) -> None: super().__init__(estimator, query_strategy, on_transformed, **fit_kwargs) - + self.X_training = X_training self.y_training = y_training @@ -103,7 +103,7 @@ def _add_training_data(self, X: modALinput, y: modALinput) -> None: classifier has seen. 
""" check_X_y(X, y, accept_sparse=True, ensure_2d=False, allow_nd=True, multi_output=True, dtype=None, - force_all_finite=self.force_all_finite) + force_all_finite=self.force_all_finite) if self.X_training is None: self.X_training = X @@ -131,11 +131,13 @@ def _fit_to_known(self, bootstrap: bool = False, **fit_kwargs) -> 'BaseLearner': self.estimator.fit(self.X_training, self.y_training, **fit_kwargs) else: n_instances = self.X_training.shape[0] - bootstrap_idx = np.random.choice(range(n_instances), n_instances, replace=True) - self.estimator.fit(self.X_training[bootstrap_idx], self.y_training[bootstrap_idx], **fit_kwargs) + bootstrap_idx = np.random.choice( + range(n_instances), n_instances, replace=True) + self.estimator.fit( + self.X_training[bootstrap_idx], self.y_training[bootstrap_idx], **fit_kwargs) + + return self - return self - def fit(self, X: modALinput, y: modALinput, bootstrap: bool = False, **fit_kwargs) -> 'BaseLearner': """ Interface for the fit method of the predictor. Fits the predictor to the supplied data, then stores it @@ -154,9 +156,9 @@ def fit(self, X: modALinput, y: modALinput, bootstrap: bool = False, **fit_kwarg Returns: self - """ + """ check_X_y(X, y, accept_sparse=True, ensure_2d=False, allow_nd=True, multi_output=True, dtype=None, - force_all_finite=self.force_all_finite) + force_all_finite=self.force_all_finite) self.X_training, self.y_training = X, y return self._fit_to_known(bootstrap=bootstrap, **fit_kwargs) @@ -179,9 +181,10 @@ def teach(self, X: modALinput, y: modALinput, bootstrap: bool = False, only_new: self._fit_to_known(bootstrap=bootstrap, **fit_kwargs) else: check_X_y(X, y, accept_sparse=True, ensure_2d=False, allow_nd=True, multi_output=True, dtype=None, - force_all_finite=self.force_all_finite) + force_all_finite=self.force_all_finite) self._fit_on_new(X, y, bootstrap=bootstrap, **fit_kwargs) + class DeepActiveLearner(BaseLearner): """ This class is an model of a general deep active learning algorithm. 
@@ -209,10 +212,10 @@ def __init__(self, on_transformed: bool = False, **fit_kwargs ) -> None: - #TODO: Check if given query strategy works for Deep Learning + # TODO: Check if given query strategy works for Deep Learning super().__init__(estimator, query_strategy, on_transformed, **fit_kwargs) - self.estimator.initialize() # does maybe just work with pytorch + self.estimator.initialize() def fit(self, X: modALinput, y: modALinput, bootstrap: bool = False, **fit_kwargs) -> 'BaseLearner': """ @@ -228,7 +231,7 @@ def fit(self, X: modALinput, y: modALinput, bootstrap: bool = False, **fit_kwarg Returns: self - """ + """ return self._fit_on_new(X, y, bootstrap=bootstrap, **fit_kwargs) def teach(self, X: modALinput, y: modALinput, warm_start: bool = True, bootstrap: bool = False, **fit_kwargs) -> None: @@ -245,15 +248,17 @@ def teach(self, X: modALinput, y: modALinput, warm_start: bool = True, bootstrap **fit_kwargs: Keyword arguments to be passed to the fit method of the predictor. """ - if warm_start: - if not bootstrap: + if warm_start: + if not bootstrap: self.estimator.partial_fit(X, y, **fit_kwargs) else: - bootstrap_idx = np.random.choice(range(X.shape[0]), X.shape[0], replace=True) - self.estimator.partial_fit(X[bootstrap_idx], y[bootstrap_idx], **fit_kwargs) - else: + bootstrap_idx = np.random.choice( + range(X.shape[0]), X.shape[0], replace=True) + self.estimator.partial_fit( + X[bootstrap_idx], y[bootstrap_idx], **fit_kwargs) + else: self._fit_on_new(X, y, bootstrap=bootstrap, **fit_kwargs) - + @property def num_epochs(self): """ @@ -268,11 +273,11 @@ def num_epochs(self, value): can be changed at any time, even after the model was trained. 
""" if isinstance(value, int): - if 0 < value <= 100: + if 0 < value <= 100: self.estimator.max_epochs = value - else: + else: raise ValueError("num_epochs must be in range 0 < x <= 100") - else: + else: raise TypeError("num_epochs must be of type integer!") @property @@ -289,11 +294,11 @@ def batch_size(self, value): can be changed at any time, even after the model was trained. """ if isinstance(value, int): - if 0 < value: + if 0 < value: self.estimator.batch_size = value - else: + else: raise ValueError("batch size must be larger than 0") - else: + else: raise TypeError("batch size must be of type integer!") @@ -369,6 +374,7 @@ class BayesianOptimizer(ActiveLearner): ... query_idx, query_inst = optimizer.query(X) ... optimizer.teach(X[query_idx].reshape(1, -1), y[query_idx].reshape(1, -1)) """ + def __init__(self, estimator: BaseEstimator, query_strategy: Callable = max_EI, @@ -485,13 +491,14 @@ class Committee(BaseCommittee): ... y=iris['target'][query_idx].reshape(1, ) ... ) """ + def __init__(self, learner_list: List[ActiveLearner], query_strategy: Callable = vote_entropy_sampling, on_transformed: bool = False) -> None: super().__init__(learner_list, query_strategy, on_transformed) self._set_classes() # TODO: update training data when using fit() and teach() methods self.X_training = None - + def _add_training_data(self, X: modALinput, y: modALinput) -> None: """ Adds the new data and label to the known data for each learner, but does not retrain the model. @@ -506,7 +513,7 @@ def _add_training_data(self, X: modALinput, y: modALinput) -> None: """ for learner in self.learner_list: learner._add_training_data(X, y) - + def _fit_to_known(self, bootstrap: bool = False, **fit_kwargs) -> None: """ Fits all learners to the training data and labels provided to it so far. 
@@ -518,7 +525,7 @@ def _fit_to_known(self, bootstrap: bool = False, **fit_kwargs) -> None: """ for learner in self.learner_list: learner._fit_to_known(bootstrap=bootstrap, **fit_kwargs) - + def fit(self, X: modALinput, y: modALinput, **fit_kwargs) -> None: """ Fits every learner to a subset sampled with replacement from X. Calling this method makes the learner forget the @@ -534,7 +541,7 @@ def fit(self, X: modALinput, y: modALinput, **fit_kwargs) -> None: """ for learner in self.learner_list: learner.fit(X, y, **fit_kwargs) - + self._set_classes() def rebag(self, **fit_kwargs) -> None: @@ -658,7 +665,8 @@ def vote_proba(self, X: modALinput, **predict_proba_kwargs) -> Any: # probability prediction is straightforward for learner_idx, learner in enumerate(self.learner_list): - proba[:, learner_idx, :] = learner.predict_proba(X, **predict_proba_kwargs) + proba[:, learner_idx, :] = learner.predict_proba( + X, **predict_proba_kwargs) else: for learner_idx, learner in enumerate(self.learner_list): @@ -721,13 +729,14 @@ class DeepCommittee(BaseCommittee): ... y=iris['target'][query_idx].reshape(1, ) ... ) """ + def __init__(self, learner_list: List[DeepActiveLearner], query_strategy: Callable = vote_entropy_sampling, on_transformed: bool = False) -> None: super().__init__(learner_list, query_strategy, on_transformed) self._set_classes() # TODO: update training data when using fit() and teach() methods self.X_training = None - + def fit(self, X: modALinput, y: modALinput, **fit_kwargs) -> None: """ Fits every learner to a subset sampled with replacement from X. 
Calling this method makes the learner forget the @@ -743,7 +752,7 @@ def fit(self, X: modALinput, y: modALinput, **fit_kwargs) -> None: """ for learner in self.learner_list: learner.fit(X, y, **fit_kwargs) - + self._set_classes() def teach(self, X: modALinput, y: modALinput, bootstrap: bool = False, **fit_kwargs) -> None: @@ -851,7 +860,8 @@ def vote_proba(self, X: modALinput, **predict_proba_kwargs) -> Any: # probability prediction is straightforward for learner_idx, learner in enumerate(self.learner_list): - proba[:, learner_idx, :] = learner.predict_proba(X, **predict_proba_kwargs) + proba[:, learner_idx, :] = learner.predict_proba( + X, **predict_proba_kwargs) else: for learner_idx, learner in enumerate(self.learner_list): @@ -916,6 +926,7 @@ class CommitteeRegressor(BaseCommittee): ... query_idx, query_instance = committee.query(X.reshape(-1, 1)) ... committee.teach(X[query_idx].reshape(-1, 1), y[query_idx].reshape(-1, 1)) """ + def __init__(self, learner_list: List[ActiveLearner], query_strategy: Callable = max_std_sampling, on_transformed: bool = False) -> None: super().__init__(learner_list, query_strategy, on_transformed) @@ -951,6 +962,7 @@ def vote(self, X: modALinput, **predict_kwargs): prediction = np.zeros(shape=(len(X), len(self.learner_list))) for learner_idx, learner in enumerate(self.learner_list): - prediction[:, learner_idx] = learner.predict(X, **predict_kwargs).reshape(-1, ) + prediction[:, learner_idx] = learner.predict( + X, **predict_kwargs).reshape(-1, ) return prediction diff --git a/modAL/utils/selection.py b/modAL/utils/selection.py index 537c082..6c9c2d9 100644 --- a/modAL/utils/selection.py +++ b/modAL/utils/selection.py @@ -14,7 +14,6 @@ def shuffled_argmax(values: np.ndarray, n_instances: int = 1) -> np.ndarray: Args: values: Contains the values to be selected from. n_instances: Specifies how many indices and values to return. 
- return_negative: if true: returns negative values Returns: The indices and values of the n_instances largest values. """ @@ -26,7 +25,8 @@ def shuffled_argmax(values: np.ndarray, n_instances: int = 1) -> np.ndarray: # getting the n_instances best instance # since mergesort is used, the shuffled order is preserved - sorted_query_idx = np.argsort(shuffled_values, kind='mergesort')[len(shuffled_values)-n_instances:] + sorted_query_idx = np.argsort(shuffled_values, kind='mergesort')[ + len(shuffled_values)-n_instances:] # inverting the shuffle query_idx = shuffled_idx[sorted_query_idx] @@ -98,5 +98,6 @@ def weighted_random(weights: np.ndarray, n_instances: int = 1) -> np.ndarray: weight_sum = np.sum(weights) assert weight_sum > 0, 'the sum of weights must be larger than zero' - random_idx = np.random.choice(range(len(weights)), size=n_instances, p=weights/weight_sum, replace=False) + random_idx = np.random.choice( + range(len(weights)), size=n_instances, p=weights/weight_sum, replace=False) return random_idx diff --git a/setup.py b/setup.py index 6d80a37..3f48835 100644 --- a/setup.py +++ b/setup.py @@ -10,5 +10,6 @@ url='https://modAL-python.github.io/', packages=['modAL', 'modAL.models', 'modAL.utils'], classifiers=['Development Status :: 4 - Beta'], - install_requires=['numpy==1.20.0', 'scikit-learn>=0.18', 'scipy>=0.18', 'pandas>=1.1.0', 'skorch==0.9.0', 'torch>=1.8.1'], + install_requires=['numpy==1.20.0', 'scikit-learn>=0.18', + 'scipy>=0.18', 'pandas>=1.1.0', 'skorch==0.9.0', 'torch>=1.8.1'], ) From 0a2d24a197a1476f5a5da1bc2cf0e546c3b5e030 Mon Sep 17 00:00:00 2001 From: Max Keller Date: Sun, 22 Aug 2021 15:09:22 +0200 Subject: [PATCH 163/182] Add DeepActiveLearner tests & fix ValueError --- modAL/models/base.py | 19 +- modAL/utils/data.py | 19 +- tests/core_tests.py | 539 ++++++++++++++++++++++++++++++------------- 3 files changed, 395 insertions(+), 182 deletions(-) diff --git a/modAL/models/base.py b/modAL/models/base.py index 63f5409..cff5dc0 100644 --- 
a/modAL/models/base.py +++ b/modAL/models/base.py @@ -177,7 +177,7 @@ def query(self, X_pool, return_metrics: bool = False, *query_args, **query_kwarg query_result, query_metrics = self.query_strategy( self, X_pool, *query_args, **query_kwargs) - except TypeError: + except ValueError: query_metrics = None query_result = self.query_strategy( self, X_pool, *query_args, **query_kwargs) @@ -246,22 +246,10 @@ def _fit_on_new(self, X: modALinput, y: modALinput, bootstrap: bool = False, **f for learner in self.learner_list: learner._fit_on_new(X, y, bootstrap=bootstrap, **fit_kwargs) - @abc.abstractmethod - def fit(self, X: modALinput, y: modALinput, **fit_kwargs) -> Any: - pass - @abc.abstractmethod def predict(self, X: modALinput) -> Any: pass - @abc.abstractmethod - def predict_proba(self, X: modALinput, **predict_proba_kwargs) -> Any: - pass - - @abc.abstractmethod - def score(self, X: modALinput, y: modALinput, sample_weight: List[float] = None) -> Any: - pass - @abc.abstractmethod def teach(self, X: modALinput, y: modALinput, bootstrap: bool = False, **fit_kwargs) -> Any: pass @@ -300,7 +288,7 @@ def query(self, X_pool, return_metrics: bool = False, *query_args, **query_kwarg query_result, query_metrics = self.query_strategy( self, X_pool, *query_args, **query_kwargs) - except TypeError: + except ValueError: query_metrics = None query_result = self.query_strategy( self, X_pool, *query_args, **query_kwargs) @@ -340,6 +328,3 @@ def _set_classes(self): def vote(self, X: modALinput) -> Any: # TODO: clarify typing pass - @abc.abstractmethod - def vote_proba(self, X: modALinput, **predict_proba_kwargs) -> Any: - pass diff --git a/modAL/utils/data.py b/modAL/utils/data.py index 10ddb2e..12d8a32 100644 --- a/modAL/utils/data.py +++ b/modAL/utils/data.py @@ -27,7 +27,7 @@ def data_vstack(blocks: Sequence[modALinput]) -> modALinput: return np.concatenate(blocks) elif isinstance(blocks[0], list): return np.concatenate(blocks).tolist() - elif torch.is_tensor(blocks[0]): + elif 
torch.is_tensor(blocks[0]): return torch.cat(blocks) raise TypeError('%s datatype is not supported' % type(blocks[0])) @@ -51,23 +51,22 @@ def data_hstack(blocks: Sequence[modALinput]) -> modALinput: return np.hstack(blocks) elif isinstance(blocks[0], list): return np.hstack(blocks).tolist() - elif torch.is_tensor(blocks[0]): + elif torch.is_tensor(blocks[0]): return torch.cat(blocks, dim=1) TypeError('%s datatype is not supported' % type(blocks[0])) -def add_row(X:modALinput, row: modALinput): +def add_row(X: modALinput, row: modALinput): """ Returns X' = [X - row] - """ + row] """ if isinstance(X, np.ndarray): return np.vstack((X, row)) - elif torch.is_tensor(X): + elif torch.is_tensor(X): return torch.cat((X, row)) elif isinstance(X, list): return np.vstack((X, row)).tolist() @@ -102,7 +101,7 @@ def retrieve_rows(X: modALinput, return X.iloc[I] elif isinstance(X, list): return np.array(X)[I].tolist() - elif isinstance(X, dict): + elif isinstance(X, dict): X_return = {} for key, value in X.items(): X_return[key] = retrieve_rows(value, I) @@ -118,7 +117,6 @@ def retrieve_rows(X: modALinput, def drop_rows(X: modALinput, I: Union[int, List[int], np.ndarray]) -> Union[sp.csc_matrix, np.ndarray, pd.DataFrame]: """ - TODO: Add pytorch support Returns X without the row(s) at index/indices I """ if sp.issparse(X): @@ -131,6 +129,9 @@ def drop_rows(X: modALinput, return np.delete(X, I, axis=0) elif isinstance(X, list): return np.delete(X, I, axis=0).tolist() + elif torch.is_tensor(X): + return X[[True if row not in I else False + for row in range(X.size(0))]] raise TypeError('%s datatype is not supported' % type(X)) @@ -165,7 +166,7 @@ def data_shape(X: modALinput): return X.shape elif isinstance(X, list): return np.array(X).shape - elif torch.is_tensor(X): + elif torch.is_tensor(X): return tuple(X.size()) raise TypeError('%s datatype is not supported' % type(X)) diff --git a/tests/core_tests.py b/tests/core_tests.py index 1ed4f95..4beb0fc 100644 --- a/tests/core_tests.py 
+++ b/tests/core_tests.py @@ -4,6 +4,8 @@ import pandas as pd import mock +from unittest.mock import MagicMock + import modAL.models.base import modAL.models.learners import modAL.utils.selection @@ -34,6 +36,8 @@ from scipy.special import ndtr from scipy import sparse as sp +import torch +from torch import nn Test = namedtuple('Test', ['input', 'output']) @@ -50,18 +54,27 @@ def test_check_class_labels(self): for n_learners in range(1, 10): # 1. test fitted estimators labels = np.random.randint(10, size=n_labels) - different_labels = np.random.randint(10, 20, size=np.random.randint(1, 10)) - learner_list_1 = [mock.MockEstimator(classes_=labels) for _ in range(n_learners)] - learner_list_2 = [mock.MockEstimator(classes_=different_labels) for _ in range(np.random.randint(1, 5))] - shuffled_learners = random.sample(learner_list_1 + learner_list_2, len(learner_list_1 + learner_list_2)) - self.assertTrue(modAL.utils.validation.check_class_labels(*learner_list_1)) - self.assertFalse(modAL.utils.validation.check_class_labels(*shuffled_learners)) + different_labels = np.random.randint( + 10, 20, size=np.random.randint(1, 10)) + learner_list_1 = [mock.MockEstimator( + classes_=labels) for _ in range(n_learners)] + learner_list_2 = [mock.MockEstimator( + classes_=different_labels) for _ in range(np.random.randint(1, 5))] + shuffled_learners = random.sample( + learner_list_1 + learner_list_2, len(learner_list_1 + learner_list_2)) + self.assertTrue( + modAL.utils.validation.check_class_labels(*learner_list_1)) + self.assertFalse( + modAL.utils.validation.check_class_labels(*shuffled_learners)) # 2. 
test unfitted estimators - unfitted_learner_list = [mock.MockEstimator(classes_=labels) for _ in range(n_learners)] + unfitted_learner_list = [mock.MockEstimator( + classes_=labels) for _ in range(n_learners)] idx = np.random.randint(0, n_learners) - unfitted_learner_list.insert(idx, mock.MockEstimator(fitted=False)) - self.assertRaises(NotFittedError, modAL.utils.validation.check_class_labels, *unfitted_learner_list) + unfitted_learner_list.insert( + idx, mock.MockEstimator(fitted=False)) + self.assertRaises( + NotFittedError, modAL.utils.validation.check_class_labels, *unfitted_learner_list) def test_check_class_proba(self): for n_labels in range(2, 20): @@ -69,16 +82,19 @@ def test_check_class_proba(self): proba = np.random.rand(100, n_labels) class_labels = list(range(n_labels)) np.testing.assert_almost_equal( - modAL.utils.check_class_proba(proba, known_labels=class_labels, all_labels=class_labels), + modAL.utils.check_class_proba( + proba, known_labels=class_labels, all_labels=class_labels), proba ) for unknown_idx in range(n_labels): all_labels = list(range(n_labels)) known_labels = deepcopy(all_labels) known_labels.remove(unknown_idx) - aug_proba = np.insert(proba[:, known_labels], unknown_idx, np.zeros(len(proba)), axis=1) + aug_proba = np.insert( + proba[:, known_labels], unknown_idx, np.zeros(len(proba)), axis=1) np.testing.assert_almost_equal( - modAL.utils.check_class_proba(proba[:, known_labels], known_labels=known_labels, all_labels=all_labels), + modAL.utils.check_class_proba( + proba[:, known_labels], known_labels=known_labels, all_labels=all_labels), aug_proba ) @@ -91,7 +107,8 @@ def dummy_function(X_in): for n_features in range(1, 10): for n_functions in range(2, 10): functions = [dummy_function for _ in range(n_functions)] - linear_combination = modAL.utils.combination.make_linear_combination(*functions) + linear_combination = modAL.utils.combination.make_linear_combination( + *functions) X_in = np.random.rand(n_samples, n_features) if 
n_samples == 1: @@ -100,7 +117,8 @@ def dummy_function(X_in): true_result = n_functions*np.ones(shape=(n_samples, 1)) try: - np.testing.assert_almost_equal(linear_combination(X_in), true_result) + np.testing.assert_almost_equal( + linear_combination(X_in), true_result) except: linear_combination(X_in) @@ -119,7 +137,8 @@ def test_product(self): # linear combination with weights exponents = np.random.rand(n_functions) - exp_product = modAL.utils.combination.make_product(*functions, exponents=exponents) + exp_product = modAL.utils.combination.make_product( + *functions, exponents=exponents) np.testing.assert_almost_equal( exp_product(X_in), np.prod([X_in**exponent for exponent in exponents], axis=0) @@ -149,7 +168,8 @@ def test_make_query_strategy(self): def test_data_vstack(self): for n_samples, n_features in product(range(1, 10), range(1, 10)): # numpy arrays - a, b = np.random.rand(n_samples, n_features), np.random.rand(n_samples, n_features) + a, b = np.random.rand(n_samples, n_features), np.random.rand( + n_samples, n_features) np.testing.assert_almost_equal( modAL.utils.data.data_vstack((a, b)), np.concatenate((a, b)) @@ -157,23 +177,42 @@ def test_data_vstack(self): # sparse matrices for format in ['lil', 'csc', 'csr']: - a, b = sp.random(n_samples, n_features, format=format), sp.random(n_samples, n_features, format=format) - self.assertEqual((modAL.utils.data.data_vstack((a, b)) != sp.vstack((a, b))).sum(), 0) + a, b = sp.random(n_samples, n_features, format=format), sp.random( + n_samples, n_features, format=format) + self.assertEqual((modAL.utils.data.data_vstack( + (a, b)) != sp.vstack((a, b))).sum(), 0) + + # pytorch tensors + a, b = torch.randn(n_samples, n_features), torch.randn( + n_samples, n_features) + self.assertTrue( + torch.equal(modAL.utils.data.data_vstack((a, b)), torch.cat((a, b)))) + + # lists + a, b = np.random.rand(n_samples, n_features).tolist(), np.random.rand( + n_samples, n_features).tolist() + np.testing.assert_almost_equal( + 
modAL.utils.data.data_vstack((a, b)), + np.concatenate((a, b)) + ) # not supported formats self.assertRaises(TypeError, modAL.utils.data.data_vstack, (1, 1)) - # functions from modAL.utils.selection + # functions from modALu.tils.selection def test_multi_argmax(self): for n_pool in range(2, 100): for n_instances in range(1, n_pool+1): utility = np.zeros(n_pool) - max_idx = np.random.choice(range(n_pool), size=n_instances, replace=False) + max_idx = np.random.choice( + range(n_pool), size=n_instances, replace=False) utility[max_idx] = 1e-10 + np.random.rand(n_instances, ) np.testing.assert_equal( - np.sort(modAL.utils.selection.multi_argmax(utility, n_instances)), - np.sort(max_idx) + np.sort(modAL.utils.selection.multi_argmax( + utility, n_instances)), + (np.sort(max_idx), np.sort(utility) + [len(utility)-n_instances:]) ) def test_shuffled_argmax(self): @@ -181,9 +220,11 @@ def test_shuffled_argmax(self): for n_instances in range(1, n_pool+1): values = np.random.permutation(n_pool) true_query_idx = np.argsort(values)[len(values)-n_instances:] - + true_values = np.sort(values, axis=None)[ + len(values)-n_instances:] + np.testing.assert_equal( - true_query_idx, + (true_query_idx, true_values), modAL.utils.selection.shuffled_argmax(values, n_instances) ) @@ -191,11 +232,13 @@ def test_weighted_random(self): for n_pool in range(2, 100): for n_instances in range(1, n_pool): utility = np.ones(n_pool) - query_idx = modAL.utils.selection.weighted_random(utility, n_instances) + query_idx = modAL.utils.selection.weighted_random( + utility, n_instances) # testing for correct number of returned indices np.testing.assert_equal(len(query_idx), n_instances) # testing for uniqueness of each query index - np.testing.assert_equal(len(query_idx), len(np.unique(query_idx))) + np.testing.assert_equal( + len(query_idx), len(np.unique(query_idx))) class TestAcquisitionFunctions(unittest.TestCase): @@ -220,24 +263,29 @@ def test_optimizer_PI(self): # 1. 
fitted estimator mock_estimator = mock.MockEstimator(predict_return=(mean, std)) - optimizer = modAL.models.learners.BayesianOptimizer(estimator=mock_estimator) + optimizer = modAL.models.learners.BayesianOptimizer( + estimator=mock_estimator) optimizer._set_max([0], [max_val]) true_PI = ndtr((mean - max_val - tradeoff)/std) np.testing.assert_almost_equal( true_PI, - modAL.acquisition.optimizer_PI(optimizer, np.random.rand(n_samples, 2), tradeoff) + modAL.acquisition.optimizer_PI( + optimizer, np.random.rand(n_samples, 2), tradeoff) ) # 2. unfitted estimator mock_estimator = mock.MockEstimator(fitted=False) - optimizer = modAL.models.learners.BayesianOptimizer(estimator=mock_estimator) + optimizer = modAL.models.learners.BayesianOptimizer( + estimator=mock_estimator) optimizer._set_max([0], [max_val]) - true_PI = ndtr((np.zeros(shape=(len(mean), 1)) - max_val - tradeoff) / np.ones(shape=(len(mean), 1))) + true_PI = ndtr((np.zeros(shape=(len(mean), 1)) - + max_val - tradeoff) / np.ones(shape=(len(mean), 1))) np.testing.assert_almost_equal( true_PI, - modAL.acquisition.optimizer_PI(optimizer, np.random.rand(n_samples, 2), tradeoff) + modAL.acquisition.optimizer_PI( + optimizer, np.random.rand(n_samples, 2), tradeoff) ) def test_optimizer_EI(self): @@ -251,26 +299,31 @@ def test_optimizer_EI(self): mock_estimator = mock.MockEstimator( predict_return=(mean, std) ) - optimizer = modAL.models.learners.BayesianOptimizer(estimator=mock_estimator) + optimizer = modAL.models.learners.BayesianOptimizer( + estimator=mock_estimator) optimizer._set_max([0], [max_val]) true_EI = (mean - optimizer.y_max - tradeoff) * ndtr((mean - optimizer.y_max - tradeoff) / std) \ - + std * norm.pdf((mean - optimizer.y_max - tradeoff) / std) + + std * norm.pdf((mean - optimizer.y_max - tradeoff) / std) np.testing.assert_almost_equal( true_EI, - modAL.acquisition.optimizer_EI(optimizer, np.random.rand(n_samples, 2), tradeoff) + modAL.acquisition.optimizer_EI( + optimizer, 
np.random.rand(n_samples, 2), tradeoff) ) # 2. unfitted estimator mock_estimator = mock.MockEstimator(fitted=False) - optimizer = modAL.models.learners.BayesianOptimizer(estimator=mock_estimator) + optimizer = modAL.models.learners.BayesianOptimizer( + estimator=mock_estimator) optimizer._set_max([0], [max_val]) true_EI = (np.zeros(shape=(len(mean), 1)) - optimizer.y_max - tradeoff) * ndtr((np.zeros(shape=(len(mean), 1)) - optimizer.y_max - tradeoff) / np.ones(shape=(len(mean), 1))) \ - + np.ones(shape=(len(mean), 1)) * norm.pdf((np.zeros(shape=(len(mean), 1)) - optimizer.y_max - tradeoff) / np.ones(shape=(len(mean), 1))) + + np.ones(shape=(len(mean), 1)) * norm.pdf((np.zeros(shape=(len(mean), 1) + ) - optimizer.y_max - tradeoff) / np.ones(shape=(len(mean), 1))) np.testing.assert_almost_equal( true_EI, - modAL.acquisition.optimizer_EI(optimizer, np.random.rand(n_samples, 2), tradeoff) + modAL.acquisition.optimizer_EI( + optimizer, np.random.rand(n_samples, 2), tradeoff) ) def test_optimizer_UCB(self): @@ -283,22 +336,27 @@ def test_optimizer_UCB(self): mock_estimator = mock.MockEstimator( predict_return=(mean, std) ) - optimizer = modAL.models.learners.BayesianOptimizer(estimator=mock_estimator) + optimizer = modAL.models.learners.BayesianOptimizer( + estimator=mock_estimator) true_UCB = mean + beta*std np.testing.assert_almost_equal( true_UCB, - modAL.acquisition.optimizer_UCB(optimizer, np.random.rand(n_samples, 2), beta) + modAL.acquisition.optimizer_UCB( + optimizer, np.random.rand(n_samples, 2), beta) ) # 2. 
unfitted estimator mock_estimator = mock.MockEstimator(fitted=False) - optimizer = modAL.models.learners.BayesianOptimizer(estimator=mock_estimator) - true_UCB = np.zeros(shape=(len(mean), 1)) + beta * np.ones(shape=(len(mean), 1)) + optimizer = modAL.models.learners.BayesianOptimizer( + estimator=mock_estimator) + true_UCB = np.zeros(shape=(len(mean), 1)) + \ + beta * np.ones(shape=(len(mean), 1)) np.testing.assert_almost_equal( true_UCB, - modAL.acquisition.optimizer_UCB(optimizer, np.random.rand(n_samples, 2), beta) + modAL.acquisition.optimizer_UCB( + optimizer, np.random.rand(n_samples, 2), beta) ) def test_selection(self): @@ -313,12 +371,16 @@ def test_selection(self): predict_return=(mean, std) ) - optimizer = modAL.models.learners.BayesianOptimizer(estimator=mock_estimator) + optimizer = modAL.models.learners.BayesianOptimizer( + estimator=mock_estimator) optimizer._set_max([0], [max_val]) - modAL.acquisition.max_PI(optimizer, X, tradeoff=np.random.rand(), n_instances=n_instances) - modAL.acquisition.max_EI(optimizer, X, tradeoff=np.random.rand(), n_instances=n_instances) - modAL.acquisition.max_UCB(optimizer, X, beta=np.random.rand(), n_instances=n_instances) + modAL.acquisition.max_PI( + optimizer, X, tradeoff=np.random.rand(), n_instances=n_instances) + modAL.acquisition.max_EI( + optimizer, X, tradeoff=np.random.rand(), n_instances=n_instances) + modAL.acquisition.max_UCB( + optimizer, X, beta=np.random.rand(), n_instances=n_instances) class TestDensity(unittest.TestCase): @@ -349,15 +411,20 @@ def test_vote_entropy(self): for n_classes in range(1, 10): for true_query_idx in range(n_samples): # 1. 
fitted committee - vote_return = np.zeros(shape=(n_samples, n_classes), dtype=np.int16) - vote_return[true_query_idx] = np.asarray(range(n_classes), dtype=np.int16) - committee = mock.MockCommittee(classes_=np.asarray(range(n_classes)), vote_return=vote_return) + vote_return = np.zeros( + shape=(n_samples, n_classes), dtype=np.int16) + vote_return[true_query_idx] = np.asarray( + range(n_classes), dtype=np.int16) + committee = mock.MockCommittee(classes_=np.asarray( + range(n_classes)), vote_return=vote_return) vote_entr = modAL.disagreement.vote_entropy( committee, np.random.rand(n_samples, n_classes) ) true_entropy = np.zeros(shape=(n_samples, )) - true_entropy[true_query_idx] = entropy(np.ones(n_classes)/n_classes) - np.testing.assert_array_almost_equal(vote_entr, true_entropy) + true_entropy[true_query_idx] = entropy( + np.ones(n_classes)/n_classes) + np.testing.assert_array_almost_equal( + vote_entr, true_entropy) # 2. unfitted committee committee = mock.MockCommittee(fitted=False) @@ -380,8 +447,10 @@ def test_consensus_entropy(self): committee, np.random.rand(n_samples, n_classes) ) true_entropy = np.zeros(shape=(n_samples,)) - true_entropy[true_query_idx] = entropy(np.ones(n_classes) / n_classes) - np.testing.assert_array_almost_equal(consensus_entropy, true_entropy) + true_entropy[true_query_idx] = entropy( + np.ones(n_classes) / n_classes) + np.testing.assert_array_almost_equal( + consensus_entropy, true_entropy) # 2. unfitted committee committee = mock.MockCommittee(fitted=False) @@ -389,14 +458,16 @@ def test_consensus_entropy(self): consensus_entropy = modAL.disagreement.consensus_entropy( committee, np.random.rand(n_samples, n_classes) ) - np.testing.assert_almost_equal(consensus_entropy, true_entropy) + np.testing.assert_almost_equal( + consensus_entropy, true_entropy) def test_KL_max_disagreement(self): for n_samples in range(1, 10): for n_classes in range(2, 10): - for n_learners in range (2, 10): + for n_learners in range(2, 10): # 1. 
fitted committee - vote_proba = np.zeros(shape=(n_samples, n_learners, n_classes)) + vote_proba = np.zeros( + shape=(n_samples, n_learners, n_classes)) vote_proba[:, :, 0] = 1.0 committee = mock.MockCommittee( n_learners=n_learners, classes_=range(n_classes), @@ -408,10 +479,12 @@ def test_KL_max_disagreement(self): try: np.testing.assert_array_almost_equal( true_KL_disagreement, - modAL.disagreement.KL_max_disagreement(committee, np.random.rand(n_samples, 1)) + modAL.disagreement.KL_max_disagreement( + committee, np.random.rand(n_samples, 1)) ) except: - modAL.disagreement.KL_max_disagreement(committee, np.random.rand(n_samples, 1)) + modAL.disagreement.KL_max_disagreement( + committee, np.random.rand(n_samples, 1)) # 2. unfitted committee committee = mock.MockCommittee(fitted=False) @@ -419,20 +492,24 @@ def test_KL_max_disagreement(self): returned_KL_disagreement = modAL.disagreement.KL_max_disagreement( committee, np.random.rand(n_samples, n_classes) ) - np.testing.assert_almost_equal(returned_KL_disagreement, true_KL_disagreement) + np.testing.assert_almost_equal( + returned_KL_disagreement, true_KL_disagreement) def test_vote_entropy_sampling(self): for n_samples, n_features, n_classes in product(range(1, 10), range(1, 10), range(1, 10)): committee = mock.MockCommittee(classes_=np.asarray(range(n_classes)), vote_return=np.zeros(shape=(n_samples, n_classes), dtype=np.int16)) - modAL.disagreement.vote_entropy_sampling(committee, np.random.rand(n_samples, n_features)) + modAL.disagreement.vote_entropy_sampling( + committee, np.random.rand(n_samples, n_features)) modAL.disagreement.vote_entropy_sampling(committee, np.random.rand(n_samples, n_features), random_tie_break=True) def test_consensus_entropy_sampling(self): for n_samples, n_features, n_classes in product(range(1, 10), range(1, 10), range(1, 10)): - committee = mock.MockCommittee(predict_proba_return=np.random.rand(n_samples, n_classes)) - modAL.disagreement.consensus_entropy_sampling(committee, 
np.random.rand(n_samples, n_features)) + committee = mock.MockCommittee( + predict_proba_return=np.random.rand(n_samples, n_classes)) + modAL.disagreement.consensus_entropy_sampling( + committee, np.random.rand(n_samples, n_features)) modAL.disagreement.consensus_entropy_sampling(committee, np.random.rand(n_samples, n_features), random_tie_break=True) @@ -440,17 +517,21 @@ def test_max_disagreement_sampling(self): for n_samples, n_features, n_classes, n_learners in product(range(1, 10), range(1, 10), range(1, 10), range(2, 5)): committee = mock.MockCommittee( n_learners=n_learners, classes_=range(n_classes), - vote_proba_return=np.zeros(shape=(n_samples, n_learners, n_classes)) + vote_proba_return=np.zeros( + shape=(n_samples, n_learners, n_classes)) ) - modAL.disagreement.max_disagreement_sampling(committee, np.random.rand(n_samples, n_features)) + modAL.disagreement.max_disagreement_sampling( + committee, np.random.rand(n_samples, n_features)) modAL.disagreement.max_disagreement_sampling(committee, np.random.rand(n_samples, n_features), random_tie_break=True) def test_max_std_sampling(self): for n_samples, n_features in product(range(1, 10), range(1, 10)): regressor = GaussianProcessRegressor() - regressor.fit(np.random.rand(n_samples, n_features), np.random.rand(n_samples)) - modAL.disagreement.max_std_sampling(regressor, np.random.rand(n_samples, n_features)) + regressor.fit(np.random.rand(n_samples, n_features), + np.random.rand(n_samples)) + modAL.disagreement.max_std_sampling( + regressor, np.random.rand(n_samples, n_features)) modAL.disagreement.max_std_sampling(regressor, np.random.rand(n_samples, n_features), random_tie_break=True) @@ -458,8 +539,10 @@ def test_max_std_sampling(self): class TestEER(unittest.TestCase): def test_eer(self): for n_pool, n_features, n_classes in product(range(5, 10), range(1, 5), range(2, 5)): - X_training_, y_training = np.random.rand(10, n_features).tolist(), np.random.randint(0, n_classes, size=10) - X_pool_, y_pool = 
np.random.rand(n_pool, n_features).tolist(), np.random.randint(0, n_classes+1, size=n_pool) + X_training_, y_training = np.random.rand( + 10, n_features).tolist(), np.random.randint(0, n_classes, size=10) + X_pool_, y_pool = np.random.rand(n_pool, n_features).tolist( + ), np.random.randint(0, n_classes+1, size=n_pool) for data_type in (sp.csr_matrix, pd.DataFrame, np.array, list): X_training, X_pool = data_type(X_training_), data_type(X_pool_) @@ -468,10 +551,14 @@ def test_eer(self): X_training=X_training, y_training=y_training) modAL.expected_error.expected_error_reduction(learner, X_pool) - modAL.expected_error.expected_error_reduction(learner, X_pool, random_tie_break=True) - modAL.expected_error.expected_error_reduction(learner, X_pool, p_subsample=0.1) - modAL.expected_error.expected_error_reduction(learner, X_pool, loss='binary') - modAL.expected_error.expected_error_reduction(learner, X_pool, p_subsample=0.1, loss='log') + modAL.expected_error.expected_error_reduction( + learner, X_pool, random_tie_break=True) + modAL.expected_error.expected_error_reduction( + learner, X_pool, p_subsample=0.1) + modAL.expected_error.expected_error_reduction( + learner, X_pool, loss='binary') + modAL.expected_error.expected_error_reduction( + learner, X_pool, p_subsample=0.1, loss='log') self.assertRaises(AssertionError, modAL.expected_error.expected_error_reduction, learner, X_pool, p_subsample=1.5) self.assertRaises(AssertionError, modAL.expected_error.expected_error_reduction, @@ -491,24 +578,27 @@ def test_classifier_uncertainty(self): ) # fitted estimator - fitted_estimator = mock.MockEstimator(predict_proba_return=case.input) + fitted_estimator = mock.MockEstimator( + predict_proba_return=case.input) np.testing.assert_almost_equal( - modAL.uncertainty.classifier_uncertainty(fitted_estimator, np.random.rand(10)), + modAL.uncertainty.classifier_uncertainty( + fitted_estimator, np.random.rand(10)), case.output ) # not fitted estimator not_fitted_estimator = 
mock.MockEstimator(fitted=False) np.testing.assert_almost_equal( - modAL.uncertainty.classifier_uncertainty(not_fitted_estimator, case.input), + modAL.uncertainty.classifier_uncertainty( + not_fitted_estimator, case.input), np.ones(shape=(len(case.output))) ) def test_classifier_margin(self): test_cases_1 = (Test(p * np.ones(shape=(k, l)), np.zeros(shape=(k,))) - for k in range(1, 100) for l in range(1, 10) for p in np.linspace(0, 1, 11)) + for k in range(1, 100) for l in range(1, 10) for p in np.linspace(0, 1, 11)) test_cases_2 = (Test(p * np.tile(np.asarray(range(k))+1.0, l).reshape(l, k), - p * np.ones(shape=(l, ))*int(k!=1)) + p * np.ones(shape=(l, ))*int(k != 1)) for k in range(1, 10) for l in range(1, 100) for p in np.linspace(0, 1, 11)) for case in chain(test_cases_1, test_cases_2): # _proba_margin @@ -518,16 +608,19 @@ def test_classifier_margin(self): ) # fitted estimator - fitted_estimator = mock.MockEstimator(predict_proba_return=case.input) + fitted_estimator = mock.MockEstimator( + predict_proba_return=case.input) np.testing.assert_almost_equal( - modAL.uncertainty.classifier_margin(fitted_estimator, np.random.rand(10)), + modAL.uncertainty.classifier_margin( + fitted_estimator, np.random.rand(10)), case.output ) # not fitted estimator not_fitted_estimator = mock.MockEstimator(fitted=False) np.testing.assert_almost_equal( - modAL.uncertainty.classifier_margin(not_fitted_estimator, case.input), + modAL.uncertainty.classifier_margin( + not_fitted_estimator, case.input), np.zeros(shape=(len(case.output))) ) @@ -545,16 +638,19 @@ def test_classifier_entropy(self): ) # fitted estimator - fitted_estimator = mock.MockEstimator(predict_proba_return=proba) + fitted_estimator = mock.MockEstimator( + predict_proba_return=proba) np.testing.assert_equal( - modAL.uncertainty.classifier_entropy(fitted_estimator, np.random.rand(n_samples, 1)), + modAL.uncertainty.classifier_entropy( + fitted_estimator, np.random.rand(n_samples, 1)), np.zeros(shape=(n_samples, )) ) # 
not fitted estimator not_fitted_estimator = mock.MockEstimator(fitted=False) np.testing.assert_almost_equal( - modAL.uncertainty.classifier_entropy(not_fitted_estimator, np.random.rand(n_samples, 1)), + modAL.uncertainty.classifier_entropy( + not_fitted_estimator, np.random.rand(n_samples, 1)), np.zeros(shape=(n_samples, )) ) @@ -565,15 +661,18 @@ def test_uncertainty_sampling(self): for true_query_idx in range(n_samples): predict_proba = np.random.rand(n_samples, n_classes) predict_proba[true_query_idx] = max_proba - classifier = mock.MockEstimator(predict_proba_return=predict_proba) - query_idx = modAL.uncertainty.uncertainty_sampling( + classifier = mock.MockEstimator( + predict_proba_return=predict_proba) + query_idx, query_metric = modAL.uncertainty.uncertainty_sampling( classifier, np.random.rand(n_samples, n_classes) ) - shuffled_query_idx = modAL.uncertainty.uncertainty_sampling( + shuffled_query_idx, shuffled_query_metric = modAL.uncertainty.uncertainty_sampling( classifier, np.random.rand(n_samples, n_classes), random_tie_break=True ) np.testing.assert_array_equal(query_idx, true_query_idx) + np.testing.assert_array_equal( + shuffled_query_idx, true_query_idx) def test_margin_sampling(self): for n_samples in range(1, 10): @@ -582,15 +681,19 @@ def test_margin_sampling(self): predict_proba = np.zeros(shape=(n_samples, n_classes)) predict_proba[:, 0] = 1.0 predict_proba[true_query_idx, 0] = 0.0 - classifier = mock.MockEstimator(predict_proba_return=predict_proba) - query_idx = modAL.uncertainty.margin_sampling( + classifier = mock.MockEstimator( + predict_proba_return=predict_proba) + + query_idx, query_metric = modAL.uncertainty.margin_sampling( classifier, np.random.rand(n_samples, n_classes) ) - shuffled_query_idx = modAL.uncertainty.margin_sampling( + shuffled_query_idx, shuffled_query_metric = modAL.uncertainty.margin_sampling( classifier, np.random.rand(n_samples, n_classes), random_tie_break=True ) np.testing.assert_array_equal(query_idx, 
true_query_idx) + np.testing.assert_array_equal( + shuffled_query_idx, true_query_idx) def test_entropy_sampling(self): for n_samples in range(1, 10): @@ -600,15 +703,72 @@ def test_entropy_sampling(self): predict_proba = np.zeros(shape=(n_samples, n_classes)) predict_proba[:, 0] = 1.0 predict_proba[true_query_idx] = max_proba - classifier = mock.MockEstimator(predict_proba_return=predict_proba) - query_idx = modAL.uncertainty.entropy_sampling( + classifier = mock.MockEstimator( + predict_proba_return=predict_proba) + + query_idx, query_metric = modAL.uncertainty.entropy_sampling( classifier, np.random.rand(n_samples, n_classes) ) - shuffled_query_idx = modAL.uncertainty.entropy_sampling( + shuffled_query_idx, shuffled_query_metric = modAL.uncertainty.entropy_sampling( classifier, np.random.rand(n_samples, n_classes), random_tie_break=True ) np.testing.assert_array_equal(query_idx, true_query_idx) + np.testing.assert_array_equal( + shuffled_query_idx, true_query_idx) + + +class TestDropout(unittest.TestCase): + def test_mc_dropout_bald(self): pass + def test_mc_dropout_mean_st(self): pass + def test_mc_dropout_max_entropy(self): pass + def test_mc_dropout_max_variationRatios(self): pass + def test_get_predictions(self): pass + def test_set_dropout_mode(self): pass + + +class TestDeepActiveLearner(unittest.TestCase): + """ + Tests for the base class methods of the BaseLearner (base.py) are provided in + the TestActiveLearner. 
+ """ + + def setUp(self): + self.mock_deep_estimator = mock.MockEstimator() + # Add methods that can not be autospecced (because of the wrapper) + self.mock_deep_estimator.initialize = MagicMock(name='initialize') + self.mock_deep_estimator.partial_fit = MagicMock(name='partial_fit') + + def test_teach(self): + + for bootstrap, warm_start in product([True, False], [True, False]): + for n_samples in range(1, 10): + X = torch.randn(n_samples, 1) + y = torch.randn(n_samples) + + learner = modAL.models.learners.DeepActiveLearner( + estimator=self.mock_deep_estimator + ) + + learner.teach(X, y, bootstrap=bootstrap, warm_start=warm_start) + + def test_batch_size(self): + learner = modAL.models.learners.DeepActiveLearner( + estimator=self.mock_deep_estimator + ) + + for batch_size in range(1, 50): + learner.batch_size = batch_size + self.assertEqual(batch_size, learner.batch_size) + + def test_num_epochs(self): + learner = modAL.models.learners.DeepActiveLearner( + estimator=self.mock_deep_estimator + ) + + for num_epochs in range(1, 50): + learner.num_epochs = num_epochs + self.assertEqual(num_epochs, learner.num_epochs) class TestActiveLearner(unittest.TestCase): @@ -637,8 +797,10 @@ def test_add_training_data(self): np.concatenate((y_initial, y_new)) ) # 2. vector class labels - y_initial = np.random.randint(0, 2, size=(n_samples, n_features+1)) - y_new = np.random.randint(0, 2, size=(n_new_samples, n_features+1)) + y_initial = np.random.randint( + 0, 2, size=(n_samples, n_features+1)) + y_new = np.random.randint( + 0, 2, size=(n_new_samples, n_features+1)) learner = modAL.models.learners.ActiveLearner( estimator=mock.MockEstimator(), X_training=X_initial, y_training=y_initial @@ -659,24 +821,25 @@ def test_add_training_data(self): y_new = np.random.randint(0, 2, size=(n_new_samples,)) learner._add_training_data(X_new, y_new) - - # testing for invalid cases # 1. 
len(X_new) != len(y_new) X_new = np.random.rand(n_new_samples, n_features) y_new = np.random.randint(0, 2, size=(2*n_new_samples,)) - self.assertRaises(ValueError, learner._add_training_data, X_new, y_new) + self.assertRaises( + ValueError, learner._add_training_data, X_new, y_new) # 2. X_new has wrong dimensions X_new = np.random.rand(n_new_samples, 2*n_features) y_new = np.random.randint(0, 2, size=(n_new_samples,)) - self.assertRaises(ValueError, learner._add_training_data, X_new, y_new) + self.assertRaises( + ValueError, learner._add_training_data, X_new, y_new) def test_predict(self): for n_samples in range(1, 100): for n_features in range(1, 10): X = np.random.rand(n_samples, n_features) predict_return = np.random.randint(0, 2, size=(n_samples, )) - mock_classifier = mock.MockEstimator(predict_return=predict_return) + mock_classifier = mock.MockEstimator( + predict_return=predict_return) learner = modAL.models.learners.ActiveLearner( estimator=mock_classifier ) @@ -689,8 +852,10 @@ def test_predict_proba(self): for n_samples in range(1, 100): for n_features in range(1, 10): X = np.random.rand(n_samples, n_features) - predict_proba_return = np.random.randint(0, 2, size=(n_samples,)) - mock_classifier = mock.MockEstimator(predict_proba_return=predict_proba_return) + predict_proba_return = np.random.randint( + 0, 2, size=(n_samples,)) + mock_classifier = mock.MockEstimator( + predict_proba_return=predict_proba_return) learner = modAL.models.learners.ActiveLearner( estimator=mock_classifier ) @@ -704,7 +869,9 @@ def test_query(self): for n_features in range(1, 10): X = np.random.rand(n_samples, n_features) query_idx = np.random.randint(0, n_samples) - mock_query = mock.MockFunction(return_val=query_idx) + query_metrics = np.random.randint(0, n_samples) + mock_query = mock.MockFunction( + return_val=(query_idx, query_metrics)) learner = modAL.models.learners.ActiveLearner( estimator=None, query_strategy=mock_query @@ -713,12 +880,17 @@ def test_query(self): 
learner.query(X), (query_idx, X[query_idx]) ) + np.testing.assert_equal( + learner.query(X, return_metrics=True), + (query_idx, X[query_idx], query_metrics) + ) def test_score(self): test_cases = (np.random.rand() for _ in range(10)) for score_return in test_cases: mock_classifier = mock.MockEstimator(score_return=score_return) - learner = modAL.models.learners.ActiveLearner(mock_classifier, mock.MockFunction(None)) + learner = modAL.models.learners.ActiveLearner( + mock_classifier, mock.MockFunction(None)) np.testing.assert_almost_equal( learner.score(np.random.rand(5, 2), np.random.rand(5, )), score_return @@ -768,7 +940,8 @@ def test_sklearn(self): X_training=np.random.rand(10, 10), y_training=np.random.randint(0, 2, size=(10,)) ) - learner.fit(np.random.rand(10, 10), np.random.randint(0, 2, size=(10,))) + learner.fit(np.random.rand(10, 10), + np.random.randint(0, 2, size=(10,))) pred = learner.predict(np.random.rand(10, 10)) learner.predict_proba(np.random.rand(10, 10)) confusion_matrix(pred, np.random.randint(0, 2, size=(10,))) @@ -786,7 +959,8 @@ def test_sparse_matrices(self): for query_strategy, format, n_samples, n_features in product(query_strategies, formats, sample_count, feature_count): X_pool = sp.random(n_samples, n_features, format=format) y_pool = np.random.randint(0, 2, size=(n_samples, )) - initial_idx = np.random.choice(range(n_samples), size=5, replace=False) + initial_idx = np.random.choice( + range(n_samples), size=5, replace=False) learner = modAL.models.learners.ActiveLearner( estimator=RandomForestClassifier(n_estimators=10), query_strategy=query_strategy, @@ -856,7 +1030,8 @@ def test_on_transformed_with_variable_transformation(self): ) for _ in range(len(X_pool)): - query_idx, query_instance = learner.query(X_pool, n_instances=1) + query_idx, query_instance = learner.query( + X_pool, n_instances=1) i = query_idx[0] learner.teach( @@ -880,7 +1055,6 @@ def test_old_query_strategy_interface(self): def custom_query_strategy(classifier, X): 
return query_idx_, query_instance_ - train_idx = np.random.choice(range(n_samples), size=2, replace=False) custom_query_learner = modAL.models.learners.ActiveLearner( estimator=RandomForestClassifier(n_estimators=10), @@ -925,7 +1099,8 @@ def test_set_new_max(self): y = np.random.rand(n_samples) max_idx = np.argmax(y) regressor = mock.MockEstimator() - learner = modAL.models.learners.BayesianOptimizer(estimator=regressor) + learner = modAL.models.learners.BayesianOptimizer( + estimator=regressor) learner._set_max(X, y) np.testing.assert_equal(learner.X_max, X[max_idx]) np.testing.assert_equal(learner.y_max, y[max_idx]) @@ -975,7 +1150,8 @@ def test_get_max(self): y[max_idx] = 10 regressor = mock.MockEstimator() - optimizer = modAL.models.learners.BayesianOptimizer(regressor, X_training=X, y_training=y) + optimizer = modAL.models.learners.BayesianOptimizer( + regressor, X_training=X, y_training=y) X_max, y_max = optimizer.get_max() np.testing.assert_equal(X_max, X[max_idx]) np.testing.assert_equal(y_max, y[max_idx]) @@ -986,7 +1162,8 @@ def test_teach(self): for n_samples in range(1, 100): for n_features in range(1, 100): regressor = mock.MockEstimator() - learner = modAL.models.learners.BayesianOptimizer(estimator=regressor) + learner = modAL.models.learners.BayesianOptimizer( + estimator=regressor) X = np.random.rand(n_samples, 2) y = np.random.rand(n_samples) @@ -1046,7 +1223,8 @@ def test_set_classes(self): for n_learners in range(1, 10): learner_list = [modAL.models.learners.ActiveLearner(estimator=mock.MockEstimator(fitted=False)) for idx in range(n_learners)] - committee = modAL.models.learners.Committee(learner_list=learner_list) + committee = modAL.models.learners.Committee( + learner_list=learner_list) self.assertEqual(committee.classes_, None) self.assertEqual(committee.n_classes_, 0) @@ -1054,7 +1232,8 @@ def test_set_classes(self): for n_classes in range(1, 10): learner_list = 
[modAL.models.learners.ActiveLearner(estimator=mock.MockEstimator(classes_=np.asarray([idx]))) for idx in range(n_classes)] - committee = modAL.models.learners.Committee(learner_list=learner_list) + committee = modAL.models.learners.Committee( + learner_list=learner_list) np.testing.assert_equal( committee.classes_, np.unique(range(n_classes)) @@ -1063,13 +1242,14 @@ def test_set_classes(self): def test_predict(self): for n_learners in range(1, 10): for n_instances in range(1, 10): - prediction = np.random.randint(10, size=(n_instances, n_learners)) + prediction = np.random.randint( + 10, size=(n_instances, n_learners)) committee = modAL.models.learners.Committee( learner_list=[mock.MockActiveLearner( - mock.MockEstimator(classes_=np.asarray([0])), - predict_return=prediction[:, learner_idx] - ) - for learner_idx in range(n_learners)] + mock.MockEstimator(classes_=np.asarray([0])), + predict_return=prediction[:, learner_idx] + ) + for learner_idx in range(n_learners)] ) np.testing.assert_equal( committee.vote(np.random.rand(n_instances, 5)), @@ -1080,13 +1260,17 @@ def test_predict_proba(self): for n_samples in range(1, 100): for n_learners in range(1, 10): for n_classes in range(1, 10): - vote_proba_output = np.random.rand(n_samples, n_learners, n_classes) + vote_proba_output = np.random.rand( + n_samples, n_learners, n_classes) # assembling the mock learners learner_list = [mock.MockActiveLearner( - predict_proba_return=vote_proba_output[:, learner_idx, :], - predictor=mock.MockEstimator(classes_=list(range(n_classes))) + predict_proba_return=vote_proba_output[:, + learner_idx, :], + predictor=mock.MockEstimator( + classes_=list(range(n_classes))) ) for learner_idx in range(n_learners)] - committee = modAL.models.learners.Committee(learner_list=learner_list) + committee = modAL.models.learners.Committee( + learner_list=learner_list) np.testing.assert_almost_equal( committee.predict_proba(np.random.rand(n_samples, 1)), np.mean(vote_proba_output, axis=1) @@ 
-1095,14 +1279,16 @@ def test_predict_proba(self): def test_vote(self): for n_members in range(1, 10): for n_instances in range(1, 100): - vote_output = np.random.randint(0, 2, size=(n_instances, n_members)) + vote_output = np.random.randint( + 0, 2, size=(n_instances, n_members)) # assembling the Committee learner_list = [mock.MockActiveLearner( - predict_return=vote_output[:, member_idx], - predictor=mock.MockEstimator(classes_=[0]) - ) - for member_idx in range(n_members)] - committee = modAL.models.learners.Committee(learner_list=learner_list) + predict_return=vote_output[:, member_idx], + predictor=mock.MockEstimator(classes_=[0]) + ) + for member_idx in range(n_members)] + committee = modAL.models.learners.Committee( + learner_list=learner_list) np.testing.assert_array_almost_equal( committee.vote(np.random.rand(n_instances).reshape(-1, 1)), vote_output @@ -1112,13 +1298,17 @@ def test_vote_proba(self): for n_samples in range(1, 100): for n_learners in range(1, 10): for n_classes in range(1, 10): - vote_proba_output = np.random.rand(n_samples, n_learners, n_classes) + vote_proba_output = np.random.rand( + n_samples, n_learners, n_classes) # assembling the mock learners learner_list = [mock.MockActiveLearner( - predict_proba_return=vote_proba_output[:, learner_idx, :], - predictor=mock.MockEstimator(classes_=list(range(n_classes))) + predict_proba_return=vote_proba_output[:, + learner_idx, :], + predictor=mock.MockEstimator( + classes_=list(range(n_classes))) ) for learner_idx in range(n_learners)] - committee = modAL.models.learners.Committee(learner_list=learner_list) + committee = modAL.models.learners.Committee( + learner_list=learner_list) np.testing.assert_almost_equal( committee.vote_proba(np.random.rand(n_samples, 1)), vote_proba_output @@ -1171,8 +1361,10 @@ def test_on_transformed(self): ), # committee learners can contain different amounts of # different instances - X_training=X_pool.iloc[train_idx[(np.arange(i + 1) + i) % len(train_idx)]], - 
y_training=y_pool[train_idx[(np.arange(i + 1) + i) % len(train_idx)]], + X_training=X_pool.iloc[train_idx[( + np.arange(i + 1) + i) % len(train_idx)]], + y_training=y_pool[train_idx[( + np.arange(i + 1) + i) % len(train_idx)]], ) for i in range(3)] for query_strategy in query_strategies: @@ -1194,13 +1386,16 @@ def test_predict(self): # assembling the Committee learner_list = [mock.MockActiveLearner(predict_return=vote[:, member_idx]) for member_idx in range(n_members)] - committee = modAL.models.learners.CommitteeRegressor(learner_list=learner_list) + committee = modAL.models.learners.CommitteeRegressor( + learner_list=learner_list) np.testing.assert_array_almost_equal( - committee.predict(np.random.rand(n_instances).reshape(-1, 1), return_std=False), + committee.predict(np.random.rand( + n_instances).reshape(-1, 1), return_std=False), np.mean(vote, axis=1) ) np.testing.assert_array_almost_equal( - committee.predict(np.random.rand(n_instances).reshape(-1, 1), return_std=True), + committee.predict(np.random.rand( + n_instances).reshape(-1, 1), return_std=True), (np.mean(vote, axis=1), np.std(vote, axis=1)) ) @@ -1211,7 +1406,8 @@ def test_vote(self): # assembling the Committee learner_list = [mock.MockActiveLearner(predict_return=vote_output[:, member_idx]) for member_idx in range(n_members)] - committee = modAL.models.learners.CommitteeRegressor(learner_list=learner_list) + committee = modAL.models.learners.CommitteeRegressor( + learner_list=learner_list) np.testing.assert_array_almost_equal( committee.vote(np.random.rand(n_instances).reshape(-1, 1)), vote_output @@ -1243,8 +1439,10 @@ def test_on_transformed(self): ), # committee learners can contain different amounts of # different instances - X_training=X_pool.iloc[train_idx[(np.arange(i + 1) + i) % len(train_idx)]], - y_training=y_pool[train_idx[(np.arange(i + 1) + i) % len(train_idx)]], + X_training=X_pool.iloc[train_idx[( + np.arange(i + 1) + i) % len(train_idx)]], + y_training=y_pool[train_idx[( + 
np.arange(i + 1) + i) % len(train_idx)]], ) for i in range(3)] for query_strategy in query_strategies: @@ -1262,10 +1460,12 @@ def test_SVM_loss(self): for n_classes in range(2, 10): for n_instances in range(1, 10): X_training = np.random.rand(n_instances, 5) - y_training = np.random.randint(0, 2, size=(n_instances, n_classes)) + y_training = np.random.randint( + 0, 2, size=(n_instances, n_classes)) X_pool = np.random.rand(n_instances, 5) y_pool = np.random.randint(0, 2, size=(n_instances, n_classes)) - classifier = OneVsRestClassifier(SVC(probability=True, gamma='auto')) + classifier = OneVsRestClassifier( + SVC(probability=True, gamma='auto')) classifier.fit(X_training, y_training) avg_loss = modAL.multilabel._SVM_loss(classifier, X_pool) mcc_loss = modAL.multilabel._SVM_loss(classifier, X_pool, @@ -1278,28 +1478,43 @@ def test_strategies(self): for n_pool_instances in range(1, 10): for n_query_instances in range(1, min(n_pool_instances, 3)): X_training = np.random.rand(n_pool_instances, 5) - y_training = np.random.randint(0, 2, size=(n_pool_instances, n_classes)) + y_training = np.random.randint( + 0, 2, size=(n_pool_instances, n_classes)) X_pool = np.random.rand(n_pool_instances, 5) - classifier = OneVsRestClassifier(SVC(probability=True, gamma='auto')) + classifier = OneVsRestClassifier( + SVC(probability=True, gamma='auto')) classifier.fit(X_training, y_training) active_learner = modAL.models.ActiveLearner(classifier) # no random tie break modAL.multilabel.SVM_binary_minimum(active_learner, X_pool) - modAL.multilabel.mean_max_loss(classifier, X_pool, n_query_instances) - modAL.multilabel.max_loss(classifier, X_pool, n_query_instances) - modAL.multilabel.min_confidence(classifier, X_pool, n_query_instances) - modAL.multilabel.avg_confidence(classifier, X_pool, n_query_instances) - modAL.multilabel.max_score(classifier, X_pool, n_query_instances) - modAL.multilabel.avg_score(classifier, X_pool, n_query_instances) + modAL.multilabel.mean_max_loss( + classifier, 
X_pool, n_query_instances) + modAL.multilabel.max_loss( + classifier, X_pool, n_query_instances) + modAL.multilabel.min_confidence( + classifier, X_pool, n_query_instances) + modAL.multilabel.avg_confidence( + classifier, X_pool, n_query_instances) + modAL.multilabel.max_score( + classifier, X_pool, n_query_instances) + modAL.multilabel.avg_score( + classifier, X_pool, n_query_instances) # random tie break - modAL.multilabel.SVM_binary_minimum(active_learner, X_pool, random_tie_break=True) - modAL.multilabel.mean_max_loss(classifier, X_pool, n_query_instances, random_tie_break=True) - modAL.multilabel.max_loss(classifier, X_pool, n_query_instances, random_tie_break=True) - modAL.multilabel.min_confidence(classifier, X_pool, n_query_instances, random_tie_break=True) - modAL.multilabel.avg_confidence(classifier, X_pool, n_query_instances, random_tie_break=True) - modAL.multilabel.max_score(classifier, X_pool, n_query_instances, random_tie_break=True) - modAL.multilabel.avg_score(classifier, X_pool, n_query_instances, random_tie_break=True) + modAL.multilabel.SVM_binary_minimum( + active_learner, X_pool, random_tie_break=True) + modAL.multilabel.mean_max_loss( + classifier, X_pool, n_query_instances, random_tie_break=True) + modAL.multilabel.max_loss( + classifier, X_pool, n_query_instances, random_tie_break=True) + modAL.multilabel.min_confidence( + classifier, X_pool, n_query_instances, random_tie_break=True) + modAL.multilabel.avg_confidence( + classifier, X_pool, n_query_instances, random_tie_break=True) + modAL.multilabel.max_score( + classifier, X_pool, n_query_instances, random_tie_break=True) + modAL.multilabel.avg_score( + classifier, X_pool, n_query_instances, random_tie_break=True) class TestExamples(unittest.TestCase): @@ -1320,5 +1535,17 @@ def test_examples(self): import example_tests.ranked_batch_mode +# Empty PyTorch model for test cases +class Torch_Model(nn.Module): + def __init__(self,): + super(Torch_Model, self).__init__() + self.convs = 
nn.Sequential( + nn.Conv2d(1, 5, 3), + ) + + def forward(self, x): + return x + + if __name__ == '__main__': unittest.main(verbosity=2) From 0f978be47fc9be574caa61f8485a7fccc19e5c98 Mon Sep 17 00:00:00 2001 From: Max Keller Date: Mon, 23 Aug 2021 11:00:06 +0200 Subject: [PATCH 164/182] Add tests for dropout query strategies --- modAL/dropout.py | 5 +- tests/core_tests.py | 181 +++++++++++++++++++++++++++++++++++++++----- 2 files changed, 166 insertions(+), 20 deletions(-) diff --git a/modAL/dropout.py b/modAL/dropout.py index 6533c1d..8cf8905 100644 --- a/modAL/dropout.py +++ b/modAL/dropout.py @@ -251,7 +251,7 @@ def mc_dropout_max_variationRatios(classifier: BaseEstimator, X: modALinput, n_i return shuffled_argmax(variationRatios, n_instances=n_instances) -def get_predictions(classifier: BaseEstimator, X: modALinput, dropout_layer_indexes: list, +def get_predictions(classifier: BaseEstimator, X: modALinput, dropout_layer_indexes: list = [], num_predictions: int = 50, sample_per_forward_pass: int = 1000, logits_adaptor: Callable[[torch.tensor, modALinput], torch.tensor] = default_logits_adaptor): """ @@ -273,6 +273,9 @@ def get_predictions(classifier: BaseEstimator, X: modALinput, dropout_layer_inde prediction: list with all predictions """ + assert num_predictions > 0, 'num_predictions must be larger than zero' + assert sample_per_forward_pass > 0, 'sample_per_forward_pass must be larger than zero' + predictions = [] # set dropout layers to train mode set_dropout_mode(classifier.estimator.module_, diff --git a/tests/core_tests.py b/tests/core_tests.py index 4beb0fc..86db55e 100644 --- a/tests/core_tests.py +++ b/tests/core_tests.py @@ -18,6 +18,7 @@ import modAL.expected_error import modAL.multilabel import modAL.uncertainty +import modAL.dropout from copy import deepcopy from itertools import chain, product @@ -39,6 +40,8 @@ import torch from torch import nn +from skorch import NeuralNetClassifier + Test = namedtuple('Test', ['input', 'output']) @@ -718,18 
+721,170 @@ def test_entropy_sampling(self): shuffled_query_idx, true_query_idx) +# PyTorch model for test cases --> Do not change the layers +class Torch_Model(nn.Module): + def __init__(self,): + super(Torch_Model, self).__init__() + self.convs = nn.Sequential( + nn.Conv2d(1, 32, 3), + nn.ReLU(), + nn.Conv2d(32, 64, 3), + nn.ReLU(), + nn.MaxPool2d(2), + nn.Dropout(0.25) + ) + self.fcs = nn.Sequential( + nn.Linear(12*12*64, 128), + nn.ReLU(), + nn.Dropout(0.5), + nn.Linear(128, 10), + ) + + def forward(self, x): + return x + + class TestDropout(unittest.TestCase): - def test_mc_dropout_bald(self): pass - def test_mc_dropout_mean_st(self): pass - def test_mc_dropout_max_entropy(self): pass - def test_mc_dropout_max_variationRatios(self): pass - def test_get_predictions(self): pass - def test_set_dropout_mode(self): pass + def setUp(self): + self.skorch_classifier = NeuralNetClassifier(Torch_Model, + criterion=torch.nn.CrossEntropyLoss, + optimizer=torch.optim.Adam, + train_split=None, + verbose=1) + + def test_mc_dropout_bald(self): + learner = modAL.models.learners.DeepActiveLearner( + estimator=self.skorch_classifier, + query_strategy=modAL.dropout.mc_dropout_bald, + ) + for random_tie_break in [True, False]: + for num_cycles, sample_per_forward_pass in product(range(1, 5), range(1, 5)): + for n_samples, n_classes in product(range(1, 5), range(1, 5)): + for n_instances in range(1, n_samples): + X_pool = torch.randn(n_samples, n_classes) + modAL.dropout.mc_dropout_bald(learner, X_pool, n_instances, random_tie_break, [], + num_cycles, sample_per_forward_pass) + + def test_mc_dropout_mean_st(self): + learner = modAL.models.learners.DeepActiveLearner( + estimator=self.skorch_classifier, + query_strategy=modAL.dropout.mc_dropout_mean_st, + ) + for random_tie_break in [True, False]: + for num_cycles, sample_per_forward_pass in product(range(1, 5), range(1, 5)): + for n_samples, n_classes in product(range(1, 5), range(1, 5)): + for n_instances in range(1, n_samples): + 
X_pool = torch.randn(n_samples, n_classes) + modAL.dropout.mc_dropout_mean_st(learner, X_pool, n_instances, random_tie_break, [], + num_cycles, sample_per_forward_pass) + + def test_mc_dropout_max_entropy(self): + learner = modAL.models.learners.DeepActiveLearner( + estimator=self.skorch_classifier, + query_strategy=modAL.dropout.mc_dropout_max_entropy, + ) + for random_tie_break in [True, False]: + for num_cycles, sample_per_forward_pass in product(range(1, 5), range(1, 5)): + for n_samples, n_classes in product(range(1, 5), range(1, 5)): + for n_instances in range(1, n_samples): + X_pool = torch.randn(n_samples, n_classes) + modAL.dropout.mc_dropout_max_entropy(learner, X_pool, n_instances, random_tie_break, [], + num_cycles, sample_per_forward_pass) + + def test_mc_dropout_max_variationRatios(self): + learner = modAL.models.learners.DeepActiveLearner( + estimator=self.skorch_classifier, + query_strategy=modAL.dropout.mc_dropout_max_variationRatios, + ) + for random_tie_break in [True, False]: + for num_cycles, sample_per_forward_pass in product(range(1, 5), range(1, 5)): + for n_samples, n_classes in product(range(1, 5), range(1, 5)): + for n_instances in range(1, n_samples): + X_pool = torch.randn(n_samples, n_classes) + modAL.dropout.mc_dropout_max_variationRatios(learner, X_pool, n_instances, random_tie_break, [], + num_cycles, sample_per_forward_pass) + + def test_get_predictions(self): + X = torch.randn(100, 1) + + learner = modAL.models.learners.DeepActiveLearner( + estimator=self.skorch_classifier, + query_strategy=mock.MockFunction(return_val=None), + ) + + # num predictions tests + for num_predictions in range(1, 20): + for samples_per_forward_pass in range(1, 10): + + predictions = modAL.dropout.get_predictions( + learner, X, dropout_layer_indexes=[], + num_predictions=num_predictions, + sample_per_forward_pass=samples_per_forward_pass) + + self.assertEqual(len(predictions), num_predictions) + + self.assertRaises(AssertionError, 
modAL.dropout.get_predictions, + learner, X, dropout_layer_indexes=[], + num_predictions=-1, + sample_per_forward_pass=0) + + self.assertRaises(AssertionError, modAL.dropout.get_predictions, + learner, X, dropout_layer_indexes=[], + num_predictions=10, + sample_per_forward_pass=-5) + + # logits adapter function test + for samples, classes, subclasses in product(range(1, 10), range(1, 10), range(1, 10)): + input_shape = (samples, classes, subclasses) + desired_shape = (input_shape[0], np.prod(input_shape[1:])) + X_adaption_needed = torch.randn(input_shape) + + def logits_adaptor(input_tensor, data): return torch.flatten( + input_tensor, start_dim=1) + + predictions = modAL.dropout.get_predictions( + learner, X_adaption_needed, dropout_layer_indexes=[], + num_predictions=num_predictions, + sample_per_forward_pass=samples_per_forward_pass, + logits_adaptor=logits_adaptor) + + self.assertEqual(predictions[0].shape, desired_shape) + + def test_set_dropout_mode(self): + # set dropmout mode for all dropout layers + for train_mode in [True, False]: + model = Torch_Model() + modules = list(model.modules()) + + for module in modules: + self.assertEqual(module.training, True) + + modAL.dropout.set_dropout_mode(model, [], train_mode) + + self.assertEqual(modules[7].training, train_mode) + self.assertEqual(modules[11].training, train_mode) + + # set dropout mode only for special layers: + for train_mode in [True, False]: + model = Torch_Model() + modules = list(model.modules()) + modAL.dropout.set_dropout_mode(model, [7], train_mode) + self.assertEqual(modules[7].training, train_mode) + self.assertEqual(modules[11].training, True) + + modAL.dropout.set_dropout_mode(model, [], True) + modAL.dropout.set_dropout_mode(model, [11], train_mode) + self.assertEqual(modules[11].training, train_mode) + self.assertEqual(modules[7].training, True) + + # No Dropout Layer + self.assertRaises(KeyError, modAL.dropout.set_dropout_mode, + model, [5], train_mode) class 
TestDeepActiveLearner(unittest.TestCase): """ - Tests for the base class methods of the BaseLearner (base.py) are provided in + Tests for the base class methods of the BaseLearner (base.py) are provided in the TestActiveLearner. """ @@ -1535,17 +1690,5 @@ def test_examples(self): import example_tests.ranked_batch_mode -# Empty PyTorch model for test cases -class Torch_Model(nn.Module): - def __init__(self,): - super(Torch_Model, self).__init__() - self.convs = nn.Sequential( - nn.Conv2d(1, 5, 3), - ) - - def forward(self, x): - return x - - if __name__ == '__main__': unittest.main(verbosity=2) From e5218f551f78217126f714be7218ba216741c307 Mon Sep 17 00:00:00 2001 From: Max Keller Date: Wed, 25 Aug 2021 19:51:21 +0200 Subject: [PATCH 165/182] Add pyTorch mc_dropout example --- examples/pytorch_mc_dropout.py | 126 +++++++++++++++++++++++++++++++++ modAL/models/base.py | 4 -- 2 files changed, 126 insertions(+), 4 deletions(-) create mode 100644 examples/pytorch_mc_dropout.py diff --git a/examples/pytorch_mc_dropout.py b/examples/pytorch_mc_dropout.py new file mode 100644 index 0000000..0e0befa --- /dev/null +++ b/examples/pytorch_mc_dropout.py @@ -0,0 +1,126 @@ +""" +In this file the basic ModAL PyTorch DeepActiveLearner workflow is explained +through an example on the MNIST dataset and the MC-Dropout-Bald query strategy. 
+""" +import sys +import os +import torch +from torch import nn +from skorch import NeuralNetClassifier + +from modAL.models import DeepActiveLearner + +# import of query strategies +from modAL.dropout import mc_dropout_bald, mc_dropout_mean_st + +import numpy as np +from torch.utils.data import DataLoader +from torchvision.transforms import ToTensor +from torchvision.datasets import MNIST + +# Standard Pytorch Model (Visit the PyTorch documentation for more details) +class Torch_Model(nn.Module): + def __init__(self,): + super(Torch_Model, self).__init__() + self.convs = nn.Sequential( + nn.Conv2d(1, 32, 3), + nn.ReLU(), + nn.Conv2d(32, 64, 3), + nn.ReLU(), + nn.MaxPool2d(2), + nn.Dropout(0.25) + ) + self.fcs = nn.Sequential( + nn.Linear(12*12*64, 128), + nn.ReLU(), + nn.Dropout(0.5), + nn.Linear(128, 10), + ) + + def forward(self, x): + out = x + out = self.convs(out) + out = out.view(-1, 12*12*64) + out = self.fcs(out) + return out + + +torch_model = Torch_Model() +""" +You can acquire from the layer_list the dropout_layer_indexes, which can then be passed on +to the query strategies to decide which dropout layers should be active for the predictions. +When no dropout_layer_indexes are passed, all dropout layers will be activated on default. 
+""" +layer_list = list(torch_model.modules()) + +device = "cuda" if torch.cuda.is_available() else "cpu" + +# Use the NeuralNetClassifier from skorch to wrap the Pytorch model to the scikit-learn API +classifier = NeuralNetClassifier(Torch_Model, + criterion=torch.nn.CrossEntropyLoss, + optimizer=torch.optim.Adam, + train_split=None, + verbose=1, + device=device) + + +# Load the Dataset +mnist_data = MNIST('.', download=True, transform=ToTensor()) +dataloader = DataLoader(mnist_data, shuffle=True, batch_size=60000) +X, y = next(iter(dataloader)) + +# read training data +X_train, X_test, y_train, y_test = X[:50000], X[50000:], y[:50000], y[50000:] +X_train = X_train.reshape(50000, 1, 28, 28) +X_test = X_test.reshape(10000, 1, 28, 28) + +# assemble initial data +n_initial = 1000 +initial_idx = np.random.choice( + range(len(X_train)), size=n_initial, replace=False) +X_initial = X_train[initial_idx] +y_initial = y_train[initial_idx] + + +# generate the pool +# remove the initial data from the training dataset +X_pool = np.delete(X_train, initial_idx, axis=0)[:5000] +y_pool = np.delete(y_train, initial_idx, axis=0)[:5000] + + +# initialize ActiveLearner (Pass to him the skorch wrapped PyTorch model & the Query strategy) +learner = DeepActiveLearner( + estimator=classifier, + query_strategy=mc_dropout_bald, +) +# initial teaching if desired (not necessary) +learner.teach(X_initial, y_initial) + +print("Score from sklearn: {}".format(learner.score(X_pool, y_pool))) + + +# the active learning loop +n_queries = 10 +X_teach = X_initial +y_teach = y_initial + + +for idx in range(n_queries): + print('Query no. 
%d' % (idx + 1)) + """ + Query new data (num_cycles are the number of dropout forward passes that should be performed) + --> check the documentation of mc_dropout_bald in modAL/dropout.py to see all available parameters + """ + query_idx, metric_values = learner.query( + X_pool, n_instances=100, dropout_layer_indexes=[7, 11], num_cycles=10) + # Add queried instances + X_teach = torch.cat((X_teach, X_pool[query_idx])) + y_teach = torch.cat((y_teach, y_pool[query_idx])) + learner.teach(X_teach, y_teach) + + # remove queried instance from pool + X_pool = np.delete(X_pool, query_idx, axis=0) + y_pool = np.delete(y_pool, query_idx, axis=0) + + # give us the model performance + print("Model score: {}".format(learner.score(X_test, y_test))) diff --git a/modAL/models/base.py b/modAL/models/base.py index cff5dc0..de356fa 100644 --- a/modAL/models/base.py +++ b/modAL/models/base.py @@ -250,10 +250,6 @@ def _fit_on_new(self, X: modALinput, y: modALinput, bootstrap: bool = False, **f def predict(self, X: modALinput) -> Any: pass - @abc.abstractmethod - def teach(self, X: modALinput, y: modALinput, bootstrap: bool = False, **fit_kwargs) -> Any: - pass - def transform_without_estimating(self, X: modALinput) -> Union[np.ndarray, sp.csr_matrix]: """ Transforms the data as supplied to each learner's estimator and concatenates transformations. 
From a9cfa6ac37c3f583deaaed8afced693ce3783b00 Mon Sep 17 00:00:00 2001 From: Max Keller Date: Wed, 25 Aug 2021 19:53:23 +0200 Subject: [PATCH 166/182] Change np.bool to bool, (deprecation in numpy) --- modAL/batch.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modAL/batch.py b/modAL/batch.py index 39488c4..c860966 100644 --- a/modAL/batch.py +++ b/modAL/batch.py @@ -162,7 +162,7 @@ def ranked_batch(classifier: Union[BaseLearner, BaseCommittee], ceiling = np.minimum(unlabeled.shape[0], n_instances) - len(instance_index_ranking) # mask for unlabeled initialized as transparent - mask = np.ones(unlabeled.shape[0], np.bool) + mask = np.ones(unlabeled.shape[0], bool) for _ in range(ceiling): From eb63dc0c6bdb12b78f53b2ac159ebfd445a0746e Mon Sep 17 00:00:00 2001 From: Max Keller Date: Wed, 25 Aug 2021 20:04:44 +0200 Subject: [PATCH 167/182] Change return_metric warning location & put abstract teach back --- modAL/models/base.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/modAL/models/base.py b/modAL/models/base.py index de356fa..71472d9 100644 --- a/modAL/models/base.py +++ b/modAL/models/base.py @@ -181,10 +181,11 @@ def query(self, X_pool, return_metrics: bool = False, *query_args, **query_kwarg query_metrics = None query_result = self.query_strategy( self, X_pool, *query_args, **query_kwargs) - warnings.warn( - "The selected query strategy doesn't support return_metrics") if return_metrics: + if query_metrics is None: + warnings.warn( + "The selected query strategy doesn't support return_metrics") return query_result, retrieve_rows(X_pool, query_result), query_metrics else: return query_result, retrieve_rows(X_pool, query_result) @@ -246,6 +247,10 @@ def _fit_on_new(self, X: modALinput, y: modALinput, bootstrap: bool = False, **f for learner in self.learner_list: learner._fit_on_new(X, y, bootstrap=bootstrap, **fit_kwargs) + @abc.abstractmethod + def teach(self, X: modALinput, y: modALinput, bootstrap: bool = 
False, **fit_kwargs) -> Any: + pass + @abc.abstractmethod def predict(self, X: modALinput) -> Any: pass @@ -288,10 +293,11 @@ def query(self, X_pool, return_metrics: bool = False, *query_args, **query_kwarg query_metrics = None query_result = self.query_strategy( self, X_pool, *query_args, **query_kwargs) - warnings.warn( - "The selected query strategy doesn't support return_metrics") if return_metrics: + if query_metrics is None: + warnings.warn( + "The selected query strategy doesn't support return_metrics") return query_result, retrieve_rows(X_pool, query_result), query_metrics else: return query_result, retrieve_rows(X_pool, query_result) From 4cb70608d1458fbf6a8a5d906502fead10ea4497 Mon Sep 17 00:00:00 2001 From: Max Keller Date: Wed, 1 Sep 2021 10:38:14 +0200 Subject: [PATCH 168/182] Remove mc_dropout_multi --- modAL/dropout.py | 34 ---------------------------------- 1 file changed, 34 deletions(-) diff --git a/modAL/dropout.py b/modAL/dropout.py index 8cf8905..92aca53 100644 --- a/modAL/dropout.py +++ b/modAL/dropout.py @@ -20,40 +20,6 @@ def default_logits_adaptor(input_tensor: torch.tensor, samples: modALinput): return input_tensor -def mc_dropout_multi(classifier: BaseEstimator, X: modALinput, query_strategies: list = ["bald", "mean_st", "max_entropy", "max_var"], - n_instances: int = 1, random_tie_break: bool = False, dropout_layer_indexes: list = [], - num_cycles: int = 50, sample_per_forward_pass: int = 1000, - logits_adaptor: Callable[[ - torch.tensor, modALinput], torch.tensor] = default_logits_adaptor, - **mc_dropout_kwargs) -> np.ndarray: - """ - Multi metric dropout query strategy. Returns the specified metrics for given input data. - Selection of query strategies are: - - bald: BALD query strategy - - mean_st: Mean Standard deviation - - max_entropy: maximum entropy - - max_var: maximum variation - By default all query strategies are selected - - Function returns dictionary of metrics with their name as key. 
- The indices of the n-best samples (n_instances) is not used in this function. - """ - predictions = get_predictions( - classifier, X, dropout_layer_indexes, num_cycles, sample_per_forward_pass, logits_adaptor) - - metrics_dict = {} - if "bald" in query_strategies: - metrics_dict["bald"] = _bald_divergence(predictions) - if "mean_st" in query_strategies: - metrics_dict["mean_st"] = _mean_standard_deviation(predictions) - if "max_entropy" in query_strategies: - metrics_dict["max_entropy"] = _entropy(predictions) - if "max_var" in query_strategies: - metrics_dict["max_var"] = _variation_ratios(predictions) - - return None, metrics_dict - - def mc_dropout_bald(classifier: BaseEstimator, X: modALinput, n_instances: int = 1, random_tie_break: bool = False, dropout_layer_indexes: list = [], num_cycles: int = 50, sample_per_forward_pass: int = 1000, From 612a9eda1d4eee3545a25e3c9fce534c35b0aadd Mon Sep 17 00:00:00 2001 From: maxkeller321 Date: Sat, 18 Sep 2021 12:19:41 +0200 Subject: [PATCH 169/182] Remove abstract teach (BaseCommitte) & Fix test --- modAL/models/base.py | 4 ---- tests/core_tests.py | 2 +- 2 files changed, 1 insertion(+), 5 deletions(-) diff --git a/modAL/models/base.py b/modAL/models/base.py index 71472d9..ab7a6b2 100644 --- a/modAL/models/base.py +++ b/modAL/models/base.py @@ -247,10 +247,6 @@ def _fit_on_new(self, X: modALinput, y: modALinput, bootstrap: bool = False, **f for learner in self.learner_list: learner._fit_on_new(X, y, bootstrap=bootstrap, **fit_kwargs) - @abc.abstractmethod - def teach(self, X: modALinput, y: modALinput, bootstrap: bool = False, **fit_kwargs) -> Any: - pass - @abc.abstractmethod def predict(self, X: modALinput) -> Any: pass diff --git a/tests/core_tests.py b/tests/core_tests.py index 86db55e..e00a833 100644 --- a/tests/core_tests.py +++ b/tests/core_tests.py @@ -1205,7 +1205,7 @@ def test_old_query_strategy_interface(self): # (e.g. 
instance not matching instance index), # the old interface remains unchanged query_idx_ = np.random.choice(n_samples, 2) - query_instance_ = X_pool[(query_idx_ + 1) % len(X_pool)] + query_instance_ = X_pool[query_idx_] def custom_query_strategy(classifier, X): return query_idx_, query_instance_ From 939c7c4e3e5dbfe57d35eee4d1ff001fc5335361 Mon Sep 17 00:00:00 2001 From: Max Keller Date: Fri, 8 Oct 2021 08:37:36 +0200 Subject: [PATCH 170/182] Fix test & adapt documentation --- modAL/models/base.py | 1 + modAL/models/learners.py | 7 ++----- tests/core_tests.py | 2 +- 3 files changed, 4 insertions(+), 6 deletions(-) diff --git a/modAL/models/base.py b/modAL/models/base.py index 71472d9..11fc1fa 100644 --- a/modAL/models/base.py +++ b/modAL/models/base.py @@ -330,3 +330,4 @@ def _set_classes(self): def vote(self, X: modALinput) -> Any: # TODO: clarify typing pass + \ No newline at end of file diff --git a/modAL/models/learners.py b/modAL/models/learners.py index 1ca6379..d295833 100644 --- a/modAL/models/learners.py +++ b/modAL/models/learners.py @@ -219,8 +219,7 @@ def __init__(self, def fit(self, X: modALinput, y: modALinput, bootstrap: bool = False, **fit_kwargs) -> 'BaseLearner': """ - Interface for the fit method of the predictor. Fits the predictor to the supplied data, then stores it - internally for the active learning loop. + Interface for the fit method of the predictor. Fits the predictor to the supplied data. Args: X: The samples to be fitted. @@ -236,7 +235,7 @@ def fit(self, X: modALinput, y: modALinput, bootstrap: bool = False, **fit_kwarg def teach(self, X: modALinput, y: modALinput, warm_start: bool = True, bootstrap: bool = False, **fit_kwargs) -> None: """ - Adds X and y to the known training data and retrains the predictor with the augmented dataset. + Trains the predictor with the passed data (warm_start decides if params are resetted or not). Args: X: The new samples for which the labels are supplied by the expert. 
@@ -734,8 +733,6 @@ def __init__(self, learner_list: List[DeepActiveLearner], query_strategy: Callab on_transformed: bool = False) -> None: super().__init__(learner_list, query_strategy, on_transformed) self._set_classes() - # TODO: update training data when using fit() and teach() methods - self.X_training = None def fit(self, X: modALinput, y: modALinput, **fit_kwargs) -> None: """ diff --git a/tests/core_tests.py b/tests/core_tests.py index 86db55e..e00a833 100644 --- a/tests/core_tests.py +++ b/tests/core_tests.py @@ -1205,7 +1205,7 @@ def test_old_query_strategy_interface(self): # (e.g. instance not matching instance index), # the old interface remains unchanged query_idx_ = np.random.choice(n_samples, 2) - query_instance_ = X_pool[(query_idx_ + 1) % len(X_pool)] + query_instance_ = X_pool[query_idx_] def custom_query_strategy(classifier, X): return query_idx_, query_instance_ From ff8356bdb5c8f72278cd96c29796785ad53bd5fa Mon Sep 17 00:00:00 2001 From: Max Keller Date: Sat, 9 Oct 2021 10:17:09 +0200 Subject: [PATCH 171/182] Remove DeepCommittee --- modAL/models/__init__.py | 4 +- modAL/models/base.py | 94 +++++++++---- modAL/models/learners.py | 296 ++++----------------------------------- 3 files changed, 94 insertions(+), 300 deletions(-) diff --git a/modAL/models/__init__.py b/modAL/models/__init__.py index e178fe8..2c2bd87 100644 --- a/modAL/models/__init__.py +++ b/modAL/models/__init__.py @@ -1,6 +1,6 @@ -from .learners import ActiveLearner, DeepActiveLearner, BayesianOptimizer, Committee, DeepCommittee, CommitteeRegressor +from .learners import ActiveLearner, DeepActiveLearner, BayesianOptimizer, Committee, CommitteeRegressor __all__ = [ 'ActiveLearner', 'DeepActiveLearner', 'BayesianOptimizer', - 'Committee', 'DeepCommittee', 'CommitteeRegressor' + 'Committee', 'CommitteeRegressor' ] \ No newline at end of file diff --git a/modAL/models/base.py b/modAL/models/base.py index 777f58e..d767b6c 100644 --- a/modAL/models/base.py +++ b/modAL/models/base.py @@ 
-212,20 +212,20 @@ def teach(self, *args, **kwargs) -> None: class BaseCommittee(ABC, BaseEstimator): """ Base class for query-by-committee setup. - Args: learner_list: List of ActiveLearner objects to form committee. query_strategy: Function to query labels. on_transformed: Whether to transform samples with the pipeline defined by each learner's estimator when applying the query strategy. """ - def __init__(self, learner_list: List[BaseLearner], query_strategy: Callable, on_transformed: bool = False) -> None: assert type(learner_list) == list, 'learners must be supplied in a list' self.learner_list = learner_list self.query_strategy = query_strategy self.on_transformed = on_transformed + # TODO: update training data when using fit() and teach() methods + self.X_training = None def __iter__(self) -> Iterator[BaseLearner]: for learner in self.learner_list: @@ -234,10 +234,33 @@ def __iter__(self) -> Iterator[BaseLearner]: def __len__(self) -> int: return len(self.learner_list) + def _add_training_data(self, X: modALinput, y: modALinput) -> None: + """ + Adds the new data and label to the known data for each learner, but does not retrain the model. + Args: + X: The new samples for which the labels are supplied by the expert. + y: Labels corresponding to the new instances in X. + Note: + If the learners have been fitted, the features in X have to agree with the training samples which the + classifier has seen. + """ + for learner in self.learner_list: + learner._add_training_data(X, y) + + def _fit_to_known(self, bootstrap: bool = False, **fit_kwargs) -> None: + """ + Fits all learners to the training data and labels provided to it so far. + Args: + bootstrap: If True, each estimator is trained on a bootstrapped dataset. Useful when + using bagging to build the ensemble. + **fit_kwargs: Keyword arguments to be passed to the fit method of the predictor. 
+ """ + for learner in self.learner_list: + learner._fit_to_known(bootstrap=bootstrap, **fit_kwargs) + def _fit_on_new(self, X: modALinput, y: modALinput, bootstrap: bool = False, **fit_kwargs) -> None: """ Fits all learners to the given data and labels. - Args: X: The new samples for which the labels are supplied by the expert. y: Labels corresponding to the new instances in X. @@ -247,16 +270,27 @@ def _fit_on_new(self, X: modALinput, y: modALinput, bootstrap: bool = False, **f for learner in self.learner_list: learner._fit_on_new(X, y, bootstrap=bootstrap, **fit_kwargs) - @abc.abstractmethod - def predict(self, X: modALinput) -> Any: - pass + def fit(self, X: modALinput, y: modALinput, **fit_kwargs) -> 'BaseCommittee': + """ + Fits every learner to a subset sampled with replacement from X. Calling this method makes the learner forget the + data it has seen up until this point and replaces it with X! If you would like to perform bootstrapping on each + learner using the data it has seen, use the method .rebag()! + Calling this method makes the learner forget the data it has seen up until this point and replaces it with X! + Args: + X: The samples to be fitted on. + y: The corresponding labels. + **fit_kwargs: Keyword arguments to be passed to the fit method of the predictor. + """ + for learner in self.learner_list: + learner.fit(X, y, **fit_kwargs) + + return self def transform_without_estimating(self, X: modALinput) -> Union[np.ndarray, sp.csr_matrix]: """ Transforms the data as supplied to each learner's estimator and concatenates transformations. 
Args: X: dataset to be transformed - Returns: Transformed data set """ @@ -298,32 +332,38 @@ def query(self, X_pool, return_metrics: bool = False, *query_args, **query_kwarg else: return query_result, retrieve_rows(X_pool, query_result) - def _set_classes(self): + def rebag(self, **fit_kwargs) -> None: """ - Checks the known class labels by each learner, merges the labels and returns a mapping which maps the learner's - classes to the complete label list. + Refits every learner with a dataset bootstrapped from its training instances. Contrary to .bag(), it bootstraps + the training data for each learner based on its own examples. + Todo: + Where is .bag()? + Args: + **fit_kwargs: Keyword arguments to be passed to the fit method of the predictor. """ - # assemble the list of known classes from each learner - try: - # if estimators are fitted - known_classes = tuple( - learner.estimator.classes_ for learner in self.learner_list) - except AttributeError: - # handle unfitted estimators - self.classes_ = None - self.n_classes_ = 0 - return - - self.classes_ = np.unique( - np.concatenate(known_classes, axis=0), - axis=0 - ) - self.n_classes_ = len(self.classes_) + self._fit_to_known(bootstrap=True, **fit_kwargs) + def teach(self, X: modALinput, y: modALinput, bootstrap: bool = False, only_new: bool = False, **fit_kwargs) -> None: + """ + Adds X and y to the known training data for each learner and retrains learners with the augmented dataset. + Args: + X: The new samples for which the labels are supplied by the expert. + y: Labels corresponding to the new instances in X. + bootstrap: If True, trains each learner on a bootstrapped set. Useful when building the ensemble by bagging. + only_new: If True, the model is retrained using only X and y, ignoring the previously provided examples. + **fit_kwargs: Keyword arguments to be passed to the fit method of the predictor. 
+ """ + self._add_training_data(X, y) + if not only_new: + self._fit_to_known(bootstrap=bootstrap, **fit_kwargs) + else: + self._fit_on_new(X, y, bootstrap=bootstrap, **fit_kwargs) + + @abc.abstractmethod + def predict(self, X: modALinput) -> Any: pass @abc.abstractmethod def vote(self, X: modALinput) -> Any: # TODO: clarify typing pass - \ No newline at end of file diff --git a/modAL/models/learners.py b/modAL/models/learners.py index d295833..21d5181 100644 --- a/modAL/models/learners.py +++ b/modAL/models/learners.py @@ -443,20 +443,16 @@ def teach(self, X: modALinput, y: modALinput, bootstrap: bool = False, only_new: class Committee(BaseCommittee): """ This class is an abstract model of a committee-based active learning algorithm. - Args: learner_list: A list of ActiveLearners forming the Committee. query_strategy: Query strategy function. Committee supports disagreement-based query strategies from :mod:`modAL.disagreement`, but uncertainty-based ones from :mod:`modAL.uncertainty` are also supported. on_transformed: Whether to transform samples with the pipeline defined by each learner's estimator when applying the query strategy. - Attributes: classes_: Class labels known by the Committee. n_classes_: Number of classes known by the Committee. - Examples: - >>> from sklearn.datasets import load_iris >>> from sklearn.neighbors import KNeighborsClassifier >>> from sklearn.ensemble import RandomForestClassifier @@ -490,272 +486,52 @@ class Committee(BaseCommittee): ... y=iris['target'][query_idx].reshape(1, ) ... 
) """ - def __init__(self, learner_list: List[ActiveLearner], query_strategy: Callable = vote_entropy_sampling, on_transformed: bool = False) -> None: super().__init__(learner_list, query_strategy, on_transformed) self._set_classes() - # TODO: update training data when using fit() and teach() methods - self.X_training = None - def _add_training_data(self, X: modALinput, y: modALinput) -> None: + def _set_classes(self): """ - Adds the new data and label to the known data for each learner, but does not retrain the model. - - Args: - X: The new samples for which the labels are supplied by the expert. - y: Labels corresponding to the new instances in X. - - Note: - If the learners have been fitted, the features in X have to agree with the training samples which the - classifier has seen. + Checks the known class labels by each learner, merges the labels and returns a mapping which maps the learner's + classes to the complete label list. """ - for learner in self.learner_list: - learner._add_training_data(X, y) + # assemble the list of known classes from each learner + try: + # if estimators are fitted + known_classes = tuple(learner.estimator.classes_ for learner in self.learner_list) + except AttributeError: + # handle unfitted estimators + self.classes_ = None + self.n_classes_ = 0 + return - def _fit_to_known(self, bootstrap: bool = False, **fit_kwargs) -> None: - """ - Fits all learners to the training data and labels provided to it so far. + self.classes_ = np.unique( + np.concatenate(known_classes, axis=0), + axis=0 + ) + self.n_classes_ = len(self.classes_) - Args: - bootstrap: If True, each estimator is trained on a bootstrapped dataset. Useful when - using bagging to build the ensemble. - **fit_kwargs: Keyword arguments to be passed to the fit method of the predictor. 
- """ - for learner in self.learner_list: - learner._fit_to_known(bootstrap=bootstrap, **fit_kwargs) + def _add_training_data(self, X: modALinput, y: modALinput): + super()._add_training_data(X, y) - def fit(self, X: modALinput, y: modALinput, **fit_kwargs) -> None: + def fit(self, X: modALinput, y: modALinput, **fit_kwargs) -> 'BaseCommittee': """ Fits every learner to a subset sampled with replacement from X. Calling this method makes the learner forget the data it has seen up until this point and replaces it with X! If you would like to perform bootstrapping on each learner using the data it has seen, use the method .rebag()! - Calling this method makes the learner forget the data it has seen up until this point and replaces it with X! - Args: X: The samples to be fitted on. y: The corresponding labels. **fit_kwargs: Keyword arguments to be passed to the fit method of the predictor. """ - for learner in self.learner_list: - learner.fit(X, y, **fit_kwargs) - + super().fit(X, y, **fit_kwargs) self._set_classes() - def rebag(self, **fit_kwargs) -> None: - """ - Refits every learner with a dataset bootstrapped from its training instances. Contrary to .bag(), it bootstraps - the training data for each learner based on its own examples. - - Todo: - Where is .bag()? - - Args: - **fit_kwargs: Keyword arguments to be passed to the fit method of the predictor. - """ - self._fit_to_known(bootstrap=True, **fit_kwargs) - def teach(self, X: modALinput, y: modALinput, bootstrap: bool = False, only_new: bool = False, **fit_kwargs) -> None: """ Adds X and y to the known training data for each learner and retrains learners with the augmented dataset. - - Args: - X: The new samples for which the labels are supplied by the expert. - y: Labels corresponding to the new instances in X. - bootstrap: If True, trains each learner on a bootstrapped set. Useful when building the ensemble by bagging. 
- only_new: If True, the model is retrained using only X and y, ignoring the previously provided examples. - **fit_kwargs: Keyword arguments to be passed to the fit method of the predictor. - """ - self._add_training_data(X, y) - if not only_new: - self._fit_to_known(bootstrap=bootstrap, **fit_kwargs) - else: - self._fit_on_new(X, y, bootstrap=bootstrap, **fit_kwargs) - self._set_classes() - - def predict(self, X: modALinput, **predict_proba_kwargs) -> Any: - """ - Predicts the class of the samples by picking the consensus prediction. - - Args: - X: The samples to be predicted. - **predict_proba_kwargs: Keyword arguments to be passed to the :meth:`predict_proba` of the Committee. - - Returns: - The predicted class labels for X. - """ - # getting average certainties - proba = self.predict_proba(X, **predict_proba_kwargs) - # finding the sample-wise max probability - max_proba_idx = np.argmax(proba, axis=1) - # translating label indices to labels - return self.classes_[max_proba_idx] - - def predict_proba(self, X: modALinput, **predict_proba_kwargs) -> Any: - """ - Consensus probabilities of the Committee. - - Args: - X: The samples for which the class probabilities are to be predicted. - **predict_proba_kwargs: Keyword arguments to be passed to the :meth:`predict_proba` of the Committee. - - Returns: - Class probabilities for X. - """ - return np.mean(self.vote_proba(X, **predict_proba_kwargs), axis=1) - - def score(self, X: modALinput, y: modALinput, sample_weight: List[float] = None) -> Any: - """ - Returns the mean accuracy on the given test data and labels. - - Todo: - Why accuracy? - - Args: - X: The samples to score. - y: Ground truth labels corresponding to X. - sample_weight: Sample weights. - - Returns: - Mean accuracy of the classifiers. 
- """ - y_pred = self.predict(X) - return accuracy_score(y, y_pred, sample_weight=sample_weight) - - def vote(self, X: modALinput, **predict_kwargs) -> Any: - """ - Predicts the labels for the supplied data for each learner in the Committee. - - Args: - X: The samples to cast votes. - **predict_kwargs: Keyword arguments to be passed to the :meth:`predict` of the learners. - - Returns: - The predicted class for each learner in the Committee and each sample in X. - """ - prediction = np.zeros(shape=(X.shape[0], len(self.learner_list))) - - for learner_idx, learner in enumerate(self.learner_list): - prediction[:, learner_idx] = learner.predict(X, **predict_kwargs) - - return prediction - - def vote_proba(self, X: modALinput, **predict_proba_kwargs) -> Any: - """ - Predicts the probabilities of the classes for each sample and each learner. - - Args: - X: The samples for which class probabilities are to be calculated. - **predict_proba_kwargs: Keyword arguments for the :meth:`predict_proba` of the learners. - - Returns: - Probabilities of each class for each learner and each instance. 
- """ - - # get dimensions - n_samples = X.shape[0] - n_learners = len(self.learner_list) - proba = np.zeros(shape=(n_samples, n_learners, self.n_classes_)) - - # checking if the learners in the Committee know the same set of class labels - if check_class_labels(*[learner.estimator for learner in self.learner_list]): - # known class labels are the same for each learner - # probability prediction is straightforward - - for learner_idx, learner in enumerate(self.learner_list): - proba[:, learner_idx, :] = learner.predict_proba( - X, **predict_proba_kwargs) - - else: - for learner_idx, learner in enumerate(self.learner_list): - proba[:, learner_idx, :] = check_class_proba( - proba=learner.predict_proba(X, **predict_proba_kwargs), - known_labels=learner.estimator.classes_, - all_labels=self.classes_ - ) - - return proba - - -class DeepCommittee(BaseCommittee): - """ - This class is for committee-based deep active learner algorithms. - - Args: - learner_list: A list of ActiveLearners forming the Committee. - query_strategy: Query strategy function. Committee supports disagreement-based query strategies from - :mod:`modAL.disagreement`, but uncertainty-based ones from :mod:`modAL.uncertainty` are also supported. - on_transformed: Whether to transform samples with the pipeline defined by each learner's estimator - when applying the query strategy. - - Attributes: - classes_: Class labels known by the Committee. - n_classes_: Number of classes known by the Committee. - - Examples: - - >>> from sklearn.datasets import load_iris - >>> from sklearn.neighbors import KNeighborsClassifier - >>> from sklearn.ensemble import RandomForestClassifier - >>> from modAL.models import ActiveLearner, Committee - >>> - >>> iris = load_iris() - >>> - >>> # initialize ActiveLearners - >>> learner_1 = ActiveLearner( - ... estimator=RandomForestClassifier(), - ... X_training=iris['data'][[0, 50, 100]], y_training=iris['target'][[0, 50, 100]] - ... ) - >>> learner_2 = ActiveLearner( - ... 
estimator=KNeighborsClassifier(n_neighbors=3), - ... X_training=iris['data'][[1, 51, 101]], y_training=iris['target'][[1, 51, 101]] - ... ) - >>> - >>> # initialize the Committee - >>> committee = Committee( - ... learner_list=[learner_1, learner_2] - ... ) - >>> - >>> # querying for labels - >>> query_idx, query_sample = committee.query(iris['data']) - >>> - >>> # ...obtaining new labels from the Oracle... - >>> - >>> # teaching newly labelled examples - >>> committee.teach( - ... X=iris['data'][query_idx].reshape(1, -1), - ... y=iris['target'][query_idx].reshape(1, ) - ... ) - """ - - def __init__(self, learner_list: List[DeepActiveLearner], query_strategy: Callable = vote_entropy_sampling, - on_transformed: bool = False) -> None: - super().__init__(learner_list, query_strategy, on_transformed) - self._set_classes() - - def fit(self, X: modALinput, y: modALinput, **fit_kwargs) -> None: - """ - Fits every learner to a subset sampled with replacement from X. Calling this method makes the learner forget the - data it has seen up until this point and replaces it with X! If you would like to perform bootstrapping on each - learner using the data it has seen, use the method .rebag()! - - Calling this method makes the learner forget the data it has seen up until this point and replaces it with X! - - Args: - X: The samples to be fitted on. - y: The corresponding labels. - **fit_kwargs: Keyword arguments to be passed to the fit method of the predictor. - """ - for learner in self.learner_list: - learner.fit(X, y, **fit_kwargs) - - self._set_classes() - - def teach(self, X: modALinput, y: modALinput, bootstrap: bool = False, **fit_kwargs) -> None: - """ - Adds X and y to the known training data for each learner and retrains learners with the augmented dataset. - Args: X: The new samples for which the labels are supplied by the expert. y: Labels corresponding to the new instances in X. 
@@ -763,17 +539,15 @@ def teach(self, X: modALinput, y: modALinput, bootstrap: bool = False, **fit_kwa only_new: If True, the model is retrained using only X and y, ignoring the previously provided examples. **fit_kwargs: Keyword arguments to be passed to the fit method of the predictor. """ - self._fit_on_new(X, y, bootstrap=bootstrap, **fit_kwargs) + super().teach(X, y, bootstrap=bootstrap, only_new=only_new, **fit_kwargs) self._set_classes() def predict(self, X: modALinput, **predict_proba_kwargs) -> Any: """ Predicts the class of the samples by picking the consensus prediction. - Args: X: The samples to be predicted. **predict_proba_kwargs: Keyword arguments to be passed to the :meth:`predict_proba` of the Committee. - Returns: The predicted class labels for X. """ @@ -787,11 +561,9 @@ def predict(self, X: modALinput, **predict_proba_kwargs) -> Any: def predict_proba(self, X: modALinput, **predict_proba_kwargs) -> Any: """ Consensus probabilities of the Committee. - Args: X: The samples for which the class probabilities are to be predicted. **predict_proba_kwargs: Keyword arguments to be passed to the :meth:`predict_proba` of the Committee. - Returns: Class probabilities for X. """ @@ -799,17 +571,13 @@ def predict_proba(self, X: modALinput, **predict_proba_kwargs) -> Any: def score(self, X: modALinput, y: modALinput, sample_weight: List[float] = None) -> Any: """ - TODO test with deep learning ... I think it needs still adaption Returns the mean accuracy on the given test data and labels. - Todo: Why accuracy? - Args: X: The samples to score. y: Ground truth labels corresponding to X. sample_weight: Sample weights. - Returns: Mean accuracy of the classifiers. """ @@ -819,11 +587,9 @@ def score(self, X: modALinput, y: modALinput, sample_weight: List[float] = None) def vote(self, X: modALinput, **predict_kwargs) -> Any: """ Predicts the labels for the supplied data for each learner in the Committee. - Args: X: The samples to cast votes. 
**predict_kwargs: Keyword arguments to be passed to the :meth:`predict` of the learners. - Returns: The predicted class for each learner in the Committee and each sample in X. """ @@ -837,11 +603,9 @@ def vote(self, X: modALinput, **predict_kwargs) -> Any: def vote_proba(self, X: modALinput, **predict_proba_kwargs) -> Any: """ Predicts the probabilities of the classes for each sample and each learner. - Args: X: The samples for which class probabilities are to be calculated. **predict_proba_kwargs: Keyword arguments for the :meth:`predict_proba` of the learners. - Returns: Probabilities of each class for each learner and each instance. """ @@ -857,8 +621,7 @@ def vote_proba(self, X: modALinput, **predict_proba_kwargs) -> Any: # probability prediction is straightforward for learner_idx, learner in enumerate(self.learner_list): - proba[:, learner_idx, :] = learner.predict_proba( - X, **predict_proba_kwargs) + proba[:, learner_idx, :] = learner.predict_proba(X, **predict_proba_kwargs) else: for learner_idx, learner in enumerate(self.learner_list): @@ -874,15 +637,12 @@ def vote_proba(self, X: modALinput, **predict_proba_kwargs) -> Any: class CommitteeRegressor(BaseCommittee): """ This class is an abstract model of a committee-based active learning regression. - Args: learner_list: A list of ActiveLearners forming the CommitteeRegressor. query_strategy: Query strategy function. on_transformed: Whether to transform samples with the pipeline defined by each learner's estimator when applying the query strategy. - Examples: - >>> import numpy as np >>> import matplotlib.pyplot as plt >>> from sklearn.gaussian_process import GaussianProcessRegressor @@ -923,7 +683,6 @@ class CommitteeRegressor(BaseCommittee): ... query_idx, query_instance = committee.query(X.reshape(-1, 1)) ... 
committee.teach(X[query_idx].reshape(-1, 1), y[query_idx].reshape(-1, 1)) """ - def __init__(self, learner_list: List[ActiveLearner], query_strategy: Callable = max_std_sampling, on_transformed: bool = False) -> None: super().__init__(learner_list, query_strategy, on_transformed) @@ -931,11 +690,9 @@ def __init__(self, learner_list: List[ActiveLearner], query_strategy: Callable = def predict(self, X: modALinput, return_std: bool = False, **predict_kwargs) -> Any: """ Predicts the values of the samples by averaging the prediction of each regressor. - Args: X: The samples to be predicted. **predict_kwargs: Keyword arguments to be passed to the :meth:`vote` method of the CommitteeRegressor. - Returns: The predicted class labels for X. """ @@ -948,18 +705,15 @@ def predict(self, X: modALinput, return_std: bool = False, **predict_kwargs) -> def vote(self, X: modALinput, **predict_kwargs): """ Predicts the values for the supplied data for each regressor in the CommitteeRegressor. - Args: X: The samples to cast votes. **predict_kwargs: Keyword arguments to be passed to :meth:`predict` of the learners. - Returns: The predicted value for each regressor in the CommitteeRegressor and each sample in X. 
""" prediction = np.zeros(shape=(len(X), len(self.learner_list))) for learner_idx, learner in enumerate(self.learner_list): - prediction[:, learner_idx] = learner.predict( - X, **predict_kwargs).reshape(-1, ) + prediction[:, learner_idx] = learner.predict(X, **predict_kwargs).reshape(-1, ) - return prediction + return prediction \ No newline at end of file From 855be41814d4534f0c376442ac375b111faffefb Mon Sep 17 00:00:00 2001 From: Max Keller Date: Thu, 21 Oct 2021 21:04:15 +0200 Subject: [PATCH 172/182] Organize imports & catch all exceptions in querys --- examples/active_regression.py | 6 +-- examples/bagging.py | 7 +-- examples/bayesian_optimization.py | 11 ++-- examples/bayesian_optimization_multidim.py | 5 +- examples/custom_query_strategies.py | 8 ++- examples/deep_bayesian_active_learning.py | 4 +- examples/ensemble.py | 5 +- examples/ensemble_regression.py | 8 +-- examples/information_density.py | 1 - examples/keras_integration.py | 2 +- examples/multilabel_svm.py | 6 +-- examples/pool-based_sampling.py | 8 +-- examples/pytorch_integration.py | 10 ++-- examples/pytorch_mc_dropout.py | 18 +++---- examples/query_by_committee.py | 9 ++-- examples/ranked_batch_mode.py | 12 ++--- examples/runtime_comparison.py | 21 +++----- examples/shape_learning.py | 7 +-- examples/sklearn_workflow.py | 4 +- examples/stream-based_sampling.py | 4 +- modAL/__init__.py | 2 +- modAL/acquisition.py | 7 ++- modAL/batch.py | 5 +- modAL/disagreement.py | 5 +- modAL/dropout.py | 12 ++--- modAL/expected_error.py | 8 +-- modAL/models/__init__.py | 5 +- modAL/models/base.py | 12 ++--- modAL/models/learners.py | 27 ++++------ modAL/multilabel.py | 9 ++-- modAL/uncertainty.py | 6 +-- modAL/utils/__init__.py | 5 +- modAL/utils/combination.py | 3 +- modAL/utils/data.py | 5 +- modAL/utils/validation.py | 2 +- setup.py | 2 +- tests/core_tests.py | 54 +++++++++---------- tests/example_tests/active_regression.py | 6 +-- tests/example_tests/bagging.py | 5 +- 
tests/example_tests/bayesian_optimization.py | 9 ++-- .../example_tests/custom_query_strategies.py | 6 +-- tests/example_tests/ensemble.py | 5 +- tests/example_tests/ensemble_regression.py | 6 +-- tests/example_tests/information_density.py | 4 +- tests/example_tests/multidimensional_data.py | 7 ++- tests/example_tests/multilabel_svm.py | 4 +- tests/example_tests/pool_based_sampling.py | 2 +- tests/example_tests/query_by_committee.py | 5 +- tests/example_tests/ranked_batch_mode.py | 10 ++-- tests/example_tests/shape_learning.py | 5 +- tests/example_tests/stream_based_sampling.py | 2 +- 51 files changed, 187 insertions(+), 214 deletions(-) diff --git a/examples/active_regression.py b/examples/active_regression.py index 5426e5f..abe3fe9 100644 --- a/examples/active_regression.py +++ b/examples/active_regression.py @@ -2,11 +2,11 @@ Active regression example with Gaussian processes. """ -import numpy as np import matplotlib.pyplot as plt -from sklearn.gaussian_process import GaussianProcessRegressor -from sklearn.gaussian_process.kernels import WhiteKernel, RBF +import numpy as np from modAL.models import ActiveLearner +from sklearn.gaussian_process import GaussianProcessRegressor +from sklearn.gaussian_process.kernels import RBF, WhiteKernel # query strategy for regression diff --git a/examples/bagging.py b/examples/bagging.py index 2d61040..868a10b 100644 --- a/examples/bagging.py +++ b/examples/bagging.py @@ -2,11 +2,12 @@ This example shows how to build models with bagging using the Committee model. """ -import numpy as np from itertools import product + +import numpy as np from matplotlib import pyplot as plt -from sklearn.neighbors import KNeighborsClassifier from modAL.models import ActiveLearner, Committee +from sklearn.neighbors import KNeighborsClassifier # creating the dataset im_width = 500 @@ -90,4 +91,4 @@ plt.subplot(1, n_learners, learner_idx+1) plt.imshow(learner.predict(X_pool).reshape(im_height, im_width)) plt.title('Learner no. 
%d after refitting' % (learner_idx + 1)) - plt.show() \ No newline at end of file + plt.show() diff --git a/examples/bayesian_optimization.py b/examples/bayesian_optimization.py index 3caa0b4..0d62ea3 100644 --- a/examples/bayesian_optimization.py +++ b/examples/bayesian_optimization.py @@ -1,11 +1,12 @@ -import numpy as np -import matplotlib.pyplot as plt from functools import partial + +import matplotlib.pyplot as plt +import numpy as np +from modAL.acquisition import (max_EI, max_PI, max_UCB, optimizer_EI, + optimizer_PI, optimizer_UCB) +from modAL.models import BayesianOptimizer from sklearn.gaussian_process import GaussianProcessRegressor from sklearn.gaussian_process.kernels import Matern -from modAL.models import BayesianOptimizer -from modAL.acquisition import optimizer_PI, optimizer_EI, optimizer_UCB, max_PI, max_EI, max_UCB - # generating the data X = np.linspace(0, 20, 1000).reshape(-1, 1) diff --git a/examples/bayesian_optimization_multidim.py b/examples/bayesian_optimization_multidim.py index 26499dc..9c74219 100644 --- a/examples/bayesian_optimization_multidim.py +++ b/examples/bayesian_optimization_multidim.py @@ -1,9 +1,8 @@ import numpy as np +from modAL.acquisition import max_EI +from modAL.models import BayesianOptimizer from sklearn.gaussian_process import GaussianProcessRegressor from sklearn.gaussian_process.kernels import Matern -from modAL.models import BayesianOptimizer -from modAL.acquisition import max_EI - # generating the data x1, x2 = np.linspace(0, 10, 11).reshape(-1, 1), np.linspace(0, 10, 11).reshape(-1, 1) diff --git a/examples/custom_query_strategies.py b/examples/custom_query_strategies.py index 6680457..d969643 100644 --- a/examples/custom_query_strategies.py +++ b/examples/custom_query_strategies.py @@ -25,18 +25,16 @@ def custom_query_strategy(classifier, X, a_keyword_argument=42): and classifier margin. 
""" -import numpy as np import matplotlib.pyplot as plt - +import numpy as np +from modAL.models import ActiveLearner +from modAL.uncertainty import classifier_margin, classifier_uncertainty from modAL.utils.combination import make_linear_combination, make_product from modAL.utils.selection import multi_argmax -from modAL.uncertainty import classifier_uncertainty, classifier_margin -from modAL.models import ActiveLearner from sklearn.datasets import make_blobs from sklearn.gaussian_process import GaussianProcessClassifier from sklearn.gaussian_process.kernels import RBF - # generating the data centers = np.asarray([[-2, 3], [0.5, 5], [1, 1.5]]) X, y = make_blobs( diff --git a/examples/deep_bayesian_active_learning.py b/examples/deep_bayesian_active_learning.py index 265c37d..355549b 100644 --- a/examples/deep_bayesian_active_learning.py +++ b/examples/deep_bayesian_active_learning.py @@ -2,12 +2,14 @@ import numpy as np from keras import backend as K from keras.datasets import mnist +from keras.layers import (Activation, Conv2D, Dense, Dropout, Flatten, + MaxPooling2D) from keras.models import Sequential -from keras.layers import Dense, Dropout, Activation, Flatten, Conv2D, MaxPooling2D from keras.regularizers import l2 from keras.wrappers.scikit_learn import KerasClassifier from modAL.models import ActiveLearner + def create_keras_model(): model = Sequential() model.add(Conv2D(32, (4, 4), activation='relu')) diff --git a/examples/ensemble.py b/examples/ensemble.py index bd621f4..d4c6791 100644 --- a/examples/ensemble.py +++ b/examples/ensemble.py @@ -1,8 +1,9 @@ -import numpy as np from itertools import product + +import numpy as np from matplotlib import pyplot as plt -from sklearn.ensemble import RandomForestClassifier from modAL.models import ActiveLearner, Committee +from sklearn.ensemble import RandomForestClassifier # creating the dataset im_width = 500 diff --git a/examples/ensemble_regression.py b/examples/ensemble_regression.py index bdb2276..e6d845f 
100644 --- a/examples/ensemble_regression.py +++ b/examples/ensemble_regression.py @@ -1,9 +1,9 @@ -import numpy as np import matplotlib.pyplot as plt -from sklearn.gaussian_process import GaussianProcessRegressor -from sklearn.gaussian_process.kernels import WhiteKernel, RBF -from modAL.models import ActiveLearner, CommitteeRegressor +import numpy as np from modAL.disagreement import max_std_sampling +from modAL.models import ActiveLearner, CommitteeRegressor +from sklearn.gaussian_process import GaussianProcessRegressor +from sklearn.gaussian_process.kernels import RBF, WhiteKernel # generating the data X = np.concatenate((np.random.rand(100)-1, np.random.rand(100))) diff --git a/examples/information_density.py b/examples/information_density.py index c091061..a5d8ad1 100644 --- a/examples/information_density.py +++ b/examples/information_density.py @@ -1,5 +1,4 @@ import matplotlib.pyplot as plt - from modAL.density import information_density from sklearn.datasets import make_blobs diff --git a/examples/keras_integration.py b/examples/keras_integration.py index 0e27b75..abf4dca 100644 --- a/examples/keras_integration.py +++ b/examples/keras_integration.py @@ -6,8 +6,8 @@ import keras import numpy as np from keras.datasets import mnist +from keras.layers import Conv2D, Dense, Dropout, Flatten, MaxPooling2D from keras.models import Sequential -from keras.layers import Dense, Dropout, Flatten, Conv2D, MaxPooling2D from keras.wrappers.scikit_learn import KerasClassifier from modAL.models import ActiveLearner diff --git a/examples/multilabel_svm.py b/examples/multilabel_svm.py index ab4eb3c..7d34ddf 100644 --- a/examples/multilabel_svm.py +++ b/examples/multilabel_svm.py @@ -1,9 +1,7 @@ -import numpy as np import matplotlib.pyplot as plt - +import numpy as np from modAL.models import ActiveLearner from modAL.multilabel import * - from sklearn.multiclass import OneVsRestClassifier from sklearn.svm import SVC @@ -33,4 +31,4 @@ ) query_idx, query_inst = 
learner.query(X_pool) -learner.teach(X_pool[query_idx], y_pool[query_idx]) \ No newline at end of file +learner.teach(X_pool[query_idx], y_pool[query_idx]) diff --git a/examples/pool-based_sampling.py b/examples/pool-based_sampling.py index cc89c4b..aedae0a 100644 --- a/examples/pool-based_sampling.py +++ b/examples/pool-based_sampling.py @@ -4,12 +4,12 @@ For its scikit-learn interface, see http://scikit-learn.org/stable/modules/generated/sklearn.datasets.load_iris.html """ -import numpy as np import matplotlib.pyplot as plt -from sklearn.decomposition import PCA +import numpy as np +from modAL.models import ActiveLearner from sklearn.datasets import load_iris +from sklearn.decomposition import PCA from sklearn.neighbors import KNeighborsClassifier -from modAL.models import ActiveLearner # loading the iris dataset iris = load_iris() @@ -65,4 +65,4 @@ prediction = learner.predict(iris['data']) plt.scatter(x=pca[:, 0], y=pca[:, 1], c=prediction, cmap='viridis', s=50) plt.title('Classification accuracy after %i queries: %f' % (n_queries, learner.score(iris['data'], iris['target']))) - plt.show() \ No newline at end of file + plt.show() diff --git a/examples/pytorch_integration.py b/examples/pytorch_integration.py index b601caa..5ee4e9e 100644 --- a/examples/pytorch_integration.py +++ b/examples/pytorch_integration.py @@ -4,16 +4,14 @@ For more info, see https://skorch.readthedocs.io/en/stable/ """ -import torch import numpy as np - +import torch +from modAL.models import ActiveLearner +from skorch import NeuralNetClassifier from torch import nn from torch.utils.data import DataLoader -from torchvision.transforms import ToTensor from torchvision.datasets import MNIST -from skorch import NeuralNetClassifier - -from modAL.models import ActiveLearner +from torchvision.transforms import ToTensor # build class for the skorch API diff --git a/examples/pytorch_mc_dropout.py b/examples/pytorch_mc_dropout.py index 0e0befa..dab92d5 100644 --- a/examples/pytorch_mc_dropout.py 
+++ b/examples/pytorch_mc_dropout.py @@ -2,21 +2,17 @@ In this file the basic ModAL PyTorch DeepActiveLearner workflow is explained through an example on the MNIST dataset and the MC-Dropout-Bald query strategy. """ -import sys -import os +import numpy as np import torch -from torch import nn -from skorch import NeuralNetClassifier - -from modAL.models import DeepActiveLearner - # import of query strategies -from modAL.dropout import mc_dropout_bald, mc_dropout_mean_st - -import numpy as np +from modAL.dropout import mc_dropout_bald +from modAL.models import DeepActiveLearner +from skorch import NeuralNetClassifier +from torch import nn from torch.utils.data import DataLoader -from torchvision.transforms import ToTensor from torchvision.datasets import MNIST +from torchvision.transforms import ToTensor + # Standard Pytorch Model (Visit the PyTorch documentation for more details) class Torch_Model(nn.Module): diff --git a/examples/query_by_committee.py b/examples/query_by_committee.py index 746df00..439076d 100644 --- a/examples/query_by_committee.py +++ b/examples/query_by_committee.py @@ -1,10 +1,11 @@ -import numpy as np -import matplotlib.pyplot as plt from copy import deepcopy -from sklearn.decomposition import PCA + +import matplotlib.pyplot as plt +import numpy as np +from modAL.models import ActiveLearner, Committee from sklearn.datasets import load_iris +from sklearn.decomposition import PCA from sklearn.ensemble import RandomForestClassifier -from modAL.models import ActiveLearner, Committee # loading the iris dataset iris = load_iris() diff --git a/examples/ranked_batch_mode.py b/examples/ranked_batch_mode.py index e9256d5..d9e4ca8 100644 --- a/examples/ranked_batch_mode.py +++ b/examples/ranked_batch_mode.py @@ -1,13 +1,13 @@ -import numpy as np +from functools import partial + import matplotlib as mpl import matplotlib.pyplot as plt +import numpy as np +from modAL.batch import uncertainty_batch_sampling +from modAL.models import ActiveLearner from 
sklearn.datasets import load_iris from sklearn.decomposition import PCA from sklearn.neighbors import KNeighborsClassifier -from functools import partial - -from modAL.batch import uncertainty_batch_sampling -from modAL.models import ActiveLearner # Set our RNG for reproducibility. RANDOM_STATE_SEED = 123 @@ -161,4 +161,4 @@ )) ax.legend(loc='lower right') - plt.show() \ No newline at end of file + plt.show() diff --git a/examples/runtime_comparison.py b/examples/runtime_comparison.py index f6fdf13..551396e 100644 --- a/examples/runtime_comparison.py +++ b/examples/runtime_comparison.py @@ -1,25 +1,20 @@ -import numpy as np - from time import time -from sklearn.datasets import load_iris - +import numpy as np from acton.acton import main as acton_main - -from alp.active_learning.active_learning import ActiveLearner as ActiveLearnerALP - +from alp.active_learning.active_learning import \ + ActiveLearner as ActiveLearnerALP from libact.base.dataset import Dataset from libact.labelers import IdealLabeler -from libact.query_strategies import UncertaintySampling, QueryByCommittee +from libact.models.logistic_regression import \ + LogisticRegression as LogisticRegressionLibact +from libact.query_strategies import QueryByCommittee, UncertaintySampling from libact.query_strategies.multiclass.expected_error_reduction import EER -from libact.models.logistic_regression import LogisticRegression as LogisticRegressionLibact - -from modAL.models import ActiveLearner, Committee from modAL.expected_error import expected_error_reduction - +from modAL.models import ActiveLearner, Committee +from sklearn.datasets import load_iris from sklearn.linear_model import LogisticRegression - runtime = {} diff --git a/examples/shape_learning.py b/examples/shape_learning.py index d856673..87b44dd 100644 --- a/examples/shape_learning.py +++ b/examples/shape_learning.py @@ -5,11 +5,12 @@ the scikit-learn implementation of the kNN classifier algorithm. 
""" -import numpy as np -import matplotlib.pyplot as plt from copy import deepcopy -from sklearn.ensemble import RandomForestClassifier + +import matplotlib.pyplot as plt +import numpy as np from modAL.models import ActiveLearner +from sklearn.ensemble import RandomForestClassifier # creating the image im_width = 500 diff --git a/examples/sklearn_workflow.py b/examples/sklearn_workflow.py index 12b175b..247ad1c 100644 --- a/examples/sklearn_workflow.py +++ b/examples/sklearn_workflow.py @@ -1,9 +1,9 @@ from modAL.models import ActiveLearner -from sklearn.ensemble import RandomForestClassifier from sklearn.datasets import load_iris +from sklearn.ensemble import RandomForestClassifier from sklearn.model_selection import cross_val_score X_train, y_train = load_iris().data, load_iris().target learner = ActiveLearner(estimator=RandomForestClassifier()) -scores = cross_val_score(learner, X_train, y_train, cv=10) \ No newline at end of file +scores = cross_val_score(learner, X_train, y_train, cv=10) diff --git a/examples/stream-based_sampling.py b/examples/stream-based_sampling.py index 481a09c..7a6dd77 100644 --- a/examples/stream-based_sampling.py +++ b/examples/stream-based_sampling.py @@ -2,11 +2,11 @@ In this example the use of ActiveLearner is demonstrated in a stream-based sampling setting. 
""" -import numpy as np import matplotlib.pyplot as plt -from sklearn.ensemble import RandomForestClassifier +import numpy as np from modAL.models import ActiveLearner from modAL.uncertainty import classifier_uncertainty +from sklearn.ensemble import RandomForestClassifier # creating the image im_width = 500 diff --git a/modAL/__init__.py b/modAL/__init__.py index b34800b..4231550 100644 --- a/modAL/__init__.py +++ b/modAL/__init__.py @@ -1,3 +1,3 @@ from .models import ActiveLearner, Committee, CommitteeRegressor -__all__ = ['ActiveLearner', 'Committee', 'CommitteeRegressor'] \ No newline at end of file +__all__ = ['ActiveLearner', 'Committee', 'CommitteeRegressor'] diff --git a/modAL/acquisition.py b/modAL/acquisition.py index 4d2be85..8aa1fac 100644 --- a/modAL/acquisition.py +++ b/modAL/acquisition.py @@ -1,16 +1,15 @@ """ Acquisition functions for Bayesian optimization. """ -from typing import Tuple import numpy as np -from scipy.stats import norm from scipy.special import ndtr +from scipy.stats import norm from sklearn.exceptions import NotFittedError -from modAL.utils.selection import multi_argmax -from modAL.utils.data import modALinput from modAL.models.base import BaseLearner +from modAL.utils.data import modALinput +from modAL.utils.selection import multi_argmax def PI(mean, std, max_val, tradeoff): diff --git a/modAL/batch.py b/modAL/batch.py index c860966..d85afed 100644 --- a/modAL/batch.py +++ b/modAL/batch.py @@ -6,11 +6,12 @@ import numpy as np import scipy.sparse as sp -from sklearn.metrics.pairwise import pairwise_distances, pairwise_distances_argmin_min +from sklearn.metrics.pairwise import (pairwise_distances, + pairwise_distances_argmin_min) -from modAL.utils.data import data_vstack, modALinput, data_shape from modAL.models.base import BaseCommittee, BaseLearner from modAL.uncertainty import classifier_uncertainty +from modAL.utils.data import data_shape, data_vstack, modALinput def select_cold_start_instance(X: modALinput, diff --git 
a/modAL/disagreement.py b/modAL/disagreement.py index 7789135..22430b4 100644 --- a/modAL/disagreement.py +++ b/modAL/disagreement.py @@ -2,16 +2,15 @@ Disagreement measures and disagreement based query strategies for the Committee model. """ from collections import Counter -from typing import Tuple import numpy as np from scipy.stats import entropy -from sklearn.exceptions import NotFittedError from sklearn.base import BaseEstimator +from sklearn.exceptions import NotFittedError +from modAL.models.base import BaseCommittee from modAL.utils.data import modALinput from modAL.utils.selection import multi_argmax, shuffled_argmax -from modAL.models.base import BaseCommittee def vote_entropy(committee: BaseCommittee, X: modALinput, **predict_proba_kwargs) -> np.ndarray: diff --git a/modAL/dropout.py b/modAL/dropout.py index 92aca53..c6c9cb1 100644 --- a/modAL/dropout.py +++ b/modAL/dropout.py @@ -1,19 +1,15 @@ -import numpy as np -import torch from collections.abc import Mapping from typing import Callable - -from sklearn.base import BaseEstimator -from sklearn.preprocessing import normalize - +import numpy as np +import torch from scipy.special import entr +from sklearn.base import BaseEstimator +from skorch.utils import to_numpy from modAL.utils.data import modALinput from modAL.utils.selection import multi_argmax, shuffled_argmax -from skorch.utils import to_numpy - def default_logits_adaptor(input_tensor: torch.tensor, samples: modALinput): # default Callable parameter for get_predictions diff --git a/modAL/expected_error.py b/modAL/expected_error.py index 01b139c..d7b3611 100644 --- a/modAL/expected_error.py +++ b/modAL/expected_error.py @@ -5,14 +5,14 @@ from typing import Tuple import numpy as np - from sklearn.base import clone from sklearn.exceptions import NotFittedError from modAL.models import ActiveLearner -from modAL.utils.data import modALinput, data_vstack, enumerate_data, drop_rows, data_shape, add_row -from modAL.utils.selection import multi_argmax, 
multi_argmin, shuffled_argmax, shuffled_argmin -from modAL.uncertainty import _proba_uncertainty, _proba_entropy +from modAL.uncertainty import _proba_entropy, _proba_uncertainty +from modAL.utils.data import (add_row, data_shape, data_vstack, drop_rows, + enumerate_data, modALinput) +from modAL.utils.selection import multi_argmin, shuffled_argmin def expected_error_reduction(learner: ActiveLearner, X: modALinput, loss: str = 'binary', diff --git a/modAL/models/__init__.py b/modAL/models/__init__.py index 2c2bd87..347716e 100644 --- a/modAL/models/__init__.py +++ b/modAL/models/__init__.py @@ -1,6 +1,7 @@ -from .learners import ActiveLearner, DeepActiveLearner, BayesianOptimizer, Committee, CommitteeRegressor +from .learners import (ActiveLearner, BayesianOptimizer, Committee, + CommitteeRegressor, DeepActiveLearner) __all__ = [ 'ActiveLearner', 'DeepActiveLearner', 'BayesianOptimizer', 'Committee', 'CommitteeRegressor' -] \ No newline at end of file +] diff --git a/modAL/models/base.py b/modAL/models/base.py index d767b6c..f371252 100644 --- a/modAL/models/base.py +++ b/modAL/models/base.py @@ -6,17 +6,15 @@ import abc import sys import warnings -from typing import Union, Callable, Optional, Tuple, List, Iterator, Any +from typing import Any, Callable, Iterator, List, Tuple, Union import numpy as np +import scipy.sparse as sp +from modAL.utils.data import data_hstack, modALinput, retrieve_rows from sklearn.base import BaseEstimator from sklearn.ensemble._base import _BaseHeterogeneousEnsemble from sklearn.pipeline import Pipeline -import scipy.sparse as sp - -from modAL.utils.data import data_hstack, modALinput, retrieve_rows - if sys.version_info >= (3, 4): ABC = abc.ABC else: @@ -177,7 +175,7 @@ def query(self, X_pool, return_metrics: bool = False, *query_args, **query_kwarg query_result, query_metrics = self.query_strategy( self, X_pool, *query_args, **query_kwargs) - except ValueError: + except: query_metrics = None query_result = self.query_strategy( self, 
X_pool, *query_args, **query_kwargs) @@ -319,7 +317,7 @@ def query(self, X_pool, return_metrics: bool = False, *query_args, **query_kwarg query_result, query_metrics = self.query_strategy( self, X_pool, *query_args, **query_kwargs) - except ValueError: + except: query_metrics = None query_result = self.query_strategy( self, X_pool, *query_args, **query_kwargs) diff --git a/modAL/models/learners.py b/modAL/models/learners.py index 21d5181..b7dac72 100644 --- a/modAL/models/learners.py +++ b/modAL/models/learners.py @@ -1,20 +1,15 @@ -import numpy as np - -from typing import Callable, Optional, Tuple, List, Any +from typing import Any, Callable, List, Optional, Tuple +import numpy as np +from modAL.acquisition import max_EI +from modAL.disagreement import max_std_sampling, vote_entropy_sampling +from modAL.models.base import BaseCommittee, BaseLearner +from modAL.uncertainty import uncertainty_sampling +from modAL.utils.data import data_vstack, modALinput, retrieve_rows +from modAL.utils.validation import check_class_labels, check_class_proba from sklearn.base import BaseEstimator from sklearn.metrics import accuracy_score - from sklearn.utils import check_X_y -from modAL.models.base import BaseLearner, BaseCommittee -from modAL.utils.validation import check_class_labels, check_class_proba -from modAL.utils.data import modALinput, retrieve_rows, data_vstack -from modAL.uncertainty import uncertainty_sampling -from modAL.disagreement import vote_entropy_sampling, max_std_sampling -from modAL.acquisition import max_EI - -from skorch.utils import to_numpy - """ Classes for active learning algorithms @@ -272,10 +267,10 @@ def num_epochs(self, value): can be changed at any time, even after the model was trained. 
""" if isinstance(value, int): - if 0 < value <= 100: + if 0 < value: self.estimator.max_epochs = value else: - raise ValueError("num_epochs must be in range 0 < x <= 100") + raise ValueError("num_epochs must be larger than zero") else: raise TypeError("num_epochs must be of type integer!") @@ -716,4 +711,4 @@ def vote(self, X: modALinput, **predict_kwargs): for learner_idx, learner in enumerate(self.learner_list): prediction[:, learner_idx] = learner.predict(X, **predict_kwargs).reshape(-1, ) - return prediction \ No newline at end of file + return prediction diff --git a/modAL/multilabel.py b/modAL/multilabel.py index 186ed08..c908674 100644 --- a/modAL/multilabel.py +++ b/modAL/multilabel.py @@ -1,13 +1,12 @@ -import numpy as np +from typing import Optional -from sklearn.base import BaseEstimator +import numpy as np from sklearn.multiclass import OneVsRestClassifier from modAL.models import ActiveLearner from modAL.utils.data import modALinput -from modAL.utils.selection import multi_argmax, multi_argmin, shuffled_argmax, shuffled_argmin -from typing import Tuple, Optional -from itertools import combinations +from modAL.utils.selection import (multi_argmax, multi_argmin, shuffled_argmax, + shuffled_argmin) def _SVM_loss(multiclass_classifier: ActiveLearner, diff --git a/modAL/uncertainty.py b/modAL/uncertainty.py index f41206c..d0f7b37 100644 --- a/modAL/uncertainty.py +++ b/modAL/uncertainty.py @@ -1,15 +1,15 @@ """ Uncertainty measures and uncertainty based sampling strategies for the active learning models. 
""" -from typing import Tuple import numpy as np from scipy.stats import entropy -from sklearn.exceptions import NotFittedError from sklearn.base import BaseEstimator +from sklearn.exceptions import NotFittedError from modAL.utils.data import modALinput -from modAL.utils.selection import multi_argmax, multi_argmin, shuffled_argmax, shuffled_argmin +from modAL.utils.selection import (multi_argmax, multi_argmin, shuffled_argmax, + shuffled_argmin) def _proba_uncertainty(proba: np.ndarray) -> np.ndarray: diff --git a/modAL/utils/__init__.py b/modAL/utils/__init__.py index 3b6501c..2f3bc12 100644 --- a/modAL/utils/__init__.py +++ b/modAL/utils/__init__.py @@ -1,4 +1,5 @@ -from .combination import make_linear_combination, make_product, make_query_strategy +from .combination import (make_linear_combination, make_product, + make_query_strategy) from .data import data_vstack from .selection import multi_argmax, weighted_random from .validation import check_class_labels, check_class_proba @@ -8,4 +9,4 @@ 'data_vstack', 'multi_argmax', 'weighted_random', 'check_class_labels', 'check_class_proba' -] \ No newline at end of file +] diff --git a/modAL/utils/combination.py b/modAL/utils/combination.py index 45eee2f..eb2b4d2 100644 --- a/modAL/utils/combination.py +++ b/modAL/utils/combination.py @@ -1,9 +1,8 @@ from typing import Callable, Optional, Sequence, Tuple import numpy as np -from sklearn.base import BaseEstimator - from modAL.utils.data import modALinput +from sklearn.base import BaseEstimator def make_linear_combination(*functions: Callable, weights: Optional[Sequence] = None) -> Callable: diff --git a/modAL/utils/data.py b/modAL/utils/data.py index 12d8a32..ee446c4 100644 --- a/modAL/utils/data.py +++ b/modAL/utils/data.py @@ -1,10 +1,9 @@ -from typing import Union, List, Sequence +from typing import List, Sequence, Union import numpy as np -import torch import pandas as pd import scipy.sparse as sp - +import torch modALinput = Union[sp.csr_matrix, pd.DataFrame, 
np.ndarray, list] diff --git a/modAL/utils/validation.py b/modAL/utils/validation.py index e5763de..93667db 100644 --- a/modAL/utils/validation.py +++ b/modAL/utils/validation.py @@ -1,8 +1,8 @@ from typing import Sequence import numpy as np -from sklearn.exceptions import NotFittedError from sklearn.base import BaseEstimator +from sklearn.exceptions import NotFittedError def check_class_labels(*args: BaseEstimator) -> bool: diff --git a/setup.py b/setup.py index 3f48835..4e79aca 100644 --- a/setup.py +++ b/setup.py @@ -1,4 +1,4 @@ -from setuptools import setup, find_packages +from setuptools import find_packages, setup setup( name='modAL', diff --git a/tests/core_tests.py b/tests/core_tests.py index e00a833..e39cf4b 100644 --- a/tests/core_tests.py +++ b/tests/core_tests.py @@ -1,46 +1,42 @@ import random import unittest -import numpy as np -import pandas as pd - -import mock +from collections import namedtuple +from copy import deepcopy +from itertools import chain, product from unittest.mock import MagicMock -import modAL.models.base -import modAL.models.learners -import modAL.utils.selection -import modAL.utils.validation -import modAL.utils.combination import modAL.acquisition import modAL.batch import modAL.density import modAL.disagreement +import modAL.dropout import modAL.expected_error +import modAL.models.base +import modAL.models.learners import modAL.multilabel import modAL.uncertainty -import modAL.dropout - -from copy import deepcopy -from itertools import chain, product -from collections import namedtuple - +import modAL.utils.combination +import modAL.utils.selection +import modAL.utils.validation +import numpy as np +import pandas as pd +import torch +from scipy import sparse as sp +from scipy.special import ndtr +from scipy.stats import entropy, norm from sklearn.ensemble import RandomForestClassifier -from sklearn.gaussian_process import GaussianProcessRegressor from sklearn.exceptions import NotFittedError +from sklearn.feature_extraction.text 
import CountVectorizer +from sklearn.gaussian_process import GaussianProcessRegressor from sklearn.metrics import confusion_matrix -from sklearn.svm import SVC from sklearn.multiclass import OneVsRestClassifier from sklearn.pipeline import make_pipeline from sklearn.preprocessing import FunctionTransformer -from sklearn.feature_extraction.text import CountVectorizer -from scipy.stats import entropy, norm -from scipy.special import ndtr -from scipy import sparse as sp - -import torch +from sklearn.svm import SVC +from skorch import NeuralNetClassifier from torch import nn -from skorch import NeuralNetClassifier +import mock Test = namedtuple('Test', ['input', 'output']) @@ -1675,19 +1671,19 @@ def test_strategies(self): class TestExamples(unittest.TestCase): def test_examples(self): - import example_tests.multidimensional_data import example_tests.active_regression import example_tests.bagging + import example_tests.bayesian_optimization + import example_tests.custom_query_strategies import example_tests.ensemble import example_tests.ensemble_regression + import example_tests.information_density + import example_tests.multidimensional_data import example_tests.pool_based_sampling import example_tests.query_by_committee + import example_tests.ranked_batch_mode import example_tests.shape_learning import example_tests.stream_based_sampling - import example_tests.custom_query_strategies - import example_tests.information_density - import example_tests.bayesian_optimization - import example_tests.ranked_batch_mode if __name__ == '__main__': diff --git a/tests/example_tests/active_regression.py b/tests/example_tests/active_regression.py index 72d43f3..4306a3e 100644 --- a/tests/example_tests/active_regression.py +++ b/tests/example_tests/active_regression.py @@ -3,10 +3,10 @@ """ import numpy as np -from sklearn.gaussian_process import GaussianProcessRegressor -from sklearn.gaussian_process.kernels import WhiteKernel, RBF -from modAL.models import ActiveLearner from 
modAL.disagreement import max_std_sampling +from modAL.models import ActiveLearner +from sklearn.gaussian_process import GaussianProcessRegressor +from sklearn.gaussian_process.kernels import RBF, WhiteKernel np.random.seed(0) diff --git a/tests/example_tests/bagging.py b/tests/example_tests/bagging.py index 4830150..55a7d24 100644 --- a/tests/example_tests/bagging.py +++ b/tests/example_tests/bagging.py @@ -2,10 +2,11 @@ This example shows how to build models with bagging using the Committee model. """ -import numpy as np from itertools import product -from sklearn.neighbors import KNeighborsClassifier + +import numpy as np from modAL.models import ActiveLearner, Committee +from sklearn.neighbors import KNeighborsClassifier np.random.seed(0) diff --git a/tests/example_tests/bayesian_optimization.py b/tests/example_tests/bayesian_optimization.py index 981c6b0..8950c52 100644 --- a/tests/example_tests/bayesian_optimization.py +++ b/tests/example_tests/bayesian_optimization.py @@ -1,10 +1,11 @@ -import numpy as np from functools import partial + +import numpy as np +from modAL.acquisition import (max_EI, max_PI, max_UCB, optimizer_EI, + optimizer_PI, optimizer_UCB) +from modAL.models import BayesianOptimizer from sklearn.gaussian_process import GaussianProcessRegressor from sklearn.gaussian_process.kernels import Matern -from modAL.models import BayesianOptimizer -from modAL.acquisition import optimizer_PI, optimizer_EI, optimizer_UCB, max_PI, max_EI, max_UCB - # generating the data X = np.linspace(0, 20, 1000).reshape(-1, 1) diff --git a/tests/example_tests/custom_query_strategies.py b/tests/example_tests/custom_query_strategies.py index 441814a..c8a94f1 100644 --- a/tests/example_tests/custom_query_strategies.py +++ b/tests/example_tests/custom_query_strategies.py @@ -1,14 +1,12 @@ import numpy as np - +from modAL.models import ActiveLearner +from modAL.uncertainty import classifier_margin, classifier_uncertainty from modAL.utils.combination import 
make_linear_combination, make_product from modAL.utils.selection import multi_argmax -from modAL.uncertainty import classifier_uncertainty, classifier_margin -from modAL.models import ActiveLearner from sklearn.datasets import make_blobs from sklearn.gaussian_process import GaussianProcessClassifier from sklearn.gaussian_process.kernels import RBF - # generating the data centers = np.asarray([[-2, 3], [0.5, 5], [1, 1.5]]) X, y = make_blobs( diff --git a/tests/example_tests/ensemble.py b/tests/example_tests/ensemble.py index 35c36df..c7e3193 100644 --- a/tests/example_tests/ensemble.py +++ b/tests/example_tests/ensemble.py @@ -1,7 +1,8 @@ -import numpy as np from itertools import product -from sklearn.ensemble import RandomForestClassifier + +import numpy as np from modAL.models import ActiveLearner, Committee +from sklearn.ensemble import RandomForestClassifier np.random.seed(0) diff --git a/tests/example_tests/ensemble_regression.py b/tests/example_tests/ensemble_regression.py index 1e9e1e4..1082fb3 100644 --- a/tests/example_tests/ensemble_regression.py +++ b/tests/example_tests/ensemble_regression.py @@ -1,8 +1,8 @@ import numpy as np -from sklearn.gaussian_process import GaussianProcessRegressor -from sklearn.gaussian_process.kernels import WhiteKernel, RBF -from modAL.models import ActiveLearner, CommitteeRegressor from modAL.disagreement import max_std_sampling +from modAL.models import ActiveLearner, CommitteeRegressor +from sklearn.gaussian_process import GaussianProcessRegressor +from sklearn.gaussian_process.kernels import RBF, WhiteKernel np.random.seed(0) diff --git a/tests/example_tests/information_density.py b/tests/example_tests/information_density.py index 7e82c76..43c8f39 100644 --- a/tests/example_tests/information_density.py +++ b/tests/example_tests/information_density.py @@ -1,6 +1,6 @@ -from modAL.density import similarize_distance, information_density -from sklearn.datasets import make_blobs +from modAL.density import information_density, 
similarize_distance from scipy.spatial.distance import euclidean +from sklearn.datasets import make_blobs X, y = make_blobs(n_features=2, n_samples=10, centers=3, random_state=0, cluster_std=0.7) diff --git a/tests/example_tests/multidimensional_data.py b/tests/example_tests/multidimensional_data.py index e87e319..e491765 100644 --- a/tests/example_tests/multidimensional_data.py +++ b/tests/example_tests/multidimensional_data.py @@ -1,10 +1,9 @@ import numpy as np -from sklearn.base import BaseEstimator - -from modAL.models import ActiveLearner -from modAL.uncertainty import margin_sampling, entropy_sampling from modAL.batch import uncertainty_batch_sampling from modAL.expected_error import expected_error_reduction +from modAL.models import ActiveLearner +from modAL.uncertainty import entropy_sampling, margin_sampling +from sklearn.base import BaseEstimator class MockClassifier(BaseEstimator): diff --git a/tests/example_tests/multilabel_svm.py b/tests/example_tests/multilabel_svm.py index ea91dae..96ed4bd 100644 --- a/tests/example_tests/multilabel_svm.py +++ b/tests/example_tests/multilabel_svm.py @@ -1,8 +1,6 @@ import numpy as np - from modAL.models import ActiveLearner from modAL.multilabel import SVM_binary_minimum - from sklearn.multiclass import OneVsRestClassifier from sklearn.svm import LinearSVC @@ -25,4 +23,4 @@ for idx in range(n_queries): query_idx, query_inst = learner.query(X_pool) learner.teach(X_pool[query_idx].reshape(1, -1), y_pool[query_idx].reshape(1, -1)) - X_pool, y_pool = np.delete(X_pool, query_idx, axis=0), np.delete(y_pool, query_idx, axis=0) \ No newline at end of file + X_pool, y_pool = np.delete(X_pool, query_idx, axis=0), np.delete(y_pool, query_idx, axis=0) diff --git a/tests/example_tests/pool_based_sampling.py b/tests/example_tests/pool_based_sampling.py index 8dab142..1fa052c 100644 --- a/tests/example_tests/pool_based_sampling.py +++ b/tests/example_tests/pool_based_sampling.py @@ -5,9 +5,9 @@ """ import numpy as np +from 
modAL.models import ActiveLearner from sklearn.datasets import load_iris from sklearn.neighbors import KNeighborsClassifier -from modAL.models import ActiveLearner np.random.seed(0) diff --git a/tests/example_tests/query_by_committee.py b/tests/example_tests/query_by_committee.py index b974483..e711faf 100644 --- a/tests/example_tests/query_by_committee.py +++ b/tests/example_tests/query_by_committee.py @@ -1,8 +1,9 @@ -import numpy as np from copy import deepcopy + +import numpy as np +from modAL.models import ActiveLearner, Committee from sklearn.datasets import load_iris from sklearn.ensemble import RandomForestClassifier -from modAL.models import ActiveLearner, Committee np.random.seed(0) diff --git a/tests/example_tests/ranked_batch_mode.py b/tests/example_tests/ranked_batch_mode.py index 949957d..48ac2f0 100644 --- a/tests/example_tests/ranked_batch_mode.py +++ b/tests/example_tests/ranked_batch_mode.py @@ -1,11 +1,11 @@ -import numpy as np -from sklearn.datasets import load_iris -from sklearn.decomposition import PCA -from sklearn.neighbors import KNeighborsClassifier from functools import partial +import numpy as np from modAL.batch import uncertainty_batch_sampling from modAL.models import ActiveLearner +from sklearn.datasets import load_iris +from sklearn.decomposition import PCA +from sklearn.neighbors import KNeighborsClassifier # Set our RNG for reproducibility. RANDOM_STATE_SEED = 123 @@ -76,4 +76,4 @@ # Calculate and report our model's accuracy. model_accuracy = learner.score(X_raw, y_raw) -predictions = learner.predict(X_raw) \ No newline at end of file +predictions = learner.predict(X_raw) diff --git a/tests/example_tests/shape_learning.py b/tests/example_tests/shape_learning.py index f76a07a..17dd2cd 100644 --- a/tests/example_tests/shape_learning.py +++ b/tests/example_tests/shape_learning.py @@ -5,10 +5,11 @@ the scikit-learn implementation of the kNN classifier algorithm. 
""" -import numpy as np from copy import deepcopy -from sklearn.ensemble import RandomForestClassifier + +import numpy as np from modAL.models import ActiveLearner +from sklearn.ensemble import RandomForestClassifier np.random.seed(0) diff --git a/tests/example_tests/stream_based_sampling.py b/tests/example_tests/stream_based_sampling.py index d306f61..5e603d7 100644 --- a/tests/example_tests/stream_based_sampling.py +++ b/tests/example_tests/stream_based_sampling.py @@ -3,9 +3,9 @@ """ import numpy as np -from sklearn.ensemble import RandomForestClassifier from modAL.models import ActiveLearner from modAL.uncertainty import classifier_uncertainty +from sklearn.ensemble import RandomForestClassifier np.random.seed(0) From c15d44abf5ceae41e0e8e7762fc9a7d3268c708e Mon Sep 17 00:00:00 2001 From: Max Keller Date: Thu, 21 Oct 2021 21:14:23 +0200 Subject: [PATCH 173/182] Remove torch dependency from data.py --- .travis.yml | 2 +- modAL/utils/data.py | 18 ++---------------- rtd_requirements.txt | 1 - setup.py | 2 +- tests/core_tests.py | 6 ------ 5 files changed, 4 insertions(+), 25 deletions(-) diff --git a/.travis.yml b/.travis.yml index a84f0a9..b3c7647 100644 --- a/.travis.yml +++ b/.travis.yml @@ -22,7 +22,7 @@ after_success: matrix: include: install: - - pip install numpy==1.20 scikit-learn==0.18 scipy==0.18 + - pip install numpy==1.20 scikit-learn==0.18 scipy==0.18 torch==1.8.1 - pip install codecov - pip install coverage - pip install . 
diff --git a/modAL/utils/data.py b/modAL/utils/data.py index ee446c4..30a1127 100644 --- a/modAL/utils/data.py +++ b/modAL/utils/data.py @@ -3,7 +3,6 @@ import numpy as np import pandas as pd import scipy.sparse as sp -import torch modALinput = Union[sp.csr_matrix, pd.DataFrame, np.ndarray, list] @@ -26,8 +25,6 @@ def data_vstack(blocks: Sequence[modALinput]) -> modALinput: return np.concatenate(blocks) elif isinstance(blocks[0], list): return np.concatenate(blocks).tolist() - elif torch.is_tensor(blocks[0]): - return torch.cat(blocks) raise TypeError('%s datatype is not supported' % type(blocks[0])) @@ -50,8 +47,6 @@ def data_hstack(blocks: Sequence[modALinput]) -> modALinput: return np.hstack(blocks) elif isinstance(blocks[0], list): return np.hstack(blocks).tolist() - elif torch.is_tensor(blocks[0]): - return torch.cat(blocks, dim=1) TypeError('%s datatype is not supported' % type(blocks[0])) @@ -65,8 +60,6 @@ def add_row(X: modALinput, row: modALinput): row] """ if isinstance(X, np.ndarray): return np.vstack((X, row)) - elif torch.is_tensor(X): - return torch.cat((X, row)) elif isinstance(X, list): return np.vstack((X, row)).tolist() @@ -107,8 +100,6 @@ def retrieve_rows(X: modALinput, return X_return elif isinstance(X, np.ndarray): return X[I] - elif torch.is_tensor(X): - return X[I] raise TypeError('%s datatype is not supported' % type(X)) @@ -128,9 +119,6 @@ def drop_rows(X: modALinput, return np.delete(X, I, axis=0) elif isinstance(X, list): return np.delete(X, I, axis=0).tolist() - elif torch.is_tensor(X): - return X[[True if row not in I else False - for row in range(X.size(0))]] raise TypeError('%s datatype is not supported' % type(X)) @@ -149,8 +137,8 @@ def enumerate_data(X: modALinput): return enumerate(X.tocsr()) elif isinstance(X, pd.DataFrame): return X.iterrows() - elif isinstance(X, np.ndarray) or isinstance(X, list) or torch.is_tensor(X): - # numpy arrays, torch tensors and lists can readily be enumerated + elif isinstance(X, np.ndarray) or 
isinstance(X, list): + # numpy arrays and lists can readily be enumerated return enumerate(X) raise TypeError('%s datatype is not supported' % type(X)) @@ -165,7 +153,5 @@ def data_shape(X: modALinput): return X.shape elif isinstance(X, list): return np.array(X).shape - elif torch.is_tensor(X): - return tuple(X.size()) raise TypeError('%s datatype is not supported' % type(X)) diff --git a/rtd_requirements.txt b/rtd_requirements.txt index 685089f..db0bd81 100644 --- a/rtd_requirements.txt +++ b/rtd_requirements.txt @@ -5,4 +5,3 @@ ipykernel nbsphinx pandas skorch -torch diff --git a/setup.py b/setup.py index 4e79aca..69c1ad2 100644 --- a/setup.py +++ b/setup.py @@ -11,5 +11,5 @@ packages=['modAL', 'modAL.models', 'modAL.utils'], classifiers=['Development Status :: 4 - Beta'], install_requires=['numpy==1.20.0', 'scikit-learn>=0.18', - 'scipy>=0.18', 'pandas>=1.1.0', 'skorch==0.9.0', 'torch>=1.8.1'], + 'scipy>=0.18', 'pandas>=1.1.0', 'skorch==0.9.0'], ) diff --git a/tests/core_tests.py b/tests/core_tests.py index e39cf4b..169ea90 100644 --- a/tests/core_tests.py +++ b/tests/core_tests.py @@ -181,12 +181,6 @@ def test_data_vstack(self): self.assertEqual((modAL.utils.data.data_vstack( (a, b)) != sp.vstack((a, b))).sum(), 0) - # pytorch tensors - a, b = torch.randn(n_samples, n_features), torch.randn( - n_samples, n_features) - self.assertTrue( - torch.equal(modAL.utils.data.data_vstack((a, b)), torch.cat((a, b)))) - # lists a, b = np.random.rand(n_samples, n_features).tolist(), np.random.rand( n_samples, n_features).tolist() From 87f64be9678eff755f4cff2f9ba79379a2f63108 Mon Sep 17 00:00:00 2001 From: cosmic-cortex Date: Mon, 29 Nov 2021 11:05:27 +0100 Subject: [PATCH 174/182] torch data utils added, torch is an optional dependency of the module --- modAL/utils/data.py | 100 ++++++++++++++++++++++++++++---------------- tests/core_tests.py | 7 ++++ 2 files changed, 70 insertions(+), 37 deletions(-) diff --git a/modAL/utils/data.py b/modAL/utils/data.py index 
30a1127..3e707ff 100644 --- a/modAL/utils/data.py +++ b/modAL/utils/data.py @@ -4,6 +4,12 @@ import pandas as pd import scipy.sparse as sp +try: + import torch +except: + pass + + modALinput = Union[sp.csr_matrix, pd.DataFrame, np.ndarray, list] @@ -26,7 +32,13 @@ def data_vstack(blocks: Sequence[modALinput]) -> modALinput: elif isinstance(blocks[0], list): return np.concatenate(blocks).tolist() - raise TypeError('%s datatype is not supported' % type(blocks[0])) + try: + if torch.is_tensor(blocks[0]): + return torch.cat(blocks) + except: + pass + + raise TypeError("%s datatype is not supported" % type(blocks[0])) def data_hstack(blocks: Sequence[modALinput]) -> modALinput: @@ -48,7 +60,13 @@ def data_hstack(blocks: Sequence[modALinput]) -> modALinput: elif isinstance(blocks[0], list): return np.hstack(blocks).tolist() - TypeError('%s datatype is not supported' % type(blocks[0])) + try: + if torch.is_tensor(blocks[0]): + return torch.cat(blocks, dim=1) + except: + pass + + TypeError("%s datatype is not supported" % type(blocks[0])) def add_row(X: modALinput, row: modALinput): @@ -68,8 +86,9 @@ def add_row(X: modALinput, row: modALinput): return data_vstack([X, row]) -def retrieve_rows(X: modALinput, - I: Union[int, List[int], np.ndarray]) -> Union[sp.csc_matrix, np.ndarray, pd.DataFrame]: +def retrieve_rows( + X: modALinput, I: Union[int, List[int], np.ndarray] +) -> Union[sp.csc_matrix, np.ndarray, pd.DataFrame]: """ Returns the rows I from the data set X @@ -78,34 +97,34 @@ def retrieve_rows(X: modALinput, * pandas series in case of a pandas data frame * row in case of list or numpy format """ - if sp.issparse(X): - # Out of the sparse matrix formats (sp.csc_matrix, sp.csr_matrix, sp.bsr_matrix, - # sp.lil_matrix, sp.dok_matrix, sp.coo_matrix, sp.dia_matrix), only sp.bsr_matrix, sp.coo_matrix - # and sp.dia_matrix don't support indexing and need to be converted to a sparse format - # that does support indexing. 
It seems conversion to CSR is currently most efficient. - - try: - return X[I] - except: - sp_format = X.getformat() - return X.tocsr()[I].asformat(sp_format) - elif isinstance(X, pd.DataFrame): - return X.iloc[I] - elif isinstance(X, list): - return np.array(X)[I].tolist() - elif isinstance(X, dict): - X_return = {} - for key, value in X.items(): - X_return[key] = retrieve_rows(value, I) - return X_return - elif isinstance(X, np.ndarray): - return X[I] - - raise TypeError('%s datatype is not supported' % type(X)) + try: + return X[I] + except: + if sp.issparse(X): + # Out of the sparse matrix formats (sp.csc_matrix, sp.csr_matrix, sp.bsr_matrix, + # sp.lil_matrix, sp.dok_matrix, sp.coo_matrix, sp.dia_matrix), only sp.bsr_matrix, sp.coo_matrix + # and sp.dia_matrix don't support indexing and need to be converted to a sparse format + # that does support indexing. It seems conversion to CSR is currently most efficient. -def drop_rows(X: modALinput, - I: Union[int, List[int], np.ndarray]) -> Union[sp.csc_matrix, np.ndarray, pd.DataFrame]: + sp_format = X.getformat() + return X.tocsr()[I].asformat(sp_format) + elif isinstance(X, pd.DataFrame): + return X.iloc[I] + elif isinstance(X, list): + return np.array(X)[I].tolist() + elif isinstance(X, dict): + X_return = {} + for key, value in X.items(): + X_return[key] = retrieve_rows(value, I) + return X_return + + raise TypeError("%s datatype is not supported" % type(X)) + + +def drop_rows( + X: modALinput, I: Union[int, List[int], np.ndarray] +) -> Union[sp.csc_matrix, np.ndarray, pd.DataFrame]: """ Returns X without the row(s) at index/indices I """ @@ -120,7 +139,13 @@ def drop_rows(X: modALinput, elif isinstance(X, list): return np.delete(X, I, axis=0).tolist() - raise TypeError('%s datatype is not supported' % type(X)) + try: + if torch.is_tensor(blocks[0]): + return torch.cat(blocks) + except: + X[[True if row not in I else False for row in range(X.size(0))]] + + raise TypeError("%s datatype is not supported" % 
type(X)) def enumerate_data(X: modALinput): @@ -141,17 +166,18 @@ def enumerate_data(X: modALinput): # numpy arrays and lists can readily be enumerated return enumerate(X) - raise TypeError('%s datatype is not supported' % type(X)) + raise TypeError("%s datatype is not supported" % type(X)) def data_shape(X: modALinput): """ Returns the shape of the data set X """ - if sp.issparse(X) or isinstance(X, pd.DataFrame) or isinstance(X, np.ndarray): - # scipy.sparse, pandas and numpy all support .shape + try: + # scipy.sparse, torch, pandas and numpy all support .shape return X.shape - elif isinstance(X, list): - return np.array(X).shape + except: + if isinstance(X, list): + return np.array(X).shape - raise TypeError('%s datatype is not supported' % type(X)) + raise TypeError("%s datatype is not supported" % type(X)) diff --git a/tests/core_tests.py b/tests/core_tests.py index 169ea90..e3113c4 100644 --- a/tests/core_tests.py +++ b/tests/core_tests.py @@ -189,6 +189,13 @@ def test_data_vstack(self): np.concatenate((a, b)) ) + # torch.Tensors + a, b = torch.ones(2, 2), torch.ones(2, 2) + torch.testing.assert_allclose( + modAL.utils.data.data_vstack((a, b)), + torch.cat((a, b)) + ) + # not supported formats self.assertRaises(TypeError, modAL.utils.data.data_vstack, (1, 1)) From 2f1e866023caaa10d04e6b4da80d86273e39631c Mon Sep 17 00:00:00 2001 From: Srikumar Sastry Date: Tue, 7 Dec 2021 20:21:29 +0100 Subject: [PATCH 175/182] Implementation of Cost Effective Active Learning --- examples/cost_effective_active_learning.py | 79 ++++++++++++++++++++++ 1 file changed, 79 insertions(+) create mode 100644 examples/cost_effective_active_learning.py diff --git a/examples/cost_effective_active_learning.py b/examples/cost_effective_active_learning.py new file mode 100644 index 0000000..68462d6 --- /dev/null +++ b/examples/cost_effective_active_learning.py @@ -0,0 +1,79 @@ +""" +This is a modified implementation of the algorithm Cost Effective Active Learning +(Pl. 
refer - https://arxiv.org/abs/1701.03551). This version not only picks up the +top K uncertain samples but also picks up the top N highly confident samples that +may represent information and diversity. It is better than the original implementation +as it does not involve tuning the confidence threshold parameter for every dataset. +""" + +from keras.datasets import mnist +import numpy as np +from modAL.models import ActiveLearner +from sklearn.ensemble import RandomForestClassifier +from scipy.special import entr + + +(X_train, y_train), (X_test, y_test) = mnist.load_data() + +X_train = X_train / 255 +X_test = X_test / 255 +y_train = y_train.astype(np.uint8) +y_test = y_test.astype(np.uint8) + +X_train = X_train.reshape(-1, 784) +X_test = X_test.reshape(-1, 784) + +model = RandomForestClassifier(n_estimators=100) + +INITIAL_SET_SIZE = 32 + +U_x = np.copy(X_train) +U_y = np.copy(y_train) + +ind = np.random.choice(range(len(U_x)), size=INITIAL_SET_SIZE) + +X_initial = U_x[ind] +y_initial = U_y[ind] + +U_x = np.delete(U_x, ind, axis=0) +U_y = np.delete(U_y, ind, axis=0) + + +def max_entropy(active_learner, X, K=16, N=16): + + class_prob = active_learner.predict_proba(X) + entropy = entr(class_prob).sum(axis=1) + uncertain_idx = np.argpartition(entropy, -K)[-K:] + confidence_idx = np.argpartition(entropy, N)[:N] + + return np.concatenate((uncertain_idx, confidence_idx), axis=0) + + +active_learner = ActiveLearner( + estimator=model, + X_training=X_initial, + y_training=y_initial, + query_strategy=max_entropy +) + +N_QUERIES = 20 + +K_MAX_ENTROPY = 16 +N_MIN_ENTROPY = 16 + +scores = [active_learner.score(X_test, y_test)] + +for index in range(N_QUERIES): + + query_idx, query_instance = active_learner.query(U_x, K_MAX_ENTROPY, N_MIN_ENTROPY) + + active_learner.teach(U_x[query_idx], U_y[query_idx]) + + U_x = np.delete(U_x, query_idx, axis=0) + U_y = np.delete(U_y, query_idx, axis=0) + + acc = active_learner.score(X_test, y_test) + + print(F'Query {index+1}: Test 
Accuracy: {acc}') + + scores.append(acc) \ No newline at end of file From d7321f50691e14c49721808b27c24b5cb4511f8d Mon Sep 17 00:00:00 2001 From: Srikumar Sastry Date: Wed, 8 Dec 2021 02:36:51 +0100 Subject: [PATCH 176/182] Added function for assigning pseudo labels for high confidence samples --- examples/cost_effective_active_learning.py | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/examples/cost_effective_active_learning.py b/examples/cost_effective_active_learning.py index 68462d6..3e7c16a 100644 --- a/examples/cost_effective_active_learning.py +++ b/examples/cost_effective_active_learning.py @@ -39,6 +39,12 @@ U_y = np.delete(U_y, ind, axis=0) +def assign_pseudo_labels(active_learner, X, confidence_idx): + conf_samples = X[confidence_idx] + labels = active_learner.predict(conf_samples) + return labels + + def max_entropy(active_learner, X, K=16, N=16): class_prob = active_learner.predict_proba(X) @@ -67,7 +73,15 @@ def max_entropy(active_learner, X, K=16, N=16): query_idx, query_instance = active_learner.query(U_x, K_MAX_ENTROPY, N_MIN_ENTROPY) - active_learner.teach(U_x[query_idx], U_y[query_idx]) + uncertain_idx = query_idx[:K_MAX_ENTROPY] + confidence_idx = query_idx[K_MAX_ENTROPY:] + + conf_labels = assign_pseudo_labels(active_learner, U_x, confidence_idx) + + L_x = U_x[query_idx] + L_y = np.concatenate((U_y[uncertain_idx], conf_labels), axis=0) + + active_learner.teach(L_x, L_y) U_x = np.delete(U_x, query_idx, axis=0) U_y = np.delete(U_y, query_idx, axis=0) From 37ac68b69a3460c760d194123234e589459742ad Mon Sep 17 00:00:00 2001 From: cosmic-cortex Date: Sun, 19 Dec 2021 12:41:24 +0100 Subject: [PATCH 177/182] fix: BaseLearner query method return_metrics keyword argument moved after catching the arguments --- modAL/models/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modAL/models/base.py b/modAL/models/base.py index f371252..57c8b81 100644 --- a/modAL/models/base.py +++ b/modAL/models/base.py 
@@ -152,7 +152,7 @@ def predict_proba(self, X: modALinput, **predict_proba_kwargs) -> Any: """ return self.estimator.predict_proba(X, **predict_proba_kwargs) - def query(self, X_pool, return_metrics: bool = False, *query_args, **query_kwargs) -> Union[Tuple, modALinput]: + def query(self, X_pool, *query_args, return_metrics: bool = False, **query_kwargs) -> Union[Tuple, modALinput]: """ Finds the n_instances most informative point in the data provided by calling the query_strategy function. From caa19fb99d14a89a6189b9ec32820e9d5255b67b Mon Sep 17 00:00:00 2001 From: Srikumar Sastry Date: Sun, 19 Dec 2021 18:32:17 +0100 Subject: [PATCH 178/182] Docstring changes --- ...ayesian_active_learning_by_disagreement.py | 98 +++++++++++++++++++ examples/cost_effective_active_learning.py | 11 ++- 2 files changed, 108 insertions(+), 1 deletion(-) create mode 100644 examples/bayesian_active_learning_by_disagreement.py diff --git a/examples/bayesian_active_learning_by_disagreement.py b/examples/bayesian_active_learning_by_disagreement.py new file mode 100644 index 0000000..8f56da2 --- /dev/null +++ b/examples/bayesian_active_learning_by_disagreement.py @@ -0,0 +1,98 @@ +from tensorflow.keras.models import Sequential, Model +from tensorflow.keras.layers import Conv2D, MaxPooling2D, Dense, Flatten, Dropout +from tensorflow.keras.optimizers import Adam +from tensorflow.keras.metrics import categorical_crossentropy +from tensorflow.keras.utils import to_categorical +import tensorflow as tf +import numpy as np +from keras.datasets import mnist +from keras.wrappers.scikit_learn import KerasClassifier +from modAL.models import ActiveLearner + +(X_train, y_train), (X_test, y_test) = mnist.load_data() + +X_train = X_train.reshape(X_train.shape[0], X_train.shape[1], X_train.shape[2], 1) +X_test = X_test.reshape(X_test.shape[0], X_test.shape[1], X_test.shape[2], 1) + +X_train = X_train.astype('float32') +X_test = X_test.astype('float32') + +X_train = X_train/255.0 +X_test_norm = 
X_test/255.0 + +y_train = to_categorical(y_train) +y_test = to_categorical(y_test) + +def LeNet(): + model = Sequential() + + model.add(Conv2D(filters = 6, kernel_size = (5,5), padding = 'same', + activation = 'relu', input_shape = (28,28,1))) + model.add(MaxPooling2D(pool_size = (2,2))) + model.add(Dropout(0.25)) + model.add(Conv2D(filters = 16, kernel_size = (5,5), activation = 'relu')) + model.add(MaxPooling2D(pool_size = (2,2))) + model.add(Flatten()) + model.add(Dense(120, activation = 'relu')) + model.add(Dropout(0.55)) + model.add(Dense(10, activation = 'softmax')) + opt = Adam(learning_rate = 0.001) + model.compile(loss = categorical_crossentropy, + optimizer = opt, + metrics = ['accuracy']) + + return model + +def max_disagreement(model, X, n=32, n_mcd=10): + + partial_model = Model(model.estimator.model.inputs, model.estimator.model.layers[-1].output) + prob = np.stack([partial_model(X.reshape(-1, 28, 28, 1), training=True) for _ in range(n_mcd)]) + pb = np.mean(prob, axis=0) + entropy1 = (-pb*np.log(pb)).sum(axis=1) + entropy2 = (-prob*np.log(prob)).sum(axis=2).mean(axis=0) + un = entropy2-entropy1 + return np.argpartition(un, n)[:n] + +model = KerasClassifier(LeNet) + +U_x = np.copy(X_train) +U_y = np.copy(y_train) + +INITIAL_SET_SIZE = 32 +ind = np.random.choice(range(len(U_x)), size=INITIAL_SET_SIZE) + +X_initial = U_x[ind] +y_initial = U_y[ind] + +U_x = np.delete(U_x, ind, axis=0) +U_y = np.delete(U_y, ind, axis=0) + +active_learner = ActiveLearner( + estimator=model, + X_training=X_initial, + y_training=y_initial, + query_strategy=max_disagreement, + verbose=0 +) + +N_QUERIES = 20 + +scores = [active_learner.score(X_test, y_test, verbose=0)] + +for index in range(N_QUERIES): + + query_idx, query_instance = active_learner.query(U_x) + + L_x = U_x[query_idx] + L_y = U_y[query_idx] + + active_learner.teach(L_x, L_y, epochs=50, batch_size=128, verbose=0) + + U_x = np.delete(U_x, query_idx, axis=0) + U_y = np.delete(U_y, query_idx, axis=0) + + acc = 
active_learner.score(X_test, y_test) + + print(F'Query {index+1}: Test Accuracy: {acc}') + + scores.append(acc) \ No newline at end of file diff --git a/examples/cost_effective_active_learning.py b/examples/cost_effective_active_learning.py index 3e7c16a..1feac04 100644 --- a/examples/cost_effective_active_learning.py +++ b/examples/cost_effective_active_learning.py @@ -2,7 +2,7 @@ This is a modified implementation of the algorithm Cost Effective Active Learning (Pl. refer - https://arxiv.org/abs/1701.03551). This version not only picks up the top K uncertain samples but also picks up the top N highly confident samples that -may represent information and diversity. It is better than the original implementation +may represent information and diversity. It is different than the original implementation as it does not involve tuning the confidence threshold parameter for every dataset. """ @@ -50,6 +50,15 @@ def max_entropy(active_learner, X, K=16, N=16): class_prob = active_learner.predict_proba(X) entropy = entr(class_prob).sum(axis=1) uncertain_idx = np.argpartition(entropy, -K)[-K:] + + """ + Original Implementation -- Pick most confident samples with + entropy less than a threshold. Threshold is decayed in every + iteration. + + Different than original -- Pick top n most confident samples. 
+ """ + confidence_idx = np.argpartition(entropy, N)[:N] return np.concatenate((uncertain_idx, confidence_idx), axis=0) From ed4f3b0d642b347d846724ccf5f342825de994ba Mon Sep 17 00:00:00 2001 From: Srikumar Sastry Date: Sun, 19 Dec 2021 18:42:42 +0100 Subject: [PATCH 179/182] Docstring changes --- examples/bayesian_active_learning_by_disagreement.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/examples/bayesian_active_learning_by_disagreement.py b/examples/bayesian_active_learning_by_disagreement.py index 8f56da2..3450db3 100644 --- a/examples/bayesian_active_learning_by_disagreement.py +++ b/examples/bayesian_active_learning_by_disagreement.py @@ -1,3 +1,8 @@ +""" +This is a original implementation of the algorithm Bayesian Active Learning by Disagreements. +(Pl. refer - https://arxiv.org/abs/1112.5745). It calculates the disagreement between an ensemble +of classifiers and a single classifier using monte carlo estimates. +""" from tensorflow.keras.models import Sequential, Model from tensorflow.keras.layers import Conv2D, MaxPooling2D, Dense, Flatten, Dropout from tensorflow.keras.optimizers import Adam From 186f2ce79828246cc68fb5fa43184300304ff00c Mon Sep 17 00:00:00 2001 From: Srikumar Sastry Date: Fri, 7 Jan 2022 14:18:57 +0100 Subject: [PATCH 180/182] Delete bayesian_active_learning_by_disagreement.py --- ...ayesian_active_learning_by_disagreement.py | 103 ------------------ 1 file changed, 103 deletions(-) delete mode 100644 examples/bayesian_active_learning_by_disagreement.py diff --git a/examples/bayesian_active_learning_by_disagreement.py b/examples/bayesian_active_learning_by_disagreement.py deleted file mode 100644 index 3450db3..0000000 --- a/examples/bayesian_active_learning_by_disagreement.py +++ /dev/null @@ -1,103 +0,0 @@ -""" -This is a original implementation of the algorithm Bayesian Active Learning by Disagreements. -(Pl. refer - https://arxiv.org/abs/1112.5745). 
It calculates the disagreement between an ensemble -of classifiers and a single classifier using monte carlo estimates. -""" -from tensorflow.keras.models import Sequential, Model -from tensorflow.keras.layers import Conv2D, MaxPooling2D, Dense, Flatten, Dropout -from tensorflow.keras.optimizers import Adam -from tensorflow.keras.metrics import categorical_crossentropy -from tensorflow.keras.utils import to_categorical -import tensorflow as tf -import numpy as np -from keras.datasets import mnist -from keras.wrappers.scikit_learn import KerasClassifier -from modAL.models import ActiveLearner - -(X_train, y_train), (X_test, y_test) = mnist.load_data() - -X_train = X_train.reshape(X_train.shape[0], X_train.shape[1], X_train.shape[2], 1) -X_test = X_test.reshape(X_test.shape[0], X_test.shape[1], X_test.shape[2], 1) - -X_train = X_train.astype('float32') -X_test = X_test.astype('float32') - -X_train = X_train/255.0 -X_test_norm = X_test/255.0 - -y_train = to_categorical(y_train) -y_test = to_categorical(y_test) - -def LeNet(): - model = Sequential() - - model.add(Conv2D(filters = 6, kernel_size = (5,5), padding = 'same', - activation = 'relu', input_shape = (28,28,1))) - model.add(MaxPooling2D(pool_size = (2,2))) - model.add(Dropout(0.25)) - model.add(Conv2D(filters = 16, kernel_size = (5,5), activation = 'relu')) - model.add(MaxPooling2D(pool_size = (2,2))) - model.add(Flatten()) - model.add(Dense(120, activation = 'relu')) - model.add(Dropout(0.55)) - model.add(Dense(10, activation = 'softmax')) - opt = Adam(learning_rate = 0.001) - model.compile(loss = categorical_crossentropy, - optimizer = opt, - metrics = ['accuracy']) - - return model - -def max_disagreement(model, X, n=32, n_mcd=10): - - partial_model = Model(model.estimator.model.inputs, model.estimator.model.layers[-1].output) - prob = np.stack([partial_model(X.reshape(-1, 28, 28, 1), training=True) for _ in range(n_mcd)]) - pb = np.mean(prob, axis=0) - entropy1 = (-pb*np.log(pb)).sum(axis=1) - entropy2 = 
(-prob*np.log(prob)).sum(axis=2).mean(axis=0) - un = entropy2-entropy1 - return np.argpartition(un, n)[:n] - -model = KerasClassifier(LeNet) - -U_x = np.copy(X_train) -U_y = np.copy(y_train) - -INITIAL_SET_SIZE = 32 -ind = np.random.choice(range(len(U_x)), size=INITIAL_SET_SIZE) - -X_initial = U_x[ind] -y_initial = U_y[ind] - -U_x = np.delete(U_x, ind, axis=0) -U_y = np.delete(U_y, ind, axis=0) - -active_learner = ActiveLearner( - estimator=model, - X_training=X_initial, - y_training=y_initial, - query_strategy=max_disagreement, - verbose=0 -) - -N_QUERIES = 20 - -scores = [active_learner.score(X_test, y_test, verbose=0)] - -for index in range(N_QUERIES): - - query_idx, query_instance = active_learner.query(U_x) - - L_x = U_x[query_idx] - L_y = U_y[query_idx] - - active_learner.teach(L_x, L_y, epochs=50, batch_size=128, verbose=0) - - U_x = np.delete(U_x, query_idx, axis=0) - U_y = np.delete(U_y, query_idx, axis=0) - - acc = active_learner.score(X_test, y_test) - - print(F'Query {index+1}: Test Accuracy: {acc}') - - scores.append(acc) \ No newline at end of file From ad7fa4092254bdbc6ee5935213b40ca87878a167 Mon Sep 17 00:00:00 2001 From: cosmic-cortex Date: Thu, 1 Jun 2023 13:11:51 +0200 Subject: [PATCH 181/182] version number bumped and package name change indicated in docs --- README.md | 2 +- docs/source/content/overview/Installation.rst | 2 +- setup.py | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 0e9aec5..4a41633 100644 --- a/README.md +++ b/README.md @@ -164,7 +164,7 @@ modAL requires You can install modAL directly with pip: ``` -pip install modAL +pip install modAL-python ``` Alternatively, you can install modAL directly from source: ``` diff --git a/docs/source/content/overview/Installation.rst b/docs/source/content/overview/Installation.rst index c44093a..76d4471 100644 --- a/docs/source/content/overview/Installation.rst +++ b/docs/source/content/overview/Installation.rst @@ -11,7 +11,7 @@ You can 
install modAL directly with pip: :: - pip install modAL + pip install modAL-python Alternatively, you can install modAL directly from source: diff --git a/setup.py b/setup.py index 69c1ad2..70c51e8 100644 --- a/setup.py +++ b/setup.py @@ -1,8 +1,8 @@ from setuptools import find_packages, setup setup( - name='modAL', - version='0.4.1', + name='modAL-python', + version='0.4.2', author='Tivadar Danka', author_email='85a5187a@opayq.com', description='A modular active learning framework for Python3', From bba6f6fd00dbb862b1e09259b78caf6cffa2e755 Mon Sep 17 00:00:00 2001 From: cosmic-cortex Date: Thu, 1 Jun 2023 14:18:23 +0200 Subject: [PATCH 182/182] numpy version fixed --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 70c51e8..3905e35 100644 --- a/setup.py +++ b/setup.py @@ -10,6 +10,6 @@ url='https://modAL-python.github.io/', packages=['modAL', 'modAL.models', 'modAL.utils'], classifiers=['Development Status :: 4 - Beta'], - install_requires=['numpy==1.20.0', 'scikit-learn>=0.18', + install_requires=['numpy', 'scikit-learn>=0.18', 'scipy>=0.18', 'pandas>=1.1.0', 'skorch==0.9.0'], )