From 96397f5ff8abd4dab74fabdbed26dc467484fffd Mon Sep 17 00:00:00 2001 From: xq5he Date: Thu, 23 Mar 2017 09:09:30 +0800 Subject: [PATCH 01/49] Fix typo (#28) --- mla/kmeans.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mla/kmeans.py b/mla/kmeans.py index f9088455..cec03191 100644 --- a/mla/kmeans.py +++ b/mla/kmeans.py @@ -46,7 +46,7 @@ def __init__(self, K=5, max_iters=100, init='random'): self.centroids = [] self.init = init - def _initialize_cetroids(self, init): + def _initialize_centroids(self, init): """Set the initial centroids.""" if init == 'random': @@ -61,7 +61,7 @@ def _initialize_cetroids(self, init): def _predict(self, X=None): """Perform clustering on the dataset.""" - self._initialize_cetroids(self.init) + self._initialize_centroids(self.init) centroids = self.centroids # Optimize clusters From 1fbc443166c6ebd28f24cd98fac174218bfd68f3 Mon Sep 17 00:00:00 2001 From: Artem Golubin Date: Thu, 23 Mar 2017 09:07:32 +0700 Subject: [PATCH 02/49] Fix markdown --- README.md | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/README.md b/README.md index c4da27bb..91078c2e 100644 --- a/README.md +++ b/README.md @@ -7,20 +7,20 @@ The code is much easier to follow than the optimized libraries and easier to pla All algorithms are implemented in Python, using numpy, scipy and autograd. ### Implemented: -* [Deep learning (MLP, CNN, RNN, LSTM)] (mla/neuralnet) -* [Linear regression, logistic regression] (mla/linear_models.py) -* [Random Forests] (mla/ensemble/random_forest.py) -* [Support vector machine (SVM) with kernels (Linear, Poly, RBF)] (mla/svm) -* [K-Means] (mla/kmeans.py) -* [Gaussian Mixture Model] (mla/gaussian_mixture.py) -* [K-nearest neighbors] (mla/knn.py) -* [Naive bayes] (mla/naive_bayes.py) -* [Principal component analysis (PCA)] (mla/pca.py) -* [Factorization machines] (mla/fm.py) -* [Restricted Boltzmann machine (RBM)] (mla/rbm.py) -* [t-Distributed Stochastic Neighbor Embedding (t-SNE)] (mla/tsne.py) -* [Gradient Boosting trees (also known as GBDT, GBRT, GBM, XGBoost)] (mla/ensemble/gbm.py) -* [Reinforcement learning (Deep Q learning)] (mla/rl) +* [Deep learning (MLP, CNN, RNN, LSTM)](mla/neuralnet) +* [Linear regression, logistic regression](mla/linear_models.py) +* [Random Forests](mla/ensemble/random_forest.py) +* [Support vector machine (SVM) with kernels (Linear, Poly, RBF)](mla/svm) +* [K-Means](mla/kmeans.py) +* [Gaussian Mixture Model](mla/gaussian_mixture.py) +* [K-nearest neighbors](mla/knn.py) +* [Naive bayes](mla/naive_bayes.py) +* [Principal component analysis (PCA)](mla/pca.py) +* [Factorization machines](mla/fm.py) +* [Restricted Boltzmann machine (RBM)](mla/rbm.py) +* [t-Distributed Stochastic Neighbor Embedding (t-SNE)](mla/tsne.py) +* [Gradient Boosting trees (also known as GBDT, GBRT, GBM, XGBoost)](mla/ensemble/gbm.py) +* [Reinforcement learning (Deep Q learning)](mla/rl) ### Installation @@ -43,4 +43,4 @@ All algorithms are implemented in Python, using numpy, scipy and autograd. Your contributions are always welcome! Feel free to improve existing code, documentation or implement new algorithm. -Please open an issue to propose your changes if they big are enough. \ No newline at end of file +Please open an issue to propose your changes if they big are enough. From b7364b560d993975ecc4b3f0fe55bf4bf998fd0d Mon Sep 17 00:00:00 2001 From: Jiancheng Date: Mon, 27 Mar 2017 09:13:32 +0800 Subject: [PATCH 03/49] Fix the CD-1 training code and add comments in RBM (#29) 1. 
`negative_visible` should also be sampled, only the last hidden can use the expectation. Though a trivial modification, it follows the original algorithm, and theoretically it should also be done (refer to Kevin Murphy's MLaPP P944). 2. Some comments are added in the code, for friendlier reading. At first sight I didn't realize it was CD :) --- mla/rbm.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/mla/rbm.py b/mla/rbm.py index f314cc33..3ceb72f1 100644 --- a/mla/rbm.py +++ b/mla/rbm.py @@ -54,15 +54,18 @@ def _init_weights(self): self.errors = [] def _train(self): + '''Use CD-1 training procedure, basically an exact inference for `positive_associations`, + followed by a "non burn-in" block Gibbs Sampling for the `negative_associations`.''' for i in range(self.max_epochs): error = 0 for batch in batch_iterator(self.X, batch_size=self.batch_size): positive_hidden = sigmoid(np.dot(batch, self.W) + self.bias_h) - hidden_states = self._sample(positive_hidden) + hidden_states = self._sample(positive_hidden) # sample hidden state h1 positive_associations = np.dot(batch.T, positive_hidden) negative_visible = sigmoid(np.dot(hidden_states, self.W.T) + self.bias_v) + negative_visible = self._sample(negative_visible) # use the sampled hidden state h1 to sample v1 negative_hidden = sigmoid(np.dot(negative_visible, self.W) + self.bias_h) negative_associations = np.dot(negative_visible.T, negative_hidden) From 2b798428d847adead4d54f9957d99e1651606cf4 Mon Sep 17 00:00:00 2001 From: Artem Golubin Date: Mon, 27 Mar 2017 11:48:39 +0700 Subject: [PATCH 04/49] Update rbm.py --- mla/rbm.py | 1 - 1 file changed, 1 deletion(-) diff --git a/mla/rbm.py b/mla/rbm.py index 3ceb72f1..85d7ccc6 100644 --- a/mla/rbm.py +++ b/mla/rbm.py @@ -15,7 +15,6 @@ """ -# Warning: It's untested and unfinished implementation. class RBM(BaseEstimator): y_required = False From 7b1cbfd833fa125e89e299957711f229b52b5be4 Mon Sep 17 00:00:00 2001 From: rushter Date: Wed, 9 Aug 2017 11:50:58 +0300 Subject: [PATCH 05/49] Fix typo --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 91078c2e..aa4868d4 100644 --- a/README.md +++ b/README.md @@ -43,4 +43,4 @@ All algorithms are implemented in Python, using numpy, scipy and autograd. Your contributions are always welcome! Feel free to improve existing code, documentation or implement new algorithm. -Please open an issue to propose your changes if they big are enough. +Please open an issue to propose your changes if they are big enough.
From 4b3c24dfce6f430d42ce9f24b72de54d34c9d79e Mon Sep 17 00:00:00 2001 From: Yiran Sheng Date: Fri, 11 Aug 2017 21:24:05 -0700 Subject: [PATCH 06/49] Fix for Numeric Overflow of `softplus` implementation (#30) * fix possible numeric overflow for softplus * english --- mla/neuralnet/activations.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mla/neuralnet/activations.py b/mla/neuralnet/activations.py index 691b4f5a..65bf279d 100644 --- a/mla/neuralnet/activations.py +++ b/mla/neuralnet/activations.py @@ -22,7 +22,8 @@ def linear(z): def softplus(z): """Smooth relu.""" - return np.log(1 + np.exp(z)) + # Avoid numerical overflow by putting possible inf into denominator position + return z + np.log(1 + 1 / np.exp(z)) def softsign(z): From d3987776426475af71a7f34cf298dd1232eb2f08 Mon Sep 17 00:00:00 2001 From: Yiran Sheng Date: Thu, 17 Aug 2017 01:56:34 -0700 Subject: [PATCH 07/49] Fix softplus overflow (attempt 2) (#31) * actually fix numeric overflow --- mla/neuralnet/activations.py | 5 +++-- mla/neuralnet/tests/test_activations.py | 25 +++++++++++++++++++++++++ 2 files changed, 28 insertions(+), 2 deletions(-) create mode 100644 mla/neuralnet/tests/test_activations.py diff --git a/mla/neuralnet/activations.py b/mla/neuralnet/activations.py index 65bf279d..55e2fb27 100644 --- a/mla/neuralnet/activations.py +++ b/mla/neuralnet/activations.py @@ -22,8 +22,9 @@ def linear(z): def softplus(z): """Smooth relu.""" - # Avoid numerical overflow by putting possible inf into denominator position - return z + np.log(1 + 1 / np.exp(z)) + # Avoid numerical overflow, see: + # https://docs.scipy.org/doc/numpy/reference/generated/numpy.logaddexp.html + return np.logaddexp(0.0, z) def softsign(z): diff --git a/mla/neuralnet/tests/test_activations.py b/mla/neuralnet/tests/test_activations.py new file mode 100644 index 00000000..5d8caa26 --- /dev/null +++ b/mla/neuralnet/tests/test_activations.py @@ -0,0 +1,25 @@ +import sys +import numpy as np + +from mla.neuralnet.activations import * + +def test_softplus(): + # np.exp(z_max) will overflow + z_max = np.log(sys.float_info.max) + 1.0e10 + # 1.0 / np.exp(z_min) will overflow + z_min = np.log(sys.float_info.min) - 1.0e10 + inputs = np.array([0.0, 1.0, -1.0, z_min, z_max]) + # naive implementation of np.log(1 + np.exp(z_max)) will overflow + # naive implementation of z + np.log(1 + 1 / np.exp(z_min)) will + # throw ZeroDivisionError + outputs = np.array([ + np.log(2.0), + np.log1p(np.exp(1.0)), + np.log1p(np.exp(-1.0)), + 0.0, + z_max + ]) + + assert np.allclose(outputs, softplus(inputs)) + + From 6b729f043e6ee035d3d94db2f290eb2feb9f330a Mon Sep 17 00:00:00 2001 From: KaiMin Lai <32761599+KaiminLai@users.noreply.github.com> Date: Sat, 30 Jun 2018 21:48:41 +0800 Subject: [PATCH 08/49] update the penalty function in linear models (#36) --- mla/linear_models.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mla/linear_models.py b/mla/linear_models.py index 7fb7b7ca..5b899f12 100644 --- a/mla/linear_models.py +++ b/mla/linear_models.py @@ -48,9 +48,9 @@ def init_cost(self): def _add_penalty(self, loss, w): """Apply regularization to the loss.""" if self.penalty == "l1": - loss += self.C * np.abs(w[:-1]).sum() + loss += self.C * np.abs(w[1:]).sum() elif self.penalty == "l2": - loss += (0.5 * self.C) * (w[:-1] ** 2).mean() + loss += (0.5 * self.C) * (w[1:] ** 2).mean() return loss def _cost(self, X, y, theta): From 64d7c51d00d22eed516cd81def5908af1368b803 Mon Sep 17 00:00:00 2001 From: keineahnung2345 Date: Mon, 22 Oct 2018 
04:45:45 +0800 Subject: [PATCH 09/49] Fix the formula of l2 penalty (#40) --- mla/linear_models.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mla/linear_models.py b/mla/linear_models.py index 5b899f12..3bdc044b 100644 --- a/mla/linear_models.py +++ b/mla/linear_models.py @@ -50,7 +50,7 @@ def _add_penalty(self, loss, w): if self.penalty == "l1": loss += self.C * np.abs(w[1:]).sum() elif self.penalty == "l2": - loss += (0.5 * self.C) * (w[1:] ** 2).mean() + loss += (0.5 * self.C) * (w[1:] ** 2).sum() return loss def _cost(self, X, y, theta): From 5fbfeca8c44f2b01283eba9de3f32ae58243f6ab Mon Sep 17 00:00:00 2001 From: Artem Golubin Date: Sun, 21 Oct 2018 23:46:12 +0300 Subject: [PATCH 10/49] Update test_regression_accuracy.py --- mla/tests/test_regression_accuracy.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mla/tests/test_regression_accuracy.py b/mla/tests/test_regression_accuracy.py index 0f591c66..81e65ded 100644 --- a/mla/tests/test_regression_accuracy.py +++ b/mla/tests/test_regression_accuracy.py @@ -22,7 +22,7 @@ def test_linear(): - model = LinearRegression(lr=0.01, max_iters=2000, penalty='l2', C=0.03) + model = LinearRegression(lr=0.01, max_iters=2000, penalty='l2', C=0.003) model.fit(X_train, y_train) predictions = model.predict(X_test) assert mean_squared_error(y_test, predictions) < 0.25 From e253211090f217a9f6b4be54daa657db2037b124 Mon Sep 17 00:00:00 2001 From: Xiaochun Ma <122030478@qq.com> Date: Tue, 23 Oct 2018 01:18:21 -0400 Subject: [PATCH 11/49] Get the derivative of the loss function out of the for loop (#42) I think getting the derivative of the loss function should be out of the for loop. There is no need to calculate the derivative function for max iteration times. --- mla/linear_models.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/mla/linear_models.py b/mla/linear_models.py index 3bdc044b..a533f1cb 100644 --- a/mla/linear_models.py +++ b/mla/linear_models.py @@ -87,10 +87,9 @@ def _predict(self, X=None): def _gradient_descent(self): theta = self.theta errors = [self._cost(self.X, self.y, theta)] - + # Get derivative of the loss function + cost_d = grad(self._loss) for i in range(1, self.max_iters + 1): - # Get derivative of the loss function - cost_d = grad(self._loss) # Calculate gradient and update theta delta = cost_d(theta) theta -= self.lr * delta From cd6689c12ac4db6489116147b50dfda73e356379 Mon Sep 17 00:00:00 2001 From: keineahnung2345 Date: Mon, 7 Jan 2019 17:02:36 +0800 Subject: [PATCH 12/49] kmeans++ It seems the codes do not match with the comment starting from line 33. Revise it according to the comments. 
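For reference, the selection step in `_choose_next_center` boils down to a minimal sketch like the following (assuming `X` is the data matrix and `distances` holds each point's distance to its nearest already-chosen center, as returned by `_dist_from_centers`; the standalone function name here is only illustrative):

    import numpy as np

    def choose_next_center(X, distances):
        # kmeans++: sample the next center with probability proportional
        # to the squared distance from the nearest existing center
        squared = distances ** 2
        probs = squared / squared.sum()
        idx = np.random.choice(X.shape[0], p=probs)
        return X[idx]
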
--- mla/kmeans.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/mla/kmeans.py b/mla/kmeans.py index cec03191..65ded386 100644 --- a/mla/kmeans.py +++ b/mla/kmeans.py @@ -117,10 +117,9 @@ def _dist_from_centers(self): def _choose_next_center(self): distances = self._dist_from_centers() - probs = distances / distances.sum() - cumprobs = probs.cumsum() - r = random.random() - ind = np.where(cumprobs >= r)[0][0] + squared_distances = distances**2 + probs = squared_distances/squared_distances.sum() + ind = np.random.choice(self.X.shape[0], 1, p=probs)[0] return self.X[ind] def _is_converged(self, centroids_old, centroids): From be53f3eaffe678b6b2a7da3e22de63bf136ec738 Mon Sep 17 00:00:00 2001 From: Artem Golubin Date: Mon, 7 Jan 2019 14:08:47 +0300 Subject: [PATCH 13/49] Add AUTHORS --- AUTHORS | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) create mode 100644 AUTHORS diff --git a/AUTHORS b/AUTHORS new file mode 100644 index 00000000..f92dead0 --- /dev/null +++ b/AUTHORS @@ -0,0 +1,16 @@ +Artem Golubin +Anebi Agbo +Convex Path +James Chevalier +Jiancheng +KaiMin Lai +Nguyễn Tuấn +Nicolas Hug +Xiaochun Ma +Yiran Sheng +brady salz +junwang007 +keineahnung2345 +lucaskolstad +vincent tang +xq5he From 3352ca819b0e557e87d6a854c8c6be0c65476e5e Mon Sep 17 00:00:00 2001 From: HswTime Date: Fri, 25 Jan 2019 15:02:14 +0800 Subject: [PATCH 14/49] update gitignore --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 4ed444a1..e240d8d8 100644 --- a/.gitignore +++ b/.gitignore @@ -4,3 +4,4 @@ dist/ mla.egg-info/ .cache *.swp +.idea \ No newline at end of file From 602d3b6fd9a31bbbbd02d822192f8633a550a320 Mon Sep 17 00:00:00 2001 From: Artem Golubin Date: Fri, 25 Jan 2019 12:11:16 +0300 Subject: [PATCH 15/49] Fix reduction test --- mla/tests/test_reduction.py | 35 ++++++++++++++++++++++------------- 1 file changed, 22 insertions(+), 13 deletions(-) diff --git a/mla/tests/test_reduction.py b/mla/tests/test_reduction.py index b7450a56..722194a7 100644 --- a/mla/tests/test_reduction.py +++ b/mla/tests/test_reduction.py @@ -1,31 +1,40 @@ -from sklearn.metrics import roc_auc_score +# coding=utf-8 +import pytest -from mla.pca import PCA -from mla.ensemble import RandomForestClassifier +from sklearn.metrics import roc_auc_score +from sklearn.datasets import make_classification try: from sklearn.model_selection import train_test_split except ImportError: from sklearn.cross_validation import train_test_split -from sklearn.datasets import make_classification -# Generate a random binary classification problem. -X, y = make_classification(n_samples=1000, n_features=100, n_informative=75, - random_state=1111, n_classes=2, class_sep=2.5, ) +from mla.ensemble import RandomForestClassifier +from mla.pca import PCA + + +@pytest.fixture +def dataset(): + # Generate a random binary classification problem. 
+ return make_classification(n_samples=1000, n_features=100, n_informative=75, + random_state=1111, n_classes=2, class_sep=2.5, ) -def test_PCA(): +# TODO: fix +@pytest.mark.skip() +def test_PCA(dataset): + X, y = dataset X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=1111) - p = PCA(100, solver='eigen') + p = PCA(50, solver='eigen') - # fit PCA with training data, not the entire dataset + # fit PCA with training set, not the entire dataset p.fit(X_train) X_train_reduced = p.transform(X_train) X_test_reduced = p.transform(X_test) - model = RandomForestClassifier(n_estimators=10, max_depth=4) + model = RandomForestClassifier(n_estimators=25, max_depth=5) model.fit(X_train_reduced, y_train) predictions = model.predict(X_test_reduced)[:, 1] - print(roc_auc_score(y_test, predictions)) - assert roc_auc_score(y_test, predictions) >= 0.70 + score = roc_auc_score(y_test, predictions) + assert score >= 0.75 From b8ad378a796ee867acfa3198e04d47a500dd90d3 Mon Sep 17 00:00:00 2001 From: Robert Wukmir Date: Wed, 13 Feb 2019 13:51:13 -0800 Subject: [PATCH 16/49] Add Leaky ReLU activation. Differentiation with autograd package confirmed to work correctly. --- mla/neuralnet/activations.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/mla/neuralnet/activations.py b/mla/neuralnet/activations.py index 55e2fb27..d13b6c2c 100644 --- a/mla/neuralnet/activations.py +++ b/mla/neuralnet/activations.py @@ -39,6 +39,10 @@ def relu(z): return np.maximum(0, z) +def leakyrelu(z, a=0.01): + return np.maximum(z * a, z) + + def get_activation(name): """Return activation function by name""" try: From 4e980800b7492b8258c1d085241c79ba0e807a2a Mon Sep 17 00:00:00 2001 From: Robert Wukmir Date: Sun, 17 Feb 2019 22:57:22 -0800 Subject: [PATCH 17/49] Add total size of batch iterations to tqdm loop so that it displays estimated time to completion. --- mla/neuralnet/optimizers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mla/neuralnet/optimizers.py b/mla/neuralnet/optimizers.py index 714b0d6d..422c6d9e 100644 --- a/mla/neuralnet/optimizers.py +++ b/mla/neuralnet/optimizers.py @@ -44,7 +44,7 @@ def train_epoch(self, network): batch = zip(X_batch, y_batch) if network.verbose: - batch = tqdm(batch) + batch = tqdm(batch, total=int(np.ceil(network.n_samples/network.batch_size))) for X, y in batch: loss = np.mean(network.update(X, y)) From 49c51758fe2fed9bdd0f9cc2eb4557d44f4eb848 Mon Sep 17 00:00:00 2001 From: Robert Wukmir Date: Sun, 17 Feb 2019 23:25:27 -0800 Subject: [PATCH 18/49] Add a basic implementation of Batch Normalization. Adapted from https://kratzert.github.io/2016/02/12/understanding-the-gradient-flow-through-the-batch-normalization-layer.html A more efficient algorithm could probably be used, but this one seems pretty easy to understand. 
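Condensed, the forward computation of this layer is just per-feature normalization over the batch followed by a learned scale and shift; a minimal NumPy sketch (assuming a 2-D batch `X` and learned parameters `gamma` and `beta`) looks like:

    import numpy as np

    def batchnorm_forward(X, gamma, beta, eps=1e-5):
        # normalize each feature to zero mean / unit variance over the batch,
        # then apply the learned scale (gamma) and shift (beta)
        mu = X.mean(axis=0)
        var = X.var(axis=0)
        x_hat = (X - mu) / np.sqrt(var + eps)
        return gamma * x_hat + beta

The implementation below keeps every intermediate step explicit so the gradient flow in the backward pass is easier to follow.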
--- mla/neuralnet/layers/__init__.py | 1 + mla/neuralnet/layers/normalization.py | 150 +++++++++++++++++++++++++- 2 files changed, 148 insertions(+), 3 deletions(-) diff --git a/mla/neuralnet/layers/__init__.py b/mla/neuralnet/layers/__init__.py index 6068f852..123a814d 100644 --- a/mla/neuralnet/layers/__init__.py +++ b/mla/neuralnet/layers/__init__.py @@ -1,2 +1,3 @@ from .basic import * from .convnet import * +from .normalization import * \ No newline at end of file diff --git a/mla/neuralnet/layers/normalization.py b/mla/neuralnet/layers/normalization.py index d7081ace..72448682 100644 --- a/mla/neuralnet/layers/normalization.py +++ b/mla/neuralnet/layers/normalization.py @@ -1,5 +1,149 @@ -from mla.neuralnet.layers import Layer +from mla.neuralnet.layers import Layer, PhaseMixin, ParamMixin +from mla.neuralnet.parameters import Parameters +import numpy as np -class BatchNormalization(Layer): - pass \ No newline at end of file +class BatchNormalization(Layer, ParamMixin, PhaseMixin): + def __init__(self, momentum=0.9, eps=1e-5, parameters=None): + super().__init__() + self._params = parameters + if self._params is None: + self._params = Parameters() + self.momentum = momentum + self.eps = eps + self.ema_mean = None + self.ema_var = None + + def setup(self, x_shape): + self._params.setup_weights((1, x_shape[1])) + + def _forward_pass(self, X): + gamma = self._params['W'] + beta = self._params['b'] + + if self.is_testing: + mu = self.ema_mean + xmu = X - mu + var = self.ema_var + sqrtvar = np.sqrt(var + self.eps) + ivar = 1. / sqrtvar + xhat = xmu * ivar + gammax = gamma * xhat + return gammax + beta + + N, D = X.shape + + # step1: calculate mean + mu = 1. / N * np.sum(X, axis=0) + + # step2: subtract mean vector of every trainings example + xmu = X - mu + + # step3: following the lower branch - calculation denominator + sq = xmu ** 2 + + # step4: calculate variance + var = 1. / N * np.sum(sq, axis=0) + + # step5: add eps for numerical stability, then sqrt + sqrtvar = np.sqrt(var + self.eps) + + # step6: invert sqrtwar + ivar = 1. 
/ sqrtvar + + # step7: execute normalization + xhat = xmu * ivar + + # step8: Nor the two transformation steps + gammax = gamma * xhat + + # step9 + out = gammax + beta + + # store running averages of mean and variance during training for use during testing + if self.ema_mean is None or self.ema_var is None: + self.ema_mean = mu + self.ema_var = var + else: + self.ema_mean = self.momentum * self.ema_mean + (1 - self.momentum) * mu + self.ema_var = self.momentum * self.ema_var + (1 - self.momentum) * var + # store intermediate + self.cache = (xhat, gamma, xmu, ivar, sqrtvar, var) + + return out + + def forward_pass(self, X): + if len(X.shape) == 2: + # input is a regular layer + return self._forward_pass(X) + elif len(X.shape) == 4: + # input is a convolution layer + N, C, H, W = X.shape + x_flat = X.transpose(0, 2, 3, 1).reshape(-1, C) + out_flat = self._forward_pass(x_flat) + return out_flat.reshape(N, H, W, C).transpose(0, 3, 1, 2) + else: + raise NotImplementedError('Unknown model with dimensions = {}'.format(len(X.shape))) + + def _backward_pass(self, delta): + # unfold the variables stored in cache + xhat, gamma, xmu, ivar, sqrtvar, var = self.cache + + # get the dimensions of the input/output + N, D = delta.shape + + # step9 + dbeta = np.sum(delta, axis=0) + dgammax = delta # not necessary, but more understandable + + # step8 + dgamma = np.sum(dgammax * xhat, axis=0) + dxhat = dgammax * gamma + + # step7 + divar = np.sum(dxhat * xmu, axis=0) + dxmu1 = dxhat * ivar + + # step6 + dsqrtvar = -1. / (sqrtvar ** 2) * divar + + # step5 + dvar = 0.5 * 1. / np.sqrt(var + self.eps) * dsqrtvar + + # step4 + dsq = 1. / N * np.ones((N, D)) * dvar + + # step3 + dxmu2 = 2 * xmu * dsq + + # step2 + dx1 = (dxmu1 + dxmu2) + dmu = -1 * np.sum(dxmu1 + dxmu2, axis=0) + + # step1 + dx2 = 1. 
/ N * np.ones((N, D)) * dmu + + # step0 + dx = dx1 + dx2 + + # Update gradient values + self._params.update_grad('W', dgamma) + self._params.update_grad('b', dbeta) + + return dx + + def backward_pass(self, X): + if len(X.shape) == 2: + # input is a regular layer + return self._backward_pass(X) + elif len(X.shape) == 4: + # input is a convolution layer + N, C, H, W = X.shape + x_flat = X.transpose(0, 2, 3, 1).reshape(-1, C) + out_flat = self._backward_pass(x_flat) + return out_flat.reshape(N, H, W, C).transpose(0, 3, 1, 2) + else: + raise NotImplementedError('Unknown model shape: {}'.format(X.shape)) + + def shape(self, x_shape): + return x_shape From 528d623fd6bb961cc29e5adb9d680c31810432d9 Mon Sep 17 00:00:00 2001 From: Artem Golubin Date: Mon, 18 Feb 2019 14:34:59 +0300 Subject: [PATCH 19/49] Update normalization.py --- mla/neuralnet/layers/normalization.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/mla/neuralnet/layers/normalization.py b/mla/neuralnet/layers/normalization.py index 72448682..b5f2cdd3 100644 --- a/mla/neuralnet/layers/normalization.py +++ b/mla/neuralnet/layers/normalization.py @@ -2,6 +2,10 @@ from mla.neuralnet.parameters import Parameters import numpy as np +""" +References: +https://kratzert.github.io/2016/02/12/understanding-the-gradient-flow-through-the-batch-normalization-layer.html +""" class BatchNormalization(Layer, ParamMixin, PhaseMixin): def __init__(self, momentum=0.9, eps=1e-5, parameters=None): From 6e383f73e87ff1afb62ff4d711e4d8dd245ae923 Mon Sep 17 00:00:00 2001 From: Artem Golubin Date: Mon, 18 Feb 2019 14:36:01 +0300 Subject: [PATCH 20/49] Update AUTHORS --- AUTHORS | 1 + 1 file changed, 1 insertion(+) diff --git a/AUTHORS b/AUTHORS index f92dead0..3e699bab 100644 --- a/AUTHORS +++ b/AUTHORS @@ -14,3 +14,4 @@ keineahnung2345 lucaskolstad vincent tang xq5he +LanderTome From 31c38db00b86468c21e9597ba1e84df25e71377b Mon Sep 17 00:00:00 2001 From: lm Date: Tue, 12 Mar 2019 13:34:20 +0800 Subject: [PATCH 21/49] fix sigmoid function --- mla/linear_models.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mla/linear_models.py b/mla/linear_models.py index a533f1cb..5aff07e1 100644 --- a/mla/linear_models.py +++ b/mla/linear_models.py @@ -127,7 +127,7 @@ def _loss(self, w): @staticmethod def sigmoid(x): - return 0.5 * (np.tanh(x) + 1) + return 0.5 * (np.tanh(0.5 * x) + 1) def _predict(self, X=None): X = self._add_intercept(X) From 9f22724a36705b7380e2c2e7771ddd895e64df7c Mon Sep 17 00:00:00 2001 From: Andrew Melnik Date: Tue, 12 Mar 2019 17:20:23 +0100 Subject: [PATCH 22/49] pip install . 
- bug fix - mla/rl was missing --- mla/rl/__init__.py | 1 + 1 file changed, 1 insertion(+) create mode 100644 mla/rl/__init__.py diff --git a/mla/rl/__init__.py b/mla/rl/__init__.py new file mode 100644 index 00000000..8b137891 --- /dev/null +++ b/mla/rl/__init__.py @@ -0,0 +1 @@ + From 8f49f660e979510bdf27db1c5cb05e604256c22d Mon Sep 17 00:00:00 2001 From: Andrew Melnik Date: Wed, 13 Mar 2019 16:06:05 +0100 Subject: [PATCH 23/49] bug fix self.memory_limit --- mla/rl/dqn.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mla/rl/dqn.py b/mla/rl/dqn.py index b3b9b461..07ade405 100644 --- a/mla/rl/dqn.py +++ b/mla/rl/dqn.py @@ -118,7 +118,7 @@ def train(self, render=False): break # Remove old entries from replay memory - if len(self.replay) > self.memory_limit: + while len(self.replay) > self.memory_limit: self.replay.pop(0) self.epsilon = self.min_epsilon + (1.0 - self.min_epsilon) * np.exp(-self.decay * ep) From 6789901b630e689ed543a8114f50c629065925d3 Mon Sep 17 00:00:00 2001 From: Artem Golubin Date: Wed, 13 Mar 2019 23:27:33 +0300 Subject: [PATCH 24/49] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index aa4868d4..2198ede9 100644 --- a/README.md +++ b/README.md @@ -27,7 +27,7 @@ All algorithms are implemented in Python, using numpy, scipy and autograd. git clone https://github.com/rushter/MLAlgorithms cd MLAlgorithms pip install scipy numpy - pip install . + python setup.py develop ### How to run examples without installation cd MLAlgorithms From a1c47cb2aecd6ab1ae687f0470588f85f0807692 Mon Sep 17 00:00:00 2001 From: Artem Golubin Date: Wed, 13 Mar 2019 23:29:11 +0300 Subject: [PATCH 25/49] Update AUTHORS --- AUTHORS | 2 ++ 1 file changed, 2 insertions(+) diff --git a/AUTHORS b/AUTHORS index 3e699bab..1f9f89c1 100644 --- a/AUTHORS +++ b/AUTHORS @@ -15,3 +15,5 @@ lucaskolstad vincent tang xq5he LanderTome +therickli +Andrew Melnik From c5cc88ea48f87dd200ef9b391f7a9289dc411205 Mon Sep 17 00:00:00 2001 From: Andrew Melnik Date: Thu, 14 Mar 2019 12:13:55 +0100 Subject: [PATCH 26/49] bug fix env.close() --- mla/rl/dqn.py | 1 + 1 file changed, 1 insertion(+) diff --git a/mla/rl/dqn.py b/mla/rl/dqn.py index 07ade405..f2ae4fa8 100644 --- a/mla/rl/dqn.py +++ b/mla/rl/dqn.py @@ -141,3 +141,4 @@ def play(self, episodes): if done: break logger.info('Episode: %s, reward %s' % (i, total_reward)) + self.env.close() From 62633dd30230a8bdc13826b37ea51ce39df69fb9 Mon Sep 17 00:00:00 2001 From: Artem Golubin Date: Sat, 11 May 2019 18:14:43 +0300 Subject: [PATCH 27/49] Format code using black --- examples/gaussian_mixture.py | 6 +-- examples/gbm.py | 37 ++++++-------- examples/kmeans.py | 7 ++- examples/linear_models.py | 28 +++++----- examples/naive_bayes.py | 13 +++-- examples/nearest_neighbors.py | 33 +++++++----- examples/nnet_convnet_mnist.py | 17 +++---- examples/nnet_mlp.py | 46 ++++++++--------- examples/nnet_rnn_binary_add.py | 13 ++--- examples/nnet_rnn_text_generation.py | 18 +++---- examples/pca.py | 15 +++--- examples/random_forest.py | 29 +++++------ examples/rbm.py | 4 +- examples/rl_deep_q_learning.py | 13 ++--- examples/svm.py | 13 +++-- examples/t-sne.py | 7 +-- mla/base/base.py | 8 +-- mla/datasets/__init__.py | 2 - mla/datasets/base.py | 22 ++++---- mla/ensemble/base.py | 16 +++--- mla/ensemble/gbm.py | 19 ++++--- mla/ensemble/random_forest.py | 41 ++++++++++----- mla/ensemble/tree.py | 32 ++++++------ mla/fm.py | 7 +-- mla/gaussian_mixture.py | 40 ++++++++------- mla/kmeans.py | 20 ++++---- 
mla/knn.py | 6 +-- mla/linear_models.py | 8 +-- mla/metrics/base.py | 4 +- mla/metrics/distance.py | 2 +- mla/metrics/metrics.py | 8 +-- mla/metrics/tests/test_metrics.py | 20 ++++---- mla/naive_bayes.py | 1 + mla/neuralnet/activations.py | 2 +- mla/neuralnet/constraints.py | 2 +- mla/neuralnet/initializations.py | 12 +++-- mla/neuralnet/layers/__init__.py | 2 +- mla/neuralnet/layers/basic.py | 12 ++--- mla/neuralnet/layers/convnet.py | 25 ++++----- mla/neuralnet/layers/normalization.py | 31 ++++++------ mla/neuralnet/layers/recurrent/lstm.py | 62 ++++++++++++----------- mla/neuralnet/layers/recurrent/rnn.py | 22 ++++---- mla/neuralnet/loss.py | 2 +- mla/neuralnet/nnet.py | 15 +++--- mla/neuralnet/optimizers.py | 27 +++++----- mla/neuralnet/parameters.py | 10 ++-- mla/neuralnet/regularizers.py | 1 + mla/neuralnet/tests/test_activations.py | 11 +--- mla/neuralnet/tests/test_optimizers.py | 20 ++++---- mla/pca.py | 10 ++-- mla/rbm.py | 17 +++---- mla/rl/dqn.py | 16 +++--- mla/svm/kernerls.py | 6 +-- mla/svm/svm.py | 18 +++++-- mla/tests/test_classification_accuracy.py | 30 +++++------ mla/tests/test_reduction.py | 10 ++-- mla/tests/test_regression_accuracy.py | 23 ++++----- mla/tsne.py | 16 +++--- 58 files changed, 481 insertions(+), 476 deletions(-) diff --git a/examples/gaussian_mixture.py b/examples/gaussian_mixture.py index 16d1fb8e..e8dfce15 100644 --- a/examples/gaussian_mixture.py +++ b/examples/gaussian_mixture.py @@ -19,7 +19,7 @@ def make_clusters(skew=True, *arg, **kwargs): def KMeans_and_GMM(K): - COLOR = 'bgrcmyk' + COLOR = "bgrcmyk" X, y = make_clusters(skew=True, n_samples=1500, centers=K) _, axes = plt.subplots(1, 3) @@ -29,14 +29,14 @@ def KMeans_and_GMM(K): axes[0].set_title("Ground Truth") # KMeans - kmeans = KMeans(K=K, init='++') + kmeans = KMeans(K=K, init="++") kmeans.fit(X) kmeans.predict() axes[1].set_title("KMeans") kmeans.plot(ax=axes[1], holdon=True) # Gaussian Mixture - gmm = GaussianMixture(K=K, init='kmeans') + gmm = GaussianMixture(K=K, init="kmeans") gmm.fit(X) axes[2].set_title("Gaussian Mixture") gmm.plot(ax=axes[2]) diff --git a/examples/gbm.py b/examples/gbm.py index 1f8ce1c3..99f14d55 100644 --- a/examples/gbm.py +++ b/examples/gbm.py @@ -3,6 +3,7 @@ from sklearn.datasets import make_classification from sklearn.datasets import make_regression from sklearn.metrics import roc_auc_score + try: from sklearn.model_selection import train_test_split except ImportError: @@ -16,39 +17,33 @@ def classification(): # Generate a random binary classification problem. 
- X, y = make_classification(n_samples=350, n_features=15, n_informative=10, - random_state=1111, n_classes=2, - class_sep=1., n_redundant=0) - X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, - random_state=1111) - - model = GradientBoostingClassifier(n_estimators=50, max_depth=4, - max_features=8, learning_rate=0.1) + X, y = make_classification( + n_samples=350, n_features=15, n_informative=10, random_state=1111, n_classes=2, class_sep=1.0, n_redundant=0 + ) + X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=1111) + + model = GradientBoostingClassifier(n_estimators=50, max_depth=4, max_features=8, learning_rate=0.1) model.fit(X_train, y_train) predictions = model.predict(X_test) print(predictions) print(predictions.min()) print(predictions.max()) - print('classification, roc auc score: %s' - % roc_auc_score(y_test, predictions)) + print("classification, roc auc score: %s" % roc_auc_score(y_test, predictions)) def regression(): # Generate a random regression problem - X, y = make_regression(n_samples=500, n_features=5, n_informative=5, - n_targets=1, noise=0.05, random_state=1111, - bias=0.5) - X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, - random_state=1111) - - model = GradientBoostingRegressor(n_estimators=25, max_depth=5, - max_features=3, ) + X, y = make_regression( + n_samples=500, n_features=5, n_informative=5, n_targets=1, noise=0.05, random_state=1111, bias=0.5 + ) + X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=1111) + + model = GradientBoostingRegressor(n_estimators=25, max_depth=5, max_features=3) model.fit(X_train, y_train) predictions = model.predict(X_test) - print('regression, mse: %s' - % mean_squared_error(y_test.flatten(), predictions.flatten())) + print("regression, mse: %s" % mean_squared_error(y_test.flatten(), predictions.flatten())) -if __name__ == '__main__': +if __name__ == "__main__": classification() # regression() diff --git a/examples/kmeans.py b/examples/kmeans.py index baa5c113..9e08a689 100644 --- a/examples/kmeans.py +++ b/examples/kmeans.py @@ -5,10 +5,9 @@ def kmeans_example(plot=False): - X, y = make_blobs(centers=4, n_samples=500, n_features=2, - shuffle=True, random_state=42) + X, y = make_blobs(centers=4, n_samples=500, n_features=2, shuffle=True, random_state=42) clusters = len(np.unique(y)) - k = KMeans(K=clusters, max_iters=150, init='++') + k = KMeans(K=clusters, max_iters=150, init="++") k.fit(X) k.predict() @@ -16,5 +15,5 @@ def kmeans_example(plot=False): k.plot() -if __name__ == '__main__': +if __name__ == "__main__": kmeans_example(plot=True) diff --git a/examples/linear_models.py b/examples/linear_models.py index 10c81fc8..e553661d 100644 --- a/examples/linear_models.py +++ b/examples/linear_models.py @@ -16,32 +16,30 @@ def regression(): # Generate a random regression problem - X, y = make_regression(n_samples=10000, n_features=100, - n_informative=75, n_targets=1, noise=0.05, - random_state=1111, bias=0.5) - X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, - random_state=1111) + X, y = make_regression( + n_samples=10000, n_features=100, n_informative=75, n_targets=1, noise=0.05, random_state=1111, bias=0.5 + ) + X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=1111) - model = LinearRegression(lr=0.01, max_iters=2000, penalty='l2', C=0.03) + model = LinearRegression(lr=0.01, max_iters=2000, penalty="l2", C=0.03) model.fit(X_train, y_train) 
predictions = model.predict(X_test) - print('regression mse', mean_squared_error(y_test, predictions)) + print("regression mse", mean_squared_error(y_test, predictions)) def classification(): # Generate a random binary classification problem. - X, y = make_classification(n_samples=1000, n_features=100, - n_informative=75, random_state=1111, - n_classes=2, class_sep=2.5, ) - X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, - random_state=1111) + X, y = make_classification( + n_samples=1000, n_features=100, n_informative=75, random_state=1111, n_classes=2, class_sep=2.5 + ) + X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=1111) - model = LogisticRegression(lr=0.01, max_iters=500, penalty='l1', C=0.01) + model = LogisticRegression(lr=0.01, max_iters=500, penalty="l1", C=0.01) model.fit(X_train, y_train) predictions = model.predict(X_test) - print('classification accuracy', accuracy(y_test, predictions)) + print("classification accuracy", accuracy(y_test, predictions)) -if __name__ == '__main__': +if __name__ == "__main__": regression() classification() diff --git a/examples/naive_bayes.py b/examples/naive_bayes.py index 43bd4156..383e997d 100644 --- a/examples/naive_bayes.py +++ b/examples/naive_bayes.py @@ -7,18 +7,17 @@ def classification(): # Generate a random binary classification problem. - X, y = make_classification(n_samples=1000, n_features=10, n_informative=10, - random_state=1111, n_classes=2, class_sep=2.5, - n_redundant=0) - X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, - random_state=1111) + X, y = make_classification( + n_samples=1000, n_features=10, n_informative=10, random_state=1111, n_classes=2, class_sep=2.5, n_redundant=0 + ) + X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=1111) model = NaiveBayesClassifier() model.fit(X_train, y_train) predictions = model.predict(X_test)[:, 1] - print('classification accuracy', roc_auc_score(y_test, predictions)) + print("classification accuracy", roc_auc_score(y_test, predictions)) -if __name__ == '__main__': +if __name__ == "__main__": classification() diff --git a/examples/nearest_neighbors.py b/examples/nearest_neighbors.py index 397502c3..d68bf208 100644 --- a/examples/nearest_neighbors.py +++ b/examples/nearest_neighbors.py @@ -12,33 +12,38 @@ def regression(): # Generate a random regression problem - X, y = make_regression(n_samples=500, n_features=5, - n_informative=5, n_targets=1, - noise=0.05, random_state=1111, bias=0.5) - X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, - random_state=1111) + X, y = make_regression( + n_samples=500, n_features=5, n_informative=5, n_targets=1, noise=0.05, random_state=1111, bias=0.5 + ) + X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=1111) model = knn.KNNRegressor(k=5, distance_func=distance.euclidean) model.fit(X_train, y_train) predictions = model.predict(X_test) - print('regression mse', mean_squared_error(y_test, predictions)) + print("regression mse", mean_squared_error(y_test, predictions)) def classification(): - X, y = make_classification(n_samples=500, n_features=5, n_informative=5, - n_redundant=0, n_repeated=0, n_classes=3, - random_state=1111, class_sep=1.5, ) - - X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, - random_state=1111) + X, y = make_classification( + n_samples=500, + n_features=5, + n_informative=5, + n_redundant=0, + n_repeated=0, + n_classes=3, + 
random_state=1111, + class_sep=1.5, + ) + + X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=1111) clf = knn.KNNClassifier(k=5, distance_func=distance.euclidean) clf.fit(X_train, y_train) predictions = clf.predict(X_test) - print('classification accuracy', accuracy(y_test, predictions)) + print("classification accuracy", accuracy(y_test, predictions)) -if __name__ == '__main__': +if __name__ == "__main__": regression() classification() diff --git a/examples/nnet_convnet_mnist.py b/examples/nnet_convnet_mnist.py index 4fb3ad2f..aff0b361 100644 --- a/examples/nnet_convnet_mnist.py +++ b/examples/nnet_convnet_mnist.py @@ -15,8 +15,8 @@ X_train, X_test, y_train, y_test = load_mnist() # Normalize data -X_train /= 255. -X_test /= 255. +X_train /= 255.0 +X_test /= 255.0 y_train = one_hot(y_train.flatten()) y_test = one_hot(y_test.flatten()) @@ -26,22 +26,21 @@ model = NeuralNet( layers=[ Convolution(n_filters=32, filter_shape=(3, 3), padding=(1, 1), stride=(1, 1)), - Activation('relu'), + Activation("relu"), Convolution(n_filters=32, filter_shape=(3, 3), padding=(1, 1), stride=(1, 1)), - Activation('relu'), + Activation("relu"), MaxPooling(pool_shape=(2, 2), stride=(2, 2)), Dropout(0.5), - Flatten(), Dense(128), - Activation('relu'), + Activation("relu"), Dropout(0.5), Dense(10), - Activation('softmax'), + Activation("softmax"), ], - loss='categorical_crossentropy', + loss="categorical_crossentropy", optimizer=Adadelta(), - metric='accuracy', + metric="accuracy", batch_size=128, max_epochs=3, ) diff --git a/examples/nnet_mlp.py b/examples/nnet_mlp.py index 4259c6e5..f35a4120 100644 --- a/examples/nnet_mlp.py +++ b/examples/nnet_mlp.py @@ -22,54 +22,50 @@ def classification(): # Generate a random binary classification problem. 
- X, y = make_classification(n_samples=1000, n_features=100, - n_informative=75, random_state=1111, - n_classes=2, class_sep=2.5, ) + X, y = make_classification( + n_samples=1000, n_features=100, n_informative=75, random_state=1111, n_classes=2, class_sep=2.5 + ) y = one_hot(y) - X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, - random_state=1111) + X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=1111) model = NeuralNet( layers=[ - Dense(256, Parameters(init='uniform', regularizers={'W': L2(0.05)})), - Activation('relu'), + Dense(256, Parameters(init="uniform", regularizers={"W": L2(0.05)})), + Activation("relu"), Dropout(0.5), - Dense(128, Parameters(init='normal', constraints={'W': MaxNorm()})), - Activation('relu'), + Dense(128, Parameters(init="normal", constraints={"W": MaxNorm()})), + Activation("relu"), Dense(2), - Activation('softmax'), + Activation("softmax"), ], - loss='categorical_crossentropy', + loss="categorical_crossentropy", optimizer=Adadelta(), - metric='accuracy', + metric="accuracy", batch_size=64, max_epochs=25, - ) model.fit(X_train, y_train) predictions = model.predict(X_test) - print('classification accuracy', roc_auc_score(y_test[:, 0], predictions[:, 0])) + print("classification accuracy", roc_auc_score(y_test[:, 0], predictions[:, 0])) def regression(): # Generate a random regression problem - X, y = make_regression(n_samples=5000, n_features=25, n_informative=25, - n_targets=1, random_state=100, noise=0.05) + X, y = make_regression(n_samples=5000, n_features=25, n_informative=25, n_targets=1, random_state=100, noise=0.05) y *= 0.01 - X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, - random_state=1111) + X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=1111) model = NeuralNet( layers=[ - Dense(64, Parameters(init='normal')), - Activation('linear'), - Dense(32, Parameters(init='normal')), - Activation('linear'), + Dense(64, Parameters(init="normal")), + Activation("linear"), + Dense(32, Parameters(init="normal")), + Activation("linear"), Dense(1), ], - loss='mse', + loss="mse", optimizer=Adam(), - metric='mse', + metric="mse", batch_size=256, max_epochs=15, ) @@ -78,6 +74,6 @@ def regression(): print("regression mse", mean_squared_error(y_test, predictions.flatten())) -if __name__ == '__main__': +if __name__ == "__main__": classification() regression() diff --git a/examples/nnet_rnn_binary_add.py b/examples/nnet_rnn_binary_add.py index 20dbbef1..d019201b 100644 --- a/examples/nnet_rnn_binary_add.py +++ b/examples/nnet_rnn_binary_add.py @@ -2,6 +2,7 @@ from itertools import combinations, islice import numpy as np + try: from sklearn.model_selection import train_test_split except ImportError: @@ -20,7 +21,7 @@ def addition_dataset(dim=10, n_samples=10000, batch_size=64): """Generate binary addition dataset. 
http://devankuleindiren.com/Projects/rnn_arithmetic.php """ - binary_format = '{:0' + str(dim) + 'b}' + binary_format = "{:0" + str(dim) + "b}" # Generate all possible number combinations combs = list(islice(combinations(range(2 ** (dim - 1)), 2), n_samples)) @@ -55,14 +56,10 @@ def addition_problem(ReccurentLayer): print(X_train.shape, X_test.shape) model = NeuralNet( - layers=[ - ReccurentLayer, - TimeDistributedDense(1), - Activation('sigmoid'), - ], - loss='mse', + layers=[ReccurentLayer, TimeDistributedDense(1), Activation("sigmoid")], + loss="mse", optimizer=Adam(), - metric='mse', + metric="mse", batch_size=64, max_epochs=15, ) diff --git a/examples/nnet_rnn_text_generation.py b/examples/nnet_rnn_text_generation.py index f29ed1af..50f6ff08 100644 --- a/examples/nnet_rnn_text_generation.py +++ b/examples/nnet_rnn_text_generation.py @@ -18,9 +18,10 @@ # Example taken from: https://github.com/fchollet/keras/blob/master/examples/lstm_text_generation.py + def sample(preds, temperature=1.0): # helper function to sample an index from a probability array - preds = np.asarray(preds).astype('float64') + preds = np.asarray(preds).astype("float64") preds = np.log(preds) / temperature exp_preds = np.exp(preds) preds = exp_preds / np.sum(exp_preds) @@ -38,7 +39,7 @@ def sample(preds, temperature=1.0): print(X.shape, y.shape) # LSTM OR RNN # rnn_layer = RNN(128, return_sequences=False) -rnn_layer = LSTM(128, return_sequences=False, ) +rnn_layer = LSTM(128, return_sequences=False) model = NeuralNet( layers=[ @@ -46,30 +47,29 @@ def sample(preds, temperature=1.0): # Flatten(), # TimeStepSlicer(-1), Dense(X.shape[2]), - Activation('softmax'), + Activation("softmax"), ], - loss='categorical_crossentropy', + loss="categorical_crossentropy", optimizer=RMSprop(learning_rate=0.01), - metric='accuracy', + metric="accuracy", batch_size=64, max_epochs=1, shuffle=False, - ) for _ in range(25): model.fit(X, y) start_index = random.randint(0, len(text) - maxlen - 1) - generated = '' - sentence = text[start_index: start_index + maxlen] + generated = "" + sentence = text[start_index : start_index + maxlen] generated += sentence print('----- Generating with seed: "' + sentence + '"') sys.stdout.write(generated) for i in range(100): x = np.zeros((64, maxlen, len(chars))) for t, char in enumerate(sentence): - x[0, t, char_indices[char]] = 1. + x[0, t, char_indices[char]] = 1.0 preds = model.predict(x)[0] next_index = sample(preds, 0.5) next_char = indices_char[next_index] diff --git a/examples/pca.py b/examples/pca.py index 63290c83..4b7bf3ac 100644 --- a/examples/pca.py +++ b/examples/pca.py @@ -11,23 +11,22 @@ # logging.basicConfig(level=logging.DEBUG) # Generate a random binary classification problem. 
-X, y = make_classification(n_samples=1000, n_features=100, n_informative=75, - random_state=1111, n_classes=2, class_sep=2.5, ) +X, y = make_classification( + n_samples=1000, n_features=100, n_informative=75, random_state=1111, n_classes=2, class_sep=2.5 +) -X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, - random_state=1111) +X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=1111) -for s in ['svd', 'eigen']: +for s in ["svd", "eigen"]: p = PCA(15, solver=s) # fit PCA with training data, not entire dataset p.fit(X_train) X_train_reduced = p.transform(X_train) X_test_reduced = p.transform(X_test) - + model = LogisticRegression(lr=0.001, max_iters=2500) model.fit(X_train_reduced, y_train) predictions = model.predict(X_test_reduced) - print('Classification accuracy for %s PCA: %s' - % (s, accuracy(y_test, predictions))) + print("Classification accuracy for %s PCA: %s" % (s, accuracy(y_test, predictions))) diff --git a/examples/random_forest.py b/examples/random_forest.py index b499776e..39f47377 100644 --- a/examples/random_forest.py +++ b/examples/random_forest.py @@ -3,6 +3,7 @@ from sklearn.datasets import make_classification from sklearn.datasets import make_regression from sklearn.metrics import roc_auc_score + try: from sklearn.model_selection import train_test_split except ImportError: @@ -16,36 +17,32 @@ def classification(): # Generate a random binary classification problem. - X, y = make_classification(n_samples=500, n_features=10, n_informative=10, - random_state=1111, n_classes=2, - class_sep=2.5, n_redundant=0) + X, y = make_classification( + n_samples=500, n_features=10, n_informative=10, random_state=1111, n_classes=2, class_sep=2.5, n_redundant=0 + ) - X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, - random_state=1111) + X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=1111) model = RandomForestClassifier(n_estimators=10, max_depth=4) model.fit(X_train, y_train) predictions = model.predict(X_test)[:, 1] # print(predictions) - print('classification, roc auc score: %s' - % roc_auc_score(y_test, predictions)) + print("classification, roc auc score: %s" % roc_auc_score(y_test, predictions)) def regression(): # Generate a random regression problem - X, y = make_regression(n_samples=500, n_features=5, n_informative=5, - n_targets=1, noise=0.05, random_state=1111, - bias=0.5) - X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, - random_state=1111) + X, y = make_regression( + n_samples=500, n_features=5, n_informative=5, n_targets=1, noise=0.05, random_state=1111, bias=0.5 + ) + X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=1111) - model = RandomForestRegressor(n_estimators=50, max_depth=10, max_features=3, ) + model = RandomForestRegressor(n_estimators=50, max_depth=10, max_features=3) model.fit(X_train, y_train) predictions = model.predict(X_test) - print('regression, mse: %s' - % mean_squared_error(y_test.flatten(), predictions.flatten())) + print("regression, mse: %s" % mean_squared_error(y_test.flatten(), predictions.flatten())) -if __name__ == '__main__': +if __name__ == "__main__": classification() # regression() diff --git a/examples/rbm.py b/examples/rbm.py index 0404b036..2d167644 100644 --- a/examples/rbm.py +++ b/examples/rbm.py @@ -13,7 +13,7 @@ def print_curve(rbm): def moving_average(a, n=25): ret = np.cumsum(a, dtype=float) ret[n:] = ret[n:] - ret[:-n] - return ret[n - 1:] / n + 
return ret[n - 1 :] / n plt.plot(moving_average(rbm.errors)) plt.show() @@ -23,5 +23,3 @@ def moving_average(a, n=25): rbm = RBM(n_hidden=10, max_epochs=200, batch_size=10, learning_rate=0.1) rbm.fit(X) print_curve(rbm) - - diff --git a/examples/rl_deep_q_learning.py b/examples/rl_deep_q_learning.py index 83bdd9a8..5626cd3a 100644 --- a/examples/rl_deep_q_learning.py +++ b/examples/rl_deep_q_learning.py @@ -10,24 +10,19 @@ def mlp_model(n_actions, batch_size=64): model = NeuralNet( - layers=[ - Dense(32), - Activation('relu'), - Dense(n_actions), - ], - loss='mse', + layers=[Dense(32), Activation("relu"), Dense(n_actions)], + loss="mse", optimizer=Adam(), - metric='mse', + metric="mse", batch_size=batch_size, max_epochs=1, verbose=False, - ) return model model = DQN(n_episodes=2500, batch_size=64) -model.init_environment('CartPole-v0') +model.init_environment("CartPole-v0") model.init_model(mlp_model) try: diff --git a/examples/svm.py b/examples/svm.py index 1eae1ea6..19535a4d 100644 --- a/examples/svm.py +++ b/examples/svm.py @@ -15,20 +15,19 @@ def classification(): # Generate a random binary classification problem. - X, y = make_classification(n_samples=1200, n_features=10, n_informative=5, - random_state=1111, n_classes=2, class_sep=1.75,) + X, y = make_classification( + n_samples=1200, n_features=10, n_informative=5, random_state=1111, n_classes=2, class_sep=1.75 + ) # Convert y to {-1, 1} y = (y * 2) - 1 - X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, - random_state=1111) + X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1111) for kernel in [RBF(gamma=0.1), Linear()]: model = SVM(max_iter=500, kernel=kernel, C=0.6) model.fit(X_train, y_train) predictions = model.predict(X_test) - print('Classification accuracy (%s): %s' - % (kernel, accuracy(y_test, predictions))) + print("Classification accuracy (%s): %s" % (kernel, accuracy(y_test, predictions))) -if __name__ == '__main__': +if __name__ == "__main__": classification() diff --git a/examples/t-sne.py b/examples/t-sne.py index cf00df9b..36873e91 100644 --- a/examples/t-sne.py +++ b/examples/t-sne.py @@ -7,13 +7,14 @@ logging.basicConfig(level=logging.DEBUG) -X, y = make_classification(n_samples=500, n_features=10, n_informative=5, n_redundant=0, random_state=1111, - n_classes=2, class_sep=2.5, ) +X, y = make_classification( + n_samples=500, n_features=10, n_informative=5, n_redundant=0, random_state=1111, n_classes=2, class_sep=2.5 +) p = TSNE(2, max_iter=500) X = p.fit_transform(X) -colors = ['red', 'green'] +colors = ["red", "green"] for t in range(2): t_mask = (y == t).astype(bool) plt.scatter(X[t_mask, 0], X[t_mask, 1], color=colors[t]) diff --git a/mla/base/base.py b/mla/base/base.py index cf4d7185..64998c36 100644 --- a/mla/base/base.py +++ b/mla/base/base.py @@ -27,7 +27,7 @@ def _setup_input(self, X, y=None): X = np.array(X) if X.size == 0: - raise ValueError('Number of features must be > 0') + raise ValueError("Number of features must be > 0") if X.ndim == 1: self.n_samples, self.n_features = 1, X.shape @@ -38,13 +38,13 @@ def _setup_input(self, X, y=None): if self.y_required: if y is None: - raise ValueError('Missed required argument y') + raise ValueError("Missed required argument y") if not isinstance(y, np.ndarray): y = np.array(y) if y.size == 0: - raise ValueError('Number of targets must be > 0') + raise ValueError("Number of targets must be > 0") self.y = y @@ -58,7 +58,7 @@ def predict(self, X=None): if self.X is not None or not self.fit_required: return 
self._predict(X) else: - raise ValueError('You must call `fit` before `predict`') + raise ValueError("You must call `fit` before `predict`") def _predict(self, X=None): raise NotImplementedError() diff --git a/mla/datasets/__init__.py b/mla/datasets/__init__.py index e9a972bf..b1992552 100644 --- a/mla/datasets/__init__.py +++ b/mla/datasets/__init__.py @@ -1,3 +1 @@ - - from mla.datasets.base import * diff --git a/mla/datasets/base.py b/mla/datasets/base.py index f58d918a..a7b9ed63 100644 --- a/mla/datasets/base.py +++ b/mla/datasets/base.py @@ -13,18 +13,18 @@ def load(dataset="training", digits=np.arange(10)): from numpy import array, int8, uint8, zeros if dataset == "train": - fname_img = get_filename('data/mnist/train-images-idx3-ubyte') - fname_lbl = get_filename('data/mnist/train-labels-idx1-ubyte') + fname_img = get_filename("data/mnist/train-images-idx3-ubyte") + fname_lbl = get_filename("data/mnist/train-labels-idx1-ubyte") elif dataset == "test": - fname_img = get_filename('data/mnist/t10k-images-idx3-ubyte') - fname_lbl = get_filename('data/mnist/t10k-labels-idx1-ubyte') + fname_img = get_filename("data/mnist/t10k-images-idx3-ubyte") + fname_lbl = get_filename("data/mnist/t10k-labels-idx1-ubyte") - flbl = open(fname_lbl, 'rb') + flbl = open(fname_lbl, "rb") magic_nr, size = struct.unpack(">II", flbl.read(8)) lbl = pyarray("b", flbl.read()) flbl.close() - fimg = open(fname_img, 'rb') + fimg = open(fname_img, "rb") magic_nr, size, rows, cols = struct.unpack(">IIII", fimg.read(16)) img = pyarray("B", fimg.read()) fimg.close() @@ -35,13 +35,13 @@ def load(dataset="training", digits=np.arange(10)): images = zeros((N, rows, cols), dtype=uint8) labels = zeros((N, 1), dtype=int8) for i in range(len(ind)): - images[i] = array(img[ind[i] * rows * cols: (ind[i] + 1) * rows * cols]).reshape((rows, cols)) + images[i] = array(img[ind[i] * rows * cols : (ind[i] + 1) * rows * cols]).reshape((rows, cols)) labels[i] = lbl[ind[i]] return images, labels - X_train, y_train = load('train') - X_test, y_test = load('test') + X_train, y_train = load("train") + X_test, y_test = load("test") X_train = X_train.reshape(X_train.shape[0], 1, 28, 28).astype(np.float32) X_test = X_test.reshape(X_test.shape[0], 1, 28, 28).astype(np.float32) @@ -50,7 +50,7 @@ def load(dataset="training", digits=np.arange(10)): def load_nietzsche(): - text = open(get_filename('data/nietzsche.txt')).read().decode('utf-8').lower() + text = open(get_filename("data/nietzsche.txt")).read().decode("utf-8").lower() chars = set(list(text)) char_indices = {ch: i for i, ch in enumerate(chars)} indices_char = {i: ch for i, ch in enumerate(chars)} @@ -60,7 +60,7 @@ def load_nietzsche(): sentences = [] next_chars = [] for i in range(0, len(text) - maxlen, step): - sentences.append(text[i: i + maxlen]) + sentences.append(text[i : i + maxlen]) next_chars.append(text[i + maxlen]) X = np.zeros((len(sentences), maxlen, len(chars)), dtype=np.bool) diff --git a/mla/ensemble/base.py b/mla/ensemble/base.py index b1d5b7c4..a8850f11 100644 --- a/mla/ensemble/base.py +++ b/mla/ensemble/base.py @@ -7,7 +7,7 @@ def f_entropy(p): p = np.bincount(p) / float(p.shape[0]) ep = stats.entropy(p) - if ep == -float('inf'): + if ep == -float("inf"): return 0.0 return ep @@ -23,22 +23,22 @@ def mse_criterion(y, splits): def xgb_criterion(y, left, right, loss): - left = loss.gain(left['actual'], left['y_pred']) - right = loss.gain(right['actual'], right['y_pred']) - initial = loss.gain(y['actual'], y['y_pred']) + left = loss.gain(left["actual"], left["y_pred"]) + 
right = loss.gain(right["actual"], right["y_pred"]) + initial = loss.gain(y["actual"], y["y_pred"]) gain = left + right - initial return gain def get_split_mask(X, column, value): - left_mask = (X[:, column] < value) - right_mask = (X[:, column] >= value) + left_mask = X[:, column] < value + right_mask = X[:, column] >= value return left_mask, right_mask def split(X, y, value): - left_mask = (X < value) - right_mask = (X >= value) + left_mask = X < value + right_mask = X >= value return y[left_mask], y[right_mask] diff --git a/mla/ensemble/gbm.py b/mla/ensemble/gbm.py index d683ac09..37db62cf 100644 --- a/mla/ensemble/gbm.py +++ b/mla/ensemble/gbm.py @@ -1,4 +1,5 @@ import numpy as np + # logistic function from scipy.special import expit @@ -40,7 +41,7 @@ def transform(self, pred): def gain(self, actual, predicted): """Calculate gain for split search.""" nominator = self.grad(actual, predicted).sum() ** 2 - denominator = (self.hess(actual, predicted).sum() + self.regularization) + denominator = self.hess(actual, predicted).sum() + self.regularization return 0.5 * (nominator / denominator) @@ -98,14 +99,20 @@ def _train(self): # Pass multiple target values to the tree learner targets = { # Residual values - 'y': residuals, + "y": residuals, # Actual target values - 'actual': self.y, + "actual": self.y, # Predictions from previous step - 'y_pred': y_pred + "y_pred": y_pred, } - tree.train(self.X, targets, max_features=self.max_features, - min_samples_split=self.min_samples_split, max_depth=self.max_depth, loss=self.loss) + tree.train( + self.X, + targets, + max_features=self.max_features, + min_samples_split=self.min_samples_split, + max_depth=self.max_depth, + loss=self.loss, + ) predictions = tree.predict(self.X) y_pred += self.learning_rate * predictions self.trees.append(tree) diff --git a/mla/ensemble/random_forest.py b/mla/ensemble/random_forest.py index 603e3a34..2052b7a4 100644 --- a/mla/ensemble/random_forest.py +++ b/mla/ensemble/random_forest.py @@ -34,25 +34,34 @@ def fit(self, X, y): if self.max_features is None: self.max_features = int(np.sqrt(X.shape[1])) else: - assert (X.shape[1] > self.max_features) + assert X.shape[1] > self.max_features self._train() def _train(self): for tree in self.trees: - tree.train(self.X, self.y, max_features=self.max_features, min_samples_split=self.min_samples_split, - max_depth=self.max_depth) + tree.train( + self.X, + self.y, + max_features=self.max_features, + min_samples_split=self.min_samples_split, + max_depth=self.max_depth, + ) def _predict(self, X=None): raise NotImplementedError() class RandomForestClassifier(RandomForest): - def __init__(self, n_estimators=10, max_features=None, min_samples_split=10, max_depth=None, criterion='entropy'): - super(RandomForestClassifier, self).__init__(n_estimators=n_estimators, max_features=max_features, - min_samples_split=min_samples_split, max_depth=max_depth, - criterion=criterion) - - if criterion == 'entropy': + def __init__(self, n_estimators=10, max_features=None, min_samples_split=10, max_depth=None, criterion="entropy"): + super(RandomForestClassifier, self).__init__( + n_estimators=n_estimators, + max_features=max_features, + min_samples_split=min_samples_split, + max_depth=max_depth, + criterion=criterion, + ) + + if criterion == "entropy": self.criterion = information_gain else: raise ValueError() @@ -76,11 +85,15 @@ def _predict(self, X=None): class RandomForestRegressor(RandomForest): - def __init__(self, n_estimators=10, max_features=None, min_samples_split=10, max_depth=None, 
criterion='mse'): - super(RandomForestRegressor, self).__init__(n_estimators=n_estimators, max_features=max_features, - min_samples_split=min_samples_split, max_depth=max_depth) - - if criterion == 'mse': + def __init__(self, n_estimators=10, max_features=None, min_samples_split=10, max_depth=None, criterion="mse"): + super(RandomForestRegressor, self).__init__( + n_estimators=n_estimators, + max_features=max_features, + min_samples_split=min_samples_split, + max_depth=max_depth, + ) + + if criterion == "mse": self.criterion = mse_criterion else: raise ValueError() diff --git a/mla/ensemble/tree.py b/mla/ensemble/tree.py index fbe3654f..1127c668 100644 --- a/mla/ensemble/tree.py +++ b/mla/ensemble/tree.py @@ -52,8 +52,8 @@ def _find_best_split(self, X, target, n_features): for value in split_values: if self.loss is None: # Random forest - splits = split(X[:, column], target['y'], value) - gain = self.criterion(target['y'], splits) + splits = split(X[:, column], target["y"], value) + gain = self.criterion(target["y"], splits) else: # Gradient boosting left, right = split_dataset(X, target, column, value, return_X=False) @@ -86,7 +86,7 @@ def train(self, X, target, max_features=None, min_samples_split=10, max_depth=No """ if not isinstance(target, dict): - target = {'y': target} + target = {"y": target} # Loss for gradient boosting if loss is not None: @@ -94,18 +94,18 @@ def train(self, X, target, max_features=None, min_samples_split=10, max_depth=No try: # Exit from recursion using assert syntax - assert (X.shape[0] > min_samples_split) - assert (max_depth > 0) + assert X.shape[0] > min_samples_split + assert max_depth > 0 if max_features is None: max_features = X.shape[1] - + column, value, gain = self._find_best_split(X, target, max_features) assert gain is not None if self.regression: - assert (gain != 0) + assert gain != 0 else: - assert (gain > minimum_gain) + assert gain > minimum_gain self.column_index = column self.threshold = value @@ -116,12 +116,14 @@ def train(self, X, target, max_features=None, min_samples_split=10, max_depth=No # Grow left and right child self.left_child = Tree(self.regression, self.criterion) - self.left_child.train(left_X, left_target, max_features, min_samples_split, max_depth - 1, - minimum_gain, loss) + self.left_child.train( + left_X, left_target, max_features, min_samples_split, max_depth - 1, minimum_gain, loss + ) self.right_child = Tree(self.regression, self.criterion) - self.right_child.train(right_X, right_target, max_features, min_samples_split, max_depth - 1, - minimum_gain, loss) + self.right_child.train( + right_X, right_target, max_features, min_samples_split, max_depth - 1, minimum_gain, loss + ) except AssertionError: self._calculate_leaf_value(target) @@ -129,15 +131,15 @@ def _calculate_leaf_value(self, targets): """Find optimal value for leaf.""" if self.loss is not None: # Gradient boosting - self.outcome = self.loss.approximate(targets['actual'], targets['y_pred']) + self.outcome = self.loss.approximate(targets["actual"], targets["y_pred"]) else: # Random Forest if self.regression: # Mean value for regression task - self.outcome = np.mean(targets['y']) + self.outcome = np.mean(targets["y"]) else: # Probability for classification task - self.outcome = stats.itemfreq(targets['y'])[:, 1] / float(targets['y'].shape[0]) + self.outcome = stats.itemfreq(targets["y"])[:, 1] / float(targets["y"].shape[0]) def predict_row(self, row): """Predict single row.""" diff --git a/mla/fm.py b/mla/fm.py index e663ace4..a43b04c7 100644 --- a/mla/fm.py +++ 
b/mla/fm.py @@ -12,8 +12,9 @@ class BaseFM(BaseEstimator): - def __init__(self, n_components=10, max_iter=100, init_stdev=0.1, learning_rate=0.01, reg_v=0.1, - reg_w=0.5, reg_w0=0.): + def __init__( + self, n_components=10, max_iter=100, init_stdev=0.1, learning_rate=0.01, reg_v=0.1, reg_w=0.5, reg_w0=0.0 + ): """Simplified factorization machines implementation using SGD optimizer.""" self.reg_w0 = reg_w0 self.reg_w = reg_w @@ -52,7 +53,7 @@ def _factor_step(self, loss): def _predict(self, X=None): linear_output = np.dot(X, self.w) - factors_output = np.sum(np.dot(X, self.v) ** 2 - np.dot(X ** 2, self.v ** 2), axis=1) / 2. + factors_output = np.sum(np.dot(X, self.v) ** 2 - np.dot(X ** 2, self.v ** 2), axis=1) / 2.0 return self.wo + linear_output + factors_output diff --git a/mla/gaussian_mixture.py b/mla/gaussian_mixture.py index bf7627ab..3b6b9d3c 100644 --- a/mla/gaussian_mixture.py +++ b/mla/gaussian_mixture.py @@ -37,7 +37,7 @@ class GaussianMixture(BaseEstimator): y_required = False - def __init__(self, K=4, init='random', max_iters=500, tolerance=1e-3): + def __init__(self, K=4, init="random", max_iters=500, tolerance=1e-3): self.K = K self.max_iters = max_iters self.init = init @@ -46,7 +46,7 @@ def __init__(self, K=4, init='random', max_iters=500, tolerance=1e-3): self.tolerance = tolerance def fit(self, X, y=None): - '''Perform Expectation–Maximization (EM) until converged.''' + """Perform Expectation–Maximization (EM) until converged.""" self._setup_input(X, y) self._initialize() for _ in range(self.max_iters): @@ -63,12 +63,12 @@ def _initialize(self): covs: the covariance matrix of the clusters """ self.weights = np.ones(self.K) - if self.init == 'random': + if self.init == "random": self.means = [self.X[x] for x in random.sample(range(self.n_samples), self.K)] self.covs = [np.cov(self.X.T) for _ in range(K)] - elif self.init == 'kmeans': - kmeans = KMeans(K=self.K, max_iters=self.max_iters // 3, init='++') + elif self.init == "kmeans": + kmeans = KMeans(K=self.K, max_iters=self.max_iters // 3, init="++") kmeans.fit(self.X) self.assignments = kmeans.predict() self.means = kmeans.centroids @@ -77,11 +77,11 @@ def _initialize(self): self.weights[int(i)] = (self.assignments == i).sum() self.covs.append(np.cov(self.X[self.assignments == i].T)) else: - raise ValueError('Unknown type of init parameter') + raise ValueError("Unknown type of init parameter") self.weights /= self.weights.sum() def _E_step(self): - '''Expectation(E-step) for Gaussian Mixture.''' + """Expectation(E-step) for Gaussian Mixture.""" likelihoods = self._get_likelihood(self.X) self.likelihood.append(likelihoods.sum()) weighted_likelihoods = self._get_weighted_likelihood(likelihoods) @@ -90,13 +90,14 @@ def _E_step(self): self.responsibilities = weighted_likelihoods def _M_step(self): - '''Maximization (M-step) for Gaussian Mixture.''' + """Maximization (M-step) for Gaussian Mixture.""" weights = self.responsibilities.sum(axis=0) for assignment in range(self.K): resp = self.responsibilities[:, assignment][:, np.newaxis] self.means[assignment] = (resp * self.X).sum(axis=0) / resp.sum() self.covs[assignment] = (self.X - self.means[assignment]).T.dot( - (self.X - self.means[assignment]) * resp) / weights[assignment] + (self.X - self.means[assignment]) * resp + ) / weights[assignment] self.weights = weights / weights.sum() def _is_converged(self): @@ -106,7 +107,7 @@ def _is_converged(self): return False def _predict(self, X): - '''Get the assignments for X with GMM clusters.''' + """Get the assignments for X with 
GMM clusters.""" if not X.shape: return self.assignments likelihoods = self._get_likelihood(X) @@ -125,7 +126,7 @@ def _get_weighted_likelihood(self, likelihood): return self.weights * likelihood def plot(self, data=None, ax=None, holdon=False): - '''Plot contour for 2D data.''' + """Plot contour for 2D data.""" if not (len(self.X.shape) == 2 and self.X.shape[1] == 2): raise AttributeError("Only support for visualizing 2D data.") @@ -138,16 +139,15 @@ def plot(self, data=None, ax=None, holdon=False): else: assignments = self.predict(data) - COLOR = 'bgrcmyk' + COLOR = "bgrcmyk" cmap = lambda assignment: COLOR[int(assignment) % len(COLOR)] # generate grid - delta = .025 - margin = .2 + delta = 0.025 + margin = 0.2 xmax, ymax = self.X.max(axis=0) + margin xmin, ymin = self.X.min(axis=0) - margin - axis_X, axis_Y = np.meshgrid(np.arange(xmin, xmax, delta), - np.arange(ymin, ymax, delta)) + axis_X, axis_Y = np.meshgrid(np.arange(xmin, xmax, delta), np.arange(ymin, ymax, delta)) def grid_gaussian_pdf(mean, cov): grid_array = np.array(list(zip(axis_X.flatten(), axis_Y.flatten()))) @@ -162,8 +162,12 @@ def grid_gaussian_pdf(mean, cov): # plot contours for assignment in range(self.K): - ax.contour(axis_X, axis_Y, grid_gaussian_pdf(self.means[assignment], self.covs[assignment]), - colors=cmap(assignment)) + ax.contour( + axis_X, + axis_Y, + grid_gaussian_pdf(self.means[assignment], self.covs[assignment]), + colors=cmap(assignment), + ) if not holdon: plt.show() diff --git a/mla/kmeans.py b/mla/kmeans.py index 65ded386..b54484c1 100644 --- a/mla/kmeans.py +++ b/mla/kmeans.py @@ -37,9 +37,10 @@ class KMeans(BaseEstimator): larger distances between initial clusters to improve convergence rates and avoid degenerate cases. """ + y_required = False - def __init__(self, K=5, max_iters=100, init='random'): + def __init__(self, K=5, max_iters=100, init="random"): self.K = K self.max_iters = max_iters self.clusters = [[] for _ in range(self.K)] @@ -49,15 +50,14 @@ def __init__(self, K=5, max_iters=100, init='random'): def _initialize_centroids(self, init): """Set the initial centroids.""" - if init == 'random': - self.centroids = [self.X[x] for x in - random.sample(range(self.n_samples), self.K)] - elif init == '++': + if init == "random": + self.centroids = [self.X[x] for x in random.sample(range(self.n_samples), self.K)] + elif init == "++": self.centroids = [random.choice(self.X)] while len(self.centroids) < self.K: self.centroids.append(self._choose_next_center()) else: - raise ValueError('Unknown type of init parameter') + raise ValueError("Unknown type of init parameter") def _predict(self, X=None): """Perform clustering on the dataset.""" @@ -117,8 +117,8 @@ def _dist_from_centers(self): def _choose_next_center(self): distances = self._dist_from_centers() - squared_distances = distances**2 - probs = squared_distances/squared_distances.sum() + squared_distances = distances ** 2 + probs = squared_distances / squared_distances.sum() ind = np.random.choice(self.X.shape[0], 1, p=probs)[0] return self.X[ind] @@ -137,14 +137,12 @@ def plot(self, ax=None, holdon=False): if ax is None: _, ax = plt.subplots() - - for i, index in enumerate(self.clusters): point = np.array(data[index]).T ax.scatter(*point, c=sns.color_palette("hls", self.K + 1)[i]) for point in self.centroids: - ax.scatter(*point, marker='x', linewidths=10) + ax.scatter(*point, marker="x", linewidths=10) if not holdon: plt.show() diff --git a/mla/knn.py b/mla/knn.py index 8f56f4ae..307c0243 100644 --- a/mla/knn.py +++ b/mla/knn.py @@ -38,13 
+38,11 @@ def _predict_x(self, x): distances = (self.distance_func(x, example) for example in self.X) # Sort all examples by their distance to x and keep their target value. - neighbors = sorted(((dist, target) - for (dist, target) in zip(distances, self.y)), - key=lambda x: x[0]) + neighbors = sorted(((dist, target) for (dist, target) in zip(distances, self.y)), key=lambda x: x[0]) # Get targets of the k-nn and aggregate them (most common one or # average). - neighbors_targets = [target for (_, target) in neighbors[:self.k]] + neighbors_targets = [target for (_, target) in neighbors[: self.k]] return self.aggregate(neighbors_targets) diff --git a/mla/linear_models.py b/mla/linear_models.py index 5aff07e1..537fa03c 100644 --- a/mla/linear_models.py +++ b/mla/linear_models.py @@ -10,7 +10,7 @@ class BasicRegression(BaseEstimator): - def __init__(self, lr=0.001, penalty='None', C=0.01, tolerance=0.0001, max_iters=1000): + def __init__(self, lr=0.001, penalty="None", C=0.01, tolerance=0.0001, max_iters=1000): """Basic class for implementing continuous regression estimators which are trained with gradient descent optimization on their particular loss function. @@ -78,7 +78,7 @@ def _add_intercept(X): def _train(self): self.theta, self.errors = self._gradient_descent() - logging.info(' Theta: %s' % self.theta.flatten()) + logging.info(" Theta: %s" % self.theta.flatten()) def _predict(self, X=None): X = self._add_intercept(X) @@ -95,11 +95,11 @@ def _gradient_descent(self): theta -= self.lr * delta errors.append(self._cost(self.X, self.y, theta)) - logging.info('Iteration %s, error %s' % (i, errors[i])) + logging.info("Iteration %s, error %s" % (i, errors[i])) error_diff = np.linalg.norm(errors[i - 1] - errors[i]) if error_diff < self.tolerance: - logging.info('Convergence has reached.') + logging.info("Convergence has reached.") break return theta, errors diff --git a/mla/metrics/base.py b/mla/metrics/base.py index b48ab2be..dff0217c 100644 --- a/mla/metrics/base.py +++ b/mla/metrics/base.py @@ -9,10 +9,10 @@ def check_data(a, b): b = np.array(b) if type(a) != type(b): - raise ValueError('Type mismatch: %s and %s' % (type(a), type(b))) + raise ValueError("Type mismatch: %s and %s" % (type(a), type(b))) if a.size != b.size: - raise ValueError('Arrays must be equal in length.') + raise ValueError("Arrays must be equal in length.") return a, b diff --git a/mla/metrics/distance.py b/mla/metrics/distance.py index 0bf65f29..263ea68a 100644 --- a/mla/metrics/distance.py +++ b/mla/metrics/distance.py @@ -12,4 +12,4 @@ def euclidean_distance(a, b): def l2_distance(X): sum_X = np.sum(X * X, axis=1) - return (-2 * np.dot(X, X.T) + sum_X).T + sum_X \ No newline at end of file + return (-2 * np.dot(X, X.T) + sum_X).T + sum_X diff --git a/mla/metrics/metrics.py b/mla/metrics/metrics.py index 5b609374..a74fb90c 100644 --- a/mla/metrics/metrics.py +++ b/mla/metrics/metrics.py @@ -5,6 +5,7 @@ def unhot(function): """Convert one-hot representation into one column.""" + def wrapper(actual, predicted): if len(actual.shape) > 1 and actual.shape[1] > 1: actual = actual.argmax(axis=1) @@ -64,13 +65,12 @@ def logloss(actual, predicted): def hinge(actual, predicted): - return np.mean(np.max(1. 
- actual * predicted, 0.)) + return np.mean(np.max(1.0 - actual * predicted, 0.0)) def binary_crossentropy(actual, predicted): predicted = np.clip(predicted, EPS, 1 - EPS) - return np.mean(-np.sum(actual * np.log(predicted) + - (1 - actual) * np.log(1 - predicted))) + return np.mean(-np.sum(actual * np.log(predicted) + (1 - actual) * np.log(1 - predicted))) # aliases @@ -84,4 +84,4 @@ def get_metric(name): try: return globals()[name] except: - raise ValueError('Invalid metric function.') + raise ValueError("Invalid metric function.") diff --git a/mla/metrics/tests/test_metrics.py b/mla/metrics/tests/test_metrics.py index f75b92fa..307dca70 100644 --- a/mla/metrics/tests/test_metrics.py +++ b/mla/metrics/tests/test_metrics.py @@ -26,63 +26,63 @@ def metric(name): def test_classification_error(): - f = metric('classification_error') + f = metric("classification_error") assert f([1, 2, 3, 4], [1, 2, 3, 4]) == 0 assert f([1, 2, 3, 4], [1, 2, 3, 5]) == 0.25 assert f([1, 1, 1, 0, 0, 0], [1, 1, 1, 1, 0, 0]) == (1.0 / 6) def test_absolute_error(): - f = metric('absolute_error') + f = metric("absolute_error") assert f([3], [5]) == [2] assert f([-1], [-4]) == [3] def test_mean_absolute_error(): - f = metric('mean_absolute_error') + f = metric("mean_absolute_error") assert f([1, 2, 3], [1, 2, 3]) == 0 assert f([1, 2, 3], [3, 2, 1]) == 4 / 3 def test_squared_error(): - f = metric('squared_error') + f = metric("squared_error") assert f([1], [1]) == [0] assert f([3], [1]) == [4] def test_squared_log_error(): - f = metric('squared_log_error') + f = metric("squared_log_error") assert f([1], [1]) == [0] assert f([3], [1]) == [np.log(2) ** 2] assert f([np.exp(2) - 1], [np.exp(1) - 1]) == [1.0] def test_mean_squared_log_error(): - f = metric('mean_squared_log_error') + f = metric("mean_squared_log_error") assert f([1, 2, 3], [1, 2, 3]) == 0 assert f([1, 2, 3, np.exp(1) - 1], [1, 2, 3, np.exp(2) - 1]) == 0.25 def test_root_mean_squared_log_error(): - f = metric('root_mean_squared_log_error') + f = metric("root_mean_squared_log_error") assert f([1, 2, 3], [1, 2, 3]) == 0 assert f([1, 2, 3, np.exp(1) - 1], [1, 2, 3, np.exp(2) - 1]) == 0.5 def test_mean_squared_error(): - f = metric('mean_squared_error') + f = metric("mean_squared_error") assert f([1, 2, 3], [1, 2, 3]) == 0 assert f(range(1, 5), [1, 2, 3, 6]) == 1 def test_root_mean_squared_error(): - f = metric('root_mean_squared_error') + f = metric("root_mean_squared_error") assert f([1, 2, 3], [1, 2, 3]) == 0 assert f(range(1, 5), [1, 2, 3, 5]) == 0.5 def test_multiclass_logloss(): - f = metric('logloss') + f = metric("logloss") assert_almost_equal(f([1], [1]), 0) assert_almost_equal(f([1, 1], [1, 1]), 0) assert_almost_equal(f([1], [0.5]), -np.log(0.5)) diff --git a/mla/naive_bayes.py b/mla/naive_bayes.py index 3b941a22..d98ca801 100644 --- a/mla/naive_bayes.py +++ b/mla/naive_bayes.py @@ -5,6 +5,7 @@ class NaiveBayesClassifier(BaseEstimator): """Gaussian Naive Bayes.""" + # Binary problem. 
n_classes = 2 diff --git a/mla/neuralnet/activations.py b/mla/neuralnet/activations.py index d13b6c2c..9ec7ffd7 100644 --- a/mla/neuralnet/activations.py +++ b/mla/neuralnet/activations.py @@ -48,4 +48,4 @@ def get_activation(name): try: return globals()[name] except: - raise ValueError('Invalid activation function.') + raise ValueError("Invalid activation function.") diff --git a/mla/neuralnet/constraints.py b/mla/neuralnet/constraints.py index b0b73549..ccc1e4a2 100644 --- a/mla/neuralnet/constraints.py +++ b/mla/neuralnet/constraints.py @@ -23,7 +23,7 @@ def clip(self, p): class NonNeg(object): def clip(self, p): - p[p < 0.] = 0. + p[p < 0.0] = 0.0 return p diff --git a/mla/neuralnet/initializations.py b/mla/neuralnet/initializations.py index eadc57d3..a380f459 100644 --- a/mla/neuralnet/initializations.py +++ b/mla/neuralnet/initializations.py @@ -5,6 +5,8 @@ http://jmlr.org/proceedings/papers/v9/glorot10a/glorot10a.pdf """ + + def normal(shape, scale=0.5): return np.random.normal(size=shape, scale=scale) @@ -43,25 +45,25 @@ def _glorot_fan(shape): def glorot_normal(shape, **kwargs): fan_in, fan_out = _glorot_fan(shape) - s = np.sqrt(2. / (fan_in + fan_out)) + s = np.sqrt(2.0 / (fan_in + fan_out)) return normal(shape, s) def glorot_uniform(shape, **kwargs): fan_in, fan_out = _glorot_fan(shape) - s = np.sqrt(6. / (fan_in + fan_out)) + s = np.sqrt(6.0 / (fan_in + fan_out)) return uniform(shape, s) def he_normal(shape, **kwargs): fan_in, fan_out = _glorot_fan(shape) - s = np.sqrt(2. / fan_in) + s = np.sqrt(2.0 / fan_in) return normal(shape, s) def he_uniform(shape, **kwargs): fan_in, fan_out = _glorot_fan(shape) - s = np.sqrt(6. / fan_in) + s = np.sqrt(6.0 / fan_in) return uniform(shape, s) @@ -70,4 +72,4 @@ def get_initializer(name): try: return globals()[name] except: - raise ValueError('Invalid initialization function.') + raise ValueError("Invalid initialization function.") diff --git a/mla/neuralnet/layers/__init__.py b/mla/neuralnet/layers/__init__.py index 123a814d..5f4690ad 100644 --- a/mla/neuralnet/layers/__init__.py +++ b/mla/neuralnet/layers/__init__.py @@ -1,3 +1,3 @@ from .basic import * from .convnet import * -from .normalization import * \ No newline at end of file +from .normalization import * diff --git a/mla/neuralnet/layers/basic.py b/mla/neuralnet/layers/basic.py index 52a130e7..dc719a68 100644 --- a/mla/neuralnet/layers/basic.py +++ b/mla/neuralnet/layers/basic.py @@ -50,7 +50,7 @@ def is_testing(self, is_test=True): class Dense(Layer, ParamMixin): - def __init__(self, output_dim, parameters=None, ): + def __init__(self, output_dim, parameters=None): """A fully connected layer. 
Parameters @@ -72,17 +72,17 @@ def forward_pass(self, X): return self.weight(X) def weight(self, X): - W = np.dot(X, self._params['W']) - return W + self._params['b'] + W = np.dot(X, self._params["W"]) + return W + self._params["b"] def backward_pass(self, delta): dW = np.dot(self.last_input.T, delta) db = np.sum(delta, axis=0) # Update gradient values - self._params.update_grad('W', dW) - self._params.update_grad('b', db) - return np.dot(delta, self._params['W'].T) + self._params.update_grad("W", dW) + self._params.update_grad("b", db) + return np.dot(delta, self._params["W"].T) def shape(self, x_shape): return x_shape[0], self.output_dim diff --git a/mla/neuralnet/layers/convnet.py b/mla/neuralnet/layers/convnet.py index 27ee87a1..4d4a8291 100644 --- a/mla/neuralnet/layers/convnet.py +++ b/mla/neuralnet/layers/convnet.py @@ -35,26 +35,26 @@ def setup(self, X_shape): n_channels, self.height, self.width = X_shape[1:] W_shape = (self.n_filters, n_channels) + self.filter_shape - b_shape = (self.n_filters) + b_shape = self.n_filters self._params.setup_weights(W_shape, b_shape) def forward_pass(self, X): n_images, n_channels, height, width = self.shape(X.shape) self.last_input = X self.col = image_to_column(X, self.filter_shape, self.stride, self.padding) - self.col_W = self._params['W'].reshape(self.n_filters, -1).T + self.col_W = self._params["W"].reshape(self.n_filters, -1).T - out = np.dot(self.col, self.col_W) + self._params['b'] + out = np.dot(self.col, self.col_W) + self._params["b"] out = out.reshape(n_images, height, width, -1).transpose(0, 3, 1, 2) return out def backward_pass(self, delta): delta = delta.transpose(0, 2, 3, 1).reshape(-1, self.n_filters) - d_W = np.dot(self.col.T, delta).transpose(1, 0).reshape(self._params['W'].shape) + d_W = np.dot(self.col.T, delta).transpose(1, 0).reshape(self._params["W"].shape) d_b = np.sum(delta, axis=0) - self._params.update_grad('b', d_b) - self._params.update_grad('W', d_W) + self._params.update_grad("b", d_b) + self._params.update_grad("W", d_W) d_c = np.dot(delta, self.col_W.T) return column_to_image(d_c, self.last_input.shape, self.filter_shape, self.stride, self.padding) @@ -138,14 +138,14 @@ def image_to_column(images, filter_shape, stride, padding): n_images, n_channels, height, width = images.shape f_height, f_width = filter_shape out_height, out_width = convoltuion_shape(height, width, (f_height, f_width), stride, padding) - images = np.pad(images, ((0, 0), (0, 0), padding, padding), mode='constant') + images = np.pad(images, ((0, 0), (0, 0), padding, padding), mode="constant") col = np.zeros((n_images, n_channels, f_height, f_width, out_height, out_width)) for y in range(f_height): y_bound = y + stride[0] * out_height for x in range(f_width): x_bound = x + stride[1] * out_width - col[:, :, y, x, :, :] = images[:, :, y:y_bound:stride[0], x:x_bound:stride[1]] + col[:, :, y, x, :, :] = images[:, :, y : y_bound : stride[0], x : x_bound : stride[1]] col = col.transpose(0, 4, 5, 1, 2, 3).reshape(n_images * out_height * out_width, -1) return col @@ -166,8 +166,9 @@ def column_to_image(columns, images_shape, filter_shape, stride, padding): f_height, f_width = filter_shape out_height, out_width = convoltuion_shape(height, width, (f_height, f_width), stride, padding) - columns = columns.reshape(n_images, out_height, out_width, n_channels, f_height, f_width).transpose(0, 3, 4, 5, 1, - 2) + columns = columns.reshape(n_images, out_height, out_width, n_channels, f_height, f_width).transpose( + 0, 3, 4, 5, 1, 2 + ) img_h = height + 2 * padding[0] 
+ stride[0] - 1 img_w = width + 2 * padding[1] + stride[1] - 1 @@ -176,9 +177,9 @@ def column_to_image(columns, images_shape, filter_shape, stride, padding): y_bound = y + stride[0] * out_height for x in range(f_width): x_bound = x + stride[1] * out_width - img[:, :, y:y_bound:stride[0], x:x_bound:stride[1]] += columns[:, :, y, x, :, :] + img[:, :, y : y_bound : stride[0], x : x_bound : stride[1]] += columns[:, :, y, x, :, :] - return img[:, :, padding[0]:height + padding[0], padding[1]:width + padding[1]] + return img[:, :, padding[0] : height + padding[0], padding[1] : width + padding[1]] def convoltuion_shape(img_height, img_width, filter_shape, stride, padding): diff --git a/mla/neuralnet/layers/normalization.py b/mla/neuralnet/layers/normalization.py index b5f2cdd3..a51b4e81 100644 --- a/mla/neuralnet/layers/normalization.py +++ b/mla/neuralnet/layers/normalization.py @@ -7,6 +7,7 @@ https://kratzert.github.io/2016/02/12/understanding-the-gradient-flow-through-the-batch-normalization-layer.html """ + class BatchNormalization(Layer, ParamMixin, PhaseMixin): def __init__(self, momentum=0.9, eps=1e-5, parameters=None): super().__init__() @@ -22,15 +23,15 @@ def setup(self, x_shape): self._params.setup_weights((1, x_shape[1])) def _forward_pass(self, X): - gamma = self._params['W'] - beta = self._params['b'] + gamma = self._params["W"] + beta = self._params["b"] if self.is_testing: mu = self.ema_mean xmu = X - mu var = self.ema_var sqrtvar = np.sqrt(var + self.eps) - ivar = 1. / sqrtvar + ivar = 1.0 / sqrtvar xhat = xmu * ivar gammax = gamma * xhat return gammax + beta @@ -38,7 +39,7 @@ def _forward_pass(self, X): N, D = X.shape # step1: calculate mean - mu = 1. / N * np.sum(X, axis=0) + mu = 1.0 / N * np.sum(X, axis=0) # step2: subtract mean vector of every trainings example xmu = X - mu @@ -47,13 +48,13 @@ def _forward_pass(self, X): sq = xmu ** 2 # step4: calculate variance - var = 1. / N * np.sum(sq, axis=0) + var = 1.0 / N * np.sum(sq, axis=0) # step5: add eps for numerical stability, then sqrt sqrtvar = np.sqrt(var + self.eps) # step6: invert sqrtwar - ivar = 1. / sqrtvar + ivar = 1.0 / sqrtvar # step7: execute normalization xhat = xmu * ivar @@ -87,7 +88,7 @@ def forward_pass(self, X): out_flat = self._forward_pass(x_flat) return out_flat.reshape(N, H, W, C).transpose(0, 3, 1, 2) else: - raise NotImplementedError('Unknown model with dimensions = {}'.format(len(X.shape))) + raise NotImplementedError("Unknown model with dimensions = {}".format(len(X.shape))) def _backward_pass(self, delta): # unfold the variables stored in cache @@ -109,30 +110,30 @@ def _backward_pass(self, delta): dxmu1 = dxhat * ivar # step6 - dsqrtvar = -1. / (sqrtvar ** 2) * divar + dsqrtvar = -1.0 / (sqrtvar ** 2) * divar # step5 - dvar = 0.5 * 1. / np.sqrt(var + self.eps) * dsqrtvar + dvar = 0.5 * 1.0 / np.sqrt(var + self.eps) * dsqrtvar # step4 - dsq = 1. / N * np.ones((N, D)) * dvar + dsq = 1.0 / N * np.ones((N, D)) * dvar # step3 dxmu2 = 2 * xmu * dsq # step2 - dx1 = (dxmu1 + dxmu2) + dx1 = dxmu1 + dxmu2 dmu = -1 * np.sum(dxmu1 + dxmu2, axis=0) # step1 - dx2 = 1. 
/ N * np.ones((N, D)) * dmu + dx2 = 1.0 / N * np.ones((N, D)) * dmu # step0 dx = dx1 + dx2 # Update gradient values - self._params.update_grad('W', dgamma) - self._params.update_grad('b', dbeta) + self._params.update_grad("W", dgamma) + self._params.update_grad("b", dbeta) return dx @@ -147,7 +148,7 @@ def backward_pass(self, X): out_flat = self._backward_pass(x_flat) return out_flat.reshape(N, H, W, C).transpose(0, 3, 1, 2) else: - raise NotImplementedError('Unknown model shape: {}'.format(X.shape)) + raise NotImplementedError("Unknown model shape: {}".format(X.shape)) def shape(self, x_shape): return x_shape diff --git a/mla/neuralnet/layers/recurrent/lstm.py b/mla/neuralnet/layers/recurrent/lstm.py index 4e0da416..f298a200 100644 --- a/mla/neuralnet/layers/recurrent/lstm.py +++ b/mla/neuralnet/layers/recurrent/lstm.py @@ -15,7 +15,7 @@ class LSTM(Layer, ParamMixin): - def __init__(self, hidden_dim, activation='tanh', inner_init='orthogonal', parameters=None, return_sequences=True): + def __init__(self, hidden_dim, activation="tanh", inner_init="orthogonal", parameters=None, return_sequences=True): self.return_sequences = return_sequences self.hidden_dim = hidden_dim self.inner_init = get_initializer(inner_init) @@ -51,11 +51,11 @@ def setup(self, x_shape): """ self.input_dim = x_shape[2] # Input -> Hidden - W_params = ['W_i', 'W_f', 'W_o', 'W_c'] + W_params = ["W_i", "W_f", "W_o", "W_c"] # Hidden -> Hidden - U_params = ['U_i', 'U_f', 'U_o', 'U_c'] + U_params = ["U_i", "U_f", "U_o", "U_c"] # Bias terms - b_params = ['b_i', 'b_f', 'b_o', 'b_c'] + b_params = ["b_i", "b_f", "b_o", "b_c"] # Initialize params for param in W_params: @@ -84,7 +84,7 @@ def forward_pass(self, X): self.states = np.zeros((n_samples, n_timesteps + 1, self.hidden_dim)) self.outputs = np.zeros((n_samples, n_timesteps + 1, self.hidden_dim)) - self.gates = {k: np.zeros((n_samples, n_timesteps, self.hidden_dim)) for k in ['i', 'f', 'o', 'c']} + self.gates = {k: np.zeros((n_samples, n_timesteps, self.hidden_dim)) for k in ["i", "f", "o", "c"]} self.states[:, -1, :] = self.hprev self.outputs[:, -1, :] = self.oprev @@ -93,18 +93,20 @@ def forward_pass(self, X): t_gates = np.dot(X[:, i, :], self.W) + np.dot(self.outputs[:, i - 1, :], self.U) # Input - self.gates['i'][:, i, :] = sigmoid(t_gates[:, 0, :] + p['b_i']) + self.gates["i"][:, i, :] = sigmoid(t_gates[:, 0, :] + p["b_i"]) # Forget - self.gates['f'][:, i, :] = sigmoid(t_gates[:, 1, :] + p['b_f']) + self.gates["f"][:, i, :] = sigmoid(t_gates[:, 1, :] + p["b_f"]) # Output - self.gates['o'][:, i, :] = sigmoid(t_gates[:, 2, :] + p['b_o']) + self.gates["o"][:, i, :] = sigmoid(t_gates[:, 2, :] + p["b_o"]) # Cell - self.gates['c'][:, i, :] = self.activation(t_gates[:, 3, :] + p['b_c']) + self.gates["c"][:, i, :] = self.activation(t_gates[:, 3, :] + p["b_c"]) # (previous state * forget) + input + cell - self.states[:, i, :] = self.states[:, i - 1, :] * self.gates['f'][:, i, :] + \ - self.gates['i'][:, i, :] * self.gates['c'][:, i, :] - self.outputs[:, i, :] = self.gates['o'][:, i, :] * self.activation(self.states[:, i, :]) + self.states[:, i, :] = ( + self.states[:, i - 1, :] * self.gates["f"][:, i, :] + + self.gates["i"][:, i, :] * self.gates["c"][:, i, :] + ) + self.outputs[:, i, :] = self.gates["o"][:, i, :] * self.activation(self.states[:, i, :]) self.hprev = self.states[:, n_timesteps - 1, :].copy() self.oprev = self.outputs[:, n_timesteps - 1, :].copy() @@ -128,31 +130,31 @@ def backward_pass(self, delta): # Backpropagation through time for i in 
reversed(range(n_timesteps)): - dhi = delta[:, i, :] * self.gates['o'][:, i, :] * self.activation_d(self.states[:, i, :]) + dh_next + dhi = delta[:, i, :] * self.gates["o"][:, i, :] * self.activation_d(self.states[:, i, :]) + dh_next og = delta[:, i, :] * self.activation(self.states[:, i, :]) - de_o = og * self.sigmoid_d(self.gates['o'][:, i, :]) + de_o = og * self.sigmoid_d(self.gates["o"][:, i, :]) - grad['W_o'] += np.dot(self.last_input[:, i, :].T, de_o) - grad['U_o'] += np.dot(self.outputs[:, i - 1, :].T, de_o) - grad['b_o'] += de_o.sum(axis=0) + grad["W_o"] += np.dot(self.last_input[:, i, :].T, de_o) + grad["U_o"] += np.dot(self.outputs[:, i - 1, :].T, de_o) + grad["b_o"] += de_o.sum(axis=0) - de_f = (dhi * self.states[:, i - 1, :]) * self.sigmoid_d(self.gates['f'][:, i, :]) - grad['W_f'] += np.dot(self.last_input[:, i, :].T, de_f) - grad['U_f'] += np.dot(self.outputs[:, i - 1, :].T, de_f) - grad['b_f'] += de_f.sum(axis=0) + de_f = (dhi * self.states[:, i - 1, :]) * self.sigmoid_d(self.gates["f"][:, i, :]) + grad["W_f"] += np.dot(self.last_input[:, i, :].T, de_f) + grad["U_f"] += np.dot(self.outputs[:, i - 1, :].T, de_f) + grad["b_f"] += de_f.sum(axis=0) - de_i = (dhi * self.gates['c'][:, i, :]) * self.sigmoid_d(self.gates['i'][:, i, :]) - grad['W_i'] += np.dot(self.last_input[:, i, :].T, de_i) - grad['U_i'] += np.dot(self.outputs[:, i - 1, :].T, de_i) - grad['b_i'] += de_i.sum(axis=0) + de_i = (dhi * self.gates["c"][:, i, :]) * self.sigmoid_d(self.gates["i"][:, i, :]) + grad["W_i"] += np.dot(self.last_input[:, i, :].T, de_i) + grad["U_i"] += np.dot(self.outputs[:, i - 1, :].T, de_i) + grad["b_i"] += de_i.sum(axis=0) - de_c = (dhi * self.gates['i'][:, i, :]) * self.activation_d(self.gates['c'][:, i, :]) - grad['W_c'] += np.dot(self.last_input[:, i, :].T, de_c) - grad['U_c'] += np.dot(self.outputs[:, i - 1, :].T, de_c) - grad['b_c'] += de_c.sum(axis=0) + de_c = (dhi * self.gates["i"][:, i, :]) * self.activation_d(self.gates["c"][:, i, :]) + grad["W_c"] += np.dot(self.last_input[:, i, :].T, de_c) + grad["U_c"] += np.dot(self.outputs[:, i - 1, :].T, de_c) + grad["b_c"] += de_c.sum(axis=0) - dh_next = dhi * self.gates['f'][:, i, :] + dh_next = dhi * self.gates["f"][:, i, :] # TODO: propagate error to the next layer diff --git a/mla/neuralnet/layers/recurrent/rnn.py b/mla/neuralnet/layers/recurrent/rnn.py index 07ef182d..2c2687d6 100644 --- a/mla/neuralnet/layers/recurrent/rnn.py +++ b/mla/neuralnet/layers/recurrent/rnn.py @@ -10,7 +10,7 @@ class RNN(Layer, ParamMixin): """Vanilla RNN.""" - def __init__(self, hidden_dim, activation='tanh', inner_init='orthogonal', parameters=None, return_sequences=True): + def __init__(self, hidden_dim, activation="tanh", inner_init="orthogonal", parameters=None, return_sequences=True): self.return_sequences = return_sequences self.hidden_dim = hidden_dim self.inner_init = get_initializer(inner_init) @@ -34,11 +34,11 @@ def setup(self, x_shape): self.input_dim = x_shape[2] # Input -> Hidden - self._params['W'] = self._params.init((self.input_dim, self.hidden_dim)) + self._params["W"] = self._params.init((self.input_dim, self.hidden_dim)) # Bias - self._params['b'] = np.full((self.hidden_dim,), self._params.initial_bias) + self._params["b"] = np.full((self.hidden_dim,), self._params.initial_bias) # Hidden -> Hidden layer - self._params['U'] = self.inner_init((self.hidden_dim, self.hidden_dim)) + self._params["U"] = self.inner_init((self.hidden_dim, self.hidden_dim)) # Init gradient arrays self._params.init_grad() @@ -53,7 +53,7 @@ def forward_pass(self, X): 
p = self._params for i in range(n_timesteps): - states[:, i, :] = np.tanh(np.dot(X[:, i, :], p['W']) + np.dot(states[:, i - 1, :], p['U']) + p['b']) + states[:, i, :] = np.tanh(np.dot(X[:, i, :], p["W"]) + np.dot(states[:, i - 1, :], p["U"]) + p["b"]) self.states = states self.hprev = states[:, n_timesteps - 1, :].copy() @@ -78,14 +78,14 @@ def backward_pass(self, delta): for i in reversed(range(n_timesteps)): dhi = self.activation_d(self.states[:, i, :]) * (delta[:, i, :] + dh_next) - grad['W'] += np.dot(self.last_input[:, i, :].T, dhi) - grad['b'] += delta[:, i, :].sum(axis=0) - grad['U'] += np.dot(self.states[:, i - 1, :].T, dhi) + grad["W"] += np.dot(self.last_input[:, i, :].T, dhi) + grad["b"] += delta[:, i, :].sum(axis=0) + grad["U"] += np.dot(self.states[:, i - 1, :].T, dhi) - dh_next = np.dot(dhi, p['U'].T) + dh_next = np.dot(dhi, p["U"].T) - d = np.dot(delta[:, i, :], p['U'].T) - output[:, i, :] = np.dot(d, p['W'].T) + d = np.dot(delta[:, i, :], p["U"].T) + output[:, i, :] = np.dot(d, p["W"].T) # Change actual gradient arrays for k in grad.keys(): diff --git a/mla/neuralnet/loss.py b/mla/neuralnet/loss.py index b4e3a550..829050df 100644 --- a/mla/neuralnet/loss.py +++ b/mla/neuralnet/loss.py @@ -9,4 +9,4 @@ def get_loss(name): try: return globals()[name] except: - raise ValueError('Invalid metric function.') + raise ValueError("Invalid metric function.") diff --git a/mla/neuralnet/nnet.py b/mla/neuralnet/nnet.py index 6095e158..aa5f3c96 100644 --- a/mla/neuralnet/nnet.py +++ b/mla/neuralnet/nnet.py @@ -21,8 +21,9 @@ class NeuralNet(BaseEstimator): fit_required = False - def __init__(self, layers, optimizer, loss, max_epochs=10, batch_size=64, metric='mse', - shuffle=False, verbose=True): + def __init__( + self, layers, optimizer, loss, max_epochs=10, batch_size=64, metric="mse", shuffle=False, verbose=True + ): self.verbose = verbose self.shuffle = shuffle self.optimizer = optimizer @@ -30,7 +31,7 @@ def __init__(self, layers, optimizer, loss, max_epochs=10, batch_size=64, metric self.loss = get_loss(loss) # TODO: fix - if loss == 'categorical_crossentropy': + if loss == "categorical_crossentropy": self.loss_grad = lambda actual, predicted: -(actual - predicted) else: self.loss_grad = elementwise_grad(self.loss, 1) @@ -58,12 +59,12 @@ def _setup_layers(self, x_shape): # Setup optimizer self.optimizer.setup(self) self._initialized = True - logging.info('Total parameters: %s' % self.n_params) + logging.info("Total parameters: %s" % self.n_params) def _find_bprop_entry(self): """Find entry layer for back propagation.""" - if len(self.layers) > 0 and not hasattr(self.layers[-1], 'parameters'): + if len(self.layers) > 0 and not hasattr(self.layers[-1], "parameters"): return -1 return len(self.layers) @@ -87,7 +88,7 @@ def update(self, X, y): # Backward pass grad = self.loss_grad(y, y_pred) - for layer in reversed(self.layers[:self.bprop_entry]): + for layer in reversed(self.layers[: self.bprop_entry]): grad = layer.backward_pass(grad) return self.loss(y, y_pred) @@ -110,7 +111,7 @@ def _predict(self, X=None): @property def parametric_layers(self): for layer in self.layers: - if hasattr(layer, 'parameters'): + if hasattr(layer, "parameters"): yield layer @property diff --git a/mla/neuralnet/optimizers.py b/mla/neuralnet/optimizers.py index 422c6d9e..c1cef27d 100644 --- a/mla/neuralnet/optimizers.py +++ b/mla/neuralnet/optimizers.py @@ -26,8 +26,8 @@ def optimize(self, network): if network.verbose: msg = "Epoch:%s, train loss: %s" % (i, loss) if network.log_metric: - msg += ', train %s: 
%s' % (network.metric_name, network.error()) - msg += ', elapsed: %s sec.' % (time.time() - start_time) + msg += ", train %s: %s" % (network.metric_name, network.error()) + msg += ", elapsed: %s sec." % (time.time() - start_time) logging.info(msg) return loss_history @@ -44,7 +44,7 @@ def train_epoch(self, network): batch = zip(X_batch, y_batch) if network.verbose: - batch = tqdm(batch, total=int(np.ceil(network.n_samples/network.batch_size))) + batch = tqdm(batch, total=int(np.ceil(network.n_samples / network.batch_size))) for X, y in batch: loss = np.mean(network.update(X, y)) @@ -66,7 +66,7 @@ def setup(self, network): class SGD(Optimizer): - def __init__(self, learning_rate=0.01, momentum=0.9, decay=0., nesterov=False): + def __init__(self, learning_rate=0.01, momentum=0.9, decay=0.0, nesterov=False): self.nesterov = nesterov self.decay = decay self.momentum = momentum @@ -75,7 +75,7 @@ def __init__(self, learning_rate=0.01, momentum=0.9, decay=0., nesterov=False): self.velocity = None def update(self, network): - lr = self.lr * (1. / (1. + self.decay * self.iteration)) + lr = self.lr * (1.0 / (1.0 + self.decay * self.iteration)) for i, layer in enumerate(network.parametric_layers): for n in layer.parameters.keys(): @@ -127,13 +127,12 @@ def update(self, network): for i, layer in enumerate(network.parametric_layers): for n in layer.parameters.keys(): grad = layer.parameters.grad[n] - self.accu[i][n] = self.rho * self.accu[i][n] + (1. - self.rho) * grad ** 2 - step = grad * np.sqrt(self.d_accu[i][n] + self.eps) / np.sqrt( - self.accu[i][n] + self.eps) + self.accu[i][n] = self.rho * self.accu[i][n] + (1.0 - self.rho) * grad ** 2 + step = grad * np.sqrt(self.d_accu[i][n] + self.eps) / np.sqrt(self.accu[i][n] + self.eps) layer.parameters.step(n, -step * self.lr) # Update delta accumulator - self.d_accu[i][n] = self.rho * self.d_accu[i][n] + (1. - self.rho) * step ** 2 + self.d_accu[i][n] = self.rho * self.d_accu[i][n] + (1.0 - self.rho) * step ** 2 def setup(self, network): # Accumulators @@ -155,7 +154,7 @@ def update(self, network): for i, layer in enumerate(network.parametric_layers): for n in layer.parameters.keys(): grad = layer.parameters.grad[n] - self.accu[i][n] = (self.rho * self.accu[i][n]) + (1. - self.rho) * (grad ** 2) + self.accu[i][n] = (self.rho * self.accu[i][n]) + (1.0 - self.rho) * (grad ** 2) step = self.lr * grad / (np.sqrt(self.accu[i][n]) + self.eps) layer.parameters.step(n, -step) @@ -168,7 +167,7 @@ def setup(self, network): class Adam(Optimizer): - def __init__(self, learning_rate=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-8, ): + def __init__(self, learning_rate=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-8): self.epsilon = epsilon self.beta_2 = beta_2 @@ -181,9 +180,9 @@ def update(self, network): for i, layer in enumerate(network.parametric_layers): for n in layer.parameters.keys(): grad = layer.parameters.grad[n] - self.ms[i][n] = (self.beta_1 * self.ms[i][n]) + (1. - self.beta_1) * grad - self.vs[i][n] = (self.beta_2 * self.vs[i][n]) + (1. - self.beta_2) * grad ** 2 - lr = self.lr * np.sqrt(1. - self.beta_2 ** self.t) / (1. 
- self.beta_1 ** self.t) + self.ms[i][n] = (self.beta_1 * self.ms[i][n]) + (1.0 - self.beta_1) * grad + self.vs[i][n] = (self.beta_2 * self.vs[i][n]) + (1.0 - self.beta_2) * grad ** 2 + lr = self.lr * np.sqrt(1.0 - self.beta_2 ** self.t) / (1.0 - self.beta_1 ** self.t) step = lr * self.ms[i][n] / (np.sqrt(self.vs[i][n]) + self.epsilon) layer.parameters.step(n, -step) diff --git a/mla/neuralnet/parameters.py b/mla/neuralnet/parameters.py index 83d15a73..39ef9783 100644 --- a/mla/neuralnet/parameters.py +++ b/mla/neuralnet/parameters.py @@ -4,7 +4,7 @@ class Parameters(object): - def __init__(self, init='glorot_uniform', scale=0.5, bias=1.0, regularizers=None, constraints=None): + def __init__(self, init="glorot_uniform", scale=0.5, bias=1.0, regularizers=None, constraints=None): """A container for layer's parameters. Parameters @@ -39,12 +39,12 @@ def __init__(self, init='glorot_uniform', scale=0.5, bias=1.0, regularizers=None self._grads = {} def setup_weights(self, W_shape, b_shape=None): - if 'W' not in self._params: - self._params['W'] = self.init(shape=W_shape, scale=self.scale) + if "W" not in self._params: + self._params["W"] = self.init(shape=W_shape, scale=self.scale) if b_shape is None: - self._params['b'] = np.full(W_shape[1], self.initial_bias) + self._params["b"] = np.full(W_shape[1], self.initial_bias) else: - self._params['b'] = np.full(b_shape, self.initial_bias) + self._params["b"] = np.full(b_shape, self.initial_bias) self.init_grad() def init_grad(self): diff --git a/mla/neuralnet/regularizers.py b/mla/neuralnet/regularizers.py index a6bdd462..d674590b 100644 --- a/mla/neuralnet/regularizers.py +++ b/mla/neuralnet/regularizers.py @@ -30,5 +30,6 @@ def _penalty(self, weights): class ElasticNet(Regularizer): """Linear combination of L1 and L2 penalties.""" + def _penalty(self, weights): return 0.5 * self.C * weights ** 2 + (1.0 - self.C) * np.abs(weights) diff --git a/mla/neuralnet/tests/test_activations.py b/mla/neuralnet/tests/test_activations.py index 5d8caa26..9b16dc06 100644 --- a/mla/neuralnet/tests/test_activations.py +++ b/mla/neuralnet/tests/test_activations.py @@ -3,6 +3,7 @@ from mla.neuralnet.activations import * + def test_softplus(): # np.exp(z_max) will overflow z_max = np.log(sys.float_info.max) + 1.0e10 @@ -12,14 +13,6 @@ def test_softplus(): # naive implementation of np.log(1 + np.exp(z_max)) will overflow # naive implementation of z + np.log(1 + 1 / np.exp(z_min)) will # throw ZeroDivisionError - outputs = np.array([ - np.log(2.0), - np.log1p(np.exp(1.0)), - np.log1p(np.exp(-1.0)), - 0.0, - z_max - ]) + outputs = np.array([np.log(2.0), np.log1p(np.exp(1.0)), np.log1p(np.exp(-1.0)), 0.0, z_max]) assert np.allclose(outputs, softplus(inputs)) - - diff --git a/mla/neuralnet/tests/test_optimizers.py b/mla/neuralnet/tests/test_optimizers.py index fb5fe15e..d3fc2a65 100644 --- a/mla/neuralnet/tests/test_optimizers.py +++ b/mla/neuralnet/tests/test_optimizers.py @@ -9,8 +9,9 @@ def clasifier(optimizer): - X, y = make_classification(n_samples=1000, n_features=100, n_informative=75, random_state=1111, n_classes=2, - class_sep=2.5, ) + X, y = make_classification( + n_samples=1000, n_features=100, n_informative=75, random_state=1111, n_classes=2, class_sep=2.5 + ) y = one_hot(y) X -= np.mean(X, axis=0) @@ -19,20 +20,19 @@ def clasifier(optimizer): model = NeuralNet( layers=[ - Dense(128, Parameters(init='uniform')), - Activation('relu'), + Dense(128, Parameters(init="uniform")), + Activation("relu"), Dropout(0.5), - Dense(64, Parameters(init='normal')), - 
Activation('relu'), + Dense(64, Parameters(init="normal")), + Activation("relu"), Dense(2), - Activation('softmax'), + Activation("softmax"), ], - loss='categorical_crossentropy', + loss="categorical_crossentropy", optimizer=optimizer, - metric='accuracy', + metric="accuracy", batch_size=64, max_epochs=10, - ) model.fit(X_train, y_train) predictions = model.predict(X_test) diff --git a/mla/pca.py b/mla/pca.py index 887c1b8f..d495f68f 100644 --- a/mla/pca.py +++ b/mla/pca.py @@ -10,7 +10,7 @@ class PCA(BaseEstimator): y_required = False - def __init__(self, n_components, solver='svd'): + def __init__(self, n_components, solver="svd"): """Principal component analysis (PCA) implementation. Transforms a dataset of possibly correlated values into n linearly @@ -39,16 +39,16 @@ def _decompose(self, X): X = X.copy() X -= self.mean - if self.solver == 'svd': + if self.solver == "svd": _, s, Vh = svd(X, full_matrices=True) - elif self.solver == 'eigen': + elif self.solver == "eigen": s, Vh = np.linalg.eig(np.cov(X.T)) Vh = Vh.T s_squared = s ** 2 variance_ratio = s_squared / (s_squared).sum() - logging.info('Explained variance ratio: %s' % (variance_ratio[0:self.n_components])) - self.components = Vh[0:self.n_components] + logging.info("Explained variance ratio: %s" % (variance_ratio[0 : self.n_components])) + self.components = Vh[0 : self.n_components] def transform(self, X): X = X.copy() diff --git a/mla/rbm.py b/mla/rbm.py index 85d7ccc6..ec1ee0b6 100644 --- a/mla/rbm.py +++ b/mla/rbm.py @@ -15,7 +15,6 @@ """ - class RBM(BaseEstimator): y_required = False @@ -53,18 +52,18 @@ def _init_weights(self): self.errors = [] def _train(self): - '''Use CD-1 training procedure, basically an exact inference for `positive_associations`, - followed by a "non burn-in" block Gibbs Sampling for the `negative_associations`.''' + """Use CD-1 training procedure, basically an exact inference for `positive_associations`, + followed by a "non burn-in" block Gibbs Sampling for the `negative_associations`.""" for i in range(self.max_epochs): error = 0 for batch in batch_iterator(self.X, batch_size=self.batch_size): positive_hidden = sigmoid(np.dot(batch, self.W) + self.bias_h) - hidden_states = self._sample(positive_hidden) # sample hidden state h1 + hidden_states = self._sample(positive_hidden) # sample hidden state h1 positive_associations = np.dot(batch.T, positive_hidden) negative_visible = sigmoid(np.dot(hidden_states, self.W.T) + self.bias_v) - negative_visible = self._sample(negative_visible) # use the samped hidden state h1 to sample v1 + negative_visible = self._sample(negative_visible) # use the samped hidden state h1 to sample v1 negative_hidden = sigmoid(np.dot(negative_visible, self.W) + self.bias_h) negative_associations = np.dot(negative_visible.T, negative_hidden) @@ -76,10 +75,10 @@ def _train(self): error += np.sum((batch - negative_visible) ** 2) self.errors.append(error) - logging.info('Iteration %s, error %s' % (i, error)) - logging.debug('Weights: %s' % self.W) - logging.debug('Hidden bias: %s' % self.bias_h) - logging.debug('Visible bias: %s' % self.bias_v) + logging.info("Iteration %s, error %s" % (i, error)) + logging.debug("Weights: %s" % self.W) + logging.debug("Hidden bias: %s" % self.bias_h) + logging.debug("Visible bias: %s" % self.bias_v) def _sample(self, X): return X > np.random.random_sample(size=X.shape) diff --git a/mla/rl/dqn.py b/mla/rl/dqn.py index f2ae4fa8..2816899b 100644 --- a/mla/rl/dqn.py +++ b/mla/rl/dqn.py @@ -18,8 +18,9 @@ class DQN(object): - def __init__(self, 
n_episodes=500, gamma=0.99, batch_size=32, epsilon=1., decay=0.005, min_epsilon=0.1, - memory_limit=500): + def __init__( + self, n_episodes=500, gamma=0.99, batch_size=32, epsilon=1.0, decay=0.005, min_epsilon=0.1, memory_limit=500 + ): """Deep Q learning implementation. Parameters @@ -44,7 +45,7 @@ def __init__(self, n_episodes=500, gamma=0.99, batch_size=32, epsilon=1., decay= self.batch_size = batch_size self.decay = decay - def init_environment(self, name='CartPole-v0', monitor=False): + def init_environment(self, name="CartPole-v0", monitor=False): self.env = gym.make(name) if monitor: self.env = wrappers.Monitor(self.env, name, force=True, video_callable=False) @@ -124,9 +125,10 @@ def train(self, render=False): self.epsilon = self.min_epsilon + (1.0 - self.min_epsilon) * np.exp(-self.decay * ep) max_reward = max(max_reward, total_reward) - logger.info('Episode: %s, reward %s, epsilon %s, max reward %s' % (ep, total_reward, - self.epsilon, max_reward)) - logging.info('Training finished.') + logger.info( + "Episode: %s, reward %s, epsilon %s, max reward %s" % (ep, total_reward, self.epsilon, max_reward) + ) + logging.info("Training finished.") def play(self, episodes): for i in range(episodes): @@ -140,5 +142,5 @@ def play(self, episodes): total_reward += reward if done: break - logger.info('Episode: %s, reward %s' % (i, total_reward)) + logger.info("Episode: %s, reward %s" % (i, total_reward)) self.env.close() diff --git a/mla/svm/kernerls.py b/mla/svm/kernerls.py index ff635bea..45d6a00a 100644 --- a/mla/svm/kernerls.py +++ b/mla/svm/kernerls.py @@ -7,7 +7,7 @@ def __call__(self, x, y): return np.dot(x, y.T) def __repr__(self): - return 'Linear kernel' + return "Linear kernel" class Poly(object): @@ -18,7 +18,7 @@ def __call__(self, x, y): return np.dot(x, y.T) ** self.degree def __repr__(self): - return 'Poly kernel' + return "Poly kernel" class RBF(object): @@ -31,4 +31,4 @@ def __call__(self, x, y): return np.exp(-self.gamma * dist.cdist(x, y) ** 2).flatten() def __repr__(self): - return 'RBF kernel' + return "RBF kernel" diff --git a/mla/svm/svm.py b/mla/svm/svm.py index 89f74204..37922bf9 100644 --- a/mla/svm/svm.py +++ b/mla/svm/svm.py @@ -72,10 +72,18 @@ def _train(self): self.alpha[i] = self.alpha[i] + self.y[i] * self.y[j] * (alpha_jo - self.alpha[j]) # Find intercept - b1 = self.b - e_i - self.y[i] * (self.alpha[i] - alpha_jo) * self.K[i, i] - \ - self.y[j] * (self.alpha[j] - alpha_jo) * self.K[i, j] - b2 = self.b - e_j - self.y[j] * (self.alpha[j] - alpha_jo) * self.K[j, j] - \ - self.y[i] * (self.alpha[i] - alpha_io) * self.K[i, j] + b1 = ( + self.b + - e_i + - self.y[i] * (self.alpha[i] - alpha_jo) * self.K[i, i] + - self.y[j] * (self.alpha[j] - alpha_jo) * self.K[i, j] + ) + b2 = ( + self.b + - e_j + - self.y[j] * (self.alpha[j] - alpha_jo) * self.K[j, j] + - self.y[i] * (self.alpha[i] - alpha_io) * self.K[i, j] + ) if 0 < self.alpha[i] < self.C: self.b = b1 elif 0 < self.alpha[j] < self.C: @@ -87,7 +95,7 @@ def _train(self): diff = np.linalg.norm(self.alpha - alpha_prev) if diff < self.tol: break - logging.info('Convergence has reached after %s.' % iters) + logging.info("Convergence has reached after %s." 
% iters) # Save support vectors index self.sv_idx = np.where(self.alpha > 0)[0] diff --git a/mla/tests/test_classification_accuracy.py b/mla/tests/test_classification_accuracy.py index 0f443cb5..6dd73089 100644 --- a/mla/tests/test_classification_accuracy.py +++ b/mla/tests/test_classification_accuracy.py @@ -23,17 +23,17 @@ from sklearn.datasets import make_classification # Generate a random regression problem -X, y = make_classification(n_samples=750, n_features=10, - n_informative=8, random_state=1111, - n_classes=2, class_sep=2.5, n_redundant=0) -X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.12, - random_state=1111) +X, y = make_classification( + n_samples=750, n_features=10, n_informative=8, random_state=1111, n_classes=2, class_sep=2.5, n_redundant=0 +) +X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.12, random_state=1111) # All classifiers except convnet, RNN, LSTM. + def test_linear_model(): - model = LogisticRegression(lr=0.01, max_iters=500, penalty='l1', C=0.01) + model = LogisticRegression(lr=0.01, max_iters=500, penalty="l1", C=0.01) model.fit(X_train, y_train) predictions = model.predict(X_test) assert roc_auc_score(y_test, predictions) >= 0.95 @@ -63,20 +63,19 @@ def test_mlp(): model = NeuralNet( layers=[ - Dense(256, Parameters(init='uniform', regularizers={'W': L2(0.05)})), - Activation('relu'), + Dense(256, Parameters(init="uniform", regularizers={"W": L2(0.05)})), + Activation("relu"), Dropout(0.5), - Dense(128, Parameters(init='normal', constraints={'W': MaxNorm()})), - Activation('relu'), + Dense(128, Parameters(init="normal", constraints={"W": MaxNorm()})), + Activation("relu"), Dense(2), - Activation('softmax'), + Activation("softmax"), ], - loss='categorical_crossentropy', + loss="categorical_crossentropy", optimizer=Adadelta(), - metric='accuracy', + metric="accuracy", batch_size=64, max_epochs=25, - ) model.fit(X_train, y_train_onehot) predictions = model.predict(X_test) @@ -84,8 +83,7 @@ def test_mlp(): def test_gbm(): - model = GradientBoostingClassifier(n_estimators=25, max_depth=3, - max_features=5, learning_rate=0.1) + model = GradientBoostingClassifier(n_estimators=25, max_depth=3, max_features=5, learning_rate=0.1) model.fit(X_train, y_train) predictions = model.predict(X_test) assert roc_auc_score(y_test, predictions) >= 0.95 diff --git a/mla/tests/test_reduction.py b/mla/tests/test_reduction.py index 722194a7..4946ea6f 100644 --- a/mla/tests/test_reduction.py +++ b/mla/tests/test_reduction.py @@ -16,17 +16,17 @@ @pytest.fixture def dataset(): # Generate a random binary classification problem. 
- return make_classification(n_samples=1000, n_features=100, n_informative=75, - random_state=1111, n_classes=2, class_sep=2.5, ) + return make_classification( + n_samples=1000, n_features=100, n_informative=75, random_state=1111, n_classes=2, class_sep=2.5 + ) # TODO: fix @pytest.mark.skip() def test_PCA(dataset): X, y = dataset - X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, - random_state=1111) - p = PCA(50, solver='eigen') + X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=1111) + p = PCA(50, solver="eigen") # fit PCA with training set, not the entire dataset p.fit(X_train) diff --git a/mla/tests/test_regression_accuracy.py b/mla/tests/test_regression_accuracy.py index 81e65ded..3483992a 100644 --- a/mla/tests/test_regression_accuracy.py +++ b/mla/tests/test_regression_accuracy.py @@ -14,15 +14,14 @@ # Generate a random regression problem -X, y = make_regression(n_samples=1000, n_features=10, - n_informative=10, n_targets=1, noise=0.05, - random_state=1111, bias=0.5) -X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, - random_state=1111) +X, y = make_regression( + n_samples=1000, n_features=10, n_informative=10, n_targets=1, noise=0.05, random_state=1111, bias=0.5 +) +X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=1111) def test_linear(): - model = LinearRegression(lr=0.01, max_iters=2000, penalty='l2', C=0.003) + model = LinearRegression(lr=0.01, max_iters=2000, penalty="l2", C=0.003) model.fit(X_train, y_train) predictions = model.predict(X_test) assert mean_squared_error(y_test, predictions) < 0.25 @@ -31,15 +30,15 @@ def test_linear(): def test_mlp(): model = NeuralNet( layers=[ - Dense(16, Parameters(init='normal')), - Activation('linear'), - Dense(8, Parameters(init='normal')), - Activation('linear'), + Dense(16, Parameters(init="normal")), + Activation("linear"), + Dense(8, Parameters(init="normal")), + Activation("linear"), Dense(1), ], - loss='mse', + loss="mse", optimizer=Adam(), - metric='mse', + metric="mse", batch_size=64, max_epochs=150, ) diff --git a/mla/tsne.py b/mla/tsne.py index 5727c3b1..7e4f0bdd 100644 --- a/mla/tsne.py +++ b/mla/tsne.py @@ -19,7 +19,7 @@ class TSNE(BaseEstimator): y_required = False - def __init__(self, n_components=2, perplexity=30., max_iter=200, learning_rate=500): + def __init__(self, n_components=2, perplexity=30.0, max_iter=200, learning_rate=500): """A t-Distributed Stochastic Neighbor Embedding implementation. Parameters @@ -88,7 +88,7 @@ def _get_pairwise_affinities(self, X): affines[i, :] = self._binary_search(distances[i], target_entropy) # Fill diagonal with near zero value - np.fill_diagonal(affines, 1.e-12) + np.fill_diagonal(affines, 1.0e-12) affines = affines.clip(min=1e-100) affines = (affines + affines.T) / (2 * self.n_samples) @@ -97,15 +97,15 @@ def _get_pairwise_affinities(self, X): def _binary_search(self, dist, target_entropy): """Performs binary search to find suitable precision.""" precision_min = 0 - precision_max = 1.e15 - precision = 1.e5 + precision_max = 1.0e15 + precision = 1.0e5 for _ in range(self.perplexity_tries): - denom = np.sum(np.exp(-dist[dist > 0.] / precision)) + denom = np.sum(np.exp(-dist[dist > 0.0] / precision)) beta = np.exp(-dist / precision) / denom # Exclude zeros - g_beta = beta[beta > 0.] 
+ g_beta = beta[beta > 0.0] entropy = -np.sum(g_beta * np.log2(g_beta)) error = entropy - target_entropy @@ -113,11 +113,11 @@ def _binary_search(self, dist, target_entropy): if error > 0: # Decrease precision precision_max = precision - precision = (precision + precision_min) / 2. + precision = (precision + precision_min) / 2.0 else: # Increase precision precision_min = precision - precision = (precision + precision_max) / 2. + precision = (precision + precision_max) / 2.0 if np.abs(error) < self.tol: break From d2de145bf1000b638867e4972769ab76e9910f1a Mon Sep 17 00:00:00 2001 From: Artem Golubin Date: Mon, 22 Jul 2019 14:13:53 +0300 Subject: [PATCH 28/49] Fix plot for k-means. Resolves #54 --- mla/kmeans.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mla/kmeans.py b/mla/kmeans.py index b54484c1..60d80c29 100644 --- a/mla/kmeans.py +++ b/mla/kmeans.py @@ -131,7 +131,7 @@ def _is_converged(self, centroids_old, centroids): def plot(self, ax=None, holdon=False): sns.set(style="white") - + palette = sns.color_palette("hls", self.K + 1) data = self.X if ax is None: @@ -139,7 +139,7 @@ def plot(self, ax=None, holdon=False): for i, index in enumerate(self.clusters): point = np.array(data[index]).T - ax.scatter(*point, c=sns.color_palette("hls", self.K + 1)[i]) + ax.scatter(*point, c=[palette[i], ]) for point in self.centroids: ax.scatter(*point, marker="x", linewidths=10) From ba450f87c41e4757d776f59e66aed388024a0276 Mon Sep 17 00:00:00 2001 From: Artem Golubin Date: Sun, 25 Aug 2019 19:59:26 +0300 Subject: [PATCH 29/49] Minor fixes --- examples/rbm.py | 2 +- examples/rl_deep_q_learning.py | 2 +- mla/base/__init__.py | 1 + mla/base/base.py | 1 + mla/datasets/__init__.py | 1 + mla/datasets/base.py | 3 ++- mla/ensemble/__init__.py | 1 + mla/ensemble/base.py | 1 + mla/ensemble/gbm.py | 1 + mla/ensemble/random_forest.py | 1 + mla/ensemble/tree.py | 1 + mla/fm.py | 2 ++ mla/gaussian_mixture.py | 4 +++- mla/kmeans.py | 2 ++ mla/knn.py | 2 ++ mla/linear_models.py | 2 ++ mla/metrics/__init__.py | 1 + mla/metrics/base.py | 1 + mla/metrics/distance.py | 1 + mla/metrics/metrics.py | 3 ++- mla/naive_bayes.py | 2 ++ mla/neuralnet/activations.py | 2 +- mla/neuralnet/initializations.py | 2 +- mla/neuralnet/layers/__init__.py | 1 + mla/neuralnet/layers/basic.py | 1 + mla/neuralnet/layers/convnet.py | 6 +++--- mla/neuralnet/layers/normalization.py | 1 + mla/neuralnet/layers/recurrent/__init__.py | 1 + mla/neuralnet/layers/recurrent/lstm.py | 1 + mla/neuralnet/layers/recurrent/rnn.py | 1 + mla/neuralnet/loss.py | 3 +-- mla/neuralnet/nnet.py | 5 +++-- mla/neuralnet/optimizers.py | 3 ++- mla/neuralnet/parameters.py | 1 + mla/pca.py | 5 +++-- mla/rbm.py | 3 ++- mla/rl/__init__.py | 2 +- mla/rl/dqn.py | 1 + mla/svm/__init__.py | 2 +- mla/svm/kernerls.py | 1 + mla/tsne.py | 1 + setup.cfg | 5 ++++- 42 files changed, 62 insertions(+), 21 deletions(-) diff --git a/examples/rbm.py b/examples/rbm.py index 2d167644..74f2a772 100644 --- a/examples/rbm.py +++ b/examples/rbm.py @@ -13,7 +13,7 @@ def print_curve(rbm): def moving_average(a, n=25): ret = np.cumsum(a, dtype=float) ret[n:] = ret[n:] - ret[:-n] - return ret[n - 1 :] / n + return ret[n - 1:] / n plt.plot(moving_average(rbm.errors)) plt.show() diff --git a/examples/rl_deep_q_learning.py b/examples/rl_deep_q_learning.py index 5626cd3a..15a39ffd 100644 --- a/examples/rl_deep_q_learning.py +++ b/examples/rl_deep_q_learning.py @@ -31,7 +31,7 @@ def mlp_model(n_actions, batch_size=64): # You can stop training process using Ctrl+C signal # Read 
more about this problem: https://gym.openai.com/envs/CartPole-v0 model.train(render=False) -except: +except KeyboardInterrupt: pass # Render trained model model.play(episodes=100) diff --git a/mla/base/__init__.py b/mla/base/__init__.py index 9b5ed21c..0ffd952c 100644 --- a/mla/base/__init__.py +++ b/mla/base/__init__.py @@ -1 +1,2 @@ +# coding:utf-8 from .base import * diff --git a/mla/base/base.py b/mla/base/base.py index 64998c36..ea82a20d 100644 --- a/mla/base/base.py +++ b/mla/base/base.py @@ -1,3 +1,4 @@ +# coding:utf-8 import numpy as np diff --git a/mla/datasets/__init__.py b/mla/datasets/__init__.py index b1992552..d9f114e9 100644 --- a/mla/datasets/__init__.py +++ b/mla/datasets/__init__.py @@ -1 +1,2 @@ +# coding:utf-8 from mla.datasets.base import * diff --git a/mla/datasets/base.py b/mla/datasets/base.py index a7b9ed63..eb1aca87 100644 --- a/mla/datasets/base.py +++ b/mla/datasets/base.py @@ -1,3 +1,4 @@ +# coding:utf-8 import os import numpy as np @@ -60,7 +61,7 @@ def load_nietzsche(): sentences = [] next_chars = [] for i in range(0, len(text) - maxlen, step): - sentences.append(text[i : i + maxlen]) + sentences.append(text[i: i + maxlen]) next_chars.append(text[i + maxlen]) X = np.zeros((len(sentences), maxlen, len(chars)), dtype=np.bool) diff --git a/mla/ensemble/__init__.py b/mla/ensemble/__init__.py index 45cb543e..bf3c9c05 100644 --- a/mla/ensemble/__init__.py +++ b/mla/ensemble/__init__.py @@ -1 +1,2 @@ +# coding:utf-8 from .random_forest import RandomForestClassifier, RandomForestRegressor diff --git a/mla/ensemble/base.py b/mla/ensemble/base.py index a8850f11..2ba41b2e 100644 --- a/mla/ensemble/base.py +++ b/mla/ensemble/base.py @@ -1,3 +1,4 @@ +# coding:utf-8 import numpy as np from scipy import stats diff --git a/mla/ensemble/gbm.py b/mla/ensemble/gbm.py index 37db62cf..c84159ab 100644 --- a/mla/ensemble/gbm.py +++ b/mla/ensemble/gbm.py @@ -1,3 +1,4 @@ +# coding:utf-8 import numpy as np # logistic function diff --git a/mla/ensemble/random_forest.py b/mla/ensemble/random_forest.py index 2052b7a4..263a1685 100644 --- a/mla/ensemble/random_forest.py +++ b/mla/ensemble/random_forest.py @@ -1,3 +1,4 @@ +# coding:utf-8 import numpy as np from mla.base import BaseEstimator diff --git a/mla/ensemble/tree.py b/mla/ensemble/tree.py index 1127c668..83c90f34 100644 --- a/mla/ensemble/tree.py +++ b/mla/ensemble/tree.py @@ -1,3 +1,4 @@ +# coding:utf-8 import random import numpy as np diff --git a/mla/fm.py b/mla/fm.py index a43b04c7..bfb320f7 100644 --- a/mla/fm.py +++ b/mla/fm.py @@ -1,3 +1,5 @@ +# coding:utf-8 + from mla.base import BaseEstimator from mla.metrics import mean_squared_error, binary_crossentropy import autograd.numpy as np diff --git a/mla/gaussian_mixture.py b/mla/gaussian_mixture.py index 3b6b9d3c..a39e6e91 100644 --- a/mla/gaussian_mixture.py +++ b/mla/gaussian_mixture.py @@ -1,3 +1,5 @@ +# coding:utf-8 + import random import numpy as np from scipy.stats import multivariate_normal @@ -65,7 +67,7 @@ def _initialize(self): self.weights = np.ones(self.K) if self.init == "random": self.means = [self.X[x] for x in random.sample(range(self.n_samples), self.K)] - self.covs = [np.cov(self.X.T) for _ in range(K)] + self.covs = [np.cov(self.X.T) for _ in range(self.K)] elif self.init == "kmeans": kmeans = KMeans(K=self.K, max_iters=self.max_iters // 3, init="++") diff --git a/mla/kmeans.py b/mla/kmeans.py index 60d80c29..d96711cb 100644 --- a/mla/kmeans.py +++ b/mla/kmeans.py @@ -1,3 +1,5 @@ +# coding:utf-8 + import random import seaborn as sns import matplotlib.pyplot 
as plt diff --git a/mla/knn.py b/mla/knn.py index 307c0243..30bdd339 100644 --- a/mla/knn.py +++ b/mla/knn.py @@ -1,3 +1,5 @@ +# coding:utf-8 + from collections import Counter import numpy as np diff --git a/mla/linear_models.py b/mla/linear_models.py index 537fa03c..d7d4e9c9 100644 --- a/mla/linear_models.py +++ b/mla/linear_models.py @@ -1,3 +1,5 @@ +# coding:utf-8 + import logging import autograd.numpy as np diff --git a/mla/metrics/__init__.py b/mla/metrics/__init__.py index 1761d1aa..db3e0b36 100644 --- a/mla/metrics/__init__.py +++ b/mla/metrics/__init__.py @@ -1 +1,2 @@ +# coding:utf-8 from .metrics import * diff --git a/mla/metrics/base.py b/mla/metrics/base.py index dff0217c..065e5bcb 100644 --- a/mla/metrics/base.py +++ b/mla/metrics/base.py @@ -1,3 +1,4 @@ +# coding:utf-8 import numpy as np diff --git a/mla/metrics/distance.py b/mla/metrics/distance.py index 263ea68a..dcfe3abe 100644 --- a/mla/metrics/distance.py +++ b/mla/metrics/distance.py @@ -1,3 +1,4 @@ +# coding:utf-8 import numpy as np import math diff --git a/mla/metrics/metrics.py b/mla/metrics/metrics.py index a74fb90c..9fb20ded 100644 --- a/mla/metrics/metrics.py +++ b/mla/metrics/metrics.py @@ -1,3 +1,4 @@ +# coding:utf-8 import autograd.numpy as np EPS = 1e-15 @@ -83,5 +84,5 @@ def get_metric(name): """Return metric function by name""" try: return globals()[name] - except: + except Exception: raise ValueError("Invalid metric function.") diff --git a/mla/naive_bayes.py b/mla/naive_bayes.py index d98ca801..ec0d499b 100644 --- a/mla/naive_bayes.py +++ b/mla/naive_bayes.py @@ -1,3 +1,5 @@ +# coding:utf-8 + import numpy as np from mla.base import BaseEstimator from mla.neuralnet.activations import softmax diff --git a/mla/neuralnet/activations.py b/mla/neuralnet/activations.py index 9ec7ffd7..949cdf75 100644 --- a/mla/neuralnet/activations.py +++ b/mla/neuralnet/activations.py @@ -47,5 +47,5 @@ def get_activation(name): """Return activation function by name""" try: return globals()[name] - except: + except Exception: raise ValueError("Invalid activation function.") diff --git a/mla/neuralnet/initializations.py b/mla/neuralnet/initializations.py index a380f459..f67fe9f6 100644 --- a/mla/neuralnet/initializations.py +++ b/mla/neuralnet/initializations.py @@ -71,5 +71,5 @@ def get_initializer(name): """Returns initialization function by the name.""" try: return globals()[name] - except: + except Exception: raise ValueError("Invalid initialization function.") diff --git a/mla/neuralnet/layers/__init__.py b/mla/neuralnet/layers/__init__.py index 5f4690ad..9b727049 100644 --- a/mla/neuralnet/layers/__init__.py +++ b/mla/neuralnet/layers/__init__.py @@ -1,3 +1,4 @@ +# coding:utf-8 from .basic import * from .convnet import * from .normalization import * diff --git a/mla/neuralnet/layers/basic.py b/mla/neuralnet/layers/basic.py index dc719a68..119855e2 100644 --- a/mla/neuralnet/layers/basic.py +++ b/mla/neuralnet/layers/basic.py @@ -1,3 +1,4 @@ +# coding:utf-8 import autograd.numpy as np from autograd import elementwise_grad diff --git a/mla/neuralnet/layers/convnet.py b/mla/neuralnet/layers/convnet.py index 4d4a8291..485706c1 100644 --- a/mla/neuralnet/layers/convnet.py +++ b/mla/neuralnet/layers/convnet.py @@ -145,7 +145,7 @@ def image_to_column(images, filter_shape, stride, padding): y_bound = y + stride[0] * out_height for x in range(f_width): x_bound = x + stride[1] * out_width - col[:, :, y, x, :, :] = images[:, :, y : y_bound : stride[0], x : x_bound : stride[1]] + col[:, :, y, x, :, :] = images[:, :, y: y_bound: 
stride[0], x: x_bound: stride[1]] col = col.transpose(0, 4, 5, 1, 2, 3).reshape(n_images * out_height * out_width, -1) return col @@ -177,9 +177,9 @@ def column_to_image(columns, images_shape, filter_shape, stride, padding): y_bound = y + stride[0] * out_height for x in range(f_width): x_bound = x + stride[1] * out_width - img[:, :, y : y_bound : stride[0], x : x_bound : stride[1]] += columns[:, :, y, x, :, :] + img[:, :, y: y_bound: stride[0], x: x_bound: stride[1]] += columns[:, :, y, x, :, :] - return img[:, :, padding[0] : height + padding[0], padding[1] : width + padding[1]] + return img[:, :, padding[0]: height + padding[0], padding[1]: width + padding[1]] def convoltuion_shape(img_height, img_width, filter_shape, stride, padding): diff --git a/mla/neuralnet/layers/normalization.py b/mla/neuralnet/layers/normalization.py index a51b4e81..6a1ae28d 100644 --- a/mla/neuralnet/layers/normalization.py +++ b/mla/neuralnet/layers/normalization.py @@ -1,3 +1,4 @@ +# coding:utf-8 from mla.neuralnet.layers import Layer, PhaseMixin, ParamMixin from mla.neuralnet.parameters import Parameters import numpy as np diff --git a/mla/neuralnet/layers/recurrent/__init__.py b/mla/neuralnet/layers/recurrent/__init__.py index 0eca30b8..a6e70cd9 100644 --- a/mla/neuralnet/layers/recurrent/__init__.py +++ b/mla/neuralnet/layers/recurrent/__init__.py @@ -1,2 +1,3 @@ +# coding:utf-8 from .rnn import * from .lstm import * diff --git a/mla/neuralnet/layers/recurrent/lstm.py b/mla/neuralnet/layers/recurrent/lstm.py index f298a200..8d3ade4f 100644 --- a/mla/neuralnet/layers/recurrent/lstm.py +++ b/mla/neuralnet/layers/recurrent/lstm.py @@ -1,3 +1,4 @@ +# coding:utf-8 import autograd.numpy as np from autograd import elementwise_grad from six.moves import range diff --git a/mla/neuralnet/layers/recurrent/rnn.py b/mla/neuralnet/layers/recurrent/rnn.py index 2c2687d6..2b848080 100644 --- a/mla/neuralnet/layers/recurrent/rnn.py +++ b/mla/neuralnet/layers/recurrent/rnn.py @@ -1,3 +1,4 @@ +# coding:utf-8 import autograd.numpy as np from autograd import elementwise_grad from six.moves import range diff --git a/mla/neuralnet/loss.py b/mla/neuralnet/loss.py index 829050df..30def7e9 100644 --- a/mla/neuralnet/loss.py +++ b/mla/neuralnet/loss.py @@ -5,8 +5,7 @@ def get_loss(name): """Returns loss function by the name.""" - try: return globals()[name] - except: + except KeyError: raise ValueError("Invalid metric function.") diff --git a/mla/neuralnet/nnet.py b/mla/neuralnet/nnet.py index aa5f3c96..2809fb0b 100644 --- a/mla/neuralnet/nnet.py +++ b/mla/neuralnet/nnet.py @@ -13,8 +13,9 @@ """ Architecture inspired from: -https://github.com/fchollet/keras -https://github.com/andersbll/deeppy + + https://github.com/fchollet/keras + https://github.com/andersbll/deeppy """ diff --git a/mla/neuralnet/optimizers.py b/mla/neuralnet/optimizers.py index c1cef27d..a0a08c4f 100644 --- a/mla/neuralnet/optimizers.py +++ b/mla/neuralnet/optimizers.py @@ -9,7 +9,8 @@ """ References: -Gradient descent optimization algorithms http://sebastianruder.com/optimizing-gradient-descent/index.html + + Gradient descent optimization algorithms http://sebastianruder.com/optimizing-gradient-descent/index.html """ diff --git a/mla/neuralnet/parameters.py b/mla/neuralnet/parameters.py index 39ef9783..65873c26 100644 --- a/mla/neuralnet/parameters.py +++ b/mla/neuralnet/parameters.py @@ -1,3 +1,4 @@ +# coding:utf-8 import numpy as np from mla.neuralnet.initializations import get_initializer diff --git a/mla/pca.py b/mla/pca.py index d495f68f..adc076a7 100644 --- 
a/mla/pca.py +++ b/mla/pca.py @@ -1,3 +1,4 @@ +# coding:utf-8 from scipy.linalg import svd import numpy as np import logging @@ -47,8 +48,8 @@ def _decompose(self, X): s_squared = s ** 2 variance_ratio = s_squared / (s_squared).sum() - logging.info("Explained variance ratio: %s" % (variance_ratio[0 : self.n_components])) - self.components = Vh[0 : self.n_components] + logging.info("Explained variance ratio: %s" % (variance_ratio[0: self.n_components])) + self.components = Vh[0: self.n_components] def transform(self, X): X = X.copy() diff --git a/mla/rbm.py b/mla/rbm.py index ec1ee0b6..f7c56b1d 100644 --- a/mla/rbm.py +++ b/mla/rbm.py @@ -1,3 +1,4 @@ +# coding:utf-8 import logging from mla.base import BaseEstimator @@ -52,7 +53,7 @@ def _init_weights(self): self.errors = [] def _train(self): - """Use CD-1 training procedure, basically an exact inference for `positive_associations`, + """Use CD-1 training procedure, basically an exact inference for `positive_associations`, followed by a "non burn-in" block Gibbs Sampling for the `negative_associations`.""" for i in range(self.max_epochs): diff --git a/mla/rl/__init__.py b/mla/rl/__init__.py index 8b137891..f512deae 100644 --- a/mla/rl/__init__.py +++ b/mla/rl/__init__.py @@ -1 +1 @@ - +# coding:utf-8 diff --git a/mla/rl/dqn.py b/mla/rl/dqn.py index 2816899b..0b79412e 100644 --- a/mla/rl/dqn.py +++ b/mla/rl/dqn.py @@ -1,3 +1,4 @@ +# coding:utf-8 import logging import random diff --git a/mla/svm/__init__.py b/mla/svm/__init__.py index 8b137891..f512deae 100644 --- a/mla/svm/__init__.py +++ b/mla/svm/__init__.py @@ -1 +1 @@ - +# coding:utf-8 diff --git a/mla/svm/kernerls.py b/mla/svm/kernerls.py index 45d6a00a..da289a18 100644 --- a/mla/svm/kernerls.py +++ b/mla/svm/kernerls.py @@ -1,3 +1,4 @@ +# coding:utf-8 import numpy as np import scipy.spatial.distance as dist diff --git a/mla/tsne.py b/mla/tsne.py index 7e4f0bdd..880408d3 100644 --- a/mla/tsne.py +++ b/mla/tsne.py @@ -1,3 +1,4 @@ +# coding:utf-8 import logging import numpy as np diff --git a/setup.cfg b/setup.cfg index cb4a338e..1e036754 100644 --- a/setup.cfg +++ b/setup.cfg @@ -2,4 +2,7 @@ universal=1 [metadata] -description-file=README.md \ No newline at end of file +description-file=README.md + +[flake8] +max-line-length = 120 From fff1c2832bf7dfbe16c5ed4be42b2a856db407e5 Mon Sep 17 00:00:00 2001 From: Antetokounpo Date: Tue, 1 Oct 2019 19:34:27 -0400 Subject: [PATCH 30/49] Add Adamax optimizer and unit test --- mla/neuralnet/optimizers.py | 28 ++++++++++++++++++++++++++ mla/neuralnet/tests/test_optimizers.py | 2 ++ 2 files changed, 30 insertions(+) diff --git a/mla/neuralnet/optimizers.py b/mla/neuralnet/optimizers.py index a0a08c4f..9ce4d6c5 100644 --- a/mla/neuralnet/optimizers.py +++ b/mla/neuralnet/optimizers.py @@ -197,3 +197,31 @@ def setup(self, network): for n in layer.parameters.keys(): self.ms[i][n] = np.zeros_like(layer.parameters[n]) self.vs[i][n] = np.zeros_like(layer.parameters[n]) + +class Adamax(Optimizer): + def __init__(self, learning_rate=0.002, beta_1=0.9, beta_2=0.999, epsilon=1e-8): + + self.epsilon = epsilon + self.beta_2 = beta_2 + self.beta_1 = beta_1 + self.lr = learning_rate + self.t = 1 + + def update(self, network): + for i, layer in enumerate(network.parametric_layers): + for n in layer.parameters.keys(): + grad = layer.parameters.grad[n] + self.ms[i][n] = self.beta_1 * self.ms[i][n] + (1.0 - self.beta_1) * grad + self.us[i][n] = np.maximum(self.beta_2 * self.us[i][n], np.abs(grad)) + + step = self.lr / (1 - self.beta_1 ** self.t) * 
self.ms[i][n]/(self.us[i][n] + self.epsilon) + layer.parameters.step(n, -step) + self.t += 1 + + def setup(self, network): + self.ms = defaultdict(dict) + self.us = defaultdict(dict) + for i, layer in enumerate(network.parametric_layers): + for n in layer.parameters.keys(): + self.ms[i][n] = np.zeros_like(layer.parameters[n]) + self.us[i][n] = np.zeros_like(layer.parameters[n]) diff --git a/mla/neuralnet/tests/test_optimizers.py b/mla/neuralnet/tests/test_optimizers.py index d3fc2a65..863924ea 100644 --- a/mla/neuralnet/tests/test_optimizers.py +++ b/mla/neuralnet/tests/test_optimizers.py @@ -46,6 +46,8 @@ def test_adadelta(): def test_adam(): assert clasifier(Adam()) > 0.9 +def test_adamax(): + assert clasifier(Adamax()) > 0.9 def test_rmsprop(): assert clasifier(RMSprop()) > 0.9 From 0d1022ee46e3296825e2e743bc8f050c226b17ad Mon Sep 17 00:00:00 2001 From: Artem Golubin Date: Wed, 2 Oct 2019 15:35:46 +0300 Subject: [PATCH 31/49] Improve SVM test --- mla/tests/test_classification_accuracy.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mla/tests/test_classification_accuracy.py b/mla/tests/test_classification_accuracy.py index 6dd73089..f4fb42aa 100644 --- a/mla/tests/test_classification_accuracy.py +++ b/mla/tests/test_classification_accuracy.py @@ -50,8 +50,8 @@ def test_svm_classification(): y_signed_train = (y_train * 2) - 1 y_signed_test = (y_test * 2) - 1 - for kernel in [RBF(gamma=0.1), Linear()]: - model = SVM(max_iter=250, kernel=kernel) + for kernel in [RBF(gamma=0.05), Linear()]: + model = SVM(max_iter=500, kernel=kernel) model.fit(X_train, y_signed_train) predictions = model.predict(X_test) assert accuracy(y_signed_test, predictions) >= 0.8 From 8ba4818ba77b63f930932bfcfaa4da60985a5e1c Mon Sep 17 00:00:00 2001 From: Mohit Juneja Date: Thu, 10 Oct 2019 12:40:05 +0530 Subject: [PATCH 32/49] sh in documentation --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 2198ede9..d0acc8b5 100644 --- a/README.md +++ b/README.md @@ -24,6 +24,7 @@ All algorithms are implemented in Python, using numpy, scipy and autograd. ### Installation +```sh git clone https://github.com/rushter/MLAlgorithms cd MLAlgorithms pip install scipy numpy From 81b6b181dc46081c79db723a8fea18c4013ae9c6 Mon Sep 17 00:00:00 2001 From: Mohit Juneja Date: Thu, 10 Oct 2019 12:41:17 +0530 Subject: [PATCH 33/49] improved documentation --- README.md | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index d0acc8b5..bd3b4aa5 100644 --- a/README.md +++ b/README.md @@ -29,17 +29,19 @@ All algorithms are implemented in Python, using numpy, scipy and autograd. cd MLAlgorithms pip install scipy numpy python setup.py develop - +``` ### How to run examples without installation +```sh cd MLAlgorithms python -m examples.linear_models - +``` ### How to run examples within Docker +```sh cd MLAlgorithms docker build -t mlalgorithms . docker run --rm -it mlalgorithms bash python -m examples.linear_models - +``` ### Contributing Your contributions are always welcome! 
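
For context on PATCH 30 above: the snippet below is a minimal usage sketch of the new `Adamax` optimizer, not part of any patch in this series. It mirrors the MLP regression test shown earlier and assumes the same pieces of the public API visible in the diffs (`NeuralNet`, `Dense`, `Activation`, `Parameters`, and scikit-learn's data helpers); only the optimizer is swapped.

```python
# Sketch only (not part of the patch series): train the library's NeuralNet
# with the Adamax optimizer added in PATCH 30, mirroring the MLP regression test.
import numpy as np
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split

from mla.neuralnet import NeuralNet
from mla.neuralnet.layers import Dense, Activation, Parameters
from mla.neuralnet.optimizers import Adamax

X, y = make_regression(n_samples=1000, n_features=10, n_informative=10, noise=0.05, random_state=1111)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=1111)

model = NeuralNet(
    layers=[
        Dense(16, Parameters(init="normal")),
        Activation("linear"),
        Dense(8, Parameters(init="normal")),
        Activation("linear"),
        Dense(1),
    ],
    loss="mse",
    optimizer=Adamax(),  # the only change compared to the Adam-based test above
    metric="mse",
    batch_size=64,
    max_epochs=150,
)
model.fit(X_train, y_train)
predictions = model.predict(X_test).flatten()
print("test MSE:", np.mean((y_test - predictions) ** 2))
```
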
From 601c817cb4e24180142986bf2bbafa78a9a8078c Mon Sep 17 00:00:00 2001 From: nachtsky1077 <44351210+nachtsky1077@users.noreply.github.com> Date: Fri, 29 Nov 2019 13:33:39 +0800 Subject: [PATCH 34/49] fix typo when calculating intercept in simplified SMO algorithm --- mla/svm/svm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mla/svm/svm.py b/mla/svm/svm.py index 37922bf9..2874645c 100644 --- a/mla/svm/svm.py +++ b/mla/svm/svm.py @@ -75,7 +75,7 @@ def _train(self): b1 = ( self.b - e_i - - self.y[i] * (self.alpha[i] - alpha_jo) * self.K[i, i] + - self.y[i] * (self.alpha[i] - alpha_io) * self.K[i, i] - self.y[j] * (self.alpha[j] - alpha_jo) * self.K[i, j] ) b2 = ( From 4558cd390b57c7bd32824aa49dd073a2a45cc259 Mon Sep 17 00:00:00 2001 From: Artem Golubin Date: Wed, 1 Jan 2020 20:40:00 +0300 Subject: [PATCH 35/49] Update copyright --- LICENSE | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/LICENSE b/LICENSE index 41ce622e..6620ff65 100644 --- a/LICENSE +++ b/LICENSE @@ -1,6 +1,6 @@ MIT License -Copyright (c) 2016 Artem Golubin +Copyright (c) 2016-2020 Artem Golubin Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -18,4 +18,4 @@ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. \ No newline at end of file +SOFTWARE. From bcf95627fa1ac2d92fa760e8c7db8e4017f8ccf4 Mon Sep 17 00:00:00 2001 From: "Musketeer.Liu" Date: Sun, 23 Feb 2020 23:44:38 -0800 Subject: [PATCH 36/49] Fix the rbm example via delete unnecessary decode on str Object in mla/datasets/base.py line 54 --- mla/datasets/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mla/datasets/base.py b/mla/datasets/base.py index eb1aca87..efab346f 100644 --- a/mla/datasets/base.py +++ b/mla/datasets/base.py @@ -51,7 +51,7 @@ def load(dataset="training", digits=np.arange(10)): def load_nietzsche(): - text = open(get_filename("data/nietzsche.txt")).read().decode("utf-8").lower() + text = open(get_filename("data/nietzsche.txt")).read().lower() # str Object has already been decoded chars = set(list(text)) char_indices = {ch: i for i, ch in enumerate(chars)} indices_char = {i: ch for i, ch in enumerate(chars)} From 68522f2e6d6cb5fc8331acbb2d2ad5ac11dac981 Mon Sep 17 00:00:00 2001 From: "Musketeer.Liu" Date: Mon, 24 Feb 2020 11:19:10 -0800 Subject: [PATCH 37/49] Change open function mode for decode issue --- mla/datasets/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mla/datasets/base.py b/mla/datasets/base.py index efab346f..7133abcd 100644 --- a/mla/datasets/base.py +++ b/mla/datasets/base.py @@ -51,7 +51,7 @@ def load(dataset="training", digits=np.arange(10)): def load_nietzsche(): - text = open(get_filename("data/nietzsche.txt")).read().lower() # str Object has already been decoded + text = open(get_filename("data/nietzsche.txt", "rt")).read().decode("utf-8").lower() chars = set(list(text)) char_indices = {ch: i for i, ch in enumerate(chars)} indices_char = {i: ch for i, ch in enumerate(chars)} From ad58bf49f23548fbe442f78cd6161ba5aa8fa97e Mon Sep 17 00:00:00 2001 From: "Musketeer.Liu" Date: Wed, 26 Feb 2020 01:07:45 -0800 Subject: [PATCH 38/49] Add Mode as rt to avoid str Object decoding 
issue --- mla/datasets/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mla/datasets/base.py b/mla/datasets/base.py index 7133abcd..0a242129 100644 --- a/mla/datasets/base.py +++ b/mla/datasets/base.py @@ -51,7 +51,7 @@ def load(dataset="training", digits=np.arange(10)): def load_nietzsche(): - text = open(get_filename("data/nietzsche.txt", "rt")).read().decode("utf-8").lower() + text = open(get_filename("data/nietzsche.txt"), "rt").read().lower() chars = set(list(text)) char_indices = {ch: i for i, ch in enumerate(chars)} indices_char = {i: ch for i, ch in enumerate(chars)} From 0b9ebf3a99ab810d95b6a93af368ade92e474ec0 Mon Sep 17 00:00:00 2001 From: Shihab Shahriar Date: Thu, 28 May 2020 20:17:03 +0600 Subject: [PATCH 39/49] resolved classification bug in decision tree using n_classes --- examples/random_forest.py | 17 ++++++++++++----- mla/ensemble/random_forest.py | 8 +++++++- mla/ensemble/tree.py | 16 ++++++++++------ 3 files changed, 29 insertions(+), 12 deletions(-) diff --git a/examples/random_forest.py b/examples/random_forest.py index 39f47377..e1408244 100644 --- a/examples/random_forest.py +++ b/examples/random_forest.py @@ -1,8 +1,11 @@ +from timeit import default_timer +start = default_timer() import logging -from sklearn.datasets import make_classification +import numpy as np +from sklearn.datasets import make_classification, load_boston, load_digits, load_breast_cancer, load_iris from sklearn.datasets import make_regression -from sklearn.metrics import roc_auc_score +from sklearn.metrics import roc_auc_score, accuracy_score try: from sklearn.model_selection import train_test_split @@ -20,13 +23,15 @@ def classification(): X, y = make_classification( n_samples=500, n_features=10, n_informative=10, random_state=1111, n_classes=2, class_sep=2.5, n_redundant=0 ) + #X,y = load_breast_cancer(return_X_y=True) X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=1111) - model = RandomForestClassifier(n_estimators=10, max_depth=4) + model = RandomForestClassifier(n_estimators=5, max_depth=4) model.fit(X_train, y_train) - predictions = model.predict(X_test)[:, 1] - # print(predictions) + predictions = model.predict(X_test)[:,1] + #predictions = np.argmax(model.predict(X_test),axis=1) + print(predictions.shape) print("classification, roc auc score: %s" % roc_auc_score(y_test, predictions)) @@ -46,3 +51,5 @@ def regression(): if __name__ == "__main__": classification() # regression() + end = default_timer() + print(end-start) \ No newline at end of file diff --git a/mla/ensemble/random_forest.py b/mla/ensemble/random_forest.py index 263a1685..14d430e4 100644 --- a/mla/ensemble/random_forest.py +++ b/mla/ensemble/random_forest.py @@ -39,6 +39,7 @@ def fit(self, X, y): self._train() def _train(self): + n_classes = None if self.trees[0].regression else len(np.unique(self.y)) for tree in self.trees: tree.train( self.X, @@ -46,6 +47,7 @@ def _train(self): max_features=self.max_features, min_samples_split=self.min_samples_split, max_depth=self.max_depth, + n_classes=n_classes ) def _predict(self, X=None): @@ -78,10 +80,14 @@ def _predict(self, X=None): for i in range(X.shape[0]): row_pred = np.zeros(y_shape) for tree in self.trees: - row_pred += tree.predict_row(X[i, :]) + tmp = tree.predict_row(X[i, :]) + print(tmp,row_pred.shape,row_pred) + row_pred += tmp + row_pred /= self.n_estimators predictions[i, :] = row_pred + print(f"i={i},{row_pred}\n") return predictions diff --git a/mla/ensemble/tree.py b/mla/ensemble/tree.py index 
83c90f34..dd87a684 100644 --- a/mla/ensemble/tree.py +++ b/mla/ensemble/tree.py @@ -64,7 +64,8 @@ def _find_best_split(self, X, target, n_features): max_col, max_val, max_gain = column, value, gain return max_col, max_val, max_gain - def train(self, X, target, max_features=None, min_samples_split=10, max_depth=None, minimum_gain=0.01, loss=None): + def train(self, X, target, max_features=None, min_samples_split=10, max_depth=None, + minimum_gain=0.01, loss=None, n_classes = None): """Build a decision tree from training set. Parameters @@ -84,6 +85,8 @@ def train(self, X, target, max_features=None, min_samples_split=10, max_depth=No Minimum gain required for splitting. loss : function, default None Loss function for gradient boosting. + n_classes : int, default None + No of unique labels in case of classification """ if not isinstance(target, dict): @@ -118,17 +121,17 @@ def train(self, X, target, max_features=None, min_samples_split=10, max_depth=No # Grow left and right child self.left_child = Tree(self.regression, self.criterion) self.left_child.train( - left_X, left_target, max_features, min_samples_split, max_depth - 1, minimum_gain, loss + left_X, left_target, max_features, min_samples_split, max_depth - 1, minimum_gain, loss, n_classes ) self.right_child = Tree(self.regression, self.criterion) self.right_child.train( - right_X, right_target, max_features, min_samples_split, max_depth - 1, minimum_gain, loss + right_X, right_target, max_features, min_samples_split, max_depth - 1, minimum_gain, loss, n_classes ) except AssertionError: - self._calculate_leaf_value(target) + self._calculate_leaf_value(target, n_classes) - def _calculate_leaf_value(self, targets): + def _calculate_leaf_value(self, targets, n_classes): """Find optimal value for leaf.""" if self.loss is not None: # Gradient boosting @@ -140,7 +143,8 @@ def _calculate_leaf_value(self, targets): self.outcome = np.mean(targets["y"]) else: # Probability for classification task - self.outcome = stats.itemfreq(targets["y"])[:, 1] / float(targets["y"].shape[0]) + #self.outcome = stats.itemfreq(targets["y"])[:, 1] / float(targets["y"].shape[0]) + self.outcome = np.bincount(targets["y"], minlength=n_classes) / targets["y"].shape[0] def predict_row(self, row): """Predict single row.""" From 30e4c49f443cfa57f82d7b3b8e6974bb59405baf Mon Sep 17 00:00:00 2001 From: Shihab Shahriar Date: Fri, 29 May 2020 18:32:01 +0600 Subject: [PATCH 40/49] small code cleanup --- examples/random_forest.py | 19 ++++++++----------- mla/ensemble/random_forest.py | 6 +----- mla/ensemble/tree.py | 5 ++--- 3 files changed, 11 insertions(+), 19 deletions(-) diff --git a/examples/random_forest.py b/examples/random_forest.py index e1408244..ad0c2261 100644 --- a/examples/random_forest.py +++ b/examples/random_forest.py @@ -1,9 +1,7 @@ -from timeit import default_timer -start = default_timer() import logging import numpy as np -from sklearn.datasets import make_classification, load_boston, load_digits, load_breast_cancer, load_iris +from sklearn.datasets import make_classification from sklearn.datasets import make_regression from sklearn.metrics import roc_auc_score, accuracy_score @@ -23,16 +21,17 @@ def classification(): X, y = make_classification( n_samples=500, n_features=10, n_informative=10, random_state=1111, n_classes=2, class_sep=2.5, n_redundant=0 ) - #X,y = load_breast_cancer(return_X_y=True) X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=1111) - model = RandomForestClassifier(n_estimators=5, max_depth=4) + model 
= RandomForestClassifier(n_estimators=10, max_depth=4) model.fit(X_train, y_train) - predictions = model.predict(X_test)[:,1] - #predictions = np.argmax(model.predict(X_test),axis=1) - print(predictions.shape) - print("classification, roc auc score: %s" % roc_auc_score(y_test, predictions)) + + predictions_prob = model.predict(X_test)[:, 1] + predictions = np.argmax(model.predict(X_test), axis=1) + #print(predictions.shape) + print("classification, roc auc score: %s" % roc_auc_score(y_test, predictions_prob)) + print("classification, accuracy score: %s" % accuracy_score(y_test, predictions)) def regression(): @@ -51,5 +50,3 @@ def regression(): if __name__ == "__main__": classification() # regression() - end = default_timer() - print(end-start) \ No newline at end of file diff --git a/mla/ensemble/random_forest.py b/mla/ensemble/random_forest.py index 14d430e4..3cc78155 100644 --- a/mla/ensemble/random_forest.py +++ b/mla/ensemble/random_forest.py @@ -80,14 +80,10 @@ def _predict(self, X=None): for i in range(X.shape[0]): row_pred = np.zeros(y_shape) for tree in self.trees: - tmp = tree.predict_row(X[i, :]) - print(tmp,row_pred.shape,row_pred) - row_pred += tmp - + row_pred += tree.predict_row(X[i, :]) row_pred /= self.n_estimators predictions[i, :] = row_pred - print(f"i={i},{row_pred}\n") return predictions diff --git a/mla/ensemble/tree.py b/mla/ensemble/tree.py index dd87a684..5f95ca45 100644 --- a/mla/ensemble/tree.py +++ b/mla/ensemble/tree.py @@ -65,7 +65,7 @@ def _find_best_split(self, X, target, n_features): return max_col, max_val, max_gain def train(self, X, target, max_features=None, min_samples_split=10, max_depth=None, - minimum_gain=0.01, loss=None, n_classes = None): + minimum_gain=0.01, loss=None, n_classes=None): """Build a decision tree from training set. Parameters @@ -85,7 +85,7 @@ def train(self, X, target, max_features=None, min_samples_split=10, max_depth=No Minimum gain required for splitting. loss : function, default None Loss function for gradient boosting. - n_classes : int, default None + n_classes : int or None No of unique labels in case of classification """ @@ -143,7 +143,6 @@ def _calculate_leaf_value(self, targets, n_classes): self.outcome = np.mean(targets["y"]) else: # Probability for classification task - #self.outcome = stats.itemfreq(targets["y"])[:, 1] / float(targets["y"].shape[0]) self.outcome = np.bincount(targets["y"], minlength=n_classes) / targets["y"].shape[0] def predict_row(self, row): From fbb730844d8704bc2fe73899797e42e41627990b Mon Sep 17 00:00:00 2001 From: Shihab Shahriar Date: Fri, 29 May 2020 19:25:18 +0600 Subject: [PATCH 41/49] resolved bug when all labels in leaf node are 0 --- mla/ensemble/tree.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mla/ensemble/tree.py b/mla/ensemble/tree.py index 5f95ca45..780e4ec8 100644 --- a/mla/ensemble/tree.py +++ b/mla/ensemble/tree.py @@ -65,7 +65,7 @@ def _find_best_split(self, X, target, n_features): return max_col, max_val, max_gain def train(self, X, target, max_features=None, min_samples_split=10, max_depth=None, - minimum_gain=0.01, loss=None, n_classes=None): + minimum_gain=0.01, loss=None, n_classes=2): """Build a decision tree from training set. Parameters @@ -85,7 +85,7 @@ def train(self, X, target, max_features=None, min_samples_split=10, max_depth=No Minimum gain required for splitting. loss : function, default None Loss function for gradient boosting. 
- n_classes : int or None + n_classes : int No of unique labels in case of classification """ From 60ee2893c255920626d116595ebafc30e0b6e144 Mon Sep 17 00:00:00 2001 From: Shihab Shahriar Date: Fri, 29 May 2020 20:12:48 +0600 Subject: [PATCH 42/49] refactored Decision Tree to support n_classes --- mla/ensemble/random_forest.py | 4 +- mla/ensemble/tree.py | 84 +++++++++++++++++++---------------- 2 files changed, 46 insertions(+), 42 deletions(-) diff --git a/mla/ensemble/random_forest.py b/mla/ensemble/random_forest.py index 3cc78155..98d98e7b 100644 --- a/mla/ensemble/random_forest.py +++ b/mla/ensemble/random_forest.py @@ -39,15 +39,13 @@ def fit(self, X, y): self._train() def _train(self): - n_classes = None if self.trees[0].regression else len(np.unique(self.y)) for tree in self.trees: tree.train( self.X, self.y, max_features=self.max_features, min_samples_split=self.min_samples_split, - max_depth=self.max_depth, - n_classes=n_classes + max_depth=self.max_depth ) def _predict(self, X=None): diff --git a/mla/ensemble/tree.py b/mla/ensemble/tree.py index 780e4ec8..959378a1 100644 --- a/mla/ensemble/tree.py +++ b/mla/ensemble/tree.py @@ -12,7 +12,7 @@ class Tree(object): """Recursive implementation of decision tree.""" - def __init__(self, regression=False, criterion=None): + def __init__(self, regression=False, criterion=None, n_classes=None): self.regression = regression self.impurity = None self.threshold = None @@ -20,6 +20,7 @@ def __init__(self, regression=False, criterion=None): self.outcome = None self.criterion = criterion self.loss = None + self.n_classes = n_classes #Only for classification self.left_child = None self.right_child = None @@ -64,8 +65,43 @@ def _find_best_split(self, X, target, n_features): max_col, max_val, max_gain = column, value, gain return max_col, max_val, max_gain - def train(self, X, target, max_features=None, min_samples_split=10, max_depth=None, - minimum_gain=0.01, loss=None, n_classes=2): + def _train(self, X, target, max_features=None, min_samples_split=10, max_depth=None, minimum_gain=0.01): + try: + # Exit from recursion using assert syntax + assert X.shape[0] > min_samples_split + assert max_depth > 0 + + if max_features is None: + max_features = X.shape[1] + + column, value, gain = self._find_best_split(X, target, max_features) + assert gain is not None + if self.regression: + assert gain != 0 + else: + assert gain > minimum_gain + + self.column_index = column + self.threshold = value + self.impurity = gain + + # Split dataset + left_X, right_X, left_target, right_target = split_dataset(X, target, column, value) + + # Grow left and right child + self.left_child = Tree(self.regression, self.criterion, self.n_classes) + self.left_child._train( + left_X, left_target, max_features, min_samples_split, max_depth - 1, minimum_gain + ) + + self.right_child = Tree(self.regression, self.criterion, self.n_classes) + self.right_child._train( + right_X, right_target, max_features, min_samples_split, max_depth - 1, minimum_gain + ) + except AssertionError: + self._calculate_leaf_value(target) + + def train(self, X, target, max_features=None, min_samples_split=10, max_depth=None, minimum_gain=0.01, loss=None): """Build a decision tree from training set. Parameters @@ -85,8 +121,6 @@ def train(self, X, target, max_features=None, min_samples_split=10, max_depth=No Minimum gain required for splitting. loss : function, default None Loss function for gradient boosting. 
- n_classes : int - No of unique labels in case of classification """ if not isinstance(target, dict): @@ -96,42 +130,14 @@ def train(self, X, target, max_features=None, min_samples_split=10, max_depth=No if loss is not None: self.loss = loss - try: - # Exit from recursion using assert syntax - assert X.shape[0] > min_samples_split - assert max_depth > 0 - - if max_features is None: - max_features = X.shape[1] - - column, value, gain = self._find_best_split(X, target, max_features) - assert gain is not None - if self.regression: - assert gain != 0 - else: - assert gain > minimum_gain - - self.column_index = column - self.threshold = value - self.impurity = gain + if self.regression==False: + self.n_classes = len(np.unique(target['y'])) - # Split dataset - left_X, right_X, left_target, right_target = split_dataset(X, target, column, value) - - # Grow left and right child - self.left_child = Tree(self.regression, self.criterion) - self.left_child.train( - left_X, left_target, max_features, min_samples_split, max_depth - 1, minimum_gain, loss, n_classes - ) + self._train(X, target, max_features=max_features, min_samples_split=min_samples_split, + max_depth=max_depth, minimum_gain=minimum_gain) - self.right_child = Tree(self.regression, self.criterion) - self.right_child.train( - right_X, right_target, max_features, min_samples_split, max_depth - 1, minimum_gain, loss, n_classes - ) - except AssertionError: - self._calculate_leaf_value(target, n_classes) - def _calculate_leaf_value(self, targets, n_classes): + def _calculate_leaf_value(self, targets): """Find optimal value for leaf.""" if self.loss is not None: # Gradient boosting @@ -143,7 +149,7 @@ def _calculate_leaf_value(self, targets, n_classes): self.outcome = np.mean(targets["y"]) else: # Probability for classification task - self.outcome = np.bincount(targets["y"], minlength=n_classes) / targets["y"].shape[0] + self.outcome = np.bincount(targets["y"], minlength=self.n_classes) / targets["y"].shape[0] def predict_row(self, row): """Predict single row.""" From 7a62559bf415ea256adadfa8b66b605113216ac4 Mon Sep 17 00:00:00 2001 From: Artem Golubin Date: Thu, 20 Aug 2020 22:18:01 +0300 Subject: [PATCH 43/49] Create python-app.yml --- .github/workflows/python-app.yml | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) create mode 100644 .github/workflows/python-app.yml diff --git a/.github/workflows/python-app.yml b/.github/workflows/python-app.yml new file mode 100644 index 00000000..0c51d852 --- /dev/null +++ b/.github/workflows/python-app.yml @@ -0,0 +1,28 @@ +# This workflow will install Python dependencies, run tests and lint with a single version of Python +# For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions + +name: Python application + +on: + push: + branches: [ master ] + pull_request: + branches: [ master ] + +jobs: + build: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + - name: Set up Python 3.8 + uses: actions/setup-python@v2 + with: + python-version: 3.8 + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install flake8 pytest + pip install -r requirements.txt + - name: Test with pytest + run: | + pytest From 8b8fbb291c7700242714ed982501bf02cf41758e Mon Sep 17 00:00:00 2001 From: Artem Golubin Date: Thu, 20 Aug 2020 22:20:13 +0300 Subject: [PATCH 44/49] Remove TravisCI --- .github/workflows/python-app.yml | 1 + .travis.yml | 29 ----------------------------- 2 files changed, 1 insertion(+), 29 
deletions(-) delete mode 100644 .travis.yml diff --git a/.github/workflows/python-app.yml b/.github/workflows/python-app.yml index 0c51d852..d88e2e6e 100644 --- a/.github/workflows/python-app.yml +++ b/.github/workflows/python-app.yml @@ -11,6 +11,7 @@ on: jobs: build: + timeout-minutes: 5 runs-on: ubuntu-latest steps: - uses: actions/checkout@v2 diff --git a/.travis.yml b/.travis.yml deleted file mode 100644 index 9efaa8b2..00000000 --- a/.travis.yml +++ /dev/null @@ -1,29 +0,0 @@ -language: python -python: - # We don't actually use the Travis Python, but this keeps it organized. - - "2.7" - - "3.5" -install: - - sudo apt-get update - # We do this conditionally because it saves us some downloading if the - # version is the same. - - if [[ "$TRAVIS_PYTHON_VERSION" == "2.7" ]]; then - wget https://repo.continuum.io/miniconda/Miniconda2-latest-Linux-x86_64.sh -O miniconda.sh; - else - wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh; - fi - - bash miniconda.sh -b -p $HOME/miniconda - - export PATH="$HOME/miniconda/bin:$PATH" - - hash -r - - conda config --set always_yes yes --set changeps1 no - - conda update -q conda - - conda info -a - - - conda create -q -n test-environment python=$TRAVIS_PYTHON_VERSION numpy scipy scikit-learn - - source activate test-environment - - pip install pytest - - pip install -r requirements.txt - - -script: - - py.test From 9f9e0615f5ef6960e6b79a925ecda1ec6816c171 Mon Sep 17 00:00:00 2001 From: Artem Golubin Date: Thu, 20 Aug 2020 22:26:04 +0300 Subject: [PATCH 45/49] Minor edits to CI config --- .github/workflows/python-app.yml | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/.github/workflows/python-app.yml b/.github/workflows/python-app.yml index d88e2e6e..b3ff0e27 100644 --- a/.github/workflows/python-app.yml +++ b/.github/workflows/python-app.yml @@ -1,7 +1,4 @@ -# This workflow will install Python dependencies, run tests and lint with a single version of Python -# For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions - -name: Python application +name: Tests on: push: From ffbe4085e204205e61bec192c463239b2cfc46fe Mon Sep 17 00:00:00 2001 From: Artem Golubin Date: Thu, 20 Aug 2020 22:43:06 +0300 Subject: [PATCH 46/49] Minor fixes --- mla/base/base.py | 8 +++----- mla/datasets/base.py | 4 +++- mla/neuralnet/optimizers.py | 1 + mla/pca.py | 2 +- mla/rbm.py | 2 +- mla/svm/svm.py | 8 ++------ requirements.txt | 1 + 7 files changed, 12 insertions(+), 14 deletions(-) diff --git a/mla/base/base.py b/mla/base/base.py index ea82a20d..caa71e0b 100644 --- a/mla/base/base.py +++ b/mla/base/base.py @@ -2,9 +2,7 @@ import numpy as np -class BaseEstimator(object): - X = None - y = None +class BaseEstimator: y_required = True fit_required = True @@ -28,7 +26,7 @@ def _setup_input(self, X, y=None): X = np.array(X) if X.size == 0: - raise ValueError("Number of features must be > 0") + raise ValueError("Got an empty matrix.") if X.ndim == 1: self.n_samples, self.n_features = 1, X.shape @@ -45,7 +43,7 @@ def _setup_input(self, X, y=None): y = np.array(y) if y.size == 0: - raise ValueError("Number of targets must be > 0") + raise ValueError("The targets array must be no-empty.") self.y = y diff --git a/mla/datasets/base.py b/mla/datasets/base.py index 0a242129..a7f9cede 100644 --- a/mla/datasets/base.py +++ b/mla/datasets/base.py @@ -19,6 +19,8 @@ def load(dataset="training", digits=np.arange(10)): elif dataset == "test": fname_img = 
get_filename("data/mnist/t10k-images-idx3-ubyte") fname_lbl = get_filename("data/mnist/t10k-labels-idx1-ubyte") + else: + raise ValueError("Unexpected dataset name: %r" % dataset) flbl = open(fname_lbl, "rb") magic_nr, size = struct.unpack(">II", flbl.read(8)) @@ -36,7 +38,7 @@ def load(dataset="training", digits=np.arange(10)): images = zeros((N, rows, cols), dtype=uint8) labels = zeros((N, 1), dtype=int8) for i in range(len(ind)): - images[i] = array(img[ind[i] * rows * cols : (ind[i] + 1) * rows * cols]).reshape((rows, cols)) + images[i] = array(img[ind[i] * rows * cols: (ind[i] + 1) * rows * cols]).reshape((rows, cols)) labels[i] = lbl[ind[i]] return images, labels diff --git a/mla/neuralnet/optimizers.py b/mla/neuralnet/optimizers.py index 9ce4d6c5..9baa8b73 100644 --- a/mla/neuralnet/optimizers.py +++ b/mla/neuralnet/optimizers.py @@ -198,6 +198,7 @@ def setup(self, network): self.ms[i][n] = np.zeros_like(layer.parameters[n]) self.vs[i][n] = np.zeros_like(layer.parameters[n]) + class Adamax(Optimizer): def __init__(self, learning_rate=0.002, beta_1=0.9, beta_2=0.999, epsilon=1e-8): diff --git a/mla/pca.py b/mla/pca.py index adc076a7..6d4f440c 100644 --- a/mla/pca.py +++ b/mla/pca.py @@ -47,7 +47,7 @@ def _decompose(self, X): Vh = Vh.T s_squared = s ** 2 - variance_ratio = s_squared / (s_squared).sum() + variance_ratio = s_squared / s_squared.sum() logging.info("Explained variance ratio: %s" % (variance_ratio[0: self.n_components])) self.components = Vh[0: self.n_components] diff --git a/mla/rbm.py b/mla/rbm.py index f7c56b1d..5462cfd4 100644 --- a/mla/rbm.py +++ b/mla/rbm.py @@ -64,7 +64,7 @@ def _train(self): positive_associations = np.dot(batch.T, positive_hidden) negative_visible = sigmoid(np.dot(hidden_states, self.W.T) + self.bias_v) - negative_visible = self._sample(negative_visible) # use the samped hidden state h1 to sample v1 + negative_visible = self._sample(negative_visible) # use the sampled hidden state h1 to sample v1 negative_hidden = sigmoid(np.dot(negative_visible, self.W) + self.bias_h) negative_associations = np.dot(negative_visible.T, negative_hidden) diff --git a/mla/svm/svm.py b/mla/svm/svm.py index 2874645c..31ed794c 100644 --- a/mla/svm/svm.py +++ b/mla/svm/svm.py @@ -73,15 +73,11 @@ def _train(self): # Find intercept b1 = ( - self.b - - e_i - - self.y[i] * (self.alpha[i] - alpha_io) * self.K[i, i] + self.b - e_i - self.y[i] * (self.alpha[i] - alpha_io) * self.K[i, i] - self.y[j] * (self.alpha[j] - alpha_jo) * self.K[i, j] ) b2 = ( - self.b - - e_j - - self.y[j] * (self.alpha[j] - alpha_jo) * self.K[j, j] + self.b - e_j - self.y[j] * (self.alpha[j] - alpha_jo) * self.K[j, j] - self.y[i] * (self.alpha[i] - alpha_io) * self.K[i, j] ) if 0 < self.alpha[i] < self.C: diff --git a/requirements.txt b/requirements.txt index ddb89721..df2b88f2 100644 --- a/requirements.txt +++ b/requirements.txt @@ -6,3 +6,4 @@ scipy>=0.18.0 seaborn>=0.7.1 six>=1.10.0 autograd>=1.1.7 +gym From 2378011a2c9cf203a1ee9598ce958e3901db8c29 Mon Sep 17 00:00:00 2001 From: Artem Golubin Date: Thu, 20 Aug 2020 22:45:02 +0300 Subject: [PATCH 47/49] Style fixes --- mla/datasets/base.py | 1 + mla/ensemble/gbm.py | 1 - mla/ensemble/random_forest.py | 1 - mla/fm.py | 5 +++-- mla/gaussian_mixture.py | 4 +++- mla/kmeans.py | 4 ++-- mla/metrics/distance.py | 3 ++- mla/naive_bayes.py | 1 + mla/neuralnet/layers/normalization.py | 3 ++- mla/neuralnet/layers/recurrent/__init__.py | 2 +- mla/neuralnet/layers/recurrent/lstm.py | 1 - mla/neuralnet/layers/recurrent/rnn.py | 1 - mla/neuralnet/loss.py | 1 - 
mla/neuralnet/optimizers.py | 2 +- mla/neuralnet/regularizers.py | 2 +- mla/neuralnet/tests/test_activations.py | 1 + mla/neuralnet/tests/test_optimizers.py | 4 +++- mla/pca.py | 5 +++-- mla/rbm.py | 4 ++-- mla/rl/dqn.py | 1 - mla/svm/svm.py | 6 ++++-- mla/tests/test_reduction.py | 3 +-- mla/tests/test_regression_accuracy.py | 1 - mla/tsne.py | 1 - requirements.txt | 1 - 25 files changed, 31 insertions(+), 28 deletions(-) diff --git a/mla/datasets/base.py b/mla/datasets/base.py index a7f9cede..9aa30c77 100644 --- a/mla/datasets/base.py +++ b/mla/datasets/base.py @@ -1,5 +1,6 @@ # coding:utf-8 import os + import numpy as np diff --git a/mla/ensemble/gbm.py b/mla/ensemble/gbm.py index c84159ab..7a956616 100644 --- a/mla/ensemble/gbm.py +++ b/mla/ensemble/gbm.py @@ -1,6 +1,5 @@ # coding:utf-8 import numpy as np - # logistic function from scipy.special import expit diff --git a/mla/ensemble/random_forest.py b/mla/ensemble/random_forest.py index 263a1685..8a0d5edf 100644 --- a/mla/ensemble/random_forest.py +++ b/mla/ensemble/random_forest.py @@ -4,7 +4,6 @@ from mla.base import BaseEstimator from mla.ensemble.base import information_gain, mse_criterion from mla.ensemble.tree import Tree -from six.moves import range class RandomForest(BaseEstimator): diff --git a/mla/fm.py b/mla/fm.py index bfb320f7..85964a99 100644 --- a/mla/fm.py +++ b/mla/fm.py @@ -1,10 +1,11 @@ # coding:utf-8 -from mla.base import BaseEstimator -from mla.metrics import mean_squared_error, binary_crossentropy import autograd.numpy as np from autograd import elementwise_grad +from mla.base import BaseEstimator +from mla.metrics import mean_squared_error, binary_crossentropy + np.random.seed(9999) """ diff --git a/mla/gaussian_mixture.py b/mla/gaussian_mixture.py index a39e6e91..d2f1b9b2 100644 --- a/mla/gaussian_mixture.py +++ b/mla/gaussian_mixture.py @@ -1,9 +1,11 @@ # coding:utf-8 import random + +import matplotlib.pyplot as plt import numpy as np from scipy.stats import multivariate_normal -import matplotlib.pyplot as plt + from mla.base import BaseEstimator from mla.kmeans import KMeans diff --git a/mla/kmeans.py b/mla/kmeans.py index d96711cb..261de8e1 100644 --- a/mla/kmeans.py +++ b/mla/kmeans.py @@ -1,10 +1,10 @@ # coding:utf-8 import random -import seaborn as sns -import matplotlib.pyplot as plt +import matplotlib.pyplot as plt import numpy as np +import seaborn as sns from mla.base import BaseEstimator from mla.metrics.distance import euclidean_distance diff --git a/mla/metrics/distance.py b/mla/metrics/distance.py index dcfe3abe..919d4650 100644 --- a/mla/metrics/distance.py +++ b/mla/metrics/distance.py @@ -1,7 +1,8 @@ # coding:utf-8 -import numpy as np import math +import numpy as np + def euclidean_distance(a, b): if isinstance(a, list) and isinstance(b, list): diff --git a/mla/naive_bayes.py b/mla/naive_bayes.py index ec0d499b..4b7f4cd2 100644 --- a/mla/naive_bayes.py +++ b/mla/naive_bayes.py @@ -1,6 +1,7 @@ # coding:utf-8 import numpy as np + from mla.base import BaseEstimator from mla.neuralnet.activations import softmax diff --git a/mla/neuralnet/layers/normalization.py b/mla/neuralnet/layers/normalization.py index 6a1ae28d..4f601a81 100644 --- a/mla/neuralnet/layers/normalization.py +++ b/mla/neuralnet/layers/normalization.py @@ -1,7 +1,8 @@ # coding:utf-8 +import numpy as np + from mla.neuralnet.layers import Layer, PhaseMixin, ParamMixin from mla.neuralnet.parameters import Parameters -import numpy as np """ References: diff --git a/mla/neuralnet/layers/recurrent/__init__.py 
b/mla/neuralnet/layers/recurrent/__init__.py index a6e70cd9..390b5754 100644 --- a/mla/neuralnet/layers/recurrent/__init__.py +++ b/mla/neuralnet/layers/recurrent/__init__.py @@ -1,3 +1,3 @@ # coding:utf-8 -from .rnn import * from .lstm import * +from .rnn import * diff --git a/mla/neuralnet/layers/recurrent/lstm.py b/mla/neuralnet/layers/recurrent/lstm.py index 8d3ade4f..e0b4ce0f 100644 --- a/mla/neuralnet/layers/recurrent/lstm.py +++ b/mla/neuralnet/layers/recurrent/lstm.py @@ -1,7 +1,6 @@ # coding:utf-8 import autograd.numpy as np from autograd import elementwise_grad -from six.moves import range from mla.neuralnet.activations import sigmoid from mla.neuralnet.initializations import get_initializer diff --git a/mla/neuralnet/layers/recurrent/rnn.py b/mla/neuralnet/layers/recurrent/rnn.py index 2b848080..3110a261 100644 --- a/mla/neuralnet/layers/recurrent/rnn.py +++ b/mla/neuralnet/layers/recurrent/rnn.py @@ -1,7 +1,6 @@ # coding:utf-8 import autograd.numpy as np from autograd import elementwise_grad -from six.moves import range from mla.neuralnet.initializations import get_initializer from mla.neuralnet.layers import Layer, get_activation, ParamMixin diff --git a/mla/neuralnet/loss.py b/mla/neuralnet/loss.py index 30def7e9..8be4dbe3 100644 --- a/mla/neuralnet/loss.py +++ b/mla/neuralnet/loss.py @@ -1,5 +1,4 @@ from ..metrics import mse, logloss, mae, hinge, binary_crossentropy - categorical_crossentropy = logloss diff --git a/mla/neuralnet/optimizers.py b/mla/neuralnet/optimizers.py index 9baa8b73..fa254870 100644 --- a/mla/neuralnet/optimizers.py +++ b/mla/neuralnet/optimizers.py @@ -215,7 +215,7 @@ def update(self, network): self.ms[i][n] = self.beta_1 * self.ms[i][n] + (1.0 - self.beta_1) * grad self.us[i][n] = np.maximum(self.beta_2 * self.us[i][n], np.abs(grad)) - step = self.lr / (1 - self.beta_1 ** self.t) * self.ms[i][n]/(self.us[i][n] + self.epsilon) + step = self.lr / (1 - self.beta_1 ** self.t) * self.ms[i][n] / (self.us[i][n] + self.epsilon) layer.parameters.step(n, -step) self.t += 1 diff --git a/mla/neuralnet/regularizers.py b/mla/neuralnet/regularizers.py index d674590b..53bc3b37 100644 --- a/mla/neuralnet/regularizers.py +++ b/mla/neuralnet/regularizers.py @@ -1,6 +1,6 @@ # coding:utf-8 -from autograd import elementwise_grad import numpy as np +from autograd import elementwise_grad class Regularizer(object): diff --git a/mla/neuralnet/tests/test_activations.py b/mla/neuralnet/tests/test_activations.py index 9b16dc06..fc5de9ad 100644 --- a/mla/neuralnet/tests/test_activations.py +++ b/mla/neuralnet/tests/test_activations.py @@ -1,4 +1,5 @@ import sys + import numpy as np from mla.neuralnet.activations import * diff --git a/mla/neuralnet/tests/test_optimizers.py b/mla/neuralnet/tests/test_optimizers.py index 863924ea..a42b5036 100644 --- a/mla/neuralnet/tests/test_optimizers.py +++ b/mla/neuralnet/tests/test_optimizers.py @@ -1,6 +1,6 @@ -from sklearn.model_selection import train_test_split from sklearn.datasets import make_classification from sklearn.metrics import roc_auc_score +from sklearn.model_selection import train_test_split from mla.neuralnet import NeuralNet from mla.neuralnet.layers import Dense, Activation, Dropout, Parameters @@ -46,9 +46,11 @@ def test_adadelta(): def test_adam(): assert clasifier(Adam()) > 0.9 + def test_adamax(): assert clasifier(Adamax()) > 0.9 + def test_rmsprop(): assert clasifier(RMSprop()) > 0.9 diff --git a/mla/pca.py b/mla/pca.py index 6d4f440c..64d6a614 100644 --- a/mla/pca.py +++ b/mla/pca.py @@ -1,8 +1,9 @@ # coding:utf-8 -from 
scipy.linalg import svd -import numpy as np import logging +import numpy as np +from scipy.linalg import svd + from mla.base import BaseEstimator np.random.seed(1000) diff --git a/mla/rbm.py b/mla/rbm.py index 5462cfd4..f74234ef 100644 --- a/mla/rbm.py +++ b/mla/rbm.py @@ -1,10 +1,10 @@ # coding:utf-8 import logging -from mla.base import BaseEstimator -from scipy.special import expit import numpy as np +from scipy.special import expit +from mla.base import BaseEstimator from mla.utils import batch_iterator np.random.seed(9999) diff --git a/mla/rl/dqn.py b/mla/rl/dqn.py index 0b79412e..ec8c6c06 100644 --- a/mla/rl/dqn.py +++ b/mla/rl/dqn.py @@ -5,7 +5,6 @@ import gym import numpy as np from gym import wrappers -from six.moves import range np.random.seed(9999) diff --git a/mla/svm/svm.py b/mla/svm/svm.py index 31ed794c..b9695e13 100644 --- a/mla/svm/svm.py +++ b/mla/svm/svm.py @@ -1,8 +1,10 @@ # coding:utf-8 +import logging + +import numpy as np + from mla.base import BaseEstimator from mla.svm.kernerls import Linear -import numpy as np -import logging np.random.seed(9999) diff --git a/mla/tests/test_reduction.py b/mla/tests/test_reduction.py index 4946ea6f..da87fc82 100644 --- a/mla/tests/test_reduction.py +++ b/mla/tests/test_reduction.py @@ -1,8 +1,7 @@ # coding=utf-8 import pytest - -from sklearn.metrics import roc_auc_score from sklearn.datasets import make_classification +from sklearn.metrics import roc_auc_score try: from sklearn.model_selection import train_test_split diff --git a/mla/tests/test_regression_accuracy.py b/mla/tests/test_regression_accuracy.py index 3483992a..5c13b7f7 100644 --- a/mla/tests/test_regression_accuracy.py +++ b/mla/tests/test_regression_accuracy.py @@ -12,7 +12,6 @@ from mla.neuralnet.optimizers import Adam from mla.neuralnet.parameters import Parameters - # Generate a random regression problem X, y = make_regression( n_samples=1000, n_features=10, n_informative=10, n_targets=1, noise=0.05, random_state=1111, bias=0.5 diff --git a/mla/tsne.py b/mla/tsne.py index 880408d3..c76dc89f 100644 --- a/mla/tsne.py +++ b/mla/tsne.py @@ -2,7 +2,6 @@ import logging import numpy as np -from six.moves import range from mla.base import BaseEstimator from mla.metrics.distance import l2_distance diff --git a/requirements.txt b/requirements.txt index df2b88f2..45f053d0 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,6 +4,5 @@ numpy>=1.11.1 scikit-learn>=0.18 scipy>=0.18.0 seaborn>=0.7.1 -six>=1.10.0 autograd>=1.1.7 gym From ef5aca0ae36334b0596062553339c47f076304b2 Mon Sep 17 00:00:00 2001 From: Artem Golubin Date: Mon, 14 Sep 2020 01:13:43 +0300 Subject: [PATCH 48/49] Apply suggestions from code review --- mla/ensemble/tree.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mla/ensemble/tree.py b/mla/ensemble/tree.py index 959378a1..0b4e9769 100644 --- a/mla/ensemble/tree.py +++ b/mla/ensemble/tree.py @@ -20,7 +20,7 @@ def __init__(self, regression=False, criterion=None, n_classes=None): self.outcome = None self.criterion = criterion self.loss = None - self.n_classes = n_classes #Only for classification + self.n_classes = n_classes # Only for classification self.left_child = None self.right_child = None @@ -130,7 +130,7 @@ def train(self, X, target, max_features=None, min_samples_split=10, max_depth=No if loss is not None: self.loss = loss - if self.regression==False: + if not self.regression: self.n_classes = len(np.unique(target['y'])) self._train(X, target, max_features=max_features, min_samples_split=min_samples_split, From 
d20d3130d601c38148117bf9e5956de8d99a79ae Mon Sep 17 00:00:00 2001 From: Paul Inder <35228969+paulinder@users.noreply.github.com> Date: Sun, 30 Jan 2022 18:51:43 -0500 Subject: [PATCH 49/49] fix gradient descent optimization algorithm link sebastian ruder updated the domain of his site. --- mla/neuralnet/optimizers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mla/neuralnet/optimizers.py b/mla/neuralnet/optimizers.py index fa254870..fc9ae1bb 100644 --- a/mla/neuralnet/optimizers.py +++ b/mla/neuralnet/optimizers.py @@ -10,7 +10,7 @@ """ References: - Gradient descent optimization algorithms http://sebastianruder.com/optimizing-gradient-descent/index.html + Gradient descent optimization algorithms https://ruder.io/optimizing-gradient-descent/ """
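
A closing note on the decision-tree patches (39-42 and 48) above: classification leaves now build their outcome with `np.bincount(targets["y"], minlength=self.n_classes)`, so every leaf yields a probability vector of fixed length even when some class never reaches that leaf. The snippet below only illustrates that behaviour with made-up labels; it is not part of any patch in this series.

```python
# Illustration only (not part of the patch series): why `minlength` matters when
# converting the labels that reach a leaf into a fixed-length probability vector.
import numpy as np

n_classes = 2
leaf_labels = np.array([0, 0, 0])  # hypothetical leaf that only ever sees class 0

# Without minlength the result has length 1 and cannot be combined with the
# length-2 vectors produced by other leaves/trees in the forest.
print(np.bincount(leaf_labels) / leaf_labels.shape[0])                       # [1.]
print(np.bincount(leaf_labels, minlength=n_classes) / leaf_labels.shape[0])  # [1. 0.]
```
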