From edf9a137a8173618ab43bf7b164516c47b1ae88d Mon Sep 17 00:00:00 2001
From: Artem Golubin <me@rushter.com>
Date: Mon, 16 Jun 2025 01:11:28 +0400
Subject: [PATCH 1/3] Update GitHub Actions

---
 .github/workflows/python-app.yml | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/.github/workflows/python-app.yml b/.github/workflows/python-app.yml
index b3ff0e27..a5a8d93e 100644
--- a/.github/workflows/python-app.yml
+++ b/.github/workflows/python-app.yml
@@ -11,11 +11,11 @@ jobs:
     timeout-minutes: 5
     runs-on: ubuntu-latest
     steps:
-    - uses: actions/checkout@v2
-    - name: Set up Python 3.8
-      uses: actions/setup-python@v2
+    - uses: actions/checkout@v4
+    - name: Set up Python 3.12
+      uses: actions/setup-python@v4
       with:
-        python-version: 3.8
+        python-version: 3.12
     - name: Install dependencies
       run: |
         python -m pip install --upgrade pip

From 4e93532b6f6907de9199b2b649ae6557688d5eb6 Mon Sep 17 00:00:00 2001
From: Artem Golubin <me@rushter.com>
Date: Mon, 16 Jun 2025 01:15:27 +0400
Subject: [PATCH 2/3] Format using ruff

---
 mla/datasets/base.py                      |  6 ++-
 mla/ensemble/base.py                      | 11 +++-
 mla/ensemble/gbm.py                       | 14 +++++-
 mla/ensemble/random_forest.py             | 29 +++++++++--
 mla/ensemble/tree.py                      | 61 +++++++++++++++++++----
 mla/fm.py                                 | 18 +++++--
 mla/gaussian_mixture.py                   | 16 ++++--
 mla/kmeans.py                             | 18 +++++--
 mla/knn.py                                |  5 +-
 mla/linear_models.py                      |  4 +-
 mla/metrics/metrics.py                    |  4 +-
 mla/naive_bayes.py                        |  2 +-
 mla/neuralnet/constraints.py              |  4 +-
 mla/neuralnet/layers/convnet.py           | 51 ++++++++++++++-----
 mla/neuralnet/layers/normalization.py     |  8 +--
 mla/neuralnet/layers/recurrent/lstm.py    | 41 ++++++++++++---
 mla/neuralnet/layers/recurrent/rnn.py     | 15 +++++-
 mla/neuralnet/loss.py                     |  1 +
 mla/neuralnet/nnet.py                     | 10 +++-
 mla/neuralnet/optimizers.py               | 47 ++++++++++++-----
 mla/neuralnet/parameters.py               |  9 +++-
 mla/neuralnet/regularizers.py             |  4 +-
 mla/neuralnet/tests/test_activations.py   |  4 +-
 mla/neuralnet/tests/test_optimizers.py    | 11 +++-
 mla/pca.py                                |  8 +--
 mla/rbm.py                                | 27 +++++++---
 mla/rl/dqn.py                             | 20 ++++++--
 mla/svm/svm.py                            | 12 +++--
 mla/tests/test_classification_accuracy.py | 16 ++++--
 mla/tests/test_reduction.py               | 11 +++-
 mla/tests/test_regression_accuracy.py     | 12 ++++-
 mla/tsne.py                               |  8 ++-
 32 files changed, 397 insertions(+), 110 deletions(-)

diff --git a/mla/datasets/base.py b/mla/datasets/base.py
index 9aa30c77..efefbcd9 100644
--- a/mla/datasets/base.py
+++ b/mla/datasets/base.py
@@ -39,7 +39,9 @@ def load(dataset="training", digits=np.arange(10)):
         images = zeros((N, rows, cols), dtype=uint8)
         labels = zeros((N, 1), dtype=int8)
         for i in range(len(ind)):
-            images[i] = array(img[ind[i] * rows * cols: (ind[i] + 1) * rows * cols]).reshape((rows, cols))
+            images[i] = array(
+                img[ind[i] * rows * cols : (ind[i] + 1) * rows * cols]
+            ).reshape((rows, cols))
             labels[i] = lbl[ind[i]]
 
         return images, labels
@@ -64,7 +66,7 @@ def load_nietzsche():
     sentences = []
     next_chars = []
     for i in range(0, len(text) - maxlen, step):
-        sentences.append(text[i: i + maxlen])
+        sentences.append(text[i : i + maxlen])
         next_chars.append(text[i + maxlen])
 
     X = np.zeros((len(sentences), maxlen, len(chars)), dtype=np.bool)
diff --git a/mla/ensemble/base.py b/mla/ensemble/base.py
index 2ba41b2e..c1a97fb4 100644
--- a/mla/ensemble/base.py
+++ b/mla/ensemble/base.py
@@ -14,13 +14,20 @@ def f_entropy(p):
 
 
 def information_gain(y, splits):
-    splits_entropy = sum([f_entropy(split) * (float(split.shape[0]) / y.shape[0]) for split in splits])
+    splits_entropy = sum(
+        [f_entropy(split) * (float(split.shape[0]) / y.shape[0]) for split in splits]
+    )
     return f_entropy(y) - splits_entropy
 
 
 def mse_criterion(y, splits):
     y_mean = np.mean(y)
-    return -sum([np.sum((split - y_mean) ** 2) * (float(split.shape[0]) / y.shape[0]) for split in splits])
+    return -sum(
+        [
+            np.sum((split - y_mean) ** 2) * (float(split.shape[0]) / y.shape[0])
+            for split in splits
+        ]
+    )
 
 
 def xgb_criterion(y, left, right, loss):
diff --git a/mla/ensemble/gbm.py b/mla/ensemble/gbm.py
index 7a956616..58fbff3b 100644
--- a/mla/ensemble/gbm.py
+++ b/mla/ensemble/gbm.py
@@ -1,5 +1,6 @@
 # coding:utf-8
 import numpy as np
+
 # logistic function
 from scipy.special import expit
 
@@ -32,7 +33,9 @@ def hess(self, actual, predicted):
 
     def approximate(self, actual, predicted):
         """Approximate leaf value."""
-        return self.grad(actual, predicted).sum() / (self.hess(actual, predicted).sum() + self.regularization)
+        return self.grad(actual, predicted).sum() / (
+            self.hess(actual, predicted).sum() + self.regularization
+        )
 
     def transform(self, pred):
         """Transform predictions values."""
@@ -73,7 +76,14 @@ def transform(self, output):
 class GradientBoosting(BaseEstimator):
     """Gradient boosting trees with Taylor's expansion approximation (as in xgboost)."""
 
-    def __init__(self, n_estimators, learning_rate=0.1, max_features=10, max_depth=2, min_samples_split=10):
+    def __init__(
+        self,
+        n_estimators,
+        learning_rate=0.1,
+        max_features=10,
+        max_depth=2,
+        min_samples_split=10,
+    ):
         self.min_samples_split = min_samples_split
         self.learning_rate = learning_rate
         self.max_depth = max_depth
diff --git a/mla/ensemble/random_forest.py b/mla/ensemble/random_forest.py
index f4fc5491..57eddf31 100644
--- a/mla/ensemble/random_forest.py
+++ b/mla/ensemble/random_forest.py
@@ -7,7 +7,14 @@
 
 
 class RandomForest(BaseEstimator):
-    def __init__(self, n_estimators=10, max_features=None, min_samples_split=10, max_depth=None, criterion=None):
+    def __init__(
+        self,
+        n_estimators=10,
+        max_features=None,
+        min_samples_split=10,
+        max_depth=None,
+        criterion=None,
+    ):
         """Base class for RandomForest.
 
         Parameters
@@ -44,7 +51,7 @@ def _train(self):
                 self.y,
                 max_features=self.max_features,
                 min_samples_split=self.min_samples_split,
-                max_depth=self.max_depth
+                max_depth=self.max_depth,
             )
 
     def _predict(self, X=None):
@@ -52,7 +59,14 @@ def _predict(self, X=None):
 
 
 class RandomForestClassifier(RandomForest):
-    def __init__(self, n_estimators=10, max_features=None, min_samples_split=10, max_depth=None, criterion="entropy"):
+    def __init__(
+        self,
+        n_estimators=10,
+        max_features=None,
+        min_samples_split=10,
+        max_depth=None,
+        criterion="entropy",
+    ):
         super(RandomForestClassifier, self).__init__(
             n_estimators=n_estimators,
             max_features=max_features,
@@ -85,7 +99,14 @@ def _predict(self, X=None):
 
 
 class RandomForestRegressor(RandomForest):
-    def __init__(self, n_estimators=10, max_features=None, min_samples_split=10, max_depth=None, criterion="mse"):
+    def __init__(
+        self,
+        n_estimators=10,
+        max_features=None,
+        min_samples_split=10,
+        max_depth=None,
+        criterion="mse",
+    ):
         super(RandomForestRegressor, self).__init__(
             n_estimators=n_estimators,
             max_features=max_features,
diff --git a/mla/ensemble/tree.py b/mla/ensemble/tree.py
index 0b4e9769..3e6ae6f7 100644
--- a/mla/ensemble/tree.py
+++ b/mla/ensemble/tree.py
@@ -58,14 +58,24 @@ def _find_best_split(self, X, target, n_features):
                     gain = self.criterion(target["y"], splits)
                 else:
                     # Gradient boosting
-                    left, right = split_dataset(X, target, column, value, return_X=False)
+                    left, right = split_dataset(
+                        X, target, column, value, return_X=False
+                    )
                     gain = xgb_criterion(target, left, right, self.loss)
 
                 if (max_gain is None) or (gain > max_gain):
                     max_col, max_val, max_gain = column, value, gain
         return max_col, max_val, max_gain
 
-    def _train(self, X, target, max_features=None, min_samples_split=10, max_depth=None, minimum_gain=0.01):
+    def _train(
+        self,
+        X,
+        target,
+        max_features=None,
+        min_samples_split=10,
+        max_depth=None,
+        minimum_gain=0.01,
+    ):
         try:
             # Exit from recursion using assert syntax
             assert X.shape[0] > min_samples_split
@@ -86,22 +96,43 @@ def _train(self, X, target, max_features=None, min_samples_split=10, max_depth=N
             self.impurity = gain
 
             # Split dataset
-            left_X, right_X, left_target, right_target = split_dataset(X, target, column, value)
+            left_X, right_X, left_target, right_target = split_dataset(
+                X, target, column, value
+            )
 
             # Grow left and right child
             self.left_child = Tree(self.regression, self.criterion, self.n_classes)
             self.left_child._train(
-                left_X, left_target, max_features, min_samples_split, max_depth - 1, minimum_gain
+                left_X,
+                left_target,
+                max_features,
+                min_samples_split,
+                max_depth - 1,
+                minimum_gain,
             )
 
             self.right_child = Tree(self.regression, self.criterion, self.n_classes)
             self.right_child._train(
-                right_X, right_target, max_features, min_samples_split, max_depth - 1, minimum_gain
+                right_X,
+                right_target,
+                max_features,
+                min_samples_split,
+                max_depth - 1,
+                minimum_gain,
             )
         except AssertionError:
             self._calculate_leaf_value(target)
 
-    def train(self, X, target, max_features=None, min_samples_split=10, max_depth=None, minimum_gain=0.01, loss=None):
+    def train(
+        self,
+        X,
+        target,
+        max_features=None,
+        min_samples_split=10,
+        max_depth=None,
+        minimum_gain=0.01,
+        loss=None,
+    ):
         """Build a decision tree from training set.
 
         Parameters
@@ -131,11 +162,16 @@ def train(self, X, target, max_features=None, min_samples_split=10, max_depth=No
             self.loss = loss
 
         if not self.regression:
-            self.n_classes = len(np.unique(target['y']))
-
-        self._train(X, target, max_features=max_features, min_samples_split=min_samples_split,
-                    max_depth=max_depth, minimum_gain=minimum_gain)
+            self.n_classes = len(np.unique(target["y"]))
 
+        self._train(
+            X,
+            target,
+            max_features=max_features,
+            min_samples_split=min_samples_split,
+            max_depth=max_depth,
+            minimum_gain=minimum_gain,
+        )
 
     def _calculate_leaf_value(self, targets):
         """Find optimal value for leaf."""
@@ -149,7 +185,10 @@ def _calculate_leaf_value(self, targets):
                 self.outcome = np.mean(targets["y"])
             else:
                 # Probability for classification task
-                self.outcome = np.bincount(targets["y"], minlength=self.n_classes) / targets["y"].shape[0]
+                self.outcome = (
+                    np.bincount(targets["y"], minlength=self.n_classes)
+                    / targets["y"].shape[0]
+                )
 
     def predict_row(self, row):
         """Predict single row."""
diff --git a/mla/fm.py b/mla/fm.py
index 85964a99..6e1c1423 100644
--- a/mla/fm.py
+++ b/mla/fm.py
@@ -6,6 +6,7 @@
 from mla.base import BaseEstimator
 from mla.metrics import mean_squared_error, binary_crossentropy
 
+
 np.random.seed(9999)
 
 """
@@ -16,7 +17,14 @@
 
 class BaseFM(BaseEstimator):
     def __init__(
-        self, n_components=10, max_iter=100, init_stdev=0.1, learning_rate=0.01, reg_v=0.1, reg_w=0.5, reg_w0=0.0
+        self,
+        n_components=10,
+        max_iter=100,
+        init_stdev=0.1,
+        learning_rate=0.01,
+        reg_v=0.1,
+        reg_w=0.5,
+        reg_w0=0.0,
     ):
         """Simplified factorization machines implementation using SGD optimizer."""
         self.reg_w0 = reg_w0
@@ -36,7 +44,9 @@ def fit(self, X, y=None):
         # Feature weights
         self.w = np.zeros(self.n_features)
         # Factor weights
-        self.v = np.random.normal(scale=self.init_stdev, size=(self.n_features, self.n_components))
+        self.v = np.random.normal(
+            scale=self.init_stdev, size=(self.n_features, self.n_components)
+        )
         self._train()
 
     def _train(self):
@@ -56,7 +66,9 @@ def _factor_step(self, loss):
 
     def _predict(self, X=None):
         linear_output = np.dot(X, self.w)
-        factors_output = np.sum(np.dot(X, self.v) ** 2 - np.dot(X ** 2, self.v ** 2), axis=1) / 2.0
+        factors_output = (
+            np.sum(np.dot(X, self.v) ** 2 - np.dot(X**2, self.v**2), axis=1) / 2.0
+        )
         return self.wo + linear_output + factors_output
 
 
diff --git a/mla/gaussian_mixture.py b/mla/gaussian_mixture.py
index d2f1b9b2..8ab82fb0 100644
--- a/mla/gaussian_mixture.py
+++ b/mla/gaussian_mixture.py
@@ -68,7 +68,9 @@ def _initialize(self):
         """
         self.weights = np.ones(self.K)
         if self.init == "random":
-            self.means = [self.X[x] for x in random.sample(range(self.n_samples), self.K)]
+            self.means = [
+                self.X[x] for x in random.sample(range(self.n_samples), self.K)
+            ]
             self.covs = [np.cov(self.X.T) for _ in range(self.K)]
 
         elif self.init == "kmeans":
@@ -106,7 +108,9 @@ def _M_step(self):
 
     def _is_converged(self):
         """Check if the difference of the latest two likelihood is less than the tolerance."""
-        if (len(self.likelihood) > 1) and (self.likelihood[-1] - self.likelihood[-2] <= self.tolerance):
+        if (len(self.likelihood) > 1) and (
+            self.likelihood[-1] - self.likelihood[-2] <= self.tolerance
+        ):
             return True
         return False
 
@@ -123,7 +127,9 @@ def _get_likelihood(self, data):
         n_data = data.shape[0]
         likelihoods = np.zeros([n_data, self.K])
         for c in range(self.K):
-            likelihoods[:, c] = multivariate_normal.pdf(data, self.means[c], self.covs[c])
+            likelihoods[:, c] = multivariate_normal.pdf(
+                data, self.means[c], self.covs[c]
+            )
         return likelihoods
 
     def _get_weighted_likelihood(self, likelihood):
@@ -151,7 +157,9 @@ def plot(self, data=None, ax=None, holdon=False):
         margin = 0.2
         xmax, ymax = self.X.max(axis=0) + margin
         xmin, ymin = self.X.min(axis=0) - margin
-        axis_X, axis_Y = np.meshgrid(np.arange(xmin, xmax, delta), np.arange(ymin, ymax, delta))
+        axis_X, axis_Y = np.meshgrid(
+            np.arange(xmin, xmax, delta), np.arange(ymin, ymax, delta)
+        )
 
         def grid_gaussian_pdf(mean, cov):
             grid_array = np.array(list(zip(axis_X.flatten(), axis_Y.flatten())))
diff --git a/mla/kmeans.py b/mla/kmeans.py
index 261de8e1..fb3bc513 100644
--- a/mla/kmeans.py
+++ b/mla/kmeans.py
@@ -53,7 +53,9 @@ def _initialize_centroids(self, init):
         """Set the initial centroids."""
 
         if init == "random":
-            self.centroids = [self.X[x] for x in random.sample(range(self.n_samples), self.K)]
+            self.centroids = [
+                self.X[x] for x in random.sample(range(self.n_samples), self.K)
+            ]
         elif init == "++":
             self.centroids = [random.choice(self.X)]
             while len(self.centroids) < self.K:
@@ -88,7 +90,6 @@ def _get_predictions(self):
         return predictions
 
     def _assign(self, centroids):
-
         for row in range(self.n_samples):
             for i, cluster in enumerate(self.clusters):
                 if row in cluster:
@@ -115,11 +116,13 @@ def _get_centroid(self, cluster):
 
     def _dist_from_centers(self):
         """Calculate distance from centers."""
-        return np.array([min([euclidean_distance(x, c) for c in self.centroids]) for x in self.X])
+        return np.array(
+            [min([euclidean_distance(x, c) for c in self.centroids]) for x in self.X]
+        )
 
     def _choose_next_center(self):
         distances = self._dist_from_centers()
-        squared_distances = distances ** 2
+        squared_distances = distances**2
         probs = squared_distances / squared_distances.sum()
         ind = np.random.choice(self.X.shape[0], 1, p=probs)[0]
         return self.X[ind]
@@ -141,7 +144,12 @@ def plot(self, ax=None, holdon=False):
 
         for i, index in enumerate(self.clusters):
             point = np.array(data[index]).T
-            ax.scatter(*point, c=[palette[i], ])
+            ax.scatter(
+                *point,
+                c=[
+                    palette[i],
+                ],
+            )
 
         for point in self.centroids:
             ax.scatter(*point, marker="x", linewidths=10)
diff --git a/mla/knn.py b/mla/knn.py
index 30bdd339..f24c1d79 100644
--- a/mla/knn.py
+++ b/mla/knn.py
@@ -40,7 +40,10 @@ def _predict_x(self, x):
         distances = (self.distance_func(x, example) for example in self.X)
 
         # Sort all examples by their distance to x and keep their target value.
-        neighbors = sorted(((dist, target) for (dist, target) in zip(distances, self.y)), key=lambda x: x[0])
+        neighbors = sorted(
+            ((dist, target) for (dist, target) in zip(distances, self.y)),
+            key=lambda x: x[0],
+        )
 
         # Get targets of the k-nn and aggregate them (most common one or
         # average).
diff --git a/mla/linear_models.py b/mla/linear_models.py
index d7d4e9c9..a7d351dc 100644
--- a/mla/linear_models.py
+++ b/mla/linear_models.py
@@ -12,7 +12,9 @@
 
 
 class BasicRegression(BaseEstimator):
-    def __init__(self, lr=0.001, penalty="None", C=0.01, tolerance=0.0001, max_iters=1000):
+    def __init__(
+        self, lr=0.001, penalty="None", C=0.01, tolerance=0.0001, max_iters=1000
+    ):
         """Basic class for implementing continuous regression estimators which
         are trained with gradient descent optimization on their particular loss
         function.
diff --git a/mla/metrics/metrics.py b/mla/metrics/metrics.py
index 9fb20ded..3cffcee3 100644
--- a/mla/metrics/metrics.py
+++ b/mla/metrics/metrics.py
@@ -71,7 +71,9 @@ def hinge(actual, predicted):
 
 def binary_crossentropy(actual, predicted):
     predicted = np.clip(predicted, EPS, 1 - EPS)
-    return np.mean(-np.sum(actual * np.log(predicted) + (1 - actual) * np.log(1 - predicted)))
+    return np.mean(
+        -np.sum(actual * np.log(predicted) + (1 - actual) * np.log(1 - predicted))
+    )
 
 
 # aliases
diff --git a/mla/naive_bayes.py b/mla/naive_bayes.py
index 4b7f4cd2..16ba89e9 100644
--- a/mla/naive_bayes.py
+++ b/mla/naive_bayes.py
@@ -56,6 +56,6 @@ def _pdf(self, n_class, x):
         mean = self._mean[n_class]
         var = self._var[n_class]
 
-        numerator = np.exp(-(x - mean) ** 2 / (2 * var))
+        numerator = np.exp(-((x - mean) ** 2) / (2 * var))
         denominator = np.sqrt(2 * np.pi * var)
         return numerator / denominator
diff --git a/mla/neuralnet/constraints.py b/mla/neuralnet/constraints.py
index ccc1e4a2..d33e410e 100644
--- a/mla/neuralnet/constraints.py
+++ b/mla/neuralnet/constraints.py
@@ -15,7 +15,7 @@ def __init__(self, m=2, axis=0):
         self.m = m
 
     def clip(self, p):
-        norms = np.sqrt(np.sum(p ** 2, axis=self.axis))
+        norms = np.sqrt(np.sum(p**2, axis=self.axis))
         desired = np.clip(norms, 0, self.m)
         p = p * (desired / (EPSILON + norms))
         return p
@@ -37,4 +37,4 @@ def __init__(self, axis=0):
         self.axis = axis
 
     def clip(self, p):
-        return p / (EPSILON + np.sqrt(np.sum(p ** 2, axis=self.axis)))
+        return p / (EPSILON + np.sqrt(np.sum(p**2, axis=self.axis)))
diff --git a/mla/neuralnet/layers/convnet.py b/mla/neuralnet/layers/convnet.py
index 485706c1..40ecef1f 100644
--- a/mla/neuralnet/layers/convnet.py
+++ b/mla/neuralnet/layers/convnet.py
@@ -6,7 +6,14 @@
 
 
 class Convolution(Layer, ParamMixin):
-    def __init__(self, n_filters=8, filter_shape=(3, 3), padding=(0, 0), stride=(1, 1), parameters=None):
+    def __init__(
+        self,
+        n_filters=8,
+        filter_shape=(3, 3),
+        padding=(0, 0),
+        stride=(1, 1),
+        parameters=None,
+    ):
         """A 2D convolutional layer.
         Input shape: (n_images, n_channels, height, width)
 
@@ -57,10 +64,14 @@ def backward_pass(self, delta):
         self._params.update_grad("W", d_W)
 
         d_c = np.dot(delta, self.col_W.T)
-        return column_to_image(d_c, self.last_input.shape, self.filter_shape, self.stride, self.padding)
+        return column_to_image(
+            d_c, self.last_input.shape, self.filter_shape, self.stride, self.padding
+        )
 
     def shape(self, x_shape):
-        height, width = convoltuion_shape(self.height, self.width, self.filter_shape, self.stride, self.padding)
+        height, width = convoltuion_shape(
+            self.height, self.width, self.filter_shape, self.stride, self.padding
+        )
         return x_shape[0], self.n_filters, height, width
 
 
@@ -91,7 +102,9 @@ def forward_pass(self, X):
         arg_max = np.argmax(col, axis=1)
         out = np.max(col, axis=1)
         self.arg_max = arg_max
-        return out.reshape(n_images, out_height, out_width, n_channels).transpose(0, 3, 1, 2)
+        return out.reshape(n_images, out_height, out_width, n_channels).transpose(
+            0, 3, 1, 2
+        )
 
     def backward_pass(self, delta):
         delta = delta.transpose(0, 2, 3, 1)
@@ -102,10 +115,14 @@ def backward_pass(self, delta):
         y_max = y_max.reshape(delta.shape + (pool_size,))
 
         dcol = y_max.reshape(y_max.shape[0] * y_max.shape[1] * y_max.shape[2], -1)
-        return column_to_image(dcol, self.last_input.shape, self.pool_shape, self.stride, self.padding)
+        return column_to_image(
+            dcol, self.last_input.shape, self.pool_shape, self.stride, self.padding
+        )
 
     def shape(self, x_shape):
-        h, w = convoltuion_shape(x_shape[2], x_shape[3], self.pool_shape, self.stride, self.padding)
+        h, w = convoltuion_shape(
+            x_shape[2], x_shape[3], self.pool_shape, self.stride, self.padding
+        )
         return x_shape[0], x_shape[1], h, w
 
 
@@ -137,7 +154,9 @@ def image_to_column(images, filter_shape, stride, padding):
     """
     n_images, n_channels, height, width = images.shape
     f_height, f_width = filter_shape
-    out_height, out_width = convoltuion_shape(height, width, (f_height, f_width), stride, padding)
+    out_height, out_width = convoltuion_shape(
+        height, width, (f_height, f_width), stride, padding
+    )
     images = np.pad(images, ((0, 0), (0, 0), padding, padding), mode="constant")
 
     col = np.zeros((n_images, n_channels, f_height, f_width, out_height, out_width))
@@ -145,7 +164,9 @@ def image_to_column(images, filter_shape, stride, padding):
         y_bound = y + stride[0] * out_height
         for x in range(f_width):
             x_bound = x + stride[1] * out_width
-            col[:, :, y, x, :, :] = images[:, :, y: y_bound: stride[0], x: x_bound: stride[1]]
+            col[:, :, y, x, :, :] = images[
+                :, :, y : y_bound : stride[0], x : x_bound : stride[1]
+            ]
 
     col = col.transpose(0, 4, 5, 1, 2, 3).reshape(n_images * out_height * out_width, -1)
     return col
@@ -165,10 +186,12 @@ def column_to_image(columns, images_shape, filter_shape, stride, padding):
     n_images, n_channels, height, width = images_shape
     f_height, f_width = filter_shape
 
-    out_height, out_width = convoltuion_shape(height, width, (f_height, f_width), stride, padding)
-    columns = columns.reshape(n_images, out_height, out_width, n_channels, f_height, f_width).transpose(
-        0, 3, 4, 5, 1, 2
+    out_height, out_width = convoltuion_shape(
+        height, width, (f_height, f_width), stride, padding
     )
+    columns = columns.reshape(
+        n_images, out_height, out_width, n_channels, f_height, f_width
+    ).transpose(0, 3, 4, 5, 1, 2)
 
     img_h = height + 2 * padding[0] + stride[0] - 1
     img_w = width + 2 * padding[1] + stride[1] - 1
@@ -177,9 +200,11 @@ def column_to_image(columns, images_shape, filter_shape, stride, padding):
         y_bound = y + stride[0] * out_height
         for x in range(f_width):
             x_bound = x + stride[1] * out_width
-            img[:, :, y: y_bound: stride[0], x: x_bound: stride[1]] += columns[:, :, y, x, :, :]
+            img[:, :, y : y_bound : stride[0], x : x_bound : stride[1]] += columns[
+                :, :, y, x, :, :
+            ]
 
-    return img[:, :, padding[0]: height + padding[0], padding[1]: width + padding[1]]
+    return img[:, :, padding[0] : height + padding[0], padding[1] : width + padding[1]]
 
 
 def convoltuion_shape(img_height, img_width, filter_shape, stride, padding):
diff --git a/mla/neuralnet/layers/normalization.py b/mla/neuralnet/layers/normalization.py
index 4f601a81..4ed77054 100644
--- a/mla/neuralnet/layers/normalization.py
+++ b/mla/neuralnet/layers/normalization.py
@@ -47,7 +47,7 @@ def _forward_pass(self, X):
         xmu = X - mu
 
         # step3: following the lower branch - calculation denominator
-        sq = xmu ** 2
+        sq = xmu**2
 
         # step4: calculate variance
         var = 1.0 / N * np.sum(sq, axis=0)
@@ -90,7 +90,9 @@ def forward_pass(self, X):
             out_flat = self._forward_pass(x_flat)
             return out_flat.reshape(N, H, W, C).transpose(0, 3, 1, 2)
         else:
-            raise NotImplementedError("Unknown model with dimensions = {}".format(len(X.shape)))
+            raise NotImplementedError(
+                "Unknown model with dimensions = {}".format(len(X.shape))
+            )
 
     def _backward_pass(self, delta):
         # unfold the variables stored in cache
@@ -112,7 +114,7 @@ def _backward_pass(self, delta):
         dxmu1 = dxhat * ivar
 
         # step6
-        dsqrtvar = -1.0 / (sqrtvar ** 2) * divar
+        dsqrtvar = -1.0 / (sqrtvar**2) * divar
 
         # step5
         dvar = 0.5 * 1.0 / np.sqrt(var + self.eps) * dsqrtvar
diff --git a/mla/neuralnet/layers/recurrent/lstm.py b/mla/neuralnet/layers/recurrent/lstm.py
index e0b4ce0f..9997f612 100644
--- a/mla/neuralnet/layers/recurrent/lstm.py
+++ b/mla/neuralnet/layers/recurrent/lstm.py
@@ -15,7 +15,14 @@
 
 
 class LSTM(Layer, ParamMixin):
-    def __init__(self, hidden_dim, activation="tanh", inner_init="orthogonal", parameters=None, return_sequences=True):
+    def __init__(
+        self,
+        hidden_dim,
+        activation="tanh",
+        inner_init="orthogonal",
+        parameters=None,
+        return_sequences=True,
+    ):
         self.return_sequences = return_sequences
         self.hidden_dim = hidden_dim
         self.inner_init = get_initializer(inner_init)
@@ -84,13 +91,18 @@ def forward_pass(self, X):
 
         self.states = np.zeros((n_samples, n_timesteps + 1, self.hidden_dim))
         self.outputs = np.zeros((n_samples, n_timesteps + 1, self.hidden_dim))
-        self.gates = {k: np.zeros((n_samples, n_timesteps, self.hidden_dim)) for k in ["i", "f", "o", "c"]}
+        self.gates = {
+            k: np.zeros((n_samples, n_timesteps, self.hidden_dim))
+            for k in ["i", "f", "o", "c"]
+        }
 
         self.states[:, -1, :] = self.hprev
         self.outputs[:, -1, :] = self.oprev
 
         for i in range(n_timesteps):
-            t_gates = np.dot(X[:, i, :], self.W) + np.dot(self.outputs[:, i - 1, :], self.U)
+            t_gates = np.dot(X[:, i, :], self.W) + np.dot(
+                self.outputs[:, i - 1, :], self.U
+            )
 
             # Input
             self.gates["i"][:, i, :] = sigmoid(t_gates[:, 0, :] + p["b_i"])
@@ -106,7 +118,9 @@ def forward_pass(self, X):
                 self.states[:, i - 1, :] * self.gates["f"][:, i, :]
                 + self.gates["i"][:, i, :] * self.gates["c"][:, i, :]
             )
-            self.outputs[:, i, :] = self.gates["o"][:, i, :] * self.activation(self.states[:, i, :])
+            self.outputs[:, i, :] = self.gates["o"][:, i, :] * self.activation(
+                self.states[:, i, :]
+            )
 
         self.hprev = self.states[:, n_timesteps - 1, :].copy()
         self.oprev = self.outputs[:, n_timesteps - 1, :].copy()
@@ -130,7 +144,12 @@ def backward_pass(self, delta):
 
         # Backpropagation through time
         for i in reversed(range(n_timesteps)):
-            dhi = delta[:, i, :] * self.gates["o"][:, i, :] * self.activation_d(self.states[:, i, :]) + dh_next
+            dhi = (
+                delta[:, i, :]
+                * self.gates["o"][:, i, :]
+                * self.activation_d(self.states[:, i, :])
+                + dh_next
+            )
 
             og = delta[:, i, :] * self.activation(self.states[:, i, :])
             de_o = og * self.sigmoid_d(self.gates["o"][:, i, :])
@@ -139,17 +158,23 @@ def backward_pass(self, delta):
             grad["U_o"] += np.dot(self.outputs[:, i - 1, :].T, de_o)
             grad["b_o"] += de_o.sum(axis=0)
 
-            de_f = (dhi * self.states[:, i - 1, :]) * self.sigmoid_d(self.gates["f"][:, i, :])
+            de_f = (dhi * self.states[:, i - 1, :]) * self.sigmoid_d(
+                self.gates["f"][:, i, :]
+            )
             grad["W_f"] += np.dot(self.last_input[:, i, :].T, de_f)
             grad["U_f"] += np.dot(self.outputs[:, i - 1, :].T, de_f)
             grad["b_f"] += de_f.sum(axis=0)
 
-            de_i = (dhi * self.gates["c"][:, i, :]) * self.sigmoid_d(self.gates["i"][:, i, :])
+            de_i = (dhi * self.gates["c"][:, i, :]) * self.sigmoid_d(
+                self.gates["i"][:, i, :]
+            )
             grad["W_i"] += np.dot(self.last_input[:, i, :].T, de_i)
             grad["U_i"] += np.dot(self.outputs[:, i - 1, :].T, de_i)
             grad["b_i"] += de_i.sum(axis=0)
 
-            de_c = (dhi * self.gates["i"][:, i, :]) * self.activation_d(self.gates["c"][:, i, :])
+            de_c = (dhi * self.gates["i"][:, i, :]) * self.activation_d(
+                self.gates["c"][:, i, :]
+            )
             grad["W_c"] += np.dot(self.last_input[:, i, :].T, de_c)
             grad["U_c"] += np.dot(self.outputs[:, i - 1, :].T, de_c)
             grad["b_c"] += de_c.sum(axis=0)
diff --git a/mla/neuralnet/layers/recurrent/rnn.py b/mla/neuralnet/layers/recurrent/rnn.py
index 3110a261..232daf10 100644
--- a/mla/neuralnet/layers/recurrent/rnn.py
+++ b/mla/neuralnet/layers/recurrent/rnn.py
@@ -10,7 +10,14 @@
 class RNN(Layer, ParamMixin):
     """Vanilla RNN."""
 
-    def __init__(self, hidden_dim, activation="tanh", inner_init="orthogonal", parameters=None, return_sequences=True):
+    def __init__(
+        self,
+        hidden_dim,
+        activation="tanh",
+        inner_init="orthogonal",
+        parameters=None,
+        return_sequences=True,
+    ):
         self.return_sequences = return_sequences
         self.hidden_dim = hidden_dim
         self.inner_init = get_initializer(inner_init)
@@ -53,7 +60,11 @@ def forward_pass(self, X):
         p = self._params
 
         for i in range(n_timesteps):
-            states[:, i, :] = np.tanh(np.dot(X[:, i, :], p["W"]) + np.dot(states[:, i - 1, :], p["U"]) + p["b"])
+            states[:, i, :] = np.tanh(
+                np.dot(X[:, i, :], p["W"])
+                + np.dot(states[:, i - 1, :], p["U"])
+                + p["b"]
+            )
 
         self.states = states
         self.hprev = states[:, n_timesteps - 1, :].copy()
diff --git a/mla/neuralnet/loss.py b/mla/neuralnet/loss.py
index 8be4dbe3..30def7e9 100644
--- a/mla/neuralnet/loss.py
+++ b/mla/neuralnet/loss.py
@@ -1,4 +1,5 @@
 from ..metrics import mse, logloss, mae, hinge, binary_crossentropy
+
 categorical_crossentropy = logloss
 
 
diff --git a/mla/neuralnet/nnet.py b/mla/neuralnet/nnet.py
index 2809fb0b..c39b96be 100644
--- a/mla/neuralnet/nnet.py
+++ b/mla/neuralnet/nnet.py
@@ -23,7 +23,15 @@ class NeuralNet(BaseEstimator):
     fit_required = False
 
     def __init__(
-        self, layers, optimizer, loss, max_epochs=10, batch_size=64, metric="mse", shuffle=False, verbose=True
+        self,
+        layers,
+        optimizer,
+        loss,
+        max_epochs=10,
+        batch_size=64,
+        metric="mse",
+        shuffle=False,
+        verbose=True,
     ):
         self.verbose = verbose
         self.shuffle = shuffle
diff --git a/mla/neuralnet/optimizers.py b/mla/neuralnet/optimizers.py
index fc9ae1bb..1e2a68a5 100644
--- a/mla/neuralnet/optimizers.py
+++ b/mla/neuralnet/optimizers.py
@@ -45,7 +45,9 @@ def train_epoch(self, network):
 
         batch = zip(X_batch, y_batch)
         if network.verbose:
-            batch = tqdm(batch, total=int(np.ceil(network.n_samples / network.batch_size)))
+            batch = tqdm(
+                batch, total=int(np.ceil(network.n_samples / network.batch_size))
+            )
 
         for X, y in batch:
             loss = np.mean(network.update(X, y))
@@ -106,7 +108,7 @@ def update(self, network):
         for i, layer in enumerate(network.parametric_layers):
             for n in layer.parameters.keys():
                 grad = layer.parameters.grad[n]
-                self.accu[i][n] += grad ** 2
+                self.accu[i][n] += grad**2
                 step = self.lr * grad / (np.sqrt(self.accu[i][n]) + self.eps)
                 layer.parameters.step(n, -step)
 
@@ -128,12 +130,20 @@ def update(self, network):
         for i, layer in enumerate(network.parametric_layers):
             for n in layer.parameters.keys():
                 grad = layer.parameters.grad[n]
-                self.accu[i][n] = self.rho * self.accu[i][n] + (1.0 - self.rho) * grad ** 2
-                step = grad * np.sqrt(self.d_accu[i][n] + self.eps) / np.sqrt(self.accu[i][n] + self.eps)
+                self.accu[i][n] = (
+                    self.rho * self.accu[i][n] + (1.0 - self.rho) * grad**2
+                )
+                step = (
+                    grad
+                    * np.sqrt(self.d_accu[i][n] + self.eps)
+                    / np.sqrt(self.accu[i][n] + self.eps)
+                )
 
                 layer.parameters.step(n, -step * self.lr)
                 # Update delta accumulator
-                self.d_accu[i][n] = self.rho * self.d_accu[i][n] + (1.0 - self.rho) * step ** 2
+                self.d_accu[i][n] = (
+                    self.rho * self.d_accu[i][n] + (1.0 - self.rho) * step**2
+                )
 
     def setup(self, network):
         # Accumulators
@@ -155,7 +165,9 @@ def update(self, network):
         for i, layer in enumerate(network.parametric_layers):
             for n in layer.parameters.keys():
                 grad = layer.parameters.grad[n]
-                self.accu[i][n] = (self.rho * self.accu[i][n]) + (1.0 - self.rho) * (grad ** 2)
+                self.accu[i][n] = (self.rho * self.accu[i][n]) + (1.0 - self.rho) * (
+                    grad**2
+                )
                 step = self.lr * grad / (np.sqrt(self.accu[i][n]) + self.eps)
                 layer.parameters.step(n, -step)
 
@@ -169,7 +181,6 @@ def setup(self, network):
 
 class Adam(Optimizer):
     def __init__(self, learning_rate=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-8):
-
         self.epsilon = epsilon
         self.beta_2 = beta_2
         self.beta_1 = beta_1
@@ -181,9 +192,17 @@ def update(self, network):
         for i, layer in enumerate(network.parametric_layers):
             for n in layer.parameters.keys():
                 grad = layer.parameters.grad[n]
-                self.ms[i][n] = (self.beta_1 * self.ms[i][n]) + (1.0 - self.beta_1) * grad
-                self.vs[i][n] = (self.beta_2 * self.vs[i][n]) + (1.0 - self.beta_2) * grad ** 2
-                lr = self.lr * np.sqrt(1.0 - self.beta_2 ** self.t) / (1.0 - self.beta_1 ** self.t)
+                self.ms[i][n] = (self.beta_1 * self.ms[i][n]) + (
+                    1.0 - self.beta_1
+                ) * grad
+                self.vs[i][n] = (self.beta_2 * self.vs[i][n]) + (
+                    1.0 - self.beta_2
+                ) * grad**2
+                lr = (
+                    self.lr
+                    * np.sqrt(1.0 - self.beta_2**self.t)
+                    / (1.0 - self.beta_1**self.t)
+                )
 
                 step = lr * self.ms[i][n] / (np.sqrt(self.vs[i][n]) + self.epsilon)
                 layer.parameters.step(n, -step)
@@ -201,7 +220,6 @@ def setup(self, network):
 
 class Adamax(Optimizer):
     def __init__(self, learning_rate=0.002, beta_1=0.9, beta_2=0.999, epsilon=1e-8):
-
         self.epsilon = epsilon
         self.beta_2 = beta_2
         self.beta_1 = beta_1
@@ -215,7 +233,12 @@ def update(self, network):
                 self.ms[i][n] = self.beta_1 * self.ms[i][n] + (1.0 - self.beta_1) * grad
                 self.us[i][n] = np.maximum(self.beta_2 * self.us[i][n], np.abs(grad))
 
-                step = self.lr / (1 - self.beta_1 ** self.t) * self.ms[i][n] / (self.us[i][n] + self.epsilon)
+                step = (
+                    self.lr
+                    / (1 - self.beta_1**self.t)
+                    * self.ms[i][n]
+                    / (self.us[i][n] + self.epsilon)
+                )
                 layer.parameters.step(n, -step)
         self.t += 1
 
diff --git a/mla/neuralnet/parameters.py b/mla/neuralnet/parameters.py
index 65873c26..e81f18db 100644
--- a/mla/neuralnet/parameters.py
+++ b/mla/neuralnet/parameters.py
@@ -5,7 +5,14 @@
 
 
 class Parameters(object):
-    def __init__(self, init="glorot_uniform", scale=0.5, bias=1.0, regularizers=None, constraints=None):
+    def __init__(
+        self,
+        init="glorot_uniform",
+        scale=0.5,
+        bias=1.0,
+        regularizers=None,
+        constraints=None,
+    ):
         """A container for layer's parameters.
 
         Parameters
diff --git a/mla/neuralnet/regularizers.py b/mla/neuralnet/regularizers.py
index 53bc3b37..723cccea 100644
--- a/mla/neuralnet/regularizers.py
+++ b/mla/neuralnet/regularizers.py
@@ -25,11 +25,11 @@ def _penalty(self, weights):
 
 class L2(Regularizer):
     def _penalty(self, weights):
-        return self.C * weights ** 2
+        return self.C * weights**2
 
 
 class ElasticNet(Regularizer):
     """Linear combination of L1 and L2 penalties."""
 
     def _penalty(self, weights):
-        return 0.5 * self.C * weights ** 2 + (1.0 - self.C) * np.abs(weights)
+        return 0.5 * self.C * weights**2 + (1.0 - self.C) * np.abs(weights)
diff --git a/mla/neuralnet/tests/test_activations.py b/mla/neuralnet/tests/test_activations.py
index fc5de9ad..7bb095a6 100644
--- a/mla/neuralnet/tests/test_activations.py
+++ b/mla/neuralnet/tests/test_activations.py
@@ -14,6 +14,8 @@ def test_softplus():
     # naive implementation of np.log(1 + np.exp(z_max)) will overflow
     # naive implementation of z + np.log(1 + 1 / np.exp(z_min)) will
     # throw ZeroDivisionError
-    outputs = np.array([np.log(2.0), np.log1p(np.exp(1.0)), np.log1p(np.exp(-1.0)), 0.0, z_max])
+    outputs = np.array(
+        [np.log(2.0), np.log1p(np.exp(1.0)), np.log1p(np.exp(-1.0)), 0.0, z_max]
+    )
 
     assert np.allclose(outputs, softplus(inputs))
diff --git a/mla/neuralnet/tests/test_optimizers.py b/mla/neuralnet/tests/test_optimizers.py
index a42b5036..0c9c7d84 100644
--- a/mla/neuralnet/tests/test_optimizers.py
+++ b/mla/neuralnet/tests/test_optimizers.py
@@ -10,13 +10,20 @@
 
 def clasifier(optimizer):
     X, y = make_classification(
-        n_samples=1000, n_features=100, n_informative=75, random_state=1111, n_classes=2, class_sep=2.5
+        n_samples=1000,
+        n_features=100,
+        n_informative=75,
+        random_state=1111,
+        n_classes=2,
+        class_sep=2.5,
     )
     y = one_hot(y)
 
     X -= np.mean(X, axis=0)
     X /= np.std(X, axis=0)
-    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=1111)
+    X_train, X_test, y_train, y_test = train_test_split(
+        X, y, test_size=0.15, random_state=1111
+    )
 
     model = NeuralNet(
         layers=[
diff --git a/mla/pca.py b/mla/pca.py
index 64d6a614..9155f919 100644
--- a/mla/pca.py
+++ b/mla/pca.py
@@ -47,10 +47,12 @@ def _decompose(self, X):
             s, Vh = np.linalg.eig(np.cov(X.T))
             Vh = Vh.T
 
-        s_squared = s ** 2
+        s_squared = s**2
         variance_ratio = s_squared / s_squared.sum()
-        logging.info("Explained variance ratio: %s" % (variance_ratio[0: self.n_components]))
-        self.components = Vh[0: self.n_components]
+        logging.info(
+            "Explained variance ratio: %s" % (variance_ratio[0 : self.n_components])
+        )
+        self.components = Vh[0 : self.n_components]
 
     def transform(self, X):
         X = X.copy()
diff --git a/mla/rbm.py b/mla/rbm.py
index f74234ef..90143b7e 100644
--- a/mla/rbm.py
+++ b/mla/rbm.py
@@ -43,7 +43,6 @@ def fit(self, X, y=None):
         self._train()
 
     def _init_weights(self):
-
         self.W = np.random.randn(self.n_visible, self.n_hidden) * 0.1
 
         # Bias for visible and hidden units
@@ -63,15 +62,29 @@ def _train(self):
                 hidden_states = self._sample(positive_hidden)  # sample hidden state h1
                 positive_associations = np.dot(batch.T, positive_hidden)
 
-                negative_visible = sigmoid(np.dot(hidden_states, self.W.T) + self.bias_v)
-                negative_visible = self._sample(negative_visible)  # use the sampled hidden state h1 to sample v1
-                negative_hidden = sigmoid(np.dot(negative_visible, self.W) + self.bias_h)
+                negative_visible = sigmoid(
+                    np.dot(hidden_states, self.W.T) + self.bias_v
+                )
+                negative_visible = self._sample(
+                    negative_visible
+                )  # use the sampled hidden state h1 to sample v1
+                negative_hidden = sigmoid(
+                    np.dot(negative_visible, self.W) + self.bias_h
+                )
                 negative_associations = np.dot(negative_visible.T, negative_hidden)
 
                 lr = self.lr / float(batch.shape[0])
-                self.W += lr * ((positive_associations - negative_associations) / float(self.batch_size))
-                self.bias_h += lr * (negative_hidden.sum(axis=0) - negative_associations.sum(axis=0))
-                self.bias_v += lr * (np.asarray(batch.sum(axis=0)).squeeze() - negative_visible.sum(axis=0))
+                self.W += lr * (
+                    (positive_associations - negative_associations)
+                    / float(self.batch_size)
+                )
+                self.bias_h += lr * (
+                    negative_hidden.sum(axis=0) - negative_associations.sum(axis=0)
+                )
+                self.bias_v += lr * (
+                    np.asarray(batch.sum(axis=0)).squeeze()
+                    - negative_visible.sum(axis=0)
+                )
 
                 error += np.sum((batch - negative_visible) ** 2)
 
diff --git a/mla/rl/dqn.py b/mla/rl/dqn.py
index ec8c6c06..42b58097 100644
--- a/mla/rl/dqn.py
+++ b/mla/rl/dqn.py
@@ -19,7 +19,14 @@
 
 class DQN(object):
     def __init__(
-        self, n_episodes=500, gamma=0.99, batch_size=32, epsilon=1.0, decay=0.005, min_epsilon=0.1, memory_limit=500
+        self,
+        n_episodes=500,
+        gamma=0.99,
+        batch_size=32,
+        epsilon=1.0,
+        decay=0.005,
+        min_epsilon=0.1,
+        memory_limit=500,
     ):
         """Deep Q learning implementation.
 
@@ -48,7 +55,9 @@ def __init__(
     def init_environment(self, name="CartPole-v0", monitor=False):
         self.env = gym.make(name)
         if monitor:
-            self.env = wrappers.Monitor(self.env, name, force=True, video_callable=False)
+            self.env = wrappers.Monitor(
+                self.env, name, force=True, video_callable=False
+            )
 
         self.n_states = self.env.observation_space.shape[0]
         self.n_actions = self.env.action_space.n
@@ -122,11 +131,14 @@ def train(self, render=False):
             while len(self.replay) > self.memory_limit:
                 self.replay.pop(0)
 
-            self.epsilon = self.min_epsilon + (1.0 - self.min_epsilon) * np.exp(-self.decay * ep)
+            self.epsilon = self.min_epsilon + (1.0 - self.min_epsilon) * np.exp(
+                -self.decay * ep
+            )
 
             max_reward = max(max_reward, total_reward)
             logger.info(
-                "Episode: %s, reward %s,  epsilon %s, max reward %s" % (ep, total_reward, self.epsilon, max_reward)
+                "Episode: %s, reward %s,  epsilon %s, max reward %s"
+                % (ep, total_reward, self.epsilon, max_reward)
             )
         logging.info("Training finished.")
 
diff --git a/mla/svm/svm.py b/mla/svm/svm.py
index b9695e13..a4b38aea 100644
--- a/mla/svm/svm.py
+++ b/mla/svm/svm.py
@@ -71,15 +71,21 @@ def _train(self):
                 self.alpha[j] -= (self.y[j] * (e_i - e_j)) / eta
                 self.alpha[j] = self.clip(self.alpha[j], H, L)
 
-                self.alpha[i] = self.alpha[i] + self.y[i] * self.y[j] * (alpha_jo - self.alpha[j])
+                self.alpha[i] = self.alpha[i] + self.y[i] * self.y[j] * (
+                    alpha_jo - self.alpha[j]
+                )
 
                 # Find intercept
                 b1 = (
-                    self.b - e_i - self.y[i] * (self.alpha[i] - alpha_io) * self.K[i, i]
+                    self.b
+                    - e_i
+                    - self.y[i] * (self.alpha[i] - alpha_io) * self.K[i, i]
                     - self.y[j] * (self.alpha[j] - alpha_jo) * self.K[i, j]
                 )
                 b2 = (
-                    self.b - e_j - self.y[j] * (self.alpha[j] - alpha_jo) * self.K[j, j]
+                    self.b
+                    - e_j
+                    - self.y[j] * (self.alpha[j] - alpha_jo) * self.K[j, j]
                     - self.y[i] * (self.alpha[i] - alpha_io) * self.K[i, j]
                 )
                 if 0 < self.alpha[i] < self.C:
diff --git a/mla/tests/test_classification_accuracy.py b/mla/tests/test_classification_accuracy.py
index f4fb42aa..8698daa1 100644
--- a/mla/tests/test_classification_accuracy.py
+++ b/mla/tests/test_classification_accuracy.py
@@ -24,9 +24,17 @@
 
 # Generate a random regression problem
 X, y = make_classification(
-    n_samples=750, n_features=10, n_informative=8, random_state=1111, n_classes=2, class_sep=2.5, n_redundant=0
+    n_samples=750,
+    n_features=10,
+    n_informative=8,
+    random_state=1111,
+    n_classes=2,
+    class_sep=2.5,
+    n_redundant=0,
+)
+X_train, X_test, y_train, y_test = train_test_split(
+    X, y, test_size=0.12, random_state=1111
 )
-X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.12, random_state=1111)
 
 
 # All classifiers except convnet, RNN, LSTM.
@@ -83,7 +91,9 @@ def test_mlp():
 
 
 def test_gbm():
-    model = GradientBoostingClassifier(n_estimators=25, max_depth=3, max_features=5, learning_rate=0.1)
+    model = GradientBoostingClassifier(
+        n_estimators=25, max_depth=3, max_features=5, learning_rate=0.1
+    )
     model.fit(X_train, y_train)
     predictions = model.predict(X_test)
     assert roc_auc_score(y_test, predictions) >= 0.95
diff --git a/mla/tests/test_reduction.py b/mla/tests/test_reduction.py
index da87fc82..b9346147 100644
--- a/mla/tests/test_reduction.py
+++ b/mla/tests/test_reduction.py
@@ -16,7 +16,12 @@
 def dataset():
     # Generate a random binary classification problem.
     return make_classification(
-        n_samples=1000, n_features=100, n_informative=75, random_state=1111, n_classes=2, class_sep=2.5
+        n_samples=1000,
+        n_features=100,
+        n_informative=75,
+        random_state=1111,
+        n_classes=2,
+        class_sep=2.5,
     )
 
 
@@ -24,7 +29,9 @@ def dataset():
 @pytest.mark.skip()
 def test_PCA(dataset):
     X, y = dataset
-    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=1111)
+    X_train, X_test, y_train, y_test = train_test_split(
+        X, y, test_size=0.25, random_state=1111
+    )
     p = PCA(50, solver="eigen")
 
     # fit PCA with training set, not the entire dataset
diff --git a/mla/tests/test_regression_accuracy.py b/mla/tests/test_regression_accuracy.py
index 5c13b7f7..33cf2f57 100644
--- a/mla/tests/test_regression_accuracy.py
+++ b/mla/tests/test_regression_accuracy.py
@@ -14,9 +14,17 @@
 
 # Generate a random regression problem
 X, y = make_regression(
-    n_samples=1000, n_features=10, n_informative=10, n_targets=1, noise=0.05, random_state=1111, bias=0.5
+    n_samples=1000,
+    n_features=10,
+    n_informative=10,
+    n_targets=1,
+    noise=0.05,
+    random_state=1111,
+    bias=0.5,
+)
+X_train, X_test, y_train, y_test = train_test_split(
+    X, y, test_size=0.25, random_state=1111
 )
-X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=1111)
 
 
 def test_linear():
diff --git a/mla/tsne.py b/mla/tsne.py
index c76dc89f..c2995d6d 100644
--- a/mla/tsne.py
+++ b/mla/tsne.py
@@ -19,7 +19,9 @@
 class TSNE(BaseEstimator):
     y_required = False
 
-    def __init__(self, n_components=2, perplexity=30.0, max_iter=200, learning_rate=500):
+    def __init__(
+        self, n_components=2, perplexity=30.0, max_iter=200, learning_rate=500
+    ):
         """A t-Distributed Stochastic Neighbor Embedding implementation.
 
         Parameters
@@ -67,7 +69,9 @@ def fit_transform(self, X, y=None):
                 grad = 4 * np.dot((pmul * P[i] - Q_n[i]) * Q[i], Y[i] - Y)
                 grads[i] = grad
 
-            gains = (gains + 0.2) * ((grads > 0) != (velocity > 0)) + (gains * 0.8) * ((grads > 0) == (velocity > 0))
+            gains = (gains + 0.2) * ((grads > 0) != (velocity > 0)) + (gains * 0.8) * (
+                (grads > 0) == (velocity > 0)
+            )
             gains = gains.clip(min=self.min_gain)
 
             velocity = momentum * velocity - self.lr * (gains * grads)

From 8fb8bb4282f37ad797052930dc12d0faace7bdb6 Mon Sep 17 00:00:00 2001
From: Artem Golubin <me@rushter.com>
Date: Mon, 16 Jun 2025 01:18:12 +0400
Subject: [PATCH 3/3] Format using ruff

---
 examples/gbm.py                 | 33 +++++++++++++++++++++++++++------
 examples/kmeans.py              |  4 +++-
 examples/linear_models.py       | 23 +++++++++++++++++++----
 examples/naive_bayes.py         | 12 ++++++++++--
 examples/nearest_neighbors.py   | 16 +++++++++++++---
 examples/nnet_convnet_mnist.py  |  9 ++++++++-
 examples/nnet_mlp.py            | 24 ++++++++++++++++++++----
 examples/nnet_rnn_binary_add.py |  4 +++-
 examples/pca.py                 | 11 +++++++++--
 examples/random_forest.py       | 31 +++++++++++++++++++++++++------
 examples/rbm.py                 |  2 +-
 examples/svm.py                 | 15 ++++++++++++---
 examples/t-sne.py               |  8 +++++++-
 13 files changed, 157 insertions(+), 35 deletions(-)

diff --git a/examples/gbm.py b/examples/gbm.py
index 99f14d55..f3f85fdf 100644
--- a/examples/gbm.py
+++ b/examples/gbm.py
@@ -18,11 +18,21 @@
 def classification():
     # Generate a random binary classification problem.
     X, y = make_classification(
-        n_samples=350, n_features=15, n_informative=10, random_state=1111, n_classes=2, class_sep=1.0, n_redundant=0
+        n_samples=350,
+        n_features=15,
+        n_informative=10,
+        random_state=1111,
+        n_classes=2,
+        class_sep=1.0,
+        n_redundant=0,
+    )
+    X_train, X_test, y_train, y_test = train_test_split(
+        X, y, test_size=0.15, random_state=1111
     )
-    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=1111)
 
-    model = GradientBoostingClassifier(n_estimators=50, max_depth=4, max_features=8, learning_rate=0.1)
+    model = GradientBoostingClassifier(
+        n_estimators=50, max_depth=4, max_features=8, learning_rate=0.1
+    )
     model.fit(X_train, y_train)
     predictions = model.predict(X_test)
     print(predictions)
@@ -34,14 +44,25 @@ def classification():
 def regression():
     # Generate a random regression problem
     X, y = make_regression(
-        n_samples=500, n_features=5, n_informative=5, n_targets=1, noise=0.05, random_state=1111, bias=0.5
+        n_samples=500,
+        n_features=5,
+        n_informative=5,
+        n_targets=1,
+        noise=0.05,
+        random_state=1111,
+        bias=0.5,
+    )
+    X_train, X_test, y_train, y_test = train_test_split(
+        X, y, test_size=0.1, random_state=1111
     )
-    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=1111)
 
     model = GradientBoostingRegressor(n_estimators=25, max_depth=5, max_features=3)
     model.fit(X_train, y_train)
     predictions = model.predict(X_test)
-    print("regression, mse: %s" % mean_squared_error(y_test.flatten(), predictions.flatten()))
+    print(
+        "regression, mse: %s"
+        % mean_squared_error(y_test.flatten(), predictions.flatten())
+    )
 
 
 if __name__ == "__main__":
diff --git a/examples/kmeans.py b/examples/kmeans.py
index 9e08a689..9714c43e 100644
--- a/examples/kmeans.py
+++ b/examples/kmeans.py
@@ -5,7 +5,9 @@
 
 
 def kmeans_example(plot=False):
-    X, y = make_blobs(centers=4, n_samples=500, n_features=2, shuffle=True, random_state=42)
+    X, y = make_blobs(
+        centers=4, n_samples=500, n_features=2, shuffle=True, random_state=42
+    )
     clusters = len(np.unique(y))
     k = KMeans(K=clusters, max_iters=150, init="++")
     k.fit(X)
diff --git a/examples/linear_models.py b/examples/linear_models.py
index e553661d..9bdb1cd0 100644
--- a/examples/linear_models.py
+++ b/examples/linear_models.py
@@ -17,9 +17,17 @@
 def regression():
     # Generate a random regression problem
     X, y = make_regression(
-        n_samples=10000, n_features=100, n_informative=75, n_targets=1, noise=0.05, random_state=1111, bias=0.5
+        n_samples=10000,
+        n_features=100,
+        n_informative=75,
+        n_targets=1,
+        noise=0.05,
+        random_state=1111,
+        bias=0.5,
+    )
+    X_train, X_test, y_train, y_test = train_test_split(
+        X, y, test_size=0.25, random_state=1111
     )
-    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=1111)
 
     model = LinearRegression(lr=0.01, max_iters=2000, penalty="l2", C=0.03)
     model.fit(X_train, y_train)
@@ -30,9 +38,16 @@ def regression():
 def classification():
     # Generate a random binary classification problem.
     X, y = make_classification(
-        n_samples=1000, n_features=100, n_informative=75, random_state=1111, n_classes=2, class_sep=2.5
+        n_samples=1000,
+        n_features=100,
+        n_informative=75,
+        random_state=1111,
+        n_classes=2,
+        class_sep=2.5,
+    )
+    X_train, X_test, y_train, y_test = train_test_split(
+        X, y, test_size=0.1, random_state=1111
     )
-    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=1111)
 
     model = LogisticRegression(lr=0.01, max_iters=500, penalty="l1", C=0.01)
     model.fit(X_train, y_train)
diff --git a/examples/naive_bayes.py b/examples/naive_bayes.py
index 383e997d..9e051d48 100644
--- a/examples/naive_bayes.py
+++ b/examples/naive_bayes.py
@@ -8,9 +8,17 @@
 def classification():
     # Generate a random binary classification problem.
     X, y = make_classification(
-        n_samples=1000, n_features=10, n_informative=10, random_state=1111, n_classes=2, class_sep=2.5, n_redundant=0
+        n_samples=1000,
+        n_features=10,
+        n_informative=10,
+        random_state=1111,
+        n_classes=2,
+        class_sep=2.5,
+        n_redundant=0,
+    )
+    X_train, X_test, y_train, y_test = train_test_split(
+        X, y, test_size=0.1, random_state=1111
     )
-    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=1111)
 
     model = NaiveBayesClassifier()
     model.fit(X_train, y_train)
diff --git a/examples/nearest_neighbors.py b/examples/nearest_neighbors.py
index d68bf208..f551ab05 100644
--- a/examples/nearest_neighbors.py
+++ b/examples/nearest_neighbors.py
@@ -13,9 +13,17 @@
 def regression():
     # Generate a random regression problem
     X, y = make_regression(
-        n_samples=500, n_features=5, n_informative=5, n_targets=1, noise=0.05, random_state=1111, bias=0.5
+        n_samples=500,
+        n_features=5,
+        n_informative=5,
+        n_targets=1,
+        noise=0.05,
+        random_state=1111,
+        bias=0.5,
+    )
+    X_train, X_test, y_train, y_test = train_test_split(
+        X, y, test_size=0.25, random_state=1111
     )
-    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=1111)
 
     model = knn.KNNRegressor(k=5, distance_func=distance.euclidean)
     model.fit(X_train, y_train)
@@ -35,7 +43,9 @@ def classification():
         class_sep=1.5,
     )
 
-    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=1111)
+    X_train, X_test, y_train, y_test = train_test_split(
+        X, y, test_size=0.1, random_state=1111
+    )
 
     clf = knn.KNNClassifier(k=5, distance_func=distance.euclidean)
 
diff --git a/examples/nnet_convnet_mnist.py b/examples/nnet_convnet_mnist.py
index aff0b361..4161a060 100644
--- a/examples/nnet_convnet_mnist.py
+++ b/examples/nnet_convnet_mnist.py
@@ -3,7 +3,14 @@
 from mla.datasets import load_mnist
 from mla.metrics import accuracy
 from mla.neuralnet import NeuralNet
-from mla.neuralnet.layers import Activation, Convolution, MaxPooling, Flatten, Dropout, Parameters
+from mla.neuralnet.layers import (
+    Activation,
+    Convolution,
+    MaxPooling,
+    Flatten,
+    Dropout,
+    Parameters,
+)
 from mla.neuralnet.layers import Dense
 from mla.neuralnet.optimizers import Adadelta
 from mla.utils import one_hot
diff --git a/examples/nnet_mlp.py b/examples/nnet_mlp.py
index f35a4120..484989b3 100644
--- a/examples/nnet_mlp.py
+++ b/examples/nnet_mlp.py
@@ -23,10 +23,17 @@
 def classification():
     # Generate a random binary classification problem.
     X, y = make_classification(
-        n_samples=1000, n_features=100, n_informative=75, random_state=1111, n_classes=2, class_sep=2.5
+        n_samples=1000,
+        n_features=100,
+        n_informative=75,
+        random_state=1111,
+        n_classes=2,
+        class_sep=2.5,
     )
     y = one_hot(y)
-    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=1111)
+    X_train, X_test, y_train, y_test = train_test_split(
+        X, y, test_size=0.15, random_state=1111
+    )
 
     model = NeuralNet(
         layers=[
@@ -51,9 +58,18 @@ def classification():
 
 def regression():
     # Generate a random regression problem
-    X, y = make_regression(n_samples=5000, n_features=25, n_informative=25, n_targets=1, random_state=100, noise=0.05)
+    X, y = make_regression(
+        n_samples=5000,
+        n_features=25,
+        n_informative=25,
+        n_targets=1,
+        random_state=100,
+        noise=0.05,
+    )
     y *= 0.01
-    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=1111)
+    X_train, X_test, y_train, y_test = train_test_split(
+        X, y, test_size=0.1, random_state=1111
+    )
 
     model = NeuralNet(
         layers=[
diff --git a/examples/nnet_rnn_binary_add.py b/examples/nnet_rnn_binary_add.py
index d019201b..5057cc5b 100644
--- a/examples/nnet_rnn_binary_add.py
+++ b/examples/nnet_rnn_binary_add.py
@@ -38,7 +38,9 @@ def addition_dataset(dim=10, n_samples=10000, batch_size=64):
         # Generate target variable (a+b)
         y[i, :, 0] = list(reversed([int(x) for x in binary_format.format(a + b)]))
 
-    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1111)
+    X_train, X_test, y_train, y_test = train_test_split(
+        X, y, test_size=0.2, random_state=1111
+    )
 
     # Round number of examples for batch processing
     train_b = (X_train.shape[0] // batch_size) * batch_size
diff --git a/examples/pca.py b/examples/pca.py
index 4b7bf3ac..10321ada 100644
--- a/examples/pca.py
+++ b/examples/pca.py
@@ -12,11 +12,18 @@
 
 # Generate a random binary classification problem.
 X, y = make_classification(
-    n_samples=1000, n_features=100, n_informative=75, random_state=1111, n_classes=2, class_sep=2.5
+    n_samples=1000,
+    n_features=100,
+    n_informative=75,
+    random_state=1111,
+    n_classes=2,
+    class_sep=2.5,
 )
 
 
-X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=1111)
+X_train, X_test, y_train, y_test = train_test_split(
+    X, y, test_size=0.25, random_state=1111
+)
 
 for s in ["svd", "eigen"]:
     p = PCA(15, solver=s)
diff --git a/examples/random_forest.py b/examples/random_forest.py
index ad0c2261..f2fcb44e 100644
--- a/examples/random_forest.py
+++ b/examples/random_forest.py
@@ -19,17 +19,25 @@
 def classification():
     # Generate a random binary classification problem.
     X, y = make_classification(
-        n_samples=500, n_features=10, n_informative=10, random_state=1111, n_classes=2, class_sep=2.5, n_redundant=0
+        n_samples=500,
+        n_features=10,
+        n_informative=10,
+        random_state=1111,
+        n_classes=2,
+        class_sep=2.5,
+        n_redundant=0,
     )
 
-    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=1111)
+    X_train, X_test, y_train, y_test = train_test_split(
+        X, y, test_size=0.15, random_state=1111
+    )
 
     model = RandomForestClassifier(n_estimators=10, max_depth=4)
     model.fit(X_train, y_train)
 
     predictions_prob = model.predict(X_test)[:, 1]
     predictions = np.argmax(model.predict(X_test), axis=1)
-    #print(predictions.shape)
+    # print(predictions.shape)
     print("classification, roc auc score: %s" % roc_auc_score(y_test, predictions_prob))
     print("classification, accuracy score: %s" % accuracy_score(y_test, predictions))
 
@@ -37,14 +45,25 @@ def classification():
 def regression():
     # Generate a random regression problem
     X, y = make_regression(
-        n_samples=500, n_features=5, n_informative=5, n_targets=1, noise=0.05, random_state=1111, bias=0.5
+        n_samples=500,
+        n_features=5,
+        n_informative=5,
+        n_targets=1,
+        noise=0.05,
+        random_state=1111,
+        bias=0.5,
+    )
+    X_train, X_test, y_train, y_test = train_test_split(
+        X, y, test_size=0.1, random_state=1111
     )
-    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=1111)
 
     model = RandomForestRegressor(n_estimators=50, max_depth=10, max_features=3)
     model.fit(X_train, y_train)
     predictions = model.predict(X_test)
-    print("regression, mse: %s" % mean_squared_error(y_test.flatten(), predictions.flatten()))
+    print(
+        "regression, mse: %s"
+        % mean_squared_error(y_test.flatten(), predictions.flatten())
+    )
 
 
 if __name__ == "__main__":
diff --git a/examples/rbm.py b/examples/rbm.py
index 74f2a772..2d167644 100644
--- a/examples/rbm.py
+++ b/examples/rbm.py
@@ -13,7 +13,7 @@ def print_curve(rbm):
     def moving_average(a, n=25):
         ret = np.cumsum(a, dtype=float)
         ret[n:] = ret[n:] - ret[:-n]
-        return ret[n - 1:] / n
+        return ret[n - 1 :] / n
 
     plt.plot(moving_average(rbm.errors))
     plt.show()
diff --git a/examples/svm.py b/examples/svm.py
index 19535a4d..062a8710 100644
--- a/examples/svm.py
+++ b/examples/svm.py
@@ -16,17 +16,26 @@
 def classification():
     # Generate a random binary classification problem.
     X, y = make_classification(
-        n_samples=1200, n_features=10, n_informative=5, random_state=1111, n_classes=2, class_sep=1.75
+        n_samples=1200,
+        n_features=10,
+        n_informative=5,
+        random_state=1111,
+        n_classes=2,
+        class_sep=1.75,
     )
     # Convert y to {-1, 1}
     y = (y * 2) - 1
-    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1111)
+    X_train, X_test, y_train, y_test = train_test_split(
+        X, y, test_size=0.2, random_state=1111
+    )
 
     for kernel in [RBF(gamma=0.1), Linear()]:
         model = SVM(max_iter=500, kernel=kernel, C=0.6)
         model.fit(X_train, y_train)
         predictions = model.predict(X_test)
-        print("Classification accuracy (%s): %s" % (kernel, accuracy(y_test, predictions)))
+        print(
+            "Classification accuracy (%s): %s" % (kernel, accuracy(y_test, predictions))
+        )
 
 
 if __name__ == "__main__":
diff --git a/examples/t-sne.py b/examples/t-sne.py
index 36873e91..bd08581d 100644
--- a/examples/t-sne.py
+++ b/examples/t-sne.py
@@ -8,7 +8,13 @@
 logging.basicConfig(level=logging.DEBUG)
 
 X, y = make_classification(
-    n_samples=500, n_features=10, n_informative=5, n_redundant=0, random_state=1111, n_classes=2, class_sep=2.5
+    n_samples=500,
+    n_features=10,
+    n_informative=5,
+    n_redundant=0,
+    random_state=1111,
+    n_classes=2,
+    class_sep=2.5,
 )
 
 p = TSNE(2, max_iter=500)