Borgio
diff --git a/‎machine-learning/dimensionality-reduction-feature-selection/README.md
Lines changed: 1 addition & 0 deletions b/‎machine-learning/dimensionality-reduction-feature-selection/README.md
Lines changed: 1 addition & 0 deletions
diff --git a/‎machine-learning/dimensionality-reduction-feature-selection/dimensionality_reduction_using_feature_selection.ipynb
Lines changed: 306 additions & 0 deletions b/‎machine-learning/dimensionality-reduction-feature-selection/dimensionality_reduction_using_feature_selection.ipynb
Lines changed: 306 additions & 0 deletions
@@ -0,0 +1 @@
+# [Dimensionality Reduction Using Feature Selection in Python](https://www.thepythoncode.com/article/dimensionality-reduction-feature-selection)
@@ -0,0 +1,306 @@
+{
+  "cells": [
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "iImkWEpRSiRq"
+      },
+      "outputs": [],
+      "source": [
+        "\n",
+        "# Load libraries\n",
+        "import pandas as pd\n",
+        "import numpy as np\n",
+        "from sklearn.datasets import load_iris, make_regression\n",
+        "from sklearn.feature_selection import SelectKBest, chi2, f_classif, SelectPercentile, VarianceThreshold, RFECV\n",
+        "from sklearn.preprocessing import StandardScaler\n",
+        "import warnings\n",
+        "from sklearn import datasets, linear_model"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "ZEK7KAyzSokS",
+        "outputId": "7ce72382-c116-4f51-df7b-1f975c1c25f8"
+      },
+      "outputs": [],
+      "source": [
+        "# Load the iris dataset\n",
+        "iris = datasets.load_iris()\n",
+        "# Create features and target\n",
+        "features_i = iris.data\n",
+        "target_i = iris.target\n",
+        "# thresholder  creation\n",
+        "thresholder = VarianceThreshold(threshold=.4)\n",
+        "# high variance feature matrix creation\n",
+        "f_high_variance = thresholder.fit_transform(features_i)\n",
+        "# View high variance feature matrix\n",
+        "f_high_variance[0:3]"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "7ZZgOg1-SpuX",
+        "outputId": "a869adde-0b29-4630-9661-34377f110d4f"
+      },
+      "outputs": [],
+      "source": [
+        "# View variances\n",
+        "thresholder.fit(features_i).variances_"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "zYNK4wP5Sq9R",
+        "outputId": "30e18ea5-4b63-43e5-819e-9a99251dfae6"
+      },
+      "outputs": [],
+      "source": [
+        "\n",
+        "# Standardize the feature matrix\n",
+        "scaler = StandardScaler()\n",
+        "f_std = scaler.fit_transform(features_i)\n",
+        "# Compute the variance of each standardized feature\n",
+        "selection = VarianceThreshold()\n",
+        "selection.fit(f_std).variances_"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "jDGMP97LSuiB",
+        "outputId": "c1b9d537-495f-4109-ef75-324fe9943668"
+      },
+      "outputs": [],
+      "source": [
+        "# Binary feature matrix where:\n",
+        "# feature 0 is 80% class 0\n",
+        "# feature 1 is 80% class 1\n",
+        "# feature 2 is 60% class 0, 40% class 1\n",
+        "# (the first row previously held a 2 in feature 1, which made that column\n",
+        "# non-binary and contradicted the proportions listed above)\n",
+        "features_i = [[0, 1, 0],\n",
+        "              [0, 1, 1],\n",
+        "              [0, 1, 0],\n",
+        "              [0, 1, 1],\n",
+        "              [1, 0, 0]]\n",
+        "# A binary (Bernoulli) feature has variance p * (1 - p), so this threshold\n",
+        "# drops features whose majority value covers 65% or more of the samples\n",
+        "thresholding = VarianceThreshold(threshold=(.65 * (1 - .65)))\n",
+        "thresholding.fit_transform(features_i)"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/",
+          "height": 198
+        },
+        "id": "JvnObeKXS6xm",
+        "outputId": "19dac143-9407-4bb4-cc23-b19b06025617"
+      },
+      "outputs": [],
+      "source": [
+        "# Create feature matrix with two highly correlated features\n",
+        "features_m = np.array([[1, 1, 1],\n",
+        "                       [2, 2, 0],\n",
+        "                       [3, 3, 1],\n",
+        "                       [4, 4, 0],\n",
+        "                       [5, 5, 1],\n",
+        "                       [6, 6, 0],\n",
+        "                       [7, 7, 1],\n",
+        "                       [8, 7, 0],\n",
+        "                       [9, 7, 1]])\n",
+        "# Convert the feature matrix to a DataFrame\n",
+        "dataframe = pd.DataFrame(features_m)\n",
+        "# Absolute pairwise correlation matrix\n",
+        "corr_m = dataframe.corr().abs()\n",
+        "# Keep only the strict upper triangle (k=1) so each pair is checked once and\n",
+        "# the diagonal is ignored. The builtin bool is used because the np.bool alias\n",
+        "# was removed in NumPy 1.24.\n",
+        "upper1 = corr_m.where(np.triu(np.ones(corr_m.shape), k=1).astype(bool))\n",
+        "# Labels of the columns whose correlation with an earlier column exceeds 0.85\n",
+        "droping = [col for col in upper1.columns if (upper1[col] > 0.85).any()]\n",
+        "# Drop those columns by label (not by position) and preview the result\n",
+        "dataframe.drop(columns=droping).head(3)"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "Dos1ZfkDS-Zd",
+        "outputId": "17e96f0d-a55a-4943-90a9-99aa3c31fad3"
+      },
+      "outputs": [],
+      "source": [
+        "# Load data\n",
+        "iris_i = load_iris()\n",
+        "# Read from the dataset loaded in this cell instead of relying on a variable\n",
+        "# left over from an earlier cell\n",
+        "target = iris_i.target\n",
+        "# Cast the measurements to integers so they can be treated as categorical\n",
+        "# data by the chi-squared test\n",
+        "features_v = iris_i.data.astype(int)\n",
+        "# Select the two features with the highest chi-squared statistics\n",
+        "chi2_s = SelectKBest(chi2, k=2)\n",
+        "f_kbest = chi2_s.fit_transform(features_v, target)\n",
+        "# Show results\n",
+        "print(\"Original number of features:\", features_v.shape[1])\n",
+        "print(\"Reduced number of features:\", f_kbest.shape[1])"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "y10u_gQbTCwR",
+        "outputId": "651182ab-d857-4a3d-db61-4fff866d167c"
+      },
+      "outputs": [],
+      "source": [
+        "# Selection of two features using highest F-values\n",
+        "f_selector = SelectKBest(f_classif, k=2)\n",
+        "f_kbest = f_selector.fit_transform(features_v, target)\n",
+        "# Display results\n",
+        "print(\"Original number of features:\", features_v.shape[1])\n",
+        "print(\"Reduced number of features:\", f_kbest.shape[1])"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "5NXAa6UKTHiu",
+        "outputId": "c34866b2-c08c-4020-b14d-78deb98f2834"
+      },
+      "outputs": [],
+      "source": [
+        "# Selection of top 65% of features \n",
+        "f_selector = SelectPercentile(f_classif, percentile=65)\n",
+        "f_kbest = f_selector.fit_transform(features_v, target)\n",
+        "# Display results\n",
+        "print(\"Original number of features:\", features_v.shape[1])\n",
+        "print(\"Reduced number of features:\", f_kbest.shape[1])"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "39-Wq-F9TKVg",
+        "outputId": "e52c0537-2245-4f12-ea9a-ace232984ec1"
+      },
+      "outputs": [],
+      "source": [
+        "# Suppress an annoying but harmless warning\n",
+        "warnings.filterwarnings(action=\"ignore\", module=\"scipy\",\n",
+        "message=\"^internal gelsd\")\n",
+        "# Generate the feature matrix and target vector\n",
+        "features_f, target_t = make_regression(n_samples = 10000,\n",
+        "n_features = 100,\n",
+        "n_informative = 2,\n",
+        "random_state = 1)\n",
+        "# linear regression creation\n",
+        "ols = linear_model.LinearRegression()\n",
+        "# Recursive features elimination\n",
+        "rfecv = RFECV(estimator=ols, step=2, scoring=\"neg_mean_squared_error\")\n",
+        "rfecv.fit(features_f, target_t)\n",
+        "rfecv.transform(features_f)"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "Ut1mgIGEUhJM",
+        "outputId": "f365a4d5-63f4-4a55-e828-d331e6f06308"
+      },
+      "outputs": [],
+      "source": [
+        "# Number of best features\n",
+        "rfecv.n_features_"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "Lpt7I_Q0UjN1",
+        "outputId": "4d6938dc-d813-42a5-c1b7-9ba4865a0e86"
+      },
+      "outputs": [],
+      "source": [
+        "# Which features were selected?\n",
+        "rfecv.support_"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "ojYKsEbTUkMu",
+        "outputId": "98652d92-f58f-41fe-9ba1-b1ecd3ef7ecb"
+      },
+      "outputs": [],
+      "source": [
+        "# We can even see how the features are ranked\n",
+        "rfecv.ranking_"
+      ]
+    }
+  ],
+  "metadata": {
+    "colab": {
+      "name": "Untitled42.ipynb",
+      "provenance": []
+    },
+    "kernelspec": {
+      "display_name": "Python 3",
+      "name": "python3"
+    },
+    "language_info": {
+      "name": "python"
+    }
+  },
+  "nbformat": 4,
+  "nbformat_minor": 0
+}
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1 @@`
	`1`	`+# [Dimensionality Reduction Using Feature Selection in Python](https://www.thepythoncode.com/article/dimensionality-reduction-feature-selection)`