Skip to content

Commit cae3ce0

Browse files
committed
add dimensionality reduction using feature selection tutorial
1 parent ccd6b89 commit cae3ce0

File tree

4 files changed

+446
-0
lines changed

4 files changed

+446
-0
lines changed
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
# [Dimensionality Reduction Using Feature Selection in Python](https://www.thepythoncode.com/article/dimensionality-reduction-feature-selection)
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,306 @@
1+
{
2+
"cells": [
3+
{
4+
"cell_type": "code",
5+
"execution_count": null,
6+
"metadata": {
7+
"id": "iImkWEpRSiRq"
8+
},
9+
"outputs": [],
10+
"source": [
11+
"\n",
12+
"# Load libraries\n",
13+
"import pandas as pd\n",
14+
"import numpy as np\n",
15+
"from sklearn.datasets import load_iris, make_regression\n",
16+
"from sklearn.feature_selection import SelectKBest, chi2, f_classif, SelectPercentile, VarianceThreshold, RFECV\n",
17+
"from sklearn.preprocessing import StandardScaler\n",
18+
"import warnings\n",
19+
"from sklearn import datasets, linear_model"
20+
]
21+
},
22+
{
23+
"cell_type": "code",
24+
"execution_count": null,
25+
"metadata": {
26+
"colab": {
27+
"base_uri": "https://localhost:8080/"
28+
},
29+
"id": "ZEK7KAyzSokS",
30+
"outputId": "7ce72382-c116-4f51-df7b-1f975c1c25f8"
31+
},
32+
"outputs": [],
33+
"source": [
34+
"# Variance thresholding on the iris dataset\n",
35+
"# import data\n",
36+
"iris = datasets.load_iris()\n",
37+
"# Create features and target\n",
38+
"features_i = iris.data\n",
39+
"target_i = iris.target\n",
40+
"# thresholder creation\n",
41+
"thresholder = VarianceThreshold(threshold=.4)\n",
42+
"# high variance feature matrix creation\n",
43+
"f_high_variance = thresholder.fit_transform(features_i)\n",
44+
"# View high variance feature matrix\n",
45+
"f_high_variance[0:3]"
46+
]
47+
},
48+
{
49+
"cell_type": "code",
50+
"execution_count": null,
51+
"metadata": {
52+
"colab": {
53+
"base_uri": "https://localhost:8080/"
54+
},
55+
"id": "7ZZgOg1-SpuX",
56+
"outputId": "a869adde-0b29-4630-9661-34377f110d4f"
57+
},
58+
"outputs": [],
59+
"source": [
60+
"# View variances\n",
61+
"thresholder.fit(features_i).variances_"
62+
]
63+
},
64+
{
65+
"cell_type": "code",
66+
"execution_count": null,
67+
"metadata": {
68+
"colab": {
69+
"base_uri": "https://localhost:8080/"
70+
},
71+
"id": "zYNK4wP5Sq9R",
72+
"outputId": "30e18ea5-4b63-43e5-819e-9a99251dfae6"
73+
},
74+
"outputs": [],
75+
"source": [
76+
"\n",
77+
"# Standardize the feature matrix\n",
78+
"scaler = StandardScaler()\n",
79+
"f_std = scaler.fit_transform(features_i)\n",
80+
"# variance of each feature calculation\n",
81+
"selection = VarianceThreshold()\n",
82+
"selection.fit(f_std).variances_"
83+
]
84+
},
85+
{
86+
"cell_type": "code",
87+
"execution_count": null,
88+
"metadata": {
89+
"colab": {
90+
"base_uri": "https://localhost:8080/"
91+
},
92+
"id": "jDGMP97LSuiB",
93+
"outputId": "c1b9d537-495f-4109-ef75-324fe9943668"
94+
},
95+
"outputs": [],
96+
"source": [
97+
"# feature matrix creation with:\n",
98+
"# for Feature 0: 80% class 0\n",
99+
"# for Feature 1: 60% class 1, 20% class 0, 20% class 2 (not binary)\n",
100+
"# for Feature 2: 60% class 0, 40% class 1\n",
101+
"features_i = [[0, 2, 0],\n",
102+
"[0, 1, 1],\n",
103+
"[0, 1, 0],\n",
104+
"[0, 1, 1],\n",
105+
"[1, 0, 0]]\n",
106+
"# threshold by variance\n",
107+
"thresholding = VarianceThreshold(threshold=(.65 * (1 - .65)))\n",
108+
"thresholding.fit_transform(features_i)"
109+
]
110+
},
111+
{
112+
"cell_type": "code",
113+
"execution_count": null,
114+
"metadata": {
115+
"colab": {
116+
"base_uri": "https://localhost:8080/",
117+
"height": 198
118+
},
119+
"id": "JvnObeKXS6xm",
120+
"outputId": "19dac143-9407-4bb4-cc23-b19b06025617"
121+
},
122+
"outputs": [],
123+
"source": [
124+
"# Create feature matrix with two highly correlated features\n",
125+
"features_m = np.array([[1, 1, 1],\n",
126+
"[2, 2, 0],\n",
127+
"[3, 3, 1],\n",
128+
"[4, 4, 0],\n",
129+
"[5, 5, 1],\n",
130+
"[6, 6, 0],\n",
131+
"[7, 7, 1],\n",
132+
"[8, 7, 0],\n",
133+
"[9, 7, 1]])\n",
134+
"# Convert the feature matrix into a DataFrame\n",
135+
"dataframe = pd.DataFrame(features_m)\n",
136+
"# correlation matrix creation\n",
137+
"corr_m = dataframe.corr().abs()\n",
138+
"# Select the upper triangle of the correlation matrix (k=1 excludes the diagonal);\n# the builtin bool is used because the np.bool alias was removed in NumPy 1.24\n",
139+
"upper1 = corr_m.where(np.triu(np.ones(corr_m.shape),\n",
140+
"k=1).astype(bool))\n",
141+
"# Find the index of feature columns with a correlation greater than 0.85\n",
142+
"droping = [col for col in upper1.columns if any(upper1[col] > 0.85)]\n",
143+
"# Drop features\n",
144+
"dataframe.drop(dataframe.columns[droping], axis=1).head(3)"
145+
]
146+
},
147+
{
148+
"cell_type": "code",
149+
"execution_count": null,
150+
"metadata": {
151+
"colab": {
152+
"base_uri": "https://localhost:8080/"
153+
},
154+
"id": "Dos1ZfkDS-Zd",
155+
"outputId": "17e96f0d-a55a-4943-90a9-99aa3c31fad3"
156+
},
157+
"outputs": [],
158+
"source": [
159+
"# Load the iris data and read features/target from the object loaded in this cell\n",
160+
"iris_i = load_iris()\n",
161+
"features_v = iris_i.data\n",
162+
"target = iris_i.target\n",
163+
"# Convert to categorical data by casting the features to integers\n",
164+
"features_v = features_v.astype(int)\n",
165+
"# Selection of two features using highest chi-squared \n",
166+
"chi2_s = SelectKBest(chi2, k=2)\n",
167+
"f_kbest = chi2_s.fit_transform(features_v, target)\n",
168+
"# Show results\n",
169+
"print(\"Original number of features:\", features_v.shape[1])\n",
170+
"print(\"Reduced number of features:\", f_kbest.shape[1])"
171+
]
172+
},
173+
{
174+
"cell_type": "code",
175+
"execution_count": null,
176+
"metadata": {
177+
"colab": {
178+
"base_uri": "https://localhost:8080/"
179+
},
180+
"id": "y10u_gQbTCwR",
181+
"outputId": "651182ab-d857-4a3d-db61-4fff866d167c"
182+
},
183+
"outputs": [],
184+
"source": [
185+
"# Selection of two features using highest F-values\n",
186+
"f_selector = SelectKBest(f_classif, k=2)\n",
187+
"f_kbest = f_selector.fit_transform(features_v, target)\n",
188+
"# Display results\n",
189+
"print(\"Original number of features:\", features_v.shape[1])\n",
190+
"print(\"Reduced number of features:\", f_kbest.shape[1])"
191+
]
192+
},
193+
{
194+
"cell_type": "code",
195+
"execution_count": null,
196+
"metadata": {
197+
"colab": {
198+
"base_uri": "https://localhost:8080/"
199+
},
200+
"id": "5NXAa6UKTHiu",
201+
"outputId": "c34866b2-c08c-4020-b14d-78deb98f2834"
202+
},
203+
"outputs": [],
204+
"source": [
205+
"# Selection of top 65% of features \n",
206+
"f_selector = SelectPercentile(f_classif, percentile=65)\n",
207+
"f_kbest = f_selector.fit_transform(features_v, target)\n",
208+
"# Display results\n",
209+
"print(\"Original number of features:\", features_v.shape[1])\n",
210+
"print(\"Reduced number of features:\", f_kbest.shape[1])"
211+
]
212+
},
213+
{
214+
"cell_type": "code",
215+
"execution_count": null,
216+
"metadata": {
217+
"colab": {
218+
"base_uri": "https://localhost:8080/"
219+
},
220+
"id": "39-Wq-F9TKVg",
221+
"outputId": "e52c0537-2245-4f12-ea9a-ace232984ec1"
222+
},
223+
"outputs": [],
224+
"source": [
225+
"# Recursive feature elimination with cross-validation on synthetic regression data\n",
226+
"# Suppress an annoying but harmless warning\n",
227+
"warnings.filterwarnings(action=\"ignore\", module=\"scipy\",\n",
228+
"message=\"^internal gelsd\")\n",
229+
"# Generate the features matrix and target vector\n",
230+
"features_f, target_t = make_regression(n_samples = 10000,\n",
231+
"n_features = 100,\n",
232+
"n_informative = 2,\n",
233+
"random_state = 1)\n",
234+
"# linear regression creation\n",
235+
"ols = linear_model.LinearRegression()\n",
236+
"# Recursive features elimination\n",
237+
"rfecv = RFECV(estimator=ols, step=2, scoring=\"neg_mean_squared_error\")\n",
238+
"rfecv.fit(features_f, target_t)\n",
239+
"rfecv.transform(features_f)"
240+
]
241+
},
242+
{
243+
"cell_type": "code",
244+
"execution_count": null,
245+
"metadata": {
246+
"colab": {
247+
"base_uri": "https://localhost:8080/"
248+
},
249+
"id": "Ut1mgIGEUhJM",
250+
"outputId": "f365a4d5-63f4-4a55-e828-d331e6f06308"
251+
},
252+
"outputs": [],
253+
"source": [
254+
"# Number of best features\n",
255+
"rfecv.n_features_"
256+
]
257+
},
258+
{
259+
"cell_type": "code",
260+
"execution_count": null,
261+
"metadata": {
262+
"colab": {
263+
"base_uri": "https://localhost:8080/"
264+
},
265+
"id": "Lpt7I_Q0UjN1",
266+
"outputId": "4d6938dc-d813-42a5-c1b7-9ba4865a0e86"
267+
},
268+
"outputs": [],
269+
"source": [
270+
"# Which features were selected as the best?\n",
271+
"rfecv.support_"
272+
]
273+
},
274+
{
275+
"cell_type": "code",
276+
"execution_count": null,
277+
"metadata": {
278+
"colab": {
279+
"base_uri": "https://localhost:8080/"
280+
},
281+
"id": "ojYKsEbTUkMu",
282+
"outputId": "98652d92-f58f-41fe-9ba1-b1ecd3ef7ecb"
283+
},
284+
"outputs": [],
285+
"source": [
286+
"# We can even see how the features are ranked\n",
287+
"rfecv.ranking_"
288+
]
289+
}
290+
],
291+
"metadata": {
292+
"colab": {
293+
"name": "Untitled42.ipynb",
294+
"provenance": []
295+
},
296+
"kernelspec": {
297+
"display_name": "Python 3",
298+
"name": "python3"
299+
},
300+
"language_info": {
301+
"name": "python"
302+
}
303+
},
304+
"nbformat": 4,
305+
"nbformat_minor": 0
306+
}

0 commit comments

Comments
 (0)