Commit c50100f

add dimensionality reduction feature extraction tutorial

1 parent a29ed38

File tree: 5 files changed, 455 additions and 0 deletions

README.md

Lines changed: 1 addition & 0 deletions

```diff
@@ -91,6 +91,7 @@ This is a repository of all the tutorials of [The Python Code](https://www.thepy
 - [Logistic Regression using PyTorch in Python](https://www.thepythoncode.com/article/logistic-regression-using-pytorch). ([code](machine-learning/logistic-regression-in-pytorch))
 - [Dropout Regularization using PyTorch in Python](https://www.thepythoncode.com/article/dropout-regularization-in-pytorch). ([code](machine-learning/dropout-in-pytorch))
 - [K-Fold Cross Validation using Scikit-Learn in Python](https://www.thepythoncode.com/article/kfold-cross-validation-using-sklearn-in-python). ([code](machine-learning/k-fold-cross-validation-sklearn))
+- [Dimensionality Reduction: Feature Extraction using Scikit-learn in Python](https://www.thepythoncode.com/article/dimensionality-reduction-using-feature-extraction-sklearn). ([code](machine-learning/dimensionality-reduction-feature-extraction))

 - ### [General Python Topics](https://www.thepythoncode.com/topic/general-python-topics)
 - [How to Make Facebook Messenger bot in Python](https://www.thepythoncode.com/article/make-bot-fbchat-python). ([code](general/messenger-bot))
```
DimentionalityReductionUsingFeatureExtraction_PythonCodeTutorial.ipynb

Lines changed: 341 additions & 0 deletions (new file)

```python
from sklearn import datasets
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.decomposition import PCA, KernelPCA, NMF, TruncatedSVD
from sklearn.datasets import make_circles
from sklearn.preprocessing import StandardScaler
from scipy.sparse import csr_matrix
```
```python
# Load the data
digits = datasets.load_digits()
# Standardize the feature matrix
features = StandardScaler().fit_transform(digits.data)
# Create a PCA that retains 95% of the variance
pca = PCA(n_components=0.95, whiten=True)
# Fit the PCA and transform the features
pcafeatures = pca.fit_transform(features)
# Display results
print("Original number of features:", features.shape[1])
print("Reduced number of features:", pcafeatures.shape[1])
```

Output:

```
Original number of features: 64
Reduced number of features: 40
```
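Passing a float to `n_components` makes scikit-learn keep the smallest number of components whose cumulative explained variance reaches that fraction. As a quick check, a minimal sketch (`n_components_` and `explained_variance_ratio_` are attributes of the fitted `PCA`):

```python
# Sanity check: components kept and the variance they explain together
print("Components kept:", pca.n_components_)
print("Variance retained:", pca.explained_variance_ratio_.sum())
```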
```python
# Create linearly inseparable data
features, _ = make_circles(n_samples=2000, random_state=1, noise=0.1, factor=0.1)
# Apply kernel PCA with a radial basis function (RBF) kernel
k_pca = KernelPCA(kernel="rbf", gamma=16, n_components=1)
k_pcaf = k_pca.fit_transform(features)
print("Original number of features:", features.shape[1])
print("Reduced number of features:", k_pcaf.shape[1])
```

Output:

```
Original number of features: 2
Reduced number of features: 1
```
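Plain linear PCA would also reduce this data to one feature, but a linear projection cannot separate two concentric rings; the RBF kernel is what makes a one-dimensional separation possible. An illustrative sketch of the contrast (this comparison is an assumption for illustration, regenerating the circles with the labels the cell above discards):

```python
import numpy as np

# Regenerate the circles, this time keeping the class labels
X, y = make_circles(n_samples=2000, random_state=1, noise=0.1, factor=0.1)
lin_1d = PCA(n_components=1).fit_transform(X)
rbf_1d = KernelPCA(kernel="rbf", gamma=16, n_components=1).fit_transform(X)
# Per-class means: both rings are centered at the origin, so they should be
# nearly identical under linear PCA but separated under RBF kernel PCA
print("linear PCA class means:", lin_1d[y == 0].mean(), lin_1d[y == 1].mean())
print("kernel PCA class means:", rbf_1d[y == 0].mean(), rbf_1d[y == 1].mean())
```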
```python
# Load the Iris flower dataset
iris = datasets.load_iris()
features = iris.data
target = iris.target
# Create the LDA and use it to transform the features
lda = LinearDiscriminantAnalysis(n_components=1)
features_lda = lda.fit(features, target).transform(features)
# Print the number of features
print("Original number of features:", features.shape[1])
print("Reduced number of features:", features_lda.shape[1])
```

Output:

```
Original number of features: 4
Reduced number of features: 1
```
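Unlike PCA, LDA is supervised (it needs `target`) and can produce at most `n_classes - 1` components, so two is the ceiling for the three-class iris data. The fitted object also doubles as a classifier; a minimal sketch using scikit-learn's standard `score` method:

```python
# LDA is also a classifier: accuracy on the training data
print("Training accuracy:", lda.score(features, target))
```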
```python
# Fraction of the variance explained by the single LDA component
lda.explained_variance_ratio_
```

Output:

```
array([0.9912126])
```
```python
# Load the Iris flower dataset
iris123 = datasets.load_iris()
features = iris123.data
target = iris123.target
# Create and run LDA, keeping all components
lda_r = LinearDiscriminantAnalysis(n_components=None)
lda_r.fit(features, target)
# Array of explained variance ratios
lda_var_r = lda_r.explained_variance_ratio_

# Create a function that returns the number of components
# required to reach a goal level of explained variance
def select_n_c(v_ratio, g_var: float) -> int:
    # Initialize the total variance explained
    total_v = 0.0
    # Initialize the number of components
    n_components = 0
    # Walk through the explained variance of each component:
    for explained_v in v_ratio:
        # Add the explained variance to the total
        total_v += explained_v
        # Add one to the number of components
        n_components += 1
        # Once we attain our goal level of explained variance
        if total_v >= g_var:
            # end the loop
            break
    # Return the number of components
    return n_components

# Run the function
select_n_c(lda_var_r, 0.95)
```

Output:

```
1
```
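With NumPy the same selection is a one-liner, and for PCA specifically the helper is unnecessary because a float `n_components` (as in the first PCA cell) does this selection internally. A minimal sketch:

```python
import numpy as np

# Index of the first cumulative ratio that reaches the goal, plus one
n = int(np.argmax(np.cumsum(lda_var_r) >= 0.95)) + 1
print(n)  # 1, matching select_n_c above
```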
```python
# Load the data
digit = datasets.load_digits()
# Load the feature matrix
feature_m = digit.data
# Create, fit, and apply NMF
n_mf = NMF(n_components=12, random_state=1)
features_nmf = n_mf.fit_transform(feature_m)
# Show results
print("Original number of features:", feature_m.shape[1])
print("Reduced number of features:", features_nmf.shape[1])
```

Output:

```
/usr/local/lib/python3.7/dist-packages/sklearn/decomposition/_nmf.py:294: FutureWarning: The 'init' value, when 'init=None' and n_components is less than n_samples and n_features, will be changed from 'nndsvd' to 'nndsvda' in 1.1 (renaming of 0.26).
Original number of features: 64
Reduced number of features: 12
/usr/local/lib/python3.7/dist-packages/sklearn/decomposition/_nmf.py:1641: ConvergenceWarning: Maximum number of iterations 200 reached. Increase it to improve convergence.
```
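Both warnings point at fixable settings: pass an explicit `init` and raise `max_iter`. NMF also records the final Frobenius reconstruction error, a quick way to judge the factorization. A minimal sketch (the `init`/`max_iter` parameters and the `reconstruction_err_` attribute are standard scikit-learn NMF API; the exact values here are illustrative):

```python
# Re-fit with an explicit init and more iterations to address the warnings
n_mf2 = NMF(n_components=12, init="nndsvda", max_iter=1000, random_state=1)
features_nmf2 = n_mf2.fit_transform(feature_m)
# Frobenius norm of (X - WH) for the fitted model
print("Reconstruction error:", n_mf2.reconstruction_err_)
```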
```python
# Load the data
digit123 = datasets.load_digits()
# Standardize the feature matrix
features_m = StandardScaler().fit_transform(digit123.data)
# Make a sparse matrix
f_sparse = csr_matrix(features_m)
# Create the TSVD
tsvd = TruncatedSVD(n_components=12)
# Apply TSVD to the sparse matrix
features_sp_tsvd = tsvd.fit(f_sparse).transform(f_sparse)
# Show results
print("Original number of features:", f_sparse.shape[1])
print("Reduced number of features:", features_sp_tsvd.shape[1])
```

Output:

```
Original number of features: 64
Reduced number of features: 12
```
```python
# Sum of the first three components' explained variance ratios
tsvd.explained_variance_ratio_[0:3].sum()
```

Output:

```
0.3003938539283667
```
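TruncatedSVD does not accept a float `n_components` the way PCA does, so threshold-based selection has to be done manually: fit with one fewer component than there are features (the maximum TruncatedSVD allows), then reuse `select_n_c` from the LDA cell. A hedged sketch:

```python
# Fit with the maximum number of components TruncatedSVD allows
tsvd_full = TruncatedSVD(n_components=features_m.shape[1] - 1)
tsvd_full.fit(f_sparse)
# Smallest number of components reaching 95% explained variance
print(select_n_c(tsvd_full.explained_variance_ratio_, 0.95))
```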
The notebook closes with an empty cell; its metadata records a Python 3 kernel (Python 3.9.12).
New file: 1 addition & 0 deletions

```diff
@@ -0,0 +1 @@
+# [Dimensionality Reduction: Feature Extraction using Scikit-learn in Python](https://www.thepythoncode.com/article/dimensionality-reduction-using-feature-extraction-sklearn)
```
