10/25/2020 SomritaProject_Clustering
In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn.cluster import KMeans
import seaborn as sns
%matplotlib inline
In [2]:
data_bank=pd.read_csv("bank_marketing_part1_Data.csv")
data_bank.head()
Out[2]:
spending advance_payments probability_of_full_payment current_balance credit_limit mi
0 19.94 16.92 0.8752 6.675 3.763
1 15.99 14.89 0.9064 5.363 3.582
2 18.95 16.42 0.8829 6.248 3.755
3 10.83 12.96 0.8099 5.278 2.641
4 17.99 15.86 0.8992 5.890 3.694
In [4]:
data_bank.describe().T
Out[4]:
count mean std min 25% 50%
spending 210.0 14.847524 2.909699 10.5900 12.27000 14.35500 17.3
advance_payments 210.0 14.559286 1.305959 12.4100 13.45000 14.32000 15.7
probability_of_full_payment 210.0 0.870999 0.023629 0.8081 0.85690 0.87345 0.8
current_balance 210.0 5.628533 0.443063 4.8990 5.26225 5.52350 5.9
credit_limit 210.0 3.258605 0.377714 2.6300 2.94400 3.23700 3.5
min_payment_amt 210.0 3.700201 1.503557 0.7651 2.56150 3.59900 4.7
max_spent_in_single_shopping 210.0 5.408071 0.491480 4.5190 5.04500 5.22300 5.8
file:///C:/Users/Titlee_Joy/Downloads/SomritaProject_Clustering.html 1/20
10/25/2020 SomritaProject_Clustering
In [5]:
data_bank.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 210 entries, 0 to 209
Data columns (total 7 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 spending 210 non-null float64
1 advance_payments 210 non-null float64
2 probability_of_full_payment 210 non-null float64
3 current_balance 210 non-null float64
4 credit_limit 210 non-null float64
5 min_payment_amt 210 non-null float64
6 max_spent_in_single_shopping 210 non-null float64
dtypes: float64(7)
memory usage: 11.6 KB
In [6]:
data_bank.shape
Out[6]:
(210, 7)
In [7]:
data_bank.duplicated().sum()
Out[7]:
In [8]:
data_bank.isnull().sum()
Out[8]:
spending 0
advance_payments 0
probability_of_full_payment 0
current_balance 0
credit_limit 0
min_payment_amt 0
max_spent_in_single_shopping 0
dtype: int64
In [9]:
data_bank.columns
Out[9]:
Index(['spending', 'advance_payments', 'probability_of_full_payment',
'current_balance', 'credit_limit', 'min_payment_amt',
'max_spent_in_single_shopping'],
dtype='object')
file:///C:/Users/Titlee_Joy/Downloads/SomritaProject_Clustering.html 2/20
10/25/2020 SomritaProject_Clustering
In [10]:
columns=('spending', 'advance_payments', 'probability_of_full_payment',
'current_balance', 'credit_limit', 'min_payment_amt',
'max_spent_in_single_shopping')
plt.figure(figsize=[20,40])
for i in range(len(columns)):
plt.subplot(9,2,i+1)
sns.boxplot(data_bank[columns[i]])
In [11]:
def remove_outlier(col):
sorted(col)
Q1,Q3=col.quantile([0.25,0.75])
IQR=Q3-Q1
lower_range= Q1-(1.5 * IQR)
upper_range= Q3+(1.5 * IQR)
return lower_range, upper_range
file:///C:/Users/Titlee_Joy/Downloads/SomritaProject_Clustering.html 3/20
10/25/2020 SomritaProject_Clustering
In [12]:
lrincome,urincome=remove_outlier(data_bank['probability_of_full_payment'])
data_bank['probability_of_full_payment']=np.where(data_bank['probability_of_full_paymen
t']>urincome,urincome,data_bank['probability_of_full_payment'])
data_bank['probability_of_full_payment']=np.where(data_bank['probability_of_full_paymen
t']<lrincome,lrincome,data_bank['probability_of_full_payment'])
lrtravel,urtravel=remove_outlier(data_bank['min_payment_amt'])
data_bank['min_payment_amt']=np.where(data_bank['min_payment_amt']>urtravel,urtravel,da
ta_bank['min_payment_amt'])
data_bank['min_payment_amt']=np.where(data_bank['min_payment_amt']<lrtravel,lrtravel,da
ta_bank['min_payment_amt'])
In [13]:
columns=('spending', 'advance_payments', 'probability_of_full_payment',
'current_balance', 'credit_limit', 'min_payment_amt',
'max_spent_in_single_shopping')
plt.figure(figsize=[20,40])
for i in range(len(columns)):
plt.subplot(9,2,i+1)
sns.boxplot(data_bank[columns[i]])
file:///C:/Users/Titlee_Joy/Downloads/SomritaProject_Clustering.html 4/20
10/25/2020 SomritaProject_Clustering
In [14]:
columns=('spending', 'advance_payments', 'probability_of_full_payment',
'current_balance', 'credit_limit', 'min_payment_amt',
'max_spent_in_single_shopping')
plt.figure(figsize=[20,40])
for i in range(len(columns)):
plt.subplot(9,2,i+1)
sns.distplot(data_bank[columns[i]],bins=20)
file:///C:/Users/Titlee_Joy/Downloads/SomritaProject_Clustering.html 5/20
10/25/2020 SomritaProject_Clustering
In [19]:
data_bank.hist(figsize=(15,20))
file:///C:/Users/Titlee_Joy/Downloads/SomritaProject_Clustering.html 6/20
10/25/2020 SomritaProject_Clustering
Out[19]:
array([[<matplotlib.axes._subplots.AxesSubplot object at 0x0000028F91331F8
8>,
<matplotlib.axes._subplots.AxesSubplot object at 0x0000028F9133B54
8>,
<matplotlib.axes._subplots.AxesSubplot object at 0x0000028F91361B8
8>],
[<matplotlib.axes._subplots.AxesSubplot object at 0x0000028F9139B50
8>,
<matplotlib.axes._subplots.AxesSubplot object at 0x0000028F913CFF0
8>,
<matplotlib.axes._subplots.AxesSubplot object at 0x0000028F9140B90
8>],
[<matplotlib.axes._subplots.AxesSubplot object at 0x0000028F9144628
8>,
<matplotlib.axes._subplots.AxesSubplot object at 0x0000028F91479F0
8>,
<matplotlib.axes._subplots.AxesSubplot object at 0x0000028F9148C18
8>]],
dtype=object)
file:///C:/Users/Titlee_Joy/Downloads/SomritaProject_Clustering.html 7/20
10/25/2020 SomritaProject_Clustering
file:///C:/Users/Titlee_Joy/Downloads/SomritaProject_Clustering.html 8/20
10/25/2020 SomritaProject_Clustering
In [20]:
plt.figure(figsize=[10,5])
sns.pairplot(data_bank)
plt.show()
<Figure size 720x360 with 0 Axes>
file:///C:/Users/Titlee_Joy/Downloads/SomritaProject_Clustering.html 9/20
10/25/2020 SomritaProject_Clustering
In [21]:
corr = data_bank.corr()
corr
Out[21]:
spending advance_payments probability_of_full_payment cur
spending 1.000000 0.994341 0.608900
advance_payments 0.994341 1.000000 0.529925
probability_of_full_payment 0.608900 0.529925 1.000000
current_balance 0.949985 0.972422 0.368419
credit_limit 0.970771 0.944829 0.762218
min_payment_amt -0.229619 -0.217051 -0.335071
max_spent_in_single_shopping 0.863693 0.890784 0.227140
In [22]:
plt.figure(figsize=(12,7))
sns.heatmap(data_bank.corr(), annot=True, fmt='.2f', cmap='Blues')
plt.show()
In [23]:
# importing the StandardScaler Module
from sklearn.preprocessing import StandardScaler
file:///C:/Users/Titlee_Joy/Downloads/SomritaProject_Clustering.html 10/20
10/25/2020 SomritaProject_Clustering
In [24]:
X = StandardScaler()
X
Out[24]:
StandardScaler()
In [25]:
scaled_df= X.fit_transform(data_bank)
In [26]:
scaled_df = pd.DataFrame(scaled_df, index=data_bank.index, columns=data_bank.columns)
scaled_df.head()
Out[26]:
spending advance_payments probability_of_full_payment current_balance credit_limit m
0 1.754355 1.811968 0.177628 2.367533 1.338579
1 0.393582 0.253840 1.505071 -0.600744 0.858236
2 1.413300 1.428192 0.505234 1.401485 1.317348
3 -1.384034 -1.227533 -2.571391 -0.793049 -1.639017
4 1.082581 0.998364 1.198738 0.591544 1.155464
file:///C:/Users/Titlee_Joy/Downloads/SomritaProject_Clustering.html 11/20
10/25/2020 SomritaProject_Clustering
In [27]:
columns=('spending', 'advance_payments', 'probability_of_full_payment',
'current_balance', 'credit_limit', 'min_payment_amt',
'max_spent_in_single_shopping')
plt.figure(figsize=[20,40])
for i in range(len(columns)):
plt.subplot(9,2,i+1)
sns.boxplot(scaled_df[columns[i]])
file:///C:/Users/Titlee_Joy/Downloads/SomritaProject_Clustering.html 12/20
10/25/2020 SomritaProject_Clustering
In [28]:
lrincome,urincome=remove_outlier(scaled_df['probability_of_full_payment'])
scaled_df['probability_of_full_payment']=np.where(scaled_df['probability_of_full_paymen
t']>urincome,urincome,scaled_df['probability_of_full_payment'])
scaled_df['probability_of_full_payment']=np.where(scaled_df['probability_of_full_paymen
t']<lrincome,lrincome,scaled_df['probability_of_full_payment'])
In [29]:
columns=('spending', 'advance_payments', 'probability_of_full_payment',
'current_balance', 'credit_limit', 'min_payment_amt',
'max_spent_in_single_shopping')
plt.figure(figsize=[20,40])
for i in range(len(columns)):
plt.subplot(9,2,i+1)
sns.boxplot(scaled_df[columns[i]])
In [30]:
from scipy.cluster.hierarchy import dendrogram, linkage
In [31]:
wardlink = linkage(scaled_df, method = 'ward')
file:///C:/Users/Titlee_Joy/Downloads/SomritaProject_Clustering.html 13/20
10/25/2020 SomritaProject_Clustering
In [32]:
dend=dendrogram(wardlink)
In [33]:
dend=dendrogram(wardlink,truncate_mode='lastp',p=10)
In [34]:
from scipy.cluster.hierarchy import fcluster
In [35]:
clusters=fcluster(wardlink,3,criterion='maxclust')
In [36]:
data_bank['clusters']=clusters
file:///C:/Users/Titlee_Joy/Downloads/SomritaProject_Clustering.html 14/20
10/25/2020 SomritaProject_Clustering
In [37]:
data_bank.head()
Out[37]:
spending advance_payments probability_of_full_payment current_balance credit_limit mi
0 19.94 16.92 0.875200 6.675 3.763
1 15.99 14.89 0.906400 5.363 3.582
2 18.95 16.42 0.882900 6.248 3.755
3 10.83 12.96 0.810588 5.278 2.641
4 17.99 15.86 0.899200 5.890 3.694
In [38]:
data_bank.clusters.value_counts().sort_index()
Out[38]:
1 70
2 67
3 73
Name: clusters, dtype: int64
In [39]:
aggdata=data_bank.groupby('clusters').mean()
aggdata['Freq']=data_bank.clusters.value_counts().sort_index()
aggdata
Out[39]:
spending advance_payments probability_of_full_payment current_balance credit_li
clusters
1 18.371429 16.145429 0.884400 6.158171 3.6846
2 11.872388 13.257015 0.848155 5.238940 2.8485
3 14.199041 14.233562 0.879190 5.478233 3.2264
In [40]:
data_bank.to_csv("HierarchicalProject.csv")
In [41]:
from sklearn.cluster import AgglomerativeClustering
file:///C:/Users/Titlee_Joy/Downloads/SomritaProject_Clustering.html 15/20
10/25/2020 SomritaProject_Clustering
In [42]:
clusterAgglo = AgglomerativeClustering(n_clusters=3, affinity='euclidean', linkage='ave
rage')
Cluster_agglo=clusterAgglo.fit_predict(scaled_df)
print(Cluster_agglo)
[1 0 1 2 1 0 2 2 1 2 1 1 2 1 0 0 0 2 2 2 2 2 1 2 0 1 0 2 2 2 2 2 2 0 2 2 2
2 2 1 1 0 1 1 2 2 0 1 1 1 2 1 1 1 1 1 2 2 2 1 0 2 2 1 0 1 1 0 1 2 0 2 1 1
2 1 0 2 1 0 0 0 0 1 2 1 1 1 1 0 0 1 0 2 2 1 1 1 2 1 0 1 0 1 0 1 1 2 0 1 1
0 1 2 2 1 0 0 2 1 0 2 2 2 0 0 1 2 0 0 2 0 0 1 2 1 1 2 1 0 0 0 2 2 2 2 1 2
0 2 0 2 0 1 0 0 2 2 0 1 1 2 1 1 1 2 1 0 0 2 0 2 0 1 1 1 0 2 0 2 0 2 0 0 1
1 0 1 0 2 0 0 2 1 0 1 1 2 1 2 0 0 0 2 1 0 1 0 0 1]
In [43]:
data_bank["Agglo_CLusters"]=Cluster_agglo
In [45]:
data_bank.head(10)
Out[45]:
spending advance_payments probability_of_full_payment current_balance credit_limit mi
0 19.94 16.92 0.875200 6.675 3.763
1 15.99 14.89 0.906400 5.363 3.582
2 18.95 16.42 0.882900 6.248 3.755
3 10.83 12.96 0.810588 5.278 2.641
4 17.99 15.86 0.899200 5.890 3.694
5 12.70 13.41 0.887400 5.183 3.091
6 12.02 13.33 0.850300 5.350 2.810
7 13.74 14.05 0.874400 5.482 3.114
8 18.17 16.26 0.863700 6.271 3.512
9 11.23 12.88 0.851100 5.140 2.795
file:///C:/Users/Titlee_Joy/Downloads/SomritaProject_Clustering.html 16/20
10/25/2020 SomritaProject_Clustering
In [46]:
agglo_data=data_bank.groupby('Agglo_CLusters').mean()
agglo_data['Freq']=data_bank.Agglo_CLusters.value_counts().sort_index()
agglo_data
Out[46]:
spending advance_payments probability_of_full_payment current_balance
Agglo_CLusters
0 14.217077 14.195846 0.884869 5.442000
1 18.129200 16.058000 0.881595 6.135747
2 11.916857 13.291000 0.846845 5.258300
In [48]:
#KMeans
k_means = KMeans(n_clusters = 3)
k_means.fit(scaled_df)
k_means.inertia_
Out[48]:
430.298481751223
In [50]:
k_means = KMeans(n_clusters = 2)
k_means.fit(scaled_df)
k_means.inertia_
Out[50]:
659.1474009548498
In [51]:
k_means = KMeans(n_clusters = 1)
k_means.fit(scaled_df)
k_means.inertia_
Out[51]:
1469.9999999999995
In [53]:
k_means = KMeans(n_clusters = 4)
k_means.fit(scaled_df)
k_means.inertia_
Out[53]:
370.8685962394206
file:///C:/Users/Titlee_Joy/Downloads/SomritaProject_Clustering.html 17/20
10/25/2020 SomritaProject_Clustering
In [54]:
k_means = KMeans(n_clusters = 5)
k_means.fit(scaled_df)
k_means.inertia_
Out[54]:
325.9098750065543
In [55]:
wss =[]
In [56]:
for i in range(1,11):
KM = KMeans(n_clusters=i)
KM.fit(scaled_df)
wss.append(KM.inertia_)
In [57]:
wss
Out[57]:
[1469.9999999999995,
659.1474009548498,
430.298481751223,
371.4400252695771,
327.39077808486644,
289.50583097697313,
262.5952276605776,
239.57831775716886,
223.42523703383455,
208.7028049634438]
In [58]:
plt.plot(range(1,11), wss)
Out[58]:
[<matplotlib.lines.Line2D at 0x28f968b9a88>]
file:///C:/Users/Titlee_Joy/Downloads/SomritaProject_Clustering.html 18/20
10/25/2020 SomritaProject_Clustering
In [59]:
k_means = KMeans(n_clusters = 3)
k_means.fit(scaled_df)
labels = k_means.labels_
In [61]:
data_bank["Clus_kmeans"] = labels
data_bank.head(7)
Out[61]:
spending advance_payments probability_of_full_payment current_balance credit_limit mi
0 19.94 16.92 0.875200 6.675 3.763
1 15.99 14.89 0.906400 5.363 3.582
2 18.95 16.42 0.882900 6.248 3.755
3 10.83 12.96 0.810588 5.278 2.641
4 17.99 15.86 0.899200 5.890 3.694
5 12.70 13.41 0.887400 5.183 3.091
6 12.02 13.33 0.850300 5.350 2.810
In [62]:
from sklearn.metrics import silhouette_samples, silhouette_score
In [63]:
silhouette_score(scaled_df,labels)
Out[63]:
0.4008059221522216
In [64]:
sil_width = silhouette_samples(scaled_df,labels)
file:///C:/Users/Titlee_Joy/Downloads/SomritaProject_Clustering.html 19/20
10/25/2020 SomritaProject_Clustering
In [65]:
data_bank["sil_width"] = sil_width
data_bank.head(7)
Out[65]:
spending advance_payments probability_of_full_payment current_balance credit_limit mi
0 19.94 16.92 0.875200 6.675 3.763
1 15.99 14.89 0.906400 5.363 3.582
2 18.95 16.42 0.882900 6.248 3.755
3 10.83 12.96 0.810588 5.278 2.641
4 17.99 15.86 0.899200 5.890 3.694
5 12.70 13.41 0.887400 5.183 3.091
6 12.02 13.33 0.850300 5.350 2.810
In [71]:
kmeansdata=data_bank.groupby('Clus_kmeans').mean()
kmeansdata['Freq']=data_bank.Clus_kmeans.value_counts().sort_index()
kmeansdata
Out[71]:
spending advance_payments probability_of_full_payment current_balance cre
Clus_kmeans
0 11.856944 13.247778 0.848330 5.231750 2
1 18.495373 16.203433 0.884210 6.175687
2 14.437887 14.337746 0.881597 5.514577
In [72]:
silhouette_samples(scaled_df,labels).min()
Out[72]:
0.002768541128616533
In [ ]:
file:///C:/Users/Titlee_Joy/Downloads/SomritaProject_Clustering.html 20/20