Phase 4
Phase 4
Phase 4
PROGRAM :
1. Import the libraries
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import numpy as np
from scipy.stats import iqr
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
# Preview the first rows of `data`.
# NOTE(review): `data` is not defined anywhere in the visible code — presumably
# a DataFrame loaded in an omitted step 2; confirm before running.
data.head()
Output:
3. Create a dataset
import datetime  # required for timedelta below; not imported with the other libraries

# Build a Recency / Frequency / MonetaryValue table per customer from a
# reproducible 10,000-row sample of the transactions in `df`.
# NOTE(review): `df` must already be loaded with the columns used here
# (CustomerID, InvoiceNo, InvoiceDate, Quantity, UnitPrice) — the load step
# is not shown before this point; confirm.
df_s = df.sample(10000, random_state=42)

# Reduce timestamps to calendar dates (the .dt accessor requires a datetime column).
df_s["InvoiceDate"] = df_s["InvoiceDate"].dt.date
df_s["TotalSum"] = df_s["Quantity"] * df_s["UnitPrice"]

# Reference point: the day after the latest invoice in the sample, so every
# customer's recency is at least one day.
snapshot_date = max(df_s.InvoiceDate) + datetime.timedelta(days=1)

customers = df_s.groupby(['CustomerID']).agg({
    # Days between the snapshot date and the customer's latest invoice.
    'InvoiceDate': lambda x: (snapshot_date - x.max()).days,
    # Number of sampled invoice rows for the customer.
    'InvoiceNo': 'count',
    # Total amount across the customer's sampled rows.
    'TotalSum': 'sum',
})
customers.rename(
    columns={
        'InvoiceDate': 'Recency',
        'InvoiceNo': 'Frequency',
        'TotalSum': 'MonetaryValue',
    },
    inplace=True,
)
# `display` is provided by IPython/Jupyter; this line assumes a notebook session.
display(customers.head())
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Load the mall-customers CSV. The path is kept as one unbroken string
# literal: a line break inside the quotes is a syntax error.
df = pd.read_csv(
    "../input/customer-segmentation-tutorial-in-python/Mall_Customers.csv"
)
# Preview the first rows.
df.head()
4. DATA VISUALIZATION:
# Bucket customers into age brackets and plot how many fall in each one.
# Ages below 18 are not counted in any bracket.
age18_25 = df.Age[(df.Age <= 25) & (df.Age >= 18)]
age26_35 = df.Age[(df.Age <= 35) & (df.Age >= 26)]
age36_45 = df.Age[(df.Age <= 45) & (df.Age >= 36)]
age46_55 = df.Age[(df.Age <= 55) & (df.Age >= 46)]
age55above = df.Age[df.Age >= 56]  # plotted under the "55+" label

x = ["18-25", "26-35", "36-45", "46-55", "55+"]
# One count per label in `x`, in the same order.
y = [
    len(age18_25),
    len(age26_35),
    len(age36_45),
    len(age46_55),
    len(age55above),
]

plt.figure(figsize=(15, 6))
sns.barplot(x=x, y=y, palette="rocket")
plt.title("Number of Customer and Ages")
plt.xlabel("Age")
plt.ylabel("Number of Customer")
plt.show()
# Scatter plot of Age against Income, coloured by the 'Labels' column.
# NOTE(review): `df_std` is not created in the visible code; it must provide
# 'Age', 'Income' and 'Labels' columns — confirm where it is built.
x_axis = df_std['Age']
y_axis = df_std['Income']
plt.figure(figsize=(10, 8))
# x and y are passed by keyword: seaborn 0.12+ no longer accepts them
# positionally.
sns.scatterplot(
    x=x_axis,
    y=y_axis,
    hue=df_std['Labels'],
    palette=['g', 'r', 'c', 'm'],  # one colour per label; assumes four distinct labels
)
plt.title('Segmentation K-means')
plt.show()
5. MODEL DEVELOPMENT:
# Reload the raw data and attach a predicted cluster to each row.
df = pd.read_csv('Mall_Customers.csv')
# CustomerID is dropped before the cluster column is attached.
df = df.drop(['CustomerID'], axis=1)

# NOTE(review): `model` and `PCA_components` are not defined in the visible
# code — presumably a fitted clustering model and a DataFrame of principal
# components whose rows align with `df`; confirm.
# Only the first two component columns are passed to the model.
pred = model.predict(PCA_components.iloc[:, :2])

frame = pd.DataFrame(df)
frame['cluster'] = pred
frame.head()
6. FREQUENCY:
# Count the non-null 'Date' entries per customer and join the counts back
# onto `df_rec`.
# NOTE(review): `df_rec` is not defined in the visible code; it must provide
# 'CustomerID' and 'Date' columns — confirm.
freq = df_rec.groupby('CustomerID')['Date'].count()

# Turn the per-customer counts into a two-column table: CustomerID, frequency.
df_freq = freq.reset_index(name='frequency')

# Default (inner) join on CustomerID.
rec_freq = pd.merge(df_freq, df_rec, on='CustomerID')