0% found this document useful (0 votes)

64 views70 pages

Data Visualization

Uploaded by

fashionmusicowner

We take content rights seriously. If you suspect this is your content, claim it here.

Available Formats

Download as PDF, TXT or read online on Scribd

0% found this document useful (0 votes)

64 views70 pages

Data Visualization

Uploaded by

fashionmusicowner

We take content rights seriously. If you suspect this is your content, claim it here.

Available Formats

Download as PDF, TXT or read online on Scribd

You are on page 1/ 70

1

Machine Learning Visualization from Basic to Advanced

Syed Afroz Ali

Data Scientist (Kaggle Grandmaster)
2

Syed Afroz Ali

Data Scientist (Kaggle Grandmaster)
3

Syed Afroz Ali

Data Scientist (Kaggle Grandmaster)
4

Syed Afroz Ali

Data Scientist (Kaggle Grandmaster)
5

Syed Afroz Ali

Data Scientist (Kaggle Grandmaster)
6

Barplot:
sns.barplot(x='day', y='total_bill', data=tips, palette='tab10');

Boxplot
sns.boxplot(x='day', y='total_bill', hue='sex', data=tips, linewidth
=2.5, palette='Dark2');

Kdeplot
sns.kdeplot(data=df , x='Age', hue='Sex', multiple='stack', palette='tab10');

Syed Afroz Ali

Data Scientist (Kaggle Grandmaster)
7

Violinplot
sns.violinplot(x="day", y="total_bill", data=tips);

Stripplot
sns.stripplot(x="time", y="total_bill", hue="sex", data=tips);

Scatterplot
sns.scatterplot(x = 'total_bill', y = 'tip', hue = 'sex', data = tips);

Syed Afroz Ali

Data Scientist (Kaggle Grandmaster)
8

Swarmplot
sns.swarmplot(x="day", y="total_bill", hue="sex", data=tips);

Boxenplot
sns.boxenplot( x='time', y="total_bill", hue='sex', data=tips);

Lineplot
sns.lineplot(x="size",y="total_bill",data=tips,hue='sex',markers=True);

Syed Afroz Ali

Data Scientist (Kaggle Grandmaster)
9

Jointplot
sns.jointplot(x="chol", y="thalachh",data=heart,kind="hist",hue='sex');

Jointplot
sns.jointplot(x="chol",y="trtbps",data=heart,kind="kde",hue='sex');

JointGrid
g = sns.JointGrid(data=heart, x="age", y="chol", hue="output")
g.plot(sns.scatterplot, sns.histplot);

Syed Afroz Ali

Data Scientist (Kaggle Grandmaster)
10

Lmplot
g= sns.lmplot(x="age", y="chol", hue="cp", data=heart)

Relplot
g = sns.relplot(x="age", y="chol", data=heart,hue='sex')

Heatmap
# Correlation heatmap of `tips`, showing only the cells below the diagonal.
# numeric_only=True is needed on pandas >= 2.0, where DataFrame.corr() no
# longer drops non-numeric columns silently (this file uses day, sex, time and
# smoker as categorical axes).
tips_corr = tips.corr(numeric_only=True)
# True cells are hidden by sns.heatmap: the upper triangle and the diagonal.
mask = np.triu(np.ones_like(tips_corr, dtype=bool))
sns.heatmap(tips_corr, mask=mask, annot=True, cmap='Dark2');

Syed Afroz Ali

Data Scientist (Kaggle Grandmaster)
11

Catplot
sns.catplot(x='smoker', col='sex', kind='count', data=tips
,palette="Dark2");

Violinplot
# One violin per wine measurement, placed at x = 1, 2, 3, with the mean marked.
# The column name 'free sulfur dioxide' is kept on a single line: a quoted
# string cannot contain a raw line break.
plt.violinplot(
    [wine["alcohol"], wine['fixed acidity'], wine['free sulfur dioxide']],
    positions=[1, 2, 3],
    showmeans=True,
);

Distplot
bar = sns.distplot(titanic["Age"],color='Blue',kde=True,bins=25)
bar.legend(["Skewness: {:.2f}".format(titanic['Age'].skew())])
plt.title("Age Distribution");

Syed Afroz Ali

Data Scientist (Kaggle Grandmaster)
12

# Mean Age, Fare and Pclass for each value of Sex, drawn as grouped bars.
# Several columns are selected with a list ([[...]]): indexing a GroupBy with
# a bare tuple of names was deprecated and raises on pandas >= 2.0.
titanic.groupby("Sex")[["Age", "Fare", "Pclass"]].mean().plot(kind='bar')

color = plt.cm.copper(np.linspace(0, 1, 10))

titanic.groupby(['Embarked','Sex'])['Age'].count().plot(kind='bar',
width=.4,color='gold');

Syed Afroz Ali

Data Scientist (Kaggle Grandmaster)
13

sns.displot(data=titanic, x="Age", kde=True, bins = 100,color =

"red", facecolor = "#3F7F7F",height = 5, aspect = 3.5);

# Histogram of total_bill over hand-picked, uneven bin edges.
# rwidth=0.5 draws each bar at half the width of its bin.
plt.hist(
    tips['total_bill'],
    color='orange',
    bins=[10, 15, 25, 30, 50],
    edgecolor='black',
    rwidth=0.5,
);

Syed Afroz Ali

Data Scientist (Kaggle Grandmaster)
14

# Median of every numeric column per passenger class; only Fare is plotted.
# The pivot is restricted to numeric columns because taking a median of text
# columns raises on pandas >= 2.0 (older versions dropped them silently).
ht = pd.pivot_table(
    data=titanic.select_dtypes(include="number"),
    index="Pclass",
    aggfunc="median",
)

sns.barplot(x=ht.index, y=ht['Fare'])
# The bars are medians, so the title says so (it previously read "Proportion").
plt.title("Median Fare by Pclass", fontweight="bold");

Syed Afroz Ali

Data Scientist (Kaggle Grandmaster)
15

g = sns.FacetGrid(heart, col="fbs", hue="cp")

g.map_dataframe(sns.scatterplot, x="age", y="chol")
g.add_legend();

g = sns.FacetGrid(heart, col="cp")
g = g.map(plt.hist, "age");

Syed Afroz Ali

Data Scientist (Kaggle Grandmaster)
16

# Age density curves, one colour per value of `output`, on one wide facet.
fig = sns.FacetGrid(heart, hue="output", aspect=4)

# fill=True replaces the deprecated shade=True keyword of sns.kdeplot.
fig.map(sns.kdeplot, 'age', fill=True)
oldest = heart['age'].max()
fig.set(xlim=(0, oldest))  # x axis runs from 0 to the largest age in the data
fig.add_legend()
plt.show()

sns.FacetGrid(titanic, hue='Sex', height=4).map(sns.distplot,

'Age').add_legend();

# Stacked bars of passenger counts: one bar per Sex, one segment per Embarked value.
pd.crosstab(titanic['Sex'], titanic['Embarked']).plot(kind='bar', stacked=True);
Syed Afroz Ali
Data Scientist (Kaggle Grandmaster)
17

grid = sns.FacetGrid(titanic, row='Embarked', aspect=1.6)

grid.map(sns.pointplot, 'Pclass', 'Survived', 'Sex', palette='deep')
grid.add_legend();

plt.figure(figsize=(10,4))
print("Skewness: %f" % titanic['Fare'].skew())
Syed Afroz Ali
Data Scientist (Kaggle Grandmaster)
18

print("Kurtosis: %f" % titanic['Fare'].kurt())

sns.distplot(titanic['Fare'],bins=50,hist_kws={"edgecolor": (1,0,0,1)})
plt.show()

# Density curve of total_bill with its mean, median and mode marked.
a = tips['total_bill']
mean = a.mean()
median = np.median(a)
mode = a.mode()
# sns.kdeplot is the replacement for the deprecated distplot(..., hist=False).
sns.kdeplot(a)
plt.axvline(mean, color='r', label='mean')
plt.axvline(median, color='b', label='median')
# Series.mode() can return several values; only the first is drawn.
# .iloc makes the positional lookup explicit.
plt.axvline(mode.iloc[0], color='g', label='mode')
plt.legend()
plt.show()

plt.boxplot(a)
plt.text(0.85,13,s='Q1',size=13)
plt.text(0.85,17,s='Q2',size=13)
plt.text(0.85,23,s='Q3',size=13)
Syed Afroz Ali
Data Scientist (Kaggle Grandmaster)
19

plt.text(1.1,16,s='IQR',rotation=0,size=10)
plt.show()

# Count plots of the categorical columns, split by Pclass, on a 5x2 grid.
cat = ['Sex', 'Embarked']
sns.set_theme(
    rc={
        'figure.dpi': 100,
        'axes.labelsize': 12,
        'axes.facecolor': '#f0eee9',
        'grid.color': '#fffdfa',
        'figure.facecolor': '#e8e6e1',
    },
    font_scale=1.2,
)
fig, ax = plt.subplots(5, 2, figsize=(12, 22))
grid_cells = ax.flatten()

for column, axes in zip(cat, grid_cells):
    sns.countplot(ax=axes, x=titanic[column], hue=titanic['Pclass'],
                  palette='magma', alpha=0.8)

# Hide the grid cells that received no plot (previously done in a for/else
# clause with a side-effect list comprehension).
for unused in grid_cells[len(cat):]:
    unused.set_visible(False)

plt.tight_layout()
plt.show()

num = wine.select_dtypes(include="number")
Syed Afroz Ali
Data Scientist (Kaggle Grandmaster)
20

fig, ax = plt.subplots(14, 1, figsize = (7, 30))

for indx, (column, axes) in list(enumerate(list(zip(num, ax.flatten())))):

sns.scatterplot(ax = axes, y = wine[column].index, x = wine[column],hue =

wine['total sulfur dioxide'],
palette = 'magma', alpha = 0.8)

else:
[axes.set_visible(False) for axes in ax.flatten()[indx + 1:]]
plt.tight_layout()
plt.show()

num = heart.select_dtypes(include="number")
fig, ax = plt.subplots(3, 2, figsize = (14, 15))
for indx, (column, axes) in list(enumerate(list(zip(num, ax.flatten())))):

sns.histplot(ax = axes, x = heart[column],hue = heart['HeartDisease'],

palette = 'magma', alpha = 0.8, multiple = 'stack')

legend = axes.get_legend() # sns.hisplot has some issues with legend

Syed Afroz Ali
Data Scientist (Kaggle Grandmaster)
21

handles = legend.legendHandles
legend.remove()
axes.legend(handles, ['0', '1'], title = 'HeartDisease', loc = 'upper right')
Quantiles = np.quantile(heart[column], [0, 0.25, 0.50, 0.75, 1])

for q in Quantiles: axes.axvline(x = q, linewidth = 0.5, color = 'r')

plt.tight_layout()
plt.show()

raw_df = raw_df [['name', 'year', 'selling_price', 'km_driven', 'fuel',

'seller_type',
'transmission', 'owner']]
def barw(ax):
    """Annotate every bar of a horizontal bar chart with its length.

    For each patch in ``ax.patches`` the patch width, rounded to two
    decimals, is written at the patch's right edge, at mid-height of the bar.

    Parameters
    ----------
    ax : object with a ``patches`` sequence and an ``annotate(text, xy)``
        method; in this file it is the Axes returned by
        ``sns.countplot(..., y=...)``.
    """
    for p in ax.patches:
        val = p.get_width()  # bar length: horizontal bars extend along x
        x = p.get_x() + p.get_width()  # x position: right end of the bar
        y = p.get_y() + p.get_height() / 2  # y position: middle of the bar
        ax.annotate(round(val, 2), (x, y))
plt.figure(figsize=(10,5))
ax0 = sns.countplot(data = raw_df, y ='owner', order =
raw_df['owner'].value_counts().index)
barw(ax0)
plt.show()

Syed Afroz Ali

Data Scientist (Kaggle Grandmaster)
22

#Correlation with Response Variable class

X = heart.drop(['HeartDisease'], axis=1)
y = heart['HeartDisease']

X.corrwith(y).plot.bar(figsize=(16, 4), rot=90, grid=False)

plt.title('Correlation with heart',
fontsize=25,
color='Blue',
font='Times New Roman')
plt.show()

import matplotlib
matplotlib.rcParams.update({'font.size': 12})
corr = heart.corr()
mask = np.triu(np.ones_like(corr, dtype=bool))
plt.figure(dpi=100)
plt.title('Correlation Analysis',
fontsize=15,
color='Blue',
font='Lucida Calligraphy')
Syed Afroz Ali
Data Scientist (Kaggle Grandmaster)
23

sns.heatmap(corr,
mask=mask,
annot=True,
lw=0,
linecolor='white',
cmap='viridis',
fmt="0.2f")
plt.xticks(rotation=90)
plt.yticks(rotation=0)
plt.show()

# Pie chart of the share of each value in the Sex column.
matplotlib.rcParams.update({'font.size': 15})
sex_counts = heart['Sex'].value_counts()
# One explode offset per slice, so the call does not depend on there being
# exactly two categories.
ax = sex_counts.plot.pie(explode=[0.1] * len(sex_counts),
                         autopct='%1.2f%%', shadow=True);
ax.set_title(label="Sex", fontsize=40, color='DarkOrange',
             font='Lucida Calligraphy');
# Legend entries come from the counted values themselves. value_counts()
# orders by frequency, so a hard-coded ['M', 'F'] could label the slices the
# wrong way round.
plt.legend(labels=list(sex_counts.index))
plt.axis('off');

Syed Afroz Ali

Data Scientist (Kaggle Grandmaster)
24

#set configuration for charts

plt.rcParams["figure.figsize"]=[18 , 7]
plt.rcParams["font.size"]=15
plt.rcParams["legend.fontsize"]="medium"
plt.rcParams["figure.titlesize"]="medium"

def plot_disribution(data, x, color, bins):
    """Draw three side-by-side views of one numeric column and show them.

    Left panel: histogram with a density curve and a vertical line at the
    mean. Middle panel: box plot. Right panel: swarm plot.

    Parameters
    ----------
    data : DataFrame that holds the column.
    x : str
        Name of the column to plot.
    color : colour passed to the box plot and the swarm plot.
    bins : bin specification forwarded to the histogram.
    """
    mean = data[x].mean()
    std = data[x].std()
    info = dict(data=data, x=x, color=color)

    # Panel 1: histogram + density curve. The title is set once, below; the
    # title previously passed to plt.subplot() was always overwritten.
    plt.subplot(1, 3, 1)
    # histplot(kde=True, stat="density") is the documented replacement for
    # the deprecated sns.distplot.
    sns.histplot(data[x], bins=bins, kde=True, stat="density")
    plt.xlabel(f"bins of {x}")
    plt.axvline(mean, label="mean", color="red")
    plt.ylabel("frequency")
    # Raw string: "\s" is an invalid escape sequence in a normal literal.
    plt.legend([r"${\sigma}$ = %d" % std, f"mean = {mean:.2f}"])
    plt.title(f"histogram of {x} column")

    # Panel 2: box plot.
    plt.subplot(1, 3, 2)
    sns.boxplot(**info)
    plt.xlabel(f"{x}")
    plt.title(f"box plot of {x} column")

    # Panel 3: swarm plot of the individual points.
    plt.subplot(1, 3, 3)
    sns.swarmplot(**info)
    plt.xlabel(f"{x}")
    plt.title(f"distribution of points in {x} column")

    plt.suptitle(f"Distribution of {x} column", fontsize=20, color="red")
    plt.show()

age_bins = np.arange(29 , 77+5 , 5)

base_color = sns.color_palette()[4]
Syed Afroz Ali
Data Scientist (Kaggle Grandmaster)
25

plot_disribution(data = heart , x ="Age" , color = base_color ,

bins=age_bins)

# Bar plot of Age per Pclass for the first 14 rows of `titanic`, with the bar
# value written inside each bar.
sns.set_style("white")
sns.set_context("poster", font_scale=1.2)
palette = [
    "#1d7874", "#679289", "#f4c095", "#ee2e31", "#ffb563", "#918450", "#f85e00",
    "#a41623", "#9a031e", "#d6d6d6", "#ffee32", "#ffd100", "#333533", "#202020",
]
plt.subplots(figsize=(20, 8))
# x and y are taken from the same 14 rows. Previously x was sliced to 14
# values while y was the full Age column.
first_rows = titanic.head(14)
p = sns.barplot(x=first_rows["Pclass"], y=first_rows["Age"], palette=palette,
                saturation=1, edgecolor="#1c1c1c", linewidth=2)
# Title and axis labels name the plotted columns. The previous text ("Top
# Anime Community", "Total Member", "Anime Name") described a different chart.
p.axes.set_title("\nAge by Pclass\n", fontsize=25)
plt.ylabel("Age", fontsize=20)
plt.xlabel("\nPclass", fontsize=20)
plt.xticks(rotation=90)
for container in p.containers:
    p.bar_label(
        container, label_type="center", padding=6, size=25, color="black",
        rotation=90,
        bbox={"boxstyle": "round", "pad": 0.6, "facecolor": "orange",
              "edgecolor": "black", "alpha": 1},
    )

sns.despine(left=True, bottom=True)
plt.show()

Syed Afroz Ali

Data Scientist (Kaggle Grandmaster)
26

# 2x3 grid of count plots, one per categorical feature, split by Survived.
countfeature = ["Survived", "Pclass", "Sex", "SibSp", "Parch", "Embarked"]

countlist = list(enumerate(countfeature))

plt.figure(figsize=(15, 10))
plt.suptitle("Countplot of Categorical Features", fontsize=25, color='Red')
for position, feature in countlist:
    plt.subplot(2, 3, position + 1)  # subplot slots are numbered from 1
    sns.countplot(data=titanic, x=feature, hue="Survived", palette="rainbow")
    plt.ylabel("")
    plt.legend(['Not Survived', 'Survived'], loc='upper center',
               prop={'size': 10})
plt.tight_layout()
plt.show()

Syed Afroz Ali

Data Scientist (Kaggle Grandmaster)
27

# Figure 1: box plots (slots 1-2) and histograms (slots 3-4) of Age and Fare.
# Figure 2: density of Age and Fare, split by Survived.
numfeature = ["Age", "Fare"]

enumfeat = list(enumerate(numfeature))

plt.figure(figsize=(20, 7))
plt.suptitle("Distribution and Outliers of Numerical Data",
             fontsize=25, color='Blue')
for position, feature in enumfeat:
    plt.subplot(1, 4, position + 1)
    sns.boxplot(data=titanic[feature], palette="Dark2")
    plt.xlabel(str(feature))
for position, feature in enumfeat:
    plt.subplot(1, 4, position + 3)  # histograms go after the two box plots
    # No palette here: seaborn ignores `palette` when no `hue` is given.
    sns.histplot(data=titanic[feature], bins=15)
    plt.xlabel(str(feature))
plt.tight_layout()
plt.show()

plt.figure(figsize=(15, 7))
plt.suptitle(
    "Probability Distribution of numerical columns according to number of Survived",
    fontsize=25, color="Red")
for position, feature in enumfeat:
    plt.subplot(1, 2, position + 1)
    sns.kdeplot(data=titanic, x=feature, hue="Survived")
plt.tight_layout()
plt.show()

Syed Afroz Ali

Data Scientist (Kaggle Grandmaster)
28

# Horizontal bars of each feature's correlation with "Fire Alarm",
# largest first.
plt.figure(figsize=(12, 8))
data_4 = data.corr()["Fire Alarm"].sort_values(ascending=False)
indices = data_4.index
# Skip the first entry of the sorted series, as the original loop did by
# starting at 1 (presumably the target's correlation with itself - confirm).
# .iloc selects by position; plain data_4[i] with an integer on a
# label-indexed Series is deprecated in pandas 2.x.
remaining = data_4.iloc[1:]
labels = remaining.index.tolist()
corr = remaining.tolist()
sns.barplot(x=corr, y=labels, palette='mako')
plt.title('Correlation coefficient between different features and Fire Alarm ')
plt.show()

education=df['parental level of education'].value_counts()

sns.set_palette('bright')
plt.figure(figsize=(10,7))
labels=education.index
sizes=education.values
plt.pie(sizes,labels=labels,autopct='%1.1f%%',
shadow=True,startangle=90)
plt.show()

Syed Afroz Ali

Data Scientist (Kaggle Grandmaster)
29

import matplotlib
matplotlib.rcParams.update({'font.size': 15})
plt.figure(figsize=(18,9))
cols_out = ["RestingBP", "Cholesterol", "MaxHR", "Age",'ChestPainType']
sns.pairplot(heart[cols_out], hue="ChestPainType", diag_kind="hist",
palette="tab10") # tab10
plt.show();

fig, ax = plt.subplots(figsize = (18,8))

sns.countplot(x= wine["quality"])
plt.title("Wine Quality Count",fontsize=20,color='#1a4441',font='Comic
Sans Ms',pad=20)
plt.xlabel("Quality ",fontsize=15,color='#1a4441',font='Comic Sans Ms')
plt.ylabel("Count",fontsize=15,color='#1a4441',font='Comic Sans Ms');

total = len(wine)
for p in ax.patches:
Syed Afroz Ali
Data Scientist (Kaggle Grandmaster)
30

percentage = f'{100 * p.get_height() / total:.1f}%\n'

x = p.get_x() + p.get_width() / 2
y = p.get_height()
ax.annotate(percentage, (x, y), ha='center', va='center')

# Bar chart of per-column skewness, coloured by how strongly skewed it is.
print("Skewly distributed columns by skewness value:\n")

skew_df = wine.skew().sort_values()

# Split once into three groups. between() includes both ends, so a skewness
# of exactly 2 or -2 is drawn; the previous strict comparisons put it in no
# group.
near_normal = skew_df[skew_df.between(-2, 2)]
positive = skew_df[skew_df > 2]
negative = skew_df[skew_df < -2]

fig, ax = plt.subplots(figsize=(25, 7))
ax.bar(x=near_normal.index, height=near_normal, color="g",
       label="Semi-normal distribution")
ax.bar(x=positive.index, height=positive, color="r",
       label="Positively skewed features")
ax.bar(x=negative.index, height=negative, color="b",
       label="Negatively skewed features")
ax.legend()
fig.suptitle("Skewness of numerical columns", fontsize=20)
ax.tick_params(labelrotation=90);

Syed Afroz Ali

Data Scientist (Kaggle Grandmaster)
31

from wordcloud import WordCloud, STOPWORDS

# Word cloud built from every entry of the "Cuisines" column.
# dropna()/astype(str): str.join raises TypeError on missing (NaN) or other
# non-string entries.
text = " ".join(df["Cuisines"].dropna().astype(str))
#font = "Quicksand-Bold.ttf"
word_cloud = WordCloud(
    width=2300,
    height=800,
    colormap='jet',
    background_color="white",
).generate(text)
plt.figure(figsize=(50, 8))
plt.imshow(word_cloud, interpolation="gaussian")
plt.axis("off")
plt.show()

# Scatter of Age against Cholesterol, coloured by heart-disease status.
plt.figure(figsize=(10, 5))
# rows where HeartDisease == 1
plt.scatter(heart.Age[heart.HeartDisease == 1],
            heart.Cholesterol[heart.HeartDisease == 1],
            c="tomato")
# rows where HeartDisease == 0
plt.scatter(heart.Age[heart.HeartDisease == 0],
            heart.Cholesterol[heart.HeartDisease == 0],
            c="lightgreen")
# The y values are the Cholesterol column, so the title and y label name it
# (they previously said "Max Heart Rate").
plt.title("Heart Disease w.r.t Age and Cholesterol")
plt.xlabel("Age")
plt.legend(["Disease", "No Disease"])
plt.ylabel("Cholesterol");
Syed Afroz Ali
Data Scientist (Kaggle Grandmaster)
32

df2=df.groupby('Type Of Restaurant')['Cost Per

Head'].mean().sort_values(ascending=False)
plt.figure(figsize = (15,6))
color = [('b' if i < 500 else 'r') for i in df2]
df2.plot.bar(color=color);

# Grid of box plots: one continuous wine feature per panel, grouped by quality.
import math
cont_features = ['fixed acidity', 'volatile acidity', 'citric acid',
                 'free sulfur dioxide', 'pH', 'alcohol']

# Named n_cols/n_rows rather than y/x so the grid size does not overwrite
# data variables of the same name used by earlier snippets.
n_cols = 3
n_rows = math.ceil(len(cont_features) / n_cols)

# squeeze=False always returns a 2-D array of axes, even for a 1x1 grid.
fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, 10), squeeze=False)
panels = axes.ravel()
for feature, panel in zip(cont_features, panels):
    sns.boxplot(data=wine, y=feature, x='quality', ax=panel,
                palette=['#e60000', '#FAFAD2', '#660000', '#DEB078',
                         '#FF8C00', 'black'])
# Hide panels left over when the feature count does not fill the grid.
for spare in panels[len(cont_features):]:
    spare.set_visible(False)
plt.tight_layout()
plt.show()

Syed Afroz Ali

Data Scientist (Kaggle Grandmaster)
33

sns.pairplot(wine.drop(columns=['quality']),kind="reg",diag_kind='kde',plot
_kws={'line_kws':{'color':'red'}},corner=True)
plt.tight_layout()
plt.show()

Syed Afroz Ali

Data Scientist (Kaggle Grandmaster)
34

features = ['fixed acidity','citric acid','volatile acidity']

fig, axs = plt.subplots(1,3, figsize=(16,6))
for f, ax in zip(features,axs.ravel()):
sns.histplot(wine, x=f, ax=ax)
plt.show()

corr_mat_train = wine.drop(columns = ['quality'], axis = 1).corr()

threshold = 0.3
corr_threshold_train = corr_mat_train[(corr_mat_train > threshold) | (corr_mat_train
< -threshold)]
plt.figure(figsize = (8, 6))
sns.heatmap(corr_threshold_train, annot = True, cmap = 'seismic', fmt = ".2f",
linewidths = 0.5, cbar_kws={'shrink': .5},annot_kws={'size':
8}).set_title('Correlations Among Features (in Train)');

Syed Afroz Ali

Data Scientist (Kaggle Grandmaster)
35

import matplotlib.pyplot as plt

import seaborn as sns
%matplotlib inline

def missing_values(data, thresh=20, color='black', edgecolor='black',
                   height=3, width=15):
    """Plot the percentage of missing values per column as a bar chart.

    Columns are sorted from most to least missing. A red horizontal line is
    drawn at ``thresh`` percent, with one caption above and one below it.

    Parameters
    ----------
    data : DataFrame to inspect.
    thresh : percentage at which the reference line is drawn.
    color, edgecolor : passed to the bar plot.
    height, width : figure size in inches.

    Returns
    -------
    The result of ``plt.show()``.
    """
    plt.figure(figsize=(width, height))

    percentage = data.isnull().mean() * 100
    percentage.sort_values(ascending=False).plot.bar(color=color,
                                                     edgecolor=edgecolor)
    plt.axhline(y=thresh, color='r', linestyle='-')

    plt.title('Missing values percentage per column', fontsize=20,
              weight='bold')

    # Both captions start at the same x position: number of columns / 1.7.
    caption_x = len(data.columns) / 1.7
    plt.text(caption_x, thresh + 12.5,
             f'Columns with more than {thresh}% missing values',
             fontsize=12, color='crimson', ha='left', va='top')
    plt.text(caption_x, thresh - 5,
             f'Columns with less than {thresh}% missing values',
             fontsize=12, color='green', ha='left', va='top')
    plt.xlabel('Columns', size=15, weight='bold')
    plt.ylabel('Missing values percentage')
    plt.yticks(weight='bold')

    return plt.show()

missing_values(titanic, thresh = 10, color = sns.color_palette('Reds',15))

Syed Afroz Ali

Data Scientist (Kaggle Grandmaster)
36

# Donut chart of the share of each 'listed_in(type)' category.
type_counts = df['listed_in(type)'].value_counts()
labels = type_counts.index
sizes = type_counts.values
# Every slice is pulled out by the same offset. The tuple is sized from the
# data, so the chart still draws when there are not exactly seven categories.
explode = (0.1,) * len(sizes)
fig1, ax1 = plt.subplots(figsize=(8, 8))
ax1.pie(sizes, labels=labels,
        shadow=True, startangle=90, explode=explode, rotatelabels=True)
# A white disc over the centre turns the pie into a donut.
centre_circle = plt.Circle((0, 0), 0.70, fc='white')
fig = plt.gcf()
fig.gca().add_artist(centre_circle)
ax1.axis('equal')  # equal aspect ratio keeps the pie circular
plt.tight_layout()
plt.show()

plt.rcParams['figure.figsize'] = (18, 5)
Y = pd.crosstab(df['rate'], df['book_table'])
Y.div(Y.sum(1).astype(float), axis = 0).plot(kind = 'bar', stacked =
True,color=['red','yellow'])
plt.title('table booking vs Normal rate', fontweight = 30, fontsize = 20)
plt.legend(loc="upper right")
plt.show()

Syed Afroz Ali

Data Scientist (Kaggle Grandmaster)
37

# check distribution of Na_to_k (based on Drug_Type)

%matplotlib inline
# The bundled seaborn style sheets were renamed with a "seaborn-v0_8-" prefix
# in matplotlib 3.6 and the old names were later removed; use whichever name
# this matplotlib provides.
notebook_style = 'seaborn-v0_8-notebook'
if notebook_style not in plt.style.available:
    notebook_style = 'seaborn-notebook'
plt.style.use(notebook_style)
for i, label in enumerate(df.Drug_Type.unique().tolist()):
    # NOTE(review): labels come from df while rows come from df2, where
    # Drug_Type is compared with i + 1 - presumably df2 is an integer-encoded
    # copy of df; confirm the codes follow the order of unique().
    # fill=True replaces the deprecated shade=True keyword.
    sns.kdeplot(df2.loc[df2['Drug_Type'] == i + 1, 'Na_to_K'],
                label=label, fill=True)
plt.title('1. KDE of Na_to_k (based on Drug_Type)', fontdict=font,
          pad=15)
plt.xticks(np.arange(0, 46, 2), rotation=90)
plt.xlim([0, 46])
plt.legend()
plt.show()

# draw countplot and pie plot of categorical data

for col in categorical:
fig, axes = plt.subplots(1,2,figsize=(10,4))
# count of col (countplot)
sns.countplot(data=df2, x=col, ax=axes[0])
for container in axes[0].containers:
axes[0].bar_label(container)
# count of col (pie chart)
slices = df2[col].value_counts().values
activities = [f"{i} ({var})" for i, var in zip(df2[col].value_counts().index,
df[col].value_counts().index)]
axes[1].pie(slices, labels=activities, shadow=True, autopct='%1.1f%%')
plt.suptitle(f'Count of Unique Value in {col}', y=1.09, **font)
plt.show()

Syed Afroz Ali

Data Scientist (Kaggle Grandmaster)
38

# count of purchased based on Gender

%matplotlib inline
for col in ['Sex','BP','Cholesterol']:
ax = sns.countplot(data=df, x='Drug_Type', hue=col)
for container in ax.containers:
ax.bar_label(container)
plt.title(f'Count of Drug (based on {col})', fontdict=font, pad=15)
plt.show()

# Mean of Na_to_K based on each feature: bar chart of the group means on the
# left, box plot of the raw values on the right.

for col in ['Sex', 'BP', 'Cholesterol']:
    fig, ax = plt.subplots(1, 2, figsize=(10, 4))
    gp = df.groupby([col])['Na_to_K'].mean().to_frame().reset_index()
    sns.barplot(data=gp, x=col, y='Na_to_K', ax=ax[0])
    for container in ax[0].containers:
        ax[0].bar_label(container)  # write the mean on top of each bar
    ax[0].set_title(f'Mean of Na_to_K (based on {col})', y=1.09, **font)
    sns.boxplot(data=df, x=col, y='Na_to_K', ax=ax[1])
    # Title now matches the left panel's wording; it previously ended in an
    # unmatched ")".
    ax[1].set_title(f'Boxplot of Na_to_K (based on {col})', y=1.09, **font)
    plt.show()
Syed Afroz Ali
Data Scientist (Kaggle Grandmaster)
39

# Scatter of Na_to_K against Age on a 2x2 grid, coloured by a different
# categorical column in each panel.

fig, ax = plt.subplots(2, 2, figsize=(14, 8))
for i, col in enumerate(['Sex', 'BP', 'Cholesterol', 'Drug_Type']):
    panel = ax[i // 2, i % 2]  # fill the grid row by row
    sns.scatterplot(data=df, x='Age', y='Na_to_K', hue=col, ax=panel,
                    palette='turbo')
    # The closing parenthesis was missing from this title.
    panel.set_title(f'Na_to_K vs Age (based on {col})', y=1.09, **font)
    panel.legend(loc='upper center', bbox_to_anchor=(1.2, 0.6),
                 fancybox=True, shadow=True)

fig.tight_layout()
plt.show()

Syed Afroz Ali

Data Scientist (Kaggle Grandmaster)
40

fig, ax = plt.subplots(3,2,figsize=(14,12))
sns.swarmplot(data=df, x='Cholesterol', y='Na_to_K', hue='Drug_Type',
ax=ax[0,0])
sns.swarmplot(data=df, x='Cholesterol', y='Age', hue='Drug_Type',
ax=ax[0,1])
sns.swarmplot(data=df, x='BP', y='Na_to_K', hue='Drug_Type', ax=ax[1,0])
sns.swarmplot(data=df, x='BP', y='Age', hue='Drug_Type', ax=ax[1,1])
sns.swarmplot(data=df, x='Sex', y='Na_to_K', hue='Drug_Type', ax=ax[2,0])
sns.swarmplot(data=df, x='Sex', y='Age', hue='Drug_Type', ax=ax[2,1])
ax[0,0].set_title('Swarmplot of Drug Type vs Na_to_K',y=1.05, **font)
ax[0,1].set_title('Swarmplot of Drug Type vs Age',y=1.05, **font)
plt.tight_layout()
plt.show()

# Mean of Income and CCAvg based on each feature

for i, col in enumerate(['Income', 'CCAvg','Mortgage']):
print('='*30, f"Mean of {col} in each categorical feature", '='*30)
for j, cat in enumerate(discrete_cols2):
fig , ax= plt.subplots(1,2, figsize=(10,4))
gp = df.groupby([cat])[col].mean().to_frame().reset_index()
sns.barplot(data=gp, x=cat, y=col, ax=ax[0])
for container in ax[0].containers:
ax[0].bar_label(container)
ax[0].set_title(f'Mean of {col} (based on {cat})', y=1.09, **FONT)
sns.boxplot(data=df, x=cat, y=col, ax=ax[1])
ax[1].set_title(f'Boxplot of {cat} (Fig {i+11}-{j+1})', y=1.09,
**FONT)
plt.show()
Syed Afroz Ali
Data Scientist (Kaggle Grandmaster)
41

# One 3-D scatter per continuous column: Income (x) against the column (y)
# and Personal Loan (z), coloured by Personal Loan.
continuous_cols = ['Age', 'Experience', 'CCAvg', 'Mortgage']

for i, col in enumerate(continuous_cols):

    fig = px.scatter_3d(
        data_frame=df,
        x=df.Income,
        y=df[col],
        z=df['Personal Loan'],
        color=df['Personal Loan'].astype(str),
        color_discrete_map={'1': 'orange', '0': 'red'},
        template='ggplot2',
        hover_name='Age',
        # hover_data=
        opacity=0.6,
        # symbol='Transmission',
        # symbol_map=
        # log_x=True,
        # log_z=True,
        height=700,
        title=f'3D scatter of features based on Personal Loan (Fig {i+1})')
    # Only the title's styling is changed here. The previous
    # title_text="Box Plot Styling Outliers" replaced the per-figure title set
    # above with the same unrelated text on every figure.
    fig.update_layout(
        title_font=dict(color='orange', family='newtimeroman', size=25),
        title_x=0.45,
        paper_bgcolor='#145A32',
        # plot_bgcolor='#DAF7A6',
        font=dict(color='#DAF7A6', family='newtimeroman', size=16),
    )
    pio.show(fig)

Syed Afroz Ali

Data Scientist (Kaggle Grandmaster)
42

df["Type Of Restaurant"].value_counts()[:10].plot.pie(figsize = (10, 10),

autopct = '%1.0f%%')
plt.title("Pie Chart")
plt.xticks(rotation = 90)
plt.show()

df['city_1'].value_counts().nlargest(n=20, keep='first').plot.pie(figsize = (10, 10),

autopct = '%1.0f%%')
plt.title("Pie Chart")
plt.xticks(rotation = 90)
plt.show()
Syed Afroz Ali
Data Scientist (Kaggle Grandmaster)
43

plt.figure(figsize=(10, 5))
sns.set_context("paper")

kdeplt = sns.kdeplot(
data=heart_dft_chol_n0,
x="Cholesterol",
hue="Sex",
palette=sex_color,
alpha=0.7,
lw=2,
)
kdeplt.set_title("Cholesterol values distribution\n Male VS Female", fontsize=12)
kdeplt.set_xlabel("Cholesterol", fontsize=12)
plt.axvline(x=Chol_mean_f, color="#c90076", ls="--", lw=1.3)
plt.axvline(x=Chol_mean_m, color="#2986cc", ls="--", lw=1.3)
plt.text(108, 0.00612, "Mean Cholesterol / Male", fontsize=10, color="#2986cc")
plt.text(260, 0.006, "Mean Cholesterol / Female", fontsize=10, color="#c90076")
plt.show()

Syed Afroz Ali

Data Scientist (Kaggle Grandmaster)
44

heart_df_fg = sns.FacetGrid(
data=heart_dft_chol_n0,
col="Sex",
hue="Sex",
row="HeartDisease",
height=4,
aspect=1.3,
palette=sex_color,
col_order=["Male", "Female"],
)
heart_df_fg.map_dataframe(sns.regplot, "Age", "MaxHR")
plt.show()

mean_SalePrice = usa_housing_df[["SalePrice"]].mean().squeeze()
median_SalePrice = usa_housing_df[["SalePrice"]].median().squeeze()

plt.figure(figsize=(10, 5))
sns.set_context("paper")

histplt = sns.histplot(
data=usa_housing_df,
x="SalePrice",
color="#4f758f",
bins=60,
alpha=0.5,
Syed Afroz Ali
Data Scientist (Kaggle Grandmaster)
45

lw=2,
)
histplt.set_title("SalePrice Distribution", fontsize=12)
histplt.set_xlabel("SalePrice", fontsize=12)

plt.axvline(x=mean_SalePrice, color="#14967f", ls="--", lw=1.5)

plt.axvline(x=median_SalePrice, color="#9b0f33", ls="--", lw=1.5)
plt.text(mean_SalePrice + 5000, 175, "Mean SalePrice", fontsize=9,
color="#14967f")
plt.text(
median_SalePrice - 115000, 175, "Median SalePrice", fontsize=9,
color="#9b0f33"
)
histplt.xaxis.set_major_formatter(ticker.EngFormatter())
plt.ylim(0, 200)
plt.show()

df2 = titanic[['Survived','Pclass','Sex','Embarked','SibSp','Parch',"Age"]]

fig, axes = plt.subplots(1, 2)

fig.set_figheight(10)
fig.set_figwidth(20)
for i,col in enumerate(df2.select_dtypes('object')):
sns.boxplot(x="Age", y=col, data=df2,
whis=[0, 100], width=.6,ax=axes[i])

Syed Afroz Ali

Data Scientist (Kaggle Grandmaster)
46

df2 = titanic[['Survived','Pclass','Sex','Embarked','SibSp','Parch',"Age"]]
#create the subplots
f, (ax_box, ax_hist) = plt.subplots(2, sharex=True,
gridspec_kw={"height_ratios": (.15, .85)})
#title
ax_box.title.set_text('Price countplot and Boxplot')
# assigning a graph to each ax
sns.boxplot(df2["Age"], orient="h" ,ax=ax_box)
sns.histplot(data=df2, x="Age", ax=ax_hist)
# Remove x axis name for the boxplot
ax_box.set(xlabel='')
plt.show()

# Histogram with a density curve for every numeric wine column, with a dashed
# line at the column mean.
NUMERICAL = wine[['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar',
                  'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density',
                  'pH', 'sulphates', 'alcohol']]
# The grid is sized from the column count. The previous fixed 2x4 grid with
# row index (i // 4) - 1 had 8 panels for 11 columns, so the last three
# histograms were drawn on top of the first four.
n_cols = 4
n_rows = -(-NUMERICAL.shape[1] // n_cols)  # ceiling division
fig, axes = plt.subplots(n_rows, n_cols, squeeze=False)
fig.set_figheight(12)
fig.set_figwidth(16)
panels = axes.ravel()
for panel, col in zip(panels, NUMERICAL):  # iterating a DataFrame yields column names
    sns.histplot(wine[col], ax=panel, kde=True)
    panel.axvline(wine[col].mean(), color='k', linestyle='dashed',
                  linewidth=1)
# Hide the panels that have no column to show.
for spare in panels[NUMERICAL.shape[1]:]:
    spare.set_visible(False)

Syed Afroz Ali

Data Scientist (Kaggle Grandmaster)
47

fig, axes = plt.subplots(1, 3)

fig.set_figheight(7)
fig.set_figwidth(20)
sns.scatterplot(data=titanic, x="Age", y="Fare", hue="Survived", size="Survived",
ax=axes[0])
sns.scatterplot(data=titanic, x="Age", y="Fare", hue="Pclass", size="Pclass",
ax=axes[1])
sns.scatterplot(data=titanic, x="Age", y="Fare", hue="SibSp", size="SibSp",
ax=axes[2]);

# Bar chart of mean active power per month; the 3rd and 11th bars are
# highlighted in orange.
color = ['grey'] * 12

color[2], color[10] = 'orange', 'orange'
# Select the column before averaging: groupby(...).mean() over the whole
# frame averages every column and fails on non-numeric ones in pandas >= 2.0.
df.groupby('month')['active_power'].mean().plot(
    kind='bar', title='Average of Active Power of each Months',
    color=color, rot=0)
plt.ylabel('Active Power [kW]');

Syed Afroz Ali

Data Scientist (Kaggle Grandmaster)
48

# Scatter of actual against theoretical power, with the y = x reference line.
plt.title('Actual Power vs Theoretical Power')

plt.plot(df.theor_power, df.active_power, 'o', markersize=1)
# The first positional argument of plt.grid is the on/off flag; the set of
# grid lines is chosen with `which`. plt.grid('both') only switched the
# default (major) grid on.
plt.grid(True, which='both')
plt.xlabel('Theoretical Power (kW)')
plt.ylabel('Actual Power (kW)')
plt.plot([0, 3650], [0, 3650], '-', c='k')  # line where actual == theoretical
plt.show()

group_hours = df_demand['load'].groupby(pd.Grouper(freq='D', how='mean'))

fig, axs = plt.subplots(1,1, figsize=(12,5))
year_demands = pd.DataFrame()
for name, group in group_hours:
year_demands[name.year] = pd.Series(group.values)
year_demands.plot(ax=axs)
axs.set_xlabel('Hour of the day')
axs.set_ylabel('Energy Demanded MWh')
axs.set_title('Mean yearly energy demand by hour of the day ');

# Stacked Age histograms split by Survived, one panel per passenger class.
plot, ax = plt.subplots(1, 3, figsize=(14, 4))

survival_palette = sns.color_palette(["yellow", "green"])
for panel, pclass in zip(ax, (1, 2, 3)):
    sns.histplot(
        data=train_data.loc[train_data["Pclass"] == pclass],
        x="Age",
        hue="Survived",
        binwidth=5,
        ax=panel,
        palette=survival_palette,
        multiple="stack",
    ).set_title(f"{pclass}-Pclass")
plt.show()
Syed Afroz Ali
Data Scientist (Kaggle Grandmaster)
49

#Plotting the distributions of the numerical variables

color_plot = ['#de972c', '#74c91e', '#1681de', '#e069f5', '#f54545',
              '#f0ea46', '#7950cc']

fig, ax = plt.subplots(4, 2, figsize=(20, 20))
kde_columns = ['HeartDisease', 'Oldpeak', 'Age', 'FastingBS',
               'RestingBP', 'Cholesterol', 'MaxHR']
# ax.flat walks the grid row by row, which is the order the seven separate
# calls used. Each curve gets a randomly chosen colour from color_plot.
# fill=True replaces the deprecated shade=True keyword.
for panel, column in zip(ax.flat, kde_columns):
    sns.kdeplot(df[column], color=np.random.choice(color_plot), ax=panel,
                fill=True)
fig.delaxes(ax[3][1])  # seven plots on a 4x2 grid: remove the empty last panel

Syed Afroz Ali

Data Scientist (Kaggle Grandmaster)
50

# Lower-triangle correlation heatmap with two arrow annotations.
hm = df.drop('id', axis=1)
hm_corr = hm.corr()

# Builtin bool: the np.bool alias was removed in NumPy 1.24.
mask = np.zeros_like(hm_corr, dtype=bool)
mask[np.triu_indices_from(mask)] = True  # hide upper triangle and diagonal

plt.suptitle('Correlation', size=20, weight='bold')

ax = sns.heatmap(hm_corr, linewidths=0.9, linecolor='white', cbar=True,
                 mask=mask, cmap=heatmap)

ax.annotate('Low Correlation',
            fontsize=10, fontweight='bold',
            xy=(1.3, 3.5), xycoords='data',
            xytext=(0.6, 0.95), textcoords='axes fraction',
            arrowprops=dict(
                facecolor=heatmap[0], shrink=0.025,
                connectionstyle='arc3, rad=0.50'),
            horizontalalignment='left', verticalalignment='top'
            )

ax.annotate('High Correlation',
            fontsize=10, fontweight='bold',
            xy=(3.3, 7.5), xycoords='data',
            xytext=(0.8, 0.4), textcoords='axes fraction',
            arrowprops=dict(
                facecolor=heatmap[0], shrink=0.025,
                connectionstyle='arc3, rad=-0.6'),
            horizontalalignment='left', verticalalignment='top'
            )
plt.show()

Syed Afroz Ali

Data Scientist (Kaggle Grandmaster)
51

# Box plot of LDH per TARGET value with three horizontal reference lines.
fig = plt.figure(figsize=(8, 6))

ax = fig.add_axes([0, 0, 1, 1])  # axes fill the whole figure
# The disabled flier options are kept on one comment line; wrapped, their
# tail became a line of invalid code.
sns.boxplot(ax=ax, data=df, x='TARGET', y='LDH')  # ,flierprops=dict(marker='o', markersize=6),fliersize=2)

ax.axhline(y=550, color='b')
ax.axhline(y=650, color='orange')
ax.axhline(y=1200, color='g')

plt.suptitle('Target Variable', size=20, weight='bold')

# Count plot of popular vs unpopular songs with each bar's share of all rows
# written above it.
song_popularity = df['song_popularity'].map({0: 'UnPopular', 1: 'Popular'})

a = sns.countplot(data=df, x=song_popularity, palette=theme)

plt.tick_params(axis="x", colors=theme[0], labelsize=15)

total_rows = df.shape[0]
for p in a.patches:
    width = p.get_width()
    height = p.get_height()
    x, y = p.get_xy()
    # One decimal place: the unformatted float printed the full
    # floating-point expansion of the percentage.
    a.annotate(f'{height / total_rows * 100:.1f} %',
               (x + width / 2, y + height * 1.02), ha='center')

plt.show()

Syed Afroz Ali

Data Scientist (Kaggle Grandmaster)
52

cont = ['song_duration_ms', 'acousticness', 'danceability', 'energy',

'instrumentalness', 'liveness', 'loudness',
'speechiness', 'tempo', 'audio_valence']
cat = [ 'key', 'audio_mode', 'time_signature']

a = 4 # number of rows
b = 3 # number of columns
c = 1 # initialize plot counter

plt.figure(figsize= (18,18))

for i in cont:
plt.suptitle('Distribution of Features', size = 20, weight='bold')
plt.subplot(a, b, c)
A=sns.kdeplot(data= df, x=i,hue=song_popularity,palette=theme[:-2], linewidt
h = 1.3,shade=True, alpha=0.35)
plt.title(i)
plt.xlabel(" ")
c=c+1

#plotting
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(18, 9))
Syed Afroz Ali
Data Scientist (Kaggle Grandmaster)
53

fig.suptitle(' Highest and Lowest Correlation ', size = 20, weight='bold')

axs = [ax1, ax2]

#kdeplot
sns.kdeplot(data=df, y='energy', x='acousticness', ax=ax1, color=heatmap[0])
ax1.set_title('Energy vs Acousticness', size = 14, weight='bold', pad=20)

#kdeplot
sns.kdeplot(data=df, y='energy', x='loudness', ax=ax2, color=heatmap[4])
ax2.set_title('Energy vs Loudness', size = 14, weight='bold', pad=20);

#Parameters for Plots

plt.rcParams['figure.figsize'] = (10,6)
plt.rcParams['axes.edgecolor'] = 'black'
plt.rcParams['axes.linewidth'] = 1.5
plt.rcParams['figure.frameon'] = True
plt.rcParams['axes.spines.top'] = False
plt.rcParams['axes.spines.right'] = False
plt.rcParams["font.family"] = "monospace";

#Colors for charts

# Nine hex colours; later snippets pick entries from this list by index.
colors = ["#e9d9c8","#cca383","#070c23","#f82d06","#e8c195","#cd7551","#a49995","#a3a49c","#6c7470"]
sns.palplot(sns.color_palette(colors))

#plot
# The column is passed as `x=`: seaborn >= 0.12 treats the first positional
# argument as `data`, so a bare Series is no longer read as the x variable.
A = sns.countplot(x=train_df['case_num'],
                  color=colors[1],
                  edgecolor='white',
                  linewidth=1.5,
                  saturation=1.5)

#Patch
# Recolour the tallest bar so it stands out from the rest.
patch_h = [patch.get_height() for patch in A.patches]
idx_tallest = np.argmax(patch_h)
A.patches[idx_tallest].set_facecolor(colors[3])

#Labels
plt.ylabel('Count', weight='semibold', fontname = 'Georgia')
plt.xlabel('Cases', weight='semibold', fontname = 'Georgia')
plt.suptitle('Number of Cases', fontname = 'Georgia', weight='bold', size = 18, color = colors[2])
# Print each bar's count at its top edge.
A.bar_label(A.containers[0], label_type='edge')

plt.show()

import matplotlib as mlb

import matplotlib.image as mpimg
from matplotlib.offsetbox import AnnotationBbox, OffsetImage

#plotting
# Two panels: a count plot of the Potability column and a doughnut chart of
# the same counts with an image overlaid.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 11))
fig.suptitle(' Potablity of Water Quality ', size = 26, color = theme[3], weight='bold')
axs = [ax1, ax2]

#Count-Plot
# Passed as `x=`: seaborn >= 0.12 reads the first positional argument as `data`.
sns.countplot(x=water_df['Potability'], ax=ax1, palette='husl')
ax1.set_title('Count Plot', size = 14, color = theme[3], weight='bold', pad=20)

#Data-2
names = ["Not Potable", "Potable"]
# value_counts() orders by frequency, not by class value; sorting by index
# keeps the counts aligned with `names`.
# Assumes Potability is coded 0 = not potable, 1 = potable — TODO confirm.
values = water_df['Potability'].value_counts().sort_index()
colors = ["#E68193","#459E97"]
explode = (0.01, 0.01)

#Doughnut-chart
ax2.pie(x= values,labels =names, colors=colors,autopct='%1.0f%%', pctdistance=0.8,explode=explode)

#draw-circle
# A white disc over the centre turns the pie into a doughnut.
centre_circle = plt.Circle((0,0),0.62,fc='white')
ax2.add_artist(centre_circle)
ax2.axis('equal')

ax2.set_title('Pie Chart', size = 14, color = theme[3], weight='bold', pad=20)

#Image
path = mpimg.imread('../input/water/water bottle.png')

imagebox = OffsetImage(path , zoom=0.3)
xy = (0.5, 0.7)
ab = AnnotationBbox(imagebox, xy, frameon=False, pad=1, xybox=(0.02, 0.05))
ax2.add_artist(ab)

plt.subplots_adjust(left=None, bottom=None, right=None, top=0.8, wspace=0.4, hspace=None);

# One pie chart per column, each with its own two-colour scheme.
fig, ax = plt.subplots(ncols=3, figsize=(18,6))

colors = [['#ADEFD1FF', '#00203FFF'], ['#97BC62FF', '#2C5F2D'], ['#F5C7B8FF', '#FFA177FF']]
# Two explode offsets and two colours per chart: each column is assumed to
# have exactly two distinct values — TODO confirm.
explode = [0, 0.2]
columns = ['Parking', 'Warehouse', 'Elevator']
for axis, column, pair in zip(ax, columns, colors):
    data = df[column].value_counts()
    # Wedges are labelled with the counts; the legend carries the category names.
    axis.pie(data, labels=data.values, explode=explode, colors=pair, shadow=True)
    axis.legend(labels=data.index, fontsize='large')
    axis.set_title('{} distribution'.format(column))

Syed Afroz Ali

Data Scientist (Kaggle Grandmaster)
56

def plot_hist(feature):
    """Plot a histogram with KDE and a box plot for one ``titanic`` column.

    The upper panel shows the histogram with vertical lines at the mean,
    median and mode of the column; the lower panel shows the same column as
    a box plot. The figure is displayed with ``plt.show()``.

    Args:
        feature: Key used to select the column from the module-level
            ``titanic`` object.
    """
    values = titanic[feature]
    # Each statistic is computed once and reused for its line and its label.
    mean = values.mean()
    median = values.median()
    mode = statistics.mode(values)

    fig, ax = plt.subplots(2, 1, figsize=(17, 12))
    sns.histplot(data = values, kde = True, ax = ax[0],color="Brown")
    ax[0].axvline(x = mean, color = 'r', linestyle = '--',
                  linewidth = 2, label = 'Mean: {}'.format(round(mean, 3)))
    ax[0].axvline(x = median, color = 'orange', linewidth = 2,
                  label = 'Median: {}'.format(round(median, 3)))
    ax[0].axvline(x = mode, color = 'yellow',
                  linewidth = 2, label = 'Mode: {}'.format(mode))
    ax[0].legend()

    sns.boxplot(x = values, ax = ax[1],color="Brown")

    plt.show()
plot_hist('Age')

Syed Afroz Ali

Data Scientist (Kaggle Grandmaster)
57

plt.figure(figsize=(12,5))
# NOTE(review): the title and y label do not describe the plotted quantity
# (mean Fare per Embarked value); they look copied from another chart — confirm.
plt.title('top categories')
plt.ylabel('item_price')
# Mean fare per Embarked value, highest first, computed once and drawn twice
# (line with star markers, then bars) on the same axes.
mean_fare = titanic.groupby('Embarked')['Fare'].mean().sort_values(ascending=False)[0:15]
mean_fare.plot(kind='line', marker='*', color='red', ms=10)
mean_fare.plot(kind='bar',color=sns.color_palette("inferno_r", 7))
plt.show()

import matplotlib.pyplot as plt

import seaborn as sns

Syed Afroz Ali

Data Scientist (Kaggle Grandmaster)
58

# Scatter of the first two columns of `df`, coloured by `y`.
sns.scatterplot(x=df.iloc[:,0], y=df.iloc[:,1], hue=y)

# Each entry: (label, positional row of the annotated point, text position).
# NOTE(review): the first text position uses y = 1 while the others use
# multiples of 1e6 — confirm this is intended.
for label, row, text_xy in [
    ("KD65", 64, (8*1e6, 1)),
    ("KD99", 98, (8*1e6, 2*1e6)),
    ("control3", 107, (8*1e6, 3*1e6)),
    ("control13", 117, (8*1e6, 4*1e6)),
]:
    plt.annotate(label, (df.iloc[row,0], df.iloc[row,1]), text_xy,
                 arrowprops=dict(arrowstyle="->"), fontsize="xx-large",c='red')

# Difference, in percentage points, between the share of the second most
# frequent answer to Q3 in `df_current` and to Q2 in `df_old`.
l = df_current['Q3'].value_counts(normalize=True).mul(100).tolist()[1]-df_old['Q2'].value_counts(normalize=True).mul(100).values.tolist()[1]

print(5*'\n',"\033[1;32m Increase in Woman is only\033[1;32m",round(l, 2),'%\033[1;32m Over Last Year\033[1;32m',5*'\n')

fig, ax = plt.subplots(1, 2, figsize=(20,8))

fig.text(0.1, 0.95, "Visualisation of Gender Distribution for 2022 and 2021", fontsize=15, fontweight='bold')

sns.countplot(x='Q3', data=df_current,palette="Dark2", ax=ax[0]); #Current Year

sns.countplot(x='Q2', data=df_old,palette="Dark2",ax=ax[1]); #Last Year

# One (denominator, year) pair per panel. `shape` and `shape_21` are defined
# elsewhere; presumably the respondent totals of each survey — TODO confirm.
# A separate loop name keeps `ax` bound to the array of axes.
for panel, (t, year) in zip(ax.flatten(), [(shape, 2022), (shape_21, 2021)]):
    panel.grid(axis='y', linestyle='-', alpha=0.4)
    # Axis label and title are per panel, so they are set once per panel
    # rather than once per bar.
    panel.set_xlabel('Gender')
    panel.set_title("Gender Wise Distribution in "+ str(year))
    for p in panel.patches:
        percentage = f'{100 * p.get_height() / t:.2f}%\n'
        panel.annotate(percentage, (p.get_x() + p.get_width() / 2,p.get_height()), ha='center', va='center')
        # Grey out every bar except the one whose left edge lies in [0.5, 1.5).
        if not(0.5 <= p.get_x() < 1.5):
            p.set_facecolor('lightgrey')

plt.show()

Syed Afroz Ali

Data Scientist (Kaggle Grandmaster)
59

fig, ax = plt.subplots(1,2, figsize=(20,8))

fig.text(0.1, 0.95, "Age Distribution of Kaggle Users - 2022", fontsize=15, fontweight='bold')
# Count each age group once and reuse the result for the bars, the
# percentage labels and the pie.
age_counts = df_current['Q2'].value_counts()
# Denominator for the bar labels. The previous code divided by `t`, a
# variable left over from an earlier snippet; using the sum of the plotted
# counts makes the bar labels agree with the pie's own percentages.
total = age_counts.sum()
sns.barplot(x=age_counts.index, y=age_counts.values, ax=ax[0],
            edgecolor='black', linewidth=1.5, saturation=1.5)
ax[0].yaxis.set_major_locator(MaxNLocator(nbins=20));ax[0].grid(axis='y', linestyle='-', alpha=0.4)
ax[0].set_ylabel('Count', weight='semibold')
ax[0].set_xlabel('Age Group 2022', weight='semibold')
ax[1].set_xlabel('Pie Chart for Age Group 2022', weight='semibold')
for p in ax[0].patches:
    percentage = f'{100 * p.get_height() / total:.1f}%\n'
    ax[0].annotate(percentage, (p.get_x() + p.get_width() / 2,p.get_height()), ha='center', va='center')

# Every wedge is pulled out by the same small offset.
ax[1].pie(age_counts, labels = age_counts.index, autopct='%1.1f%%',
          explode=[0.03] * len(age_counts))
plt.show()

Syed Afroz Ali

Data Scientist (Kaggle Grandmaster)
60

# Density curve of `f` with parameters (dfn, dfd) and a dashed line at its
# 95th percentile. `f`, `dfn` and `dfd` are defined elsewhere; `f` is
# presumably scipy.stats.f — TODO confirm.
fig, ax = plt.subplots(1, 1)

plt.xlim(-1,26)
plt.ylim(0,1)
# 100 evenly spaced points between two extreme quantiles of the distribution.
x = np.linspace(f.ppf(0.0000000001, dfn, dfd),f.ppf(0.9999999999, dfn, dfd), 100)
ax.plot(x, f.pdf(x, dfn, dfd), 'r-')
# The critical value is computed once and used for both the line and the print.
upper_5 = f.ppf(0.95, dfn, dfd)
ax.axvline(upper_5, ls = "--", color = "navy")
print('upper 5%:', upper_5)

import plotly.graph_objects as go

# Labels and values come from the same value_counts() result so every label
# is paired with its own count. unique() returns values in order of first
# appearance while value_counts() sorts by frequency, so mixing the two can
# attach counts to the wrong meal plan.
meal_counts = confirmed_bookings['meal'].value_counts()
labels = meal_counts.index
values = meal_counts
palette = ["#f6bd60", "#f5cac3", "#84a59d", "#f28482"]

fig = go.Figure(data=[go.Pie(labels = labels,
                             values = values,
                             hole=.5,
                             title = 'Meal plans',
                             legendgroup = True,
                             # One pull offset per wedge, however many
                             # meal plans the data contains.
                             pull = [0.1] * len(meal_counts)
                             )
                      ]
                )

fig.update_traces(marker = dict(colors = palette))

fig.show()

Syed Afroz Ali

Data Scientist (Kaggle Grandmaster)
61

# Scatter of Size (y) against Rent (x); marker colour and marker area are
# both driven by the Size column.
x = rent_df["Rent"]
y = rent_df["Size"]
colors = rent_df["Size"]
sizes = rent_df["Size"]

plt.figure(figsize = (25, 8))

# 'plain' switches off scientific notation on the tick labels.
plt.ticklabel_format(style = 'plain')
plt.scatter(x, y, c = colors, s = sizes, alpha = 0.3, cmap = 'viridis')
# Colour bar for the `c` values mapped through the 'viridis' colormap.
plt.colorbar();

# Free or Paid Courses - Countplot

fig, ax = plt.subplots(figsize=(7,5), dpi=100)
ax = sns.countplot(data=courses, x='is_paid', palette='magma_r', ax=ax)
# NOTE(review): assumes the bars are drawn in the order free, paid — confirm
# against the values of `is_paid`.
ax.set_xticklabels(labels=['Free', 'Paid'])
ax.set_xlabel("Free/Paid courses")
ax.set_ylabel("Number of courses")
ax.set_title("Share of Free and Paid Courses on Udemy")
percentage = round(courses['is_paid'].value_counts() * 100 /len(courses), 2)
patches = ax.patches
total = len(courses)
for bar in patches:
    x = bar.get_x() + bar.get_width()/2
    y = bar.get_height()+.05
    # The share is derived from the bar's own height. Indexing `percentage`
    # by bar position relied on value_counts() (frequency order) matching the
    # order of the bars, which is not guaranteed.
    ax.annotate('{:.2f}%'.format(100 * bar.get_height() / total), (x, y), ha='center')

Syed Afroz Ali

Data Scientist (Kaggle Grandmaster)
62

# Grid with one histogram and one box plot per column of `df_cpy`, laid out
# as consecutive pairs of axes.
df_cpy = df.drop(['profile_id'], axis=1)

# NOTE(review): `flierprops` is defined but not passed to any call below.
flierprops = dict(markerfacecolor='g', color='g', alpha=0.5)

n_cols = 4
# Two axes per column of data, n_cols axes per row.
n_rows = int(np.ceil(df_cpy.shape[-1]*2 / n_cols))
fig, axes = plt.subplots(n_rows, n_cols, figsize=(4 * n_cols, 3 * n_rows))
flat_axes = axes.flatten()
for i, col in enumerate(df_cpy.columns):
    hist_ax = flat_axes[2*i]
    box_ax = flat_axes[2*i+1]
    mean = df_cpy[col].mean()
    median = df_cpy[col].median()
    sns.histplot(df_cpy[col], ax=hist_ax, kde=True)
    sns.boxplot(x=df_cpy[col], orient='h', ax=box_ax, color='g')
    # Red vertical line at the mean; its legend entry also reports the median.
    box_ax.vlines(mean, ymin = -1, ymax = 1, color='r',
                  label=f"For [{col}]\nMean: {mean:.2}\nMedian: {median:.2}")
    box_ax.legend()

    # Keep the y label only on histograms in the first grid column. The
    # histogram sits at flat index 2*i, so that index decides the column;
    # the label goes on this grid's axes, not on an `ax` left over from an
    # earlier snippet.
    if (2*i) % n_cols == 0:
        hist_ax.set_ylabel('Frequency')
    else:
        hist_ax.set_ylabel('')
plt.tight_layout()

# Set the default figure size, then switch to the plain "white" style.
sns.set(rc={'figure.figsize':(10,7)})
sns.set_style("white")
# mpg against horsepower; marker size encodes acceleration (scaled into the
# range 10-500) and colour encodes origin.
sns.scatterplot(data=df, x="horsepower", y="mpg", size="acceleration",
hue='origin',legend=True, sizes=(10, 500))

Syed Afroz Ali

Data Scientist (Kaggle Grandmaster)
63

import altair as alt

# Bar chart of mpg against cylinders, coloured by origin.
plot=alt.Chart(df).mark_bar(size=40).encode(
alt.X('cylinders'),
alt.Y('mpg'),
alt.Color('origin')
)
# properties() returns a new chart and `plot` is not reassigned; the titled
# chart is only shown when this expression is the last one in a notebook cell.
plot.properties(title='cylinders vs mpg')

import altair as alt

# Interval selection shared by the two charts below.
# NOTE(review): newer Altair releases deprecate alt.selection(type=...) and
# add_selection() in favour of alt.selection_interval() and add_params() —
# confirm against the installed version.
select = alt.selection(type='interval')
# Scatter of mpg against horsepower; points outside the selection turn light gray.
values = alt.Chart(df).mark_point().encode(
x='horsepower:Q',
y='mpg:Q',
color=alt.condition(select, 'origin:N', alt.value('lightgray'))
).add_selection(
select
)
# Bar chart of row counts per origin, restricted to the selected points.
bars = alt.Chart(df).mark_bar().encode(
y='origin:N',
color='origin:N',
x='count(origin):Q'
).transform_filter(
select
)
# `&` stacks the two charts vertically.
values & bars

Syed Afroz Ali

Data Scientist (Kaggle Grandmaster)
64

import pandas as pd
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
# Assumes Iris.csv has the columns sepal_length, sepal_width and
# petal_length — TODO confirm against the file's header.
iris = pd.read_csv("Iris.csv")
# Create a figure and axes for the 3D plot
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
# Scatter plot the data; petal_length is both the z coordinate and the colour
ax.scatter(iris["sepal_length"], iris["sepal_width"], iris["petal_length"],
c=iris["petal_length"], cmap='viridis')
# Add labels to the axes
ax.set_xlabel("sepal_length")
ax.set_ylabel("sepal_width")
ax.set_zlabel("petal_length")
# Show the plot
plt.show()

fig = plt.figure()
# Figure.gca() no longer accepts keyword arguments in current Matplotlib;
# add_subplot creates the 3D axes the same way as the other snippets here.
ax = fig.add_subplot(111, projection='3d')
ax.plot_trisurf(data['temp'], data['co'], data['smoke'], cmap = plt.cm.twilight_shifted)
plt.title('Relation between Carbon di oxide levels, Smoke and Temperature.')
# Axis labels follow the argument order of plot_trisurf: x = temp, y = co,
# z = smoke. The previous labels named 'co' and 'smoke' on the x and y axes.
ax.set_xlabel('temp')
ax.set_ylabel('co')
ax.set_zlabel('smoke')
plt.show()

Syed Afroz Ali

Data Scientist (Kaggle Grandmaster)
65

import pandas as pd
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
# Assumes titanic.csv has the columns Age, Fare and Survived — TODO confirm.
df = pd.read_csv("titanic.csv")
# Create a figure and axes for the 3D plot
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
# Scatter plot the data; Survived is both the z coordinate and the colour
ax.scatter(df["Age"], df["Fare"], df["Survived"], c=df["Survived"],
cmap='viridis')
# Add labels to the axes
ax.set_xlabel("Age")
ax.set_ylabel("Fare")
ax.set_zlabel("Survived")
plt.show()

Syed Afroz Ali

Data Scientist (Kaggle Grandmaster)
66

from mpl_toolkits.mplot3d import Axes3D

# Evaluate z = sin(sqrt(x^2 + y^2)) on a 100 x 100 grid over [-5, 5] x [-5, 5].
x = np.linspace(-5, 5, 100)
y = np.linspace(-5, 5, 100)
X, Y = np.meshgrid(x, y)
Z = np.sin(np.sqrt(X**2 + Y**2))
# Draw the grid as a coloured 3D surface.
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
ax.plot_surface(X, Y, Z, cmap='viridis')
ax.set_xlabel("X")
ax.set_ylabel("Y")
ax.set_zlabel("Z")
plt.show()
# Create a figure and axes for the 3D plot
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
# histogram2d cannot derive bin edges from NaN, so rows with a missing Age
# or Fare are dropped first.
age_fare = df[["Age", "Fare"]].dropna()
hist, xedges, yedges = np.histogram2d(age_fare["Age"], age_fare["Fare"],
                                      bins=10)
X, Y = np.meshgrid(xedges[:-1], yedges[:-1])
# hist[i, j] counts Age bin i and Fare bin j, while the meshgrid arrays are
# indexed [fare, age]; transposing lines the counts up with X and Y.
ax.plot_surface(X, Y, hist.T, cmap='viridis')
# Add labels to the axes
ax.set_xlabel("Age")
ax.set_ylabel("Fare")
ax.set_zlabel("Count")

# Show the plot

plt.show()

Syed Afroz Ali

Data Scientist (Kaggle Grandmaster)
67

# Create a figure and axes for the 3D plot

fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')

# Create a histogram of the data

# histogram2d cannot derive bin edges from NaN, so incomplete rows are
# dropped before the data is split by outcome.
complete = df[["Survived", "Age", "Fare"]].dropna()
survived = complete[complete["Survived"] == 1]
not_survived = complete[complete["Survived"] == 0]
hist1, xedges1, yedges1 = np.histogram2d(survived["Age"], survived["Fare"], bins=10)
hist2, xedges2, yedges2 = np.histogram2d(not_survived["Age"], not_survived["Fare"], bins=10)

# Create a mesh grid of the binned data

X1, Y1 = np.meshgrid(xedges1[:-1], yedges1[:-1])
X2, Y2 = np.meshgrid(xedges2[:-1], yedges2[:-1])

# Plot one count surface per outcome (red: Survived == 1, blue: Survived == 0).
# hist[i, j] counts Age bin i and Fare bin j, while the meshgrid arrays are
# indexed [fare, age]; transposing lines the counts up with X and Y.

ax.plot_surface(X1, Y1, hist1.T, color='r', alpha=0.3)
ax.plot_surface(X2, Y2, hist2.T, color='b', alpha=0.3)

# Add labels to the axes

ax.set_xlabel("Age")
ax.set_ylabel("Fare")
ax.set_zlabel("Count")

# Show the plot

plt.show()

Syed Afroz Ali

Data Scientist (Kaggle Grandmaster)
68

# 3D Scatter Plot
import plotly.express as px

# Embarked, Fare and Age on the x, y and z axes; points coloured by Sex.
fig = px.scatter_3d(titanic, x='Embarked', y='Fare', z='Age',

color='Sex')
fig.show()

Syed Afroz Ali

Data Scientist (Kaggle Grandmaster)
69

# 3D Scatter Plot
import plotly.express as px
# Three columns of `wine` on the x, y and z axes; points coloured by quality.
fig = px.scatter_3d(wine, x='fixed acidity', y='volatile acidity',
z='total sulfur dioxide', color='quality')
fig.show()

# 3D scatter of normalized i_q, u_d and torque, coloured by torque.
# NOTE(review): the normalized_* series used here are only assigned further
# down in this listing; this snippet needs them to exist first.
fig = plt.figure(figsize=(20,20))
ax = plt.axes(projection="3d")
ax.scatter3D(normalized_i_q, normalized_u_d, normalized_torque, s=0.5,
c=normalized_torque, cmap=plt.get_cmap("jet"))
plt.show()

def _min_max(series):
    """Rescale *series* linearly so its minimum maps to 0 and its maximum to 1."""
    lowest = series.min()
    return (series - lowest) / (series.max() - lowest)


# Pull the five signals out of the motor data set.
u_q = electric_motor_temprature_data['u_q']
u_d = electric_motor_temprature_data['u_d']
i_q = electric_motor_temprature_data['i_q']
i_d = electric_motor_temprature_data['i_d']
torque = electric_motor_temprature_data['torque']

# Min-max scale every signal.
normalized_u_q = _min_max(u_q)
normalized_u_d = _min_max(u_d)
normalized_i_q = _min_max(i_q)
normalized_i_d = _min_max(i_d)
normalized_torque = _min_max(torque)

# 3D scatter of the scaled u_q, u_d and torque, coloured by torque.
fig = plt.figure(figsize=(20,20))
ax = plt.axes(projection="3d")
ax.scatter3D(normalized_u_q, normalized_u_d, normalized_torque, s=0.5,
             c=normalized_torque, cmap=plt.get_cmap("jet"))
plt.show()

# Wireframe of the 2D histogram of Age against Fare.
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
# histogram2d cannot derive bin edges from NaN, so rows with a missing Age
# or Fare are dropped first.
age_fare = df[["Age", "Fare"]].dropna()
hist, xedges, yedges = np.histogram2d(age_fare["Age"], age_fare["Fare"],
                                      bins=10)
X, Y = np.meshgrid(xedges[:-1], yedges[:-1])
# hist[i, j] counts Age bin i and Fare bin j, while the meshgrid arrays are
# indexed [fare, age]; transposing lines the counts up with X and Y.
ax.plot_wireframe(X, Y, hist.T)
ax.set_xlabel("Age")
ax.set_ylabel("Fare")
ax.set_zlabel("Count")
plt.show()

Syed Afroz Ali

Data Scientist (Kaggle Grandmaster)

Instant Download Developing Kaggle Notebooks Gabriel Preda PDF All Chapter
100% (3)
Instant Download Developing Kaggle Notebooks Gabriel Preda PDF All Chapter
54 pages
Delhivery Mani
No ratings yet
Delhivery Mani
79 pages
Ip Xi
No ratings yet
Ip Xi
16 pages
World Cup Analysis
No ratings yet
World Cup Analysis
15 pages
Mastering Data Visualization Techniques
No ratings yet
Mastering Data Visualization Techniques
159 pages
Data Visualization
No ratings yet
Data Visualization
159 pages
Mastering Data Visualization Techniques 1728896857
No ratings yet
Mastering Data Visualization Techniques 1728896857
85 pages
Mastering Data Visualization Techniques (Part 1)
No ratings yet
Mastering Data Visualization Techniques (Part 1)
20 pages
Aim: Objective
No ratings yet
Aim: Objective
7 pages
Roll NO 2020
No ratings yet
Roll NO 2020
8 pages
Data Analyzer
No ratings yet
Data Analyzer
10 pages
Boston House Prediction - Colab1
No ratings yet
Boston House Prediction - Colab1
10 pages
FDS All Practicals
No ratings yet
FDS All Practicals
10 pages
Fds Slips
No ratings yet
Fds Slips
6 pages
Print Print Print Print: Import As
No ratings yet
Print Print Print Print: Import As
6 pages
Malicious Coding
No ratings yet
Malicious Coding
4 pages
Mayank Chaudhary DEV Practicals
No ratings yet
Mayank Chaudhary DEV Practicals
14 pages
Python Slips
No ratings yet
Python Slips
9 pages
DSBDAL - Assignment No 9
No ratings yet
DSBDAL - Assignment No 9
12 pages
DAVL PR1.2 Mit
No ratings yet
DAVL PR1.2 Mit
10 pages
Python Code Library
No ratings yet
Python Code Library
8 pages
Dav Week8 240953580
No ratings yet
Dav Week8 240953580
15 pages
Data Analisis 2
No ratings yet
Data Analisis 2
13 pages
PML Ex3
No ratings yet
PML Ex3
20 pages
Experiment No 9
No ratings yet
Experiment No 9
13 pages
Main - Py Text File
No ratings yet
Main - Py Text File
5 pages
Content From Jose Portilla's Udemy Course Learning Python For Data Analysis and Visualization Notes by Michael Brothers, Available On
No ratings yet
Content From Jose Portilla's Udemy Course Learning Python For Data Analysis and Visualization Notes by Michael Brothers, Available On
13 pages
Data Visualization With Python
No ratings yet
Data Visualization With Python
34 pages
DVA Practical
No ratings yet
DVA Practical
19 pages
Data Science Assignment Submission
No ratings yet
Data Science Assignment Submission
12 pages
Modulo 8. Data Visualization With Python
No ratings yet
Modulo 8. Data Visualization With Python
30 pages
Ai&Ml Bail606 ML Lab Manual
No ratings yet
Ai&Ml Bail606 ML Lab Manual
50 pages
Exp 12 and 15
No ratings yet
Exp 12 and 15
4 pages
Matplotlib
No ratings yet
Matplotlib
5 pages
Data Visualization Part 2
No ratings yet
Data Visualization Part 2
18 pages
Data Science and Analtics Laboratory
No ratings yet
Data Science and Analtics Laboratory
21 pages
AD3411
No ratings yet
AD3411
28 pages
Be A 65 Ads Exp 2
No ratings yet
Be A 65 Ads Exp 2
10 pages
Time Series Analysis Group 9
No ratings yet
Time Series Analysis Group 9
16 pages
Code Shabab Error 7
No ratings yet
Code Shabab Error 7
5 pages
Formulario - EA
No ratings yet
Formulario - EA
6 pages
ML Labmanual
No ratings yet
ML Labmanual
33 pages
Assignment 1 - LP1
No ratings yet
Assignment 1 - LP1
14 pages
Data Visualization EDA-print
No ratings yet
Data Visualization EDA-print
18 pages
SESION 12 (Pandas)
No ratings yet
SESION 12 (Pandas)
41 pages
04 Boxplot
No ratings yet
04 Boxplot
22 pages
Basic Line Plot Using Matplotlib
No ratings yet
Basic Line Plot Using Matplotlib
9 pages
R Note
No ratings yet
R Note
56 pages
Data Visualization Lab3
No ratings yet
Data Visualization Lab3
23 pages
Preksha Ai Practical Class 10th - 070428
No ratings yet
Preksha Ai Practical Class 10th - 070428
13 pages
Python Course Cheat Sheet
No ratings yet
Python Course Cheat Sheet
30 pages
ML Lab
No ratings yet
ML Lab
14 pages
Presentation 1
No ratings yet
Presentation 1
30 pages
Class XII (As Per CBSE Board) : Informatics Practices
No ratings yet
Class XII (As Per CBSE Board) : Informatics Practices
27 pages
Lab 5 &6
No ratings yet
Lab 5 &6
6 pages
1 10
No ratings yet
1 10
4 pages
Pattern Recognition
No ratings yet
Pattern Recognition
26 pages
PRO Level Data Visualization Cheat Sheet
No ratings yet
PRO Level Data Visualization Cheat Sheet
15 pages
Class X Practical-2025 - Jupyter Notebook
No ratings yet
Class X Practical-2025 - Jupyter Notebook
6 pages
No Ph.D. Game Design With Three.js
From Everand
No Ph.D. Game Design With Three.js
Nikiforos Kontopoulos
No ratings yet
Computer Engineering Laboratory Solution Primer
From Everand
Computer Engineering Laboratory Solution Primer
Karan Bhandari
No ratings yet
Semantic Language Poetry
From Everand
Semantic Language Poetry
Vladan Kuzmanović
No ratings yet
Learning Path Machine Learning
No ratings yet
Learning Path Machine Learning
7 pages
ROHIT
No ratings yet
ROHIT
7 pages
Task 9 Implementation of Object Detection and Localization
No ratings yet
Task 9 Implementation of Object Detection and Localization
7 pages
Data Science - A Kaggle Walkthrough - Introduction - 1 PDF
No ratings yet
Data Science - A Kaggle Walkthrough - Introduction - 1 PDF
5 pages
Atharva Kale 10..
No ratings yet
Atharva Kale 10..
7 pages
Data Anallysis Roadmap
No ratings yet
Data Anallysis Roadmap
4 pages
Visual Taxonomy Report
No ratings yet
Visual Taxonomy Report
10 pages
Sakshi Singh
No ratings yet
Sakshi Singh
1 page
Big Data in Practice Esampler PDF
50% (6)
Big Data in Practice Esampler PDF
14 pages
Kaggle Tutorial 1
No ratings yet
Kaggle Tutorial 1
29 pages
Week 1 - Lab
No ratings yet
Week 1 - Lab
21 pages
How To Start Kaggle
No ratings yet
How To Start Kaggle
40 pages
LAB01
No ratings yet
LAB01
8 pages
Pure Math PHD Looking To Transition Into ML - Data Science Industry. I'm Looking For Suggestions. (X-Post From R - MLjobs) - R - Datascience
No ratings yet
Pure Math PHD Looking To Transition Into ML - Data Science Industry. I'm Looking For Suggestions. (X-Post From R - MLjobs) - R - Datascience
5 pages
Final Data Sciene Report
No ratings yet
Final Data Sciene Report
23 pages
Participants Kaggle Guide - Shopee Code League 2020
0% (1)
Participants Kaggle Guide - Shopee Code League 2020
18 pages
Kaggle's State of Data Science and Machine Learning 2019: Enterprise Executive Summary
No ratings yet
Kaggle's State of Data Science and Machine Learning 2019: Enterprise Executive Summary
23 pages
(V2) Kaggle's Community Competitions Setup Guide and FAQs
No ratings yet
(V2) Kaggle's Community Competitions Setup Guide and FAQs
24 pages
MLAgentBench Evaluating Language Agents On Machine Learning Experimentation
No ratings yet
MLAgentBench Evaluating Language Agents On Machine Learning Experimentation
39 pages
AIDTM - Digital Librabry Resources
No ratings yet
AIDTM - Digital Librabry Resources
13 pages
Sanskar's Resume
No ratings yet
Sanskar's Resume
1 page
Kaggle
No ratings yet
Kaggle
12 pages
Sales Analysis Using Python and SQL
No ratings yet
Sales Analysis Using Python and SQL
15 pages
Google Smartphone Decimeter Challenge 2021
No ratings yet
Google Smartphone Decimeter Challenge 2021
3 pages
Datathon 3.0 Final Playbook
No ratings yet
Datathon 3.0 Final Playbook
19 pages
Mini Project Report
No ratings yet
Mini Project Report
21 pages
Data Science - The 12th Statistika Ria 2017 v.1.2
No ratings yet
Data Science - The 12th Statistika Ria 2017 v.1.2
36 pages
Inteliment Technologies Presentation
No ratings yet
Inteliment Technologies Presentation
11 pages