Ensmble - Learning - ML - 5 - Jupyter Notebook

10/1/24, 11:26 PM Ensmble_Learning_ML_5 - Jupyter Notebook
In [1]: #bagging - bootstrap aggregating, parallel approach

#boosting - sequential approach
#decision tree - supervised learning, non-parametric
#To avoid the drawback of decision tree i.e. overfitting, random forest mode
#Random forest is used for feature selection
In [1]: import pandas as pd

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classificatio
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import plot_tree
In [2]: df = pd.read_csv('car_data.csv')
# NOTE(review): the head()/info() output below shows column labels that look
# like data values ('vhigh', 'vhigh.1', '2', '2.1', ...) and the frame has 1727
# rows, so the CSV presumably has no header row and its first record is being
# consumed as the header. Consider pd.read_csv('car_data.csv', header=None)
# and re-running the notebook -- confirm against the file first.
In [3]: df.head()
Out[3]:
vhigh vhigh.1 2 2.1 small low unacc
0 vhigh vhigh 2 2 small med unacc
1 vhigh vhigh 2 2 small high unacc
2 vhigh vhigh 2 2 med low unacc
3 vhigh vhigh 2 2 med med unacc
4 vhigh vhigh 2 2 med high unacc
In [4]: df.shape
Out[4]: (1727, 7)
In [5]: df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1727 entries, 0 to 1726
Data columns (total 7 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 vhigh 1727 non-null object
1 vhigh.1 1727 non-null object
2 2 1727 non-null object
3 2.1 1727 non-null object
4 small 1727 non-null object
5 low 1727 non-null object
6 unacc 1727 non-null object
dtypes: object(7)
memory usage: 94.6+ KB
localhost:8888/notebooks/BE_PRACTICALS/Ensmble_Learning_ML_5.ipynb 1/7
In [6]: df.describe()
Out[6]:
vhigh vhigh.1 2 2.1 small low unacc
count 1727 1727 1727 1727 1727 1727 1727
unique 4 4 4 3 3 3 4
top high high 3 4 med med unacc
freq 432 432 432 576 576 576 1209
In [7]: df.isna().sum()
Out[7]: vhigh 0
vhigh.1 0
2 0
2.1 0
small 0
low 0
unacc 0
dtype: int64
In [8]: df_col = ['buying', 'maint', 'doors', 'persons', 'lug_boot', 'safety', 'cla

df.columns=df_col
df.head()
Out[8]:
buying maint doors persons lug_boot safety class
0 vhigh vhigh 2 2 small med unacc
1 vhigh vhigh 2 2 small high unacc
2 vhigh vhigh 2 2 med low unacc
3 vhigh vhigh 2 2 med med unacc
4 vhigh vhigh 2 2 med high unacc
In [9]: for i in df_col:

print(df[i].value_counts())
buying
high 432
med 432
low 432
vhigh 431
Name: count, dtype: int64
maint
high 432
med 432
low 432
vhigh 431
doors
3 432
4 432
5more 432
2 431
persons
4 576
more 576
2 575
lug_boot
med 576
big 576
small 575
safety
med 576
high 576
low 575
class
unacc 1209
acc 384
good 69
vgood 65
In [10]: df['class'].unique()
Out[10]: array(['unacc', 'acc', 'vgood', 'good'], dtype=object)
In [11]: x = df.drop('class', axis = 1)

y = df['class']
y.unique()
Out[11]: array(['unacc', 'acc', 'vgood', 'good'], dtype=object)
In [12]: x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, ra
In [13]: x_train.head()
Out[13]:
buying maint doors persons lug_boot safety
107 vhigh high 2 2 small low
900 med vhigh 3 4 small med
1708 low low 5more 2 big high
705 high med 4 2 med med
678 high med 3 2 med med
In [14]: x_train.shape
Out[14]: (1381, 6)
In [15]: x_test.shape
Out[15]: (346, 6)
In [16]: x_train.dtypes
Out[16]: buying object

maint object
doors object
persons object
lug_boot object
safety object
dtype: object
In [17]: !pip install category_encoders
Collecting category_encoders
Downloading category_encoders-2.6.3-py2.py3-none-any.whl.metadata (8.0 k
B)
Requirement already satisfied: numpy>=1.14.0 in c:\python310\lib\site-pack
ages (from category_encoders) (1.23.5)
Requirement already satisfied: scikit-learn>=0.20.0 in c:\python310\lib\si
te-packages (from category_encoders) (1.3.1)
Requirement already satisfied: scipy>=1.0.0 in c:\python310\lib\site-packa
ges (from category_encoders) (1.11.3)
Requirement already satisfied: statsmodels>=0.9.0 in c:\python310\lib\site
-packages (from category_encoders) (0.14.3)
Requirement already satisfied: pandas>=1.0.5 in c:\python310\lib\site-pack
ages (from category_encoders) (2.1.1)
Requirement already satisfied: patsy>=0.5.1 in c:\python310\lib\site-packa
ges (from category_encoders) (0.5.6)
Requirement already satisfied: python-dateutil>=2.8.2 in c:\python310\lib
\site-packages (from pandas>=1.0.5->category_encoders) (2.8.2)
Requirement already satisfied: pytz>=2020.1 in c:\python310\lib\site-packa
ges (from pandas>=1.0.5->category_encoders) (2023.3.post1)
Requirement already satisfied: tzdata>=2022.1 in c:\python310\lib\site-pac
kages (from pandas>=1.0.5->category_encoders) (2023.3)
Requirement already satisfied: six in c:\python310\lib\site-packages (from
patsy>=0.5.1->category_encoders) (1.16.0)
Requirement already satisfied: joblib>=1.1.1 in c:\python310\lib\site-pack
ages (from scikit-learn>=0.20.0->category_encoders) (1.3.2)
Requirement already satisfied: threadpoolctl>=2.0.0 in c:\python310\lib\si
te-packages (from scikit-learn>=0.20.0->category_encoders) (3.2.0)
Requirement already satisfied: packaging>=21.3 in c:\python310\lib\site-pa
ckages (from statsmodels>=0.9.0->category_encoders) (23.0)
Downloading category_encoders-2.6.3-py2.py3-none-any.whl (81 kB)
---------------------------------------- 0.0/81.9 kB ? eta -:--:--
----------------------------------- ---- 71.7/81.9 kB 2.0 MB/s eta 0:0
0:01
---------------------------------------- 81.9/81.9 kB 1.5 MB/s eta 0:0
0:00
Installing collected packages: category_encoders
Successfully installed category_encoders-2.6.3
WARNING: Ignoring invalid distribution -eras (c:\python310\lib\site-packag

es)
WARNING: Ignoring invalid distribution -eras (c:\python310\lib\site-packag
es)
[notice] A new release of pip is available: 24.0 -> 24.2

[notice] To update, run: python.exe -m pip install --upgrade pip
In [18]: import category_encoders as ce

encoder = ce.OrdinalEncoder(cols = ['buying', 'maint', 'doors', 'persons',
In [19]: x_train = encoder.fit_transform(x_train)
# The encoder is fitted on the training split only; the test split is then
# transformed with that same fitted encoder.
# NOTE(review): x_train / x_test are overwritten here, so re-running this cell
# feeds the already-encoded frames back into fit_transform -- re-run the split
# cell first if this cell has to be executed again.

x_test = encoder.transform(x_test)
In [20]: random_forest = RandomForestClassifier(random_state = 0)

random_forest.fit(x_train, y_train)

y_pred = random_forest.predict(x_test)
In [21]: print(accuracy_score(y_test, y_pred))
0.9479768786127167
In [23]: random_forest_1000 = RandomForestClassifier(n_estimators = 1000, random_sta
In [24]: random_forest_1000.fit(x_train, y_train)

y_pred_1000 = random_forest_1000.predict(x_test)
In [25]: print(accuracy_score(y_test, y_pred_1000))
0.953757225433526
In [26]: random_forest_777 = RandomForestClassifier(n_estimators = 777, random_state

random_forest_777.fit(x_train, y_train)

y_pred_777 = random_forest_777.predict(x_test)

print(accuracy_score(y_test, y_pred_777))
0.953757225433526
In [27]: feature_scores = pd.Series(random_forest_1000.feature_importances_, index =

print(feature_scores)
safety 0.293866
persons 0.238120
buying 0.201061
maint 0.125037
lug_boot 0.081547
doors 0.060370
dtype: float64
In [28]: x = df.drop(['class', 'doors'], axis = 1)

y = df['class']
In [29]: x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.33, r
In [30]: encoder = ce.OrdinalEncoder(cols = ['buying', 'maint', 'persons', 'lug_boot
In [31]: x_train = encoder.fit_transform(x_train)
# Fit the encoder on the training split only, then apply that same fitted
# encoder to the test split.
# NOTE(review): both frames are overwritten, so this cell is not safe to run
# twice in a row -- re-run the split cell before executing it again.

x_test = encoder.transform(x_test)
In [32]: random_forest = RandomForestClassifier(random_state = 0)

random_forest.fit(x_train, y_train)

y_pred = random_forest.predict(x_test)
In [33]: print(accuracy_score(y_test, y_pred))
0.9263157894736842
In [34]: from sklearn.metrics import classification_report

print(classification_report(y_test, y_pred))
precision recall f1-score support
acc 0.88 0.85 0.86 127

good 0.62 0.56 0.59 18
unacc 0.97 0.97 0.97 399
vgood 0.75 0.81 0.78 26
accuracy 0.93 570

macro avg 0.80 0.80 0.80 570
weighted avg 0.93 0.93 0.93 570
In [ ]:

Ensmble - Learning - ML - 5 - Jupyter Notebook

Uploaded by

Copyright:

Available Formats

Ensmble - Learning - ML - 5 - Jupyter Notebook

Uploaded by

Document Information

Original Title

Copyright

Available Formats

Share this document

Share or Embed Document

Sharing Options

Did you find this document useful?

Is this content inappropriate?

Copyright:

Available Formats

Ensmble - Learning - ML - 5 - Jupyter Notebook

Uploaded by

Copyright:

Available Formats

10/1/24, 11:26 PM Ensmble_Learning_ML_5 - Jupyter Notebook

In [1]: #bagging - bootstrap aggregating, parallel approach

In [1]: import pandas as pd

0 vhigh vhigh 2 2 small med unacc

1 vhigh vhigh 2 2 small high unacc

2 vhigh vhigh 2 2 med low unacc

3 vhigh vhigh 2 2 med med unacc

4 vhigh vhigh 2 2 med high unacc

count 1727 1727 1727 1727 1727 1727 1727

top high high 3 4 med med unacc

freq 432 432 432 576 576 576 1209

In [8]: df_col = ['buying', 'maint', 'doors', 'persons', 'lug_boot', 'safety', 'cla

0 vhigh vhigh 2 2 small med unacc

1 vhigh vhigh 2 2 small high unacc

2 vhigh vhigh 2 2 med low unacc

3 vhigh vhigh 2 2 med med unacc

4 vhigh vhigh 2 2 med high unacc

In [9]: for i in df_col:

Out[10]: array(['unacc', 'acc', 'vgood', 'good'], dtype=object)

In [11]: x = df.drop('class', axis = 1)

Out[11]: array(['unacc', 'acc', 'vgood', 'good'], dtype=object)

In [12]: x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, ra

107 vhigh high 2 2 small low

900 med vhigh 3 4 small med

1708 low low 5more 2 big high

705 high med 4 2 med med

678 high med 3 2 med med

Out[16]: buying object

In [17]: !pip install category_encoders

WARNING: Ignoring invalid distribution -eras (c:\python310\lib\site-packag

[notice] A new release of pip is available: 24.0 -> 24.2

In [18]: import category_encoders as ce

In [19]: x_train = encoder.fit_transform(x_train)

In [20]: random_forest = RandomForestClassifier(random_state = 0)

In [21]: print(accuracy_score(y_test, y_pred))

In [23]: random_forest_1000 = RandomForestClassifier(n_estimators = 1000, random_sta

In [24]: random_forest_1000.fit(x_train, y_train)

In [25]: print(accuracy_score(y_test, y_pred_1000))

In [26]: random_forest_777 = RandomForestClassifier(n_estimators = 777, random_state

In [27]: feature_scores = pd.Series(random_forest_1000.feature_importances_, index =

In [28]: x = df.drop(['class', 'doors'], axis = 1)

In [29]: x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.33, r

In [30]: encoder = ce.OrdinalEncoder(cols = ['buying', 'maint', 'persons', 'lug_boot

In [31]: x_train = encoder.fit_transform(x_train)

In [32]: random_forest = RandomForestClassifier(random_state = 0)

In [33]: print(accuracy_score(y_test, y_pred))

In [34]: from sklearn.metrics import classification_report

precision recall f1-score support

acc 0.88 0.85 0.86 127

accuracy 0.93 570

You might also like