Sms Spam Detection
Sms Spam Detection
Sms Spam Detection
import pandas as pd
# Load the SMS dataset, decoding the CSV as ISO-8859-1 with pandas' Python parser.
ds = pd.read_csv(
    'spam.csv',
    engine="python",
    encoding="ISO-8859-1",
)
# Preview five randomly sampled rows.
ds.sample(5)
v1 v2 Unnamed:
2 \
1816 ham Are you going to write ccna exam this week??
NaN
1398 ham No did you check? I got his detailed message now
NaN
4874 ham I know dat feelin had it with Pete! Wuld get w...
NaN
3924 ham As if i wasn't having enough trouble sleeping.
NaN
3368 ham Sorry i've not gone to that place. I.ll do so ...
NaN
Unnamed: 3 Unnamed: 4
1816 NaN NaN
1398 NaN NaN
4874 NaN NaN
3924 NaN NaN
3368 NaN NaN
ds.shape
(5572, 5)
# data cleaning
# EDA (exploratory data analysis)
# text preprocessing
# model building
# evaluation of model
# improvement
1. Data Cleaning
ds.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 5 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 v1 5572 non-null object
1 v2 5572 non-null object
2 Unnamed: 2 50 non-null object
3 Unnamed: 3 12 non-null object
4 Unnamed: 4 6 non-null object
dtypes: object(5)
memory usage: 217.8+ KB
ds.head()
v1 v2
0 ham Go until jurong point, crazy.. Available only ...
1 ham Ok lar... Joking wif u oni...
2 spam Free entry in 2 a wkly comp to win FA Cup fina...
3 ham U dun say so early hor... U c already then say...
4 ham Nah I don't think he goes to usf, he lives aro...
target text
2642 ham They can try! They can get lost, in fact. Tee hee
5281 ham And how you will do that, princess? :)
3224 ham I need... Coz i never go before
1631 ham We not watching movie already. Xy wants 2 shop...
2913 spam Sorry! U can not unsubscribe yet. THE MOB offe...
ds.head()
target text
0 0 Go until jurong point, crazy.. Available only ...
1 0 Ok lar... Joking wif u oni...
2 1 Free entry in 2 a wkly comp to win FA Cup fina...
3 0 U dun say so early hor... U c already then say...
4 0 Nah I don't think he goes to usf, he lives aro...
403
# remove duplicate rows, keeping the first occurrence of each
ds=ds.drop_duplicates(keep='first')
# re-count duplicated rows after the drop
ds.duplicated().sum()
# no duplicate rows remain
ds.shape
(5169, 2)
0 4516
1 653
Name: target, dtype: int64
# Represent the class balance as a pie chart.
import matplotlib.pyplot as plt

# Sort by class id so every count lines up with its label
# (target 0 -> 'non spam', target 1 -> 'spam').
class_counts = ds['target'].value_counts().sort_index()
# The label literal was previously split across two lines, which is a syntax error.
plt.pie(class_counts, labels=['non spam', 'spam'], autopct="%0.2f")
plt.show()
The data is imbalanced: there are far fewer spam messages than ham messages.
# For deeper analysis, add 3 new columns: 1. number of characters,
# 2. number of words, 3. number of sentences (computed with the nltk library).
import nltk
nltk.download('punkt')
True
# count no of character
ds['char_count']=ds['text'].apply(len)
ds.head()
target text
char_count
0 0 Go until jurong point, crazy.. Available only ...
111
1 0 Ok lar... Joking wif u oni...
29
2 1 Free entry in 2 a wkly comp to win FA Cup fina...
155
3 0 U dun say so early hor... U c already then say...
49
4 0 Nah I don't think he goes to usf, he lives aro...
61
# Preview the word-level tokenization of each message.
ds['text'].apply(lambda x:nltk.word_tokenize(x))
# count no of words
# The previous right-hand side, `words`, is not assigned anywhere in the visible
# source; compute the per-message token count directly instead.
ds['word_count'] = ds['text'].apply(lambda x: len(nltk.word_tokenize(x)))
ds.head()
target text
char_count \
0 0 Go until jurong point, crazy.. Available only ...
111
1 0 Ok lar... Joking wif u oni...
29
2 1 Free entry in 2 a wkly comp to win FA Cup fina...
155
3 0 U dun say so early hor... U c already then say...
49
4 0 Nah I don't think he goes to usf, he lives aro...
61
word_count
0 24
1 8
2 37
3 13
4 15
ds['text'].apply(lambda x:nltk.sent_tokenize(x))
target text
char_count \
0 0 Go until jurong point, crazy.. Available only ...
111
1 0 Ok lar... Joking wif u oni...
29
2 1 Free entry in 2 a wkly comp to win FA Cup fina...
155
3 0 U dun say so early hor... U c already then say...
49
4 0 Nah I don't think he goes to usf, he lives aro...
61
word_count sent_count
0 24 2
1 8 2
2 37 2
3 13 1
4 15 1
<AxesSubplot:xlabel='char_count', ylabel='Count'>
# Overlay the word-count distributions of the two classes on one figure.
plt.figure(figsize=(11,6))
ham_word_counts = ds.loc[ds['target'] == 0, 'word_count']
spam_word_counts = ds.loc[ds['target'] == 1, 'word_count']
sns.histplot(ham_word_counts)
sns.histplot(spam_word_counts, color='red')
<AxesSubplot:xlabel='word_count', ylabel='Count'>
# pairwise relationships between the count columns (characters, words, sentences), coloured by target
sns.pairplot(ds,hue='target')
<seaborn.axisgrid.PairGrid at 0x1d21d92c8e0>
# Correlation between the numeric columns, shown as a heatmap.
# Restrict to numeric dtypes first: `ds` still contains the string `text` column,
# and DataFrame.corr() no longer silently drops non-numeric columns in pandas >= 2.0.
sns.heatmap(ds.select_dtypes(include='number').corr(), annot=True)
<AxesSubplot:>
Data Preprocessing
- Convert to lower case
- Tokenization (split into words)
- Removing special characters (keep only alphanumeric tokens)
- Removing stop words and punctuation
- Stemming
nltk.download('stopwords')
True
import string
from nltk.corpus import stopwords
# for punctuation remove we need string
'love'
def transform_text(text):
    """Normalize a raw message into a space-separated string of stemmed tokens.

    Steps: lower-case the text, tokenize it with ``nltk.word_tokenize``, keep
    only alphanumeric tokens, drop English stop words and punctuation, then
    apply ``ps.stem`` to every remaining token (``ps`` is defined outside this
    block).

    Args:
        text: The raw message string.

    Returns:
        The processed tokens joined by single spaces.
    """
    # Build the stop-word collection once; it was previously re-evaluated via
    # stopwords.words('english') for every single token.
    stop_words = set(stopwords.words('english'))

    tokens = nltk.word_tokenize(text.lower())
    kept = [
        tok for tok in tokens
        if tok.isalnum()
        and tok not in stop_words
        and tok not in string.punctuation
    ]

    # The visible source ended after stemming without returning anything; the
    # sample output is a space-joined string, so return exactly that.
    return " ".join(ps.stem(tok) for tok in kept)
'go jurong point crazi avail bugi n great world la e buffet cine got
amor wat'
ds.head()
target text
char_count \
0 0 Go until jurong point, crazy.. Available only ...
111
1 0 Ok lar... Joking wif u oni...
29
2 1 Free entry in 2 a wkly comp to win FA Cup fina...
155
3 0 U dun say so early hor... U c already then say...
49
4 0 Nah I don't think he goes to usf, he lives aro...
61
word_count sent_count
transform_text
0 24 2 go jurong point crazi avail bugi n great
world...
1 8 2 ok lar joke wif
u oni
2 37 2 free entri 2 wkli comp win fa cup final tkt
21...
3 13 1 u dun say earli hor u c
alreadi say
4 15 1 nah think goe usf live around
though
Note: you may need to restart the kernel to use updated packages.
spam_wc=wc.generate(ds[ds['target']==1]
['transform_text'].str.cat(sep=" "))
<matplotlib.image.AxesImage at 0x1d21f32e070>
<matplotlib.image.AxesImage at 0x1d221930d30>
# Collect every word of every transformed spam message into one flat list
# (input for the "30 most used spam words" analysis).
spam_messages = ds[ds['target']==1]['transform_text'].tolist()
spam_corpus = [word for msg in spam_messages for word in msg.split()]
len(spam_corpus)
9939
len(ham_corpus)
35402
x=tfidf.fit_transform(ds['transform_text']).toarray()
x.shape
(5169, 3000)
y=ds['target'].values
y.shape
(5169,)
# Hold out 20% of the samples for testing; random_state=2 makes the split repeatable.
# Parentheses carry the call across lines (a bare line break after `=` is a syntax error).
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=2
)
# Train `gnb` on the training split, then print accuracy, confusion matrix and
# precision for its predictions on the held-out split.
gnb.fit(x_train, y_train)
y_pred1 = gnb.predict(x_test)
for metric in (accuracy_score, confusion_matrix, precision_score):
    print(metric(y_test, y_pred1))
0.8694390715667312
[[788 108]
[ 27 111]]
0.5068493150684932
# Train `mnb` on the training split, then print accuracy, confusion matrix and
# precision for its predictions on the held-out split.
mnb.fit(x_train, y_train)
y_pred2 = mnb.predict(x_test)
for metric in (accuracy_score, confusion_matrix, precision_score):
    print(metric(y_test, y_pred2))
0.9709864603481625
[[896 0]
[ 30 108]]
1.0
# Train `bnb` on the training split, then print accuracy, confusion matrix and
# precision for its predictions on the held-out split.
bnb.fit(x_train, y_train)
y_pred3 = bnb.predict(x_test)
for metric in (accuracy_score, confusion_matrix, precision_score):
    print(metric(y_test, y_pred3))
0.9835589941972921
[[895 1]
[ 16 122]]
0.991869918699187
# Candidate classifiers to compare, keyed by the short name used in the reports.
clfs = {
    'SVC': svc,
    'KN': knc,
    'NB': mnb,
    'DT': dtc,
    'LR': lrc,
    'RF': rfc,
    'AdaBoost': abc,
    'BgC': bc,
    'ETC': etc,
    'GBDT': gbdt,
    'xgb': xgb,
}
def train_classifier(clf,X_train,y_train,X_test,y_test):
    """Fit ``clf`` on the training data and score it on the test data.

    Args:
        clf: Estimator exposing ``fit`` and ``predict``.
        X_train: Training features.
        y_train: Training labels.
        X_test: Test features.
        y_test: True labels for ``X_test``.

    Returns:
        Tuple ``(accuracy, precision)`` obtained from ``accuracy_score`` and
        ``precision_score`` on the test predictions.
    """
    clf.fit(X_train, y_train)
    predictions = clf.predict(X_test)
    return (
        accuracy_score(y_test, predictions),
        precision_score(y_test, predictions),
    )
train_classifier(svc,x_train,y_train,x_test,y_test)
(0.9758220502901354, 0.9747899159663865)
accuracy_scores = []
precision_scores = []

# Train and score every classifier in `clfs`, in dictionary order. The loop
# header and the train_classifier call were absent from the visible source,
# which left `name`, `current_accuracy` and `current_precision` undefined.
for name, clf in clfs.items():
    current_accuracy, current_precision = train_classifier(
        clf, x_train, y_train, x_test, y_test
    )

    print("For ",name)
    print("Accuracy - ",current_accuracy)
    print("Precision - ",current_precision)

    # Keep the scores in the same order as clfs.keys() for the summary table.
    accuracy_scores.append(current_accuracy)
    precision_scores.append(current_precision)
For SVC
Accuracy - 0.9758220502901354
Precision - 0.9747899159663865
For KN
Accuracy - 0.9052224371373307
Precision - 1.0
For NB
Accuracy - 0.9709864603481625
Precision - 1.0
For DT
Accuracy - 0.9274661508704062
Precision - 0.8118811881188119
For LR
Accuracy - 0.9584139264990329
Precision - 0.9702970297029703
For RF
Accuracy - 0.9758220502901354
Precision - 0.9829059829059829
For AdaBoost
Accuracy - 0.960348162475822
Precision - 0.9292035398230089
For BgC
Accuracy - 0.9584139264990329
Precision - 0.8682170542635659
For ETC
Accuracy - 0.9748549323017408
Precision - 0.9745762711864406
For GBDT
Accuracy - 0.9468085106382979
Precision - 0.9191919191919192
For xgb
Accuracy - 0.9671179883945842
Precision - 0.9333333333333333
# Tabulate each classifier's accuracy and precision, highest precision first.
# The statement was previously wrapped after `=` and inside the 'Precision'
# literal, which is not valid Python.
performance_df = pd.DataFrame({
    'Algorithm': clfs.keys(),
    'Accuracy': accuracy_scores,
    'Precision': precision_scores,
}).sort_values('Precision', ascending=False)
performance_df
Algorithm Accuracy Precision
1 KN 0.905222 1.000000
2 NB 0.970986 1.000000
5 RF 0.975822 0.982906
0 SVC 0.975822 0.974790
8 ETC 0.974855 0.974576
4 LR 0.958414 0.970297
10 xgb 0.967118 0.933333
6 AdaBoost 0.960348 0.929204
9 GBDT 0.946809 0.919192
7 BgC 0.958414 0.868217
3 DT 0.927466 0.811881
performance_df1
# Score tables for the follow-up experiments (max_features=3000, scaling,
# extra num_chars feature), merged with the baseline table on 'Algorithm'.
# Each statement was previously wrapped mid-identifier / mid-literal, which is
# not valid Python; the statements and their order are otherwise unchanged.
# NOTE(review): accuracy_scores / precision_scores are presumably refreshed by
# re-running the comparison loop before each table is built — confirm against
# the notebook, since those re-runs are not visible here.
temp_df = pd.DataFrame({
    'Algorithm': clfs.keys(),
    'Accuracy_max_ft_3000': accuracy_scores,
    'Precision_max_ft_3000': precision_scores,
}).sort_values('Precision_max_ft_3000', ascending=False)
temp_df = pd.DataFrame({
    'Algorithm': clfs.keys(),
    'Accuracy_scaling': accuracy_scores,
    'Precision_scaling': precision_scores,
}).sort_values('Precision_scaling', ascending=False)
new_df = performance_df.merge(temp_df,on='Algorithm')
new_df_scaled = new_df.merge(temp_df,on='Algorithm')
temp_df = pd.DataFrame({
    'Algorithm': clfs.keys(),
    'Accuracy_num_chars': accuracy_scores,
    'Precision_num_chars': precision_scores,
}).sort_values('Precision_num_chars', ascending=False)
new_df_scaled.merge(temp_df,on='Algorithm')
Accuracy_num_chars Precision_num_chars
0 0.905222 1.000000
1 0.970986 1.000000
2 0.975822 0.982906
3 0.975822 0.974790
4 0.974855 0.974576
5 0.958414 0.970297
6 0.967118 0.933333
7 0.960348 0.929204
8 0.946809 0.919192
9 0.958414 0.868217
10 0.927466 0.811881
# Voting Classifier
# Fresh base estimators for the ensemble; note that this rebinds the names
# `svc`, `mnb` and `etc` to new, unfitted instances.
svc = SVC(kernel='sigmoid', gamma=1.0,probability=True)
mnb = MultinomialNB()
etc = ExtraTreesClassifier(n_estimators=50, random_state=2)
# NOTE(review): `voting` is not assigned anywhere in the visible source. The
# printed repr suggests a soft-voting VotingClassifier over
# ('svm', svc), ('nb', mnb), ('et', etc) — confirm and restore that definition.
voting.fit(x_train,y_train)
VotingClassifier(estimators=[('svm',
SVC(gamma=1.0, kernel='sigmoid',
probability=True)),
('nb', MultinomialNB()),
('et',
ExtraTreesClassifier(n_estimators=50,
random_state=2))],
voting='soft')
y_pred = voting.predict(x_test)
print("Accuracy",accuracy_score(y_test,y_pred))
print("Precision",precision_score(y_test,y_pred))
Accuracy 0.9816247582205029
Precision 0.9917355371900827
# Stacking: the three base models feed a RandomForestClassifier final estimator.
estimators = [
    ('svm', svc),
    ('nb', mnb),
    ('et', etc),
]
final_estimator = RandomForestClassifier()
clf = StackingClassifier(
    estimators=estimators,
    final_estimator=final_estimator,
)
clf.fit(x_train, y_train)
y_pred = clf.predict(x_test)
# Report the same two metrics as for the other models.
for label, metric in (("Accuracy", accuracy_score), ("Precision", precision_score)):
    print(label, metric(y_test, y_pred))
Accuracy 0.9806576402321083
Precision 0.9538461538461539
import pickle
# Persist the vectorizer and the model. Context managers guarantee each file is
# flushed and closed; the bare open() calls left both handles unclosed.
# NOTE(review): `mnb` is re-created in the voting-classifier section and no later
# `mnb.fit(...)` is visible; scikit-learn ensembles fit clones of their
# estimators, so confirm `mnb` is actually fitted before it is pickled.
with open('vectorizer.pkl', 'wb') as vectorizer_file:
    pickle.dump(tfidf, vectorizer_file)
with open('model.pkl', 'wb') as model_file:
    pickle.dump(mnb, model_file)