
Commit 14848f4

added spam classifier
1 parent 5a53324 commit 14848f4

12 files changed: +5803 -0 lines changed
machine-learning/nlp/spam-classifier/README.md

Lines changed: 12 additions & 0 deletions
@@ -0,0 +1,12 @@
# [How to Build a Spam Classifier using Keras in Python](https://www.thepythoncode.com/article/build-spam-classifier-keras-python)
To run this:
- `pip3 install -r requirements.txt`
- For training, since we're using transfer learning, you first need to download and extract [GloVe](http://nlp.stanford.edu/data/glove.6B.zip) into the `data` folder. These are pre-trained embedding vectors that map each word to a vector; two words with similar meanings tend to have very close vectors (a quick similarity check is sketched right after this file).
```
python3 spam_classifier.py
```
This writes TensorFlow logs to the `logs` folder and saves the model and tokenizer to `results`, so that `test.py` can use them.
- After training has finished, try testing your own emails, or adapt the code to your needs:
```
python3 test.py
```
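
A quick way to see the "words with similar meanings have close vectors" property mentioned in the README, once `glove.6B.zip` has been extracted into `data/` (a rough sketch, not part of this commit):

```python
# rough sketch: load a few GloVe vectors and compare them with cosine similarity
# assumes data/glove.6B.100d.txt exists (extracted from glove.6B.zip)
import numpy as np

wanted = {"good", "great", "keyboard"}
vectors = {}
with open("data/glove.6B.100d.txt", encoding="utf8") as f:
    for line in f:
        values = line.split()
        if values[0] in wanted:
            vectors[values[0]] = np.asarray(values[1:], dtype="float32")
        if len(vectors) == len(wanted):
            break

def cosine(a, b):
    return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))

# words with similar meanings ("good"/"great") typically score noticeably
# higher than unrelated pairs ("good"/"keyboard")
print("good vs great:   ", cosine(vectors["good"], vectors["great"]))
print("good vs keyboard:", cosine(vectors["good"], vectors["keyboard"]))
```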

machine-learning/nlp/spam-classifier/data/SMSSpamCollection

Lines changed: 5574 additions & 0 deletions
Large diffs are not rendered by default.
machine-learning/nlp/spam-classifier/requirements.txt

Lines changed: 5 additions & 0 deletions
@@ -0,0 +1,5 @@
sklearn
keras
tqdm
numpy
keras_metrics
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
machine-learning/nlp/spam-classifier/spam_classifier.py

Lines changed: 111 additions & 0 deletions
@@ -0,0 +1,111 @@
# to use the CPU, uncomment the code below
# import os
# os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"  # see issue #152
# os.environ["CUDA_VISIBLE_DEVICES"] = "-1"

# import tensorflow as tf

# config = tf.ConfigProto(intra_op_parallelism_threads=5,
#                         inter_op_parallelism_threads=5,
#                         allow_soft_placement=True,
#                         device_count={'CPU': 1,
#                                       'GPU': 0}
#                         )


from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.callbacks import ModelCheckpoint, TensorBoard
from sklearn.model_selection import train_test_split
import time
import numpy as np
import pickle

from utils import get_embedding_vectors, get_model, SEQUENCE_LENGTH, EMBEDDING_SIZE, TEST_SIZE
from utils import BATCH_SIZE, EPOCHS, int2label, label2int


def load_data():
    """
    Loads the SMS Spam Collection dataset
    """
    texts, labels = [], []
    with open("data/SMSSpamCollection") as f:
        for line in f:
            split = line.split()
            labels.append(split[0].strip())
            texts.append(' '.join(split[1:]).strip())
    return texts, labels


# load the data
X, y = load_data()

# Text tokenization
# vectorizing text, turning each text into a sequence of integers
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X)
# let's dump it to a file, so we can use it in testing
pickle.dump(tokenizer, open("results/tokenizer.pickle", "wb"))

# convert to sequences of integers
X = tokenizer.texts_to_sequences(X)
print(X[0])
# convert to numpy arrays
X = np.array(X)
y = np.array(y)
# pad sequences at the beginning of each sequence with 0's
# for example, if SEQUENCE_LENGTH=4:
# [[5, 3, 2], [5, 1, 2, 3], [3, 4]]
# will be transformed to:
# [[0, 5, 3, 2], [5, 1, 2, 3], [0, 0, 3, 4]]
X = pad_sequences(X, maxlen=SEQUENCE_LENGTH)
print(X[0])
# One-hot encoding the labels
# [spam, ham, spam, ham, ham] will be converted to:
# [1, 0, 1, 0, 1] and then to:
# [[0, 1], [1, 0], [0, 1], [1, 0], [0, 1]]

y = [ label2int[label] for label in y ]
y = to_categorical(y)

print(y[0])

# split and shuffle
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=TEST_SIZE, random_state=7)

# constructs the model with 128 LSTM units
model = get_model(tokenizer=tokenizer, lstm_units=128)

# initialize our ModelCheckpoint and TensorBoard callbacks
# model checkpoint for saving best weights
model_checkpoint = ModelCheckpoint("results/spam_classifier_{val_loss:.2f}", save_best_only=True,
                                   verbose=1)
# for better visualization
tensorboard = TensorBoard(f"logs/spam_classifier_{time.time()}")
# print our data shapes
print("X_train.shape:", X_train.shape)
print("X_test.shape:", X_test.shape)
print("y_train.shape:", y_train.shape)
print("y_test.shape:", y_test.shape)
# train the model
model.fit(X_train, y_train, validation_data=(X_test, y_test),
          batch_size=BATCH_SIZE, epochs=EPOCHS,
          callbacks=[tensorboard, model_checkpoint],
          verbose=1)

# get the loss and metrics
result = model.evaluate(X_test, y_test)
# extract those
loss = result[0]
accuracy = result[1]
precision = result[2]
recall = result[3]

print(f"[+] Accuracy: {accuracy*100:.2f}%")
print(f"[+] Precision: {precision*100:.2f}%")
print(f"[+] Recall: {recall*100:.2f}%")
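
To make the padding and one-hot comments above concrete, here is a tiny, self-contained sketch on toy sentences (not the SMS dataset); the exact word indices depend on the Tokenizer, so treat the printed values as illustrative:

```python
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

# toy corpus, just to illustrate the preprocessing steps
texts = ["win a free prize now", "are we still on for lunch"]
labels = ["spam", "ham"]

tokenizer = Tokenizer()
tokenizer.fit_on_texts(texts)

# each text becomes a sequence of word indices
sequences = tokenizer.texts_to_sequences(texts)
print(sequences)

# pad on the left with zeros up to a fixed length (default padding='pre')
padded = pad_sequences(sequences, maxlen=8)
print(padded)

# labels -> integers -> one-hot vectors, same mapping as label2int in utils.py
label2int = {"ham": 0, "spam": 1}
y = to_categorical([label2int[label] for label in labels])
print(y)  # [[0., 1.], [1., 0.]]
```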
machine-learning/nlp/spam-classifier/test.py

Lines changed: 40 additions & 0 deletions
@@ -0,0 +1,40 @@
import os
# os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"  # see issue #152
# os.environ["CUDA_VISIBLE_DEVICES"] = "-1"

# import tensorflow as tf

# config = tf.ConfigProto(intra_op_parallelism_threads=5,
#                         inter_op_parallelism_threads=5,
#                         allow_soft_placement=True,
#                         device_count={'CPU': 1,
#                                       'GPU': 0}
#                         )
from utils import get_model, int2label, label2int
from keras.preprocessing.sequence import pad_sequences

import pickle
import numpy as np

SEQUENCE_LENGTH = 100

# get the tokenizer
tokenizer = pickle.load(open("results/tokenizer.pickle", "rb"))

model = get_model(tokenizer, 128)
# the weights file name depends on the best validation loss reached during training
model.load_weights("results/spam_classifier_0.05")

def get_predictions(text):
    sequence = tokenizer.texts_to_sequences([text])
    # pad the sequence
    sequence = pad_sequences(sequence, maxlen=SEQUENCE_LENGTH)
    # get the prediction
    prediction = model.predict(sequence)[0]
    # prediction is a one-hot encoded vector, revert it using np.argmax
    return int2label[np.argmax(prediction)]


while True:
    text = input("Enter the mail:")
    # classify the text and print the predicted label
    print(get_predictions(text))
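
If you want to classify several messages at once instead of using the interactive loop, a minimal batch sketch along the same lines should work (it assumes training has already produced `results/tokenizer.pickle` and a weights checkpoint; adjust the checkpoint name to whatever your run saved):

```python
# minimal batch-prediction sketch, reusing the helpers from utils.py
import pickle
import numpy as np
from keras.preprocessing.sequence import pad_sequences
from utils import get_model, int2label, SEQUENCE_LENGTH

tokenizer = pickle.load(open("results/tokenizer.pickle", "rb"))
model = get_model(tokenizer, 128)
model.load_weights("results/spam_classifier_0.05")  # adjust to your actual checkpoint

messages = [
    "Congratulations! You have won a free ticket, reply WIN to claim",
    "Hey, are we still meeting for lunch tomorrow?",
]
# vectorize and pad the whole batch at once
sequences = pad_sequences(tokenizer.texts_to_sequences(messages), maxlen=SEQUENCE_LENGTH)
predictions = model.predict(sequences)
for message, probs in zip(messages, predictions):
    print(int2label[np.argmax(probs)], "-", message)
```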
machine-learning/nlp/spam-classifier/utils.py

Lines changed: 61 additions & 0 deletions
@@ -0,0 +1,61 @@
import tqdm
import numpy as np
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding, LSTM, Dropout, Dense
from keras.models import Sequential
import keras_metrics

SEQUENCE_LENGTH = 100  # the length of all sequences (number of words per sample)
EMBEDDING_SIZE = 100   # using 100-dimensional GloVe embedding vectors
TEST_SIZE = 0.25       # ratio of the testing set

BATCH_SIZE = 64
EPOCHS = 20  # number of epochs

label2int = {"ham": 0, "spam": 1}
int2label = {0: "ham", 1: "spam"}

def get_embedding_vectors(tokenizer, dim=100):
    embedding_index = {}
    with open(f"data/glove.6B.{dim}d.txt", encoding='utf8') as f:
        for line in tqdm.tqdm(f, "Reading GloVe"):
            values = line.split()
            word = values[0]
            vectors = np.asarray(values[1:], dtype='float32')
            embedding_index[word] = vectors

    word_index = tokenizer.word_index
    # we do +1 because Tokenizer() starts indexing from 1
    embedding_matrix = np.zeros((len(word_index)+1, dim))
    for word, i in word_index.items():
        embedding_vector = embedding_index.get(word)
        if embedding_vector is not None:
            # words not found in GloVe will stay as all 0s
            embedding_matrix[i] = embedding_vector

    return embedding_matrix


def get_model(tokenizer, lstm_units):
    """
    Constructs the model:
    Embedding vectors => LSTM => 2 output fully-connected neurons with softmax activation
    """
    # get the GloVe embedding vectors
    embedding_matrix = get_embedding_vectors(tokenizer)
    model = Sequential()
    model.add(Embedding(len(tokenizer.word_index)+1,
                        EMBEDDING_SIZE,
                        weights=[embedding_matrix],
                        trainable=False,
                        input_length=SEQUENCE_LENGTH))

    model.add(LSTM(lstm_units, recurrent_dropout=0.2))
    model.add(Dropout(0.3))
    model.add(Dense(2, activation="softmax"))
    # compile with the rmsprop optimizer,
    # as well as precision and recall metrics
    model.compile(optimizer="rmsprop", loss="categorical_crossentropy",
                  metrics=["accuracy", keras_metrics.precision(), keras_metrics.recall()])
    model.summary()
    return model
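
As a quick sanity check of `get_embedding_vectors` and `get_model` above, a hedged sketch (it assumes `data/glove.6B.100d.txt` and `data/SMSSpamCollection` are in place, as described in the README):

```python
# sanity-check sketch for utils.py
from keras.preprocessing.text import Tokenizer
from utils import get_embedding_vectors, get_model, EMBEDDING_SIZE

# fit a tokenizer on the SMS texts (the label is the first tab-separated field)
texts = [line.split(maxsplit=1)[1] for line in open("data/SMSSpamCollection") if "\t" in line]
tokenizer = Tokenizer()
tokenizer.fit_on_texts(texts)

embedding_matrix = get_embedding_vectors(tokenizer)
# one row per word index plus the reserved index 0, EMBEDDING_SIZE columns each
print(embedding_matrix.shape)
assert embedding_matrix.shape == (len(tokenizer.word_index) + 1, EMBEDDING_SIZE)

# the Embedding layer is frozen (trainable=False), so the GloVe weights
# are not updated during training
model = get_model(tokenizer, lstm_units=128)
```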

0 commit comments
