sovanc
diff --git a/‎machine-learning/nlp/spam-classifier/README.md
Lines changed: 12 additions & 0 deletions b/‎machine-learning/nlp/spam-classifier/README.md
Lines changed: 12 additions & 0 deletions
diff --git a/‎machine-learning/nlp/spam-classifier/data/SMSSpamCollection
Lines changed: 5574 additions & 0 deletions b/‎machine-learning/nlp/spam-classifier/data/SMSSpamCollection
Lines changed: 5574 additions & 0 deletions
diff --git a/‎machine-learning/nlp/spam-classifier/logs/spam_classifier_1566057536.51311/events.out.tfevents.1566057541.DESKTOP-JCAH48A
1.31 MB b/‎machine-learning/nlp/spam-classifier/logs/spam_classifier_1566057536.51311/events.out.tfevents.1566057541.DESKTOP-JCAH48A
1.31 MB
diff --git a/‎machine-learning/nlp/spam-classifier/requirements.txt
Lines changed: 5 additions & 0 deletions b/‎machine-learning/nlp/spam-classifier/requirements.txt
Lines changed: 5 additions & 0 deletions
diff --git a/‎machine-learning/nlp/spam-classifier/results/spam_classifier_0.05
4.36 MB b/‎machine-learning/nlp/spam-classifier/results/spam_classifier_0.05
4.36 MB
diff --git a/‎machine-learning/nlp/spam-classifier/results/spam_classifier_0.06
4.36 MB b/‎machine-learning/nlp/spam-classifier/results/spam_classifier_0.06
4.36 MB
diff --git a/‎machine-learning/nlp/spam-classifier/results/spam_classifier_0.08
4.36 MB b/‎machine-learning/nlp/spam-classifier/results/spam_classifier_0.08
4.36 MB
diff --git a/‎machine-learning/nlp/spam-classifier/results/spam_classifier_0.10
4.36 MB b/‎machine-learning/nlp/spam-classifier/results/spam_classifier_0.10
4.36 MB
diff --git a/‎machine-learning/nlp/spam-classifier/results/tokenizer.pickle
404 KB b/‎machine-learning/nlp/spam-classifier/results/tokenizer.pickle
404 KB
diff --git a/‎machine-learning/nlp/spam-classifier/spam_classifier.py
Lines changed: 111 additions & 0 deletions b/‎machine-learning/nlp/spam-classifier/spam_classifier.py
Lines changed: 111 additions & 0 deletions
@@ -0,0 +1,12 @@
+# [How to Build a Spam Classifier using Keras in Python](https://www.thepythoncode.com/article/build-spam-classifier-keras-python)
+To run this:
+- `pip3 install -r requirements.txt`
+- For training, since we're using transfer learning, you first need to download and extract [GloVe](http://nlp.stanford.edu/data/glove.6B.zip) and put it in the `data` folder. These are pre-trained embedding vectors that map each word to a vector; words with similar meanings tend to have vectors that are close together. Then run:
+    ```
+    python3 spam_classifier.py
+    ```
+    This will write TensorFlow logs to the `logs` folder and save the model and tokenizer to `results`, so that `test.py` can use them.
+- After training has finished, try testing your own emails, or adapt the code to your needs:
+    ```
+    python3 test.py
+    ```
@@ -0,0 +1,5 @@
+sklearn
+keras
+tqdm
+numpy
+keras_metrics
@@ -0,0 +1,111 @@
+# to force CPU-only execution, uncomment the code below
+# import os
+# os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"   # see issue #152
+# os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
+
+# import tensorflow as tf
+
+# config = tf.ConfigProto(intra_op_parallelism_threads=5,
+#                         inter_op_parallelism_threads=5, 
+#                         allow_soft_placement=True,
+#                         device_count = {'CPU' : 1,
+#                                         'GPU' : 0}
+#                        )
+
+
+from keras.preprocessing.text import Tokenizer
+from keras.preprocessing.sequence import pad_sequences
+from keras.utils import to_categorical
+from keras.callbacks import ModelCheckpoint, TensorBoard
+from sklearn.model_selection import train_test_split
+import time
+import numpy as np
+import pickle
+
+from utils import get_embedding_vectors, get_model, SEQUENCE_LENGTH, EMBEDDING_SIZE, TEST_SIZE
+from utils import BATCH_SIZE, EPOCHS, int2label, label2int
+
+
+def load_data():
+    """
+    Loads SMS Spam Collection dataset
+
+    Reads "data/SMSSpamCollection" line by line. The first
+    whitespace-separated token of each line is taken as the label and the
+    remaining tokens, re-joined with single spaces, as the message text.
+    Blank lines are skipped.
+
+    Returns:
+        A `(texts, labels)` tuple of two equally long lists of strings.
+    """
+    texts, labels = [], []
+    # explicit encoding: the platform default (e.g. cp1252 on Windows) is not
+    # guaranteed to decode the non-ASCII characters present in the messages
+    with open("data/SMSSpamCollection", encoding="utf-8") as f:
+        for line in f:
+            tokens = line.split()
+            if not tokens:
+                # blank line: there is no label to read, so skip it instead
+                # of failing with IndexError
+                continue
+            label, *words = tokens
+            labels.append(label)
+            texts.append(' '.join(words))
+    return texts, labels
+
+    
+# load the data
+X, y = load_data()
+
+# Text tokenization
+# vectorizing text, turning each text into sequence of integers
+tokenizer = Tokenizer()
+tokenizer.fit_on_texts(X)
+# dump the fitted tokenizer to a file, so we can use it in testing;
+# the context manager makes sure the file is flushed and closed
+with open("results/tokenizer.pickle", "wb") as tokenizer_file:
+    pickle.dump(tokenizer, tokenizer_file)
+
+# convert to sequence of integers
+X = tokenizer.texts_to_sequences(X)
+print(X[0])
+# NOTE: X is a ragged list of lists at this point (one variable-length
+# sequence per message). It is passed to pad_sequences as-is: wrapping it in
+# np.array() first is unnecessary and raises ValueError on recent NumPy
+# versions, which refuse to build arrays from ragged nested sequences.
+# pad sequences at the beginning of each sequence with 0's
+# for example if SEQUENCE_LENGTH=4:
+# [[5, 3, 2], [5, 1, 2, 3], [3, 4]]
+# will be transformed to:
+# [[0, 5, 3, 2], [5, 1, 2, 3], [0, 0, 3, 4]]
+X = pad_sequences(X, maxlen=SEQUENCE_LENGTH)
+print(X[0])
+# One Hot encoding labels
+# assuming label2int (defined in utils) maps ham -> 0 and spam -> 1,
+# [spam, ham, spam, ham, ham] will be converted to:
+# [1, 0, 1, 0, 0] and then to:
+# [[0, 1], [1, 0], [0, 1], [1, 0], [1, 0]]
+
+y = [label2int[label] for label in y]
+y = to_categorical(y)
+
+print(y[0])
+
+# split and shuffle; the fixed random_state keeps the split reproducible
+X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=TEST_SIZE, random_state=7)
+
+# constructs the model with 128 LSTM units
+model = get_model(tokenizer=tokenizer, lstm_units=128)
+
+# initialize our ModelCheckpoint and TensorBoard callbacks
+# model checkpoint for saving best weights; the validation loss is part of
+# the file name, so each improvement is written to its own file
+model_checkpoint = ModelCheckpoint("results/spam_classifier_{val_loss:.2f}", save_best_only=True,
+                                    verbose=1)
+# for better visualization; the timestamp gives every run its own log folder
+tensorboard = TensorBoard(f"logs/spam_classifier_{time.time()}")
+# print our data shapes
+print("X_train.shape:", X_train.shape)
+print("X_test.shape:", X_test.shape)
+print("y_train.shape:", y_train.shape)
+print("y_test.shape:", y_test.shape)
+# train the model
+model.fit(X_train, y_train, validation_data=(X_test, y_test),
+          batch_size=BATCH_SIZE, epochs=EPOCHS,
+          callbacks=[tensorboard, model_checkpoint],
+          verbose=1)
+
+# get the loss and metrics
+result = model.evaluate(X_test, y_test)
+# extract those
+# NOTE(review): this order (loss, accuracy, precision, recall) presumably
+# mirrors the metrics the model is compiled with in utils.get_model — confirm
+loss = result[0]
+accuracy = result[1]
+precision = result[2]
+recall = result[3]
+
+print(f"[+] Accuracy: {accuracy*100:.2f}%")
+print(f"[+] Precision:   {precision*100:.2f}%")
+print(f"[+] Recall:   {recall*100:.2f}%")
+
+
+
-Original file line number
+Diff line change
@@ @@ -0,0 +1,5 @@ @@
 +sklearn
 +keras
 +tqdm
 +numpy
 +keras_metrics