import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import (Embedding, LSTM, Bidirectional, GRU, Conv1D,
                                     MaxPooling1D, Flatten, Dense, Dropout, SimpleRNN)
from tensorflow.keras.optimizers import Adam
from gensim.models import Word2Vec, KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec
from gensim.models.fasttext import FastText
import transformers
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Bidirectional, Dense, Embedding, Dropout
from torchtext.vocab import GloVe
!pip install sentence-transformers
from sentence_transformers import SentenceTransformer
import pandas as pd
import json
import re
import numpy as np
import spacy
import tqdm
import xgboost as xgb
import lightgbm as lgb
import nltk
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.pipeline import FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import (RandomForestClassifier, GradientBoostingClassifier,
                              AdaBoostClassifier)
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from xgboost import XGBClassifier
from sklearn.svm import SVC
from nltk import pos_tag, word_tokenize
from sklearn.pipeline import Pipeline
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer
# Download NLTK data
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')
nltk.download('wordnet')
# Function to read JSON lines file
def read_json_lines(file_path):
    data = []
    with open(file_path, 'r') as file:
        for line in file:
            data.append(json.loads(line))
    return data
# Load the datasets
file1_path = '/kaggle/input/sarcasm/Sarcasm_Headlines_Dataset.json'
file2_path = '/kaggle/input/sarcasm/Sarcasm_Headlines_Dataset_v2.json'
df1 = pd.read_json(file1_path, lines=True)
df2 = pd.read_json(file2_path, lines=True)
# Concatenate the datasets
df = pd.concat([df1, df2], ignore_index=True)
# Preprocessing
df.drop(columns=['article_link'], inplace=True) # Drop the 'article_link' column
df.dropna(inplace=True) # Drop any rows with missing values
df['headline'] = df['headline'].str.lower() # Convert text to lowercase
# Basic text preprocessing
stop_words = set(stopwords.words('english'))
def preprocess_text(text):
    # Lowercase
    text = text.lower()
    # Remove URLs
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    # Remove punctuation
    text = re.sub(r'[!"#$%&\'()*+,-./:;<=>?@[\]^_`{|}~]', '', text)
    # Tokenize
    tokens = word_tokenize(text)
    # Remove stopwords
    tokens = [word for word in tokens if word not in stop_words]
    # Rejoin tokens so the cleaned, stopword-free text is actually returned
    return ' '.join(tokens)
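# Quick sanity check of the cleaning step on a made-up headline (illustrative only):
# URLs and punctuation are stripped, the text is lowercased, and stopwords are dropped.
print(preprocess_text("Scientists Reveal: Coffee Cures Everything, Says Study https://example.com"))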
df['headline'] = df['headline'].apply(preprocess_text)
# Check for any missing values
print(df.isnull().sum())
# Display the first few rows after preprocessing
print("\nAfter Preprocessing:")
print(df.head())
# Prepare features and labels
X = df['headline']
y = df['is_sarcastic']
# Split before tokenization so the vocabulary is learned from the training data only
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                    random_state=42)
# Tokenization
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)
# Padding
maxlen = 100 # You can adjust this value
X_train_pad = pad_sequences(X_train_seq, maxlen=maxlen)
X_test_pad = pad_sequences(X_test_seq, maxlen=maxlen)
# Vocabulary size
vocab_size = len(tokenizer.word_index) + 1
# Inspect the split objects, then convert the pandas Series to plain Python lists
print(type(X_train))
print(type(X_test))
X_train = X_train.tolist()
X_test = X_test.tolist()
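# The padded sequences and vocab_size built above are not used again in this listing.
# Below is a minimal, illustrative sketch of how they could feed an Embedding-based
# Bi-LSTM classifier (the usual companion to the Keras Tokenizer / pad_sequences step).
# The layer sizes (64-dim embedding, 64 LSTM units) and the commented-out fit() call are
# assumptions for demonstration, not values from the original code; it reuses vocab_size,
# X_train_pad and y_train defined earlier.
seq_model = Sequential()
seq_model.add(Embedding(input_dim=vocab_size, output_dim=64))
seq_model.add(Bidirectional(LSTM(64)))
seq_model.add(Dense(1, activation='sigmoid'))  # binary output: sarcastic vs. not
seq_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# seq_model.fit(X_train_pad, np.array(y_train), epochs=5, batch_size=32, validation_split=0.2)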
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sentence_transformers import SentenceTransformer
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import (Dense, LSTM, Bidirectional, GRU, Conv1D,
                                     GlobalMaxPooling1D, Embedding, SimpleRNN)
from tensorflow.keras.utils import to_categorical
# Prepare data
X = df['headline'].values
y = df['is_sarcastic'].values
# Encode labels
le = LabelEncoder()
y = le.fit_transform(y)
y = to_categorical(y)
# Split the dataset into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                    random_state=42)
# Initialize Sentence Transformer model
sbert_model = SentenceTransformer('bert-base-nli-mean-tokens')
# Generate sentence embeddings
X_train_embeddings = sbert_model.encode(X_train)
X_test_embeddings = sbert_model.encode(X_test)
# The recurrent/convolutional models below expect 3-D input (samples, timesteps, features),
# so add a trailing feature axis to the 2-D (samples, embedding_dim) arrays
X_train_embeddings = np.expand_dims(X_train_embeddings, axis=-1)
X_test_embeddings = np.expand_dims(X_test_embeddings, axis=-1)
# Define model architectures using sentence embeddings
def create_lstm_model(input_shape):
    model = Sequential()
    model.add(LSTM(128, input_shape=input_shape, return_sequences=True))
    model.add(LSTM(128))
    model.add(Dense(2, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam',
                  metrics=['accuracy'])
    return model
def create_bilstm_model(input_shape):
    model = Sequential()
    model.add(Bidirectional(LSTM(128, return_sequences=True),
                            input_shape=input_shape))
    model.add(Bidirectional(LSTM(128)))
    model.add(Dense(2, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam',
                  metrics=['accuracy'])
    return model
def create_gru_model(input_shape):
    model = Sequential()
    model.add(GRU(128, return_sequences=True, input_shape=input_shape))
    model.add(GRU(128))
    model.add(Dense(2, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam',
                  metrics=['accuracy'])
    return model
def create_bigru_model(input_shape):
    model = Sequential()
    model.add(Bidirectional(GRU(128, return_sequences=True),
                            input_shape=input_shape))
    model.add(Bidirectional(GRU(128)))
    model.add(Dense(2, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam',
                  metrics=['accuracy'])
    return model
def create_cnn_model(input_shape):
    model = Sequential()
    model.add(Conv1D(128, 5, activation='relu', input_shape=input_shape))
    model.add(GlobalMaxPooling1D())
    model.add(Dense(2, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam',
                  metrics=['accuracy'])
    return model
def create_rnn_model(input_shape):
    model = Sequential()
    model.add(SimpleRNN(128, return_sequences=True, input_shape=input_shape))
    model.add(SimpleRNN(128))
    model.add(Dense(2, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam',
                  metrics=['accuracy'])
    return model
# Train and evaluate models
def train_and_evaluate_model(model, X_train, y_train, X_test, y_test):
    model.fit(X_train, y_train, epochs=100, batch_size=32, validation_split=0.2)
    y_pred = model.predict(X_test)
    y_pred_classes = np.argmax(y_pred, axis=1)
    y_test_classes = np.argmax(y_test, axis=1)
    print(classification_report(y_test_classes, y_pred_classes))
    cm = confusion_matrix(y_test_classes, y_pred_classes)
    print("Confusion Matrix:\n", cm)
    return model
# Create a dictionary of models
models = {
    "LSTM": create_lstm_model((X_train_embeddings.shape[1], 1)),
    "Bi-LSTM": create_bilstm_model((X_train_embeddings.shape[1], 1)),
    "GRU": create_gru_model((X_train_embeddings.shape[1], 1)),
    "Bi-GRU": create_bigru_model((X_train_embeddings.shape[1], 1)),
    "CNN": create_cnn_model((X_train_embeddings.shape[1], 1)),
    "RNN": create_rnn_model((X_train_embeddings.shape[1], 1))
}
# Train and evaluate each model
results = {}
for name, model in models.items():
    print(f"Training {name}...")
    trained_model = train_and_evaluate_model(model, X_train_embeddings, y_train,
                                             X_test_embeddings, y_test)
    results[name] = trained_model
# Print results
for name, result in results.items():
    print(f"{name} model trained and evaluated.")