PROSES PREPROCESSING
import pandas as pd

pd.set_option('display.max_columns', None)   # show all columns
pd.set_option('display.max_colwidth', None)  # show full column contents without truncation

# Load the raw tweet dataset.
data = pd.read_excel("dana_desa.xlsx")
data.head()

# Data cleaning: drop the columns that are not used downstream.
data = data.drop(columns=['Datetime', 'Tweet_Id', 'Username', 'label', 'kata_kunci'])
data

# Replace missing tweet text with a placeholder. Assignment instead of
# `data['Text'].fillna(..., inplace=True)` — inplace on a column slice is a
# chained-assignment pattern that pandas deprecates and may silently ignore.
data['Text'] = data['Text'].fillna('test')
data.head()
import re # Mengimpor modul re
# membuat fungsi untuk data cleaning
def datacleaning(text):
    """Clean a raw tweet: strip URLs, mentions, hashtags, retweet markers,
    symbols, digits and surrounding whitespace.

    Args:
        text: raw tweet string.

    Returns:
        The cleaned string.
    """
    # Remove URLs FIRST: the original stripped symbols before links, which
    # destroyed the ':' and '/' characters inside "https://..." and could
    # leave URL fragments glued to neighbouring words.
    text = re.sub(r'http\S+', '', text)            # remove links
    text = re.sub(r'@[A-Za-z0-9_]+', '', text)     # remove mentions (handles may contain '_')
    text = re.sub(r'#[A-Za-z0-9_]+', '', text)     # remove hashtags
    text = re.sub(r'\bRT\b\s*', '', text)          # remove retweet marker (whole word only,
                                                   # so e.g. "START " is not mangled)
    # Same symbol set as the original class, with '-' placed last so no
    # accidental character range is formed.
    text = re.sub(r'[?$.@#%^/&*=!_:")(+,*|-]', '', text)  # remove symbols
    text = re.sub(r'[0-9]+', '', text)             # remove digits
    text = text.replace('\n', ' ')                 # newlines -> spaces
    return text.strip()                            # trim surrounding whitespace
# Apply the cleaning function to every tweet.
data['Text'] = data['Text'].apply(datacleaning)
data
# case folding
def CaseFolding(text):
    """Case folding: return *text* with every character lower-cased."""
    return text.lower()
# Lower-case every tweet.
data['Text'] = data['Text'].apply(CaseFolding)
data
# Normalisasi
# Normalization: map slang/abbreviations to their standard form using a
# lookup table with columns 'singkat' (abbreviation) -> 'hasil' (expansion).
key_norm = pd.read_csv('key_norm.csv')

# Build the lookup dict once. The original scanned the whole DataFrame with a
# boolean mask for EVERY word of EVERY tweet (O(rows) per word); a dict makes
# each lookup O(1).
_norm_map = dict(zip(key_norm['singkat'], key_norm['hasil']))

def WordNormalization(text):
    """Replace each abbreviated word by its normalized form, then lower-case.

    Args:
        text: a whitespace-separated string of words.

    Returns:
        The normalized, lower-cased string.
    """
    normalized = ' '.join(_norm_map.get(word, word) for word in text.split())
    return normalized.lower()

data['Text'] = data['Text'].apply(WordNormalization)
data
# Tokenizing
# Download the Punkt tokenizer models required by word_tokenize.
import nltk
nltk.download('punkt')
# NOTE(review): duplicate `import nltk` — harmless but redundant.
import nltk
from nltk.tokenize import word_tokenize
def Tokenizing(text):
    """Tokenize *text* into a list of word tokens via NLTK's word_tokenize."""
    return word_tokenize(text)
# Tokenize every tweet into a list of words.
data['Text'] = data['Text'].apply(Tokenizing)
data
# Stopword
import nltk
# BUG FIX: the NLTK corpus is named 'stopwords' (plural); downloading
# 'stopword' raises a DownloadError and stopwords.words() then fails.
nltk.download('stopwords')
import nltk
from nltk.corpus import stopwords

# Indonesian stopword list used by Filtering() below; a set gives O(1) lookups.
stop_words = set(stopwords.words('indonesian'))
def Filtering(text, stopword_set=None):
    """Remove stopwords from a token list and rejoin it into a string.

    Args:
        text: iterable of word tokens.
        stopword_set: optional set of stopwords to filter out; defaults to
            the module-level Indonesian `stop_words` set (backward compatible
            with the original single-argument call).

    Returns:
        Space-joined string of the tokens that are not stopwords.
    """
    if stopword_set is None:
        stopword_set = stop_words
    # Generator expression replaces the manual accumulate-and-append loop.
    return " ".join(word for word in text if word not in stopword_set)
# Remove stopwords from every tweet (tokens are rejoined into strings).
data['Text'] = data['Text'].apply(Filtering)
data  # BUG FIX: was `Data` — Python is case-sensitive, so that raised NameError
# Stemming
# NOTE: these are shell commands, not Python statements — a bare `pip install`
# line is a SyntaxError in a .py file. Run them in a terminal (or prefix with
# "!" in a notebook). Kept as comments so this script parses.
# pip install Sastrawi
# pip install --upgrade pip setuptools

from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

# Sastrawi stemmer for Indonesian; created once and reused for every tweet.
factory = StemmerFactory()
stemmer = factory.create_stemmer()

def Stemming(text):
    """Stem the Indonesian words of *text* to their root form (Sastrawi)."""
    return stemmer.stem(text)

data['Text'] = data['Text'].apply(Stemming)
data

# Persist the fully preprocessed dataset for the labeling stage.
data.to_excel('danadesa_dc.xlsx', index=False)
PROSES PELABELAN (LEXICON BASE)
import pandas as pd

# Load the preprocessed dataset produced by the preprocessing stage.
data = pd.read_excel("danadesa_dc.xlsx")
data

# Replace missing text with a placeholder. Assignment instead of the
# deprecated chained `fillna(..., inplace=True)` on a column slice.
data['Text'] = data['Text'].fillna('test')
data.head()
import csv

def _load_lexicon(path):
    """Read a two-column CSV (word, integer weight) into a dict.

    Args:
        path: path to the lexicon CSV file (no header row expected).

    Returns:
        dict mapping word -> int weight.
    """
    lexicon = {}
    with open(path, 'r') as csvfile:
        for row in csv.reader(csvfile, delimiter=','):
            lexicon[row[0]] = int(row[1])
    return lexicon

# The positive and negative loaders were duplicated verbatim; one helper
# loads both lexicons.
lexicon_positive = _load_lexicon('lexicon_positive_ver1.csv')
lexicon_negative = _load_lexicon('lexicon_negative_ver1.csv')
def sentiment_analysis_lexicon_indonesia(text, positive_lexicon=None,
                                         negative_lexicon=None):
    """Score a token list against positive/negative sentiment lexicons.

    Args:
        text: iterable of word tokens.
        positive_lexicon: optional dict word -> positive weight; defaults to
            the module-level `lexicon_positive` (backward compatible).
        negative_lexicon: optional dict word -> negative weight; defaults to
            the module-level `lexicon_negative`.

    Returns:
        (score, sentiment) where sentiment is 'positif' when score > 0,
        'negatif' when score < 0, and 'netral' otherwise.
    """
    if positive_lexicon is None:
        positive_lexicon = lexicon_positive
    if negative_lexicon is None:
        negative_lexicon = lexicon_negative
    score = 0
    # Single pass over the tokens instead of two; dict.get avoids the
    # membership test followed by a second lookup.
    for word in text:
        score += positive_lexicon.get(word, 0)
        score += negative_lexicon.get(word, 0)
    if score > 0:
        sentiment = 'positif'
    elif score < 0:
        sentiment = 'negatif'
    else:
        sentiment = 'netral'
    return score, sentiment
# Split each tweet back into tokens for the lexicon lookup.
data['Text'] = data.Text.str.split()
data
# Score every tweet; `results` is a Series of (score, sentiment) tuples.
results = data['Text'].apply(sentiment_analysis_lexicon_indonesia)
results = list(zip(*results))  # transpose into ([scores...], [sentiments...])
data['score'] = results[0]
data['Sentimen'] = results[1]
print(data['Sentimen'].value_counts())
data
def convert_tokens_to_text(tokens):
    """Join a token list back into a single space-separated string."""
    return ' '.join(tokens)
# Rejoin tokens into plain strings for saving.
data['Text'] = data['Text'].apply(convert_tokens_to_text)
print(data)
# Drop the numeric score column; only the sentiment label is kept.
data = data.drop(columns=['score'])
data
import matplotlib.pyplot as plt
import seaborn as sns

# Plot the class distribution. countplot is the dedicated seaborn plot for
# categorical frequencies; histplot on a categorical axis works but obscures
# the intent. Plot explicitly on the created axes instead of relying on the
# implicit current axes.
fig, axes = plt.subplots()
sns.countplot(data=data, x='Sentimen', color='skyblue', ax=axes)
plt.show()

# Persist the labelled dataset for the modeling stages.
data.to_excel('danadesa_lexicon.xlsx', index=False)
Pemodelan Menggunakan Metode C4.5
import pandas as pd

pd.set_option('display.max_columns', None)   # show all columns
pd.set_option('display.max_colwidth', None)  # show full column contents without truncation

data = pd.read_excel("danadesa_lexicon.xlsx")
data.head()

# Separate the feature column and the target column.
X = data['Text']
y = data['Sentimen']

from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF weighting of the tweet texts.
tf_idf = TfidfVectorizer()
tf_idf.fit(X)
X_tf_idf = tf_idf.transform(X).toarray()
data_tf_idf = pd.DataFrame(X_tf_idf, columns=tf_idf.get_feature_names_out())
data_tf_idf

from sklearn.model_selection import train_test_split

# 70/30 train-test split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X_tf_idf, y, test_size=0.3,
                                                    random_state=37)
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report, confusion_matrix
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# C4.5-style decision tree: entropy criterion (information gain), depth capped at 3.
dc_classifier = DecisionTreeClassifier(max_depth=3, criterion="entropy")
dc_classifier.fit(X_train, y_train)
y_pred = dc_classifier.predict(X_test)
# Confusion matrix heatmap: rows = actual labels, columns = predictions.
cm = confusion_matrix(y_test, y_pred)
label_names = np.unique(y)
plt.figure()
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=label_names,
yticklabels=label_names)
plt.xlabel('Prediksi')
plt.ylabel('Aktual')
plt.title('Confusion Matrix')
plt.show()
print("Accuracy Score untuk C4.5 Model :: ", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred, zero_division=0))
Pemodelan SVM
import pandas as pd

pd.set_option('display.max_columns', None)   # show all columns
pd.set_option('display.max_colwidth', None)  # show full column contents without truncation

data = pd.read_excel("danadesa_lexicon.xlsx")
data.head()

# Separate the feature column and the target column.
X = data['Text']
y = data['Sentimen']

from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF weighting of the tweet texts.
tf_idf = TfidfVectorizer()
tf_idf.fit(X)
X_tf_idf = tf_idf.transform(X).toarray()
data_tf_idf = pd.DataFrame(X_tf_idf, columns=tf_idf.get_feature_names_out())
data_tf_idf

from sklearn.model_selection import train_test_split

# 70/30 train-test split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X_tf_idf, y, test_size=0.3,
                                                    random_state=37)
from sklearn import svm
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report, confusion_matrix
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Named `svm_classifier` so the `sklearn.svm` module is not shadowed
# (original did `svm = svm.SVC(...)`, making the module unreachable afterwards).
svm_classifier = svm.SVC(class_weight=None, C=1, gamma=0.1, kernel='linear',
                         random_state=100, probability=True)
svm_classifier.fit(X_train, y_train)
svm_pred = svm_classifier.predict(X_test)

# Confusion matrix heatmap: rows = actual labels, columns = predictions.
cm = confusion_matrix(y_test, svm_pred)
label_names = np.unique(y)
plt.figure()
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=label_names, yticklabels=label_names)
plt.xlabel('Prediksi')
plt.ylabel('Aktual')
plt.title('Confusion Matrix')
plt.show()

print("Accuracy Score untuk Support Vector Machine Model :: ",
      accuracy_score(y_test, svm_pred))
print(classification_report(y_test, svm_pred, zero_division=0))
Pemodelan KNN
import pandas as pd

pd.set_option('display.max_columns', None)   # show all columns
pd.set_option('display.max_colwidth', None)  # show full column contents without truncation

# NOTE(review): the original read "daring_lexicon.xlsx" and used column
# 'Mention', which does not match the rest of this pipeline (it produces
# danadesa_lexicon.xlsx with a 'Text' column). Presumed copy-paste slip from
# another project — confirm against the actual data files.
data = pd.read_excel("danadesa_lexicon.xlsx")
data.head()

# Separate the feature column and the target column.
X = data['Text']
y = data['Sentimen']

from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF weighting of the tweet texts.
tf_idf = TfidfVectorizer()
tf_idf.fit(X)
X_tf_idf = tf_idf.transform(X).toarray()
data_tf_idf = pd.DataFrame(X_tf_idf, columns=tf_idf.get_feature_names_out())
data_tf_idf

from sklearn.model_selection import train_test_split

# 70/30 train-test split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X_tf_idf, y, test_size=0.3,
                                                    random_state=37)
from sklearn.neighbors import KNeighborsClassifier
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report, confusion_matrix
import numpy as np
import seaborn as sns

# Try k = 1..19 and record the test-set error for each value.
# (Loop indentation restored — it was lost in extraction, which made this
# section syntactically invalid.)
errors = []
for k in range(1, 20):
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train, y_train)
    errors.append(1 - knn.score(X_test, y_test))

# Plot the elbow curve.
plt.plot(range(1, 20), errors, marker='o')
plt.xlabel('Jumlah tetangga (k)')
plt.ylabel('Error')
plt.title('Elbow Method')
plt.show()

# Choose the k with the lowest test error (first one wins on ties).
best_k = errors.index(min(errors)) + 1
print("Nilai k terbaik: ", best_k)

# Retrain KNN with the chosen k and evaluate on the test set.
knn = KNeighborsClassifier(n_neighbors=best_k)
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)

# Confusion matrix heatmap: rows = actual labels, columns = predictions.
cm = confusion_matrix(y_test, y_pred)
label_names = np.unique(y)
plt.figure()
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=label_names, yticklabels=label_names)
plt.xlabel('Prediksi')
plt.ylabel('Aktual')
plt.title('Confusion Matrix')
plt.show()

print(classification_report(y_test, y_pred, zero_division=0))
print("\nKNN :", accuracy_score(y_test, y_pred))
print('-------------------------------------------')
Pemodelan Random Forest
import pandas as pd

pd.set_option('display.max_columns', None)   # show all columns
pd.set_option('display.max_colwidth', None)  # show full column contents without truncation

# NOTE(review): the original read "daring_lexicon.xlsx" and used column
# 'Mention', inconsistent with the pipeline's danadesa_lexicon.xlsx / 'Text'.
# Presumed copy-paste slip — confirm against the actual data files.
data = pd.read_excel("danadesa_lexicon.xlsx")
data.head()

# Separate the feature column and the target column.
X = data['Text']
y = data['Sentimen']

from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF weighting of the tweet texts.
tf_idf = TfidfVectorizer()
tf_idf.fit(X)
X_tf_idf = tf_idf.transform(X).toarray()
data_tf_idf = pd.DataFrame(X_tf_idf, columns=tf_idf.get_feature_names_out())
data_tf_idf

from sklearn.model_selection import train_test_split

# 70/30 train-test split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X_tf_idf, y, test_size=0.3,
                                                    random_state=37)
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report, confusion_matrix
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Random forest with sklearn defaults (100 trees, unlimited depth).
rf_classifier = RandomForestClassifier()
rf_classifier.fit(X_train, y_train)
y_pred = rf_classifier.predict(X_test)
# Confusion matrix heatmap: rows = actual labels, columns = predictions.
cm = confusion_matrix(y_test, y_pred)
label_names = np.unique(y)
plt.figure()
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=label_names,
yticklabels=label_names)
plt.xlabel('Prediksi')
plt.ylabel('Aktual')
plt.title('Confusion Matrix')
plt.show()
print("Accuracy Score untuk RandomForestClassifier Model :: ",
accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred, zero_division=0))
Pemodelan Naive Bayes