1.
pip install google-play-scraper
2.
from google_play_scraper import Sort, reviews
# Scrape Play Store reviews for Free Fire (Indonesian locale).
# The first call returns up to `count` reviews plus a continuation token
# for fetching the next page.
result, continuation_token = reviews(
    'com.dts.freefireth',
    lang='id',                # defaults to 'en'
    country='id',             # defaults to 'us'
    sort=Sort.MOST_RELEVANT,  # defaults to Sort.MOST_RELEVANT
    count=4000,               # defaults to 100
)
# BUG FIX: the original rebound `result` here, throwing away the 4000
# reviews fetched above and keeping only the next page (default count
# of 100). Fetch the next page and append it instead.
more_reviews, _ = reviews(
    'com.dts.freefireth',
    continuation_token=continuation_token  # defaults to None (load from the beginning)
)
result.extend(more_reviews)
print(result)
3.
import pandas as pd

# Convert the scraped review dicts into a DataFrame and persist them.
df = pd.DataFrame(result)
# FIX: index=False keeps the row index out of the file; otherwise
# re-reading the CSV later produces a spurious "Unnamed: 0" column.
df.to_csv("D:/TestData11.CSV", index=False)
4.
pip install nltk
5.
import nltk

# FIX: nltk.download() with no arguments opens the interactive
# downloader, which blocks a script. Download only the data this
# pipeline actually uses.
nltk.download('punkt')      # tokenizer models used by word_tokenize
nltk.download('stopwords')  # stopword lists (Indonesian is used later)
6.
pip install Sastrawi
7.
pip install numpy
8.
import pandas as pd
import numpy as np

# Load the previously exported review data for preprocessing.
TWEET_DATA = pd.read_csv("D:/data_ff.csv")
# Peek at the first rows (the value only displays in a notebook/REPL).
TWEET_DATA.head()
9.
TWEET_DATA.to_csv("D:/data_ff.csv")
10.
# ------ Case Folding --------
# Lower-case every review so later token matching is case-insensitive
# (Series.str.lower leaves non-string/NaN entries as NaN).
lowercased = TWEET_DATA['content'].str.lower()
TWEET_DATA['content'] = lowercased
print('Case Folding Result : \n')
print(TWEET_DATA['content'].head(5))
print('\n\n\n')
11.
import string
import re #regex library
# import word_tokenize & FreqDist from NLTK
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist
# ------ Tokenizing ---------
def remove_tweet_special(text):
    """Strip tweet-specific noise: escape sequences, non-ASCII
    characters, mentions, hashtags, and URLs."""
    # Remove literal escape sequences (backslash-t, backslash-n,
    # backslash-u) left in the scraped text, then stray backslashes.
    # NOTE(review): these target the two-character sequences "\t", "\n",
    # "\u" — not real tab/newline characters; confirm the scraped data
    # actually contains them in escaped form.
    text = text.replace('\\t', " ").replace('\\n', " ").replace('\\u', " ").replace('\\', "")
    # Remove non-ASCII (emoticons, CJK, etc.); each such character is
    # replaced with '?' by the 'replace' error handler.
    text = text.encode('ascii', 'replace').decode('ascii')
    # Remove mentions, hashtags, and scheme://... links.
    # FIX: raw string, so \w and \S are regex escapes rather than
    # invalid Python escape sequences (SyntaxWarning on modern Python).
    text = ' '.join(re.sub(r"([@#][A-Za-z0-9]+)|(\w+:\/\/\S+)", " ", text).split())
    # Remove incomplete URL fragments the regex may have missed.
    return text.replace("http://", " ").replace("https://", " ")
TWEET_DATA['content'] = TWEET_DATA['content'].apply(remove_tweet_special)
# Strip punctuation characters.
def remove_punctuation(text):
    """Delete every ASCII punctuation character from *text*."""
    table = str.maketrans("", "", string.punctuation)
    return text.translate(table)
TWEET_DATA['content'] = TWEET_DATA['content'].apply(remove_punctuation)
# Trim whitespace from both ends of a review.
def remove_whitespace_LT(text):
    """Return *text* without leading/trailing whitespace."""
    trimmed = text.strip()
    return trimmed
TWEET_DATA['content'] = TWEET_DATA['content'].apply(remove_whitespace_LT)
# Collapse runs of whitespace into a single space.
def remove_whitespace_multiple(text):
    """Replace every whitespace run in *text* with one space."""
    # FIX: raw string — '\s' in a plain literal is an invalid Python
    # escape sequence (SyntaxWarning on modern Python).
    return re.sub(r'\s+', ' ', text)
TWEET_DATA['content'] = TWEET_DATA['content'].apply(remove_whitespace_multiple)
# Drop stand-alone single letters left over from earlier cleaning.
def remove_singl_char(text):
    """Remove every isolated single ASCII letter from *text*."""
    single_letter = re.compile(r"\b[a-zA-Z]\b")
    return single_letter.sub("", text)
TWEET_DATA['content'] = TWEET_DATA['content'].apply(remove_singl_char)
# NLTK word tokenize
def word_tokenize_wrapper(text):
    """Split *text* into tokens with NLTK's word_tokenize."""
    return word_tokenize(text)
# Tokenize every cleaned review into a new column.
TWEET_DATA['content_tokens'] = TWEET_DATA['content'].map(word_tokenize_wrapper)
print('Tokenizing Result : \n')
print(TWEET_DATA['content_tokens'].head())
print('\n\n\n')
11b.
# NLTK frequency distribution per review.
def freqDist_wrapper(text):
    """Return an NLTK FreqDist over a list of tokens."""
    return FreqDist(text)

# BUG FIX: the source split this assignment across two lines without a
# continuation, which is a SyntaxError in Python.
TWEET_DATA['content_tokens_fdist'] = TWEET_DATA['content_tokens'].apply(freqDist_wrapper)
print('Frequency Tokens : \n')
print(TWEET_DATA['content_tokens_fdist'].head().apply(lambda x : x.most_common()))
12.
from nltk.corpus import stopwords
# ----------------------- get stopword from NLTK stopword -------------------------------
# get stopword indonesia
# ----------------------- get stopword from NLTK stopword ----------------
# Indonesian stopword list shipped with NLTK.
list_stopwords = stopwords.words('indonesian')

# ---------------------------- manually add stopwords --------------------
# Informal/slang tokens common in Indonesian app reviews.
list_stopwords.extend(["yg", "dg", "rt", "dgn", "ny", "d", 'klo',
                       'kalo', 'amp', 'biar', 'bikin', 'bilang',
                       'gak', 'ga', 'krn', 'nya', 'nih', 'sih',
                       'si', 'tau', 'tdk', 'tuh', 'utk', 'ya',
                       'jd', 'jgn', 'sdh', 'aja', 'n', 't',
                       'nyg', 'hehe', 'pen', 'u', 'nan', 'loh', 'rt',
                       '&', 'yah'])

# ----------------------- add stopwords from txt file --------------------
txt_stopword = pd.read_csv("D:/stopwords.txt", names=["stopwords"], header=None)
# BUG FIX: the original read only the FIRST row of the file
# (txt_stopword["stopwords"][0]); take every row, splitting any
# space-separated entries.
for entry in txt_stopword["stopwords"]:
    list_stopwords.extend(str(entry).split(' '))

# Convert to a set for O(1) membership tests during removal.
list_stopwords = set(list_stopwords)

def stopwords_removal(words):
    """Drop every token of *words* that is in the stopword set."""
    return [word for word in words if word not in list_stopwords]

# BUG FIX: rejoined an assignment the source split across two lines
# (a SyntaxError as pasted).
TWEET_DATA['content_tokens_WSW'] = TWEET_DATA['content_tokens'].apply(stopwords_removal)
print(TWEET_DATA['content_tokens_WSW'].head())
13.
# Load the slang -> standard word mapping (column 0: slang form,
# column 1: normalized form — TODO confirm against the spreadsheet).
normalizad_word = pd.read_excel("D:/normalisasi.xlsx")

normalizad_word_dict = {}
for index, row in normalizad_word.iterrows():
    # FIX: .iloc avoids the deprecated positional row[0] lookup on a
    # Series; first occurrence of a slang term wins, as before.
    slang, standard = row.iloc[0], row.iloc[1]
    if slang not in normalizad_word_dict:
        normalizad_word_dict[slang] = standard

def normalized_term(document):
    """Replace each token with its normalized form when one is known."""
    return [normalizad_word_dict.get(term, term) for term in document]

# BUG FIX: rejoined an assignment the source split across two lines
# (a SyntaxError as pasted).
TWEET_DATA['content_normalized'] = TWEET_DATA['content_tokens_WSW'].apply(normalized_term)
TWEET_DATA['content_normalized'].head(10)
14.
conda install -c conda-forge swifter
15.
# import Sastrawi package
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
import swifter

# Build a single Sastrawi stemmer instance (creation is expensive).
factory = StemmerFactory()
stemmer = factory.create_stemmer()

def stemmed_wrapper(term):
    """Stem one Indonesian term with Sastrawi."""
    return stemmer.stem(term)

# Collect the vocabulary first so each distinct term is stemmed only
# once instead of once per occurrence.
term_dict = {}
for document in TWEET_DATA['content_normalized']:
    for term in document:
        if term not in term_dict:
            term_dict[term] = ' '
print(len(term_dict))
print("------------------------")

# Stem every distinct term and cache the result.
for term in term_dict:
    term_dict[term] = stemmed_wrapper(term)
    print(term, ":", term_dict[term])
print(term_dict)
print("------------------------")

# apply stemmed term to dataframe
def get_stemmed_term(document):
    """Map every token of *document* to its pre-computed stem."""
    return [term_dict[term] for term in document]

# BUG FIX: rejoined an assignment the source split across two lines
# (a SyntaxError as pasted).
TWEET_DATA['content_tokens_stemmed'] = TWEET_DATA['content_normalized'].swifter.apply(get_stemmed_term)
print(TWEET_DATA['content_tokens_stemmed'])
16.
TWEET_DATA.to_csv("data_ff.csv")
17.
TWEET_DATA.to_excel("data_ff.xlsx")