1.
pip install google-play-scraper
2.
from google_play_scraper import Sort, reviews
# Scrape Play Store reviews for Free Fire (Indonesian locale).
# The first call returns up to `count` reviews plus a continuation token
# for fetching the next page.
result, continuation_token = reviews(
    'com.dts.freefireth',
    lang='id',                # defaults to 'en'
    country='id',             # defaults to 'us'
    sort=Sort.MOST_RELEVANT,  # defaults to Sort.MOST_RELEVANT
    count=4000,               # defaults to 100
)
# BUG FIX: the original rebound `result` here, throwing away the 4000
# reviews fetched above and keeping only the next page (default count
# of 100). Fetch the next page and append it instead.
more_reviews, _ = reviews(
    'com.dts.freefireth',
    continuation_token=continuation_token  # defaults to None (load from the beginning)
)
result.extend(more_reviews)
print(result)
3.
import pandas as pd

# Convert the scraped review dicts into a DataFrame and persist them.
df = pd.DataFrame(result)
# FIX: index=False keeps the row index out of the file; otherwise
# re-reading the CSV later produces a spurious "Unnamed: 0" column.
df.to_csv("D:/TestData11.CSV", index=False)
4.
pip install nltk
5.
import nltk

# FIX: nltk.download() with no arguments opens the interactive
# downloader, which blocks a script. Download only the data this
# pipeline actually uses.
nltk.download('punkt')      # tokenizer models used by word_tokenize
nltk.download('stopwords')  # stopword lists (Indonesian is used later)
6.
pip install Sastrawi
7.
pip install numpy
8.
import pandas as pd
import numpy as np

# Load the previously exported review data for preprocessing.
TWEET_DATA = pd.read_csv("D:/data_ff.csv")
# Peek at the first rows (the value only displays in a notebook/REPL).
TWEET_DATA.head()
9.
TWEET_DATA.to_csv("D:/data_ff.csv")
10.
# ------ Case Folding --------
# Lower-case every review so later token matching is case-insensitive
# (Series.str.lower leaves non-string/NaN entries as NaN).
lowercased = TWEET_DATA['content'].str.lower()
TWEET_DATA['content'] = lowercased
print('Case Folding Result : \n')
print(TWEET_DATA['content'].head(5))
print('\n\n\n')
11.
import string
import re #regex library
# import word_tokenize & FreqDist from NLTK
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist
# ------ Tokenizing ---------
def remove_tweet_special(text):
    """Strip tweet-specific noise: escape sequences, non-ASCII
    characters, mentions, hashtags, and URLs."""
    # Remove literal escape sequences (backslash-t, backslash-n,
    # backslash-u) left in the scraped text, then stray backslashes.
    # NOTE(review): these target the two-character sequences "\t", "\n",
    # "\u" — not real tab/newline characters; confirm the scraped data
    # actually contains them in escaped form.
    text = text.replace('\\t', " ").replace('\\n', " ").replace('\\u', " ").replace('\\', "")
    # Remove non-ASCII (emoticons, CJK, etc.); each such character is
    # replaced with '?' by the 'replace' error handler.
    text = text.encode('ascii', 'replace').decode('ascii')
    # Remove mentions, hashtags, and scheme://... links.
    # FIX: raw string, so \w and \S are regex escapes rather than
    # invalid Python escape sequences (SyntaxWarning on modern Python).
    text = ' '.join(re.sub(r"([@#][A-Za-z0-9]+)|(\w+:\/\/\S+)", " ", text).split())
    # Remove incomplete URL fragments the regex may have missed.
    return text.replace("http://", " ").replace("https://", " ")
TWEET_DATA['content'] = TWEET_DATA['content'].apply(remove_tweet_special)
# Strip punctuation characters.
def remove_punctuation(text):
    """Delete every ASCII punctuation character from *text*."""
    table = str.maketrans("", "", string.punctuation)
    return text.translate(table)
TWEET_DATA['content'] = TWEET_DATA['content'].apply(remove_punctuation)
# Trim whitespace from both ends of a review.
def remove_whitespace_LT(text):
    """Return *text* without leading/trailing whitespace."""
    trimmed = text.strip()
    return trimmed
TWEET_DATA['content'] = TWEET_DATA['content'].apply(remove_whitespace_LT)
# Collapse runs of whitespace into a single space.
def remove_whitespace_multiple(text):
    """Replace every whitespace run in *text* with one space."""
    # FIX: raw string — '\s' in a plain literal is an invalid Python
    # escape sequence (SyntaxWarning on modern Python).
    return re.sub(r'\s+', ' ', text)
TWEET_DATA['content'] = TWEET_DATA['content'].apply(remove_whitespace_multiple)
# Drop stand-alone single letters left over from earlier cleaning.
def remove_singl_char(text):
    """Remove every isolated single ASCII letter from *text*."""
    single_letter = re.compile(r"\b[a-zA-Z]\b")
    return single_letter.sub("", text)
TWEET_DATA['content'] = TWEET_DATA['content'].apply(remove_singl_char)
# NLTK word tokenize
def word_tokenize_wrapper(text):
    """Split *text* into tokens with NLTK's word_tokenize."""
    return word_tokenize(text)
# Tokenize every cleaned review into a new column.
TWEET_DATA['content_tokens'] = TWEET_DATA['content'].map(word_tokenize_wrapper)
print('Tokenizing Result : \n')
print(TWEET_DATA['content_tokens'].head())
print('\n\n\n')
11b.
# NLTK frequency distribution per review.
def freqDist_wrapper(text):
    """Return an NLTK FreqDist over a list of tokens."""
    return FreqDist(text)

# BUG FIX: the source split this assignment across two lines without a
# continuation, which is a SyntaxError in Python.
TWEET_DATA['content_tokens_fdist'] = TWEET_DATA['content_tokens'].apply(freqDist_wrapper)
print('Frequency Tokens : \n')
print(TWEET_DATA['content_tokens_fdist'].head().apply(lambda x : x.most_common()))
12.
from nltk.corpus import stopwords
# ----------------------- get stopword from NLTK stopword -------------------------------
# get stopword indonesia
# ----------------------- get stopword from NLTK stopword ----------------
# Indonesian stopword list shipped with NLTK.
list_stopwords = stopwords.words('indonesian')

# ---------------------------- manually add stopwords --------------------
# Informal/slang tokens common in Indonesian app reviews.
list_stopwords.extend(["yg", "dg", "rt", "dgn", "ny", "d", 'klo',
                       'kalo', 'amp', 'biar', 'bikin', 'bilang',
                       'gak', 'ga', 'krn', 'nya', 'nih', 'sih',
                       'si', 'tau', 'tdk', 'tuh', 'utk', 'ya',
                       'jd', 'jgn', 'sdh', 'aja', 'n', 't',
                       'nyg', 'hehe', 'pen', 'u', 'nan', 'loh', 'rt',
                       '&', 'yah'])

# ----------------------- add stopwords from txt file --------------------
txt_stopword = pd.read_csv("D:/stopwords.txt", names=["stopwords"], header=None)
# BUG FIX: the original read only the FIRST row of the file
# (txt_stopword["stopwords"][0]); take every row, splitting any
# space-separated entries.
for entry in txt_stopword["stopwords"]:
    list_stopwords.extend(str(entry).split(' '))

# Convert to a set for O(1) membership tests during removal.
list_stopwords = set(list_stopwords)

def stopwords_removal(words):
    """Drop every token of *words* that is in the stopword set."""
    return [word for word in words if word not in list_stopwords]

# BUG FIX: rejoined an assignment the source split across two lines
# (a SyntaxError as pasted).
TWEET_DATA['content_tokens_WSW'] = TWEET_DATA['content_tokens'].apply(stopwords_removal)
print(TWEET_DATA['content_tokens_WSW'].head())
13.
# Load the slang -> standard word mapping (column 0: slang form,
# column 1: normalized form — TODO confirm against the spreadsheet).
normalizad_word = pd.read_excel("D:/normalisasi.xlsx")

normalizad_word_dict = {}
for index, row in normalizad_word.iterrows():
    # FIX: .iloc avoids the deprecated positional row[0] lookup on a
    # Series; first occurrence of a slang term wins, as before.
    slang, standard = row.iloc[0], row.iloc[1]
    if slang not in normalizad_word_dict:
        normalizad_word_dict[slang] = standard

def normalized_term(document):
    """Replace each token with its normalized form when one is known."""
    return [normalizad_word_dict.get(term, term) for term in document]

# BUG FIX: rejoined an assignment the source split across two lines
# (a SyntaxError as pasted).
TWEET_DATA['content_normalized'] = TWEET_DATA['content_tokens_WSW'].apply(normalized_term)
TWEET_DATA['content_normalized'].head(10)
14.
conda install -c conda-forge swifter
15.
# import Sastrawi package
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
import swifter

# Build a single Sastrawi stemmer instance (creation is expensive).
factory = StemmerFactory()
stemmer = factory.create_stemmer()

def stemmed_wrapper(term):
    """Stem one Indonesian term with Sastrawi."""
    return stemmer.stem(term)

# Collect the vocabulary first so each distinct term is stemmed only
# once instead of once per occurrence.
term_dict = {}
for document in TWEET_DATA['content_normalized']:
    for term in document:
        if term not in term_dict:
            term_dict[term] = ' '
print(len(term_dict))
print("------------------------")

# Stem every distinct term and cache the result.
for term in term_dict:
    term_dict[term] = stemmed_wrapper(term)
    print(term, ":", term_dict[term])
print(term_dict)
print("------------------------")

# apply stemmed term to dataframe
def get_stemmed_term(document):
    """Map every token of *document* to its pre-computed stem."""
    return [term_dict[term] for term in document]

# BUG FIX: rejoined an assignment the source split across two lines
# (a SyntaxError as pasted).
TWEET_DATA['content_tokens_stemmed'] = TWEET_DATA['content_normalized'].swifter.apply(get_stemmed_term)
print(TWEET_DATA['content_tokens_stemmed'])
16.
TWEET_DATA.to_csv("data_ff.csv")
17.
TWEET_DATA.to_excel("data_ff.xlsx")