Commit 9755c64

Merge pull request powerexploit#12 from Sayoni26/scrap_quote
Added new python script
2 parents a93e98e + bdf7b14

File tree

1 file changed: +29 -0 lines changed


Scripts/scrap_quotes.py

Lines changed: 29 additions & 0 deletions
@@ -0,0 +1,29 @@
from bs4 import BeautifulSoup
import requests
import json

# URL template for the site the quotes will be scraped from;
# the tag (emotion) and the page number are filled in later.
base_url = "https://www.goodreads.com/quotes/tag/{0}?page={1}"


def process(content, emotion):
    """Parse the page content and extract the quotes it contains."""
    soup = BeautifulSoup(content, features="html5lib")
    quotes_div = soup.find_all("div", attrs={"class": "quote"})
    quotes = []
    for div in quotes_div:
        q_text = div.find("div", attrs={"class": "quoteText"})
        if q_text is None:  # skip malformed entries
            continue
        quote = q_text.text.strip().split('\n')[0]
        author_span = q_text.find("span", attrs={"class": "authorOrTitle"})
        author = author_span.text.strip() if author_span else ""
        q_dict = {"quote": quote, "author": author, "emotion": emotion}
        quotes.append(q_dict)
    return quotes


emotions = ['friend', 'sad']  # any other quote tag can be used here
quotes = []
for emotion in emotions:
    for index in range(1, 5):  # scrape the first four pages of each tag
        final_url = base_url.format(emotion, index)
        page = requests.get(final_url)
        content = page.text
        quotes += process(content, emotion)

# dump the collected quotes into a JSON file
with open('quote.json', 'w') as file:
    json.dump(quotes, file)
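
As a quick check of the output, the dumped file can be read straight back with the json module. Below is a minimal sketch, not part of the committed script; the quote.json filename and the quote/author/emotion keys come from the code above.

import json
import random

# load the list of quote dicts written by scrap_quotes.py
with open('quote.json') as file:
    quotes = json.load(file)

print(len(quotes), "quotes scraped")
pick = random.choice(quotes)  # show one entry at random
print('"{}" - {} [{}]'.format(pick["quote"], pick["author"], pick["emotion"]))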
