Skip to content

Commit ce8fa75

Browse files
Added script
1 parent 1a47b5c commit ce8fa75

File tree

2 files changed

+64
-0
lines changed

2 files changed

+64
-0
lines changed

reddit-scraper/grabnews.py

Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,64 @@
1+
import json, requests
2+
import sqlite3
3+
4+
5+
def handle(content):
    """Normalize an empty value to ``None``.

    Args:
        content: Any value taken from a decoded JSON document.

    Returns:
        ``None`` when ``content`` is ``None`` or an empty string/container.
        Numbers (including ``0``) and booleans are returned unchanged, so a
        legitimate zero such as a post with no upvotes is not mistaken for a
        missing value. Any other non-empty value is returned as-is.
    """
    # Numeric zero is real data, not "missing"; keep it out of the
    # truthiness test below.
    if isinstance(content, (int, float)):
        return content
    return content if content else None
9+
10+
def dump(endpoint, toget, timeout=10):
    """Fetch a listing of r/python posts from Reddit as decoded JSON.

    Args:
        endpoint: Listing name inserted into the URL path (converted with
            ``str``).
        toget: Value sent as the ``limit`` query parameter (converted with
            ``str``).
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.RequestException: On connection problems, timeouts, or a
            non-success HTTP status.
        ValueError: If the response body is not valid JSON.
    """
    headers = {'User-agent': 'Chrome'}
    url = 'http://www.reddit.com/r/python/' + str(endpoint) + '/.json?limit=' + str(toget)
    response = requests.get(url, headers=headers, timeout=timeout)
    # Fail here with a clear HTTP error instead of letting an error payload
    # reach the caller and blow up later on a missing key.
    response.raise_for_status()
    return json.loads(response.text)
18+
19+
# Column layout shared by every listing table created below.
_POST_COLUMNS = ('(pid text PRIMARY KEY, ptitle text, ptext text, '
                 'pauthor text, purl text, pups int)')


def _check_table(table):
    """Return ``table`` if it is a plain identifier, else raise ValueError.

    Table names cannot be bound as SQL parameters, so they are interpolated
    into the statement text and must be validated first.
    """
    if not isinstance(table, str) or not table.isidentifier():
        raise ValueError('invalid table name: {!r}'.format(table))
    return table


def _field(post, key):
    """Return ``post[key]`` normalized by ``handle``; missing keys give None."""
    value = post.get(key)
    if isinstance(value, str):
        value = value.strip()
    return handle(value)


def _news(data, toget, unique, table='top_news'):
    """Write up to ``toget`` posts from a decoded listing into ``table``.

    Uses a module-level cursor named ``c``. This function does not create
    it, so ``c`` must already exist when it is called.

    Args:
        data: Decoded listing JSON; posts are read from
            ``data['data']['children'][n]['data']``.
        toget: Maximum number of posts to store. Fewer are stored when the
            listing contains fewer.
        unique: Set that collects the ids of the posts written by this call.
        table: Destination table name; defaults to ``'top_news'``.

    Raises:
        ValueError: If ``table`` is not a plain identifier.
    """
    table = _check_table(table)
    update_sql = ("UPDATE {} SET ptitle = ?, ptext = ?, pauthor = ?, "
                  "purl = ?, pups = ? where pid = ? ".format(table))
    insert_sql = "INSERT INTO {} VALUES (?, ?, ?, ?, ?, ?)".format(table)

    # Slicing instead of indexing by range(toget) avoids an IndexError when
    # the listing holds fewer posts than requested.
    for child in data['data']['children'][:toget]:
        post = child['data']
        content_id = _field(post, 'id')
        if content_id is None:
            # Without an id there is no primary key to store the post under.
            continue
        content_title = _field(post, 'title')
        content_text = _field(post, 'selftext')
        content_author = _field(post, 'author_fullname')
        content_url = _field(post, 'url')
        content_ups = _field(post, 'ups')

        # Try the update first and insert only when no row matched. The
        # in-memory ``unique`` set starts empty on every run, so it cannot
        # tell whether the row is already in a persisted database; a blind
        # INSERT would then fail on the primary key.
        c.execute(update_sql, (content_title, content_text, content_author,
                               content_url, content_ups, content_id))
        if c.rowcount:
            print("Updated")
        else:
            c.execute(insert_sql, (content_id, content_title, content_text,
                                   content_author, content_url, content_ups))
            print("Inserted")
        unique.add(content_id)


def _store_listing(table, endpoint, toget):
    """Create ``table`` if needed, then fetch a listing and store it there."""
    table = _check_table(table)
    c.execute('CREATE TABLE IF NOT EXISTS {} {}'.format(table, _POST_COLUMNS))
    _news(dump(endpoint, toget), toget, set(), table)


def get_top_news(endpoint='top', toget=10):
    """Fetch the ``endpoint`` listing and store it in the ``top_news`` table.

    Requires the module-level cursor ``c`` to exist before the call.

    Args:
        endpoint: Listing name passed to ``dump``.
        toget: Number of posts to request and store.
    """
    _store_listing('top_news', endpoint, toget)


def get_hot_news(endpoint='hot', toget=10):
    """Fetch the ``endpoint`` listing and store it in the ``hot_news`` table.

    Requires the module-level cursor ``c`` to exist before the call.

    Args:
        endpoint: Listing name passed to ``dump``.
        toget: Number of posts to request and store.
    """
    _store_listing('hot_news', endpoint, toget)
58+
59+
def reddit_get():
    """Open ``reddit_news.db``, store the top listing, and commit.

    The other functions in this file look up the cursor through the global
    name ``c``, so it is published as a module-level global here; a plain
    local would leave them failing with ``NameError``.

    The connection is always closed. The commit happens only when fetching
    and storing finished without raising.
    """
    global c
    conn = sqlite3.connect('reddit_news.db')
    try:
        c = conn.cursor()
        get_top_news()
        conn.commit()
    finally:
        conn.close()

reddit-scraper/reddit_news.db

12 KB
Binary file not shown.

0 commit comments

Comments
 (0)