Commit da69b37

Getting to know 《安家》 with Python

1 parent 13df7b1 commit da69b37

File tree

10 files changed: +1305 −0 lines

anjia/__init__.py

Whitespace-only changes.

anjia/actor.py

Lines changed: 24 additions & 0 deletions
import pandas as pd, jieba, matplotlib.pyplot as plt

csv_data = pd.read_csv('data.csv')
# Mention counter for each main character
roles = {'姑姑': 0, '房似锦': 0, '王子': 0, '闪闪': 0, '老油条': 0, '楼山关': 0, '鱼化龙': 0}
names = list(roles.keys())
# Register the character names with jieba so they are treated as single words
for name in names:
    jieba.add_word(name)
# Count how often each name appears in the comments
for row in csv_data['comments']:
    row = str(row)
    for name in names:
        count = row.count(name)
        roles[name] += count
plt.figure(figsize=(8, 5))
# Plot the data
plt.bar(list(roles.keys()), list(roles.values()), width=0.5, label='提及次数', color=['g', 'r', 'dodgerblue', 'c', 'm', 'y', 'aquamarine'])
# Add a value label above each bar
for a, b in zip(list(roles.keys()), list(roles.values())):
    plt.text(a, b, b, ha='center', va='bottom', fontsize=13, color='black')
plt.title('角色被提及次数柱状图')
plt.xticks(rotation=270)
plt.tick_params(labelsize=10)
plt.ylim(0, 30)
plt.legend(loc='upper right')
plt.show()
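Note: the chart title and legend above are Chinese strings, and matplotlib's default fonts usually cannot render CJK glyphs. A minimal font setup that could be placed before the plotting calls, assuming a CJK font such as SimHei is installed (the font name is an assumption, not part of this commit):

import matplotlib.pyplot as plt

# Assumed CJK-capable font so the Chinese title/legend render instead of empty boxes
plt.rcParams['font.sans-serif'] = ['SimHei']
# Keep the minus sign rendering correctly once the font is switched
plt.rcParams['axes.unicode_minus'] = False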

anjia/anjia.png

195 KB

anjia/bg.jpg

30.8 KB

anjia/cloud.py

Lines changed: 51 additions & 0 deletions
from wordcloud import WordCloud
import numpy as np, jieba
from PIL import Image

def jieba_():
    # Open the comments file
    content = open('comment.csv', 'rb').read()
    # Segment the text with jieba
    word_list = jieba.cut(content)
    words = []
    # Stop words to filter out
    remove_words = ['以及', '不会', '一些', '那个', '只有',
                    '不过', '东西', '这个', '所有', '这么',
                    '但是', '全片', '一点', '一部', '一个',
                    '什么', '虽然', '一切', '样子', '一样',
                    '只能', '不是', '一种', '这个', '为了']
    for word in word_list:
        if word not in remove_words:
            words.append(word)
    global word_cloud
    # Join the kept words with commas
    word_cloud = ','.join(words)

def cloud():
    # Open the word-cloud background image
    cloud_mask = np.array(Image.open('bg.jpg'))
    # Configure the word cloud
    wc = WordCloud(
        # White background
        background_color='white',
        # Shape mask taken from the background image
        mask=cloud_mask,
        # Maximum number of words to show
        max_words=100,
        # A font that can render Chinese
        font_path='./fonts/simhei.ttf',
        # Maximum font size
        max_font_size=80
    )
    global word_cloud
    # Generate the word cloud
    x = wc.generate(word_cloud)
    # Render it as an image
    image = x.to_image()
    # Display the image
    image.show()
    # Save the image to disk
    wc.to_file('anjia.png')

jieba_()
cloud()
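The two functions above hand data to each other through the module-level word_cloud variable. A minimal sketch of the same pipeline without the global, assuming the same input files; the names build_text and build_cloud are made up for illustration, and the stop-word set is shortened:

import numpy as np, jieba
from PIL import Image
from wordcloud import WordCloud

# Shortened stop-word set, only for the sketch
STOP_WORDS = {'以及', '不会', '这个', '但是', '什么'}

def build_text(path='comment.csv'):
    # Segment the comments, drop stop words, and return the joined text
    content = open(path, 'rb').read()
    return ','.join(w for w in jieba.cut(content) if w not in STOP_WORDS)

def build_cloud(text, mask_path='bg.jpg', out_path='anjia.png'):
    # Generate and save the word cloud from the prepared text
    wc = WordCloud(background_color='white',
                   mask=np.array(Image.open(mask_path)),
                   max_words=100,
                   font_path='./fonts/simhei.ttf',
                   max_font_size=80)
    wc.generate(text)
    wc.to_file(out_path)

build_cloud(build_text())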

anjia/comment.csv

Lines changed: 554 additions & 0 deletions

anjia/comment.py

Lines changed: 26 additions & 0 deletions
import pandas as pd, matplotlib.pyplot as plt

csv_data = pd.read_csv('data.csv')
df = pd.DataFrame(csv_data)
# Number of comments per day
df_gp = df.groupby(['time']).size()
values = df_gp.values.tolist()
index = df_gp.index.tolist()
# Set the figure size
plt.figure(figsize=(10, 6))
# Plot the data
# plt.plot(index, values, label='weight changes', linewidth=3, color='r', marker='o',
#          markerfacecolor='blue', markersize=20)
plt.plot(index, values, label='评论数')
# Add a value label above each point
for a, b in zip(index, values):
    plt.text(a, b, b, ha='center', va='bottom', fontsize=13, color='black')
plt.title('评论数随时间变化折线图')
# plt.xlabel('日期')
# plt.ylabel('评论数')
plt.xticks(rotation=330)
plt.tick_params(labelsize=10)
plt.ylim(0, 200)
plt.legend(loc='upper right')
plt.show()
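The per-day comment counts could also be read straight off value_counts, which yields the same Series as the groupby above; a sketch, assuming the same data.csv layout:

import pandas as pd

df = pd.read_csv('data.csv')
# Count comments per day; sort_index keeps the YYYY-MM-DD dates in chronological order
daily_counts = df['time'].value_counts().sort_index()
print(daily_counts)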

anjia/data.csv

Lines changed: 555 additions & 0 deletions

anjia/spd.py

Lines changed: 65 additions & 0 deletions
import requests, time, random, pandas as pd
from lxml import etree

def spider():
    url = 'https://accounts.douban.com/j/mobile/login/basic'
    headers = {"User-Agent": 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)'}
    # Comment URL for 《安家》; start is a format placeholder so we can page through the short comments, 20 per page
    url_comment = 'https://movie.douban.com/subject/30482003/comments?start=%d&limit=20&sort=new_score&status=P'
    # Login form data; 'name' and 'password' are placeholders for your own Douban credentials
    data = {
        'ck': '',
        'name': '自己的用户',
        'password': '自己的密码',
        'remember': 'false',
        'ticket': ''
    }
    session = requests.session()
    session.post(url=url, headers=headers, data=data)
    # Four lists: user names, star ratings, dates, and comment text
    users = []
    stars = []
    times = []
    content = []
    # Scrape 500 comments, 20 per page, which is the cap Douban allows
    for i in range(0, 500, 20):
        # Fetch the HTML
        data = session.get(url_comment % i, headers=headers)
        # Status code 200 means success
        print('第', i, '页', '状态码:', data.status_code)
        # Sleep 0-1 seconds to keep the IP from being banned
        time.sleep(random.random())
        # Parse the HTML
        selector = etree.HTML(data.text)
        # Grab every comment on the page with XPath
        comments = selector.xpath('//div[@class="comment"]')
        # Walk through the comments and pull out the details
        for comment in comments:
            # User name
            user = comment.xpath('.//h3/span[2]/a/text()')[0]
            # Star rating
            star = comment.xpath('.//h3/span[2]/span[2]/@class')[0][7:8]
            # Date
            date_time = comment.xpath('.//h3/span[2]/span[3]/@title')
            # Some comments have no date, so check first
            if len(date_time) != 0:
                date_time = date_time[0]
                date_time = date_time[:10]
            else:
                date_time = None
            # Comment text
            comment_text = comment.xpath('.//p/span/text()')[0].strip()
            # Append everything to the lists
            users.append(user)
            stars.append(star)
            times.append(date_time)
            content.append(comment_text)
    # Wrap the lists in a dict
    comment_dic = {'user': users, 'star': stars, 'time': times, 'comments': content}
    # Convert to a DataFrame
    comment_df = pd.DataFrame(comment_dic)
    # Save the data
    comment_df.to_csv('data.csv')
    # Save the comment text on its own as well
    comment_df['comments'].to_csv('comment.csv', index=False)

spider()
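A quick sanity check of the scraped output before running the plotting scripts; a sketch, assuming spider() has already written data.csv to the working directory:

import pandas as pd

df = pd.read_csv('data.csv')
# Row count, missing dates, and a peek at the first few comments
print(df.shape)
print(df['time'].isna().sum())
print(df.head())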

anjia/star.py

Lines changed: 30 additions & 0 deletions
import pandas as pd, numpy as np, matplotlib.pyplot as plt

csv_data = pd.read_csv('data.csv')
df_time = csv_data.groupby(['time']).size()
df_star = csv_data.groupby(['star']).size()
index = df_time.index.tolist()
value = [0] * len(index)
# Build a date -> average-rating dict, initialised to 0
dic = dict(zip(index, value))
# rows = df.loc[df['time'] == '2020-03-05', 'star']
# list = list(map(int, rows.values.tolist()))
# avg = np.mean(list)
# print(list)
# print(avg)
for k, v in dic.items():
    stars = csv_data.loc[csv_data['time'] == str(k), 'star']
    # Average rating for that day
    avg = np.mean(list(map(int, stars.values.tolist())))
    dic[k] = round(avg, 2)
# Set the figure size
plt.figure(figsize=(9, 6))
# Plot the data
plt.plot(list(dic.keys()), list(dic.values()), label='星级', color='red', marker='o')
plt.title('星级随时间变化折线图')
plt.xticks(rotation=330)
plt.tick_params(labelsize=10)
plt.ylim(0, 5)
plt.legend(loc='upper right')
plt.show()
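The loop above computes the per-day average rating by hand; pandas can do the same in one groupby, which may read more directly. A sketch, assuming the star column parses as numbers (non-numeric values become NaN and are ignored by mean):

import pandas as pd

df = pd.read_csv('data.csv')
# Coerce ratings to numbers, then average them per day
df['star'] = pd.to_numeric(df['star'], errors='coerce')
daily_avg = df.groupby('time')['star'].mean().round(2)
print(daily_avg)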
