functional praise task without page limit
thekingofcity committed Dec 31, 2018
1 parent ebd2427 commit 7672667
Showing 2 changed files with 62 additions and 38 deletions.
42 changes: 30 additions & 12 deletions page_parse/praise.py
@@ -1,4 +1,5 @@
 import json
+import html as htmllib
 
 from bs4 import BeautifulSoup
 
@@ -27,31 +28,48 @@ def get_total_page(html):
 
 
 @parse_decorator([])
-def get_praise_list(html, wb_id):
-    """
-    Get the praise list
-    :param html:
-    :param wb_id:
-    :return:
+def get_praise_list(html: str, wb_id: str):
+    """[get praise list]
+    Arguments:
+        html {str} -- [web page]
+        wb_id {str} -- [weibo mid]
+    Raises:
+        Exception -- [can't get wanted dom]
+    Returns:
+        WeiboPraise list -- [list contains praises in this html]
+        ext_param -- [extra parameters to get next page]
     """
 
     cont = get_html_cont(html)
     if not cont:
-        return list()
+        return list(), ''
 
     soup = BeautifulSoup(cont, 'html.parser')
     praise_list = list()
-    praises = soup.find_all('li')
+    praises = soup.find_all(attrs={'class': 'list_li S_line1 clearfix'})
+    # pattern = re.compile(r'<li uid=\\"\d{10}\\">')
+    # praises = pattern.findall(cont)
 
     for praise in praises:
-        wb_praise = WeiboPraise()
         try:
-            wb_praise.user_id = praise['uid']
-            wb_praise.weibo_id = wb_id
+            user_id = praise.find('img').get('usercard')[3:]
+            wb_praise = WeiboPraise(user_id, wb_id)
         except Exception as e:
             parser.error('Failed to parse praise info, detail: {}'.format(e))
         else:
            praise_list.append(wb_praise)
 
-    return praise_list
+    like_loading = soup.find(attrs={'node-type': 'like_loading'})
+    feed_like_more = soup.find(attrs={'action-type': 'feed_like_more'})
+    if like_loading:
+        action_data = like_loading.get('action-data', '')
+    elif feed_like_more:
+        action_data = feed_like_more.get('action-data', '')
+    else:
+        action_data = ''
+    ext_param = htmllib.unescape(action_data)
+
+    return praise_list, ext_param
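To see what the new selectors pull out of a likelist fragment, here is a small self-contained check. The markup is a hand-made, hypothetical stand-in for the AJAX response (only the class names and attributes the parser targets are taken from the diff); WeiboPraise and the rest of the project code are left out.

import html as htmllib

from bs4 import BeautifulSoup

# Hand-made fragment mimicking the DOM shapes the parser looks for (hypothetical markup).
cont = (
    '<ul>'
    '<li class="list_li S_line1 clearfix"><img usercard="id=1234567890"></li>'
    '<li class="list_li S_line1 clearfix"><img usercard="id=2345678901"></li>'
    '</ul>'
    '<div node-type="like_loading" action-data="max_id=101&page=2"></div>'
)

soup = BeautifulSoup(cont, 'html.parser')

# User ids come from each <img>'s usercard attribute, dropping the "id=" prefix.
for li in soup.find_all(attrs={'class': 'list_li S_line1 clearfix'}):
    print(li.find('img').get('usercard')[3:])        # 1234567890, then 2345678901

# The next-page parameters sit in action-data; unescape() clears any residual
# entity encoding (a no-op on this toy fragment).
loading = soup.find(attrs={'node-type': 'like_loading'})
print(htmllib.unescape(loading.get('action-data', '')))  # max_id=101&page=2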
58 changes: 32 additions & 26 deletions tasks/praise.py
@@ -6,43 +6,49 @@
 from config import conf
 from page_get import get_page
 from db.dao import (WbDataOper, PraiseOper)
-from celery.exceptions import SoftTimeLimitExceeded
 
 
-BASE_URL = 'http://weibo.com/aj/v6/like/big?ajwvr=6&mid={}&page={}&__rnd={}'
+# Please note that m.weibo.cn can return more data than the PC side
+BASE_URL = 'https://weibo.com/aj/v6/like/likelist?ajwvr=6&mid={}&issingle=1&type=0&_t=0&__rnd={}'
+PAGE_URL = 'https://weibo.com/aj/v6/like/likelist?ajwvr=6&{}&_t=0&__rnd={}'
 
 
 @app.task(ignore_result=True)
-def crawl_praise_by_page(mid, page_num):
-    try:
-        cur_time = int(time.time() * 1000)
-        cur_url = BASE_URL.format(mid, page_num, cur_time)
-        html = get_page(cur_url, auth_level=2, is_ajax=True)
-        praise_datas = praise.get_praise_list(html, mid)
-    except SoftTimeLimitExceeded:
-        crawler.error(
-            "praise SoftTimeLimitExceeded mid={mid} page_num={page_num}".
-            format(mid=mid, page_num=page_num))
-        app.send_task(
-            'tasks.praise.crawl_praise_by_page',
-            args=(mid, page_num),
-            queue='praise_page_crawler',
-            routing_key='praise_page_info')
-    PraiseOper.add_all(praise_datas)
-    if page_num == 1:
-        WbDataOper.set_weibo_praise_crawled(mid)
-    return html, praise_datas
+def crawl_praise_by_page(mid, ext_param):
+    cur_time = int(time.time() * 1000)
+    cur_url = PAGE_URL.format(ext_param, cur_time)
+    html = get_page(cur_url, auth_level=2, is_ajax=True)
+    praise_data, ext_param = praise.get_praise_list(html, mid)
+    PraiseOper.add_all(praise_data)
+    return html, praise_data, ext_param
 
 
 @app.task(ignore_result=True)
 def crawl_praise_page(mid):
-    # call locally here so that the return value is available immediately
-    first_page = crawl_praise_by_page(mid, 1)[0]
-    total_page = praise.get_total_page(first_page)
+    cur_time = int(time.time() * 1000)
+    cur_url = BASE_URL.format(mid, cur_time)
+    html = get_page(cur_url, auth_level=2, is_ajax=True)
+    praise_data, ext_param = praise.get_praise_list(html, mid)
+    PraiseOper.add_all(praise_data)
+
+    WbDataOper.set_weibo_praise_crawled(mid)
+
+    if not ext_param:
+        crawler.error('fail to get praise page 2 ext_param, mid is {mid}'.format(mid=mid))
+        return
+
+    # No app.send_task here; fall back to sequential execution because the
+    # praise endpoint now requires a max_id parameter, and a request without
+    # it returns something different from what a normal browser gets.
+
+    # should work after 5
+    # TODO: retry or return depending on ext_param
+    for __ in range(2, 5):
+        # ext_param (mainly max_id) is updated on each response and used for the next request
+        html, praise_data, ext_param = crawl_praise_by_page(mid, ext_param)
+    return
 
-    for page_num in range(2, total_page + 1):
-        app.send_task('tasks.praise.crawl_praise_by_page', args=(mid, page_num), queue='praise_page_crawler',
-                      routing_key='praise_page_info')
 
 @app.task(ignore_result=True)
 def execute_praise_task():
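The comments in crawl_praise_page capture the key constraint: each page's max_id comes out of the previous response, so requests chain and cannot be fanned out as independent Celery tasks. Below is a minimal sketch of that cursor-style loop; fetch, FAKE_PAGES, and the canned data are hypothetical stand-ins for get_page plus praise.get_praise_list, not the project's real API.

import time

PAGE_URL = 'https://weibo.com/aj/v6/like/likelist?ajwvr=6&{}&_t=0&__rnd={}'

# Canned responses keyed by cursor, standing in for the real AJAX endpoint.
FAKE_PAGES = {
    '': (['user1', 'user2'], 'max_id=101&page=2'),
    'max_id=101&page=2': (['user3'], 'max_id=57&page=3'),
    'max_id=57&page=3': (['user4'], ''),  # empty ext_param means no next page
}

def fetch(ext_param):
    """Stand-in for get_page + praise.get_praise_list: returns (items, next ext_param)."""
    url = PAGE_URL.format(ext_param, int(time.time() * 1000))  # same URL shape the task builds
    return FAKE_PAGES.get(ext_param, ([], ''))

def crawl_all_praises():
    items, ext_param = fetch('')            # first page: no cursor yet
    while ext_param:                        # each cursor comes from the previous response,
        more, ext_param = fetch(ext_param)  # so pages must be fetched strictly in order
        items.extend(more)
    return items

print(crawl_all_praises())  # ['user1', 'user2', 'user3', 'user4']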
