Skip to content

Commit 41c61bb

Browse files
committed
提交代码
1 parent 15c97c4 commit 41c61bb

File tree

14 files changed

+455
-0
lines changed

14 files changed

+455
-0
lines changed
Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,49 @@
1+
# 数据审核
2+
## 表说明:
3+
4+
> 表名 含义(更新策略)
5+
6+
## 一、准确性
7+
8+
**字段设计是否满足需求? 表之间的关联字段是否满足要求? (需要人工检查)**
9+
10+
> 注意:是否设计了自增 id,id 的类型是否设置为 bigint?
11+
> 注意:unique index 是否需要设计?
12+
> 注意:各张表之间是否需要设计关联字段;
13+
14+
* [ ]
15+
* [ ]
16+
17+
**各字段采集内容及存储格式是否满足要求?是否与网页一致?是否有信息缺失?**
18+
19+
> 备注:可尝试对每个字段进行升降序排列,然后抽样检查;
20+
21+
**是否考虑了网站同一类数据可能出现的数据格式不一致情况?**
22+
23+
> 建议:代码对各个字段不做兼容性处理、数据不一致则抛出异常并记录
24+
25+
* [ ]
26+
* [ ]
27+
28+
## 二、全量性
29+
30+
**如果是增量采集,是否最早信息和最晚信息都采集了,同时条目总数是否正确;**
31+
**如果是批次采集,是否每个批次都有?**
32+
33+
>备注:需要去网页端评估单个批次的总量;
34+
>参考 SQL 语句:SELECT count(1), batch_date FROM [table_name] GROUP BY batch_date;
35+
36+
**如果与另外一张表有关联关系,是否信息关联完整?**
37+
38+
## 三、稳定性
39+
40+
* [ ] 是否能够长期稳定采集?
41+
* [ ] 是否加IP代理?
42+
* [ ] 是否支持断点续跑?
43+
* [ ] 是否能确保按时启动,定期采集?
44+
* [ ] 是否已开启报警?
45+
46+
## 四、采集频次、类型、存储方式
47+
48+
* [ ] 采集频次是否满足要求?
49+
* [ ] 采集类型是否满足要求:增量采集 or 批次采集?
Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
# xxx爬虫文档
2+
## 调研
3+
4+
## 数据库设计
5+
6+
## 爬虫逻辑
7+
8+
## 项目架构
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
__all__ = [
2+
"main",
3+
"setting",
4+
"spider_test"
5+
]
Binary file not shown.
Binary file not shown.
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
__all__ = [
2+
"report_item"
3+
]
Binary file not shown.
Binary file not shown.
Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
1+
# -*- coding: utf-8 -*-
2+
"""
3+
Created on 2021-09-19 17:32:59
4+
---------
5+
@summary:
6+
---------
7+
@author: cxhuan
8+
"""
9+
10+
from feapder import Item
11+
12+
13+
class ReportItem(Item):
14+
"""
15+
This class was generated by feapder.
16+
command: feapder create -i report 1.
17+
"""
18+
19+
__table_name__ = "report"
20+
21+
def __init__(self, *args, **kwargs):
22+
self.count = kwargs.get('count')
23+
self.emRatingName = kwargs.get('emRatingName') # 评级名称
24+
self.emRatingValue = kwargs.get('emRatingValue') # 评级代码
25+
self.encodeUrl = kwargs.get('encodeUrl') # 链接
26+
# self.id = kwargs.get('id')
27+
self.indvInduCode = kwargs.get('indvInduCode') # 行业代码
28+
self.indvInduName = kwargs.get('indvInduName') # 行业名称
29+
self.lastEmRatingName = kwargs.get('lastEmRatingName') # 上次评级名称
30+
self.lastEmRatingValue = kwargs.get('lastEmRatingValue') # 上次评级代码
31+
self.orgCode = kwargs.get('orgCode') # 机构代码
32+
self.orgName = kwargs.get('orgName') # 机构名称
33+
self.orgSName = kwargs.get('orgSName') # 机构简称
34+
self.predictNextTwoYearEps = kwargs.get('predictNextTwoYearEps')
35+
self.predictNextTwoYearPe = kwargs.get('predictNextTwoYearPe')
36+
self.predictNextYearEps = kwargs.get('predictNextYearEps')
37+
self.predictNextYearPe = kwargs.get('predictNextYearPe')
38+
self.predictThisYearEps = kwargs.get('predictThisYearEps')
39+
self.predictThisYearPe = kwargs.get('predictThisYearPe')
40+
self.publishDate = kwargs.get('publishDate') # 发表时间
41+
self.ratingChange = kwargs.get('ratingChange') # 评级变动
42+
self.researcher = kwargs.get('researcher') # 研究员
43+
self.stockCode = kwargs.get('stockCode') # 股票代码
44+
self.stockName = kwargs.get('stockName') # 股票简称
45+
self.title = kwargs.get('title') # 报告名称
Lines changed: 79 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,79 @@
1+
# -*- coding: utf-8 -*-
2+
"""
3+
Created on 2021-09-18 18:56:20
4+
---------
5+
@summary: 爬虫入口
6+
---------
7+
@author: cxhuan
8+
"""
9+
10+
from feapder import ArgumentParser
11+
12+
from spiders import *
13+
14+
def crawl_xxx():
15+
"""
16+
AirSpider爬虫
17+
"""
18+
spider = xxx.XXXSpider()
19+
spider.start()
20+
21+
def crawl_xxx():
22+
"""
23+
Spider爬虫
24+
"""
25+
spider = xxx.XXXSpider(redis_key="xxx:xxx")
26+
spider.start()
27+
28+
29+
def crawl_xxx(args):
30+
"""
31+
BatchSpider爬虫
32+
"""
33+
spider = xxx_spider.XXXSpider(
34+
task_table="", # mysql中的任务表
35+
batch_record_table="", # mysql中的批次记录表
36+
batch_name="xxx(周全)", # 批次名字
37+
batch_interval=7, # 批次时间 天为单位 若为小时 可写 1 / 24
38+
task_keys=["id", "xxx"], # 需要获取任务表里的字段名,可添加多个
39+
redis_key="xxx:xxxx", # redis中存放request等信息的根key
40+
task_state="state", # mysql中任务状态字段
41+
)
42+
43+
if args == 1:
44+
spider.start_monitor_task()
45+
elif args == 2:
46+
spider.start()
47+
elif args == 3:
48+
spider.init_task()
49+
50+
51+
if __name__ == "__main__":
52+
parser = ArgumentParser(description="xxx爬虫")
53+
54+
parser.add_argument(
55+
"--crawl_xxx", action="store_true", help="xxx爬虫", function=crawl_xxx
56+
)
57+
parser.add_argument(
58+
"--crawl_xxx", action="store_true", help="xxx爬虫", function=crawl_xxx
59+
)
60+
parser.add_argument(
61+
"--crawl_xxx",
62+
type=int,
63+
nargs=1,
64+
help="xxx爬虫",
65+
choices=[1, 2, 3],
66+
function=crawl_xxx,
67+
)
68+
69+
parser.start()
70+
71+
# main.py作为爬虫启动的统一入口,提供命令行的方式启动多个爬虫,若只有一个爬虫,可不编写main.py
72+
# 将上面的xxx修改为自己实际的爬虫名
73+
# 查看运行命令 python main.py --help
74+
# AirSpider与Spider爬虫运行方式 python main.py --crawl_xxx
75+
# BatchSpider运行方式
76+
# 1. 下发任务:python main.py --crawl_xxx 1
77+
# 2. 采集:python main.py --crawl_xxx 2
78+
# 3. 重置任务:python main.py --crawl_xxx 3
79+

0 commit comments

Comments
 (0)