Commit f1d78f8 (parent cd8180f): add config docs

docs/配置文件参数和意义.md (configuration file parameters and their meanings)
Since [haipproxy](https://github.com/SpiderClub/haipproxy) has a large number of configuration parameters, this document is dedicated to explaining them.

The configuration files live in the [config](https://github.com/SpiderClub/haipproxy/tree/master/config) directory; specifically, they are [settings.py](https://github.com/SpiderClub/haipproxy/blob/master/config/settings.py) and [rules.py](https://github.com/SpiderClub/haipproxy/blob/master/config/rules.py). The former contains the project's default settings, covering both the `scrapy` settings and the `haipproxy`-specific settings; the latter configures the **proxy IP source crawling rules** and the **proxy IP storage mapping rules**. The meaning of each parameter is explained below.
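
Both files are ordinary Python modules, so their values can be read as plain imports. A minimal sketch, assuming the project root is on `PYTHONPATH` so that `config` imports as a package:

```python3
# minimal sketch: reading values from haipproxy's config package.
# Assumes the project root is on PYTHONPATH so that `config` imports as a package.
from config.settings import REDIS_HOST, REDIS_PORT, SPIDER_COMMON_TASK
from config.rules import CRWALER_TASKS

print(REDIS_HOST, REDIS_PORT, SPIDER_COMMON_TASK)
for task in CRWALER_TASKS:
    if task['enable']:
        print(task['name'], '->', task['task_queue'])
```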

---

### settings.py

```python3
#####################################################################
# scrapy-related settings
#####################################################################
# basic scrapy info
BOT_NAME = 'haiproxy'
# paths of the registered spiders
SPIDER_MODULES = ['crawler.spiders', 'crawler.validators']
NEWSPIDER_MODULE = 'crawler'
# scrapy downloader settings
ROBOTSTXT_OBEY = False
COOKIES_ENABLED = False
DOWNLOAD_TIMEOUT = 30
# maximum crawl depth, to prevent infinite recursion
DEPTH_LIMIT = 100
CONCURRENT_REQUESTS = 50
# don't filter anything; you can also set dont_filter=True on Request objects
DUPEFILTER_CLASS = 'scrapy.dupefilters.BaseDupeFilter'
HTTPCACHE_ENABLED = False
# URL of the proxy used to get across the GFW. Note that if you use shadowsocks,
# you need to convert the socks5 proxy to the http protocol; see
# https://rookiefly.cn/detail/201 for details. Adjust 127.0.0.1 to your setup.
GFW_PROXY = 'http://127.0.0.1:8123'

# scrapy-splash URL, used for ajax crawling tasks. Adjust 127.0.0.1 to your setup;
# if you start the project with docker compose, change it to http://splash:8050
SPLASH_URL = 'http://127.0.0.1:8050'

# disable some scrapy extensions to improve crawling efficiency
RETRY_ENABLED = False
TELNETCONSOLE_ENABLED = False
# scrapy downloader middlewares
DOWNLOADER_MIDDLEWARES = {
    'crawler.middlewares.UserAgentMiddleware': 543,
    'crawler.middlewares.ProxyMiddleware': 543,
    'scrapy_splash.SplashCookiesMiddleware': 723,
    # it must come before HttpProxyMiddleware
    'scrapy_splash.SplashMiddleware': 725,
    'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810,
}
# spider middlewares
SPIDER_MIDDLEWARES = {
    'scrapy_splash.SplashDeduplicateArgsMiddleware': 100,
}

# scrapy logging settings; a better approach has not been found yet
LOG_LEVEL = 'DEBUG'
LOG_FILE = 'logs/haipproxy.log'


#####################################################################
# haipproxy default settings
#####################################################################
# Redis settings. If you use docker compose, change '127.0.0.1' to 'redis'
REDIS_HOST = '127.0.0.1'
REDIS_PORT = 6379
REDIS_PASSWORD = '123456'
DEFAULT_REDIS_DB = 0
META_DATA_DB = 0

# settings for the periodic task scheduler; this is its key in Redis.
# The data structure is a hash storing the last execution time of each task
TIMER_RECORDER = 'haipproxy:schduler:task'
LOCKER_PREFIX = 'haipproxy:lock:'  # prefix of the Redis distributed locks

# proxy crawler settings
# number of tasks fetched per batch. Since haipproxy does not guarantee that messages
# are never lost, users need to weigh this value according to their own situation
SPIDER_FEED_SIZE = 10
# the four proxy crawling task types are common, ajax, gfw and ajax_gfw;
# below are their queue names in Redis
SPIDER_COMMON_TASK = 'haipproxy:spider:common'
SPIDER_AJAX_TASK = 'haipproxy:spider:ajax'
SPIDER_GFW_TASK = 'haipproxy:spider:gfw'
SPIDER_AJAX_GFW_TASK = 'haipproxy:spider:ajax_gfw'

# stores all proxy IPs; the data structure is a set, used as a filter
DATA_ALL = 'haipproxy:all'
# data flow: init queue->validated_queue->validator_queue(temp)->validated_queue(score queue)->
# ttl_queue, speed_queue -> clients
# http_queue is a list that stores freshly crawled http/https proxies
INIT_HTTP_QUEUE = 'haipproxy:init:http'
# lists for socks4/5 proxies; the project does not validate or use them yet
INIT_SOCKS4_QUEUE = 'haipproxy:init:socks4'
INIT_SOCKS5_QUEUE = 'haipproxy:init:socks5'

# batch size of the validator's task fetches
VALIDATOR_FEED_SIZE = 50
# temporary sets fed from the init queue, used to filter out transparent proxies
TEMP_HTTP_QUEUE = 'haipproxy:http:temp'
TEMP_HTTPS_QUEUE = 'haipproxy:https:temp'
TEMP_WEIBO_QUEUE = 'haipproxy:weibo:temp'
TEMP_ZHIHU_QUEUE = 'haipproxy:zhihu:temp'

# sorted sets that store validated IPs and their scores
VALIDATED_HTTP_QUEUE = 'haipproxy:validated:http'
VALIDATED_HTTPS_QUEUE = 'haipproxy:validated:https'
VALIDATED_WEIBO_QUEUE = 'haipproxy:validated:weibo'
VALIDATED_ZHIHU_QUEUE = 'haipproxy:validated:zhihu'

# sorted sets that store validated IPs and the time they were last validated
TTL_VALIDATED_RESOURCE = 2  # minutes
TTL_HTTP_QUEUE = 'haipproxy:ttl:http'
TTL_HTTPS_QUEUE = 'haipproxy:ttl:https'
TTL_WEIBO_QUEUE = 'haipproxy:ttl:weibo'
TTL_ZHIHU_QUEUE = 'haipproxy:ttl:zhihu'

# sorted sets that store validated IPs and their response speed; this is the speed
# of the most recent response, not the average
SPEED_HTTP_QUEUE = 'haipproxy:speed:http'
SPEED_HTTPS_QUEUE = 'haipproxy:speed:https'
SPEED_WEIBO_QUEUE = 'haipproxy:speed:weibo'
SPEED_ZHIHU_QUEUE = 'haipproxy:speed:zhihu'

# if you want to use squid as a second-level proxy, configure the squid parameters
# below. Taking Ubuntu as an example,
# first run: sudo chown -R $USER /etc/squid/
# then run:  sudo chown -R $USER /var/log/squid/cache.log
SQUID_BIN_PATH = '/usr/sbin/squid'  # on macOS the path is '/usr/local/sbin/squid'
SQUID_CONF_PATH = '/etc/squid/squid.conf'  # on macOS the path is '/usr/local/etc/squid.conf'
# the template file must be copied by the user
SQUID_TEMPLATE_PATH = '/etc/squid/squid.conf.backup'  # on macOS the path is /usr/local/etc/squid.conf.backup

# client settings
# the client only selects proxy IPs whose response time is within 10 seconds
LONGEST_RESPONSE_TIME = 10
# the client only selects proxy IPs with a score of at least 7
LOWEST_SCORE = 7
```
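
To make the last two thresholds concrete, here is a minimal sketch (using the `redis` Python package directly, not haipproxy's bundled client) of how a consumer could pull proxies scoring at least `LOWEST_SCORE` out of one of the validated sorted sets, with the connection values taken from the settings above:

```python3
# minimal sketch: fetch usable proxies from a validated score queue with redis-py.
# This is an illustration, not haipproxy's bundled client.
import redis

from config.settings import (REDIS_HOST, REDIS_PORT, REDIS_PASSWORD,
                             DEFAULT_REDIS_DB, VALIDATED_HTTP_QUEUE, LOWEST_SCORE)

conn = redis.StrictRedis(host=REDIS_HOST, port=REDIS_PORT,
                         password=REDIS_PASSWORD, db=DEFAULT_REDIS_DB,
                         decode_responses=True)

# members of the sorted set are the stored proxy entries, scores are their quality
# scores; keep only those at or above LOWEST_SCORE, best first
proxies = conn.zrevrangebyscore(VALIDATED_HTTP_QUEUE, '+inf', LOWEST_SCORE)
print(proxies[:10])
```

Score is only one of the three dimensions; the TTL and speed sorted sets can be queried the same way against their respective keys.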

### rules.py

```python3
# task rules for the proxy crawlers
CRWALER_TASKS = [
    {
        # name of the proxy IP source; it must be unique, and using the domain name is recommended
        'name': 'mogumiao',
        # proxy IP pages to crawl; you may specify them as needed. By default the project
        # only crawls the anonymous and elite (high-anonymity) pages
        'resource': ['http://www.mogumiao.com/proxy/free/listFreeIp',
                     'http://www.mogumiao.com/proxy/api/freeIp?count=15'],
        # crawler task type; there are four types: common (no GFW bypass, no ajax rendering),
        # ajax (ajax rendering required), gfw (GFW bypass required) and
        # ajax_gfw (both GFW bypass and ajax rendering required)
        'task_queue': SPIDER_COMMON_TASK,
        # scheduled crawl interval in minutes, chosen according to how often the site
        # publishes new proxy IPs
        'internal': 5,
        # whether this rule is enabled
        'enable': 1,
    }
]

# mapping from crawler task type to task queue
CRAWLER_TASK_MAPS = {
    'common': SPIDER_COMMON_TASK,
    'ajax': SPIDER_AJAX_TASK,
    'gfw': SPIDER_GFW_TASK,
    'ajax_gfw': SPIDER_AJAX_GFW_TASK
}


# validators fetch proxy IPs from task_queue and, after validation, store them in
# resource; see the architecture doc for the detailed workflow
VALIDATOR_TASKS = [
    {
        # task name; it must not clash with other task names
        'name': 'http',
        # where the proxies come from
        'task_queue': TEMP_HTTP_QUEUE,
        # where the validated proxies are stored
        'resource': VALIDATED_HTTP_QUEUE,
        # scheduled validation interval
        'internal': 20,
        # whether this task is enabled
        'enable': 1,
    },
    {
        'name': 'zhihu',
        'task_queue': TEMP_ZHIHU_QUEUE,
        'resource': VALIDATED_ZHIHU_QUEUE,
        'internal': 20,
        'enable': 1,
    },
]

# validators fetch proxy IPs from the queues below for validation
TEMP_TASK_MAPS = {
    # the init queue must be set
    'init': INIT_HTTP_QUEUE,
    'http': TEMP_HTTP_QUEUE,
    'zhihu': TEMP_ZHIHU_QUEUE
}

# the three maps below store and serve usable proxies along three dimensions:
# score, last validation time and response speed
SCORE_MAPS = {
    'http': VALIDATED_HTTP_QUEUE,
    'https': VALIDATED_HTTPS_QUEUE,
    'weibo': VALIDATED_WEIBO_QUEUE,
    'zhihu': VALIDATED_ZHIHU_QUEUE
}

TTL_MAPS = {
    'http': TTL_HTTP_QUEUE,
    'https': TTL_HTTPS_QUEUE,
    'weibo': TTL_WEIBO_QUEUE,
    'zhihu': TTL_ZHIHU_QUEUE
}

SPEED_MAPS = {
    'http': SPEED_HTTP_QUEUE,
    'https': SPEED_HTTPS_QUEUE,
    'weibo': SPEED_WEIBO_QUEUE,
    'zhihu': SPEED_ZHIHU_QUEUE
}
```
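
Adding a new proxy source means adding one more dict of the same shape to `CRWALER_TASKS`. The entry below is a hypothetical illustration; the source name and URL are placeholders, not something shipped with the project:

```python3
# hypothetical example of registering one more proxy source; the name and URL are
# placeholders for illustration only
from config.settings import SPIDER_COMMON_TASK

EXAMPLE_TASK = {
    'name': 'example-proxy-site',                          # unique source name
    'resource': ['http://example-proxy-site.com/free/'],   # pages listing free proxies
    'task_queue': SPIDER_COMMON_TASK,  # plain pages: no GFW bypass, no ajax rendering
    'internal': 30,                    # re-crawl every 30 minutes
    'enable': 1,                       # set to 0 to disable without deleting the entry
}
# in rules.py this dict would simply be appended to the CRWALER_TASKS list
```

Depending on the page format, the crawler spiders may also need a matching parse rule; this snippet only shows the shape of the task entry itself.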
