Since [haipproxy](https://github.com/SpiderClub/haipproxy) has a large number of configuration parameters, this document is dedicated to explaining them.

The configuration files live in the [config](https://github.com/SpiderClub/haipproxy/tree/master/config) directory, namely [settings.py](https://github.com/SpiderClub/haipproxy/blob/master/config/settings.py)
and [rules.py](https://github.com/SpiderClub/haipproxy/blob/master/config/rules.py).
The former holds the project's default settings, covering both `scrapy` options and `haipproxy`-specific options; the latter **configures the crawl rules for proxy IP sources** and **the mapping rules for proxy IP storage**. The meaning of each parameter is explained below.

---

### settings.py

```python3
#####################################################################
# scrapy-related settings
#####################################################################
# basic scrapy info
BOT_NAME = 'haiproxy'
# module paths where spiders are registered
SPIDER_MODULES = ['crawler.spiders', 'crawler.validators']
NEWSPIDER_MODULE = 'crawler'
# scrapy downloader settings
ROBOTSTXT_OBEY = False
COOKIES_ENABLED = False
DOWNLOAD_TIMEOUT = 30
# maximum crawl depth, to prevent infinite recursion
DEPTH_LIMIT = 100
CONCURRENT_REQUESTS = 50
# don't filter anything; alternatively, set dont_filter=True on Request objects
DUPEFILTER_CLASS = 'scrapy.dupefilters.BaseDupeFilter'
HTTPCACHE_ENABLED = False
# URL of the proxy used to get past the GFW. Note that if you use shadowsocks,
# you need to convert its socks5 proxy into an http one; see
# https://rookiefly.cn/detail/201 for details. Adjust 127.0.0.1 to your setup
GFW_PROXY = 'http://127.0.0.1:8123'
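# one common way to do the socks5-to-http conversion mentioned above (not the
# only one, and only a sketch: it assumes shadowsocks listens locally on port
# 1080) is privoxy, with two lines like these in its config file:
#   listen-address 127.0.0.1:8123
#   forward-socks5 / 127.0.0.1:1080 .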

# scrapy-splash URL, used for tasks that need ajax rendering. Adjust 127.0.0.1
# to your setup; if the services are started with docker compose, change it to
# http://splash:8050
SPLASH_URL = 'http://127.0.0.1:8050'
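# if you are not using docker compose, the Splash service itself is usually
# started as a standalone docker container, for example:
#   docker run -d -p 8050:8050 scrapinghub/splash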

# disable some scrapy extensions to speed up crawling
RETRY_ENABLED = False
TELNETCONSOLE_ENABLED = False
# scrapy downloader middlewares
DOWNLOADER_MIDDLEWARES = {
    'crawler.middlewares.UserAgentMiddleware': 543,
    'crawler.middlewares.ProxyMiddleware': 543,
    'scrapy_splash.SplashCookiesMiddleware': 723,
    # it should come before HttpProxyMiddleware
    'scrapy_splash.SplashMiddleware': 725,
    'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810,
}
# spider middlewares
SPIDER_MIDDLEWARES = {
    'scrapy_splash.SplashDeduplicateArgsMiddleware': 100,
}

# scrapy logging settings; a better approach has not been found yet
LOG_LEVEL = 'DEBUG'
LOG_FILE = 'logs/haipproxy.log'


#####################################################################
# haipproxy default settings
#####################################################################
# redis settings. If you use docker compose, change '127.0.0.1' to 'redis'
REDIS_HOST = '127.0.0.1'
REDIS_PORT = 6379
REDIS_PASSWORD = '123456'
DEFAULT_REDIS_DB = 0
META_DATA_DB = 0

# settings for the timed task scheduler; this is its key in Redis
# the data structure is a hash that stores the last run time of each task
TIMER_RECORDER = 'haipproxy:schduler:task'
LOCKER_PREFIX = 'haipproxy:lock:'  # prefix of the Redis distributed locks

# settings for the proxy-crawling spiders
# number of tasks fetched from the queue each time; since haipproxy does not
# guarantee that messages will never be lost, users need to weigh this
# according to their own situation
SPIDER_FEED_SIZE = 10
# the four proxy crawl tasks are common, ajax, gfw and ajax_gfw;
# below are their queue names in Redis
SPIDER_COMMON_TASK = 'haipproxy:spider:common'
SPIDER_AJAX_TASK = 'haipproxy:spider:ajax'
SPIDER_GFW_TASK = 'haipproxy:spider:gfw'
SPIDER_AJAX_GFW_TASK = 'haipproxy:spider:ajax_gfw'

# stores all proxy IPs; the data structure is a set, used as a filter
DATA_ALL = 'haipproxy:all'
# data flow: init queue->validated_queue->validator_queue(temp)->validated_queue(score queue)->
# ttl_queue, speed_queue -> clients
# the http queue is a list that holds freshly crawled http/https proxies
INIT_HTTP_QUEUE = 'haipproxy:init:http'
# lists for socks4/5 proxies; the project does not validate or use them yet
INIT_SOCKS4_QUEUE = 'haipproxy:init:socks4'
INIT_SOCKS5_QUEUE = 'haipproxy:init:socks5'

# number of proxies a validator fetches per batch
VALIDATOR_FEED_SIZE = 50
# temporary sets fed from the init queue, used to filter out transparent proxies
TEMP_HTTP_QUEUE = 'haipproxy:http:temp'
TEMP_HTTPS_QUEUE = 'haipproxy:https:temp'
TEMP_WEIBO_QUEUE = 'haipproxy:weibo:temp'
TEMP_ZHIHU_QUEUE = 'haipproxy:zhihu:temp'

# sorted sets that store validated IPs and their scores
VALIDATED_HTTP_QUEUE = 'haipproxy:validated:http'
VALIDATED_HTTPS_QUEUE = 'haipproxy:validated:https'
VALIDATED_WEIBO_QUEUE = 'haipproxy:validated:weibo'
VALIDATED_ZHIHU_QUEUE = 'haipproxy:validated:zhihu'

# sorted sets that store validated IPs and the time they were last validated
TTL_VALIDATED_RESOURCE = 2  # minutes
TTL_HTTP_QUEUE = 'haipproxy:ttl:http'
TTL_HTTPS_QUEUE = 'haipproxy:ttl:https'
TTL_WEIBO_QUEUE = 'haipproxy:ttl:weibo'
TTL_ZHIHU_QUEUE = 'haipproxy:ttl:zhihu'

# sorted sets that store validated IPs and their response speed; the speed is
# the most recent response time, not an average
SPEED_HTTP_QUEUE = 'haipproxy:speed:http'
SPEED_HTTPS_QUEUE = 'haipproxy:speed:https'
SPEED_WEIBO_QUEUE = 'haipproxy:speed:weibo'
SPEED_ZHIHU_QUEUE = 'haipproxy:speed:zhihu'

# if you need to use squid as a secondary proxy, configure the squid-related
# parameters below. Taking ubuntu as an example, first run
#   sudo chown -R $USER /etc/squid/
# and then
#   sudo chown -R $USER /var/log/squid/cache.log
SQUID_BIN_PATH = '/usr/sbin/squid'  # on mac the path is '/usr/local/sbin/squid'
SQUID_CONF_PATH = '/etc/squid/squid.conf'  # on mac the path is '/usr/local/etc/squid.conf'
# the template file must be copied by the user
SQUID_TEMPLATE_PATH = '/etc/squid/squid.conf.backup'  # on mac the path is /usr/local/etc/squid.conf.backup

# client settings
# the client only picks proxy IPs whose response time is within 10 seconds
LONGEST_RESPONSE_TIME = 10
# the client only picks proxy IPs whose score is at least 7
LOWEST_SCORE = 7
```
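
The last two settings act on the `validated` and `speed` sorted sets listed above. As a minimal sketch (not part of the project, which ships its own Python client with a more elaborate selection strategy), the snippet below shows how `LOWEST_SCORE` and `LONGEST_RESPONSE_TIME` could be combined with `redis-py` to pick usable HTTP proxies. It assumes the speed sorted set stores response times in seconds; check the validator code for the actual unit before relying on it.

```python3
# illustrative sketch only; not part of haipproxy
import redis

from config.settings import (
    REDIS_HOST, REDIS_PORT, REDIS_PASSWORD, DEFAULT_REDIS_DB,
    VALIDATED_HTTP_QUEUE, SPEED_HTTP_QUEUE,
    LOWEST_SCORE, LONGEST_RESPONSE_TIME)

conn = redis.StrictRedis(host=REDIS_HOST, port=REDIS_PORT,
                         password=REDIS_PASSWORD, db=DEFAULT_REDIS_DB,
                         decode_responses=True)

# proxies whose score is at least LOWEST_SCORE
candidates = conn.zrangebyscore(VALIDATED_HTTP_QUEUE, LOWEST_SCORE, '+inf')

# keep only proxies whose latest response time is within LONGEST_RESPONSE_TIME
# (assumed here to be stored in seconds in the speed sorted set)
usable = []
for proxy in candidates:
    speed = conn.zscore(SPEED_HTTP_QUEUE, proxy)
    if speed is not None and speed <= LONGEST_RESPONSE_TIME:
        usable.append(proxy)

print('usable http proxies:', len(usable))
```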

### rules.py

```python3
# task rules for the proxy-crawling spiders
CRWALER_TASKS = [
    {
        # name of the proxy IP source; it must be unique, and using the domain
        # name is recommended
        'name': 'mogumiao',
        # proxy IP URLs to crawl; specify them according to your needs. By
        # default the project only crawls the anonymous and elite-anonymity pages
        'resource': ['http://www.mogumiao.com/proxy/free/listFreeIp',
                     'http://www.mogumiao.com/proxy/api/freeIp?count=15'],
        # crawl task type; there are four types in total: common (no GFW proxy
        # or ajax rendering needed), ajax (ajax rendering needed), gfw (GFW
        # proxy needed) and ajax_gfw (both GFW proxy and ajax rendering needed)
        'task_queue': SPIDER_COMMON_TASK,
        # crawl interval in minutes; set it according to how often the site
        # updates its proxy IPs
        'internal': 5,
        # whether this rule is enabled
        'enable': 1,
    }
]

# mapping from crawl task type to spider task queue
CRAWLER_TASK_MAPS = {
    'common': SPIDER_COMMON_TASK,
    'ajax': SPIDER_AJAX_TASK,
    'gfw': SPIDER_GFW_TASK,
    'ajax_gfw': SPIDER_AJAX_GFW_TASK
}


# validators fetch proxy IPs from task_queue and store them into resource after
# validation; see the architecture document for the full flow
VALIDATOR_TASKS = [
    {
        # task name; it must not clash with other task names
        'name': 'http',
        # where the proxies come from
        'task_queue': TEMP_HTTP_QUEUE,
        # where the validated proxies are stored
        'resource': VALIDATED_HTTP_QUEUE,
        # scheduled validation interval
        'internal': 20,
        # whether this task is enabled
        'enable': 1,
    },
    {
        'name': 'zhihu',
        'task_queue': TEMP_ZHIHU_QUEUE,
        'resource': VALIDATED_ZHIHU_QUEUE,
        'internal': 20,
        'enable': 1,
    },
]

# validators fetch proxy IPs for validation from the queues below
TEMP_TASK_MAPS = {
    # the init queue must be configured
    'init': INIT_HTTP_QUEUE,
    'http': TEMP_HTTP_QUEUE,
    'zhihu': TEMP_ZHIHU_QUEUE
}

# the three maps below store and serve usable proxies, one for each of the
# three dimensions
SCORE_MAPS = {
    'http': VALIDATED_HTTP_QUEUE,
    'https': VALIDATED_HTTPS_QUEUE,
    'weibo': VALIDATED_WEIBO_QUEUE,
    'zhihu': VALIDATED_ZHIHU_QUEUE
}

TTL_MAPS = {
    'http': TTL_HTTP_QUEUE,
    'https': TTL_HTTPS_QUEUE,
    'weibo': TTL_WEIBO_QUEUE,
    'zhihu': TTL_ZHIHU_QUEUE
}

SPEED_MAPS = {
    'http': SPEED_HTTP_QUEUE,
    'https': SPEED_HTTPS_QUEUE,
    'weibo': SPEED_WEIBO_QUEUE,
    'zhihu': SPEED_ZHIHU_QUEUE
}
```
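
Putting the two files together, adding a new free proxy source means appending one more dict to the crawler task list above (and, depending on the page format, providing a matching parser in the crawler). The entry below is purely hypothetical: the source name and URLs are made up for illustration.

```python3
# hypothetical extra entry for the crawler task list in rules.py;
# the source name and URLs below are invented for illustration only
from config.settings import SPIDER_COMMON_TASK

EXTRA_TASK = {
    # unique source name, ideally the site's domain name
    'name': 'example-proxy-site.com',
    # pages on the (made-up) site that list free proxies
    'resource': ['http://example-proxy-site.com/free/1',
                 'http://example-proxy-site.com/free/2'],
    # plain pages: no GFW proxy and no ajax rendering needed
    'task_queue': SPIDER_COMMON_TASK,
    # assume the site refreshes its list roughly once an hour
    'internal': 60,
    'enable': 1,
}
```

In rules.py itself this dict would simply become another element of the task list, alongside the mogumiao entry shown above.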