From ac8842504f488d6139e10370ecbbf3806be399e8 Mon Sep 17 00:00:00 2001 From: jhao104 Date: Mon, 16 Apr 2018 15:56:44 +0800 Subject: [PATCH 001/304] [update]jsonify --- Api/ProxyApi.py | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) diff --git a/Api/ProxyApi.py b/Api/ProxyApi.py index 45db4843a..724dc35e6 100644 --- a/Api/ProxyApi.py +++ b/Api/ProxyApi.py @@ -14,16 +14,29 @@ __author__ = 'JHao' import sys +from werkzeug.wrappers import Response +from flask import Flask, jsonify, request sys.path.append('../') -from flask import Flask, jsonify, request from Util.GetConfig import GetConfig - from Manager.ProxyManager import ProxyManager app = Flask(__name__) + +class JsonResponse(Response): + + @classmethod + def force_type(cls, response, environ=None): + if isinstance(response, (dict, list)): + response = jsonify(response) + + return super(JsonResponse, cls).force_type(response, environ) + + +app.response_class = JsonResponse + api_list = { 'get': u'get an usable proxy', # 'refresh': u'refresh proxy pool', @@ -35,7 +48,7 @@ @app.route('/') def index(): - return jsonify(api_list) + return api_list @app.route('/get/') @@ -55,7 +68,7 @@ def refresh(): @app.route('/get_all/') def getAll(): proxies = ProxyManager().getAll() - return jsonify(proxies) + return proxies @app.route('/delete/', methods=['GET']) @@ -68,7 +81,7 @@ def delete(): @app.route('/get_status/') def getStatus(): status = ProxyManager().getNumber() - return jsonify(status) + return status def run(): From 34ef9cedb989feeb3ba1019596706c897c95a965 Mon Sep 17 00:00:00 2001 From: luocaodan Date: Fri, 27 Apr 2018 10:21:51 +0800 Subject: [PATCH 002/304] =?UTF-8?q?=E6=B7=BB=E5=8A=A0=E4=B8=A4=E4=B8=AA?= =?UTF-8?q?=E4=BB=A3=E7=90=86=E7=BD=91=E7=AB=99?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- Config.ini | 5 +- ProxyGetter/getFreeProxy.py | 235 +++++++++++++++++++++++++++--------- Util/WebRequest.py | 20 ++- requirements.txt | 10 +- 4 files changed, 203 insertions(+), 67 deletions(-) diff --git a/Config.ini b/Config.ini index 14e785427..763007e23 100644 --- a/Config.ini +++ b/Config.ini @@ -17,9 +17,10 @@ freeProxyFifth = 1 freeProxySixth = 1 freeProxySeventh = 1 freeProxyEight = 1 +freeProxyNinth = 1 ;foreign website, outside the wall -;freeProxyWallFirst = 1 -;freeProxyWallSecond = 1 +freeProxyWallFirst = 1 +freeProxyWallSecond = 1 [HOST] ; API接口配置 http://127.0.0.1:5010 diff --git a/ProxyGetter/getFreeProxy.py b/ProxyGetter/getFreeProxy.py index ff9ee8197..4cd08ced5 100644 --- a/ProxyGetter/getFreeProxy.py +++ b/ProxyGetter/getFreeProxy.py @@ -21,7 +21,7 @@ reload(sys) sys.setdefaultencoding('utf-8') -sys.path.append('../') +sys.path.append('..') from Util.utilFunction import robustCrawl, getHtmlTree from Util.WebRequest import WebRequest @@ -32,14 +32,13 @@ """ 66ip.cn data5u.com - ip181.com xicidaili.com goubanjia.com xdaili.cn kuaidaili.com cn-proxy.com proxy-list.org - www.mimiip.com + www.mimiip.com to do """ @@ -55,13 +54,15 @@ def __init__(self): def freeProxyFirst(page=10): """ 抓取无忧代理 http://www.data5u.com/ + 几乎没有能用的 :param page: 页数 :return: """ - url_list = ['http://www.data5u.com/', - 'http://www.data5u.com/free/', - 'http://www.data5u.com/free/gngn/index.shtml', - 'http://www.data5u.com/free/gnpt/index.shtml'] + url_list = [ + 'http://www.data5u.com/', + 'http://www.data5u.com/free/gngn/index.shtml', + 'http://www.data5u.com/free/gnpt/index.shtml' + ] for url in url_list: html_tree = getHtmlTree(url) ul_list = html_tree.xpath('//ul[@class="l2"]') @@ -69,10 +70,10 @@ def freeProxyFirst(page=10): try: yield ':'.join(ul.xpath('.//li/text()')[0:2]) except Exception as e: - pass + print(e) @staticmethod - def freeProxySecond(proxy_number=100): + def deprecatedFreeProxySecond(proxy_number=100): """ 抓取代理66 http://www.66ip.cn/ :param proxy_number: 代理数量 @@ -85,6 +86,29 @@ def freeProxySecond(proxy_number=100): for proxy in re.findall(r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}:\d{1,5}', html): yield proxy + @staticmethod + def freeProxySecond(area=33): + """ + 修改抓取代理66 http://www.66ip.cn/ + :param page:抓取代理页数,page=1北京代理页,page=2上海代理页...... + :return: + """ + if area > 33: + page = 33 + for area_index in range(1, area + 1): + page_count = 5 + for i in range(1, page_count + 1): + url = "http://www.66ip.cn/areaindex_{}/{}.html".format(area_index, i) + html_tree = getHtmlTree(url) + tr_list = html_tree.xpath("//*[@id='footer']/div/table/tr[position()>1]") + if len(tr_list) == 0: + continue + for tr in tr_list: + yield tr.xpath("./td[1]/text()")[0] + ":" + tr.xpath("./td[2]/text()")[0] + break + + ''' + 不能用了 @staticmethod def freeProxyThird(days=1): """ @@ -100,24 +124,28 @@ def freeProxyThird(days=1): yield ':'.join(tr.xpath('./td/text()')[0:2]) except Exception as e: pass + ''' @staticmethod - def freeProxyFourth(): + def freeProxyFourth(page_count=2): """ 抓取西刺代理 http://api.xicidaili.com/free2016.txt :return: """ - url_list = ['http://www.xicidaili.com/nn', # 高匿 - 'http://www.xicidaili.com/nt', # 透明 - ] + url_list = [ + 'http://www.xicidaili.com/nn/', # 高匿 + 'http://www.xicidaili.com/nt/', # 透明 + ] for each_url in url_list: - tree = getHtmlTree(each_url) - proxy_list = tree.xpath('.//table[@id="ip_list"]//tr') - for proxy in proxy_list: - try: - yield ':'.join(proxy.xpath('./td/text()')[0:2]) - except Exception as e: - pass + for i in range(1, page_count + 1): + page_url = each_url + str(i) + tree = getHtmlTree(page_url) + proxy_list = tree.xpath('.//table[@id="ip_list"]//tr[position()>1]') + for proxy in proxy_list: + try: + yield ':'.join(proxy.xpath('./td/text()')[0:2]) + except Exception as e: + pass @staticmethod def freeProxyFifth(): @@ -164,13 +192,17 @@ def freeProxySeventh(): """ 快代理免费https://www.kuaidaili.com/free/inha/1/ """ - url = 'https://www.kuaidaili.com/free/inha/{page}/' - for page in range(1, 10): - page_url = url.format(page=page) - tree = getHtmlTree(page_url) - proxy_list = tree.xpath('.//table//tr') - for tr in proxy_list[1:]: - yield ':'.join(tr.xpath('./td/text()')[0:2]) + url_list = [ + 'https://www.kuaidaili.com/free/inha/{page}/', + 'https://www.kuaidaili.com/free/intr/{page}/' + ] + for url in url_list: + for page in range(1, 5): + page_url = url.format(page=page) + tree = getHtmlTree(page_url) + proxy_list = tree.xpath('.//table//tr') + for tr in proxy_list[1:]: + yield ':'.join(tr.xpath('./td/text()')[0:2]) @staticmethod def freeProxyEight(): @@ -184,15 +216,33 @@ def freeProxyEight(): request = WebRequest() for url in url_list: - r = request.get(url) + r = request.get(url, use_proxy=True) proxies = re.findall(r'(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})[\w\W].*(\d+)', r.text) for proxy in proxies: yield ':'.join(proxy) + + @staticmethod + def freeProxyNinth(): + """ + coderBusy + https://proxy.coderbusy.com/ + :return: + """ + urls = ['https://proxy.coderbusy.com/classical/country/cn.aspx?page=1'] + request = WebRequest() + for url in urls: + r = request.get(url) + proxies = re.findall('data-ip="(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})".+?>(\d+)', r.text) + for proxy in proxies: + yield ':'.join(proxy) + + @staticmethod def freeProxyWallFirst(): """ 墙外网站 cn-proxy + 并没有被墙 :return: """ urls = ['http://cn-proxy.com/', 'http://cn-proxy.com/archives/218'] @@ -205,6 +255,10 @@ def freeProxyWallFirst(): @staticmethod def freeProxyWallSecond(): + ''' + 并没有被墙 + :return: + ''' urls = ['https://proxy-list.org/english/index.php?p=%s' % n for n in range(1, 10)] request = WebRequest() import base64 @@ -215,34 +269,101 @@ def freeProxyWallSecond(): yield base64.b64decode(proxy).decode() + @staticmethod + def freeProxyWallThird(): + urls = ['https://list.proxylistplus.com/Fresh-HTTP-Proxy-List-1'] + request = WebRequest() + for url in urls: + r = request.get(url) + proxies = re.findall(r'(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})[\s\S]*?(\d+)', r.text) + for proxy in proxies: + yield ':'.join(proxy) + + +import threading + +lock = threading.Lock() +success = 0 +total = 0 + + +def test_once(proxy): + ip_port = proxy.split(":") + ip = ip_port[0] + port = ip_port[1] + import requests + + req_url = "http://www.baidu.com" + proxies = { + "http": "http://%s:%s" % (ip, port), + "https": "https://%s:%s" % (ip, port) + } + + global total + + try: + response = requests.get(req_url, proxies=proxies, timeout=4) + if response.status_code != 200: + print("unknow error, status code:" + str(response.status_code)) + lock.acquire() + total += 1 + lock.release() + return 0 + print("success") + global success + lock.acquire() + success += 1 + total += 1 + lock.release() + return 1 + except requests.exceptions.Timeout: + print("timeout") + except requests.exceptions.ConnectionError: + print("poxy unusable") + except Exception: + print("request error") + + lock.acquire() + total += 1 + lock.release() + return 0 + + +def test_batch(iterator): + global success + global total + + for proxy in iterator: + t = threading.Thread(target=test_once, args=(proxy,)) + t.start() + t.join() + + print("success:" + str(success) + "\ttotal:" + str(total)) + + if __name__ == '__main__': gg = GetFreeProxy() - # for e in gg.freeProxyFirst(): - # print(e) - # - # for e in gg.freeProxySecond(): - # print(e) - # - # for e in gg.freeProxyThird(): - # print(e) - # - # for e in gg.freeProxyFourth(): - # print(e) - # - # for e in gg.freeProxyFifth(): - # print(e) - # - # for e in gg.freeProxySixth(): - # print(e) - # - # for e in gg.freeProxySeventh(): - # print(e) - # - # for e in gg.freeProxyEight(): - # print(e) - # - # for e in gg.freeProxyWallFirst(): - # print(e) - # - # for e in gg.freeProxyWallSecond(): - # print(e) + + # test_batch(gg.freeProxyFirst()) + + # test_batch(gg.freeProxySecond()) + + # test_batch(gg.freeProxyFourth()) + + # test_batch(gg.freeProxyFifth()) + + # test_batch(gg.freeProxySixth()) + + # test_batch(gg.freeProxySeventh()) + + # to do + test_batch(gg.freeProxyEight()) + # gg.freeProxyEight() + + # test_batch(gg.freeProxyNinth()) + + # test_batch(gg.freeProxyWallFirst()) + + # test_batch(gg.freeProxyWallSecond()) + + # test_batch(gg.freeProxyWallThird()) diff --git a/Util/WebRequest.py b/Util/WebRequest.py index abbdb17be..7f5011724 100644 --- a/Util/WebRequest.py +++ b/Util/WebRequest.py @@ -15,6 +15,7 @@ import requests import random import time +from requests.models import Response class WebRequest(object): @@ -51,7 +52,7 @@ def header(self): 'Accept-Language': 'zh-CN,zh;q=0.8'} def get(self, url, header=None, retry_time=5, timeout=30, - retry_flag=list(), retry_interval=5, *args, **kwargs): + retry_flag=list(), retry_interval=5, use_proxy=False, *args, **kwargs): """ get method :param url: target url @@ -60,6 +61,7 @@ def get(self, url, header=None, retry_time=5, timeout=30, :param timeout: network timeout :param retry_flag: if retry_flag in content. do retry :param retry_interval: retry interval(second) + :param use_proxy: 是否使用代理 :param args: :param kwargs: :return: @@ -69,7 +71,16 @@ def get(self, url, header=None, retry_time=5, timeout=30, headers.update(header) while True: try: - html = requests.get(url, headers=headers, timeout=timeout) + if use_proxy: + proxy_url = "http://127.0.0.1:5010/get" + ip_proxy = requests.get(proxy_url).text + proxies = { + "http": "http://" + ip_proxy, + "https": "https://" + ip_proxy + } + html = requests.get(url, headers=headers, timeout=timeout, proxies=proxies) + else: + html = requests.get(url, headers=headers, timeout=timeout) if any(f in html.content for f in retry_flag): raise Exception return html @@ -78,5 +89,8 @@ def get(self, url, header=None, retry_time=5, timeout=30, retry_time -= 1 if retry_time <= 0: # 多次请求失败时,返回百度页面 - return requests.get("https://www.baidu.com/") + resp = Response() + resp.status_code = 200 + return resp time.sleep(retry_interval) + diff --git a/requirements.txt b/requirements.txt index 698cc8197..c0db31b10 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,9 +1,9 @@ APScheduler==3.2.0 -Flask==0.11.1 -requests==2.11.0 -lxml==3.7.1 +Flask +requests +lxml -pymongo==3.2.2 -redis==2.10.5 +pymongo +redis From de58a7e81fc9f7f33b8437f674c4bbd7245b2391 Mon Sep 17 00:00:00 2001 From: luocaodan Date: Fri, 27 Apr 2018 11:43:07 +0800 Subject: [PATCH 003/304] =?UTF-8?q?=E6=B7=BB=E5=8A=A0=E4=B8=80=E4=B8=AA?= =?UTF-8?q?=E4=BB=A3=E7=90=86=E7=BD=91=E7=AB=99=20http://www.ip3366.net/fr?= =?UTF-8?q?ee/?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- Config.ini | 1 + ProxyGetter/getFreeProxy.py | 17 ++++++++++++++--- 2 files changed, 15 insertions(+), 3 deletions(-) diff --git a/Config.ini b/Config.ini index 763007e23..322aaddd8 100644 --- a/Config.ini +++ b/Config.ini @@ -18,6 +18,7 @@ freeProxySixth = 1 freeProxySeventh = 1 freeProxyEight = 1 freeProxyNinth = 1 +freeProxyTen = 1 ;foreign website, outside the wall freeProxyWallFirst = 1 freeProxyWallSecond = 1 diff --git a/ProxyGetter/getFreeProxy.py b/ProxyGetter/getFreeProxy.py index 4cd08ced5..1330d88fc 100644 --- a/ProxyGetter/getFreeProxy.py +++ b/ProxyGetter/getFreeProxy.py @@ -238,6 +238,17 @@ def freeProxyNinth(): yield ':'.join(proxy) + @staticmethod + def freeProxyTen(): + urls = ['http://www.ip3366.net/free/'] + request = WebRequest() + for url in urls: + r = request.get(url) + proxies = re.findall(r'(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})[\s\S]*?(\d+)', r.text) + for proxy in proxies: + yield ":".join(proxy) + + @staticmethod def freeProxyWallFirst(): """ @@ -356,12 +367,12 @@ def test_batch(iterator): # test_batch(gg.freeProxySeventh()) - # to do - test_batch(gg.freeProxyEight()) - # gg.freeProxyEight() + # test_batch(gg.freeProxyEight()) # test_batch(gg.freeProxyNinth()) + # test_batch(gg.freeProxyTen()) + # test_batch(gg.freeProxyWallFirst()) # test_batch(gg.freeProxyWallSecond()) From e2433d39fc8f5802fedfbc6d6e8e9fc9e1ff7b15 Mon Sep 17 00:00:00 2001 From: luocaodan Date: Fri, 27 Apr 2018 12:41:15 +0800 Subject: [PATCH 004/304] =?UTF-8?q?=E6=B7=BB=E5=8A=A0=E4=B8=80=E4=B8=AA?= =?UTF-8?q?=E4=BB=A3=E7=90=86=E7=BD=91=E7=AB=99=20http://www.iphai.com?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- Config.ini | 2 ++ ProxyGetter/getFreeProxy.py | 18 ++++++++++++++++++ 2 files changed, 20 insertions(+) diff --git a/Config.ini b/Config.ini index 322aaddd8..5f417badc 100644 --- a/Config.ini +++ b/Config.ini @@ -19,9 +19,11 @@ freeProxySeventh = 1 freeProxyEight = 1 freeProxyNinth = 1 freeProxyTen = 1 +freeProxyEleven = 1 ;foreign website, outside the wall freeProxyWallFirst = 1 freeProxyWallSecond = 1 +freeProxyWallThird = 1 [HOST] ; API接口配置 http://127.0.0.1:5010 diff --git a/ProxyGetter/getFreeProxy.py b/ProxyGetter/getFreeProxy.py index 1330d88fc..b9f542b01 100644 --- a/ProxyGetter/getFreeProxy.py +++ b/ProxyGetter/getFreeProxy.py @@ -249,6 +249,22 @@ def freeProxyTen(): yield ":".join(proxy) + @staticmethod + def freeProxyEleven(): + urls = [ + 'http://www.iphai.com/free/ng', + 'http://www.iphai.com/free/np', + 'http://www.iphai.com/free/wg', + 'http://www.iphai.com/free/wp' + ] + request = WebRequest() + for url in urls: + r = request.get(url) + proxies = re.findall(r'\s*?(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})\s*?[\s\S]*?\s*?(\d+)\s*?', r.text) + for proxy in proxies: + yield ":".join(proxy) + + @staticmethod def freeProxyWallFirst(): """ @@ -373,6 +389,8 @@ def test_batch(iterator): # test_batch(gg.freeProxyTen()) + # test_batch(gg.freeProxyEleven()) + # test_batch(gg.freeProxyWallFirst()) # test_batch(gg.freeProxyWallSecond()) From 73afc225356d0577c35b6f2d127bf53f1d4ec3b1 Mon Sep 17 00:00:00 2001 From: jhao104 Date: Fri, 27 Apr 2018 16:08:59 +0800 Subject: [PATCH 005/304] merge luocandan's code --- DB/SsdbClient.py | 4 + ProxyGetter/getFreeProxy.py | 131 ++++++-------------------- Test/.pytest_cache/v/cache/lastfailed | 3 + Test/.pytest_cache/v/cache/nodeids | 3 + Util/WebRequest.py | 21 +---- requirements.txt | 7 +- 6 files changed, 46 insertions(+), 123 deletions(-) create mode 100644 Test/.pytest_cache/v/cache/lastfailed create mode 100644 Test/.pytest_cache/v/cache/nodeids diff --git a/DB/SsdbClient.py b/DB/SsdbClient.py index d9a4030f4..2522e0071 100644 --- a/DB/SsdbClient.py +++ b/DB/SsdbClient.py @@ -110,3 +110,7 @@ def getNumber(self): def changeTable(self, name): self.name = name + +if __name__ == '__main__': + c = SsdbClient('useful_proxy', '118.24.52.95', 8899) + print(c.getAll()) diff --git a/ProxyGetter/getFreeProxy.py b/ProxyGetter/getFreeProxy.py index b9f542b01..78837d50a 100644 --- a/ProxyGetter/getFreeProxy.py +++ b/ProxyGetter/getFreeProxy.py @@ -53,7 +53,7 @@ def __init__(self): @staticmethod def freeProxyFirst(page=10): """ - 抓取无忧代理 http://www.data5u.com/ + 无忧代理 http://www.data5u.com/ 几乎没有能用的 :param page: 页数 :return: @@ -73,31 +73,16 @@ def freeProxyFirst(page=10): print(e) @staticmethod - def deprecatedFreeProxySecond(proxy_number=100): + def freeProxySecond(area=33, page=1): """ - 抓取代理66 http://www.66ip.cn/ - :param proxy_number: 代理数量 + 代理66 http://www.66ip.cn/ + :param area: 抓取代理页数,page=1北京代理页,page=2上海代理页...... + :param page: 翻页 :return: """ - url = "http://www.66ip.cn/mo.php?sxb=&tqsl={}&port=&export=&ktip=&sxa=&submit=%CC%E1++%C8%A1&textarea=".format( - proxy_number) - request = WebRequest() - html = request.get(url).text - for proxy in re.findall(r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}:\d{1,5}', html): - yield proxy - - @staticmethod - def freeProxySecond(area=33): - """ - 修改抓取代理66 http://www.66ip.cn/ - :param page:抓取代理页数,page=1北京代理页,page=2上海代理页...... - :return: - """ - if area > 33: - page = 33 + area = 33 if area > 33 else area for area_index in range(1, area + 1): - page_count = 5 - for i in range(1, page_count + 1): + for i in range(1, page + 1): url = "http://www.66ip.cn/areaindex_{}/{}.html".format(area_index, i) html_tree = getHtmlTree(url) tr_list = html_tree.xpath("//*[@id='footer']/div/table/tr[position()>1]") @@ -107,12 +92,10 @@ def freeProxySecond(area=33): yield tr.xpath("./td[1]/text()")[0] + ":" + tr.xpath("./td[2]/text()")[0] break - ''' - 不能用了 @staticmethod def freeProxyThird(days=1): """ - 抓取ip181 http://www.ip181.com/ + ip181 http://www.ip181.com/ 不能用了 :param days: :return: """ @@ -124,12 +107,11 @@ def freeProxyThird(days=1): yield ':'.join(tr.xpath('./td/text()')[0:2]) except Exception as e: pass - ''' @staticmethod def freeProxyFourth(page_count=2): """ - 抓取西刺代理 http://api.xicidaili.com/free2016.txt + 西刺代理 http://www.xicidaili.com :return: """ url_list = [ @@ -150,7 +132,7 @@ def freeProxyFourth(page_count=2): @staticmethod def freeProxyFifth(): """ - 抓取guobanjia http://www.goubanjia.com/ + guobanjia http://www.goubanjia.com/ :return: """ url = "http://www.goubanjia.com/" @@ -175,7 +157,7 @@ def freeProxyFifth(): @staticmethod def freeProxySixth(): """ - 抓取讯代理免费proxy http://www.xdaili.cn/ipagent/freeip/getFreeIps?page=1&rows=10 + 讯代理 http://www.xdaili.cn/ :return: """ url = 'http://www.xdaili.cn/ipagent/freeip/getFreeIps?page=1&rows=10' @@ -190,7 +172,7 @@ def freeProxySixth(): @staticmethod def freeProxySeventh(): """ - 快代理免费https://www.kuaidaili.com/free/inha/1/ + 快代理 https://www.kuaidaili.com """ url_list = [ 'https://www.kuaidaili.com/free/inha/{page}/', @@ -207,7 +189,7 @@ def freeProxySeventh(): @staticmethod def freeProxyEight(): """ - 秘密代理IP网站http://www.mimiip.com + 秘密代理 http://www.mimiip.com """ url_gngao = ['http://www.mimiip.com/gngao/%s' % n for n in range(1, 10)] # 国内高匿 url_gnpu = ['http://www.mimiip.com/gnpu/%s' % n for n in range(1, 10)] # 国内普匿 @@ -221,12 +203,10 @@ def freeProxyEight(): for proxy in proxies: yield ':'.join(proxy) - @staticmethod def freeProxyNinth(): """ - coderBusy - https://proxy.coderbusy.com/ + 码农代理 https://proxy.coderbusy.com/ :return: """ urls = ['https://proxy.coderbusy.com/classical/country/cn.aspx?page=1'] @@ -237,9 +217,12 @@ def freeProxyNinth(): for proxy in proxies: yield ':'.join(proxy) - @staticmethod def freeProxyTen(): + """ + 云代理 http://www.ip3366.net/free/ + :return: + """ urls = ['http://www.ip3366.net/free/'] request = WebRequest() for url in urls: @@ -248,9 +231,12 @@ def freeProxyTen(): for proxy in proxies: yield ":".join(proxy) - @staticmethod def freeProxyEleven(): + """ + IP海 http://www.iphai.com/free/ng + :return: + """ urls = [ 'http://www.iphai.com/free/ng', 'http://www.iphai.com/free/np', @@ -260,16 +246,15 @@ def freeProxyEleven(): request = WebRequest() for url in urls: r = request.get(url) - proxies = re.findall(r'\s*?(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})\s*?[\s\S]*?\s*?(\d+)\s*?', r.text) + proxies = re.findall(r'\s*?(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})\s*?[\s\S]*?\s*?(\d+)\s*?', + r.text) for proxy in proxies: yield ":".join(proxy) - @staticmethod def freeProxyWallFirst(): """ 墙外网站 cn-proxy - 并没有被墙 :return: """ urls = ['http://cn-proxy.com/', 'http://cn-proxy.com/archives/218'] @@ -282,10 +267,10 @@ def freeProxyWallFirst(): @staticmethod def freeProxyWallSecond(): - ''' - 并没有被墙 + """ + https://proxy-list.org/english/index.php :return: - ''' + """ urls = ['https://proxy-list.org/english/index.php?p=%s' % n for n in range(1, 10)] request = WebRequest() import base64 @@ -295,7 +280,6 @@ def freeProxyWallSecond(): for proxy in proxies: yield base64.b64decode(proxy).decode() - @staticmethod def freeProxyWallThird(): urls = ['https://list.proxylistplus.com/Fresh-HTTP-Proxy-List-1'] @@ -307,67 +291,6 @@ def freeProxyWallThird(): yield ':'.join(proxy) -import threading - -lock = threading.Lock() -success = 0 -total = 0 - - -def test_once(proxy): - ip_port = proxy.split(":") - ip = ip_port[0] - port = ip_port[1] - import requests - - req_url = "http://www.baidu.com" - proxies = { - "http": "http://%s:%s" % (ip, port), - "https": "https://%s:%s" % (ip, port) - } - - global total - - try: - response = requests.get(req_url, proxies=proxies, timeout=4) - if response.status_code != 200: - print("unknow error, status code:" + str(response.status_code)) - lock.acquire() - total += 1 - lock.release() - return 0 - print("success") - global success - lock.acquire() - success += 1 - total += 1 - lock.release() - return 1 - except requests.exceptions.Timeout: - print("timeout") - except requests.exceptions.ConnectionError: - print("poxy unusable") - except Exception: - print("request error") - - lock.acquire() - total += 1 - lock.release() - return 0 - - -def test_batch(iterator): - global success - global total - - for proxy in iterator: - t = threading.Thread(target=test_once, args=(proxy,)) - t.start() - t.join() - - print("success:" + str(success) + "\ttotal:" + str(total)) - - if __name__ == '__main__': gg = GetFreeProxy() diff --git a/Test/.pytest_cache/v/cache/lastfailed b/Test/.pytest_cache/v/cache/lastfailed new file mode 100644 index 000000000..65c9a06d6 --- /dev/null +++ b/Test/.pytest_cache/v/cache/lastfailed @@ -0,0 +1,3 @@ +{ + "testGetFreeProxy.py::testGetFreeProxy": true +} \ No newline at end of file diff --git a/Test/.pytest_cache/v/cache/nodeids b/Test/.pytest_cache/v/cache/nodeids new file mode 100644 index 000000000..0ce3684ce --- /dev/null +++ b/Test/.pytest_cache/v/cache/nodeids @@ -0,0 +1,3 @@ +[ + "testGetFreeProxy.py::testGetFreeProxy" +] \ No newline at end of file diff --git a/Util/WebRequest.py b/Util/WebRequest.py index 7f5011724..68db87500 100644 --- a/Util/WebRequest.py +++ b/Util/WebRequest.py @@ -12,10 +12,10 @@ """ __author__ = 'J_hao' +from requests.models import Response import requests import random import time -from requests.models import Response class WebRequest(object): @@ -37,7 +37,7 @@ def user_agent(self): 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)', 'Mozilla/5.0 (Windows NT 5.1; U; en; rv:1.8.1) Gecko/20061208 Firefox/2.0.0 Opera 9.50', 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:34.0) Gecko/20100101 Firefox/34.0', - ] + ] return random.choice(ua_list) @property @@ -52,7 +52,7 @@ def header(self): 'Accept-Language': 'zh-CN,zh;q=0.8'} def get(self, url, header=None, retry_time=5, timeout=30, - retry_flag=list(), retry_interval=5, use_proxy=False, *args, **kwargs): + retry_flag=list(), retry_interval=5, *args, **kwargs): """ get method :param url: target url @@ -61,7 +61,6 @@ def get(self, url, header=None, retry_time=5, timeout=30, :param timeout: network timeout :param retry_flag: if retry_flag in content. do retry :param retry_interval: retry interval(second) - :param use_proxy: 是否使用代理 :param args: :param kwargs: :return: @@ -71,16 +70,7 @@ def get(self, url, header=None, retry_time=5, timeout=30, headers.update(header) while True: try: - if use_proxy: - proxy_url = "http://127.0.0.1:5010/get" - ip_proxy = requests.get(proxy_url).text - proxies = { - "http": "http://" + ip_proxy, - "https": "https://" + ip_proxy - } - html = requests.get(url, headers=headers, timeout=timeout, proxies=proxies) - else: - html = requests.get(url, headers=headers, timeout=timeout) + html = requests.get(url, headers=headers, timeout=timeout) if any(f in html.content for f in retry_flag): raise Exception return html @@ -88,9 +78,8 @@ def get(self, url, header=None, retry_time=5, timeout=30, print(e) retry_time -= 1 if retry_time <= 0: - # 多次请求失败时,返回百度页面 + # 多次请求失败 resp = Response() resp.status_code = 200 return resp time.sleep(retry_interval) - diff --git a/requirements.txt b/requirements.txt index c0db31b10..5d00da69a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,7 +1,8 @@ APScheduler==3.2.0 -Flask -requests -lxml +werkzeug==0.11.15 +Flask==0.12 +requests==2.12.4 +lxml==3.7.2 pymongo redis From b9f3e148c64d9a50b7c23768c1007b4d993eead8 Mon Sep 17 00:00:00 2001 From: jhao104 Date: Fri, 27 Apr 2018 16:14:40 +0800 Subject: [PATCH 006/304] [update] add contributor --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 32a0ac968..8c59cd631 100644 --- a/README.md +++ b/README.md @@ -183,5 +183,5 @@ freeProxyCustom = 1 # 确保名字和你添加方法名字一致 ### Release Notes - [release notes](https://github.com/jhao104/proxy_pool/blob/master/doc/release_notes.md) + [release notes](https://github.com/jhao104/proxy_pool/blob/master/doc/release_notes.md) [@luocaodan](https://github.com/luocaodan) From f9a8bf54054f0e7435c7e9718da9033988e0958c Mon Sep 17 00:00:00 2001 From: highroom <827148@163.com> Date: Mon, 14 May 2018 23:39:30 +0800 Subject: [PATCH 007/304] =?UTF-8?q?=E5=A2=9E=E5=8A=A0fq=E4=BB=A3=E7=90=86?= =?UTF-8?q?=E7=9A=84=E9=85=8D=E7=BD=AE=EF=BC=8C=E9=85=8D=E7=BD=AE=E5=90=8E?= =?UTF-8?q?=E8=AF=B7=E8=B0=83=E7=94=A8=E4=BB=A3=E7=90=86=E8=AE=BF=E9=97=AE?= =?UTF-8?q?wallproxy?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- Config.ini | 4 ++++ ProxyGetter/getFreeProxy.py | 44 ++++++++++++++++++++++++++++++++++--- Util/WebRequest.py | 2 +- 3 files changed, 46 insertions(+), 4 deletions(-) diff --git a/Config.ini b/Config.ini index 5f417badc..dae17c1a2 100644 --- a/Config.ini +++ b/Config.ini @@ -29,3 +29,7 @@ freeProxyWallThird = 1 ; API接口配置 http://127.0.0.1:5010 ip = 0.0.0.0 port = 5010 + +[WallProxy] +; fq代理配置 +; proxy = 127.0.0.1:1080 \ No newline at end of file diff --git a/ProxyGetter/getFreeProxy.py b/ProxyGetter/getFreeProxy.py index 78837d50a..edb71fa8b 100644 --- a/ProxyGetter/getFreeProxy.py +++ b/ProxyGetter/getFreeProxy.py @@ -14,6 +14,12 @@ import re import sys import requests +import os + +try: + from configparser import ConfigParser # py3 +except: + from ConfigParser import ConfigParser # py2 try: from importlib import reload # py3 实际不会实用,只是为了不显示语法错误 @@ -46,6 +52,15 @@ class GetFreeProxy(object): """ proxy getter """ + pwd = os.path.split(os.path.realpath(__file__))[0] + config_path = os.path.join(os.path.split(pwd)[0], 'Config.ini') + config_file = ConfigParser() + config_file.read(config_path) + if config_file.has_option('WallProxy', 'proxy'): + WallProxy = config_file.get('WallProxy', 'proxy') + wall_proxies = {"http": "http://{}".format(WallProxy), "https": "https://{}".format(WallProxy)} + else: + wall_proxies = None def __init__(self): pass @@ -257,10 +272,17 @@ def freeProxyWallFirst(): 墙外网站 cn-proxy :return: """ + kwargs = {} + if GetFreeProxy.wall_proxies: + kwargs['proxies'] = GetFreeProxy.wall_proxies + else: + return + urls = ['http://cn-proxy.com/', 'http://cn-proxy.com/archives/218'] request = WebRequest() for url in urls: - r = request.get(url) + kwargs['url'] = url + r = request.get(**kwargs) proxies = re.findall(r'(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})[\w\W](\d+)', r.text) for proxy in proxies: yield ':'.join(proxy) @@ -271,21 +293,35 @@ def freeProxyWallSecond(): https://proxy-list.org/english/index.php :return: """ + kwargs = {} + if GetFreeProxy.wall_proxies: + kwargs['proxies'] = GetFreeProxy.wall_proxies + else: + return urls = ['https://proxy-list.org/english/index.php?p=%s' % n for n in range(1, 10)] request = WebRequest() import base64 for url in urls: - r = request.get(url) + kwargs['url'] = url + r = request.get(**kwargs) proxies = re.findall(r"Proxy\('(.*?)'\)", r.text) for proxy in proxies: yield base64.b64decode(proxy).decode() @staticmethod def freeProxyWallThird(): + + kwargs = {} + if GetFreeProxy.wall_proxies: + kwargs['proxies'] = GetFreeProxy.wall_proxies + else: + return + urls = ['https://list.proxylistplus.com/Fresh-HTTP-Proxy-List-1'] request = WebRequest() for url in urls: - r = request.get(url) + kwargs['url'] = url + r = request.get(**kwargs) proxies = re.findall(r'(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})[\s\S]*?(\d+)', r.text) for proxy in proxies: yield ':'.join(proxy) @@ -319,3 +355,5 @@ def freeProxyWallThird(): # test_batch(gg.freeProxyWallSecond()) # test_batch(gg.freeProxyWallThird()) + for e in gg.freeProxyWallThird(): + print(e) diff --git a/Util/WebRequest.py b/Util/WebRequest.py index 68db87500..47286a225 100644 --- a/Util/WebRequest.py +++ b/Util/WebRequest.py @@ -70,7 +70,7 @@ def get(self, url, header=None, retry_time=5, timeout=30, headers.update(header) while True: try: - html = requests.get(url, headers=headers, timeout=timeout) + html = requests.get(url, headers=headers, timeout=timeout, **kwargs) if any(f in html.content for f in retry_flag): raise Exception return html From 876b1ec4de23924f8e940772a8690d2373c1a402 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=8D=A3=E9=A3=9E=20=E5=BE=90?= Date: Wed, 23 May 2018 15:23:41 +0800 Subject: [PATCH 008/304] add http://ip.jiangxianli.com/ --- Config.ini | 1 + ProxyGetter/getFreeProxy.py | 37 +++++++++++++++++++++++++++++++++++++ Test/testGetFreeProxy.py | 16 ++++++++++++++-- 3 files changed, 52 insertions(+), 2 deletions(-) diff --git a/Config.ini b/Config.ini index 5f417badc..ca011a01f 100644 --- a/Config.ini +++ b/Config.ini @@ -20,6 +20,7 @@ freeProxyEight = 1 freeProxyNinth = 1 freeProxyTen = 1 freeProxyEleven = 1 +freeProxyTwelve = 1 ;foreign website, outside the wall freeProxyWallFirst = 1 freeProxyWallSecond = 1 diff --git a/ProxyGetter/getFreeProxy.py b/ProxyGetter/getFreeProxy.py index 78837d50a..54d09c1b0 100644 --- a/ProxyGetter/getFreeProxy.py +++ b/ProxyGetter/getFreeProxy.py @@ -15,6 +15,7 @@ import sys import requests + try: from importlib import reload # py3 实际不会实用,只是为了不显示语法错误 except: @@ -25,6 +26,7 @@ from Util.utilFunction import robustCrawl, getHtmlTree from Util.WebRequest import WebRequest +from Util.utilFunction import verifyProxyFormat # for debug to disable insecureWarning requests.packages.urllib3.disable_warnings() @@ -251,6 +253,24 @@ def freeProxyEleven(): for proxy in proxies: yield ":".join(proxy) + @staticmethod + def freeProxyTwelve(page_count=8): + """ + guobanjia http://ip.jiangxianli.com/?page= + 免费代理库 + 超多量 + :return: + """ + for i in range(1, page_count + 1): + url = 'http://ip.jiangxianli.com/?page={}'.format(i) + # print(url) + html_tree = getHtmlTree(url) + tr_list = html_tree.xpath("/html/body/div[1]/div/div[1]/div[2]/table/tbody/tr") + if len(tr_list) == 0: + continue + for tr in tr_list: + yield tr.xpath("./td[2]/text()")[0] + ":" + tr.xpath("./td[3]/text()")[0] + @staticmethod def freeProxyWallFirst(): """ @@ -314,6 +334,23 @@ def freeProxyWallThird(): # test_batch(gg.freeProxyEleven()) + proxy_iter = gg.freeProxyTwelve() + proxy_set = set() + for proxy in proxy_iter: + proxy = proxy.strip() + if proxy and verifyProxyFormat(proxy): + #self.log.info('{func}: fetch proxy {proxy}'.format(func=proxyGetter, proxy=proxy)) + proxy_set.add(proxy) + #else: + #self.log.error('{func}: fetch proxy {proxy} error'.format(func=proxyGetter, proxy=proxy)) + + # store + for proxy in proxy_set: + print(proxy) + + + # test_batch(gg.freeProxyTwelve()) + # test_batch(gg.freeProxyWallFirst()) # test_batch(gg.freeProxyWallSecond()) diff --git a/Test/testGetFreeProxy.py b/Test/testGetFreeProxy.py index df99c79f3..33c3f9e46 100644 --- a/Test/testGetFreeProxy.py +++ b/Test/testGetFreeProxy.py @@ -12,6 +12,18 @@ """ __author__ = 'J_hao' +import re +import sys +import requests + + +try: + from importlib import reload # py3 实际不会实用,只是为了不显示语法错误 +except: + reload(sys) + sys.setdefaultencoding('utf-8') + +sys.path.append('..') from ProxyGetter.getFreeProxy import GetFreeProxy from Util.GetConfig import GetConfig @@ -28,9 +40,9 @@ def testGetFreeProxy(): proxy_count = 0 for proxy in getattr(GetFreeProxy, proxyGetter.strip())(): if proxy: - print('{func}: fetch proxy {proxy}'.format(func=proxyGetter, proxy=proxy)) + print('{func}: fetch proxy {proxy},proxy_count:{proxy_count}'.format(func=proxyGetter, proxy=proxy,proxy_count=proxy_count)) proxy_count += 1 - assert proxy_count >= 20, '{} fetch proxy fail'.format(proxyGetter) + #assert proxy_count >= 20, '{} fetch proxy fail'.format(proxyGetter) if __name__ == '__main__': From 413e41b2973e41742e55ab7bb7a1d642fe6ada8d Mon Sep 17 00:00:00 2001 From: jhao104 Date: Tue, 10 Jul 2018 16:50:31 +0800 Subject: [PATCH 009/304] =?UTF-8?q?[update]=20=E4=BF=AE=E6=94=B9ProxyGette?= =?UTF-8?q?r=E6=A3=80=E6=9F=A5=E6=96=B9=E6=B3=95?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- Config.ini | 4 -- ProxyGetter/CheckProxy.py | 72 ++++++++++++++++++++++++++++ ProxyGetter/getFreeProxy.py | 96 ++++--------------------------------- 3 files changed, 81 insertions(+), 91 deletions(-) create mode 100644 ProxyGetter/CheckProxy.py diff --git a/Config.ini b/Config.ini index 95e33400d..ca011a01f 100644 --- a/Config.ini +++ b/Config.ini @@ -30,7 +30,3 @@ freeProxyWallThird = 1 ; API接口配置 http://127.0.0.1:5010 ip = 0.0.0.0 port = 5010 - -[WallProxy] -; fq代理配置 -; proxy = 127.0.0.1:1080 \ No newline at end of file diff --git a/ProxyGetter/CheckProxy.py b/ProxyGetter/CheckProxy.py new file mode 100644 index 000000000..f6ba9b66a --- /dev/null +++ b/ProxyGetter/CheckProxy.py @@ -0,0 +1,72 @@ +# -*- coding: utf-8 -*- +""" +------------------------------------------------- + File Name: CheckProxy + Description : used for check getFreeProxy.py + Author : JHao + date: 2018/7/10 +------------------------------------------------- + Change Activity: + 2018/7/10: CheckProxy +------------------------------------------------- +""" +__author__ = 'JHao' + +import sys +from getFreeProxy import GetFreeProxy +from Util.utilFunction import verifyProxyFormat + +sys.path.append('../') + +from Util.LogHandler import LogHandler + +log = LogHandler('check_proxy', file=False) + + +class CheckProxy(object): + + @staticmethod + def checkAllGetProxyFunc(): + """ + 检查getFreeProxy所有代理获取函数运行情况 + Returns: + None + """ + import inspect + member_list = inspect.getmembers(GetFreeProxy, predicate=inspect.isfunction) + proxy_count_dict = dict() + for func_name, func in member_list: + log.info(u"开始运行 {}".format(func_name)) + try: + proxy_list = [_ for _ in func() if verifyProxyFormat(_)] + proxy_count_dict[func_name] = len(proxy_list) + except Exception as e: + log.info(u"代理获取函数 {} 运行出错!".format(func_name)) + log.error(str(e)) + log.info(u"所有函数运行完毕 " + "***" * 5) + for func_name, func in member_list: + log.info(u"函数 {n}, 获取到代理数: {c}".format(n=func_name, c=proxy_count_dict.get(func_name, 0))) + + @staticmethod + def checkGetProxyFunc(func): + """ + 检查指定的getFreeProxy某个function运行情况 + Args: + func: getFreeProxy中某个可调用方法 + + Returns: + None + """ + func_name = getattr(func, '__name__', "None") + log.info("start running func: {}".format(func_name)) + count = 0 + for proxy in func(): + if verifyProxyFormat(proxy): + log.info("fetch proxy: {}".format(proxy)) + count += 1 + log.info("{n} completed, fetch proxy number: {c}".format(n=func_name, c=count)) + + +if __name__ == '__main__': + CheckProxy.checkAllGetProxyFunc() + CheckProxy.checkGetProxyFunc(GetFreeProxy.freeProxyFirst) diff --git a/ProxyGetter/getFreeProxy.py b/ProxyGetter/getFreeProxy.py index 23542a5b7..bf2e03f61 100644 --- a/ProxyGetter/getFreeProxy.py +++ b/ProxyGetter/getFreeProxy.py @@ -14,13 +14,6 @@ import re import sys import requests -import os - -try: - from configparser import ConfigParser # py3 -except: - from ConfigParser import ConfigParser # py2 - try: from importlib import reload # py3 实际不会实用,只是为了不显示语法错误 @@ -30,8 +23,8 @@ sys.path.append('..') -from Util.utilFunction import robustCrawl, getHtmlTree from Util.WebRequest import WebRequest +from Util.utilFunction import getHtmlTree from Util.utilFunction import verifyProxyFormat # for debug to disable insecureWarning @@ -54,15 +47,6 @@ class GetFreeProxy(object): """ proxy getter """ - pwd = os.path.split(os.path.realpath(__file__))[0] - config_path = os.path.join(os.path.split(pwd)[0], 'Config.ini') - config_file = ConfigParser() - config_file.read(config_path) - if config_file.has_option('WallProxy', 'proxy'): - WallProxy = config_file.get('WallProxy', 'proxy') - wall_proxies = {"http": "http://{}".format(WallProxy), "https": "https://{}".format(WallProxy)} - else: - wall_proxies = None def __init__(self): pass @@ -215,7 +199,7 @@ def freeProxyEight(): request = WebRequest() for url in url_list: - r = request.get(url, use_proxy=True) + r = request.get(url) proxies = re.findall(r'(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})[\w\W].*(\d+)', r.text) for proxy in proxies: yield ':'.join(proxy) @@ -278,7 +262,6 @@ def freeProxyTwelve(page_count=8): """ for i in range(1, page_count + 1): url = 'http://ip.jiangxianli.com/?page={}'.format(i) - # print(url) html_tree = getHtmlTree(url) tr_list = html_tree.xpath("/html/body/div[1]/div/div[1]/div[2]/table/tbody/tr") if len(tr_list) == 0: @@ -292,17 +275,10 @@ def freeProxyWallFirst(): 墙外网站 cn-proxy :return: """ - kwargs = {} - if GetFreeProxy.wall_proxies: - kwargs['proxies'] = GetFreeProxy.wall_proxies - else: - return - urls = ['http://cn-proxy.com/', 'http://cn-proxy.com/archives/218'] request = WebRequest() for url in urls: - kwargs['url'] = url - r = request.get(**kwargs) + r = request.get(url) proxies = re.findall(r'(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})[\w\W](\d+)', r.text) for proxy in proxies: yield ':'.join(proxy) @@ -313,84 +289,30 @@ def freeProxyWallSecond(): https://proxy-list.org/english/index.php :return: """ - kwargs = {} - if GetFreeProxy.wall_proxies: - kwargs['proxies'] = GetFreeProxy.wall_proxies - else: - return urls = ['https://proxy-list.org/english/index.php?p=%s' % n for n in range(1, 10)] request = WebRequest() import base64 for url in urls: - kwargs['url'] = url - r = request.get(**kwargs) + r = request.get(url) proxies = re.findall(r"Proxy\('(.*?)'\)", r.text) for proxy in proxies: yield base64.b64decode(proxy).decode() @staticmethod def freeProxyWallThird(): - - kwargs = {} - if GetFreeProxy.wall_proxies: - kwargs['proxies'] = GetFreeProxy.wall_proxies - else: - return - urls = ['https://list.proxylistplus.com/Fresh-HTTP-Proxy-List-1'] request = WebRequest() for url in urls: - kwargs['url'] = url - r = request.get(**kwargs) + r = request.get(url) proxies = re.findall(r'(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})[\s\S]*?(\d+)', r.text) for proxy in proxies: yield ':'.join(proxy) if __name__ == '__main__': - gg = GetFreeProxy() - - # test_batch(gg.freeProxyFirst()) - - # test_batch(gg.freeProxySecond()) - - # test_batch(gg.freeProxyFourth()) - - # test_batch(gg.freeProxyFifth()) - - # test_batch(gg.freeProxySixth()) - - # test_batch(gg.freeProxySeventh()) - - # test_batch(gg.freeProxyEight()) - - # test_batch(gg.freeProxyNinth()) - - # test_batch(gg.freeProxyTen()) - - # test_batch(gg.freeProxyEleven()) - - proxy_iter = gg.freeProxyTwelve() - proxy_set = set() - for proxy in proxy_iter: - proxy = proxy.strip() - if proxy and verifyProxyFormat(proxy): - #self.log.info('{func}: fetch proxy {proxy}'.format(func=proxyGetter, proxy=proxy)) - proxy_set.add(proxy) - #else: - #self.log.error('{func}: fetch proxy {proxy} error'.format(func=proxyGetter, proxy=proxy)) - - # store - for proxy in proxy_set: - print(proxy) - - - # test_batch(gg.freeProxyTwelve()) - - # test_batch(gg.freeProxyWallFirst()) + from CheckProxy import CheckProxy - # test_batch(gg.freeProxyWallSecond()) + CheckProxy.checkGetProxyFunc(GetFreeProxy.freeProxyFifth) + CheckProxy.checkGetProxyFunc(GetFreeProxy.freeProxySecond) - # test_batch(gg.freeProxyWallThird()) - for e in gg.freeProxyWallThird(): - print(e) + CheckProxy.checkAllGetProxyFunc() From edac60ce8ea6340834e1e4afa53d37f3e1a783a8 Mon Sep 17 00:00:00 2001 From: jhao104 Date: Tue, 10 Jul 2018 16:54:13 +0800 Subject: [PATCH 010/304] [update] readme --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 8c59cd631..8dc30eee9 100644 --- a/README.md +++ b/README.md @@ -178,10 +178,10 @@ freeProxyCustom = 1 # 确保名字和你添加方法名字一致   这里感谢以下contributor的无私奉献: -  [@kangnwh](https://github.com/kangnwh)| [@bobobo80](https://github.com/bobobo80)| [@halleywj](https://github.com/halleywj)| [@newlyedward](https://github.com/newlyedward)| [@wang-ye](https://github.com/wang-ye)| [@gladmo](https://github.com/gladmo)| [@bernieyangmh](https://github.com/bernieyangmh)| [@PythonYXY](https://github.com/PythonYXY)| [@zuijiawoniu](https://github.com/zuijiawoniu)| [@netAir](https://github.com/netAir)| [@scil](https://github.com/scil)| [@tangrela](https://github.com/tangrela)| [@highroom](https://github.com/highroom) +  [@kangnwh](https://github.com/kangnwh)| [@bobobo80](https://github.com/bobobo80)| [@halleywj](https://github.com/halleywj)| [@newlyedward](https://github.com/newlyedward)| [@wang-ye](https://github.com/wang-ye)| [@gladmo](https://github.com/gladmo)| [@bernieyangmh](https://github.com/bernieyangmh)| [@PythonYXY](https://github.com/PythonYXY)| [@zuijiawoniu](https://github.com/zuijiawoniu)| [@netAir](https://github.com/netAir)| [@scil](https://github.com/scil)| [@tangrela](https://github.com/tangrela)| [@highroom](https://github.com/highroom)| [@luocaodan](https://github.com/luocaodan) ### Release Notes - [release notes](https://github.com/jhao104/proxy_pool/blob/master/doc/release_notes.md) [@luocaodan](https://github.com/luocaodan) + [release notes](https://github.com/jhao104/proxy_pool/blob/master/doc/release_notes.md) From 62b05856fbed3f104842010defe0f46fd5e5c242 Mon Sep 17 00:00:00 2001 From: YeClimEric Date: Wed, 10 Oct 2018 17:50:34 +0800 Subject: [PATCH 011/304] =?UTF-8?q?1.flask=E6=94=AF=E6=8C=81=E5=A4=9A?= =?UTF-8?q?=E8=BF=9B=E7=A8=8B=E5=A4=84=E7=90=86=E4=BB=BB=E5=8A=A1=202.?= =?UTF-8?q?=E4=BC=98=E5=8C=96=20proxy=20=E9=87=87=E9=9B=86=E3=80=81?= =?UTF-8?q?=E6=A0=A1=E9=AA=8C=E6=B5=81=E7=A8=8B=EF=BC=8C=E5=8A=A0=E5=BF=AB?= =?UTF-8?q?=20userfull=20proxy=20=E6=A0=A1=E9=AA=8C=E9=80=9F=E5=BA=A6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- Api/ProxyApi.py | 9 ++++----- Config.ini | 8 +++++--- Manager/ProxyManager.py | 32 ++++++++++++-------------------- Schedule/ProxyRefreshSchedule.py | 23 +++++++++++++---------- Util/GetConfig.py | 11 ++++++++--- 5 files changed, 42 insertions(+), 41 deletions(-) diff --git a/Api/ProxyApi.py b/Api/ProxyApi.py index 724dc35e6..2e3733013 100644 --- a/Api/ProxyApi.py +++ b/Api/ProxyApi.py @@ -2,13 +2,13 @@ # !/usr/bin/env python """ ------------------------------------------------- - File Name: ProxyApi.py - Description : + File Name: ProxyApi.py + Description : Author : JHao date: 2016/12/4 ------------------------------------------------- Change Activity: - 2016/12/4: + 2016/12/4: ------------------------------------------------- """ __author__ = 'JHao' @@ -26,7 +26,6 @@ class JsonResponse(Response): - @classmethod def force_type(cls, response, environ=None): if isinstance(response, (dict, list)): @@ -86,7 +85,7 @@ def getStatus(): def run(): config = GetConfig() - app.run(host=config.host_ip, port=config.host_port) + app.run(host=config.host_ip, port=config.host_port, threaded=False, processes=config.processes) if __name__ == '__main__': diff --git a/Config.ini b/Config.ini index ca011a01f..d1ab07bb4 100644 --- a/Config.ini +++ b/Config.ini @@ -9,11 +9,11 @@ name = proxy [ProxyGetter] ;register the proxy getter function -freeProxyFirst = 1 +freeProxyFirst = 1 freeProxySecond = 1 ;freeProxyThird = 1 freeProxyFourth = 1 -freeProxyFifth = 1 +freeProxyFifth = 1 freeProxySixth = 1 freeProxySeventh = 1 freeProxyEight = 1 @@ -26,7 +26,9 @@ freeProxyWallFirst = 1 freeProxyWallSecond = 1 freeProxyWallThird = 1 -[HOST] +[API] ; API接口配置 http://127.0.0.1:5010 ip = 0.0.0.0 port = 5010 +; flask多进程处理请求 +processes = 10 diff --git a/Manager/ProxyManager.py b/Manager/ProxyManager.py index 6131c089a..33aa76b39 100644 --- a/Manager/ProxyManager.py +++ b/Manager/ProxyManager.py @@ -2,13 +2,13 @@ # !/usr/bin/env python """ ------------------------------------------------- - File Name: ProxyManager.py - Description : + File Name: ProxyManager.py + Description : Author : JHao date: 2016/12/3 ------------------------------------------------- Change Activity: - 2016/12/3: + 2016/12/3: ------------------------------------------------- """ __author__ = 'JHao' @@ -40,30 +40,22 @@ def refresh(self): fetch proxy into Db by ProxyGetter :return: """ + self.db.changeTable(self.raw_proxy_queue) for proxyGetter in self.config.proxy_getter_functions: # fetch - proxy_set = set() try: self.log.info("{func}: fetch proxy start".format(func=proxyGetter)) - proxy_iter = [_ for _ in getattr(GetFreeProxy, proxyGetter.strip())()] + for proxy in getattr(GetFreeProxy, proxyGetter.strip())(): + # 挨个存储 proxy,优化raw 队列的 push 速度,进而加快 check proxy 的速度 + proxy = proxy.strip() + if proxy and verifyProxyFormat(proxy): + self.log.info('{func}: fetch proxy {proxy}'.format(func=proxyGetter, proxy=proxy)) + self.db.put(proxy) + else: + self.log.error('{func}: fetch proxy {proxy} error'.format(func=proxyGetter, proxy=proxy)) except Exception as e: self.log.error("{func}: fetch proxy fail".format(func=proxyGetter)) continue - for proxy in proxy_iter: - proxy = proxy.strip() - if proxy and verifyProxyFormat(proxy): - self.log.info('{func}: fetch proxy {proxy}'.format(func=proxyGetter, proxy=proxy)) - proxy_set.add(proxy) - else: - self.log.error('{func}: fetch proxy {proxy} error'.format(func=proxyGetter, proxy=proxy)) - - # store - for proxy in proxy_set: - self.db.changeTable(self.useful_proxy_queue) - if self.db.exists(proxy): - continue - self.db.changeTable(self.raw_proxy_queue) - self.db.put(proxy) def get(self): """ diff --git a/Schedule/ProxyRefreshSchedule.py b/Schedule/ProxyRefreshSchedule.py index 7dac2aa34..6088fcb0a 100644 --- a/Schedule/ProxyRefreshSchedule.py +++ b/Schedule/ProxyRefreshSchedule.py @@ -18,7 +18,8 @@ import time import logging from threading import Thread -from apscheduler.schedulers.blocking import BlockingScheduler +# 使用后台调度,不使用阻塞式~ +from apscheduler.schedulers.background import BackgroundScheduler as Sch sys.path.append('../') @@ -73,12 +74,7 @@ def refreshPool(): pp.validProxy() -def main(process_num=30): - p = ProxyRefreshSchedule() - - # 获取新代理 - p.refresh() - +def batch_refresh(process_num=30): # 检验新代理 pl = [] for num in range(process_num): @@ -93,11 +89,18 @@ def main(process_num=30): pl[num].join() +def fetch_all(): + p = ProxyRefreshSchedule() + # 获取新代理 + p.refresh() + + def run(): - main() - sch = BlockingScheduler() - sch.add_job(main, 'interval', minutes=10) # 每10分钟抓取一次 + sch = Sch() + sch.add_job(fetch_all, 'interval', minutes=5) # 每5分钟抓取一次 + sch.add_job(batch_refresh, "interval", minutes=1) # 每分钟检查一次 sch.start() + fetch_all() if __name__ == '__main__': diff --git a/Util/GetConfig.py b/Util/GetConfig.py index 24b003f28..8ea57be56 100644 --- a/Util/GetConfig.py +++ b/Util/GetConfig.py @@ -2,7 +2,7 @@ # !/usr/bin/env python """ ------------------------------------------------- - File Name: GetConfig.py + File Name: GetConfig.py Description : fetch config from config.ini Author : JHao date: 2016/12/3 @@ -51,11 +51,15 @@ def proxy_getter_functions(self): @LazyProperty def host_ip(self): - return self.config_file.get('HOST','ip') + return self.config_file.get('API','ip') @LazyProperty def host_port(self): - return int(self.config_file.get('HOST', 'port')) + return int(self.config_file.get('API', 'port')) + + @LazyProperty + def processes(self): + return int(self.config_file.get('API', 'processes')) if __name__ == '__main__': gg = GetConfig() @@ -66,3 +70,4 @@ def host_port(self): print(gg.proxy_getter_functions) print(gg.host_ip) print(gg.host_port) + print(gg.processes) From a0b152a968e073c0c35f8dc03d862f783ba4ee86 Mon Sep 17 00:00:00 2001 From: YeClimEric Date: Wed, 10 Oct 2018 18:15:47 +0800 Subject: [PATCH 012/304] =?UTF-8?q?=E4=BF=AE=E6=94=B9=20dockerfile?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- Dockerfile | 43 ++++++++++++++++++++----------------------- 1 file changed, 20 insertions(+), 23 deletions(-) diff --git a/Dockerfile b/Dockerfile index 7c815a4e7..d97495489 100644 --- a/Dockerfile +++ b/Dockerfile @@ -3,28 +3,25 @@ WORKDIR /usr/src/app COPY . . ENV DEBIAN_FRONTEND noninteractive ENV TZ Asia/Shanghai -RUN pip install --no-cache-dir -r requirements.txt && \ - apt-get update && \ - apt-get install -y --force-yes git make gcc g++ autoconf && apt-get clean && \ - git clone --depth 1 https://github.com/ideawu/ssdb.git ssdb && \ - cd ssdb && make && make install && cp ssdb-server /usr/bin && \ - apt-get remove -y --force-yes git make gcc g++ autoconf && \ - apt-get autoremove -y && \ - rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* && \ - cp ssdb.conf /etc && cd .. && yes | rm -r ssdb && \ - mkdir -p /var/lib/ssdb && \ - sed \ - -e 's@home.*@home /var/lib@' \ - -e 's/loglevel.*/loglevel info/' \ - -e 's@work_dir = .*@work_dir = /var/lib/ssdb@' \ - -e 's@pidfile = .*@pidfile = /run/ssdb.pid@' \ - -e 's@level:.*@level: info@' \ - -e 's@ip:.*@ip: 0.0.0.0@' \ - -i /etc/ssdb.conf && \ - echo "# ! /bin/sh " > /usr/src/app/run.sh && \ - echo "cd Run" >> /usr/src/app/run.sh && \ - echo "/usr/bin/ssdb-server /etc/ssdb.conf &" >> /usr/src/app/run.sh && \ - echo "python main.py" >> /usr/src/app/run.sh && \ - chmod 777 run.sh + +RUN apt-get update +RUN apt-get install vim -y + +RUN apt-get install -y redis-server +RUN sed -i 's/^\(bind .*\)$/# \1/' /etc/redis/redis.conf \ + && sed -i 's/^\(databases .*\)$/databases 1/' /etc/redis/redis.conf \ + && sed -i 's/^\(daemonize .*\)$/daemonize yes/' /etc/redis/redis.conf +# && sed -i 's/^\(dir .*\)$/# \1\ndir \/data/' /etc/redis/redis.conf \ +# && sed -i 's/^\(logfile .*\)$/# \1/' /etc/redis/redis.conf + +RUN pip install --no-cache-dir -r requirements.txt + + +RUN echo "# ! /bin/sh " > run.sh \ + && echo "redis-server /etc/redis/redis.conf&" >> run.sh \ + && echo "cd Run" >> run.sh \ + && echo "python main.py" >> run.sh \ + && chmod 777 run.sh + EXPOSE 5010 CMD [ "sh", "run.sh" ] From 5de6b7d3793337f7c5aa05dd3539c7db3b31fc9e Mon Sep 17 00:00:00 2001 From: YeClimEric Date: Wed, 10 Oct 2018 19:29:58 +0800 Subject: [PATCH 013/304] =?UTF-8?q?=E4=BF=AE=E6=94=B9=20dockerfile?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- Schedule/ProxyRefreshSchedule.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/Schedule/ProxyRefreshSchedule.py b/Schedule/ProxyRefreshSchedule.py index 6088fcb0a..38668072d 100644 --- a/Schedule/ProxyRefreshSchedule.py +++ b/Schedule/ProxyRefreshSchedule.py @@ -102,6 +102,9 @@ def run(): sch.start() fetch_all() + while True: + time.sleep(1) + if __name__ == '__main__': run() From 2086a52ecc21c3099c328fa0df40281399feebaf Mon Sep 17 00:00:00 2001 From: jhao104 Date: Wed, 17 Oct 2018 14:21:09 +0800 Subject: [PATCH 014/304] [fix] fix198 --- Api/ProxyApi.py | 5 ++++- Config.ini | 3 +-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/Api/ProxyApi.py b/Api/ProxyApi.py index 2e3733013..b8977f9ca 100644 --- a/Api/ProxyApi.py +++ b/Api/ProxyApi.py @@ -85,7 +85,10 @@ def getStatus(): def run(): config = GetConfig() - app.run(host=config.host_ip, port=config.host_port, threaded=False, processes=config.processes) + if sys.platform.startswith("win"): + app.run(host=config.host_ip, port=config.host_port) + else: + app.run(host=config.host_ip, port=config.host_port, threaded=False, processes=config.processes) if __name__ == '__main__': diff --git a/Config.ini b/Config.ini index d1ab07bb4..9394f744e 100644 --- a/Config.ini +++ b/Config.ini @@ -27,8 +27,7 @@ freeProxyWallSecond = 1 freeProxyWallThird = 1 [API] -; API接口配置 http://127.0.0.1:5010 +; API config http://127.0.0.1:5010 ip = 0.0.0.0 port = 5010 -; flask多进程处理请求 processes = 10 From 7449f7dabb9449a6eedf67f2ff4d20df39a9e5ae Mon Sep 17 00:00:00 2001 From: vc5 Date: Thu, 25 Oct 2018 00:01:36 +0800 Subject: [PATCH 015/304] =?UTF-8?q?=E6=B7=BB=E5=8A=A0=E5=AF=86=E7=A0=81?= =?UTF-8?q?=E6=94=AF=E6=8C=81?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- Config.ini | 1 + DB/DbClient.py | 3 ++- DB/SsdbClient.py | 4 ++-- Util/GetConfig.py | 9 +++++++++ 4 files changed, 14 insertions(+), 3 deletions(-) diff --git a/Config.ini b/Config.ini index 9394f744e..24f570f01 100644 --- a/Config.ini +++ b/Config.ini @@ -6,6 +6,7 @@ host = 127.0.0.1 port = 6379 ;port = 8888 name = proxy +#password = yourpassword [ProxyGetter] ;register the proxy getter function diff --git a/DB/DbClient.py b/DB/DbClient.py index 68c5db7a7..0036434ae 100644 --- a/DB/DbClient.py +++ b/DB/DbClient.py @@ -75,7 +75,8 @@ def __initDbClient(self): assert __type, 'type error, Not support DB type: {}'.format(self.config.db_type) self.client = getattr(__import__(__type), __type)(name=self.config.db_name, host=self.config.db_host, - port=self.config.db_port) + port=self.config.db_port, + password=self.config.db_password) def get(self, key, **kwargs): return self.client.get(key, **kwargs) diff --git a/DB/SsdbClient.py b/DB/SsdbClient.py index 2522e0071..2249fdcc1 100644 --- a/DB/SsdbClient.py +++ b/DB/SsdbClient.py @@ -32,7 +32,7 @@ class SsdbClient(object): """ - def __init__(self, name, host, port): + def __init__(self, name, **kwargs): """ init :param name: hash name @@ -41,7 +41,7 @@ def __init__(self, name, host, port): :return: """ self.name = name - self.__conn = Redis(connection_pool=BlockingConnectionPool(host=host, port=port)) + self.__conn = Redis(connection_pool=BlockingConnectionPool(**kwargs)) def get(self, proxy): """ diff --git a/Util/GetConfig.py b/Util/GetConfig.py index 8ea57be56..c4c31ab0e 100644 --- a/Util/GetConfig.py +++ b/Util/GetConfig.py @@ -45,6 +45,15 @@ def db_host(self): def db_port(self): return int(self.config_file.get('DB', 'port')) + @LazyProperty + def db_password(self): + try: + password = self.config_file.get('DB', 'password') + except Exception: + password = None + return password + + @LazyProperty def proxy_getter_functions(self): return self.config_file.options('ProxyGetter') From 0238d9f931425736c9d72e4ea3e429ff4f03ef64 Mon Sep 17 00:00:00 2001 From: J_hao104 Date: Mon, 29 Oct 2018 09:44:48 +0800 Subject: [PATCH 016/304] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 8dc30eee9..47480fb92 100644 --- a/README.md +++ b/README.md @@ -178,7 +178,7 @@ freeProxyCustom = 1 # 确保名字和你添加方法名字一致   这里感谢以下contributor的无私奉献: -  [@kangnwh](https://github.com/kangnwh)| [@bobobo80](https://github.com/bobobo80)| [@halleywj](https://github.com/halleywj)| [@newlyedward](https://github.com/newlyedward)| [@wang-ye](https://github.com/wang-ye)| [@gladmo](https://github.com/gladmo)| [@bernieyangmh](https://github.com/bernieyangmh)| [@PythonYXY](https://github.com/PythonYXY)| [@zuijiawoniu](https://github.com/zuijiawoniu)| [@netAir](https://github.com/netAir)| [@scil](https://github.com/scil)| [@tangrela](https://github.com/tangrela)| [@highroom](https://github.com/highroom)| [@luocaodan](https://github.com/luocaodan) +  [@kangnwh](https://github.com/kangnwh)| [@bobobo80](https://github.com/bobobo80)| [@halleywj](https://github.com/halleywj)| [@newlyedward](https://github.com/newlyedward)| [@wang-ye](https://github.com/wang-ye)| [@gladmo](https://github.com/gladmo)| [@bernieyangmh](https://github.com/bernieyangmh)| [@PythonYXY](https://github.com/PythonYXY)| [@zuijiawoniu](https://github.com/zuijiawoniu)| [@netAir](https://github.com/netAir)| [@scil](https://github.com/scil)| [@tangrela](https://github.com/tangrela)| [@highroom](https://github.com/highroom)| [@luocaodan](https://github.com/luocaodan)| [@vc5](https://github.com/vc5) ### Release Notes From 8ac170e981fb08a892c27552782b4528d67f64eb Mon Sep 17 00:00:00 2001 From: Jacob Date: Thu, 8 Nov 2018 21:35:31 +0800 Subject: [PATCH 017/304] =?UTF-8?q?=E5=AE=8C=E5=96=84Redis=E5=92=8CMongodb?= =?UTF-8?q?=E9=AA=8C=E8=AF=81=E5=8A=9F=E8=83=BD?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 添加Config.ini的用户和密码 为username参数做兼容处理 --- Config.ini | 3 ++- DB/DbClient.py | 1 + DB/MongodbClient.py | 4 ++-- DB/RedisClient.py | 8 ++++++-- DB/SsdbClient.py | 7 +++++-- Util/GetConfig.py | 7 +++++++ 6 files changed, 23 insertions(+), 7 deletions(-) diff --git a/Config.ini b/Config.ini index 24f570f01..cf3f8ded2 100644 --- a/Config.ini +++ b/Config.ini @@ -6,7 +6,8 @@ host = 127.0.0.1 port = 6379 ;port = 8888 name = proxy -#password = yourpassword +;username = your_username (Only Mongodb) +;password = your_password [ProxyGetter] ;register the proxy getter function diff --git a/DB/DbClient.py b/DB/DbClient.py index 0036434ae..40127cc11 100644 --- a/DB/DbClient.py +++ b/DB/DbClient.py @@ -76,6 +76,7 @@ def __initDbClient(self): self.client = getattr(__import__(__type), __type)(name=self.config.db_name, host=self.config.db_host, port=self.config.db_port, + username=self.config.db_username, password=self.config.db_password) def get(self, key, **kwargs): diff --git a/DB/MongodbClient.py b/DB/MongodbClient.py index bd0647f51..a30ef6cf1 100644 --- a/DB/MongodbClient.py +++ b/DB/MongodbClient.py @@ -17,9 +17,9 @@ class MongodbClient(object): - def __init__(self, name, host, port): + def __init__(self, name, host, port, **kwargs): self.name = name - self.client = MongoClient(host, port) + self.client = MongoClient(host, port, **kwargs) self.db = self.client.proxy def changeTable(self, name): diff --git a/DB/RedisClient.py b/DB/RedisClient.py index 7d9af4386..1983d855e 100644 --- a/DB/RedisClient.py +++ b/DB/RedisClient.py @@ -22,7 +22,11 @@ class RedisClient(object): Reids client """ - def __init__(self, name, host, port): + # 为了保持DbClient的标准 + # 在RedisClient里面接受username参数, 但不进行使用. + # 因为不能将username通过kwargs传进redis.Redis里面, 会报错: + # TypeError: __init__() got an unexpected keyword argument 'username' + def __init__(self, name, host, port, username, **kwargs): """ init :param name: @@ -31,7 +35,7 @@ def __init__(self, name, host, port): :return: """ self.name = name - self.__conn = redis.Redis(host=host, port=port, db=0) + self.__conn = redis.Redis(host=host, port=port, db=0, **kwargs) def get(self): """ diff --git a/DB/SsdbClient.py b/DB/SsdbClient.py index 2249fdcc1..202ddaa8f 100644 --- a/DB/SsdbClient.py +++ b/DB/SsdbClient.py @@ -31,8 +31,11 @@ class SsdbClient(object): 验证后的代理存放在name为useful_proxy的hash中,key为代理的ip:port,value为一个计数,初始为1,每校验失败一次减1; """ - - def __init__(self, name, **kwargs): + # 为了保持DbClient的标准 + # 在SsdbClient里面接受username参数, 但不进行使用. + # 因为不能将username通过kwargs传进redis.Redis里面, 会报错: + # TypeError: __init__() got an unexpected keyword argument 'username' + def __init__(self, name, username, **kwargs): """ init :param name: hash name diff --git a/Util/GetConfig.py b/Util/GetConfig.py index c4c31ab0e..c26b00f1e 100644 --- a/Util/GetConfig.py +++ b/Util/GetConfig.py @@ -53,6 +53,13 @@ def db_password(self): password = None return password + @LazyProperty + def db_username(self): + try: + username = self.config_file.get('DB', 'username') + except Exception: + username = None + return username @LazyProperty def proxy_getter_functions(self): From 4eaaa7dc12a5e318368f8eb4f1bb08ef8ee7ca48 Mon Sep 17 00:00:00 2001 From: Jacob Date: Thu, 8 Nov 2018 22:22:43 +0800 Subject: [PATCH 018/304] =?UTF-8?q?=E4=BC=98=E5=8C=96Docker=E4=BD=BF?= =?UTF-8?q?=E7=94=A8?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 1. 标准化Dockerfile 2. 添加Docker-compose的部署方式 3. 整理Docker相关的文件 --- Docker/Dockerfile | 13 +++++++++++++ Dockerfile => Docker/Dockerfile.develop | 0 Docker/docker-compose.yml | 14 ++++++++++++++ README.md | 17 +++++++++++++++++ Run/main.py | 3 ++- 5 files changed, 46 insertions(+), 1 deletion(-) create mode 100644 Docker/Dockerfile rename Dockerfile => Docker/Dockerfile.develop (100%) create mode 100644 Docker/docker-compose.yml diff --git a/Docker/Dockerfile b/Docker/Dockerfile new file mode 100644 index 000000000..6ad6f5f53 --- /dev/null +++ b/Docker/Dockerfile @@ -0,0 +1,13 @@ +FROM python:3.6 +WORKDIR /usr/src/app +COPY . . + +ENV DEBIAN_FRONTEND noninteractive +ENV TZ Asia/Shanghai + +RUN pip install --no-cache-dir -r requirements.txt + +EXPOSE 5010 + +WORKDIR /usr/src/app/ +CMD [ "python", "Run/main.py" ] diff --git a/Dockerfile b/Docker/Dockerfile.develop similarity index 100% rename from Dockerfile rename to Docker/Dockerfile.develop diff --git a/Docker/docker-compose.yml b/Docker/docker-compose.yml new file mode 100644 index 000000000..9529745d5 --- /dev/null +++ b/Docker/docker-compose.yml @@ -0,0 +1,14 @@ +version: '2' +services: + proxy_pool: + volumes: + - ..:/usr/src/app + ports: + - "5010:5010" + links: + - proxy_redis + image: "proxy_pool" + proxy_redis: + ports: + - "6379:6379" + image: "redis" \ No newline at end of file diff --git a/README.md b/README.md index 47480fb92..e5cece52a 100644 --- a/README.md +++ b/README.md @@ -74,6 +74,23 @@ port = 5010 # 监听端口 # 依次到Api下启动ProxyApi.py,Schedule下启动ProxyRefreshSchedule.py和ProxyValidSchedule.py即可. ``` +* 生产环境 Docker/docker-compose + +```shell +# Workdir proxy_pool +docker build -t proxy_pool . +pip install docker-compose +docker-compose -f Docker/docker-compose.yml up -d +``` + +* 开发环境 Docker + +```shell +# Workdir proxy_pool +docker build -t proxy_pool . +docker run -it --rm -v $(pwd):/usr/src/app -p 5010:5010 proxy_pool +``` + ### 使用   启动过几分钟后就能看到抓取到的代理IP,你可以直接到数据库中查看,推荐一个[SSDB可视化工具](https://github.com/jhao104/SSDBAdmin)。 diff --git a/Run/main.py b/Run/main.py index 6b07654ee..fcd84f6f4 100644 --- a/Run/main.py +++ b/Run/main.py @@ -15,7 +15,8 @@ import sys from multiprocessing import Process -sys.path.append('../') +sys.path.append('.') +sys.path.append('..') from Api.ProxyApi import run as ProxyApiRun from Schedule.ProxyValidSchedule import run as ValidRun From 935929db18effd7cd319a7de1dc0871419ba3267 Mon Sep 17 00:00:00 2001 From: jhao Date: Fri, 9 Nov 2018 15:49:08 +0800 Subject: [PATCH 019/304] [fix] The Requests package through 2.19.1 before 2018-09-14 for Python sends an HTTP Authorization header to an http URI upon receiving a same-hostname https-to-http redirect, which makes it easier for remote attackers to discover credentials by sniffing the network. --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 5d00da69a..bc3581ff5 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,7 +1,7 @@ APScheduler==3.2.0 werkzeug==0.11.15 Flask==0.12 -requests==2.12.4 +requests==2.20.0 lxml==3.7.2 pymongo From dcfa0e03777ee833ba06967c33b6cd39e0371384 Mon Sep 17 00:00:00 2001 From: jhao Date: Fri, 9 Nov 2018 16:29:29 +0800 Subject: [PATCH 020/304] =?UTF-8?q?[update]=20=E4=BC=98=E5=8C=96=E6=8A=93?= =?UTF-8?q?=E5=8E=BB=E5=87=BD=E6=95=B0=EF=BC=8C=E6=AF=8F=E6=AC=A1=E5=B0=91?= =?UTF-8?q?=E6=8A=93=E4=B8=80=E4=BA=9B=20=E5=87=8F=E5=B0=91=E8=80=97?= =?UTF-8?q?=E6=97=B6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- ProxyGetter/getFreeProxy.py | 52 ++++++++++++++++++------------------- 1 file changed, 26 insertions(+), 26 deletions(-) diff --git a/ProxyGetter/getFreeProxy.py b/ProxyGetter/getFreeProxy.py index bf2e03f61..a560dc700 100644 --- a/ProxyGetter/getFreeProxy.py +++ b/ProxyGetter/getFreeProxy.py @@ -15,17 +15,10 @@ import sys import requests -try: - from importlib import reload # py3 实际不会实用,只是为了不显示语法错误 -except: - reload(sys) - sys.setdefaultencoding('utf-8') - sys.path.append('..') from Util.WebRequest import WebRequest from Util.utilFunction import getHtmlTree -from Util.utilFunction import verifyProxyFormat # for debug to disable insecureWarning requests.packages.urllib3.disable_warnings() @@ -48,9 +41,6 @@ class GetFreeProxy(object): proxy getter """ - def __init__(self): - pass - @staticmethod def freeProxyFirst(page=10): """ @@ -164,7 +154,7 @@ def freeProxySixth(): url = 'http://www.xdaili.cn/ipagent/freeip/getFreeIps?page=1&rows=10' request = WebRequest() try: - res = request.get(url).json() + res = request.get(url, timeout=10).json() for row in res['RESULT']['rows']: yield '{}:{}'.format(row['ip'], row['port']) except Exception as e: @@ -180,7 +170,7 @@ def freeProxySeventh(): 'https://www.kuaidaili.com/free/intr/{page}/' ] for url in url_list: - for page in range(1, 5): + for page in range(1, 2): page_url = url.format(page=page) tree = getHtmlTree(page_url) proxy_list = tree.xpath('.//table//tr') @@ -192,14 +182,14 @@ def freeProxyEight(): """ 秘密代理 http://www.mimiip.com """ - url_gngao = ['http://www.mimiip.com/gngao/%s' % n for n in range(1, 10)] # 国内高匿 - url_gnpu = ['http://www.mimiip.com/gnpu/%s' % n for n in range(1, 10)] # 国内普匿 - url_gntou = ['http://www.mimiip.com/gntou/%s' % n for n in range(1, 10)] # 国内透明 + url_gngao = ['http://www.mimiip.com/gngao/%s' % n for n in range(1, 2)] # 国内高匿 + url_gnpu = ['http://www.mimiip.com/gnpu/%s' % n for n in range(1, 2)] # 国内普匿 + url_gntou = ['http://www.mimiip.com/gntou/%s' % n for n in range(1, 2)] # 国内透明 url_list = url_gngao + url_gnpu + url_gntou request = WebRequest() for url in url_list: - r = request.get(url) + r = request.get(url, timeout=10) proxies = re.findall(r'(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})[\w\W].*(\d+)', r.text) for proxy in proxies: yield ':'.join(proxy) @@ -213,7 +203,7 @@ def freeProxyNinth(): urls = ['https://proxy.coderbusy.com/classical/country/cn.aspx?page=1'] request = WebRequest() for url in urls: - r = request.get(url) + r = request.get(url, timeout=10) proxies = re.findall('data-ip="(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})".+?>(\d+)', r.text) for proxy in proxies: yield ':'.join(proxy) @@ -227,7 +217,7 @@ def freeProxyTen(): urls = ['http://www.ip3366.net/free/'] request = WebRequest() for url in urls: - r = request.get(url) + r = request.get(url, timeout=10) proxies = re.findall(r'(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})[\s\S]*?(\d+)', r.text) for proxy in proxies: yield ":".join(proxy) @@ -246,14 +236,14 @@ def freeProxyEleven(): ] request = WebRequest() for url in urls: - r = request.get(url) + r = request.get(url, timeout=10) proxies = re.findall(r'\s*?(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})\s*?[\s\S]*?\s*?(\d+)\s*?', r.text) for proxy in proxies: yield ":".join(proxy) @staticmethod - def freeProxyTwelve(page_count=8): + def freeProxyTwelve(page_count=2): """ guobanjia http://ip.jiangxianli.com/?page= 免费代理库 @@ -278,7 +268,7 @@ def freeProxyWallFirst(): urls = ['http://cn-proxy.com/', 'http://cn-proxy.com/archives/218'] request = WebRequest() for url in urls: - r = request.get(url) + r = request.get(url, timeout=10) proxies = re.findall(r'(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})[\w\W](\d+)', r.text) for proxy in proxies: yield ':'.join(proxy) @@ -293,7 +283,7 @@ def freeProxyWallSecond(): request = WebRequest() import base64 for url in urls: - r = request.get(url) + r = request.get(url, timeout=10) proxies = re.findall(r"Proxy\('(.*?)'\)", r.text) for proxy in proxies: yield base64.b64decode(proxy).decode() @@ -303,7 +293,7 @@ def freeProxyWallThird(): urls = ['https://list.proxylistplus.com/Fresh-HTTP-Proxy-List-1'] request = WebRequest() for url in urls: - r = request.get(url) + r = request.get(url, timeout=10) proxies = re.findall(r'(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})[\s\S]*?(\d+)', r.text) for proxy in proxies: yield ':'.join(proxy) @@ -312,7 +302,17 @@ def freeProxyWallThird(): if __name__ == '__main__': from CheckProxy import CheckProxy - CheckProxy.checkGetProxyFunc(GetFreeProxy.freeProxyFifth) - CheckProxy.checkGetProxyFunc(GetFreeProxy.freeProxySecond) + # CheckProxy.checkGetProxyFunc(GetFreeProxy.freeProxyFirst) + # CheckProxy.checkGetProxyFunc(GetFreeProxy.freeProxySecond) + # CheckProxy.checkGetProxyFunc(GetFreeProxy.freeProxyThird) + # CheckProxy.checkGetProxyFunc(GetFreeProxy.freeProxyFourth) + # CheckProxy.checkGetProxyFunc(GetFreeProxy.freeProxyFifth) + # CheckProxy.checkGetProxyFunc(GetFreeProxy.freeProxySixth) + # CheckProxy.checkGetProxyFunc(GetFreeProxy.freeProxySeventh) + # CheckProxy.checkGetProxyFunc(GetFreeProxy.freeProxyEight) + # CheckProxy.checkGetProxyFunc(GetFreeProxy.freeProxyNinth) + # CheckProxy.checkGetProxyFunc(GetFreeProxy.freeProxyTen) + # CheckProxy.checkGetProxyFunc(GetFreeProxy.freeProxyEleven) + CheckProxy.checkGetProxyFunc(GetFreeProxy.freeProxyTwelve) - CheckProxy.checkAllGetProxyFunc() + # CheckProxy.checkAllGetProxyFunc() From f203ae19b6436b88d84d181a8f392c4044e04e09 Mon Sep 17 00:00:00 2001 From: jhao Date: Fri, 9 Nov 2018 16:30:02 +0800 Subject: [PATCH 021/304] =?UTF-8?q?[update]=20=E6=A3=80=E6=9F=A5=20getter?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- ProxyGetter/CheckProxy.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ProxyGetter/CheckProxy.py b/ProxyGetter/CheckProxy.py index f6ba9b66a..f29824723 100644 --- a/ProxyGetter/CheckProxy.py +++ b/ProxyGetter/CheckProxy.py @@ -62,7 +62,7 @@ def checkGetProxyFunc(func): count = 0 for proxy in func(): if verifyProxyFormat(proxy): - log.info("fetch proxy: {}".format(proxy)) + log.info("{} fetch proxy: {}".format(func_name, proxy)) count += 1 log.info("{n} completed, fetch proxy number: {c}".format(n=func_name, c=count)) From 69eafeabdd11451adf2b6f42dac1620e729dcba3 Mon Sep 17 00:00:00 2001 From: jhao Date: Fri, 9 Nov 2018 16:30:37 +0800 Subject: [PATCH 022/304] =?UTF-8?q?[update]=20=E6=9B=B4=E6=96=B0=E5=8F=AF?= =?UTF-8?q?=E4=BD=BF=E7=94=A8=E4=BB=A3=E7=90=86=E9=85=8D=E7=BD=AE?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- Config.ini | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/Config.ini b/Config.ini index 24f570f01..1d46fc857 100644 --- a/Config.ini +++ b/Config.ini @@ -1,10 +1,9 @@ [DB] ;Configure the database information -;type: SSDB/REDIS/MONGODB if use redis, only modify the host port,the type should be SSDB +;type: SSDB/MONGODB if use redis, only modify the host port,the type should be SSDB type = SSDB host = 127.0.0.1 port = 6379 -;port = 8888 name = proxy #password = yourpassword @@ -15,17 +14,17 @@ freeProxySecond = 1 ;freeProxyThird = 1 freeProxyFourth = 1 freeProxyFifth = 1 -freeProxySixth = 1 +;freeProxySixth = 1 freeProxySeventh = 1 -freeProxyEight = 1 -freeProxyNinth = 1 +;freeProxyEight = 1 +;freeProxyNinth = 1 freeProxyTen = 1 freeProxyEleven = 1 freeProxyTwelve = 1 ;foreign website, outside the wall -freeProxyWallFirst = 1 -freeProxyWallSecond = 1 -freeProxyWallThird = 1 +;freeProxyWallFirst = 1 +;freeProxyWallSecond = 1 +;freeProxyWallThird = 1 [API] ; API config http://127.0.0.1:5010 From d77e1110e99c49bbe0d81a2beb3beb3f0bbe3205 Mon Sep 17 00:00:00 2001 From: 1again Date: Fri, 9 Nov 2018 20:34:35 +0800 Subject: [PATCH 023/304] =?UTF-8?q?[refine]=20Refine=20GetConfig=20?= =?UTF-8?q?=E4=BD=BF=E7=94=A8=E6=96=B9=E6=B3=95?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 基于配置化的管理思想, 假设项目的任何地方都需要使用GetConfig 于是可以在GetConfig模块里生成一个config对象. 任何地方需要只要import即可. --- Api/ProxyApi.py | 3 +-- DB/DbClient.py | 19 +++++++++---------- Manager/ProxyManager.py | 5 ++--- Util/GetConfig.py | 2 ++ 4 files changed, 14 insertions(+), 15 deletions(-) diff --git a/Api/ProxyApi.py b/Api/ProxyApi.py index b8977f9ca..99a0953a0 100644 --- a/Api/ProxyApi.py +++ b/Api/ProxyApi.py @@ -19,7 +19,7 @@ sys.path.append('../') -from Util.GetConfig import GetConfig +from Util.GetConfig import config from Manager.ProxyManager import ProxyManager app = Flask(__name__) @@ -84,7 +84,6 @@ def getStatus(): def run(): - config = GetConfig() if sys.platform.startswith("win"): app.run(host=config.host_ip, port=config.host_port) else: diff --git a/DB/DbClient.py b/DB/DbClient.py index 0036434ae..869c93af1 100644 --- a/DB/DbClient.py +++ b/DB/DbClient.py @@ -16,7 +16,7 @@ import os import sys -from Util.GetConfig import GetConfig +from Util.GetConfig import config from Util.utilClass import Singleton sys.path.append(os.path.dirname(os.path.abspath(__file__))) @@ -55,7 +55,6 @@ def __init__(self): init :return: """ - self.config = GetConfig() self.__initDbClient() def __initDbClient(self): @@ -64,19 +63,19 @@ def __initDbClient(self): :return: """ __type = None - if "SSDB" == self.config.db_type: + if "SSDB" == config.db_type: __type = "SsdbClient" - elif "REDIS" == self.config.db_type: + elif "REDIS" == config.db_type: __type = "RedisClient" - elif "MONGODB" == self.config.db_type: + elif "MONGODB" == config.db_type: __type = "MongodbClient" else: pass - assert __type, 'type error, Not support DB type: {}'.format(self.config.db_type) - self.client = getattr(__import__(__type), __type)(name=self.config.db_name, - host=self.config.db_host, - port=self.config.db_port, - password=self.config.db_password) + assert __type, 'type error, Not support DB type: {}'.format(config.db_type) + self.client = getattr(__import__(__type), __type)(name=config.db_name, + host=config.db_host, + port=config.db_port, + password=config.db_password) def get(self, key, **kwargs): return self.client.get(key, **kwargs) diff --git a/Manager/ProxyManager.py b/Manager/ProxyManager.py index 33aa76b39..a2f39b3c5 100644 --- a/Manager/ProxyManager.py +++ b/Manager/ProxyManager.py @@ -17,7 +17,7 @@ from Util import EnvUtil from DB.DbClient import DbClient -from Util.GetConfig import GetConfig +from Util.GetConfig import config from Util.LogHandler import LogHandler from Util.utilFunction import verifyProxyFormat from ProxyGetter.getFreeProxy import GetFreeProxy @@ -30,7 +30,6 @@ class ProxyManager(object): def __init__(self): self.db = DbClient() - self.config = GetConfig() self.raw_proxy_queue = 'raw_proxy' self.log = LogHandler('proxy_manager') self.useful_proxy_queue = 'useful_proxy' @@ -41,7 +40,7 @@ def refresh(self): :return: """ self.db.changeTable(self.raw_proxy_queue) - for proxyGetter in self.config.proxy_getter_functions: + for proxyGetter in config.proxy_getter_functions: # fetch try: self.log.info("{func}: fetch proxy start".format(func=proxyGetter)) diff --git a/Util/GetConfig.py b/Util/GetConfig.py index c4c31ab0e..efbbe5077 100644 --- a/Util/GetConfig.py +++ b/Util/GetConfig.py @@ -70,6 +70,8 @@ def host_port(self): def processes(self): return int(self.config_file.get('API', 'processes')) +config = GetConfig() + if __name__ == '__main__': gg = GetConfig() print(gg.db_type) From 40861f429011c53e25693e62daede4b47c253dd2 Mon Sep 17 00:00:00 2001 From: jhao Date: Mon, 12 Nov 2018 10:00:38 +0800 Subject: [PATCH 024/304] [update] config annotation --- Config.ini | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/Config.ini b/Config.ini index 1d46fc857..54f690397 100644 --- a/Config.ini +++ b/Config.ini @@ -27,7 +27,10 @@ freeProxyTwelve = 1 ;freeProxyWallThird = 1 [API] -; API config http://127.0.0.1:5010 +# API config http://127.0.0.1:5010 +# The ip specified when starting the web API ip = 0.0.0.0 +# he port on which to run the web API port = 5010 +# Flask processes option processes = 10 From 2591918c874a001435b3ff0af8604e5070b8ff58 Mon Sep 17 00:00:00 2001 From: jhao Date: Mon, 12 Nov 2018 10:32:01 +0800 Subject: [PATCH 025/304] [update] formatting code --- Util/GetConfig.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Util/GetConfig.py b/Util/GetConfig.py index efbbe5077..5dfae9912 100644 --- a/Util/GetConfig.py +++ b/Util/GetConfig.py @@ -53,14 +53,13 @@ def db_password(self): password = None return password - @LazyProperty def proxy_getter_functions(self): return self.config_file.options('ProxyGetter') @LazyProperty def host_ip(self): - return self.config_file.get('API','ip') + return self.config_file.get('API', 'ip') @LazyProperty def host_port(self): @@ -70,6 +69,7 @@ def host_port(self): def processes(self): return int(self.config_file.get('API', 'processes')) + config = GetConfig() if __name__ == '__main__': From 8a0404521ddcf17031a5975f83c7b6b5a8e3b662 Mon Sep 17 00:00:00 2001 From: jhao Date: Mon, 12 Nov 2018 11:27:13 +0800 Subject: [PATCH 026/304] [update] set default pwd option --- Util/GetConfig.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/Util/GetConfig.py b/Util/GetConfig.py index 5dfae9912..0f60fcd2f 100644 --- a/Util/GetConfig.py +++ b/Util/GetConfig.py @@ -47,11 +47,7 @@ def db_port(self): @LazyProperty def db_password(self): - try: - password = self.config_file.get('DB', 'password') - except Exception: - password = None - return password + return self.config_file.get('DB', 'password', fallback="default pwd") @LazyProperty def proxy_getter_functions(self): @@ -82,3 +78,4 @@ def processes(self): print(gg.host_ip) print(gg.host_port) print(gg.processes) + print(gg.db_password) From 6525ea8e09f3a128f0e2652d5d333005b41196c2 Mon Sep 17 00:00:00 2001 From: jhao Date: Tue, 13 Nov 2018 10:28:31 +0800 Subject: [PATCH 027/304] =?UTF-8?q?[update]=20=E8=B0=83=E6=95=B4=E6=9B=B4?= =?UTF-8?q?=E6=96=B0=E9=80=9F=E5=BA=A6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- Schedule/ProxyRefreshSchedule.py | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/Schedule/ProxyRefreshSchedule.py b/Schedule/ProxyRefreshSchedule.py index 38668072d..a61cc5a25 100644 --- a/Schedule/ProxyRefreshSchedule.py +++ b/Schedule/ProxyRefreshSchedule.py @@ -18,8 +18,7 @@ import time import logging from threading import Thread -# 使用后台调度,不使用阻塞式~ -from apscheduler.schedulers.background import BackgroundScheduler as Sch +from apscheduler.schedulers.background import BackgroundScheduler sys.path.append('../') @@ -74,7 +73,7 @@ def refreshPool(): pp.validProxy() -def batch_refresh(process_num=30): +def batchRefresh(process_num=30): # 检验新代理 pl = [] for num in range(process_num): @@ -89,21 +88,23 @@ def batch_refresh(process_num=30): pl[num].join() -def fetch_all(): +def fetchAll(): p = ProxyRefreshSchedule() # 获取新代理 p.refresh() def run(): - sch = Sch() - sch.add_job(fetch_all, 'interval', minutes=5) # 每5分钟抓取一次 - sch.add_job(batch_refresh, "interval", minutes=1) # 每分钟检查一次 - sch.start() - fetch_all() + scheduler = BackgroundScheduler() + # 不用太快, 网站更新速度比较慢, 太快会加大验证压力, 导致raw_proxy积压 + scheduler.add_job(fetchAll, 'interval', minutes=10, id="fetch_proxy") + scheduler.add_job(batchRefresh, "interval", minutes=1) # 每分钟检查一次 + scheduler.start() + + fetchAll() while True: - time.sleep(1) + time.sleep(3) if __name__ == '__main__': From c1e74b4237971caf9dfefede1405e0516c27fe7a Mon Sep 17 00:00:00 2001 From: jhao Date: Tue, 13 Nov 2018 10:29:14 +0800 Subject: [PATCH 028/304] =?UTF-8?q?[update]=E6=B3=A8=E9=87=8A?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- Manager/ProxyManager.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Manager/ProxyManager.py b/Manager/ProxyManager.py index a2f39b3c5..c770b6224 100644 --- a/Manager/ProxyManager.py +++ b/Manager/ProxyManager.py @@ -36,7 +36,7 @@ def __init__(self): def refresh(self): """ - fetch proxy into Db by ProxyGetter + fetch proxy into Db by ProxyGetter/getFreeProxy.py :return: """ self.db.changeTable(self.raw_proxy_queue) @@ -45,7 +45,7 @@ def refresh(self): try: self.log.info("{func}: fetch proxy start".format(func=proxyGetter)) for proxy in getattr(GetFreeProxy, proxyGetter.strip())(): - # 挨个存储 proxy,优化raw 队列的 push 速度,进而加快 check proxy 的速度 + # 直接存储代理, 不用在代码中排重, hash 结构本身具有排重功能 proxy = proxy.strip() if proxy and verifyProxyFormat(proxy): self.log.info('{func}: fetch proxy {proxy}'.format(func=proxyGetter, proxy=proxy)) From e41a9cbe796744f91e395e4064ebb5c9ef82e39c Mon Sep 17 00:00:00 2001 From: jhao Date: Tue, 13 Nov 2018 10:30:01 +0800 Subject: [PATCH 029/304] [update] dbclient --- DB/DbClient.py | 10 ++++------ DB/SsdbClient.py | 14 ++++++-------- 2 files changed, 10 insertions(+), 14 deletions(-) diff --git a/DB/DbClient.py b/DB/DbClient.py index 869c93af1..f79fc8511 100644 --- a/DB/DbClient.py +++ b/DB/DbClient.py @@ -44,7 +44,7 @@ class DbClient(object): 所有方法需要相应类去具体实现: SSDB:SsdbClient.py - REDIS:RedisClient.py + REDIS:RedisClient.py 停用 统一使用SsdbClient.py """ @@ -66,7 +66,7 @@ def __initDbClient(self): if "SSDB" == config.db_type: __type = "SsdbClient" elif "REDIS" == config.db_type: - __type = "RedisClient" + __type = "SsdbClient" elif "MONGODB" == config.db_type: __type = "MongodbClient" else: @@ -107,7 +107,5 @@ def getNumber(self): if __name__ == "__main__": account = DbClient() - print(account.get()) - account.changeTable('use') - account.put('ac') - print(account.get()) + account.changeTable('useful_proxy') + print(account.pop()) diff --git a/DB/SsdbClient.py b/DB/SsdbClient.py index 202ddaa8f..4ceedd1df 100644 --- a/DB/SsdbClient.py +++ b/DB/SsdbClient.py @@ -31,16 +31,13 @@ class SsdbClient(object): 验证后的代理存放在name为useful_proxy的hash中,key为代理的ip:port,value为一个计数,初始为1,每校验失败一次减1; """ - # 为了保持DbClient的标准 - # 在SsdbClient里面接受username参数, 但不进行使用. - # 因为不能将username通过kwargs传进redis.Redis里面, 会报错: - # TypeError: __init__() got an unexpected keyword argument 'username' - def __init__(self, name, username, **kwargs): + def __init__(self, name, **kwargs): """ init :param name: hash name - :param host: ssdb host - :param port: ssdb port + :param host: host + :param port: port + :param password: password :return: """ self.name = name @@ -114,6 +111,7 @@ def getNumber(self): def changeTable(self, name): self.name = name + if __name__ == '__main__': - c = SsdbClient('useful_proxy', '118.24.52.95', 8899) + c = SsdbClient(name='useful_proxy', host='127.0.0.1', port=8899, password=None) print(c.getAll()) From 428359c8dada998481f038dbdc8d3923e5850c0e Mon Sep 17 00:00:00 2001 From: jhao Date: Tue, 13 Nov 2018 14:02:03 +0800 Subject: [PATCH 030/304] Merge branch 'jhao104/master' of https://github.com/1again/proxy_pool into 1again-jhao104/master # Conflicts: # DB/DbClient.py # Util/GetConfig.py --- Config.ini | 2 +- README.md | 2 +- Util/GetConfig.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/Config.ini b/Config.ini index 44ef085d2..c8a9cc266 100644 --- a/Config.ini +++ b/Config.ini @@ -3,7 +3,7 @@ ;type: SSDB/MONGODB if use redis, only modify the host port,the type should be SSDB type = SSDB host = 127.0.0.1 -port = 6379 +port = 8888 name = proxy ;username = your_username (Only Mongodb) ;password = your_password diff --git a/README.md b/README.md index e5cece52a..8bdca40c5 100644 --- a/README.md +++ b/README.md @@ -195,7 +195,7 @@ freeProxyCustom = 1 # 确保名字和你添加方法名字一致   这里感谢以下contributor的无私奉献: -  [@kangnwh](https://github.com/kangnwh)| [@bobobo80](https://github.com/bobobo80)| [@halleywj](https://github.com/halleywj)| [@newlyedward](https://github.com/newlyedward)| [@wang-ye](https://github.com/wang-ye)| [@gladmo](https://github.com/gladmo)| [@bernieyangmh](https://github.com/bernieyangmh)| [@PythonYXY](https://github.com/PythonYXY)| [@zuijiawoniu](https://github.com/zuijiawoniu)| [@netAir](https://github.com/netAir)| [@scil](https://github.com/scil)| [@tangrela](https://github.com/tangrela)| [@highroom](https://github.com/highroom)| [@luocaodan](https://github.com/luocaodan)| [@vc5](https://github.com/vc5) +  [@kangnwh](https://github.com/kangnwh)| [@bobobo80](https://github.com/bobobo80)| [@halleywj](https://github.com/halleywj)| [@newlyedward](https://github.com/newlyedward)| [@wang-ye](https://github.com/wang-ye)| [@gladmo](https://github.com/gladmo)| [@bernieyangmh](https://github.com/bernieyangmh)| [@PythonYXY](https://github.com/PythonYXY)| [@zuijiawoniu](https://github.com/zuijiawoniu)| [@netAir](https://github.com/netAir)| [@scil](https://github.com/scil)| [@tangrela](https://github.com/tangrela)| [@highroom](https://github.com/highroom)| [@luocaodan](https://github.com/luocaodan)| [@vc5](https://github.com/vc5)| [@1again](https://github.com/1again) ### Release Notes diff --git a/Util/GetConfig.py b/Util/GetConfig.py index 0f60fcd2f..cd354e20f 100644 --- a/Util/GetConfig.py +++ b/Util/GetConfig.py @@ -47,7 +47,7 @@ def db_port(self): @LazyProperty def db_password(self): - return self.config_file.get('DB', 'password', fallback="default pwd") + return self.config_file.get('DB', 'password', fallback=None) @LazyProperty def proxy_getter_functions(self): From 3c3ddaff09a346680c4bcfceb52fb5db0e690d1b Mon Sep 17 00:00:00 2001 From: incoding Date: Wed, 14 Nov 2018 13:17:00 +0800 Subject: [PATCH 031/304] =?UTF-8?q?=E6=B7=BB=E5=8A=A0=E6=9B=B4=E5=8A=A0?= =?UTF-8?q?=E4=B8=A5=E8=B0=A8=E7=9A=84=E4=BB=A3=E7=90=86=E6=A0=A1=E9=AA=8C?= =?UTF-8?q?=E8=A7=84=E5=88=99=EF=BC=88=E4=B8=80=E4=BA=9B=E9=9D=9E=E6=B3=95?= =?UTF-8?q?=E4=BB=A3=E7=90=86=E4=B9=9F=E4=BC=9A=E8=BF=94=E5=9B=9E200?= =?UTF-8?q?=E7=8A=B6=E6=80=81=E7=A0=81=EF=BC=89?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- Util/utilFunction.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Util/utilFunction.py b/Util/utilFunction.py index fc26a59b1..ec86c1fe3 100644 --- a/Util/utilFunction.py +++ b/Util/utilFunction.py @@ -100,7 +100,7 @@ def validUsefulProxy(proxy): try: # 超过20秒的代理就不要了 r = requests.get('http://httpbin.org/ip', proxies=proxies, timeout=10, verify=False) - if r.status_code == 200: + if r.status_code == 200 and r.headers['content-type'].lower().find('application/json') != -1 and r.json()['origin']: # logger.info('%s is ok' % proxy) return True except Exception as e: From e5c1b89c919bae95fcb14e715d7b2e91115dfbe3 Mon Sep 17 00:00:00 2001 From: incoding Date: Wed, 14 Nov 2018 13:33:47 +0800 Subject: [PATCH 032/304] =?UTF-8?q?=E6=B7=BB=E5=8A=A0my=E6=96=87=E4=BB=B6?= =?UTF-8?q?=E5=A4=B9=EF=BC=8C=E4=BF=9D=E5=AD=98=E5=AE=9A=E5=88=B6=E4=BF=AE?= =?UTF-8?q?=E6=94=B9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- my/Config.ini | 37 +++++++++++++++++++++++++++++++++++++ my/Dockerfile | 16 ++++++++++++++++ my/build.sh | 1 + my/run.sh | 14 ++++++++++++++ 4 files changed, 68 insertions(+) create mode 100644 my/Config.ini create mode 100644 my/Dockerfile create mode 100755 my/build.sh create mode 100755 my/run.sh diff --git a/my/Config.ini b/my/Config.ini new file mode 100644 index 000000000..627092c1a --- /dev/null +++ b/my/Config.ini @@ -0,0 +1,37 @@ +[DB] +;Configure the database information +;type: SSDB/MONGODB if use redis, only modify the host port,the type should be SSDB +type = PROXY_POOL_DB_TYPE +host = PROXY_POOL_DB_HOST +port = PROXY_POOL_DB_PORT +name = proxy +;username = your_username (Only Mongodb) +;password = your_password + +[ProxyGetter] +;register the proxy getter function +freeProxyFirst = 1 +freeProxySecond = 1 +;freeProxyThird = 1 +freeProxyFourth = 1 +freeProxyFifth = 1 +;freeProxySixth = 1 +freeProxySeventh = 1 +;freeProxyEight = 1 +;freeProxyNinth = 1 +freeProxyTen = 1 +freeProxyEleven = 1 +freeProxyTwelve = 1 +;foreign website, outside the wall +;freeProxyWallFirst = 1 +;freeProxyWallSecond = 1 +;freeProxyWallThird = 1 + +[API] +# API config http://127.0.0.1:5010 +# The ip specified when starting the web API +ip = 0.0.0.0 +# he port on which to run the web API +port = 8080 +# Flask processes option +processes = 10 diff --git a/my/Dockerfile b/my/Dockerfile new file mode 100644 index 000000000..cb042627c --- /dev/null +++ b/my/Dockerfile @@ -0,0 +1,16 @@ +FROM python:3.6 + +WORKDIR /usr/src/app + +ENV TZ=Asia/Shanghai \ + PROXY_POOL_DB_TYPE=SSDB \ + PROXY_POOL_DB_HOST=redis \ + PROXY_POOL_DB_PORT=6379 + +COPY . . + +RUN pip install --no-cache-dir -r requirements.txt && cp my/Config.ini ./ + +CMD [ "my/run.sh" ] + +EXPOSE 8080 diff --git a/my/build.sh b/my/build.sh new file mode 100755 index 000000000..328e9449d --- /dev/null +++ b/my/build.sh @@ -0,0 +1 @@ +docker build -t registry.cn-beijing.aliyuncs.com/ryttech/proxy_pool:1.12.20181114 -f my/Dockerfile . \ No newline at end of file diff --git a/my/run.sh b/my/run.sh new file mode 100755 index 000000000..441ace853 --- /dev/null +++ b/my/run.sh @@ -0,0 +1,14 @@ +#!/bin/bash + +for var in \ + PROXY_POOL_DB_TYPE \ + PROXY_POOL_DB_HOST \ + PROXY_POOL_DB_PORT \ +; do + val="${!var}" + if [ "$val" ]; then + sed -ri "s/$var/$val/" Config.ini + fi +done + +python Run/main.py \ No newline at end of file From 110b0df1e29529346314378155890d740064ca0b Mon Sep 17 00:00:00 2001 From: jhao Date: Wed, 14 Nov 2018 16:51:41 +0800 Subject: [PATCH 033/304] Merge branch 'jhao104/master' of https://github.com/1again/proxy_pool into 1again-jhao104/master # Conflicts: # DB/DbClient.py # Util/GetConfig.py --- Api/ProxyApi.py | 5 +---- Config.ini | 4 +--- Schedule/ProxyValidSchedule.py | 2 +- 3 files changed, 3 insertions(+), 8 deletions(-) diff --git a/Api/ProxyApi.py b/Api/ProxyApi.py index 99a0953a0..fc759a363 100644 --- a/Api/ProxyApi.py +++ b/Api/ProxyApi.py @@ -84,10 +84,7 @@ def getStatus(): def run(): - if sys.platform.startswith("win"): - app.run(host=config.host_ip, port=config.host_port) - else: - app.run(host=config.host_ip, port=config.host_port, threaded=False, processes=config.processes) + app.run(host=config.host_ip, port=config.host_port) if __name__ == '__main__': diff --git a/Config.ini b/Config.ini index c8a9cc266..5bdf095a1 100644 --- a/Config.ini +++ b/Config.ini @@ -32,6 +32,4 @@ freeProxyTwelve = 1 # The ip specified when starting the web API ip = 0.0.0.0 # he port on which to run the web API -port = 5010 -# Flask processes option -processes = 10 +port = 8080 diff --git a/Schedule/ProxyValidSchedule.py b/Schedule/ProxyValidSchedule.py index 9b075cf90..098c8a336 100644 --- a/Schedule/ProxyValidSchedule.py +++ b/Schedule/ProxyValidSchedule.py @@ -32,7 +32,7 @@ def __init__(self): self.queue = Queue() self.proxy_item = dict() - def __validProxy(self, threads=10): + def __validProxy(self, threads=20): """ 验证useful_proxy代理 :param threads: 线程数 From a3ba910f391fd0220f357f926ef2b5ab6e0a973f Mon Sep 17 00:00:00 2001 From: windhw Date: Thu, 6 Dec 2018 12:48:10 +0800 Subject: [PATCH 034/304] Update main.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 增加对SIGTERM的处理,这样在后台运行的时候,如果kill掉主进程,子进程也能kill --- Run/main.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/Run/main.py b/Run/main.py index fcd84f6f4..cce7b6142 100644 --- a/Run/main.py +++ b/Run/main.py @@ -12,7 +12,7 @@ """ __author__ = 'JHao' -import sys +import sys,signal from multiprocessing import Process sys.path.append('.') @@ -31,6 +31,14 @@ def run(): p_list.append(p2) p3 = Process(target=RefreshRun, name='RefreshRun') p_list.append(p3) + + def kill_child_processes(signum,frame): + for p in p_list: + p.terminate() + sys.exit(1) + + signal.signal(signal.SIGTERM, kill_child_processes) + for p in p_list: p.daemon = True From 2260c6d02f2374d7b4952787cac964f648ffd2b2 Mon Sep 17 00:00:00 2001 From: jhao Date: Fri, 7 Dec 2018 14:21:51 +0800 Subject: [PATCH 035/304] =?UTF-8?q?[update]=20=E6=9B=B4=E6=96=B0httpbin?= =?UTF-8?q?=E6=A3=80=E9=AA=8C?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- Util/utilFunction.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Util/utilFunction.py b/Util/utilFunction.py index ec86c1fe3..f4e802263 100644 --- a/Util/utilFunction.py +++ b/Util/utilFunction.py @@ -100,7 +100,7 @@ def validUsefulProxy(proxy): try: # 超过20秒的代理就不要了 r = requests.get('http://httpbin.org/ip', proxies=proxies, timeout=10, verify=False) - if r.status_code == 200 and r.headers['content-type'].lower().find('application/json') != -1 and r.json()['origin']: + if r.status_code == 200 and r.json().get("origin"): # logger.info('%s is ok' % proxy) return True except Exception as e: From 26aaf1851a5b9bf4bc84ab344835d37d857ab6d7 Mon Sep 17 00:00:00 2001 From: jhao Date: Fri, 7 Dec 2018 14:23:49 +0800 Subject: [PATCH 036/304] =?UTF-8?q?=E3=80=90del=E3=80=91delete=20un=20use?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- my/Config.ini | 37 ------------------------------------- my/Dockerfile | 16 ---------------- my/build.sh | 1 - my/run.sh | 14 -------------- 4 files changed, 68 deletions(-) delete mode 100644 my/Config.ini delete mode 100644 my/Dockerfile delete mode 100755 my/build.sh delete mode 100755 my/run.sh diff --git a/my/Config.ini b/my/Config.ini deleted file mode 100644 index 627092c1a..000000000 --- a/my/Config.ini +++ /dev/null @@ -1,37 +0,0 @@ -[DB] -;Configure the database information -;type: SSDB/MONGODB if use redis, only modify the host port,the type should be SSDB -type = PROXY_POOL_DB_TYPE -host = PROXY_POOL_DB_HOST -port = PROXY_POOL_DB_PORT -name = proxy -;username = your_username (Only Mongodb) -;password = your_password - -[ProxyGetter] -;register the proxy getter function -freeProxyFirst = 1 -freeProxySecond = 1 -;freeProxyThird = 1 -freeProxyFourth = 1 -freeProxyFifth = 1 -;freeProxySixth = 1 -freeProxySeventh = 1 -;freeProxyEight = 1 -;freeProxyNinth = 1 -freeProxyTen = 1 -freeProxyEleven = 1 -freeProxyTwelve = 1 -;foreign website, outside the wall -;freeProxyWallFirst = 1 -;freeProxyWallSecond = 1 -;freeProxyWallThird = 1 - -[API] -# API config http://127.0.0.1:5010 -# The ip specified when starting the web API -ip = 0.0.0.0 -# he port on which to run the web API -port = 8080 -# Flask processes option -processes = 10 diff --git a/my/Dockerfile b/my/Dockerfile deleted file mode 100644 index cb042627c..000000000 --- a/my/Dockerfile +++ /dev/null @@ -1,16 +0,0 @@ -FROM python:3.6 - -WORKDIR /usr/src/app - -ENV TZ=Asia/Shanghai \ - PROXY_POOL_DB_TYPE=SSDB \ - PROXY_POOL_DB_HOST=redis \ - PROXY_POOL_DB_PORT=6379 - -COPY . . - -RUN pip install --no-cache-dir -r requirements.txt && cp my/Config.ini ./ - -CMD [ "my/run.sh" ] - -EXPOSE 8080 diff --git a/my/build.sh b/my/build.sh deleted file mode 100755 index 328e9449d..000000000 --- a/my/build.sh +++ /dev/null @@ -1 +0,0 @@ -docker build -t registry.cn-beijing.aliyuncs.com/ryttech/proxy_pool:1.12.20181114 -f my/Dockerfile . \ No newline at end of file diff --git a/my/run.sh b/my/run.sh deleted file mode 100755 index 441ace853..000000000 --- a/my/run.sh +++ /dev/null @@ -1,14 +0,0 @@ -#!/bin/bash - -for var in \ - PROXY_POOL_DB_TYPE \ - PROXY_POOL_DB_HOST \ - PROXY_POOL_DB_PORT \ -; do - val="${!var}" - if [ "$val" ]; then - sed -ri "s/$var/$val/" Config.ini - fi -done - -python Run/main.py \ No newline at end of file From 223f57d1eb8d243b1d69e28b90a39f0529ec4407 Mon Sep 17 00:00:00 2001 From: jhao Date: Fri, 7 Dec 2018 15:29:55 +0800 Subject: [PATCH 037/304] [fix] fix password --- Util/GetConfig.py | 9 ++------- Util/utilClass.py | 4 ++-- 2 files changed, 4 insertions(+), 9 deletions(-) diff --git a/Util/GetConfig.py b/Util/GetConfig.py index cd354e20f..c25035504 100644 --- a/Util/GetConfig.py +++ b/Util/GetConfig.py @@ -26,7 +26,7 @@ class GetConfig(object): def __init__(self): self.pwd = os.path.split(os.path.realpath(__file__))[0] self.config_path = os.path.join(os.path.split(self.pwd)[0], 'Config.ini') - self.config_file = ConfigParse() + self.config_file = ConfigParse(defaults={"password": None}) self.config_file.read(self.config_path) @LazyProperty @@ -47,7 +47,7 @@ def db_port(self): @LazyProperty def db_password(self): - return self.config_file.get('DB', 'password', fallback=None) + return self.config_file.get('DB', 'password') @LazyProperty def proxy_getter_functions(self): @@ -61,10 +61,6 @@ def host_ip(self): def host_port(self): return int(self.config_file.get('API', 'port')) - @LazyProperty - def processes(self): - return int(self.config_file.get('API', 'processes')) - config = GetConfig() @@ -77,5 +73,4 @@ def processes(self): print(gg.proxy_getter_functions) print(gg.host_ip) print(gg.host_port) - print(gg.processes) print(gg.db_password) diff --git a/Util/utilClass.py b/Util/utilClass.py index 89112ffd8..b3a35f141 100644 --- a/Util/utilClass.py +++ b/Util/utilClass.py @@ -44,8 +44,8 @@ class ConfigParse(ConfigParser): rewrite ConfigParser, for support upper option """ - def __init__(self): - ConfigParser.__init__(self) + def __init__(self, *args, **kwargs): + ConfigParser.__init__(self, *args, **kwargs) def optionxform(self, optionstr): return optionstr From d49a66a6a1051e2eb86231e03a6a0ab3875dee1e Mon Sep 17 00:00:00 2001 From: jhao Date: Fri, 15 Feb 2019 16:02:02 +0800 Subject: [PATCH 038/304] =?UTF-8?q?[update]=20=E4=BD=BF=E7=94=A8setting.py?= =?UTF-8?q?=E6=9B=BF=E6=8D=A2Config.ini=E9=85=8D=E7=BD=AE=E6=96=87?= =?UTF-8?q?=E4=BB=B6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- Api/ProxyApi.py | 2 +- Config.ini | 2 +- Config/ConfigGetter.py | 71 ++++++++++++++++++++++++ {Test => Config}/__init__.py | 11 ++-- Config/setting.py | 54 ++++++++++++++++++ DB/DbClient.py | 2 +- DB/SsdbClient.py | 4 +- Manager/ProxyManager.py | 2 +- Schedule/ProxyValidSchedule.py | 4 +- Test/.pytest_cache/v/cache/lastfailed | 3 - Test/.pytest_cache/v/cache/nodeids | 3 - Test/{testGetConfig.py => testConfig.py} | 22 ++++---- Util/GetConfig.py | 7 +-- Util/utilClass.py | 19 ------- test.py | 5 +- 15 files changed, 154 insertions(+), 57 deletions(-) create mode 100644 Config/ConfigGetter.py rename {Test => Config}/__init__.py (56%) create mode 100644 Config/setting.py delete mode 100644 Test/.pytest_cache/v/cache/lastfailed delete mode 100644 Test/.pytest_cache/v/cache/nodeids rename Test/{testGetConfig.py => testConfig.py} (60%) diff --git a/Api/ProxyApi.py b/Api/ProxyApi.py index fc759a363..91df76f88 100644 --- a/Api/ProxyApi.py +++ b/Api/ProxyApi.py @@ -19,7 +19,7 @@ sys.path.append('../') -from Util.GetConfig import config +from Config.ConfigGetter import config from Manager.ProxyManager import ProxyManager app = Flask(__name__) diff --git a/Config.ini b/Config.ini index 5bdf095a1..ee13eaf2c 100644 --- a/Config.ini +++ b/Config.ini @@ -1,6 +1,6 @@ [DB] ;Configure the database information -;type: SSDB/MONGODB if use redis, only modify the host port,the type should be SSDB +;type: SSDB/MONGODB if use redis, only modify the host port, the type should be SSDB type = SSDB host = 127.0.0.1 port = 8888 diff --git a/Config/ConfigGetter.py b/Config/ConfigGetter.py new file mode 100644 index 000000000..56c766c0d --- /dev/null +++ b/Config/ConfigGetter.py @@ -0,0 +1,71 @@ +# -*- coding: utf-8 -*- +""" +------------------------------------------------- + File Name: ConfigGetter + Description : 读取配置 + Author : JHao + date: 2019/2/15 +------------------------------------------------- + Change Activity: + 2019/2/15: +------------------------------------------------- +""" +__author__ = 'JHao' + + +from Util.utilClass import LazyProperty +from Config.setting import * + + +class ConfigGetter(object): + """ + get config + """ + + def __init__(self): + pass + + @LazyProperty + def db_type(self): + return DATABASES.get("default", {}).get("TYPE", "SSDB") + + @LazyProperty + def db_name(self): + return DATABASES.get("default", {}).get("NAME", "proxy") + + @LazyProperty + def db_host(self): + return DATABASES.get("default", {}).get("HOST", "127.0.0.1") + + @LazyProperty + def db_port(self): + return DATABASES.get("default", {}).get("PORT", 8080) + + @LazyProperty + def db_password(self): + return DATABASES.get("default", {}).get("PASSWORD", "") + + @LazyProperty + def proxy_getter_functions(self): + return PROXY_GETTER + + @LazyProperty + def host_ip(self): + return SERVER_API.get("HOST", "127.0.0.1") + + @LazyProperty + def host_port(self): + return SERVER_API.get("PORT", 5010) + + +config = ConfigGetter() + +if __name__ == '__main__': + print(config.db_type) + print(config.db_name) + print(config.db_host) + print(config.db_port) + print(config.proxy_getter_functions) + print(config.host_ip) + print(config.host_port) + print(config.db_password) diff --git a/Test/__init__.py b/Config/__init__.py similarity index 56% rename from Test/__init__.py rename to Config/__init__.py index 898942953..9a7d547ee 100644 --- a/Test/__init__.py +++ b/Config/__init__.py @@ -1,13 +1,12 @@ # -*- coding: utf-8 -*- """ ------------------------------------------------- - File Name: __init__.py - Description : - Author : J_hao - date: 2017/7/31 + File Name: __init__ + Description : + Author : JHao + date: 2019/2/15 ------------------------------------------------- Change Activity: - 2017/7/31: + 2019/2/15: ------------------------------------------------- """ -__author__ = 'J_hao' diff --git a/Config/setting.py b/Config/setting.py new file mode 100644 index 000000000..39ae36748 --- /dev/null +++ b/Config/setting.py @@ -0,0 +1,54 @@ +# -*- coding: utf-8 -*- +""" +------------------------------------------------- + File Name: setting.py + Description : 配置文件 + Author : JHao + date: 2019/2/15 +------------------------------------------------- + Change Activity: + 2019/2/15: +------------------------------------------------- +""" + +# database config + +DATABASES = { + "default": { + "TYPE": "SSDB", # TYPE SSDB/MONGODB if use redis, only modify the host port, the type should be SSDB + "HOST": "127.0.0.1", + "PORT": 8888, + "NAME": "proxy", + "PASSWORD": "" + + } +} + +# register the proxy getter function + +PROXY_GETTER = [ + "freeProxyFirst", + "freeProxySecond", + # "freeProxyThird", + "freeProxyFourth", + "freeProxyFifth", + # "freeProxySixth" + "freeProxySeventh", + # "freeProxyEight", + # "freeProxyNinth", + "freeProxyTen", + "freeProxyEleven", + "freeProxyTwelve", + # foreign website, outside the wall + "freeProxyWallFirst", + "freeProxyWallSecond", + "freeProxyWallThird" +] + + +# # API config http://127.0.0.1:5010 + +SERVER_API = { + "HOST": "0.0.0.0", # The ip specified which starting the web API + "PORT": 5010 # port number to which the server listens to +} \ No newline at end of file diff --git a/DB/DbClient.py b/DB/DbClient.py index f79fc8511..baa1f79fc 100644 --- a/DB/DbClient.py +++ b/DB/DbClient.py @@ -16,7 +16,7 @@ import os import sys -from Util.GetConfig import config +from Config.ConfigGetter import config from Util.utilClass import Singleton sys.path.append(os.path.dirname(os.path.abspath(__file__))) diff --git a/DB/SsdbClient.py b/DB/SsdbClient.py index 4ceedd1df..85545b355 100644 --- a/DB/SsdbClient.py +++ b/DB/SsdbClient.py @@ -3,7 +3,7 @@ """ ------------------------------------------------- File Name: SsdbClient.py - Description : 封装SSDB操作 + Description : 封装SSDB/Redis操作 Author : JHao date: 2016/12/2 ------------------------------------------------- @@ -27,7 +27,7 @@ class SsdbClient(object): SSDB client SSDB中代理存放的容器为hash: - 原始代理存放在name为raw_proxy的hash中,key为代理的ip:port,value为为None,以后扩展可能会加入代理属性; + 原始代理存放在name为raw_proxy的hash中,key为代理的ip:port,value为None,以后扩展可能会加入代理属性; 验证后的代理存放在name为useful_proxy的hash中,key为代理的ip:port,value为一个计数,初始为1,每校验失败一次减1; """ diff --git a/Manager/ProxyManager.py b/Manager/ProxyManager.py index c770b6224..fd007773b 100644 --- a/Manager/ProxyManager.py +++ b/Manager/ProxyManager.py @@ -17,7 +17,7 @@ from Util import EnvUtil from DB.DbClient import DbClient -from Util.GetConfig import config +from Config.ConfigGetter import config from Util.LogHandler import LogHandler from Util.utilFunction import verifyProxyFormat from ProxyGetter.getFreeProxy import GetFreeProxy diff --git a/Schedule/ProxyValidSchedule.py b/Schedule/ProxyValidSchedule.py index 098c8a336..6b1fa6485 100644 --- a/Schedule/ProxyValidSchedule.py +++ b/Schedule/ProxyValidSchedule.py @@ -56,8 +56,8 @@ def main(self): self.log.info("Start valid useful proxy") self.__validProxy() else: - self.log.info('Valid Complete! sleep 5 minutes.') - time.sleep(60 * 5) + self.log.info('Valid Complete! sleep 5 sec.') + time.sleep(5) self.putQueue() def putQueue(self): diff --git a/Test/.pytest_cache/v/cache/lastfailed b/Test/.pytest_cache/v/cache/lastfailed deleted file mode 100644 index 65c9a06d6..000000000 --- a/Test/.pytest_cache/v/cache/lastfailed +++ /dev/null @@ -1,3 +0,0 @@ -{ - "testGetFreeProxy.py::testGetFreeProxy": true -} \ No newline at end of file diff --git a/Test/.pytest_cache/v/cache/nodeids b/Test/.pytest_cache/v/cache/nodeids deleted file mode 100644 index 0ce3684ce..000000000 --- a/Test/.pytest_cache/v/cache/nodeids +++ /dev/null @@ -1,3 +0,0 @@ -[ - "testGetFreeProxy.py::testGetFreeProxy" -] \ No newline at end of file diff --git a/Test/testGetConfig.py b/Test/testConfig.py similarity index 60% rename from Test/testGetConfig.py rename to Test/testConfig.py index 7f44fa6b4..7ed759387 100644 --- a/Test/testGetConfig.py +++ b/Test/testConfig.py @@ -12,22 +12,22 @@ """ __author__ = 'J_hao' -from Util.GetConfig import GetConfig +from Config.ConfigGetter import config # noinspection PyPep8Naming -def testGetConfig(): +def testConfig(): """ - test class GetConfig in Util/GetConfig :return: """ - gg = GetConfig() - print(gg.db_type) - print(gg.db_name) - print(gg.db_host) - print(gg.db_port) - assert isinstance(gg.proxy_getter_functions, list) - print(gg.proxy_getter_functions) + print(config.db_type) + print(config.db_name) + print(config.db_host) + print(config.db_port) + print(config.db_password) + assert isinstance(config.proxy_getter_functions, list) + print(config.proxy_getter_functions) + if __name__ == '__main__': - testGetConfig() + testConfig() diff --git a/Util/GetConfig.py b/Util/GetConfig.py index c25035504..65554b317 100644 --- a/Util/GetConfig.py +++ b/Util/GetConfig.py @@ -13,8 +13,6 @@ """ __author__ = 'JHao' -import os -from Util.utilClass import ConfigParse from Util.utilClass import LazyProperty @@ -24,10 +22,7 @@ class GetConfig(object): """ def __init__(self): - self.pwd = os.path.split(os.path.realpath(__file__))[0] - self.config_path = os.path.join(os.path.split(self.pwd)[0], 'Config.ini') - self.config_file = ConfigParse(defaults={"password": None}) - self.config_file.read(self.config_path) + pass @LazyProperty def db_type(self): diff --git a/Util/utilClass.py b/Util/utilClass.py index b3a35f141..cffe72443 100644 --- a/Util/utilClass.py +++ b/Util/utilClass.py @@ -9,7 +9,6 @@ ------------------------------------------------- Change Activity: 2016/12/3: Class LazyProperty - 2016/12/4: rewrite ConfigParser ------------------------------------------------- """ __author__ = 'JHao' @@ -33,24 +32,6 @@ def __get__(self, instance, owner): return value -try: - from configparser import ConfigParser # py3 -except: - from ConfigParser import ConfigParser # py2 - - -class ConfigParse(ConfigParser): - """ - rewrite ConfigParser, for support upper option - """ - - def __init__(self, *args, **kwargs): - ConfigParser.__init__(self, *args, **kwargs) - - def optionxform(self, optionstr): - return optionstr - - class Singleton(type): """ Singleton Metaclass diff --git a/test.py b/test.py index 518710d3b..d636535a9 100644 --- a/test.py +++ b/test.py @@ -12,4 +12,7 @@ """ __author__ = 'JHao' -from Schedule import ProxyRefreshSchedule \ No newline at end of file +from Test import testConfig + +if __name__ == '__main__': + testConfig.testConfig() From 2b54d4af03c96515198fada0ee630cf98ea52cf9 Mon Sep 17 00:00:00 2001 From: jhao Date: Fri, 15 Feb 2019 16:06:33 +0800 Subject: [PATCH 039/304] =?UTF-8?q?[update]=20=E4=BD=BF=E7=94=A8setting.py?= =?UTF-8?q?=E6=9B=BF=E6=8D=A2Config.ini=E9=85=8D=E7=BD=AE=E6=96=87?= =?UTF-8?q?=E4=BB=B6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- Test/__init__.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) create mode 100644 Test/__init__.py diff --git a/Test/__init__.py b/Test/__init__.py new file mode 100644 index 000000000..9b16c75ff --- /dev/null +++ b/Test/__init__.py @@ -0,0 +1,13 @@ +# -*- coding: utf-8 -*- +""" +------------------------------------------------- + File Name: __init__ + Description : + Author : JHao + date: 2019/2/15 +------------------------------------------------- + Change Activity: + 2019/2/15: +------------------------------------------------- +""" +__author__ = 'JHao' \ No newline at end of file From f00a4569d26ef963656cf9b7617cec9f8780e666 Mon Sep 17 00:00:00 2001 From: jhao Date: Fri, 15 Feb 2019 16:24:37 +0800 Subject: [PATCH 040/304] =?UTF-8?q?[update]=20=E4=BD=BF=E7=94=A8setting.py?= =?UTF-8?q?=E6=9B=BF=E6=8D=A2Config.ini=E9=85=8D=E7=BD=AE=E6=96=87?= =?UTF-8?q?=E4=BB=B6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- README.md | 61 ++++++++++++++++++++++++++++++++++--------------------- 1 file changed, 38 insertions(+), 23 deletions(-) diff --git a/README.md b/README.md index 8bdca40c5..4f253af81 100644 --- a/README.md +++ b/README.md @@ -39,25 +39,41 @@ git clone git@github.com:jhao104/proxy_pool.git pip install -r requirements.txt ``` -* 配置Config.ini: +* 配置Config/setting.py: ```shell -# Config.ini 为项目配置文件 -# 配置DB -type = SSDB # 如果使用SSDB或redis数据库,均配置为SSDB -host = localhost # db host -port = 8888 # db port -name = proxy # 默认配置 +# Config/setting.py 为项目配置文件 + +# 配置DB +DATABASES = { + "default": { + "TYPE": "SSDB", # 如果使用SSDB或redis数据库,均配置为SSDB + "HOST": "127.0.0.1", # db host + "PORT": 8888, # db port + "NAME": "proxy", # 默认配置 + "PASSWORD": "" # db password + + } +} + # 配置 ProxyGetter -freeProxyFirst = 1 # 这里是启动的抓取函数,可在ProxyGetter/getFreeProxy.py 扩展 -freeProxySecond = 1 -.... -# 配置 HOST (api服务) -ip = 127.0.0.1 # 监听ip,0.0.0.0开启外网访问 -port = 5010 # 监听端口 -# 上面配置启动后,代理api地址为 http://127.0.0.1:5010 +PROXY_GETTER = [ + "freeProxyFirst", # 这里是启用的代理抓取函数名,可在ProxyGetter/getFreeProxy.py 扩展 + "freeProxySecond", + .... +] + + +# 配置 API服务 + +SERVER_API = { + "HOST": "0.0.0.0", # 监听ip, 0.0.0.0 监听所有IP + "PORT": 5010 # 监听端口 +} + +# 上面配置启动后,代理池访问地址为 http://127.0.0.1:5010 ``` @@ -164,18 +180,17 @@ class GetFreeProxy(object): # 确保每个proxy都是 host:ip正确的格式就行 ``` -* 2、添加好方法后,修改Config.ini文件中的`[ProxyGetter]`项: +* 2、添加好方法后,修改Config/setting.py文件中的`PROXY_GETTER`项: -  在`Config.ini`的`[ProxyGetter]`下添加自定义的方法的名字: +  在`PROXY_GETTER`下添加自定义的方法的名字: ```shell - -[ProxyGetter] -;register the proxy getter function -freeProxyFirst = 0 # 如果要取消某个方法,将其删除或赋为0即可 -.... -freeProxyCustom = 1 # 确保名字和你添加方法名字一致 - +PROXY_GETTER = [ + "freeProxyFirst", + "freeProxySecond", + .... + "freeProxyCustom" # # 确保名字和你添加方法名字一致 +] ``` From 16c5a04ba43c05608261581a6affeee1a9d1728f Mon Sep 17 00:00:00 2001 From: jhao Date: Fri, 15 Feb 2019 16:29:33 +0800 Subject: [PATCH 041/304] =?UTF-8?q?[update]=20=E4=BD=BF=E7=94=A8setting.py?= =?UTF-8?q?=E6=9B=BF=E6=8D=A2Config.ini=E9=85=8D=E7=BD=AE=E6=96=87?= =?UTF-8?q?=E4=BB=B6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- Config.ini | 35 -------------------- Config/setting.py | 2 +- Test/testConfig.py | 2 +- Test/testGetFreeProxy.py | 11 +++---- Util/GetConfig.py | 71 ---------------------------------------- 5 files changed, 7 insertions(+), 114 deletions(-) delete mode 100644 Config.ini delete mode 100644 Util/GetConfig.py diff --git a/Config.ini b/Config.ini deleted file mode 100644 index ee13eaf2c..000000000 --- a/Config.ini +++ /dev/null @@ -1,35 +0,0 @@ -[DB] -;Configure the database information -;type: SSDB/MONGODB if use redis, only modify the host port, the type should be SSDB -type = SSDB -host = 127.0.0.1 -port = 8888 -name = proxy -;username = your_username (Only Mongodb) -;password = your_password - -[ProxyGetter] -;register the proxy getter function -freeProxyFirst = 1 -freeProxySecond = 1 -;freeProxyThird = 1 -freeProxyFourth = 1 -freeProxyFifth = 1 -;freeProxySixth = 1 -freeProxySeventh = 1 -;freeProxyEight = 1 -;freeProxyNinth = 1 -freeProxyTen = 1 -freeProxyEleven = 1 -freeProxyTwelve = 1 -;foreign website, outside the wall -;freeProxyWallFirst = 1 -;freeProxyWallSecond = 1 -;freeProxyWallThird = 1 - -[API] -# API config http://127.0.0.1:5010 -# The ip specified when starting the web API -ip = 0.0.0.0 -# he port on which to run the web API -port = 8080 diff --git a/Config/setting.py b/Config/setting.py index 39ae36748..8b87191fa 100644 --- a/Config/setting.py +++ b/Config/setting.py @@ -51,4 +51,4 @@ SERVER_API = { "HOST": "0.0.0.0", # The ip specified which starting the web API "PORT": 5010 # port number to which the server listens to -} \ No newline at end of file +} diff --git a/Test/testConfig.py b/Test/testConfig.py index 7ed759387..ebfd1171f 100644 --- a/Test/testConfig.py +++ b/Test/testConfig.py @@ -2,7 +2,7 @@ """ ------------------------------------------------- File Name: testGetConfig - Description : test all function in GetConfig.py + Description : testGetConfig Author : J_hao date: 2017/7/31 ------------------------------------------------- diff --git a/Test/testGetFreeProxy.py b/Test/testGetFreeProxy.py index 33c3f9e46..854172773 100644 --- a/Test/testGetFreeProxy.py +++ b/Test/testGetFreeProxy.py @@ -16,7 +16,6 @@ import sys import requests - try: from importlib import reload # py3 实际不会实用,只是为了不显示语法错误 except: @@ -25,7 +24,7 @@ sys.path.append('..') from ProxyGetter.getFreeProxy import GetFreeProxy -from Util.GetConfig import GetConfig +from Config.ConfigGetter import config # noinspection PyPep8Naming @@ -34,15 +33,15 @@ def testGetFreeProxy(): test class GetFreeProxy in ProxyGetter/GetFreeProxy :return: """ - gc = GetConfig() - proxy_getter_functions = gc.proxy_getter_functions + proxy_getter_functions = config.proxy_getter_functions for proxyGetter in proxy_getter_functions: proxy_count = 0 for proxy in getattr(GetFreeProxy, proxyGetter.strip())(): if proxy: - print('{func}: fetch proxy {proxy},proxy_count:{proxy_count}'.format(func=proxyGetter, proxy=proxy,proxy_count=proxy_count)) + print('{func}: fetch proxy {proxy},proxy_count:{proxy_count}'.format(func=proxyGetter, proxy=proxy, + proxy_count=proxy_count)) proxy_count += 1 - #assert proxy_count >= 20, '{} fetch proxy fail'.format(proxyGetter) + # assert proxy_count >= 20, '{} fetch proxy fail'.format(proxyGetter) if __name__ == '__main__': diff --git a/Util/GetConfig.py b/Util/GetConfig.py deleted file mode 100644 index 65554b317..000000000 --- a/Util/GetConfig.py +++ /dev/null @@ -1,71 +0,0 @@ -# -*- coding: utf-8 -*- -# !/usr/bin/env python -""" -------------------------------------------------- - File Name: GetConfig.py - Description : fetch config from config.ini - Author : JHao - date: 2016/12/3 -------------------------------------------------- - Change Activity: - 2016/12/3: get db property func -------------------------------------------------- -""" -__author__ = 'JHao' - -from Util.utilClass import LazyProperty - - -class GetConfig(object): - """ - to get config from config.ini - """ - - def __init__(self): - pass - - @LazyProperty - def db_type(self): - return self.config_file.get('DB', 'type') - - @LazyProperty - def db_name(self): - return self.config_file.get('DB', 'name') - - @LazyProperty - def db_host(self): - return self.config_file.get('DB', 'host') - - @LazyProperty - def db_port(self): - return int(self.config_file.get('DB', 'port')) - - @LazyProperty - def db_password(self): - return self.config_file.get('DB', 'password') - - @LazyProperty - def proxy_getter_functions(self): - return self.config_file.options('ProxyGetter') - - @LazyProperty - def host_ip(self): - return self.config_file.get('API', 'ip') - - @LazyProperty - def host_port(self): - return int(self.config_file.get('API', 'port')) - - -config = GetConfig() - -if __name__ == '__main__': - gg = GetConfig() - print(gg.db_type) - print(gg.db_name) - print(gg.db_host) - print(gg.db_port) - print(gg.proxy_getter_functions) - print(gg.host_ip) - print(gg.host_port) - print(gg.db_password) From 55e71981168e57658371e27f7b9517011cca653f Mon Sep 17 00:00:00 2001 From: jhao Date: Mon, 18 Feb 2019 10:53:03 +0800 Subject: [PATCH 042/304] =?UTF-8?q?[update]=20=E4=BD=BF=E7=94=A8setting.py?= =?UTF-8?q?=E6=9B=BF=E6=8D=A2Config.ini=E9=85=8D=E7=BD=AE=E6=96=87?= =?UTF-8?q?=E4=BB=B6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- README.md | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/README.md b/README.md index 4f253af81..b62864f2d 100644 --- a/README.md +++ b/README.md @@ -196,6 +196,27 @@ PROXY_GETTER = [   `ProxyRefreshSchedule`会每隔一段时间抓取一次代理,下次抓取时会自动识别调用你定义的方法。 +### 代理采集 + + 目前实现的采集免费代理网站有(排名不分先后, 下面仅是对其发布的免费代理情况, 付费代理测评可以参考[这里](https://zhuanlan.zhihu.com/p/33576641)): + + | 厂商名称 | 状态 | 更新速度 | 可用率 | 是否被墙 | 地址 | + | ----- | ---- | -------- | ------ | --------- | ----- | + | 无忧代理 | 可用 | 几分钟一次 | * | 否 | [地址](http://www.data5u.com/free/index.html) | + | 66代理 | 可用 | 更新很慢 | * | 否 | [地址](http://www.66ip.cn/) | + | 西刺代理 | 可用 | 几分钟一次 | * | 否 | [地址](http://www.xicidaili.com)| + | 全网代理 | 可用 | 几分钟一次 | * | 否 | [地址](http://www.goubanjia.com/)| + | 训代理 | 已关闭免费代理 | * | * | 否 | [地址](http://www.xdaili.cn/)| + | 快代理 | 可用 |几分钟一次| * | 否 | [地址](https://www.kuaidaili.com/)| + | 云代理 | 可用 |几分钟一次| * | 否 | [地址](http://www.ip3366.net/)| + | IP海 | 可用 |几小时一次| * | 否 | [地址](http://www.iphai.com/)| + | 免费IP代理库 | 可用 |快| * | 否 | [地址](http://ip.jiangxianli.com/)| + | 中国IP地址 | 可用 |几分钟一次| * | 是 | [地址](http://cn-proxy.com/)| + | Proxy List | 可用 |几分钟一次| * | 是 | [地址](https://proxy-list.org/chinese/index.php)| + | ProxyList+ | 可用 |几分钟一次| * | 是 | [地址](https://list.proxylistplus.com/Fresh-HTTP-Proxy-List-1)| + + 如果还有其他好的免费代理网站, 可以在提交在[issues](https://github.com/jhao104/proxy_pool/issues/71), 下次更新时会考虑在项目中支持。 + ### 问题反馈   任何问题欢迎在[Issues](https://github.com/jhao104/proxy_pool/issues) 中反馈,如果没有账号可以去 我的[博客](http://www.spiderpy.cn/blog/message)中留言。 From 086074c4288167871a3c23b34346ab59db01f29c Mon Sep 17 00:00:00 2001 From: jhao Date: Mon, 18 Feb 2019 11:17:38 +0800 Subject: [PATCH 043/304] =?UTF-8?q?[update]=20=E6=9B=B4=E6=96=B066?= =?UTF-8?q?=E4=BB=A3=E7=90=86=E9=87=87=E9=9B=86?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- ProxyGetter/getFreeProxy.py | 48 ++++++++++++++----------------------- 1 file changed, 18 insertions(+), 30 deletions(-) diff --git a/ProxyGetter/getFreeProxy.py b/ProxyGetter/getFreeProxy.py index a560dc700..caa5b6e9c 100644 --- a/ProxyGetter/getFreeProxy.py +++ b/ProxyGetter/getFreeProxy.py @@ -23,18 +23,6 @@ # for debug to disable insecureWarning requests.packages.urllib3.disable_warnings() -""" - 66ip.cn - data5u.com - xicidaili.com - goubanjia.com - xdaili.cn - kuaidaili.com - cn-proxy.com - proxy-list.org - www.mimiip.com to do -""" - class GetFreeProxy(object): """ @@ -64,24 +52,24 @@ def freeProxyFirst(page=10): print(e) @staticmethod - def freeProxySecond(area=33, page=1): + def freeProxySecond(count=20): """ 代理66 http://www.66ip.cn/ - :param area: 抓取代理页数,page=1北京代理页,page=2上海代理页...... - :param page: 翻页 + :param count: 提取数量 :return: """ - area = 33 if area > 33 else area - for area_index in range(1, area + 1): - for i in range(1, page + 1): - url = "http://www.66ip.cn/areaindex_{}/{}.html".format(area_index, i) - html_tree = getHtmlTree(url) - tr_list = html_tree.xpath("//*[@id='footer']/div/table/tr[position()>1]") - if len(tr_list) == 0: - continue - for tr in tr_list: - yield tr.xpath("./td[1]/text()")[0] + ":" + tr.xpath("./td[2]/text()")[0] - break + urls = [ + "http://www.66ip.cn/mo.php?sxb=&tqsl={count}&port=&export=&ktip=&sxa=&submit=%CC%E1++%C8%A1&textarea=", + "http://www.66ip.cn/nmtq.php?getnum={count}" + "&isp=0&anonymoustype=0&start=&ports=&export=&ipaddress=&area=1&proxytype=2&api=66ip", + ] + request = WebRequest() + for _ in urls: + url = _.format(count=count) + html = request.get(url).content + ips = re.findall(r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}:\d{1,5}", html) + for ip in ips: + yield ip.strip() @staticmethod def freeProxyThird(days=1): @@ -180,7 +168,7 @@ def freeProxySeventh(): @staticmethod def freeProxyEight(): """ - 秘密代理 http://www.mimiip.com + 秘密代理 http://www.mimiip.com 不能用 """ url_gngao = ['http://www.mimiip.com/gngao/%s' % n for n in range(1, 2)] # 国内高匿 url_gnpu = ['http://www.mimiip.com/gnpu/%s' % n for n in range(1, 2)] # 国内普匿 @@ -197,7 +185,7 @@ def freeProxyEight(): @staticmethod def freeProxyNinth(): """ - 码农代理 https://proxy.coderbusy.com/ + 码农代理 https://proxy.coderbusy.com/ 不能用 :return: """ urls = ['https://proxy.coderbusy.com/classical/country/cn.aspx?page=1'] @@ -303,7 +291,7 @@ def freeProxyWallThird(): from CheckProxy import CheckProxy # CheckProxy.checkGetProxyFunc(GetFreeProxy.freeProxyFirst) - # CheckProxy.checkGetProxyFunc(GetFreeProxy.freeProxySecond) + CheckProxy.checkGetProxyFunc(GetFreeProxy.freeProxySecond) # CheckProxy.checkGetProxyFunc(GetFreeProxy.freeProxyThird) # CheckProxy.checkGetProxyFunc(GetFreeProxy.freeProxyFourth) # CheckProxy.checkGetProxyFunc(GetFreeProxy.freeProxyFifth) @@ -313,6 +301,6 @@ def freeProxyWallThird(): # CheckProxy.checkGetProxyFunc(GetFreeProxy.freeProxyNinth) # CheckProxy.checkGetProxyFunc(GetFreeProxy.freeProxyTen) # CheckProxy.checkGetProxyFunc(GetFreeProxy.freeProxyEleven) - CheckProxy.checkGetProxyFunc(GetFreeProxy.freeProxyTwelve) + # CheckProxy.checkGetProxyFunc(GetFreeProxy.freeProxyTwelve) # CheckProxy.checkAllGetProxyFunc() From 792fd13e780205823e872d1370daa46a8b088e97 Mon Sep 17 00:00:00 2001 From: jhao Date: Mon, 18 Feb 2019 14:54:44 +0800 Subject: [PATCH 044/304] =?UTF-8?q?[update]=20=E6=9B=B4=E6=96=B0=E4=BB=A3?= =?UTF-8?q?=E7=90=86IP=E6=8A=93=E5=8F=96?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- Config/setting.py | 10 +++++----- ProxyGetter/CheckProxy.py | 2 -- ProxyGetter/getFreeProxy.py | 28 +++++++++++++--------------- Test/testGetFreeProxy.py | 11 ----------- 4 files changed, 18 insertions(+), 33 deletions(-) diff --git a/Config/setting.py b/Config/setting.py index 8b87191fa..63b4f6153 100644 --- a/Config/setting.py +++ b/Config/setting.py @@ -29,10 +29,10 @@ PROXY_GETTER = [ "freeProxyFirst", "freeProxySecond", - # "freeProxyThird", + # "freeProxyThird", # 网站已不能访问 "freeProxyFourth", "freeProxyFifth", - # "freeProxySixth" + # "freeProxySixth" # 不再提供免费代理 "freeProxySeventh", # "freeProxyEight", # "freeProxyNinth", @@ -40,9 +40,9 @@ "freeProxyEleven", "freeProxyTwelve", # foreign website, outside the wall - "freeProxyWallFirst", - "freeProxyWallSecond", - "freeProxyWallThird" + # "freeProxyWallFirst", + # "freeProxyWallSecond", + # "freeProxyWallThird" ] diff --git a/ProxyGetter/CheckProxy.py b/ProxyGetter/CheckProxy.py index f29824723..2b3fc6a29 100644 --- a/ProxyGetter/CheckProxy.py +++ b/ProxyGetter/CheckProxy.py @@ -12,11 +12,9 @@ """ __author__ = 'JHao' -import sys from getFreeProxy import GetFreeProxy from Util.utilFunction import verifyProxyFormat -sys.path.append('../') from Util.LogHandler import LogHandler diff --git a/ProxyGetter/getFreeProxy.py b/ProxyGetter/getFreeProxy.py index caa5b6e9c..cdfa843a0 100644 --- a/ProxyGetter/getFreeProxy.py +++ b/ProxyGetter/getFreeProxy.py @@ -88,7 +88,7 @@ def freeProxyThird(days=1): pass @staticmethod - def freeProxyFourth(page_count=2): + def freeProxyFourth(page_count=1): """ 西刺代理 http://www.xicidaili.com :return: @@ -136,7 +136,7 @@ def freeProxyFifth(): @staticmethod def freeProxySixth(): """ - 讯代理 http://www.xdaili.cn/ + 讯代理 http://www.xdaili.cn/ 已停用 :return: """ url = 'http://www.xdaili.cn/ipagent/freeip/getFreeIps?page=1&rows=10' @@ -154,21 +154,19 @@ def freeProxySeventh(): 快代理 https://www.kuaidaili.com """ url_list = [ - 'https://www.kuaidaili.com/free/inha/{page}/', - 'https://www.kuaidaili.com/free/intr/{page}/' + 'https://www.kuaidaili.com/free/inha/', + 'https://www.kuaidaili.com/free/intr/' ] for url in url_list: - for page in range(1, 2): - page_url = url.format(page=page) - tree = getHtmlTree(page_url) - proxy_list = tree.xpath('.//table//tr') - for tr in proxy_list[1:]: - yield ':'.join(tr.xpath('./td/text()')[0:2]) + tree = getHtmlTree(url) + proxy_list = tree.xpath('.//table//tr') + for tr in proxy_list[1:]: + yield ':'.join(tr.xpath('./td/text()')[0:2]) @staticmethod def freeProxyEight(): """ - 秘密代理 http://www.mimiip.com 不能用 + 秘密代理 http://www.mimiip.com 已停用 """ url_gngao = ['http://www.mimiip.com/gngao/%s' % n for n in range(1, 2)] # 国内高匿 url_gnpu = ['http://www.mimiip.com/gnpu/%s' % n for n in range(1, 2)] # 国内普匿 @@ -185,7 +183,7 @@ def freeProxyEight(): @staticmethod def freeProxyNinth(): """ - 码农代理 https://proxy.coderbusy.com/ 不能用 + 码农代理 https://proxy.coderbusy.com/ 已停用 :return: """ urls = ['https://proxy.coderbusy.com/classical/country/cn.aspx?page=1'] @@ -233,7 +231,7 @@ def freeProxyEleven(): @staticmethod def freeProxyTwelve(page_count=2): """ - guobanjia http://ip.jiangxianli.com/?page= + http://ip.jiangxianli.com/?page= 免费代理库 超多量 :return: @@ -291,7 +289,7 @@ def freeProxyWallThird(): from CheckProxy import CheckProxy # CheckProxy.checkGetProxyFunc(GetFreeProxy.freeProxyFirst) - CheckProxy.checkGetProxyFunc(GetFreeProxy.freeProxySecond) + # CheckProxy.checkGetProxyFunc(GetFreeProxy.freeProxySecond) # CheckProxy.checkGetProxyFunc(GetFreeProxy.freeProxyThird) # CheckProxy.checkGetProxyFunc(GetFreeProxy.freeProxyFourth) # CheckProxy.checkGetProxyFunc(GetFreeProxy.freeProxyFifth) @@ -300,7 +298,7 @@ def freeProxyWallThird(): # CheckProxy.checkGetProxyFunc(GetFreeProxy.freeProxyEight) # CheckProxy.checkGetProxyFunc(GetFreeProxy.freeProxyNinth) # CheckProxy.checkGetProxyFunc(GetFreeProxy.freeProxyTen) - # CheckProxy.checkGetProxyFunc(GetFreeProxy.freeProxyEleven) + CheckProxy.checkGetProxyFunc(GetFreeProxy.freeProxyEleven) # CheckProxy.checkGetProxyFunc(GetFreeProxy.freeProxyTwelve) # CheckProxy.checkAllGetProxyFunc() diff --git a/Test/testGetFreeProxy.py b/Test/testGetFreeProxy.py index 854172773..5074945b4 100644 --- a/Test/testGetFreeProxy.py +++ b/Test/testGetFreeProxy.py @@ -12,22 +12,11 @@ """ __author__ = 'J_hao' -import re -import sys -import requests -try: - from importlib import reload # py3 实际不会实用,只是为了不显示语法错误 -except: - reload(sys) - sys.setdefaultencoding('utf-8') - -sys.path.append('..') from ProxyGetter.getFreeProxy import GetFreeProxy from Config.ConfigGetter import config -# noinspection PyPep8Naming def testGetFreeProxy(): """ test class GetFreeProxy in ProxyGetter/GetFreeProxy From 07f9845017836d2776272e87551b55fb4a677f1a Mon Sep 17 00:00:00 2001 From: jhao Date: Tue, 19 Feb 2019 15:24:23 +0800 Subject: [PATCH 045/304] =?UTF-8?q?[update]=20=E6=9B=B4=E6=96=B0=E4=BB=A3?= =?UTF-8?q?=E7=90=86IP=E6=8A=93=E5=8F=96?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- doc/release_notes.md | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/doc/release_notes.md b/doc/release_notes.md index 0871a2db5..36e097726 100644 --- a/doc/release_notes.md +++ b/doc/release_notes.md @@ -1,5 +1,11 @@ ## Release Notes +* 1.13 (2019.02) + + 1.使用.py文件替换.ini作为配置文件; + + 2.更新代理采集部分; + * 1.12 (2018.4) 1.优化代理格式检查; From 0c48d9dc1a0e3dcb2f166882ea29ed7ad3213a21 Mon Sep 17 00:00:00 2001 From: J_hao104 Date: Tue, 5 Mar 2019 10:05:06 +0800 Subject: [PATCH 046/304] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index b62864f2d..48edb4c98 100644 --- a/README.md +++ b/README.md @@ -21,7 +21,7 @@ * 支持版本: ![](https://img.shields.io/badge/Python-2.x-green.svg) ![](https://img.shields.io/badge/Python-3.x-blue.svg) -* 测试地址: http://123.207.35.36:5010 (单机勿压。感谢) +* 测试地址: http://118.24.52.95:5010 (单机勿压。感谢) ### 下载安装 From b568bd2092fc4aa405314968ead1102b1216f18d Mon Sep 17 00:00:00 2001 From: weak_ptr Date: Sun, 10 Mar 2019 17:21:54 +0800 Subject: [PATCH 047/304] =?UTF-8?q?[refine]=20=E5=85=81=E8=AE=B8=20docker-?= =?UTF-8?q?compose=20up=20=E7=9B=B4=E6=8E=A5=E8=BF=90=E8=A1=8C=E6=9C=8D?= =?UTF-8?q?=E5=8A=A1=E8=80=8C=E6=97=A0=E9=9C=80=E4=BF=AE=E6=94=B9=E9=85=8D?= =?UTF-8?q?=E7=BD=AE?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 以下为修改内容。 - 移除现在看起来无用的 Dockerfile.develop - 将 Dockerfile 和 docker-compose.yml 移动到项目根目录下,删除 Docker 目录 - 修改 docker-compose.yml 内容,令 docker-compose 自行构建 proxy_pool,通过环境变量传递数据库类型和域名、端口等配置信息,不再暴露 redis 端口到 host 主机 - 修改 Dockerfile 内容,先复制 requirements.txt,完成依赖安装后,再复制代码文件,避免开发迭代时每次都要等 pip install - 修改 Config.setting 模块,先尝试通过环境变量获取配置信息,并提供未配置环境变量时的默认值。 --- Config/setting.py | 24 ++++++++++++++++++++---- Docker/Dockerfile.develop | 27 --------------------------- Docker/docker-compose.yml | 14 -------------- Docker/Dockerfile => Dockerfile | 9 +++------ docker-compose.yml | 14 ++++++++++++++ requirements.txt | 3 --- 6 files changed, 37 insertions(+), 54 deletions(-) delete mode 100644 Docker/Dockerfile.develop delete mode 100644 Docker/docker-compose.yml rename Docker/Dockerfile => Dockerfile (89%) create mode 100644 docker-compose.yml diff --git a/Config/setting.py b/Config/setting.py index 63b4f6153..a74e69a32 100644 --- a/Config/setting.py +++ b/Config/setting.py @@ -12,12 +12,29 @@ """ # database config +from os import getenv + + +class ConfigError(BaseException): + pass + + +DB_TYPE = getenv('db_type', 'SSDB') + +if DB_TYPE == 'SSDB': + DB_HOST = getenv('ssdb_host', '127.0.0.1') + DB_PORT = getenv('ssdb_port', '6379') +elif DB_TYPE == 'MONGODB': + DB_HOST = getenv('mongodb_host', '127.0.0.1') + DB_PORT = getenv('mongodb_host', '27017') +else: + raise ConfigError('Unknown database type, your environment variable `db_type` should be one of SSDB/MONGODB.') DATABASES = { "default": { - "TYPE": "SSDB", # TYPE SSDB/MONGODB if use redis, only modify the host port, the type should be SSDB - "HOST": "127.0.0.1", - "PORT": 8888, + "TYPE": DB_TYPE, # TYPE SSDB/MONGODB if use redis, only modify the host port, the type should be SSDB + "HOST": DB_HOST, + "PORT": DB_PORT, "NAME": "proxy", "PASSWORD": "" @@ -45,7 +62,6 @@ # "freeProxyWallThird" ] - # # API config http://127.0.0.1:5010 SERVER_API = { diff --git a/Docker/Dockerfile.develop b/Docker/Dockerfile.develop deleted file mode 100644 index d97495489..000000000 --- a/Docker/Dockerfile.develop +++ /dev/null @@ -1,27 +0,0 @@ -FROM python:3.6 -WORKDIR /usr/src/app -COPY . . -ENV DEBIAN_FRONTEND noninteractive -ENV TZ Asia/Shanghai - -RUN apt-get update -RUN apt-get install vim -y - -RUN apt-get install -y redis-server -RUN sed -i 's/^\(bind .*\)$/# \1/' /etc/redis/redis.conf \ - && sed -i 's/^\(databases .*\)$/databases 1/' /etc/redis/redis.conf \ - && sed -i 's/^\(daemonize .*\)$/daemonize yes/' /etc/redis/redis.conf -# && sed -i 's/^\(dir .*\)$/# \1\ndir \/data/' /etc/redis/redis.conf \ -# && sed -i 's/^\(logfile .*\)$/# \1/' /etc/redis/redis.conf - -RUN pip install --no-cache-dir -r requirements.txt - - -RUN echo "# ! /bin/sh " > run.sh \ - && echo "redis-server /etc/redis/redis.conf&" >> run.sh \ - && echo "cd Run" >> run.sh \ - && echo "python main.py" >> run.sh \ - && chmod 777 run.sh - -EXPOSE 5010 -CMD [ "sh", "run.sh" ] diff --git a/Docker/docker-compose.yml b/Docker/docker-compose.yml deleted file mode 100644 index 9529745d5..000000000 --- a/Docker/docker-compose.yml +++ /dev/null @@ -1,14 +0,0 @@ -version: '2' -services: - proxy_pool: - volumes: - - ..:/usr/src/app - ports: - - "5010:5010" - links: - - proxy_redis - image: "proxy_pool" - proxy_redis: - ports: - - "6379:6379" - image: "redis" \ No newline at end of file diff --git a/Docker/Dockerfile b/Dockerfile similarity index 89% rename from Docker/Dockerfile rename to Dockerfile index 6ad6f5f53..abe8ddb07 100644 --- a/Docker/Dockerfile +++ b/Dockerfile @@ -1,13 +1,10 @@ FROM python:3.6 -WORKDIR /usr/src/app -COPY . . - ENV DEBIAN_FRONTEND noninteractive ENV TZ Asia/Shanghai - +WORKDIR /usr/src/app +COPY ./requirements.txt . RUN pip install --no-cache-dir -r requirements.txt - +COPY . . EXPOSE 5010 - WORKDIR /usr/src/app/ CMD [ "python", "Run/main.py" ] diff --git a/docker-compose.yml b/docker-compose.yml new file mode 100644 index 000000000..1c7f24659 --- /dev/null +++ b/docker-compose.yml @@ -0,0 +1,14 @@ +version: '2' +services: + proxy_pool: + build: . + ports: + - "5010:5010" + links: + - proxy_redis + environment: + db_type: SSDB + ssdb_host: proxy_redis + ssdb_port: 6379 + proxy_redis: + image: "redis" diff --git a/requirements.txt b/requirements.txt index bc3581ff5..3da935240 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,8 +3,5 @@ werkzeug==0.11.15 Flask==0.12 requests==2.20.0 lxml==3.7.2 - pymongo redis - - From 595b08861abfa0e3a4e8dfa16132686292a5815c Mon Sep 17 00:00:00 2001 From: baiyan Date: Sun, 24 Mar 2019 00:42:18 +0800 Subject: [PATCH 048/304] =?UTF-8?q?=E4=BF=AE=E6=94=B9=E6=97=A0=E5=BF=A7?= =?UTF-8?q?=E4=BB=A3=E7=90=86=E8=A7=A3=E6=9E=90=E4=BB=A3=E7=A0=81?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- ProxyGetter/getFreeProxy.py | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/ProxyGetter/getFreeProxy.py b/ProxyGetter/getFreeProxy.py index cdfa843a0..470cbb3c2 100644 --- a/ProxyGetter/getFreeProxy.py +++ b/ProxyGetter/getFreeProxy.py @@ -33,7 +33,10 @@ class GetFreeProxy(object): def freeProxyFirst(page=10): """ 无忧代理 http://www.data5u.com/ - 几乎没有能用的 + 无忧代理有反爬虫机制。 + 需要获得元素的 classname。 + 匹配classname中每个字符在key中的位置,组合得到一个整数。 + 最后将整数右移3位得到的才是正确的端口号。 :param page: 页数 :return: """ @@ -42,12 +45,21 @@ def freeProxyFirst(page=10): 'http://www.data5u.com/free/gngn/index.shtml', 'http://www.data5u.com/free/gnpt/index.shtml' ] + key = 'ABCDEFGHIZ' for url in url_list: html_tree = getHtmlTree(url) ul_list = html_tree.xpath('//ul[@class="l2"]') for ul in ul_list: try: - yield ':'.join(ul.xpath('.//li/text()')[0:2]) + ip = ul.xpath('./span[1]/li/text()')[0] + classnames = ul.xpath('./span[2]/li/attribute::class')[0] + classname = classnames.split(' ')[1] + port_sum = 0 + for c in classname: + port_sum *= 10 + port_sum += key.index(c) + port = port_sum >> 3 + yield '{}:{}'.format(ip, port) except Exception as e: print(e) From 35467fb3bc8ac5c63b6939df84aa027f820f3421 Mon Sep 17 00:00:00 2001 From: Oddcc Date: Fri, 29 Mar 2019 14:19:19 +0800 Subject: [PATCH 049/304] Update README.md MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 更新文档中生产环境部署命令 --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 48edb4c98..2bee689f4 100644 --- a/README.md +++ b/README.md @@ -96,7 +96,7 @@ SERVER_API = { # Workdir proxy_pool docker build -t proxy_pool . pip install docker-compose -docker-compose -f Docker/docker-compose.yml up -d +docker-compose -f docker-compose.yml up -d ``` * 开发环境 Docker From f8d039e61e0dc88ebfee43f96f9a584f07c9ca90 Mon Sep 17 00:00:00 2001 From: houbaron Date: Wed, 8 May 2019 21:40:11 +0800 Subject: [PATCH 050/304] =?UTF-8?q?[refine]=E5=85=81=E8=AE=B8=20docker-com?= =?UTF-8?q?pose.yml=20=E5=AE=9A=E4=B9=89=E5=AF=86=E7=A0=81=E8=80=8C?= =?UTF-8?q?=E6=97=A0=E9=A1=BB=E4=BF=AE=E6=94=B9=20setting.py?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- Config/setting.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/Config/setting.py b/Config/setting.py index a74e69a32..66b8f0866 100644 --- a/Config/setting.py +++ b/Config/setting.py @@ -24,9 +24,11 @@ class ConfigError(BaseException): if DB_TYPE == 'SSDB': DB_HOST = getenv('ssdb_host', '127.0.0.1') DB_PORT = getenv('ssdb_port', '6379') + DB_PASSWORD = getenv('ssdb_password', '6379') elif DB_TYPE == 'MONGODB': DB_HOST = getenv('mongodb_host', '127.0.0.1') DB_PORT = getenv('mongodb_host', '27017') + DB_PASSWORD = getenv('mongodb_password', '6379') else: raise ConfigError('Unknown database type, your environment variable `db_type` should be one of SSDB/MONGODB.') @@ -36,7 +38,7 @@ class ConfigError(BaseException): "HOST": DB_HOST, "PORT": DB_PORT, "NAME": "proxy", - "PASSWORD": "" + "PASSWORD": DB_PASSWORD } } From bb4a7b9367a74645d1bfecbf92299260ef4bde0f Mon Sep 17 00:00:00 2001 From: houbaron Date: Wed, 8 May 2019 21:44:56 +0800 Subject: [PATCH 051/304] =?UTF-8?q?[refine]=E8=AE=BE=E7=BD=AE=E9=BB=98?= =?UTF-8?q?=E8=AE=A4=E5=AF=86=E7=A0=81?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- Config/setting.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Config/setting.py b/Config/setting.py index 66b8f0866..358b0bfbc 100644 --- a/Config/setting.py +++ b/Config/setting.py @@ -24,11 +24,11 @@ class ConfigError(BaseException): if DB_TYPE == 'SSDB': DB_HOST = getenv('ssdb_host', '127.0.0.1') DB_PORT = getenv('ssdb_port', '6379') - DB_PASSWORD = getenv('ssdb_password', '6379') + DB_PASSWORD = getenv('ssdb_password', '') elif DB_TYPE == 'MONGODB': DB_HOST = getenv('mongodb_host', '127.0.0.1') DB_PORT = getenv('mongodb_host', '27017') - DB_PASSWORD = getenv('mongodb_password', '6379') + DB_PASSWORD = getenv('mongodb_password', '') else: raise ConfigError('Unknown database type, your environment variable `db_type` should be one of SSDB/MONGODB.') From f5a4317bbc96f6396d85337bba735545c437fecd Mon Sep 17 00:00:00 2001 From: hero Date: Sat, 11 May 2019 20:09:56 +0800 Subject: [PATCH 052/304] =?UTF-8?q?=E8=A7=A3=E5=86=B3=E5=85=A8=E7=BD=91?= =?UTF-8?q?=E4=BB=A3=E7=90=86port=E9=94=99=E8=AF=AF=E9=97=AE=E9=A2=98?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- ProxyGetter/getFreeProxy.py | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/ProxyGetter/getFreeProxy.py b/ProxyGetter/getFreeProxy.py index cdfa843a0..330bf090a 100644 --- a/ProxyGetter/getFreeProxy.py +++ b/ProxyGetter/getFreeProxy.py @@ -128,8 +128,20 @@ def freeProxyFifth(): try: # :符号裸放在td下,其他放在div span p中,先分割找出ip,再找port ip_addr = ''.join(each_proxy.xpath(xpath_str)) - port = each_proxy.xpath(".//span[contains(@class, 'port')]/text()")[0] - yield '{}:{}'.format(ip_addr, port) + + # HTML中的port是随机数,真正的端口编码在class后面的字母中。 + # 比如这个: + # 9054 + # CFACE解码后对应的是3128。 + port = 0 + for _ in each_proxy.xpath(".//span[contains(@class, 'port')]" + "/attribute::class")[0]. \ + replace("port ", ""): + port *= 10 + port += (ord(_) - ord('A')) + port /= 8 + + yield '{}:{}'.format(ip_addr, int(port)) except Exception as e: pass From 35f43ecbe67ba869fcb3b7f044185f79a7452699 Mon Sep 17 00:00:00 2001 From: jhao Date: Wed, 10 Jul 2019 17:17:32 +0800 Subject: [PATCH 053/304] [update] fix 272 --- Schedule/ProxyCheck.py | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/Schedule/ProxyCheck.py b/Schedule/ProxyCheck.py index 4300f7bf7..782d993d1 100644 --- a/Schedule/ProxyCheck.py +++ b/Schedule/ProxyCheck.py @@ -15,6 +15,12 @@ import sys from threading import Thread + +try: + from Queue import Empty # py3 +except: + from queue import Empty # py2 + sys.path.append('../') from Util.utilFunction import validUsefulProxy @@ -35,7 +41,10 @@ def __init__(self, queue, item_dict): def run(self): self.db.changeTable(self.useful_proxy_queue) while self.queue.qsize(): - proxy = self.queue.get() + try: + proxy = self.queue.get() + except Empty: + break count = self.item_dict[proxy] if validUsefulProxy(proxy): # 验证通过计数器减1 @@ -53,8 +62,3 @@ def run(self): self.db.put(proxy, num=int(count) + 1) self.queue.task_done() - -if __name__ == '__main__': - # p = ProxyCheck() - # p.run() - pass From 2f39dedbf36c3838233f452323f18ddad25f9e7b Mon Sep 17 00:00:00 2001 From: jhao Date: Thu, 11 Jul 2019 16:39:23 +0800 Subject: [PATCH 054/304] =?UTF-8?q?[update]=20=E4=BB=A3=E7=90=86=E5=AF=B9?= =?UTF-8?q?=E8=B1=A1=E7=B1=BB=E5=9E=8B=E5=B0=81=E8=A3=85?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- ProxyHelper/Proxy.py | 104 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 104 insertions(+) create mode 100644 ProxyHelper/Proxy.py diff --git a/ProxyHelper/Proxy.py b/ProxyHelper/Proxy.py new file mode 100644 index 000000000..dce009e96 --- /dev/null +++ b/ProxyHelper/Proxy.py @@ -0,0 +1,104 @@ +# -*- coding: utf-8 -*- +""" +------------------------------------------------- + File Name: Proxy + Description : 代理对象类型封装 + Author : JHao + date: 2019/7/11 +------------------------------------------------- + Change Activity: + 2019/7/11: 代理对象类型封装 +------------------------------------------------- +""" +__author__ = 'JHao' + + +class Proxy(object): + + def __init__(self, proxy): + if isinstance(proxy, basestring): + self._proxy = proxy + self._fail_count = 0 + self._region = "" + self._type = "" + self._last_status = "" + self._last_time = "" + + elif isinstance(proxy, dict): + self._proxy = proxy.get("proxy") + self._fail_count = proxy.get("fail_count") + self._region = proxy.get("region") + self._type = proxy.get("type") + self._last_status = proxy.get("last_status") + self._last_time = proxy.get("last_time") + + else: + raise TypeError("proxy arg invalid") + + @property + def proxy(self): + """ 代理 ip:port """ + return self._proxy + + @property + def fail_count(self): + """ 检测失败次数 """ + return self._fail_count + + @property + def region(self): + """ 地理位置(国家/城市) """ + return self._region + + @property + def type(self): + """ 透明/匿名/高匿 """ + return self._type + + @property + def last_status(self): + """ 最后一次检测结果 """ + return self._last_status + + @property + def last_time(self): + """ 最后一次检测时间 """ + return self._last_time + + # --- proxy method --- + @fail_count.setter + def fail_count(self, value): + self._fail_count = value + + @region.setter + def region(self, value): + self._region = value + + @type.setter + def type(self, value): + self._type = value + + @last_status.setter + def last_status(self, value): + self._last_status = value + + @last_time.setter + def last_time(self, value): + self._last_time = value + + +def proxy2Json(proxy): + return {"proxy": proxy.proxy, + "fail_count": proxy.fail_count, + "region": proxy.region, + "type": proxy.type, + "last_status": proxy.last_status, + "last_time": proxy.last_time} + + +if __name__ == '__main__': + p = Proxy("127.0.0.1:8080") + + import json + + print json.dumps(p, default=proxy2Json) From 964061e8e80baf2534652e290385f7131f880447 Mon Sep 17 00:00:00 2001 From: jhao Date: Thu, 11 Jul 2019 17:03:31 +0800 Subject: [PATCH 055/304] =?UTF-8?q?[update]=20=E6=97=A0=E5=BF=A7=E4=BB=A3?= =?UTF-8?q?=E7=90=86=E4=BF=AE=E6=94=B9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- ProxyGetter/getFreeProxy.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/ProxyGetter/getFreeProxy.py b/ProxyGetter/getFreeProxy.py index 330bf090a..60dd884c1 100644 --- a/ProxyGetter/getFreeProxy.py +++ b/ProxyGetter/getFreeProxy.py @@ -30,17 +30,14 @@ class GetFreeProxy(object): """ @staticmethod - def freeProxyFirst(page=10): + def freeProxy01(): """ 无忧代理 http://www.data5u.com/ 几乎没有能用的 - :param page: 页数 :return: """ url_list = [ 'http://www.data5u.com/', - 'http://www.data5u.com/free/gngn/index.shtml', - 'http://www.data5u.com/free/gnpt/index.shtml' ] for url in url_list: html_tree = getHtmlTree(url) @@ -300,7 +297,7 @@ def freeProxyWallThird(): if __name__ == '__main__': from CheckProxy import CheckProxy - # CheckProxy.checkGetProxyFunc(GetFreeProxy.freeProxyFirst) + CheckProxy.checkGetProxyFunc(GetFreeProxy.freeProxy01()) # CheckProxy.checkGetProxyFunc(GetFreeProxy.freeProxySecond) # CheckProxy.checkGetProxyFunc(GetFreeProxy.freeProxyThird) # CheckProxy.checkGetProxyFunc(GetFreeProxy.freeProxyFourth) @@ -310,7 +307,7 @@ def freeProxyWallThird(): # CheckProxy.checkGetProxyFunc(GetFreeProxy.freeProxyEight) # CheckProxy.checkGetProxyFunc(GetFreeProxy.freeProxyNinth) # CheckProxy.checkGetProxyFunc(GetFreeProxy.freeProxyTen) - CheckProxy.checkGetProxyFunc(GetFreeProxy.freeProxyEleven) + # CheckProxy.checkGetProxyFunc(GetFreeProxy.freeProxyEleven) # CheckProxy.checkGetProxyFunc(GetFreeProxy.freeProxyTwelve) # CheckProxy.checkAllGetProxyFunc() From f0c7a0f918cad270a508bed8296cd644b5dc1722 Mon Sep 17 00:00:00 2001 From: jhao Date: Thu, 18 Jul 2019 10:00:04 +0800 Subject: [PATCH 056/304] =?UTF-8?q?[update]=20=E7=A0=B4=E8=A7=A3=E4=BB=A3?= =?UTF-8?q?=E7=90=8666=20=E5=8A=A0=E9=80=9F=E4=B9=90?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- Config/setting.py | 4 +-- ProxyGetter/CheckProxy.py | 2 +- ProxyGetter/getFreeProxy.py | 56 +++++++++++++++++++++++++++---------- Util/utilFunction.py | 12 -------- requirements.txt | 1 + 5 files changed, 46 insertions(+), 29 deletions(-) diff --git a/Config/setting.py b/Config/setting.py index 358b0bfbc..4ef1b76eb 100644 --- a/Config/setting.py +++ b/Config/setting.py @@ -46,8 +46,8 @@ class ConfigError(BaseException): # register the proxy getter function PROXY_GETTER = [ - "freeProxyFirst", - "freeProxySecond", + "freeProxy01", + "freeProxy02", # "freeProxyThird", # 网站已不能访问 "freeProxyFourth", "freeProxyFifth", diff --git a/ProxyGetter/CheckProxy.py b/ProxyGetter/CheckProxy.py index 2b3fc6a29..d15be49c9 100644 --- a/ProxyGetter/CheckProxy.py +++ b/ProxyGetter/CheckProxy.py @@ -67,4 +67,4 @@ def checkGetProxyFunc(func): if __name__ == '__main__': CheckProxy.checkAllGetProxyFunc() - CheckProxy.checkGetProxyFunc(GetFreeProxy.freeProxyFirst) + CheckProxy.checkGetProxyFunc(GetFreeProxy.freeProxy01) \ No newline at end of file diff --git a/ProxyGetter/getFreeProxy.py b/ProxyGetter/getFreeProxy.py index 60dd884c1..1b1277af4 100644 --- a/ProxyGetter/getFreeProxy.py +++ b/ProxyGetter/getFreeProxy.py @@ -49,24 +49,52 @@ def freeProxy01(): print(e) @staticmethod - def freeProxySecond(count=20): + def freeProxy02(count=20): """ 代理66 http://www.66ip.cn/ :param count: 提取数量 :return: """ urls = [ - "http://www.66ip.cn/mo.php?sxb=&tqsl={count}&port=&export=&ktip=&sxa=&submit=%CC%E1++%C8%A1&textarea=", - "http://www.66ip.cn/nmtq.php?getnum={count}" - "&isp=0&anonymoustype=0&start=&ports=&export=&ipaddress=&area=1&proxytype=2&api=66ip", - ] - request = WebRequest() - for _ in urls: - url = _.format(count=count) - html = request.get(url).content - ips = re.findall(r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}:\d{1,5}", html) - for ip in ips: - yield ip.strip() + "http://www.66ip.cn/mo.php?sxb=&tqsl={}&port=&export=&ktip=&sxa=&submit=%CC%E1++%C8%A1&textarea=", + "http://www.66ip.cn/nmtq.php?getnum={}&isp=0&anonymoustype=0&s" + "tart=&ports=&export=&ipaddress=&area=0&proxytype=2&api=66ip" + ] + + try: + import execjs + import requests + + headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:34.0) Gecko/20100101 Firefox/34.0', + 'Accept': '*/*', + 'Connection': 'keep-alive', + 'Accept-Language': 'zh-CN,zh;q=0.8'} + session = requests.session() + src = session.get("http://www.66ip.cn/", headers=headers).text + src = src.split("")[0] + '}' + src = src.replace("")[0] + '}' + src = src.replace("")[0] + '}' + src = src.replace("")[0] + '}' - src = src.replace("")[0] + '}' +# src = src.replace("")[0] + '}' -# src = src.replace("")[0] + '}' - src = src.replace("