
Commit 39bcf6b

update
1 parent 74ea581 commit 39bcf6b

File tree

16 files changed: +397 -377 lines changed


examples/example.py

Lines changed: 0 additions & 29 deletions
This file was deleted.

examples/proxytest.py

Lines changed: 0 additions & 15 deletions
This file was deleted.

proxypool/api.py

Lines changed: 8 additions & 9 deletions
@@ -1,42 +1,41 @@
 from flask import Flask, g
-
 from .db import RedisClient
+from .setting import API_HOST, API_PORT
 
 __all__ = ['app']
 
 app = Flask(__name__)
 
-
 def get_conn():
+    """
+    get redis client object
+    :return:
+    """
     if not hasattr(g, 'redis'):
         g.redis = RedisClient()
     return g.redis
 
-
 @app.route('/')
 def index():
     return '<h2>Welcome to Proxy Pool System</h2>'
 
-
 @app.route('/random')
 def get_proxy():
     """
-    Get a proxy
+    get a random proxy
     :return: a random proxy
     """
     conn = get_conn()
     return conn.random()
 
-
 @app.route('/count')
 def get_counts():
     """
-    Get the count of proxies
+    get the count of proxies
     :return: total number of proxies in the pool
     """
     conn = get_conn()
     return str(conn.count())
 
-
 if __name__ == '__main__':
-    app.run()
+    app.run(host=API_HOST, port=API_PORT, threaded=True)
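The added import relies on a proxypool/setting.py module that exposes API_HOST and API_PORT; that module is not shown in this commit. A minimal sketch of what it could contain, with placeholder values (only the two constant names are confirmed by the diff), might be:

# proxypool/setting.py -- hypothetical sketch; only the names API_HOST and
# API_PORT are confirmed by the import above, the values are assumptions.
API_HOST = '0.0.0.0'   # assumed bind address for the Flask app
API_PORT = 5555        # assumed port for the proxy pool API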

proxypool/crawler.py

Lines changed: 0 additions & 147 deletions
This file was deleted.

proxypool/crawlers/__init__.py

Lines changed: 10 additions & 0 deletions
@@ -0,0 +1,10 @@
from .daili66 import Daili66Crawler
from .ip3366 import IP3366Crawler
from .iphai import IPHaiCrawler


__all__ = [
    Daili66Crawler,
    IP3366Crawler,
    IPHaiCrawler
]
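Because __all__ here collects the crawler classes themselves rather than their names as strings, it works as a simple registry to iterate over, though a star import from proxypool.crawlers would not behave as usual (star imports expect string entries). The consumer of this registry is not part of this diff; a plausible usage sketch (the getter name is assumed) is:

# Hypothetical consumer of the crawler registry -- not shown in this commit.
from proxypool.crawlers import __all__ as crawler_classes

def get_all_crawlers():
    # instantiate every registered crawler class
    return [crawler_cls() for crawler_cls in crawler_classes]

for crawler in get_all_crawlers():
    for proxy in crawler.crawl():
        print(proxy)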

proxypool/crawlers/base.py

Lines changed: 27 additions & 0 deletions
@@ -0,0 +1,27 @@
from retrying import retry
import requests
from loguru import logger


class BaseCrawler(object):
    urls = []

    @retry(stop_max_attempt_number=3, retry_on_result=lambda x: x is None)
    def fetch(self, url, **kwargs):
        try:
            response = requests.get(url, **kwargs)
            if response.status_code == 200:
                return response.text
        except requests.ConnectionError:
            return

    @logger.catch
    def crawl(self):
        """
        crawl main method
        """
        for url in self.urls:
            logger.info(f'fetching {url}')
            html = self.fetch(url)
            for proxy in self.parse(html):
                yield proxy
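BaseCrawler defines the contract the concrete crawlers below follow: a class-level urls list plus a parse(html) generator that yields Proxy objects, with fetch() retried up to three times whenever it returns None (a non-200 response or a connection error). An illustrative subclass, with an invented URL and line format, could look like this:

# Illustrative BaseCrawler subclass -- the URL and the host:port line format
# are made up purely to show the urls/parse contract.
from proxypool.proxy import Proxy
from proxypool.crawlers.base import BaseCrawler


class ExampleCrawler(BaseCrawler):
    urls = ['http://example.com/proxy-list.txt']

    def parse(self, html):
        # one "host:port" entry per line in this imaginary source
        for line in html.splitlines():
            host, _, port = line.partition(':')
            if host and port.isdigit():
                yield Proxy(host=host, port=int(port))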

proxypool/crawlers/daili66.py

Lines changed: 31 additions & 0 deletions
@@ -0,0 +1,31 @@
from pyquery import PyQuery as pq
from proxypool.proxy import Proxy
from proxypool.crawlers.base import BaseCrawler


BASE_URL = 'http://www.636ip.cn/{page}.html'
MAX_PAGE = 5

class Daili66Crawler(BaseCrawler):
    """
    daili66 crawler, http://www.66ip.cn/1.html
    """
    urls = [BASE_URL.format(page=page) for page in range(1, MAX_PAGE + 1)]


    def parse(self, html):
        """
        parse html file to get proxies
        :return:
        """
        doc = pq(html)
        trs = doc('.containerbox table tr:gt(0)').items()
        for tr in trs:
            host = tr.find('td:nth-child(1)').text()
            port = int(tr.find('td:nth-child(2)').text())
            yield Proxy(host=host, port=port)

if __name__ == '__main__':
    crawler = Daili66Crawler()
    for proxy in crawler.crawl():
        print(proxy)
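The crawlers construct Proxy(host=..., port=...) objects, but proxypool/proxy.py itself is not part of this diff. A hypothetical minimal stand-in, sufficient for the __main__ blocks above to print something readable, might be:

# Hypothetical stand-in for proxypool.proxy.Proxy -- only the host/port
# constructor keywords are confirmed by the crawlers; everything else is assumed.
class Proxy:
    def __init__(self, host, port):
        self.host = host
        self.port = int(port)

    def __str__(self):
        return f'{self.host}:{self.port}'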

proxypool/crawlers/ip3366.py

Lines changed: 31 additions & 0 deletions
@@ -0,0 +1,31 @@
from proxypool.crawlers.base import BaseCrawler
from proxypool.proxy import Proxy
import re


MAX_PAGE = 5
BASE_URL = 'http://www.ip3366.net/free/?stype=1&page={page}'

class IP3366Crawler(BaseCrawler):
    """
    ip3366 crawler, http://www.ip3366.net/
    """
    urls = [BASE_URL.format(page=i) for i in range(1, 8)]


    def parse(self, html):
        """
        parse html file to get proxies
        :return:
        """
        ip_address = re.compile(r'<tr>\s*<td>(.*?)</td>\s*<td>(.*?)</td>')
        # \s* matches the whitespace (including newlines) between the cells
        re_ip_address = ip_address.findall(html)
        for address, port in re_ip_address:
            proxy = Proxy(host=address.strip(), port=int(port.strip()))
            yield proxy

if __name__ == '__main__':
    crawler = IP3366Crawler()
    for proxy in crawler.crawl():
        print(proxy)
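The ip3366 parser relies on a single regular expression to pull the first two cells out of each table row; the \s* between the tags is what lets it skip the newlines and indentation separating the cells. A quick self-contained check against a made-up table fragment:

# Standalone check of the ip3366 row regex; the HTML fragment is made up.
import re

sample = '''<tr>
  <td>1.2.3.4</td>
  <td>8080</td>
</tr>'''

pattern = re.compile(r'<tr>\s*<td>(.*?)</td>\s*<td>(.*?)</td>')
print(pattern.findall(sample))   # [('1.2.3.4', '8080')]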

proxypool/crawlers/iphai.py

Lines changed: 34 additions & 0 deletions
@@ -0,0 +1,34 @@
from proxypool.crawlers.base import BaseCrawler
from proxypool.proxy import Proxy
import re


BASE_URL = 'http://www.iphai.com/'

class IPHaiCrawler(BaseCrawler):
    """
    iphai crawler, http://www.iphai.com/
    """
    urls = [BASE_URL]


    def parse(self, html):
        """
        parse html file to get proxies
        :return:
        """
        find_tr = re.compile('<tr>(.*?)</tr>', re.S)
        trs = find_tr.findall(html)
        for s in range(1, len(trs)):
            find_ip = re.compile(r'<td>\s+(\d+\.\d+\.\d+\.\d+)\s+</td>', re.S)
            re_ip_address = find_ip.findall(trs[s])
            find_port = re.compile(r'<td>\s+(\d+)\s+</td>', re.S)
            re_port = find_port.findall(trs[s])
            for address, port in zip(re_ip_address, re_port):
                proxy = Proxy(host=address.strip(), port=int(port.strip()))
                yield proxy

if __name__ == '__main__':
    crawler = IPHaiCrawler()
    for proxy in crawler.crawl():
        print(proxy)
