from bs4 import BeautifulSoup
import requests

# Browser-like request headers so InfoQ serves the normal HTML page instead
# of rejecting the script as a bot.
headers = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
    "Accept-Language": "zh-CN,zh;q=0.8",
    "Connection": "close",
    "Cookie": "_gauges_unique_hour=1; _gauges_unique_day=1; _gauges_unique_month=1; _gauges_unique_year=1; _gauges_unique=1",
    "Referer": "http://www.infoq.com",
    "Upgrade-Insecure-Requests": "1",
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.98 Safari/537.36 LBBROWSER"
}

url = 'https://www.infoq.com/news/'


def craw(url):
    """Fetch *url* and print the complete HTML body (debugging helper)."""
    response = requests.get(url, headers=headers)
    print(response.text)


def craw2(url):
    """Fetch *url* and print the news titles found on the page.

    Titles are taken from the ``title`` attribute of ``<a>`` tags inside
    ``<div class="items__content">`` containers; anchors without a
    ``title`` attribute are skipped.
    """
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.text, 'lxml')
    for item_div in soup.find_all('div', class_='items__content'):
        print([a.get('title')
               for a in item_div.find_all('a') if a.get('title')])


if __name__ == '__main__':
    # Page through the news list. InfoQ paginates by item offset
    # (15 items per page): /news/15, /news/30, /news/45.
    # BUG FIX: use https like the base URL above (the original loop used
    # plain http, inconsistent with the module-level `url`), and guard with
    # __main__ so importing this module no longer triggers network requests.
    for offset in range(15, 46, 15):
        craw2('https://www.infoq.com/news/' + str(offset))
from bs4 import BeautifulSoup
import requests
import os
import shutil

# Browser-like request headers so InfoQ serves the normal HTML page instead
# of rejecting the script as a bot.
headers = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
    "Accept-Language": "zh-CN,zh;q=0.8",
    "Connection": "close",
    "Cookie": "_gauges_unique_hour=1; _gauges_unique_day=1; _gauges_unique_month=1; _gauges_unique_year=1; _gauges_unique=1",
    "Referer": "http://www.infoq.com",
    "Upgrade-Insecure-Requests": "1",
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.98 Safari/537.36 LBBROWSER"
}

url = 'http://www.infoq.com/presentations'


def download_jpg(image_url, image_localpath):
    """Stream-download *image_url* to *image_localpath*.

    requests has no dedicated download helper; with ``stream=True`` only the
    response headers are fetched and the body is read lazily from
    ``response.raw``, so large images are copied to disk without being held
    entirely in memory. Responses other than 200 are silently ignored.
    """
    response = requests.get(image_url, stream=True)
    if response.status_code == 200:
        with open(image_localpath, 'wb') as f:
            # BUG FIX: the attribute is ``decode_content`` — the original
            # misspelled it ``deconde_content``, which merely set a useless
            # attribute and left gzip/deflate-encoded bodies undecoded on
            # disk (corrupt image files).
            response.raw.decode_content = True
            shutil.copyfileobj(response.raw, f)


def craw3(url):
    """Fetch *url* and download every ``<img>`` on the page.

    Images are located inside ``<div class="items__content">`` containers
    and saved into the current working directory, keeping the basename of
    the image URL.
    """
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.text, 'lxml')
    save_dir = os.path.abspath('.')  # hoisted: loop-invariant (was recomputed per image)
    for pic_div in soup.find_all('div', class_='items__content'):
        for pic in pic_div.find_all('img'):
            imgurl = pic.get('src')
            imgpath = os.path.join(save_dir, os.path.basename(imgurl))
            print('开始下载 %s' % imgurl)
            download_jpg(imgurl, imgpath)


if __name__ == '__main__':
    # Page through the presentations list (12 items per page), and guard
    # with __main__ so importing this module no longer triggers downloads.
    # NOTE(review): the original builds 'presentations' + str(i) with no
    # separator (e.g. .../presentations12) — looks suspicious compared to
    # the /news/15 pattern; kept as-is, confirm against the live site.
    page = 0
    for offset in range(12, 37, 12):
        page += 1
        print('第 %d 页' % page)
        craw3('http://www.infoq.com/presentations' + str(offset))