from bs4 import BeautifulSoup
import requests

# Browser-like request headers so InfoQ serves the normal HTML page instead
# of rejecting the script as a bot.
headers = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
    "Accept-Language": "zh-CN,zh;q=0.8",
    "Connection": "close",
    "Cookie": "_gauges_unique_hour=1; _gauges_unique_day=1; _gauges_unique_month=1; _gauges_unique_year=1; _gauges_unique=1",
    "Referer": "http://www.infoq.com",
    "Upgrade-Insecure-Requests": "1",
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.98 Safari/537.36 LBBROWSER"
}

url = 'https://www.infoq.com/news/'


def craw(url):
    """Fetch *url* and print the complete HTML body (debugging helper)."""
    response = requests.get(url, headers=headers)
    print(response.text)


def craw2(url):
    """Fetch *url* and print the news titles found on the page.

    Titles are taken from the ``title`` attribute of ``<a>`` tags inside
    ``<div class="items__content">`` containers; anchors without a
    ``title`` attribute are skipped.
    """
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.text, 'lxml')
    for item_div in soup.find_all('div', class_='items__content'):
        print([a.get('title')
               for a in item_div.find_all('a') if a.get('title')])


if __name__ == '__main__':
    # Page through the news list. InfoQ paginates by item offset
    # (15 items per page): /news/15, /news/30, /news/45.
    # BUG FIX: use https like the base URL above (the original loop used
    # plain http, inconsistent with the module-level `url`), and guard with
    # __main__ so importing this module no longer triggers network requests.
    for offset in range(15, 46, 15):
        craw2('https://www.infoq.com/news/' + str(offset))
from bs4 import BeautifulSoup
import requests
import os
import shutil

# Browser-like request headers so InfoQ serves the normal HTML page instead
# of rejecting the script as a bot.
headers = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
    "Accept-Language": "zh-CN,zh;q=0.8",
    "Connection": "close",
    "Cookie": "_gauges_unique_hour=1; _gauges_unique_day=1; _gauges_unique_month=1; _gauges_unique_year=1; _gauges_unique=1",
    "Referer": "http://www.infoq.com",
    "Upgrade-Insecure-Requests": "1",
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.98 Safari/537.36 LBBROWSER"
}

url = 'http://www.infoq.com/presentations'


def download_jpg(image_url, image_localpath):
    """Stream-download *image_url* to *image_localpath*.

    requests has no dedicated download helper; with ``stream=True`` only the
    response headers are fetched and the body is read lazily from
    ``response.raw``, so large images are copied to disk without being held
    entirely in memory. Responses other than 200 are silently ignored.
    """
    response = requests.get(image_url, stream=True)
    if response.status_code == 200:
        with open(image_localpath, 'wb') as f:
            # BUG FIX: the attribute is ``decode_content`` — the original
            # misspelled it ``deconde_content``, which merely set a useless
            # attribute and left gzip/deflate-encoded bodies undecoded on
            # disk (corrupt image files).
            response.raw.decode_content = True
            shutil.copyfileobj(response.raw, f)


def craw3(url):
    """Fetch *url* and download every ``<img>`` on the page.

    Images are located inside ``<div class="items__content">`` containers
    and saved into the current working directory, keeping the basename of
    the image URL.
    """
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.text, 'lxml')
    save_dir = os.path.abspath('.')  # hoisted: loop-invariant (was recomputed per image)
    for pic_div in soup.find_all('div', class_='items__content'):
        for pic in pic_div.find_all('img'):
            imgurl = pic.get('src')
            imgpath = os.path.join(save_dir, os.path.basename(imgurl))
            print('开始下载 %s' % imgurl)
            download_jpg(imgurl, imgpath)


if __name__ == '__main__':
    # Page through the presentations list (12 items per page), and guard
    # with __main__ so importing this module no longer triggers downloads.
    # NOTE(review): the original builds 'presentations' + str(i) with no
    # separator (e.g. .../presentations12) — looks suspicious compared to
    # the /news/15 pattern; kept as-is, confirm against the live site.
    page = 0
    for offset in range(12, 37, 12):
        page += 1
        print('第 %d 页' % page)
        craw3('http://www.infoq.com/presentations' + str(offset))