Skip to content

Commit 8f36dc6

Browse files
authored
中文站改版改用英文站
1 parent 2eea69f commit 8f36dc6

File tree

1 file changed

+44
-0
lines changed

1 file changed

+44
-0
lines changed

timegeekbang.com/pachong_infoq.py

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
from bs4 import BeautifulSoup
2+
import requests
3+
4+
# Browser-like request headers passed to every requests.get() call below.
headers = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
    "Accept-Language": "zh-CN,zh;q=0.8",
    "Connection": "close",
    "Cookie": "_gauges_unique_hour=1; _gauges_unique_day=1; _gauges_unique_month=1; _gauges_unique_year=1; _gauges_unique=1",
    "Referer": "http://www.infoq.com",
    "Upgrade-Insecure-Requests": "1",
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.98 Safari/537.36 LBBROWSER",
}

# InfoQ news listing page.
url = 'https://www.infoq.com/news/'
15+
16+
# Fetch the complete page content.
def craw(url, timeout=10):
    """Download *url* and print the response body as text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before ``requests`` gives
            up. Without a timeout a stalled connection would block the
            script indefinitely.

    Raises:
        requests.exceptions.RequestException: If the request fails or
            times out.
    """
    response = requests.get(url, headers=headers, timeout=timeout)
    print(response.text)
22+
23+
# craw(url)
24+
25+
# Extract the news titles.
def craw2(url, timeout=10):
    """Download *url* and print the link titles found on the page.

    One list is printed per ``<div class="items__content">`` element; it
    holds the non-empty ``title`` attribute of every ``<a>`` inside that
    element.

    Args:
        url: Address of the listing page to fetch.
        timeout: Seconds to wait for the server before ``requests`` gives
            up. Without a timeout a stalled connection would block the
            script indefinitely.

    Raises:
        requests.exceptions.RequestException: If the request fails or
            times out.
    """
    response = requests.get(url, headers=headers, timeout=timeout)

    soup = BeautifulSoup(response.text, 'lxml')

    for item in soup.find_all('div', class_='items__content'):
        # Read each attribute once, then drop links with a missing/empty title.
        link_titles = (link.get('title') for link in item.find_all('a'))
        print([title for title in link_titles if title])
36+
37+
# craw2(url)
38+
39+
40+
# Pagination: request the listing at offsets 15, 30 and 45.
# NOTE(review): this base address uses plain http while the module-level
# `url` defined earlier uses https -- confirm which scheme is intended.
for i in range(15, 46, 15):
    url = 'http://www.infoq.com/news/' + str(i)
    craw2(url)

0 commit comments

Comments
 (0)