
Commit 938983c

Merge pull request Jack-Cherish#46 from sys0613/master
2 parents 04f2783 + 5f58699 commit 938983c

File tree

1 file changed: +52 -0 lines changed

one_hour_spider/biquge20180731.py

Lines changed: 52 additions & 0 deletions
@@ -0,0 +1,52 @@
# -*- coding:utf-8 -*-
import os

import requests
from bs4 import BeautifulSoup

"""
Scrapes a novel from the Biquge site at www.biqubao.com. The URL in the
original tutorial would not open when I tried it, so I followed the
tutorial and scraped the Biquge novel site instead.
2018-07-31
"""

if __name__ == '__main__':
    # Index page of the novel to scrape. Edit this URL for each run, and
    # make sure the local save root below exists.
    target = "https://www.biqubao.com/book/17570/"
    # Local root directory for the scraped text
    save_path = 'G:/pythonlearn'
    # Site root, used to turn relative chapter links into absolute URLs
    index_path = 'https://www.biqubao.com'

    req = requests.get(url=target)
    # requests' default guess does not match the site's response encoding,
    # so switch to the GBK encoding the site actually uses.
    print(req.encoding)
    req.encoding = 'gbk'
    # Parse the HTML
    soup = BeautifulSoup(req.text, "html.parser")
    list_tag = soup.find_all('div', id="list")
    print('list_tag:', list_tag)
    # Novel title
    story_title = list_tag[0].dl.dt.string
    # Create a folder named after the novel if it does not already exist
    dir_path = os.path.join(save_path, story_title)
    if not os.path.exists(dir_path):
        os.mkdir(dir_path)
    # Loop over the chapters, collecting each chapter's name and URL
    for dd_tag in list_tag[0].dl.find_all('dd'):
        # Chapter name
        chapter_name = dd_tag.string
        # Chapter URL (the href on the page is site-relative)
        chapter_url = index_path + dd_tag.a.get('href')
        # Fetch the chapter page and scrape the body text
        chapter_req = requests.get(url=chapter_url)
        chapter_req.encoding = 'gbk'
        chapter_soup = BeautifulSoup(chapter_req.text, "html.parser")
        # The tag that holds the chapter body
        content_tag = chapter_soup.div.find(id="content")
        # Extract the text, turning non-breaking spaces into line breaks
        content_text = content_tag.text.replace('\xa0', '\n')
        # Write the chapter to a .txt file named after the chapter
        with open(os.path.join(dir_path, chapter_name + '.txt'), 'w',
                  encoding='utf-8') as f:
            f.write('Source URL: ' + chapter_url)
            f.write(content_text)
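
A minimal alternative sketch for the encoding step above: instead of hard-coding 'gbk', requests can detect the charset from the response bytes via its apparent_encoding property, which avoids breaking if the site ever changes charsets (at the cost of a detection pass per response). The URL here is just the same index page used in the script.

import requests

req = requests.get("https://www.biqubao.com/book/17570/")
print(req.encoding)                    # charset declared in the headers, if any
req.encoding = req.apparent_encoding   # charset detected from the body, e.g. 'GB2312'
print(req.text[:200])                  # the Chinese text now decodes correctly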
