# -*- coding:utf-8 -*-
import requests
from bs4 import BeautifulSoup
import os

"""
Scrapes a novel from the Biquge site www.biqubao.com. The URL in the
tutorial I was following would not open at the time, so I applied the
tutorial to biqubao.com instead.
2018-07-31
"""
if __name__ == '__main__':
    # Index page of the novel to scrape; edit this URL for each run,
    # and make sure the local save root below exists.
    target = "https://www.biqubao.com/book/17570/"
    # Local root directory where the scraped text is saved
    save_path = 'G:/pythonlearn'
    # Root URL of the Biquge site, used to build absolute chapter links
    index_path = 'https://www.biqubao.com'

    req = requests.get(url=target)
    # Check requests' default encoding: it does not match the site's
    # response, so switch to the gbk encoding the site actually uses.
    print(req.encoding)
    req.encoding = 'gbk'
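    # A sketch of a more robust alternative (my addition, not in the
    # original): requests can sniff the likely charset from the response
    # body, which avoids hard-coding 'gbk' if the site changes encodings:
    # req.encoding = req.apparent_encoding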
    # Parse the HTML
    soup = BeautifulSoup(req.text, "html.parser")
    list_tag = soup.div(id="list")
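    # Note: calling a tag like soup.div(id="list") is BeautifulSoup
    # shorthand for soup.div.find_all(id="list"), so list_tag is a
    # list of matching tags, not a single tag.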
    print('list_tag:', list_tag)
    # Get the novel's title
    story_title = list_tag[0].dl.dt.string
    # Create a directory named after the novel if it does not already exist
    dir_path = os.path.join(save_path, story_title)
    if not os.path.exists(dir_path):
        os.mkdir(dir_path)
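    # Aside (my addition): os.mkdir fails if save_path itself is missing;
    # os.makedirs(dir_path, exist_ok=True) would create the whole chain
    # and could replace the exists()/mkdir pair above.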
    # Loop over every chapter, collecting its name and URL
    for dd_tag in list_tag[0].dl.find_all('dd'):
        # Chapter name
        chapter_name = dd_tag.string
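        # Hypothetical hardening (not in the original script): chapter
        # titles sometimes contain characters Windows forbids in file
        # names; with `import re` added above, they could be stripped via
        # chapter_name = re.sub(r'[\\/:*?"<>|]', '_', chapter_name)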
        # Chapter URL (the href is site-relative, so prepend the site root)
        chapter_url = index_path + dd_tag.a.get('href')
        # Fetch the chapter page and scrape its body text
        chapter_req = requests.get(url=chapter_url)
        chapter_req.encoding = 'gbk'
        chapter_soup = BeautifulSoup(chapter_req.text, "html.parser")
        # The tag that contains the chapter body
        content_tag = chapter_soup.div.find(id="content")
        # Get the body text, replacing non-breaking spaces with newlines
        content_text = str(content_tag.text.replace('\xa0', '\n'))
        # Write this chapter to a .txt file named after the chapter
        with open(dir_path + '/' + chapter_name + '.txt', 'w', encoding='utf-8') as f:
            f.write('Source URL: ' + chapter_url + '\n')
            f.write(content_text)
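        # Aside (my addition): pausing between chapter requests, e.g.
        # time.sleep(0.5) with the time module imported, is gentler on
        # the site and less likely to get the scraper blocked.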