Skip to content

Commit c4b697b

Browse files
committed
commit
1 parent 84e5031 commit c4b697b

File tree

2 files changed

+160
-17
lines changed

2 files changed

+160
-17
lines changed

.idea/workspace.xml

Lines changed: 44 additions & 17 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.
Lines changed: 116 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,116 @@
1+
import json
2+
import random
3+
import time
4+
import pymongo
5+
import requests
6+
from bs4 import BeautifulSoup
7+
from lxml import etree
8+
9+
# MongoDB handles shared by the crawler functions below.
# 'localhost' is passed as the host; no port is given here.
clients = pymongo.MongoClient('localhost')
db = clients.get_database("XiMaLaYa")
# col1 receives one document per album found on the listing pages.
col1 = db.get_collection("album2")
# col2 receives the JSON document fetched for each individual track.
col2 = db.get_collection("detaile2")
13+
14+
# Pool of browser User-Agent strings; one is picked with random.choice()
# when the request header dictionaries are built.
# Every entry is unique so that each string is equally likely to be chosen
# (repeated entries would otherwise be picked proportionally more often).
UA_LIST = [
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
    "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
    "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SE 2.X MetaSr 1.0; SE 2.X MetaSr 1.0; .NET CLR 2.0.50727; SE 2.X MetaSr 1.0)",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
]
51+
def _build_headers(referer=None):
    """Return the browser-like request headers used by the crawler.

    When *referer* is given, a 'Referer' entry is inserted between
    'Proxy-Connection' and 'Upgrade-Insecure-Requests'.
    """
    built = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, sdch',
        'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6',
        'Cache-Control': 'max-age=0',
        'Proxy-Connection': 'keep-alive',
    }
    if referer is not None:
        built['Referer'] = referer
    built['Upgrade-Insecure-Requests'] = '1'
    built['User-Agent'] = random.choice(UA_LIST)
    return built


# NOTE: both dictionaries are built once, when the module is loaded, so the
# randomly chosen User-Agent of each stays fixed for the rest of the run.
headers1 = _build_headers()
headers2 = _build_headers(referer='http://www.ximalaya.com/dq/all/2')
70+
71+
72+
def get_url():
    """Crawl the album listing pages and store every album that is found.

    Listing pages 1 through 84 of ``http://www.ximalaya.com/dq/all/`` are
    fetched in order. For each element with class ``albumfaceOutter`` a
    document holding the album link, title and cover image URL is written
    to ``col1``, and the album page is then handed to ``another`` so its
    tracks get collected.

    Entries without a link or an image are skipped instead of aborting
    the whole crawl.
    """
    listing_urls = ['http://www.ximalaya.com/dq/all/{}'.format(num) for num in range(1, 85)]
    for start_url in listing_urls:
        # A timeout keeps one unresponsive request from stalling the crawl forever.
        html = requests.get(start_url, headers=headers1, timeout=10).text
        soup = BeautifulSoup(html, 'lxml')
        for item in soup.find_all(class_="albumfaceOutter"):
            link, cover = item.a, item.img
            if link is None or cover is None or not link.get('href'):
                # Malformed entry: nothing usable to store or to follow.
                continue
            href = link['href']
            content = {
                'href': href,
                'title': cover.get('alt'),
                'img_url': cover.get('src'),
            }
            # Collection.insert() was deprecated in PyMongo 3.0 and removed
            # in 4.0; insert_one() is the single-document replacement.
            col1.insert_one(content)
            print('写入一个频道' + href)
            print(content)
            another(href)
        # Pause between listing pages to limit the request rate.
        time.sleep(1)
88+
89+
90+
def another(url):
    """Visit every page of one album and collect the tracks on each.

    The album page at *url* is fetched and its paging bar inspected: the
    ``data-page`` attribute of the second-to-last link is read as the
    number of pages. When a paging bar is present, each page from 1 up to
    and including that number is passed to ``get_m4a`` exactly once as
    ``url?page=N``. When there is no paging bar, *url* itself is passed to
    ``get_m4a``.

    Args:
        url: Address of the album page.

    Raises:
        ValueError: If the ``data-page`` value is not an integer.
    """
    html = requests.get(url, headers=headers2, timeout=10).text
    page_counts = etree.HTML(html).xpath('//div[@class="pagingBar_wrapper"]/a[last()-1]/@data-page')
    if not page_counts:
        # No paging bar: the album fits on a single page.
        get_m4a(url)
        return

    num = page_counts[0]
    print('本频道资源存在' + num + '个页面')
    # The upper bound is inclusive so the last page is processed too; the
    # bare URL is not fetched again because ?page=1 already covers it.
    for n in range(1, int(num) + 1):
        print('开始解析{}个中的第{}个页面'.format(num, n))
        get_m4a(url + '?page={}'.format(n))
101+
102+
103+
def get_m4a(url):
    """Store the JSON description of every track listed on one album page.

    After a one-second pause the page at *url* is fetched and the
    comma-separated ``sound_ids`` attribute of the ``personal_body`` div is
    read. For each id, ``http://www.ximalaya.com/tracks/<id>.json`` is
    downloaded, decoded and inserted into ``col2``.

    A page without a ``sound_ids`` attribute is reported and skipped, and
    blank ids are ignored.

    Args:
        url: Address of the album page (optionally with a ``?page=N`` query).
    """
    time.sleep(1)  # throttle: one album page per second at most
    page = requests.get(url, headers=headers2, timeout=10).text
    id_attrs = etree.HTML(page).xpath('//div[@class="personal_body"]/@sound_ids')
    if not id_attrs:
        # Indexing [0] on an empty result would raise IndexError.
        print('未找到sound_ids，跳过页面' + url)
        return

    for raw_id in id_attrs[0].split(','):
        track_id = raw_id.strip()
        if not track_id:
            # An empty attribute splits into [''], which would request tracks/.json.
            continue
        murl = 'http://www.ximalaya.com/tracks/{}.json'.format(track_id)
        body = requests.get(murl, headers=headers1, timeout=10).text
        dic = json.loads(body)
        # Collection.insert() was deprecated in PyMongo 3.0 and removed in
        # 4.0; insert_one() is the single-document replacement.
        col2.insert_one(dic)
        print(murl + '中的数据已被成功插入mongodb')
113+
114+
115+
if __name__ == "__main__":
    # Start the crawl only when this file is executed as a script.
    get_url()

0 commit comments

Comments
 (0)