Skip to content

Commit fb6a92e

Browse files
committed
no message
1 parent 0ec080b commit fb6a92e

File tree

1 file changed

+59
-0
lines changed

1 file changed

+59
-0
lines changed

moumoubaimifan/sjjy/sjjy.py

Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,59 @@
1+
# coding:utf-8
2+
import csv
3+
import json
4+
5+
import requests
6+
from openpyxl.cell.cell import ILLEGAL_CHARACTERS_RE
7+
import re
8+
9+
# Running total of user records handled by parseHtml() across all calls.
line_index = 0
10+
11+
def fetchURL(url, timeout=10):
    """Download ``url`` with an HTTP GET and return its body as GBK-safe text.

    Args:
        url: Address to request.
        timeout: Seconds ``requests`` may wait on the server (connect and
            read) before giving up. Defaults to 10.

    Returns:
        The decoded response text with every character that cannot be
        represented in GBK silently dropped.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If the server does not respond within ``timeout``.
    """
    # NOTE(review): the Cookie below looks like a captured browser session;
    # presumably it expires and has to be refreshed by hand -- confirm.
    headers = {
        'accept': '*/*',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.75 Safari/537.36',
        'Cookie': 'guider_quick_search=on; accessID=20201021004216238222; PHPSESSID=11117cc60f4dcafd131b69d542987a46; is_searchv2=1; SESSION_HASH=8f93eeb87a87af01198f418aa59bccad9dbe5c13; user_access=1; Qs_lvt_336351=1603457224; Qs_pv_336351=4391272815204901400%2C3043552944961503700',
    }

    # requests never times out on its own; without an explicit timeout a
    # single stalled connection would block the whole crawl indefinitely.
    r = requests.get(url, headers=headers, timeout=timeout)
    r.raise_for_status()

    # Round-trip through GBK with errors ignored so the returned text only
    # contains characters that can later be stored in a GBK-encoded file.
    return r.text.encode("gbk", "ignore").decode("gbk", "ignore")
22+
23+
24+
def parseHtml(html):
    """Parse one search-result payload and append its users to ``sjjy.csv``.

    The payload is cleaned (all backslashes and every character matched by
    ``ILLEGAL_CHARACTERS_RE`` are removed), decoded as JSON, and one CSV row
    is written per entry of its ``userInfo`` list. The module-level
    ``line_index`` counter is incremented once per entry.

    Args:
        html: Raw response text expected to hold a JSON object with a
            ``userInfo`` list.

    Raises:
        json.JSONDecodeError: If the cleaned text is not valid JSON.
        KeyError: If ``userInfo`` or one of the per-user fields is missing.
    """
    global line_index

    # Backslashes are dropped wholesale before decoding; note this also
    # removes legitimate JSON escape sequences from the payload.
    html = html.replace('\\', '')
    html = ILLEGAL_CHARACTERS_RE.sub(r'', html)
    # strict=False lets json accept raw control characters inside strings.
    s = json.loads(html, strict=False)

    userInfo = []
    for key in s['userInfo']:
        line_index = line_index + 1
        a = (
            key['uid'],
            key['nickname'],
            key['age'],
            key['work_location'],
            key['height'],
            key['education'],
            key['matchCondition'],
            key['marriage'],
            # Keep every record on a single physical CSV line.
            key['shortnote'].replace('\n', ' '),
        )
        userInfo.append(a)

    # filterData() reads this file back as GBK, so write it as GBK explicitly
    # instead of relying on the platform's default encoding. Unencodable
    # characters are dropped, the same lossy policy fetchURL() applies.
    with open('sjjy.csv', 'a', newline='', encoding='gbk', errors='ignore') as f:
        writer = csv.writer(f)
        writer.writerows(userInfo)
40+
41+
def filterData():
    """Count the distinct values in the first column of ``sjjy.csv``.

    The file is read as GBK-encoded CSV. A progress line is printed for
    every row, and the number of distinct first-column values is printed
    once the whole file has been read. Rows with no fields (blank lines)
    are counted in the progress output but otherwise skipped.

    Raises:
        FileNotFoundError: If ``sjjy.csv`` does not exist in the working
            directory.
    """
    # A set gives O(1) membership tests; the result is only ever counted,
    # so insertion order does not matter.
    seen_ids = set()

    # newline='' is what the csv module expects for files it parses, and
    # the with-block guarantees the handle is closed.
    with open("sjjy.csv", encoding='gbk', newline='') as f:
        for i, row in enumerate(csv.reader(f), start=1):
            print('正在处理:' + str(i) + '行')
            if not row:
                # csv.reader yields [] for a blank line; there is no id.
                continue
            seen_ids.add(row[0])

    print(len(seen_ids))
51+
52+
if __name__ == '__main__':
    # Crawl step, currently disabled: fetches search-result pages 1..9999
    # and appends each page's users to sjjy.csv via parseHtml(). Re-enable
    # it to rebuild the CSV before running the de-duplication count below.
    # for i in range(1, 10000):
    #     url = 'http://search.jiayuan.com/v2/search_v2.php?key=&sex=f&stc=23:1,2:20.30&sn=default&sv=1&p=' + str(i) + '&f=select&listStyle=bigPhoto'
    #     html = fetchURL(url)
    #     print(str(i) + '页' + str(len(html)) + '*********' * 20)
    #     parseHtml(html)

    # Report how many distinct user ids the existing sjjy.csv contains.
    filterData()

0 commit comments

Comments
 (0)