Skip to content

Commit 7611cec

Browse files
committed
complete 0009
准备补编码坑和解析坑
1 parent 544e063 commit 7611cec

File tree

5 files changed

+939
-1
lines changed

5 files changed

+939
-1
lines changed

Jimmy66/0006/0006.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ def list1(string):
1212
def file_read(filename):
    """Return the entire contents of *filename* as one string.

    The file is opened in text mode ('r') and closed automatically when the
    ``with`` block exits.
    """
    with open(filename, 'r') as fp:
        return fp.read()
1616

1717
#计算出出现最多的单词
1818
def most_word_number(word_list):

Jimmy66/0009/0009.py

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
#!/bin/env python
2+
# -*- coding: utf-8 -*-
3+
4+
#也不清楚这里说的链接是什么定义,是指a标签还是所有href的链接,这里取后者
5+
6+
#导入模块
7+
import re
8+
import urllib2
9+
10+
#读取文件
11+
def file_read(filename):
    """Return the raw, undecoded contents of the file at *filename*.

    The original author noted that opening the HTML the way earlier
    exercises did failed, apparently because of an encoding problem, and
    worked around it by fetching a hard-coded ``file:`` URL through urllib2.
    That workaround ignored *filename* and never closed the response.

    Reading in binary mode gives the bytes exactly as stored on disk, with no
    decoding or newline translation, while honouring the argument and
    releasing the handle. Decoding is left to the caller.

    Raises IOError (OSError on Python 3) if the file cannot be opened.
    """
    with open(filename, 'rb') as fp:
        return fp.read()
17+
18+
#查找链接,返回列表
19+
def link_find(html):
    """Return all absolute http(s) URLs used as href values in *html*.

    "Link" here means any ``href`` attribute, not only those on ``<a>`` tags.
    Matching is case-insensitive, tolerates whitespace around ``=`` and
    accepts either double- or single-quoted values. Relative links and other
    schemes are ignored. URLs are returned in document order and duplicates
    are kept.
    """
    double_quoted = r'"(https?:[^"]+)"'
    single_quoted = r"'(https?:[^']+)'"
    pattern = r'href\s*=\s*(?:' + double_quoted + '|' + single_quoted + ')'
    # With two capture groups findall yields (double, single) tuples in which
    # exactly one element is non-empty; keep whichever one matched.
    return [dq or sq for dq, sq in re.findall(pattern, html, re.IGNORECASE)]
22+
23+
#主函数,显示链接列表
24+
def main():
    """Print every http(s) link found in the saved page, one per line."""
    html = file_read('Yixiaohan show-me-the-code.html')
    for url in link_find(html):
        # The single-argument parenthesised form prints the same on
        # Python 2 and Python 3.
        print(url)


if __name__ == '__main__':
    main()

0 commit comments

Comments
 (0)