File tree Expand file tree Collapse file tree 1 file changed +37
-0
lines changed Expand file tree Collapse file tree 1 file changed +37
-0
lines changed Original file line number Diff line number Diff line change
1
+ #!/bin/env python
2
+ # -*- coding: utf-8 -*-
3
+
4
+ import urllib2
5
+ from pyquery import PyQuery as pq
6
+ from lxml import etree
7
+ import re
8
+
9
+
10
def get_html(url):
    """Download ``url`` and return the raw response body.

    Args:
        url: Address to fetch; passed unchanged to ``urlopen``.

    Returns:
        Whatever ``read()`` yields for the opened response (the undecoded
        body).

    Raises:
        Any error raised by ``urlopen`` or ``read`` propagates unchanged.
    """
    # Resolve urlopen locally so the function works on both Python 2
    # (urllib2) and Python 3 (urllib.request).
    try:
        from urllib2 import urlopen
    except ImportError:
        from urllib.request import urlopen

    response = urlopen(url)
    try:
        return response.read()
    finally:
        # Release the underlying connection even if read() fails.
        response.close()
13
+
14
def get_content(html):
    """Return the markup between the content and copyright ``<div>`` tags.

    The captured span starts right after ``<div class="content">`` and ends
    right before ``<div class="article-copyright">``. The match is greedy,
    so it runs up to the last copyright marker in ``html``.

    Args:
        html: Page source, either text or a byte string.

    Returns:
        The captured markup, of the same type (text or bytes) as ``html``.

    Raises:
        ValueError: If the two marker tags are not found in ``html``.
    """
    # '.' with DOTALL matches every character including newlines; the
    # previous '[^$]' class silently rejected pages containing a '$'.
    pattern = r'<div class="content">(.*)<div class="article-copyright">'
    # A text pattern cannot search bytes on Python 3, so match the input type.
    if isinstance(html, bytes) and not isinstance(pattern, bytes):
        pattern = pattern.encode('ascii')

    match = re.search(pattern, html, re.DOTALL)
    if match is None:
        raise ValueError('article content markers not found in html')
    return match.group(1)
18
+
19
def get_result(content):
    """Return the text of every ``<p>`` element found in ``content``.

    Args:
        content: HTML fragment, either text or a UTF-8 encoded byte string.

    Returns:
        A list with one entry per ``<p>`` element, each being the value of
        ``pq(element).text()``, in the order the elements are selected.
    """
    # Decode only byte strings: calling .decode() on already-decoded text
    # fails on Python 3 and forces an implicit ASCII encode on Python 2.
    if isinstance(content, bytes):
        content = content.decode('utf-8')

    paragraphs = pq(content)('p')
    return [pq(paragraph).text() for paragraph in paragraphs]
27
+
28
+
29
def main(url='http://jimmy66.com/164.html'):
    """Fetch a page, extract its article paragraphs and print them.

    Args:
        url: Page to scrape. Defaults to the address this script was
            originally hard-wired to.
    """
    html = get_html(url)
    content = get_content(html)
    for line in get_result(content):
        # Parenthesized single-argument form prints identically on
        # Python 2 and is valid syntax on Python 3.
        print(line)
36
# Run the scraper only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
You can’t perform that action at this time.
0 commit comments