#!/usr/bin/env python
"""Spider to try and find bugs in the parser. Requires httplib2 and elementtree

usage:
import spider
s = spider.Spider()
s.run("http://www.google.com", maxURLs=100)
"""

import urllib2
import urlparse
import robotparser
import md5

import httplib2

import html5lib
from html5lib.treebuilders import etree

class Spider(object):
    def __init__(self):
        self.unvisitedURLs = set()
        self.visitedURLs = set()
        self.buggyURLs = set()
        self.robotParser = robotparser.RobotFileParser()
        self.contentDigest = {}
        self.http = httplib2.Http(".cache")

    def run(self, initialURL, maxURLs=1000):
        urlNumber = 0
        self.visitedURLs.add(initialURL)
        content = self.loadURL(initialURL)
        while maxURLs is None or urlNumber < maxURLs:
            if content is not None:
                self.parse(content)
                urlNumber += 1
            if not self.unvisitedURLs:
                break
            content = self.loadURL(self.unvisitedURLs.pop())

    def parse(self, content):
        failed = False
        p = html5lib.HTMLParser(tree=etree.TreeBuilder)
        try:
            tree = p.parse(content)
        except Exception:
            # Any exception raised by the parser is recorded as a bug
            self.buggyURLs.add(self.currentURL)
            failed = True
            print "BUGGY:", self.currentURL
        self.visitedURLs.add(self.currentURL)
        if not failed:
            self.updateURLs(tree)

    def loadURL(self, url):
        resp, content = self.http.request(url, "GET")
        self.currentURL = url
        digest = md5.md5(content).hexdigest()
        if digest in self.contentDigest:
            content = None
            self.visitedURLs.add(url)
        else:
            self.contentDigest[digest] = url

        if resp['status'] != "200":
            content = None

        return content

    def updateURLs(self, tree):
        """Take all the links in the current document, extract the URLs and
        update the list of visited and unvisited URLs according to whether we
        have seen them before or not"""
        urls = set()
        # Remove all links we have already visited
        for link in tree.findall(".//a"):
            try:
                url = urlparse.urldefrag(link.attrib['href'])[0]
                if (url and url not in self.unvisitedURLs and
                        url not in self.visitedURLs):
                    urls.add(url)
            except KeyError:
                pass

        # Remove all non-http URLs and add a suitable base URL where that is
        # missing
        newUrls = set()
        for url in urls:
            splitURL = list(urlparse.urlsplit(url))
            if splitURL[0] != "http":
                continue
            if splitURL[1] == "":
                splitURL[1] = urlparse.urlsplit(self.currentURL)[1]
            newUrls.add(urlparse.urlunsplit(splitURL))
        urls = newUrls

        responseHeaders = {}
        # Now we want to find the content types of the links we haven't visited
        for url in urls:
            try:
                resp, content = self.http.request(url, "HEAD")
                responseHeaders[url] = resp
            except AttributeError:
                # Don't know why this happens
                pass

        # Remove links not of content-type html or pages not found
        # XXX - need to deal with other status codes?
        toVisit = set([url for url in urls if url in responseHeaders and
                       "html" in responseHeaders[url].get('content-type', '') and
                       responseHeaders[url]['status'] == "200"])

        # Now check we are allowed to spider the page
        disallowed = set()
        for url in toVisit:
            robotURL = list(urlparse.urlsplit(url)[:2])
            robotURL.extend(["robots.txt", "", ""])
            robotURL = urlparse.urlunsplit(robotURL)
            self.robotParser.set_url(robotURL)
            # set_url alone does not fetch robots.txt; read() downloads and parses it
            self.robotParser.read()
            if not self.robotParser.can_fetch("*", url):
                disallowed.add(url)
        # Collect disallowed URLs first; removing them from toVisit while
        # iterating over it would raise a RuntimeError
        toVisit -= disallowed

        self.visitedURLs.update(urls)
        self.unvisitedURLs.update(toVisit)
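

# Minimal command-line entry point, a sketch only: the start URL and the
# maxURLs value below are illustrative defaults (taken from the docstring's
# usage example), not anything the spider itself requires.
if __name__ == "__main__":
    import sys

    # Spider the URL given on the command line, or fall back to the example URL
    startURL = sys.argv[1] if len(sys.argv) > 1 else "http://www.google.com"
    s = Spider()
    s.run(startURL, maxURLs=100)
    # Report which URLs crashed the parser
    print "Spidered %d URLs, %d buggy" % (len(s.visitedURLs), len(s.buggyURLs))
    for url in sorted(s.buggyURLs):
        print "BUGGY:", url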