Commit 1b1df63

Add web spider to test for crashes on the real wild web

--HG--
extra : convert_revision : svn%3Aacbfec75-9323-0410-a652-858a13e371e0/trunk%40515
1 parent 659ecd5 commit 1b1df63


utils/spider.py

Lines changed: 123 additions & 0 deletions
@@ -0,0 +1,123 @@
#!/usr/bin/env python
"""Spider to try and find bugs in the parser. Requires httplib2 and elementtree.

usage:
import spider
s = spider.Spider()
s.run("http://www.google.com", maxURLs=100)
"""

import urlparse
import robotparser
import md5

import httplib2

import html5lib
from html5lib.treebuilders import etree

class Spider(object):
    def __init__(self):
        self.unvisitedURLs = set()
        self.visitedURLs = set()
        self.buggyURLs = set()
        self.robotParser = robotparser.RobotFileParser()
        self.contentDigest = {}
        # Cache responses on disk so repeat crawls don't re-fetch every page
        self.http = httplib2.Http(".cache")

    def run(self, initialURL, maxURLs=1000):
        urlNumber = 0
        self.visitedURLs.add(initialURL)
        content = self.loadURL(initialURL)
        # maxURLs=None means crawl with no upper bound
        while maxURLs is None or urlNumber < maxURLs:
            if content is not None:
                self.parse(content)
                urlNumber += 1
            if not self.unvisitedURLs:
                break
            content = self.loadURL(self.unvisitedURLs.pop())

    def parse(self, content):
        failed = False
        p = html5lib.HTMLParser(tree=etree.TreeBuilder)
        try:
            tree = p.parse(content)
        except:
            # The bare except is deliberate: any exception at all marks a
            # document that crashes the parser
            self.buggyURLs.add(self.currentURL)
            failed = True
            print "BUGGY:", self.currentURL
        self.visitedURLs.add(self.currentURL)
        if not failed:
            self.updateURLs(tree)

    def loadURL(self, url):
        resp, content = self.http.request(url, "GET")
        self.currentURL = url
        # Skip documents whose body we have already seen at another URL
        digest = md5.md5(content).hexdigest()
        if digest in self.contentDigest:
            content = None
            self.visitedURLs.add(url)
        else:
            self.contentDigest[digest] = url

        if resp['status'] != "200":
            content = None

        return content

    def updateURLs(self, tree):
        """Take all the links in the current document, extract the URLs and
        update the list of visited and unvisited URLs according to whether we
        have seen them before or not"""
        urls = set()
        # Collect all links we have not already visited
        for link in tree.findall(".//a"):
            try:
                url = urlparse.urldefrag(link.attrib['href'])[0]
                if (url and url not in self.unvisitedURLs and url
                    not in self.visitedURLs):
                    urls.add(url)
            except KeyError:
                pass

        # Remove all non-http URLs and add a suitable base URL where that is
        # missing
        newUrls = set()
        for url in urls:
            splitURL = list(urlparse.urlsplit(url))
            if splitURL[0] != "http":
                continue
            if splitURL[1] == "":
                splitURL[1] = urlparse.urlsplit(self.currentURL)[1]
            newUrls.add(urlparse.urlunsplit(splitURL))
        urls = newUrls

        responseHeaders = {}
        # Now we want to find the content types of the links we haven't visited
        for url in urls:
            try:
                resp, content = self.http.request(url, "HEAD")
                responseHeaders[url] = resp
            except AttributeError:
                # Don't know why this happens
                pass

        # Remove links not of content-type html or pages not found
        # XXX - need to deal with other status codes?
        # .get() guards against responses with no content-type header
        toVisit = set([url for url in urls if url in responseHeaders and
                       "html" in responseHeaders[url].get('content-type', '') and
                       responseHeaders[url]['status'] == "200"])

        # Now check we are allowed to spider the page
        # Iterate over a copy: removing from a set while iterating over it
        # raises RuntimeError
        for url in list(toVisit):
            robotURL = list(urlparse.urlsplit(url)[:2])
            robotURL.extend(["robots.txt", "", ""])
            robotURL = urlparse.urlunsplit(robotURL)
            self.robotParser.set_url(robotURL)
            # read() actually fetches and parses robots.txt; without it
            # can_fetch() always allows the URL
            self.robotParser.read()
            if not self.robotParser.can_fetch("*", url):
                toVisit.remove(url)

        self.visitedURLs.update(urls)
        self.unvisitedURLs.update(toVisit)
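
To exercise the new module by hand, a minimal session might look like the following. This is a sketch, not part of the commit: it assumes spider.py is importable and that httplib2 and html5lib are installed; the seed URL is just an example.

import spider

s = spider.Spider()
# Crawl at most 100 pages starting from the seed URL; any page whose
# markup crashes the html5lib parser is recorded in s.buggyURLs
s.run("http://www.google.com", maxURLs=100)

for url in s.buggyURLs:
    print "parser crashed on:", url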
