Skip to content

Commit bd5cdad

Browse files
committed
Changed the hardcoded part and added Docstring
1 parent cb448c2 commit bd5cdad

File tree

1 file changed

+17
-6
lines changed

1 file changed

+17
-6
lines changed

scripts/09_basic_link_web_crawler.py

Lines changed: 17 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -10,25 +10,36 @@
1010

1111

1212
def crawl(url):
13-
13+
"""
14+
Crawls a page
15+
Arguments:
16+
- URL of the page to crawl
17+
Return:
18+
- List of all unique links found
19+
"""
20+
21+
found_link = []
1422
req = requests.get(url)
1523

1624
# Check if successful
1725
if(req.status_code != 200):
1826
return []
1927

20-
# Find links
21-
links = link_re.findall(req.text)
28+
# Finding unique links
29+
links = set(link_re.findall(req.text))
2230

23-
print("\nFound {} links".format(len(links)))
31+
print("\nFound {} unique links".format(len(links)))
2432

2533
# Search links for emails
2634
for link in links:
2735

2836
# Get an absolute URL for a link
2937
link = urljoin(url, link)
30-
38+
found_link.append(link)
3139
print(link)
40+
41+
return found_link
3242

3343
if __name__ == '__main__':
34-
crawl('http://www.realpython.com')
44+
url = input("Enter a url to crawl: ")
45+
crawl(url)

0 commit comments

Comments
 (0)