From bd5cdadba232fbc5a2910f4bec5f0c9ac109d592 Mon Sep 17 00:00:00 2001
From: Ankit Sangwan
Date: Sat, 5 Sep 2020 19:40:50 +0530
Subject: [PATCH] Replace the hardcoded URL and add a docstring

---
 scripts/09_basic_link_web_crawler.py | 23 +++++++++++++++++------
 1 file changed, 17 insertions(+), 6 deletions(-)

diff --git a/scripts/09_basic_link_web_crawler.py b/scripts/09_basic_link_web_crawler.py
index 87e2fab..9125cc7 100755
--- a/scripts/09_basic_link_web_crawler.py
+++ b/scripts/09_basic_link_web_crawler.py
@@ -10,25 +10,36 @@
 def crawl(url):
-
+    """
+    Crawls a page
+    Arguments:
+    - URL of the page to crawl
+    Return:
+    - List of all unique links found
+    """
+
+    found_link = []
     req = requests.get(url)
 
     # Check if successful
     if(req.status_code != 200):
         return []
 
-    # Find links
-    links = link_re.findall(req.text)
+    # Finding unique links
+    links = set(link_re.findall(req.text))
 
-    print("\nFound {} links".format(len(links)))
+    print("\nFound {} unique links".format(len(links)))
 
     # Search links for emails
     for link in links:
 
         # Get an absolute URL for a link
         link = urljoin(url, link)
-
+        found_link.append(link)
         print(link)
+
+    return found_link
 
 
 if __name__ == '__main__':
-    crawl('http://www.realpython.com')
+    url = input("Enter a url to crawl: ")
+    crawl(url)
 