from requests_html import HTMLSession
from urllib.parse import urlparse, urljoin
from bs4 import BeautifulSoup
import colorama

# init the colorama module
colorama.init()

GREEN = colorama.Fore.GREEN
GRAY = colorama.Fore.LIGHTBLACK_EX
RESET = colorama.Fore.RESET

# initialize the set of links (unique links)
internal_urls = set()
external_urls = set()

total_urls_visited = 0


def is_valid(url):
    """
    Checks whether `url` is a valid URL.
    """
    parsed = urlparse(url)
    return bool(parsed.netloc) and bool(parsed.scheme)
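
# Illustrative note (not part of the original script): with the urlparse-based check
# above, only absolute URLs carrying both a scheme and a domain pass, e.g.
#   is_valid("https://example.com/page")  -> True   (has scheme and netloc)
#   is_valid("/just/a/relative/path")     -> False  (urlparse finds neither)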


def get_all_website_links(url):
    """
    Returns all URLs found on `url` that belong to the same website.
    """
    # all URLs of `url`
    urls = set()
    # domain name of the URL without the protocol
    domain_name = urlparse(url).netloc
    # initialize an HTTP session
    session = HTMLSession()
    # make HTTP request & retrieve response
    response = session.get(url)
    # execute Javascript
    try:
        response.html.render()
    except Exception:
        pass
    soup = BeautifulSoup(response.html.html, "html.parser")
    for a_tag in soup.find_all("a"):
        href = a_tag.attrs.get("href")
        if href == "" or href is None:
            # href empty tag
            continue
        # join the URL if it's relative (not absolute link)
        href = urljoin(url, href)
        parsed_href = urlparse(href)
        # remove URL GET parameters, URL fragments, etc.
        href = parsed_href.scheme + "://" + parsed_href.netloc + parsed_href.path
        if not is_valid(href):
            # not a valid URL
            continue
        if href in internal_urls:
            # already in the set
            continue
        if domain_name not in href:
            # external link
            if href not in external_urls:
                print(f"{GRAY}[!] External link: {href}{RESET}")
                external_urls.add(href)
            continue
        print(f"{GREEN}[*] Internal link: {href}{RESET}")
        urls.add(href)
        internal_urls.add(href)
    return urls
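
# Usage sketch (illustrative only; "https://example.com" is a placeholder URL):
#   page_links = get_all_website_links("https://example.com")
# `page_links` holds the same-domain URLs found on that page; as a side effect the
# global internal_urls and external_urls sets are updated and each new link is printed.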


def crawl(url, max_urls=30):
    """
    Crawls a web page and extracts all links.
    You'll find all links in `external_urls` and `internal_urls` global set variables.
    params:
        max_urls (int): maximum number of URLs to crawl, default is 30.
    """
    global total_urls_visited
    total_urls_visited += 1
    links = get_all_website_links(url)
    for link in links:
        if total_urls_visited > max_urls:
            break
        crawl(link, max_urls=max_urls)


if __name__ == "__main__":
    import argparse
    parser = argparse.ArgumentParser(description="Link Extractor Tool with Python")
    parser.add_argument("url", help="The URL to extract links from.")
    parser.add_argument("-m", "--max-urls", help="Maximum number of URLs to crawl, default is 30.", default=30, type=int)

    args = parser.parse_args()
    url = args.url
    max_urls = args.max_urls

    crawl(url, max_urls=max_urls)

    print("[+] Total Internal links:", len(internal_urls))
    print("[+] Total External links:", len(external_urls))
    print("[+] Total URLs:", len(external_urls) + len(internal_urls))

    domain_name = urlparse(url).netloc

    # save the internal links to a file
    with open(f"{domain_name}_internal_links.txt", "w") as f:
        for internal_link in internal_urls:
            print(internal_link.strip(), file=f)

    # save the external links to a file
    with open(f"{domain_name}_external_links.txt", "w") as f:
        for external_link in external_urls:
            print(external_link.strip(), file=f)
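
# Example invocation (the filename "link_extractor.py" is an assumption; it does not
# appear in this listing):
#   python link_extractor.py https://www.example.com --max-urls 50
# This crawls pages starting from the given URL until about 50 URLs have been visited,
# then writes the collected links to www.example.com_internal_links.txt and
# www.example.com_external_links.txt.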