Commit 2760d68

added javascript execution to link extractor tool
1 parent ddf7fe5 commit 2760d68
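
The change in this commit is the JavaScript-execution step: the crawler now renders each page with requests_html before extracting its links. As a minimal standalone sketch of just that step (the URL below is a placeholder, not part of the commit):

from requests_html import HTMLSession

# fetch a page and execute its JavaScript before reading the DOM
session = HTMLSession()
response = session.get("https://example.com")  # placeholder URL
# render() runs the page's scripts in headless Chromium; on first use,
# requests_html downloads Chromium via pyppeteer, which can take a while
response.html.render()
html_after_js = response.html.html  # the DOM after script execution
session.close()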

File tree

1 file changed: +117 -0 lines changed

Lines changed: 117 additions & 0 deletions
@@ -0,0 +1,117 @@
from requests_html import HTMLSession
from urllib.parse import urlparse, urljoin
from bs4 import BeautifulSoup
import colorama

# init the colorama module
colorama.init()

GREEN = colorama.Fore.GREEN
GRAY = colorama.Fore.LIGHTBLACK_EX
RESET = colorama.Fore.RESET

# initialize the sets of links (unique links)
internal_urls = set()
external_urls = set()

total_urls_visited = 0


def is_valid(url):
    """
    Checks whether `url` is a valid URL.
    """
    parsed = urlparse(url)
    return bool(parsed.netloc) and bool(parsed.scheme)


def get_all_website_links(url):
    """
    Returns all URLs found on `url` that belong to the same website.
    """
    # all URLs of `url`
    urls = set()
    # domain name of the URL without the protocol
    domain_name = urlparse(url).netloc
    # initialize an HTTP session
    session = HTMLSession()
    # make HTTP request & retrieve response
    response = session.get(url)
    # execute JavaScript
    try:
        response.html.render()
    except Exception:
        # rendering can fail (e.g. timeouts); fall back to the unrendered HTML
        pass
    soup = BeautifulSoup(response.html.html, "html.parser")
    for a_tag in soup.findAll("a"):
        href = a_tag.attrs.get("href")
        if href == "" or href is None:
            # empty href attribute
            continue
        # join the URL if it's relative (not an absolute link)
        href = urljoin(url, href)
        parsed_href = urlparse(href)
        # remove URL GET parameters, URL fragments, etc.
        href = parsed_href.scheme + "://" + parsed_href.netloc + parsed_href.path
        if not is_valid(href):
            # not a valid URL
            continue
        if href in internal_urls:
            # already in the set
            continue
        if domain_name not in href:
            # external link
            if href not in external_urls:
                print(f"{GRAY}[!] External link: {href}{RESET}")
                external_urls.add(href)
            continue
        print(f"{GREEN}[*] Internal link: {href}{RESET}")
        urls.add(href)
        internal_urls.add(href)
    return urls


def crawl(url, max_urls=50):
    """
    Crawls a web page and extracts all links.
    You'll find all links in the `external_urls` and `internal_urls` global set variables.
    params:
        max_urls (int): maximum number of URLs to crawl, default is 50.
    """
    global total_urls_visited
    total_urls_visited += 1
    links = get_all_website_links(url)
    for link in links:
        if total_urls_visited > max_urls:
            break
        crawl(link, max_urls=max_urls)


if __name__ == "__main__":
    import argparse
    parser = argparse.ArgumentParser(description="Link Extractor Tool with Python")
    parser.add_argument("url", help="The URL to extract links from.")
    parser.add_argument("-m", "--max-urls", help="Number of max URLs to crawl, default is 30.", default=30, type=int)

    args = parser.parse_args()
    url = args.url
    max_urls = args.max_urls

    crawl(url, max_urls=max_urls)

    print("[+] Total Internal links:", len(internal_urls))
    print("[+] Total External links:", len(external_urls))
    print("[+] Total URLs:", len(external_urls) + len(internal_urls))

    domain_name = urlparse(url).netloc

    # save the internal links to a file
    with open(f"{domain_name}_internal_links.txt", "w") as f:
        for internal_link in internal_urls:
            print(internal_link.strip(), file=f)

    # save the external links to a file
    with open(f"{domain_name}_external_links.txt", "w") as f:
        for external_link in external_urls:
            print(external_link.strip(), file=f)
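
For reference, the tool might be invoked as follows once the file is saved; the script name link_extractor.py is an assumption, since the file path is not shown in this view:

python link_extractor.py https://example.com -m 50

Internal and external links are printed as they are discovered, and the script writes them to {domain}_internal_links.txt and {domain}_external_links.txt in the working directory.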
