Skip to content

Commit 91dd115

Browse files
committed
Added Web Crawler Multithreaded.java
1 parent a89344a commit 91dd115

File tree

1 file changed

+108
-0
lines changed

1 file changed

+108
-0
lines changed
Lines changed: 108 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,108 @@
1+
/**
2+
* // This is the HtmlParser's API interface.
3+
* // You should not implement it, or speculate about its implementation
4+
* interface HtmlParser {
5+
* public List<String> getUrls(String url) {}
6+
* }
7+
*/
8+
import java.net.URI;
9+
import java.net.URISyntaxException;
10+
11+
12+
class Solution {
13+
public List<String> crawl(String startUrl, HtmlParser htmlParser) {
14+
ResultRecord resultRecord = new ResultRecord();
15+
CrawlTask task = new CrawlTask(
16+
startUrl, htmlParser, resultRecord, UrlUtil.parseHostname(startUrl));
17+
try {
18+
task.start();
19+
task.join();
20+
} catch (InterruptedException e) {
21+
e.printStackTrace();
22+
}
23+
return resultRecord.getResultList();
24+
}
25+
}
26+
27+
class CrawlTask extends Thread {
28+
29+
private final String url;
30+
private final HtmlParser htmlParser;
31+
private final ResultRecord resultRecord;
32+
private final String parentHost;
33+
34+
public CrawlTask(String url,
35+
HtmlParser htmlParser,
36+
ResultRecord resultRecord,
37+
String parentHost) {
38+
this.url = url;
39+
this.htmlParser = htmlParser;
40+
this.resultRecord = resultRecord;
41+
this.parentHost = parentHost;
42+
}
43+
44+
public void run() {
45+
String hostname = UrlUtil.parseHostname(url);
46+
if (!hostname.equals(parentHost)) {
47+
return;
48+
}
49+
if (resultRecord.addIfNotExists(url)) {
50+
List<String> childUrls = htmlParser.getUrls(url);
51+
List<CrawlTask> tasks = new ArrayList<>();
52+
for (String childUrl : childUrls) {
53+
tasks.add(new CrawlTask(
54+
childUrl, htmlParser, resultRecord, parentHost));
55+
}
56+
try {
57+
for (CrawlTask task : tasks) {
58+
task.start();
59+
}
60+
for (CrawlTask task : tasks) {
61+
task.join();
62+
}
63+
} catch (InterruptedException e) {
64+
e.printStackTrace();
65+
}
66+
}
67+
}
68+
}
69+
70+
/**
 * URL helper functions.
 */
class UrlUtil {

    /**
     * Extracts the host component of {@code url}.
     *
     * @param url the URL text to parse
     * @return the host reported by {@link URI#getHost()}, or {@code null} when the
     *         text is not a syntactically valid URI (the parse error is printed
     *         to standard error) or the URI has no host
     */
    public static String parseHostname(String url) {
        String host = null;
        try {
            host = new URI(url).getHost();
        } catch (URISyntaxException syntaxError) {
            syntaxError.printStackTrace();
        }
        return host;
    }
}
82+
83+
/**
 * Set of visited URLs shared between crawl threads. Insertions are serialized
 * through a single-permit {@link Semaphore} used as a mutex.
 */
class ResultRecord {

    private final Set<String> urls;
    private final Semaphore mutex;

    public ResultRecord() {
        this.urls = new HashSet<>();
        this.mutex = new Semaphore(1);
    }

    /**
     * Adds {@code url} to the record unless it is already present.
     *
     * @param url the URL to record
     * @return {@code true} if the URL was newly added; {@code false} if it was
     *         already present, or if the calling thread was interrupted while
     *         waiting for the mutex (in which case nothing is added and the
     *         thread's interrupt status is restored)
     */
    public boolean addIfNotExists(String url) {
        try {
            this.mutex.acquire();
        } catch (InterruptedException e) {
            // acquire() clears the interrupt status when it throws; set it again
            // so callers can still detect the interruption.
            Thread.currentThread().interrupt();
            return false;
        }
        try {
            return this.urls.add(url);
        } finally {
            // Always hand the permit back, even if add() throws, so other
            // threads are never blocked forever.
            this.mutex.release();
        }
    }

    /**
     * Returns a new list containing the recorded URLs.
     *
     * <p>NOTE(review): this read does not take the mutex; it appears intended to
     * be called only after all crawl threads have been joined — confirm.
     *
     * @return a fresh {@link ArrayList} copy of the recorded URLs
     */
    public List<String> getResultList() {
        return new ArrayList<>(urls);
    }
}

0 commit comments

Comments
 (0)