Commit 19a5cd2

Keeping the CrawledUrlRepository as a member of the WebCrawler and adding a validation when scheduling links so that each link is checked only once.
1 parent: 2bcd341

2 files changed: 39 additions & 5 deletions

Abot.Tests.Unit/Crawler/WebCrawlerTest.cs

Lines changed: 29 additions & 1 deletion
@@ -1,5 +1,4 @@
 using Abot.Core;
-using Abot.Core;
 using Abot.Crawler;
 using Abot.Poco;
 using Abot.Util;
@@ -1091,6 +1090,35 @@ public void Crawl_CrawlHasExceededMaxMemoryUsageInMb_CrawlIsStoppedBeforeComplet
     Assert.IsTrue(result.CrawlContext.IsCrawlHardStopRequested);
 }
 
+[Test]
+public void Crawl_ExtractedLinksAreNotCheckedTwice()
+{
+    Uri fakeLink1 = new Uri("http://a.com/someUri");
+    Uri fakeLink2 = new Uri("http://a.com/someOtherUri");
+    Uri fakeLink3 = new Uri("http://a.com/anotherOne");
+    CrawledPage homePage = new CrawledPage(_rootUri);
+    CrawledPage page1 = new CrawledPage(fakeLink1);
+    CrawledPage page2 = new CrawledPage(fakeLink2);
+
+    // All links are found on every crawled page.
+    _fakeHyperLinkParser.Setup(parser => parser.GetLinks(It.IsAny<CrawledPage>())).Returns(new [] { fakeLink1, fakeLink2, fakeLink3 });
+
+    _fakeHttpRequester.Setup(f => f.MakeRequest(_rootUri, It.IsAny<Func<CrawledPage, CrawlDecision>>())).Returns(homePage);
+    _fakeHttpRequester.Setup(f => f.MakeRequest(fakeLink1, It.IsAny<Func<CrawledPage, CrawlDecision>>())).Returns(page1);
+    _fakeHttpRequester.Setup(f => f.MakeRequest(fakeLink2, It.IsAny<Func<CrawledPage, CrawlDecision>>())).Returns(page2);
+    _fakeCrawlDecisionMaker.Setup(f => f.ShouldCrawlPage(It.IsAny<PageToCrawl>(), It.IsAny<CrawlContext>())).Returns(new CrawlDecision { Allow = true });
+    _fakeCrawlDecisionMaker.Setup(f => f.ShouldCrawlPage(It.Is<PageToCrawl>(p => p.Uri == fakeLink3), It.IsAny<CrawlContext>())).Returns(new CrawlDecision { Allow = false });
+    _fakeCrawlDecisionMaker.Setup(f => f.ShouldCrawlPageLinks(It.IsAny<CrawledPage>(), It.IsAny<CrawlContext>())).Returns(new CrawlDecision { Allow = true });
+
+    _unitUnderTest = new PoliteWebCrawler(_dummyConfiguration, _fakeCrawlDecisionMaker.Object, _dummyThreadManager, _dummyScheduler, _fakeHttpRequester.Object, _fakeHyperLinkParser.Object, _fakeMemoryManager.Object, _fakeDomainRateLimiter.Object, _fakeRobotsDotTextFinder.Object);
+    _unitUnderTest.Crawl(_rootUri);
+
+    // Each link should be checked only once, so ShouldCrawlPage should be called exactly 4 times (the root page plus the 3 distinct links).
+    _fakeCrawlDecisionMaker.Verify(f => f.ShouldCrawlPage(It.IsAny<PageToCrawl>(), It.IsAny<CrawlContext>()), Times.Exactly(4));
+    _fakeHyperLinkParser.VerifyAll();
+    _fakeCrawlDecisionMaker.VerifyAll();
+}
+
 [Test]
 public void Crawl_CanExtractRetryAfterTimeFromHeaders()
 {
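
Note on the Times.Exactly(4) expectation above: the crawler should evaluate ShouldCrawlPage once for the root page and once per distinct extracted link, no matter how many crawled pages report that link. The snippet below is only an informal sketch of that arithmetic and is not part of the commit; the numbers mirror the fake pages and links in the test.

using System;
using System.Linq;

public static class ExpectedCallCount
{
    public static void Main()
    {
        // The three links that every crawled page reports in the test above.
        var extractedLinks = new[]
        {
            "http://a.com/someUri",
            "http://a.com/someOtherUri",
            "http://a.com/anotherOne"
        };
        int pagesReportingLinks = 3; // homePage, page1, page2 all return the same set

        // With the new CrawledUrlRepository check, each distinct link is validated once.
        int withDedup = 1 + extractedLinks.Distinct().Count(); // 1 (root page) + 3 = 4

        // Without de-duplication, every occurrence could be re-validated, up to:
        int upperBoundWithoutDedup = 1 + pagesReportingLinks * extractedLinks.Length; // 1 + 9 = 10

        Console.WriteLine($"With dedup: {withDedup}, worst case without: {upperBoundWithoutDedup}");
    }
}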

Abot/Crawler/WebCrawler.cs

Lines changed: 10 additions & 4 deletions
@@ -118,6 +118,7 @@ public abstract class WebCrawler : IWebCrawler
 protected CrawlContext _crawlContext;
 protected IThreadManager _threadManager;
 protected IScheduler _scheduler;
+protected ICrawledUrlRepository _crawledUrlRepository;
 protected IPageRequester _pageRequester;
 protected IHyperLinkParser _hyperLinkParser;
 protected ICrawlDecisionMaker _crawlDecisionMaker;
@@ -190,7 +191,8 @@ public WebCrawler(
 CrawlBag = _crawlContext.CrawlBag;
 
 _threadManager = threadManager ?? new TaskThreadManager(_crawlContext.CrawlConfiguration.MaxConcurrentThreads > 0 ? _crawlContext.CrawlConfiguration.MaxConcurrentThreads : Environment.ProcessorCount);
-_scheduler = scheduler ?? new Scheduler(_crawlContext.CrawlConfiguration.IsUriRecrawlingEnabled, null, null);
+_crawledUrlRepository = new InMemoryCrawledUrlRepository();
+_scheduler = scheduler ?? new Scheduler(_crawlContext.CrawlConfiguration.IsUriRecrawlingEnabled, _crawledUrlRepository, null);
 _pageRequester = pageRequester ?? new PageRequester(_crawlContext.CrawlConfiguration);
 _crawlDecisionMaker = crawlDecisionMaker ?? new CrawlDecisionMaker();
 
@@ -905,7 +907,6 @@ protected virtual void AddPageToContext(PageToCrawl pageToCrawl)
     pageToCrawl.RetryCount++;
     return;
 }
-
 
 int domainCount = 0;
 Interlocked.Increment(ref _crawlContext.CrawledCount);
@@ -927,8 +928,10 @@ protected virtual void SchedulePageLinks(CrawledPage crawledPage)
 {
     foreach (Uri uri in crawledPage.ParsedLinks)
     {
-        if (_shouldScheduleLinkDecisionMaker == null || _shouldScheduleLinkDecisionMaker.Invoke(uri, crawledPage, _crawlContext))
-        {
+        // First validate that the link was not already visited or added to the list of pages to visit, so we don't
+        // make the same validation and fire the same events twice.
+        if (!_crawledUrlRepository.Contains(uri) &&
+            (_shouldScheduleLinkDecisionMaker == null || _shouldScheduleLinkDecisionMaker.Invoke(uri, crawledPage, _crawlContext))) {
             try //Added due to a bug in the Uri class related to this (http://stackoverflow.com/questions/2814951/system-uriformatexception-invalid-uri-the-hostname-could-not-be-parsed)
             {
                 PageToCrawl page = new PageToCrawl(uri);
@@ -944,6 +947,9 @@ protected virtual void SchedulePageLinks(CrawledPage crawledPage)
             }
             catch { }
         }
+
+        // Add this link to the list of visited Urls so validations are not duplicated in the future.
+        _crawledUrlRepository.AddIfNew(uri);
     }
 }
 
