
Commit e6a70d5

Merge pull request sjdirect#97 from coveo/checkLinksOnce
Check extracted links only once
2 parents: b086d64 + a540895

File tree: 3 files changed (+58, -4 lines changed)

Abot.Tests.Unit/Crawler/WebCrawlerTest.cs

Lines changed: 29 additions & 1 deletion
@@ -1,5 +1,4 @@
 using Abot.Core;
-using Abot.Core;
 using Abot.Crawler;
 using Abot.Poco;
 using Abot.Util;
@@ -1091,6 +1090,35 @@ public void Crawl_CrawlHasExceededMaxMemoryUsageInMb_CrawlIsStoppedBeforeComplet
             Assert.IsTrue(result.CrawlContext.IsCrawlHardStopRequested);
         }

+        [Test]
+        public void Crawl_ExtractedLinksAreNotCheckedTwice()
+        {
+            Uri fakeLink1 = new Uri("http://a.com/someUri");
+            Uri fakeLink2 = new Uri("http://a.com/someOtherUri");
+            Uri fakeLink3 = new Uri("http://a.com/anotherOne");
+            CrawledPage homePage = new CrawledPage(_rootUri);
+            CrawledPage page1 = new CrawledPage(fakeLink1);
+            CrawledPage page2 = new CrawledPage(fakeLink2);
+
+            // All links are found on each page.
+            _fakeHyperLinkParser.Setup(parser => parser.GetLinks(It.IsAny<CrawledPage>())).Returns(new [] { fakeLink1, fakeLink2, fakeLink3 });
+
+            _fakeHttpRequester.Setup(f => f.MakeRequest(_rootUri, It.IsAny<Func<CrawledPage, CrawlDecision>>())).Returns(homePage);
+            _fakeHttpRequester.Setup(f => f.MakeRequest(fakeLink1, It.IsAny<Func<CrawledPage, CrawlDecision>>())).Returns(page1);
+            _fakeHttpRequester.Setup(f => f.MakeRequest(fakeLink2, It.IsAny<Func<CrawledPage, CrawlDecision>>())).Returns(page2);
+            _fakeCrawlDecisionMaker.Setup(f => f.ShouldCrawlPage(It.IsAny<PageToCrawl>(), It.IsAny<CrawlContext>())).Returns(new CrawlDecision{Allow = true});
+            _fakeCrawlDecisionMaker.Setup(f => f.ShouldCrawlPage(It.Is<PageToCrawl>(p => p.Uri == fakeLink3), It.IsAny<CrawlContext>())).Returns(new CrawlDecision{Allow = false});
+            _fakeCrawlDecisionMaker.Setup(f => f.ShouldCrawlPageLinks(It.IsAny<CrawledPage>(), It.IsAny<CrawlContext>())).Returns(new CrawlDecision { Allow = true });
+
+            _unitUnderTest = new PoliteWebCrawler(_dummyConfiguration, _fakeCrawlDecisionMaker.Object, _dummyThreadManager, _dummyScheduler, _fakeHttpRequester.Object, _fakeHyperLinkParser.Object, _fakeMemoryManager.Object, _fakeDomainRateLimiter.Object, _fakeRobotsDotTextFinder.Object);
+            _unitUnderTest.Crawl(_rootUri);
+
+            // Each link should be checked only once, so ShouldCrawlPage should be called only 4 times.
+            _fakeCrawlDecisionMaker.Verify(f => f.ShouldCrawlPage(It.IsAny<PageToCrawl>(), It.IsAny<CrawlContext>()), Times.Exactly(4));
+            _fakeHyperLinkParser.VerifyAll();
+            _fakeCrawlDecisionMaker.VerifyAll();
+        }
+
         [Test]
         public void Crawl_CanExtractRetryAfterTimeFromHeaders()
         {
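
Note on the Times.Exactly(4) expectation: the mocked parser returns the same three links for every crawled page, so without the deduplication introduced by this change ShouldCrawlPage would be re-invoked for those links on every page. The standalone sketch below is hypothetical code, not part of Abot; ShouldCrawlPage and the known set are stand-ins for the mocked decision maker and the scheduler's crawled-URL repository. It reproduces the arithmetic: one call for the root page plus one per distinct extracted link.

// Hypothetical standalone sketch (not Abot code) of why exactly 4 calls are expected.
using System;
using System.Collections.Generic;

class DedupCallCountSketch
{
    static int shouldCrawlCalls;

    // Stand-in for the mocked ICrawlDecisionMaker.ShouldCrawlPage: allow everything
    // except the "anotherOne" link, mirroring the test setup above.
    static bool ShouldCrawlPage(Uri uri)
    {
        shouldCrawlCalls++;
        return !uri.AbsolutePath.Contains("anotherOne");
    }

    static void Main()
    {
        Uri root = new Uri("http://a.com/");
        Uri[] extracted =
        {
            new Uri("http://a.com/someUri"),
            new Uri("http://a.com/someOtherUri"),
            new Uri("http://a.com/anotherOne")
        };

        var known = new HashSet<Uri> { root };   // plays the scheduler's crawled-URL set
        var toCrawl = new Queue<Uri>();

        if (ShouldCrawlPage(root))               // call #1
            toCrawl.Enqueue(root);

        while (toCrawl.Count > 0)
        {
            toCrawl.Dequeue();                   // "crawl" the page
            foreach (Uri link in extracted)      // the parser reports all three links every time
            {
                if (!known.Contains(link) && ShouldCrawlPage(link))  // calls #2-#4, first page only
                    toCrawl.Enqueue(link);
                known.Add(link);                 // never validate this link again
            }
        }

        Console.WriteLine(shouldCrawlCalls);     // prints 4
    }
}

Running the sketch prints 4, matching the verification in the test above.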

Abot/Core/Scheduler.cs

Lines changed: 22 additions & 0 deletions
@@ -33,6 +33,18 @@ public interface IScheduler : IDisposable
         /// Clear all currently scheduled pages
         /// </summary>
         void Clear();
+
+        /// <summary>
+        /// Add the Url to the list of crawled Urls without scheduling it to be crawled.
+        /// </summary>
+        /// <param name="uri"></param>
+        void AddKnownUri(Uri uri);
+
+        /// <summary>
+        /// Returns whether or not the specified Uri was already scheduled to be crawled or simply added to the
+        /// list of known Uris.
+        /// </summary>
+        bool IsUriKnown(Uri uri);
     }

     [Serializable]
@@ -94,6 +106,16 @@ public void Clear()
             _pagesToCrawlRepo.Clear();
         }

+        public void AddKnownUri(Uri uri)
+        {
+            _crawledUrlRepo.AddIfNew(uri);
+        }
+
+        public bool IsUriKnown(Uri uri)
+        {
+            return _crawledUrlRepo.Contains(uri);
+        }
+
         public void Dispose()
         {
             if (_crawledUrlRepo != null)
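
The two new scheduler members simply delegate to the existing crawled-URL repository, so all they require from it is thread-safe "add if new" and "contains" behavior. Below is a minimal, hypothetical in-memory stand-in with just those two operations, shown for illustration only; it is not Abot's actual repository implementation.

// Minimal, hypothetical in-memory stand-in for the crawled-URL repository that the new
// scheduler methods lean on. Assumption: only thread-safe AddIfNew/Contains semantics
// are needed for this change; the real repository may expose more than this.
using System;
using System.Collections.Concurrent;

public class InMemoryKnownUriSet
{
    // ConcurrentDictionary used as a thread-safe set keyed by the absolute URI string.
    private readonly ConcurrentDictionary<string, byte> _uris =
        new ConcurrentDictionary<string, byte>();

    // Returns true if the Uri was not known before and has now been recorded.
    public bool AddIfNew(Uri uri)
    {
        return _uris.TryAdd(uri.AbsoluteUri, 0);
    }

    public bool Contains(Uri uri)
    {
        return _uris.ContainsKey(uri.AbsoluteUri);
    }
}

With a backing store like this, AddKnownUri is an idempotent "mark as seen" and IsUriKnown a cheap membership test, which is what lets SchedulePageLinks below skip links it has already evaluated.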

Abot/Crawler/WebCrawler.cs

Lines changed: 7 additions & 3 deletions
@@ -905,7 +905,6 @@ protected virtual void AddPageToContext(PageToCrawl pageToCrawl)
                 pageToCrawl.RetryCount++;
                 return;
             }
-

             int domainCount = 0;
             Interlocked.Increment(ref _crawlContext.CrawledCount);
@@ -927,8 +926,10 @@ protected virtual void SchedulePageLinks(CrawledPage crawledPage)
         {
             foreach (Uri uri in crawledPage.ParsedLinks)
             {
-                if (_shouldScheduleLinkDecisionMaker == null || _shouldScheduleLinkDecisionMaker.Invoke(uri, crawledPage, _crawlContext))
-                {
+                // First validate that the link was not already visited or added to the list of pages to visit, so we don't
+                // make the same validation and fire the same events twice.
+                if (!_scheduler.IsUriKnown(uri) &&
+                    (_shouldScheduleLinkDecisionMaker == null || _shouldScheduleLinkDecisionMaker.Invoke(uri, crawledPage, _crawlContext))) {
                     try //Added due to a bug in the Uri class related to this (http://stackoverflow.com/questions/2814951/system-uriformatexception-invalid-uri-the-hostname-could-not-be-parsed)
                     {
                         PageToCrawl page = new PageToCrawl(uri);
@@ -944,6 +945,9 @@ protected virtual void SchedulePageLinks(CrawledPage crawledPage)
                     }
                     catch { }
                 }
+
+                // Add this link to the list of known Urls so validations are not duplicated in the future.
+                _scheduler.AddKnownUri(uri);
             }
         }

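
Taken together, the new guard means each extracted link passes through the user-supplied decision delegate and the related events at most once. The condensed rendering below is a hypothetical sketch of that per-link flow: the interface is narrowed to the two members added in Scheduler.cs, and ShouldSchedule/Schedule stand in for the delegate check and the scheduling work the real SchedulePageLinks performs.

// Condensed, hypothetical rendering of the per-link flow after this change.
using System;
using System.Collections.Generic;

public interface IKnownUriScheduler
{
    bool IsUriKnown(Uri uri);
    void AddKnownUri(Uri uri);
}

public static class LinkSchedulingSketch
{
    public static void ScheduleLinks(
        IEnumerable<Uri> parsedLinks,
        IKnownUriScheduler scheduler,
        Func<Uri, bool> shouldSchedule,
        Action<Uri> schedule)
    {
        foreach (Uri uri in parsedLinks)
        {
            // Validate only links that are not already known, so user decisions
            // and the related events fire at most once per unique Uri.
            if (!scheduler.IsUriKnown(uri) && shouldSchedule(uri))
                schedule(uri);

            // Record the link whether or not it was scheduled, so a rejected link
            // is not re-evaluated when later pages extract it again.
            scheduler.AddKnownUri(uri);
        }
    }
}

Recording the Uri even when it is not scheduled appears to be what keeps a rejected link (fakeLink3 in the unit test above) from being re-checked when later pages extract it again.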

0 commit comments
