Commit 19a5cd2

Keeping the CrawledUrlRepository as a member of the WebCrawler and adding a validation when scheduling links so that each link is checked only once.
1 parent: 2bcd341

2 files changed: 39 additions & 5 deletions

Abot.Tests.Unit/Crawler/WebCrawlerTest.cs

Lines changed: 29 additions & 1 deletion
@@ -1,5 +1,4 @@
 using Abot.Core;
-using Abot.Core;
 using Abot.Crawler;
 using Abot.Poco;
 using Abot.Util;
@@ -1091,6 +1090,35 @@ public void Crawl_CrawlHasExceededMaxMemoryUsageInMb_CrawlIsStoppedBeforeComplet
     Assert.IsTrue(result.CrawlContext.IsCrawlHardStopRequested);
 }
 
+[Test]
+public void Crawl_ExtractedLinksAreNotCheckedTwice()
+{
+    Uri fakeLink1 = new Uri("http://a.com/someUri");
+    Uri fakeLink2 = new Uri("http://a.com/someOtherUri");
+    Uri fakeLink3 = new Uri("http://a.com/anotherOne");
+    CrawledPage homePage = new CrawledPage(_rootUri);
+    CrawledPage page1 = new CrawledPage(fakeLink1);
+    CrawledPage page2 = new CrawledPage(fakeLink2);
+
+    // All links are found on every crawled page.
+    _fakeHyperLinkParser.Setup(parser => parser.GetLinks(It.IsAny<CrawledPage>())).Returns(new [] { fakeLink1, fakeLink2, fakeLink3 });
+
+    _fakeHttpRequester.Setup(f => f.MakeRequest(_rootUri, It.IsAny<Func<CrawledPage, CrawlDecision>>())).Returns(homePage);
+    _fakeHttpRequester.Setup(f => f.MakeRequest(fakeLink1, It.IsAny<Func<CrawledPage, CrawlDecision>>())).Returns(page1);
+    _fakeHttpRequester.Setup(f => f.MakeRequest(fakeLink2, It.IsAny<Func<CrawledPage, CrawlDecision>>())).Returns(page2);
+    _fakeCrawlDecisionMaker.Setup(f => f.ShouldCrawlPage(It.IsAny<PageToCrawl>(), It.IsAny<CrawlContext>())).Returns(new CrawlDecision { Allow = true });
+    _fakeCrawlDecisionMaker.Setup(f => f.ShouldCrawlPage(It.Is<PageToCrawl>(p => p.Uri == fakeLink3), It.IsAny<CrawlContext>())).Returns(new CrawlDecision { Allow = false });
+    _fakeCrawlDecisionMaker.Setup(f => f.ShouldCrawlPageLinks(It.IsAny<CrawledPage>(), It.IsAny<CrawlContext>())).Returns(new CrawlDecision { Allow = true });
+
+    _unitUnderTest = new PoliteWebCrawler(_dummyConfiguration, _fakeCrawlDecisionMaker.Object, _dummyThreadManager, _dummyScheduler, _fakeHttpRequester.Object, _fakeHyperLinkParser.Object, _fakeMemoryManager.Object, _fakeDomainRateLimiter.Object, _fakeRobotsDotTextFinder.Object);
+    _unitUnderTest.Crawl(_rootUri);
+
+    // Each link should be checked only once, so ShouldCrawlPage should be called exactly 4 times (the root page plus the 3 distinct links).
+    _fakeCrawlDecisionMaker.Verify(f => f.ShouldCrawlPage(It.IsAny<PageToCrawl>(), It.IsAny<CrawlContext>()), Times.Exactly(4));
+    _fakeHyperLinkParser.VerifyAll();
+    _fakeCrawlDecisionMaker.VerifyAll();
+}
+
 [Test]
 public void Crawl_CanExtractRetryAfterTimeFromHeaders()
 {
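
Note on the Times.Exactly(4) expectation above: the crawler should evaluate ShouldCrawlPage once for the root page and once per distinct extracted link, no matter how many crawled pages report that link. The snippet below is only an informal sketch of that arithmetic and is not part of the commit; the numbers mirror the fake pages and links in the test.

using System;
using System.Linq;

public static class ExpectedCallCount
{
    public static void Main()
    {
        // The three links that every crawled page reports in the test above.
        var extractedLinks = new[]
        {
            "http://a.com/someUri",
            "http://a.com/someOtherUri",
            "http://a.com/anotherOne"
        };
        int pagesReportingLinks = 3; // homePage, page1, page2 all return the same set

        // With the new CrawledUrlRepository check, each distinct link is validated once.
        int withDedup = 1 + extractedLinks.Distinct().Count(); // 1 (root page) + 3 = 4

        // Without de-duplication, every occurrence could be re-validated, up to:
        int upperBoundWithoutDedup = 1 + pagesReportingLinks * extractedLinks.Length; // 1 + 9 = 10

        Console.WriteLine($"With dedup: {withDedup}, worst case without: {upperBoundWithoutDedup}");
    }
}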

Abot/Crawler/WebCrawler.cs

Lines changed: 10 additions & 4 deletions
@@ -118,6 +118,7 @@ public abstract class WebCrawler : IWebCrawler
 protected CrawlContext _crawlContext;
 protected IThreadManager _threadManager;
 protected IScheduler _scheduler;
+protected ICrawledUrlRepository _crawledUrlRepository;
 protected IPageRequester _pageRequester;
 protected IHyperLinkParser _hyperLinkParser;
 protected ICrawlDecisionMaker _crawlDecisionMaker;
@@ -190,7 +191,8 @@ public WebCrawler(
 CrawlBag = _crawlContext.CrawlBag;
 
 _threadManager = threadManager ?? new TaskThreadManager(_crawlContext.CrawlConfiguration.MaxConcurrentThreads > 0 ? _crawlContext.CrawlConfiguration.MaxConcurrentThreads : Environment.ProcessorCount);
-_scheduler = scheduler ?? new Scheduler(_crawlContext.CrawlConfiguration.IsUriRecrawlingEnabled, null, null);
+_crawledUrlRepository = new InMemoryCrawledUrlRepository();
+_scheduler = scheduler ?? new Scheduler(_crawlContext.CrawlConfiguration.IsUriRecrawlingEnabled, _crawledUrlRepository, null);
 _pageRequester = pageRequester ?? new PageRequester(_crawlContext.CrawlConfiguration);
 _crawlDecisionMaker = crawlDecisionMaker ?? new CrawlDecisionMaker();
 
@@ -905,7 +907,6 @@ protected virtual void AddPageToContext(PageToCrawl pageToCrawl)
     pageToCrawl.RetryCount++;
     return;
 }
-
 
 int domainCount = 0;
 Interlocked.Increment(ref _crawlContext.CrawledCount);
@@ -927,8 +928,10 @@ protected virtual void SchedulePageLinks(CrawledPage crawledPage)
 {
     foreach (Uri uri in crawledPage.ParsedLinks)
     {
-        if (_shouldScheduleLinkDecisionMaker == null || _shouldScheduleLinkDecisionMaker.Invoke(uri, crawledPage, _crawlContext))
-        {
+        // First validate that the link was not already visited or added to the list of pages to visit, so we don't
+        // make the same validation and fire the same events twice.
+        if (!_crawledUrlRepository.Contains(uri) &&
+            (_shouldScheduleLinkDecisionMaker == null || _shouldScheduleLinkDecisionMaker.Invoke(uri, crawledPage, _crawlContext))) {
             try //Added due to a bug in the Uri class related to this (http://stackoverflow.com/questions/2814951/system-uriformatexception-invalid-uri-the-hostname-could-not-be-parsed)
             {
                 PageToCrawl page = new PageToCrawl(uri);
@@ -944,6 +947,9 @@ protected virtual void SchedulePageLinks(CrawledPage crawledPage)
             }
             catch { }
         }
+
+        // Add this link to the list of visited Urls so validations are not duplicated in the future.
+        _crawledUrlRepository.AddIfNew(uri);
     }
 }
 
