
Commit e6a70d5

Merge pull request sjdirect#97 from coveo/checkLinksOnce
Check extracted links only once
2 parents: b086d64 + a540895

File tree: 3 files changed (+58, -4 lines changed)

Abot.Tests.Unit/Crawler/WebCrawlerTest.cs

Lines changed: 29 additions & 1 deletion
@@ -1,5 +1,4 @@
 using Abot.Core;
-using Abot.Core;
 using Abot.Crawler;
 using Abot.Poco;
 using Abot.Util;
@@ -1091,6 +1090,35 @@ public void Crawl_CrawlHasExceededMaxMemoryUsageInMb_CrawlIsStoppedBeforeComplet
             Assert.IsTrue(result.CrawlContext.IsCrawlHardStopRequested);
         }

+        [Test]
+        public void Crawl_ExtractedLinksAreNotCheckedTwice()
+        {
+            Uri fakeLink1 = new Uri("http://a.com/someUri");
+            Uri fakeLink2 = new Uri("http://a.com/someOtherUri");
+            Uri fakeLink3 = new Uri("http://a.com/anotherOne");
+            CrawledPage homePage = new CrawledPage(_rootUri);
+            CrawledPage page1 = new CrawledPage(fakeLink1);
+            CrawledPage page2 = new CrawledPage(fakeLink2);
+
+            // All links are found on each page.
+            _fakeHyperLinkParser.Setup(parser => parser.GetLinks(It.IsAny<CrawledPage>())).Returns(new [] { fakeLink1, fakeLink2, fakeLink3 });
+
+            _fakeHttpRequester.Setup(f => f.MakeRequest(_rootUri, It.IsAny<Func<CrawledPage, CrawlDecision>>())).Returns(homePage);
+            _fakeHttpRequester.Setup(f => f.MakeRequest(fakeLink1, It.IsAny<Func<CrawledPage, CrawlDecision>>())).Returns(page1);
+            _fakeHttpRequester.Setup(f => f.MakeRequest(fakeLink2, It.IsAny<Func<CrawledPage, CrawlDecision>>())).Returns(page2);
+            _fakeCrawlDecisionMaker.Setup(f => f.ShouldCrawlPage(It.IsAny<PageToCrawl>(), It.IsAny<CrawlContext>())).Returns(new CrawlDecision{Allow = true});
+            _fakeCrawlDecisionMaker.Setup(f => f.ShouldCrawlPage(It.Is<PageToCrawl>(p => p.Uri == fakeLink3), It.IsAny<CrawlContext>())).Returns(new CrawlDecision{Allow = false});
+            _fakeCrawlDecisionMaker.Setup(f => f.ShouldCrawlPageLinks(It.IsAny<CrawledPage>(), It.IsAny<CrawlContext>())).Returns(new CrawlDecision { Allow = true });
+
+            _unitUnderTest = new PoliteWebCrawler(_dummyConfiguration, _fakeCrawlDecisionMaker.Object, _dummyThreadManager, _dummyScheduler, _fakeHttpRequester.Object, _fakeHyperLinkParser.Object, _fakeMemoryManager.Object, _fakeDomainRateLimiter.Object, _fakeRobotsDotTextFinder.Object);
+            _unitUnderTest.Crawl(_rootUri);
+
+            // Each link should be checked only once, so ShouldCrawlPage should be called only 4 times.
+            _fakeCrawlDecisionMaker.Verify(f => f.ShouldCrawlPage(It.IsAny<PageToCrawl>(), It.IsAny<CrawlContext>()), Times.Exactly(4));
+            _fakeHyperLinkParser.VerifyAll();
+            _fakeCrawlDecisionMaker.VerifyAll();
+        }
+
         [Test]
         public void Crawl_CanExtractRetryAfterTimeFromHeaders()
         {
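
Note on the Times.Exactly(4) expectation: the mocked parser returns the same three links for every crawled page, so without the deduplication introduced by this change ShouldCrawlPage would be re-invoked for those links on every page. The standalone sketch below is hypothetical code, not part of Abot; ShouldCrawlPage and the known set are stand-ins for the mocked decision maker and the scheduler's crawled-URL repository. It reproduces the arithmetic: one call for the root page plus one per distinct extracted link.

// Hypothetical standalone sketch (not Abot code) of why exactly 4 calls are expected.
using System;
using System.Collections.Generic;

class DedupCallCountSketch
{
    static int shouldCrawlCalls;

    // Stand-in for the mocked ICrawlDecisionMaker.ShouldCrawlPage: allow everything
    // except the "anotherOne" link, mirroring the test setup above.
    static bool ShouldCrawlPage(Uri uri)
    {
        shouldCrawlCalls++;
        return !uri.AbsolutePath.Contains("anotherOne");
    }

    static void Main()
    {
        Uri root = new Uri("http://a.com/");
        Uri[] extracted =
        {
            new Uri("http://a.com/someUri"),
            new Uri("http://a.com/someOtherUri"),
            new Uri("http://a.com/anotherOne")
        };

        var known = new HashSet<Uri> { root };   // plays the scheduler's crawled-URL set
        var toCrawl = new Queue<Uri>();

        if (ShouldCrawlPage(root))               // call #1
            toCrawl.Enqueue(root);

        while (toCrawl.Count > 0)
        {
            toCrawl.Dequeue();                   // "crawl" the page
            foreach (Uri link in extracted)      // the parser reports all three links every time
            {
                if (!known.Contains(link) && ShouldCrawlPage(link))  // calls #2-#4, first page only
                    toCrawl.Enqueue(link);
                known.Add(link);                 // never validate this link again
            }
        }

        Console.WriteLine(shouldCrawlCalls);     // prints 4
    }
}

Running the sketch prints 4, matching the verification in the test above.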

Abot/Core/Scheduler.cs

Lines changed: 22 additions & 0 deletions
@@ -33,6 +33,18 @@ public interface IScheduler : IDisposable
         /// Clear all currently scheduled pages
         /// </summary>
         void Clear();
+
+        /// <summary>
+        /// Add the Url to the list of crawled Urls without scheduling it to be crawled.
+        /// </summary>
+        /// <param name="uri"></param>
+        void AddKnownUri(Uri uri);
+
+        /// <summary>
+        /// Returns whether or not the specified Uri was already scheduled to be crawled or simply added to the
+        /// list of known Uris.
+        /// </summary>
+        bool IsUriKnown(Uri uri);
     }

     [Serializable]
@@ -94,6 +106,16 @@ public void Clear()
             _pagesToCrawlRepo.Clear();
         }

+        public void AddKnownUri(Uri uri)
+        {
+            _crawledUrlRepo.AddIfNew(uri);
+        }
+
+        public bool IsUriKnown(Uri uri)
+        {
+            return _crawledUrlRepo.Contains(uri);
+        }
+
         public void Dispose()
         {
             if (_crawledUrlRepo != null)
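
The two new scheduler members simply delegate to the existing crawled-URL repository, so all they require from it is thread-safe "add if new" and "contains" behavior. Below is a minimal, hypothetical in-memory stand-in with just those two operations, shown for illustration only; it is not Abot's actual repository implementation.

// Minimal, hypothetical in-memory stand-in for the crawled-URL repository that the new
// scheduler methods lean on. Assumption: only thread-safe AddIfNew/Contains semantics
// are needed for this change; the real repository may expose more than this.
using System;
using System.Collections.Concurrent;

public class InMemoryKnownUriSet
{
    // ConcurrentDictionary used as a thread-safe set keyed by the absolute URI string.
    private readonly ConcurrentDictionary<string, byte> _uris =
        new ConcurrentDictionary<string, byte>();

    // Returns true if the Uri was not known before and has now been recorded.
    public bool AddIfNew(Uri uri)
    {
        return _uris.TryAdd(uri.AbsoluteUri, 0);
    }

    public bool Contains(Uri uri)
    {
        return _uris.ContainsKey(uri.AbsoluteUri);
    }
}

With a backing store like this, AddKnownUri is an idempotent "mark as seen" and IsUriKnown a cheap membership test, which is what lets SchedulePageLinks below skip links it has already evaluated.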

Abot/Crawler/WebCrawler.cs

Lines changed: 7 additions & 3 deletions
@@ -905,7 +905,6 @@ protected virtual void AddPageToContext(PageToCrawl pageToCrawl)
                 pageToCrawl.RetryCount++;
                 return;
             }
-

             int domainCount = 0;
             Interlocked.Increment(ref _crawlContext.CrawledCount);
@@ -927,8 +926,10 @@ protected virtual void SchedulePageLinks(CrawledPage crawledPage)
         {
             foreach (Uri uri in crawledPage.ParsedLinks)
             {
-                if (_shouldScheduleLinkDecisionMaker == null || _shouldScheduleLinkDecisionMaker.Invoke(uri, crawledPage, _crawlContext))
-                {
+                // First validate that the link was not already visited or added to the list of pages to visit, so we don't
+                // make the same validation and fire the same events twice.
+                if (!_scheduler.IsUriKnown(uri) &&
+                    (_shouldScheduleLinkDecisionMaker == null || _shouldScheduleLinkDecisionMaker.Invoke(uri, crawledPage, _crawlContext))) {
                     try //Added due to a bug in the Uri class related to this (http://stackoverflow.com/questions/2814951/system-uriformatexception-invalid-uri-the-hostname-could-not-be-parsed)
                     {
                         PageToCrawl page = new PageToCrawl(uri);
@@ -944,6 +945,9 @@ protected virtual void SchedulePageLinks(CrawledPage crawledPage)
                     }
                     catch { }
                 }
+
+                // Add this link to the list of known Urls so validations are not duplicated in the future.
+                _scheduler.AddKnownUri(uri);
             }
         }

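
Taken together, the new guard means each extracted link passes through the user-supplied decision delegate and the related events at most once. The condensed rendering below is a hypothetical sketch of that per-link flow: the interface is narrowed to the two members added in Scheduler.cs, and ShouldSchedule/Schedule stand in for the delegate check and the scheduling work the real SchedulePageLinks performs.

// Condensed, hypothetical rendering of the per-link flow after this change.
using System;
using System.Collections.Generic;

public interface IKnownUriScheduler
{
    bool IsUriKnown(Uri uri);
    void AddKnownUri(Uri uri);
}

public static class LinkSchedulingSketch
{
    public static void ScheduleLinks(
        IEnumerable<Uri> parsedLinks,
        IKnownUriScheduler scheduler,
        Func<Uri, bool> shouldSchedule,
        Action<Uri> schedule)
    {
        foreach (Uri uri in parsedLinks)
        {
            // Validate only links that are not already known, so user decisions
            // and the related events fire at most once per unique Uri.
            if (!scheduler.IsUriKnown(uri) && shouldSchedule(uri))
                schedule(uri);

            // Record the link whether or not it was scheduled, so a rejected link
            // is not re-evaluated when later pages extract it again.
            scheduler.AddKnownUri(uri);
        }
    }
}

Recording the Uri even when it is not scheduled appears to be what keeps a rejected link (fakeLink3 in the unit test above) from being re-checked when later pages extract it again.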

0 commit comments
