Commit a540895

Moved the Url repository back into the scheduler and added two new methods
1 parent 19a5cd2 commit a540895

File tree

2 files changed: 26 additions & 6 deletions

Abot/Core/Scheduler.cs

Lines changed: 22 additions & 0 deletions
@@ -33,6 +33,18 @@ public interface IScheduler : IDisposable
         /// Clear all currently scheduled pages
         /// </summary>
         void Clear();
+
+        /// <summary>
+        /// Add the Url to the list of crawled Urls without scheduling it to be crawled.
+        /// </summary>
+        /// <param name="uri"></param>
+        void AddKnownUri(Uri uri);
+
+        /// <summary>
+        /// Returns whether or not the specified Uri was already scheduled to be crawled or simply added to the
+        /// list of known Uris.
+        /// </summary>
+        bool IsUriKnown(Uri uri);
     }
 
     [Serializable]
@@ -94,6 +106,16 @@ public void Clear()
             _pagesToCrawlRepo.Clear();
         }
 
+        public void AddKnownUri(Uri uri)
+        {
+            _crawledUrlRepo.AddIfNew(uri);
+        }
+
+        public bool IsUriKnown(Uri uri)
+        {
+            return _crawledUrlRepo.Contains(uri);
+        }
+
         public void Dispose()
         {
             if (_crawledUrlRepo != null)
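
Together, the two additions let a caller mark a Uri as seen without queueing it for a crawl. A minimal sketch of the new surface, assuming the Abot.Core namespace and the three-argument Scheduler constructor visible in the WebCrawler.cs diff below; the class, Main method, and Uri values are illustrative, and the assumption that null repositories make the Scheduler create its own defaults follows from the commit message:

using System;
using Abot.Core;

class KnownUriSketch
{
    static void Main()
    {
        // Mirrors WebCrawler's fallback below: with null repositories the
        // Scheduler now owns its own crawled-Url repository internally.
        IScheduler scheduler = new Scheduler(false, null, null);

        // Record a Uri as known without ever scheduling it to be crawled.
        Uri seen = new Uri("http://example.com/already-processed");
        scheduler.AddKnownUri(seen);

        Console.WriteLine(scheduler.IsUriKnown(seen));                              // True
        Console.WriteLine(scheduler.IsUriKnown(new Uri("http://example.com/new"))); // False
    }
}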

Abot/Crawler/WebCrawler.cs

Lines changed: 4 additions & 6 deletions
@@ -118,7 +118,6 @@ public abstract class WebCrawler : IWebCrawler
         protected CrawlContext _crawlContext;
         protected IThreadManager _threadManager;
         protected IScheduler _scheduler;
-        protected ICrawledUrlRepository _crawledUrlRepository;
         protected IPageRequester _pageRequester;
         protected IHyperLinkParser _hyperLinkParser;
         protected ICrawlDecisionMaker _crawlDecisionMaker;
@@ -191,8 +190,7 @@ public WebCrawler(
             CrawlBag = _crawlContext.CrawlBag;
 
             _threadManager = threadManager ?? new TaskThreadManager(_crawlContext.CrawlConfiguration.MaxConcurrentThreads > 0 ? _crawlContext.CrawlConfiguration.MaxConcurrentThreads : Environment.ProcessorCount);
-            _crawledUrlRepository = new InMemoryCrawledUrlRepository();
-            _scheduler = scheduler ?? new Scheduler(_crawlContext.CrawlConfiguration.IsUriRecrawlingEnabled, _crawledUrlRepository, null);
+            _scheduler = scheduler ?? new Scheduler(_crawlContext.CrawlConfiguration.IsUriRecrawlingEnabled, null, null);
             _pageRequester = pageRequester ?? new PageRequester(_crawlContext.CrawlConfiguration);
             _crawlDecisionMaker = crawlDecisionMaker ?? new CrawlDecisionMaker();
 
@@ -930,7 +928,7 @@ protected virtual void SchedulePageLinks(CrawledPage crawledPage)
                 {
                     // First validate that the link was not already visited or added to the list of pages to visit, so we don't
                     // make the same validation and fire the same events twice.
-                    if (!_crawledUrlRepository.Contains(uri) &&
+                    if (!_scheduler.IsUriKnown(uri) &&
                         (_shouldScheduleLinkDecisionMaker == null || _shouldScheduleLinkDecisionMaker.Invoke(uri, crawledPage, _crawlContext))) {
                         try //Added due to a bug in the Uri class related to this (http://stackoverflow.com/questions/2814951/system-uriformatexception-invalid-uri-the-hostname-could-not-be-parsed)
                         {
@@ -948,8 +946,8 @@ protected virtual void SchedulePageLinks(CrawledPage crawledPage)
                     catch { }
                 }
 
-                // Add this link to the list of visited Urls so validations are not duplicated in the future.
-                _crawledUrlRepository.AddIfNew(uri);
+                // Add this link to the list of known Urls so validations are not duplicated in the future.
+                _scheduler.AddKnownUri(uri);
             }
         }
 
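
Because WebCrawler no longer constructs or stores an ICrawledUrlRepository, a custom repository (for example, a persistent one) would now be supplied through the Scheduler, and that scheduler injected into the crawler. A hedged sketch of this wiring, using only types that appear in the diffs above; the namespaces and the surrounding setup are assumptions:

using Abot.Core;
using Abot.Poco;

class CustomRepositorySketch
{
    static void Main()
    {
        CrawlConfiguration config = new CrawlConfiguration();

        // The crawled-Url repository now lives behind the scheduler, so a
        // custom implementation is handed to the Scheduler's constructor
        // rather than kept as a field on WebCrawler.
        ICrawledUrlRepository crawledUrls = new InMemoryCrawledUrlRepository();
        IScheduler scheduler = new Scheduler(config.IsUriRecrawlingEnabled, crawledUrls, null);

        // Injecting this scheduler into a WebCrawler bypasses its
        // "scheduler ?? new Scheduler(..., null, null)" fallback shown above.
    }
}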
