@@ -118,6 +118,7 @@ public abstract class WebCrawler : IWebCrawler
118
118
protected CrawlContext _crawlContext ;
119
119
protected IThreadManager _threadManager ;
120
120
protected IScheduler _scheduler ;
121
+ protected ICrawledUrlRepository _crawledUrlRepository ;
121
122
protected IPageRequester _pageRequester ;
122
123
protected IHyperLinkParser _hyperLinkParser ;
123
124
protected ICrawlDecisionMaker _crawlDecisionMaker ;
@@ -190,7 +191,8 @@ public WebCrawler(
190
191
CrawlBag = _crawlContext . CrawlBag ;
191
192
192
193
_threadManager = threadManager ?? new TaskThreadManager ( _crawlContext . CrawlConfiguration . MaxConcurrentThreads > 0 ? _crawlContext . CrawlConfiguration . MaxConcurrentThreads : Environment . ProcessorCount ) ;
193
- _scheduler = scheduler ?? new Scheduler ( _crawlContext . CrawlConfiguration . IsUriRecrawlingEnabled , null , null ) ;
194
+ _crawledUrlRepository = new InMemoryCrawledUrlRepository ( ) ;
195
+ _scheduler = scheduler ?? new Scheduler ( _crawlContext . CrawlConfiguration . IsUriRecrawlingEnabled , _crawledUrlRepository , null ) ;
194
196
_pageRequester = pageRequester ?? new PageRequester ( _crawlContext . CrawlConfiguration ) ;
195
197
_crawlDecisionMaker = crawlDecisionMaker ?? new CrawlDecisionMaker ( ) ;
196
198
@@ -905,7 +907,6 @@ protected virtual void AddPageToContext(PageToCrawl pageToCrawl)
905
907
pageToCrawl . RetryCount ++ ;
906
908
return ;
907
909
}
908
-
909
910
910
911
int domainCount = 0 ;
911
912
Interlocked . Increment ( ref _crawlContext . CrawledCount ) ;
@@ -927,8 +928,10 @@ protected virtual void SchedulePageLinks(CrawledPage crawledPage)
927
928
{
928
929
foreach ( Uri uri in crawledPage . ParsedLinks )
929
930
{
930
- if ( _shouldScheduleLinkDecisionMaker == null || _shouldScheduleLinkDecisionMaker . Invoke ( uri , crawledPage , _crawlContext ) )
931
- {
931
+ // First validate that the link was not already visited or added to the list of pages to visit, so we don't
932
+ // make the same validation and fire the same events twice.
933
+ if ( ! _crawledUrlRepository . Contains ( uri ) &&
934
+ ( _shouldScheduleLinkDecisionMaker == null || _shouldScheduleLinkDecisionMaker . Invoke ( uri , crawledPage , _crawlContext ) ) ) {
932
935
try //Added due to a bug in the Uri class related to this (http://stackoverflow.com/questions/2814951/system-uriformatexception-invalid-uri-the-hostname-could-not-be-parsed)
933
936
{
934
937
PageToCrawl page = new PageToCrawl ( uri ) ;
@@ -944,6 +947,9 @@ protected virtual void SchedulePageLinks(CrawledPage crawledPage)
944
947
}
945
948
catch { }
946
949
}
950
+
951
+ // Add this link to the list of visited Urls so validations are not duplicated in the future.
952
+ _crawledUrlRepository . AddIfNew ( uri ) ;
947
953
}
948
954
}
949
955
0 commit comments