@@ -118,7 +118,6 @@ public abstract class WebCrawler : IWebCrawler
         protected CrawlContext _crawlContext;
         protected IThreadManager _threadManager;
         protected IScheduler _scheduler;
-        protected ICrawledUrlRepository _crawledUrlRepository;
         protected IPageRequester _pageRequester;
         protected IHyperLinkParser _hyperLinkParser;
         protected ICrawlDecisionMaker _crawlDecisionMaker;
@@ -191,8 +190,7 @@ public WebCrawler(
             CrawlBag = _crawlContext.CrawlBag;

             _threadManager = threadManager ?? new TaskThreadManager(_crawlContext.CrawlConfiguration.MaxConcurrentThreads > 0 ? _crawlContext.CrawlConfiguration.MaxConcurrentThreads : Environment.ProcessorCount);
-            _crawledUrlRepository = new InMemoryCrawledUrlRepository();
-            _scheduler = scheduler ?? new Scheduler(_crawlContext.CrawlConfiguration.IsUriRecrawlingEnabled, _crawledUrlRepository, null);
+            _scheduler = scheduler ?? new Scheduler(_crawlContext.CrawlConfiguration.IsUriRecrawlingEnabled, null, null);
             _pageRequester = pageRequester ?? new PageRequester(_crawlContext.CrawlConfiguration);
             _crawlDecisionMaker = crawlDecisionMaker ?? new CrawlDecisionMaker();

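With this hunk, WebCrawler no longer news up an InMemoryCrawledUrlRepository itself; it passes null for the Scheduler's repository argument, which implies the scheduler now supplies its own default and exposes the crawled-URL bookkeeping through IsUriKnown/AddKnownUri. The snippet below is a minimal, self-contained sketch of that ownership move; only ICrawledUrlRepository, InMemoryCrawledUrlRepository, Contains, AddIfNew, IsUriKnown and AddKnownUri come from the diff, while the SchedulerSketch name, the two-argument constructor and the ConcurrentDictionary backing are assumptions for illustration, not the library's exact code.

    using System;
    using System.Collections.Concurrent;

    // Simplified stand-ins for the Abot types named in the diff (illustrative only).
    public interface ICrawledUrlRepository
    {
        bool Contains(Uri uri);
        bool AddIfNew(Uri uri);
    }

    public class InMemoryCrawledUrlRepository : ICrawledUrlRepository
    {
        private readonly ConcurrentDictionary<string, byte> _urls = new ConcurrentDictionary<string, byte>();

        public bool Contains(Uri uri) => _urls.ContainsKey(uri.AbsoluteUri);

        public bool AddIfNew(Uri uri) => _urls.TryAdd(uri.AbsoluteUri, 0);
    }

    // Sketch of the scheduler side: it owns the crawled-url repository and falls back
    // to the in-memory implementation when the crawler passes null, as in the hunk above.
    public class SchedulerSketch
    {
        private readonly ICrawledUrlRepository _crawledUrlRepo;

        public SchedulerSketch(bool isUriRecrawlingEnabled, ICrawledUrlRepository crawledUrlRepo)
        {
            // Recrawling behaviour is omitted from this sketch; only the default is shown.
            _crawledUrlRepo = crawledUrlRepo ?? new InMemoryCrawledUrlRepository();
        }

        public bool IsUriKnown(Uri uri) => _crawledUrlRepo.Contains(uri);

        public void AddKnownUri(Uri uri) => _crawledUrlRepo.AddIfNew(uri);
    }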
@@ -930,7 +928,7 @@ protected virtual void SchedulePageLinks(CrawledPage crawledPage)
             {
                 // First validate that the link was not already visited or added to the list of pages to visit, so we don't
                 // make the same validation and fire the same events twice.
-                if (!_crawledUrlRepository.Contains(uri) &&
+                if (!_scheduler.IsUriKnown(uri) &&
                     (_shouldScheduleLinkDecisionMaker == null || _shouldScheduleLinkDecisionMaker.Invoke(uri, crawledPage, _crawlContext))) {
                     try //Added due to a bug in the Uri class related to this (http://stackoverflow.com/questions/2814951/system-uriformatexception-invalid-uri-the-hostname-could-not-be-parsed)
                     {
@@ -948,8 +946,8 @@ protected virtual void SchedulePageLinks(CrawledPage crawledPage)
                     catch { }
                 }

-                // Add this link to the list of visited Urls so validations are not duplicated in the future.
-                _crawledUrlRepository.AddIfNew(uri);
+                // Add this link to the list of known Urls so validations are not duplicated in the future.
+                _scheduler.AddKnownUri(uri);
             }
         }

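The net effect in SchedulePageLinks is a two-step contract against the scheduler: skip any uri it already knows, and register every uri once it has been considered so repeated links on later pages are neither re-validated nor re-fired as events. Below is a hedged usage sketch of that pattern against the SchedulerSketch defined earlier; the ScheduleLinks helper, its parsedLinks list and the shouldSchedule callback are made up for the example and stand in for the crawler's real link scheduling.

    using System;
    using System.Collections.Generic;

    public static class SchedulePageLinksExample
    {
        // Mirrors the dedup pattern from the diff: IsUriKnown gates the work,
        // AddKnownUri records the uri so later pages linking to it are skipped.
        public static void ScheduleLinks(SchedulerSketch scheduler, IEnumerable<Uri> parsedLinks, Func<Uri, bool> shouldSchedule)
        {
            foreach (Uri uri in parsedLinks)
            {
                if (!scheduler.IsUriKnown(uri) && (shouldSchedule == null || shouldSchedule(uri)))
                {
                    Console.WriteLine("Scheduling " + uri);
                    // ... hand the uri off to the crawl queue here ...
                }

                // Record the uri whether or not it was scheduled, so the validation
                // and events above run at most once per uri, as in the hunk above.
                scheduler.AddKnownUri(uri);
            }
        }
    }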