Commit a540895

Moved the Url repository back into the scheduler and added two new methods
1 parent 19a5cd2 commit a540895

File tree

2 files changed: 26 additions & 6 deletions

Abot/Core/Scheduler.cs

Lines changed: 22 additions & 0 deletions
@@ -33,6 +33,18 @@ public interface IScheduler : IDisposable
         /// Clear all currently scheduled pages
         /// </summary>
         void Clear();
+
+        /// <summary>
+        /// Add the Url to the list of crawled Urls without scheduling it to be crawled.
+        /// </summary>
+        /// <param name="uri"></param>
+        void AddKnownUri(Uri uri);
+
+        /// <summary>
+        /// Returns whether or not the specified Uri was already scheduled to be crawled or simply added to the
+        /// list of known Uris.
+        /// </summary>
+        bool IsUriKnown(Uri uri);
     }
 
     [Serializable]
@@ -94,6 +106,16 @@ public void Clear()
             _pagesToCrawlRepo.Clear();
         }
 
+        public void AddKnownUri(Uri uri)
+        {
+            _crawledUrlRepo.AddIfNew(uri);
+        }
+
+        public bool IsUriKnown(Uri uri)
+        {
+            return _crawledUrlRepo.Contains(uri);
+        }
+
         public void Dispose()
         {
             if (_crawledUrlRepo != null)
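
Together, the two additions let a caller mark a Uri as seen without queueing it for a crawl. A minimal sketch of the new surface, assuming the Abot.Core namespace and the three-argument Scheduler constructor visible in the WebCrawler.cs diff below; the class, Main method, and Uri values are illustrative, and the assumption that null repositories make the Scheduler create its own defaults follows from the commit message:

using System;
using Abot.Core;

class KnownUriSketch
{
    static void Main()
    {
        // Mirrors WebCrawler's fallback below: with null repositories the
        // Scheduler now owns its own crawled-Url repository internally.
        IScheduler scheduler = new Scheduler(false, null, null);

        // Record a Uri as known without ever scheduling it to be crawled.
        Uri seen = new Uri("http://example.com/already-processed");
        scheduler.AddKnownUri(seen);

        Console.WriteLine(scheduler.IsUriKnown(seen));                              // True
        Console.WriteLine(scheduler.IsUriKnown(new Uri("http://example.com/new"))); // False
    }
}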

Abot/Crawler/WebCrawler.cs

Lines changed: 4 additions & 6 deletions
@@ -118,7 +118,6 @@ public abstract class WebCrawler : IWebCrawler
         protected CrawlContext _crawlContext;
         protected IThreadManager _threadManager;
         protected IScheduler _scheduler;
-        protected ICrawledUrlRepository _crawledUrlRepository;
         protected IPageRequester _pageRequester;
         protected IHyperLinkParser _hyperLinkParser;
         protected ICrawlDecisionMaker _crawlDecisionMaker;
@@ -191,8 +190,7 @@ public WebCrawler(
             CrawlBag = _crawlContext.CrawlBag;
 
             _threadManager = threadManager ?? new TaskThreadManager(_crawlContext.CrawlConfiguration.MaxConcurrentThreads > 0 ? _crawlContext.CrawlConfiguration.MaxConcurrentThreads : Environment.ProcessorCount);
-            _crawledUrlRepository = new InMemoryCrawledUrlRepository();
-            _scheduler = scheduler ?? new Scheduler(_crawlContext.CrawlConfiguration.IsUriRecrawlingEnabled, _crawledUrlRepository, null);
+            _scheduler = scheduler ?? new Scheduler(_crawlContext.CrawlConfiguration.IsUriRecrawlingEnabled, null, null);
             _pageRequester = pageRequester ?? new PageRequester(_crawlContext.CrawlConfiguration);
             _crawlDecisionMaker = crawlDecisionMaker ?? new CrawlDecisionMaker();
 
@@ -930,7 +928,7 @@ protected virtual void SchedulePageLinks(CrawledPage crawledPage)
                 {
                     // First validate that the link was not already visited or added to the list of pages to visit, so we don't
                     // make the same validation and fire the same events twice.
-                    if (!_crawledUrlRepository.Contains(uri) &&
+                    if (!_scheduler.IsUriKnown(uri) &&
                         (_shouldScheduleLinkDecisionMaker == null || _shouldScheduleLinkDecisionMaker.Invoke(uri, crawledPage, _crawlContext))) {
                         try //Added due to a bug in the Uri class related to this (http://stackoverflow.com/questions/2814951/system-uriformatexception-invalid-uri-the-hostname-could-not-be-parsed)
                         {
@@ -948,8 +946,8 @@ protected virtual void SchedulePageLinks(CrawledPage crawledPage)
                     catch { }
                 }
 
-                // Add this link to the list of visited Urls so validations are not duplicated in the future.
-                _crawledUrlRepository.AddIfNew(uri);
+                // Add this link to the list of known Urls so validations are not duplicated in the future.
+                _scheduler.AddKnownUri(uri);
             }
         }
 
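
Because WebCrawler no longer constructs or stores an ICrawledUrlRepository, a custom repository (for example, a persistent one) would now be supplied through the Scheduler, and that scheduler injected into the crawler. A hedged sketch of this wiring, using only types that appear in the diffs above; the namespaces and the surrounding setup are assumptions:

using Abot.Core;
using Abot.Poco;

class CustomRepositorySketch
{
    static void Main()
    {
        CrawlConfiguration config = new CrawlConfiguration();

        // The crawled-Url repository now lives behind the scheduler, so a
        // custom implementation is handed to the Scheduler's constructor
        // rather than kept as a field on WebCrawler.
        ICrawledUrlRepository crawledUrls = new InMemoryCrawledUrlRepository();
        IScheduler scheduler = new Scheduler(config.IsUriRecrawlingEnabled, crawledUrls, null);

        // Injecting this scheduler into a WebCrawler bypasses its
        // "scheduler ?? new Scheduler(..., null, null)" fallback shown above.
    }
}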
