Skip to content

Commit 4524db0

Browse files
committed
Added a check to see if everything below the root is "allowed" by using the root + "/aaaaa". If that link is disallowed and IsIgnoreRobotsDotTextIfRootDisallowedEnabled is set to true, the disallow directives for that site will be ignored. fixes issue 96
1 parent 56b75d4 commit 4524db0

File tree

2 files changed

+54
-5
lines changed

2 files changed

+54
-5
lines changed

Abot.Tests.Unit/Crawler/PoliteWebCrawlerTest.cs

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -314,6 +314,35 @@ public void Crawl_IsRespectRobotsDotTextTrue_RobotsDotTextFound_PageIsDisallowed
314314
_fakeHttpRequester.VerifyAll();
315315
}
316316

317+
[Test]
318+
public void Crawl_IsRespectRobotsDotTextTrue_RobotsDotTextFound_RootPageIsAllowed_AllPagesBelowDisallowed_IsIgnoreRobotsDotTextIfRootDisallowedEnabledTrue_CallsHttpRequester()
319+
{
320+
CrawledPage homePage = new CrawledPage(_rootUri)
321+
{
322+
Content = new PageContent
323+
{
324+
Text = "content here"
325+
}
326+
};
327+
CrawledPage page1 = new CrawledPage(_rootUri);
328+
329+
_fakeRobotsDotText.Setup(f => f.IsUrlAllowed(_rootUri.AbsoluteUri, It.IsAny<string>())).Returns(true);
330+
_fakeRobotsDotText.Setup(f => f.IsUrlAllowed(_rootUri.AbsoluteUri + "aaaaa", It.IsAny<string>())).Returns(false);
331+
_fakeRobotsDotTextFinder.Setup(f => f.Find(It.IsAny<Uri>())).Returns(_fakeRobotsDotText.Object);
332+
_fakeHttpRequester.Setup(f => f.MakeRequest(_rootUri, It.IsAny<Func<CrawledPage, CrawlDecision>>())).Returns(page1);
333+
_fakeCrawlDecisionMaker.Setup(f => f.ShouldCrawlPage(It.IsAny<PageToCrawl>(), It.IsAny<CrawlContext>())).Returns(new CrawlDecision { Allow = true });
334+
_dummyConfiguration.IsRespectRobotsDotTextEnabled = true;
335+
_dummyConfiguration.IsIgnoreRobotsDotTextIfRootDisallowedEnabled = true;
336+
_unitUnderTest = new PoliteWebCrawler(_dummyConfiguration, _fakeCrawlDecisionMaker.Object, _dummyThreadManager, _dummyScheduler, _fakeHttpRequester.Object, _fakeHyperLinkParser.Object, _fakeMemoryManager.Object, _fakeDomainRateLimiter.Object, _fakeRobotsDotTextFinder.Object);
337+
338+
_unitUnderTest.Crawl(_rootUri);
339+
340+
_fakeCrawlDecisionMaker.VerifyAll();
341+
_fakeRobotsDotText.VerifyAll();
342+
_fakeRobotsDotTextFinder.VerifyAll();
343+
_fakeHttpRequester.VerifyAll();
344+
}
345+
317346
[Test]
318347
public void Crawl_IsRespectRobotsDotTextTrue_RobotsDotTextFound_UsesCorrectUserAgentString()
319348
{

Abot/Crawler/PoliteWebCrawler.cs

Lines changed: 25 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -89,12 +89,32 @@ protected override bool ShouldCrawlPage(PageToCrawl pageToCrawl)
8989
if (_robotsDotText != null)
9090
allowedByRobots = _robotsDotText.IsUrlAllowed(pageToCrawl.Uri.AbsoluteUri, _crawlContext.CrawlConfiguration.RobotsDotTextUserAgentString);
9191

92-
if (!allowedByRobots && pageToCrawl.IsRoot && _crawlContext.CrawlConfiguration.IsIgnoreRobotsDotTextIfRootDisallowedEnabled)
92+
93+
//https://github.com/sjdirect/abot/issues/96 Handle scenario where the root is allowed but all the paths below are disallowed like "disallow: /*"
94+
var allPathsBelowRootAllowedByRobots = false;
95+
if (_robotsDotText != null && pageToCrawl.IsRoot && allowedByRobots)
9396
{
94-
string message = string.Format("Page [{0}] [Disallowed by robots.txt file], however since IsIgnoreRobotsDotTextIfRootDisallowedEnabled is set to true the robots.txt file will be ignored for this site.", pageToCrawl.Uri.AbsoluteUri);
95-
_logger.DebugFormat(message);
96-
allowedByRobots = true;
97-
_robotsDotText = null;
97+
var anyPathOffRoot = pageToCrawl.Uri.AbsoluteUri.EndsWith("/") ? pageToCrawl.Uri.AbsoluteUri + "aaaaa": pageToCrawl.Uri.AbsoluteUri + "/aaaaa";
98+
allPathsBelowRootAllowedByRobots = _robotsDotText.IsUrlAllowed(anyPathOffRoot, _crawlContext.CrawlConfiguration.RobotsDotTextUserAgentString);
99+
}
100+
101+
if (_crawlContext.CrawlConfiguration.IsIgnoreRobotsDotTextIfRootDisallowedEnabled && pageToCrawl.IsRoot)
102+
{
103+
if (!allowedByRobots)
104+
{
105+
string message = string.Format("Page [{0}] [Disallowed by robots.txt file], however since IsIgnoreRobotsDotTextIfRootDisallowedEnabled is set to true the robots.txt file will be ignored for this site.", pageToCrawl.Uri.AbsoluteUri);
106+
_logger.DebugFormat(message);
107+
allowedByRobots = true;
108+
_robotsDotText = null;
109+
}
110+
else if (!allPathsBelowRootAllowedByRobots)
111+
{
112+
string message = string.Format("All Pages below [{0}] [Disallowed by robots.txt file], however since IsIgnoreRobotsDotTextIfRootDisallowedEnabled is set to true the robots.txt file will be ignored for this site.", pageToCrawl.Uri.AbsoluteUri);
113+
_logger.DebugFormat(message);
114+
allowedByRobots = true;
115+
_robotsDotText = null;
116+
}
117+
98118
}
99119
else if (!allowedByRobots)
100120
{

0 commit comments

Comments
 (0)