@@ -1,5 +1,4 @@
 using Abot.Core;
-using Abot.Core;
 using Abot.Crawler;
 using Abot.Poco;
 using Abot.Util;
@@ -1091,6 +1090,35 @@ public void Crawl_CrawlHasExceededMaxMemoryUsageInMb_CrawlIsStoppedBeforeComplet
             Assert.IsTrue(result.CrawlContext.IsCrawlHardStopRequested);
         }
 
+        [Test]
+        public void Crawl_ExtractedLinksAreNotCheckedTwice()
+        {
+            Uri fakeLink1 = new Uri("http://a.com/someUri");
+            Uri fakeLink2 = new Uri("http://a.com/someOtherUri");
+            Uri fakeLink3 = new Uri("http://a.com/anotherOne");
+            CrawledPage homePage = new CrawledPage(_rootUri);
+            CrawledPage page1 = new CrawledPage(fakeLink1);
+            CrawledPage page2 = new CrawledPage(fakeLink2);
+
+            // The hyperlink parser reports all three links on every crawled page.
+            _fakeHyperLinkParser.Setup(parser => parser.GetLinks(It.IsAny<CrawledPage>())).Returns(new [] { fakeLink1, fakeLink2, fakeLink3 });
+
+            _fakeHttpRequester.Setup(f => f.MakeRequest(_rootUri, It.IsAny<Func<CrawledPage, CrawlDecision>>())).Returns(homePage);
+            _fakeHttpRequester.Setup(f => f.MakeRequest(fakeLink1, It.IsAny<Func<CrawledPage, CrawlDecision>>())).Returns(page1);
+            _fakeHttpRequester.Setup(f => f.MakeRequest(fakeLink2, It.IsAny<Func<CrawledPage, CrawlDecision>>())).Returns(page2);
+            _fakeCrawlDecisionMaker.Setup(f => f.ShouldCrawlPage(It.IsAny<PageToCrawl>(), It.IsAny<CrawlContext>())).Returns(new CrawlDecision{Allow = true});
+            _fakeCrawlDecisionMaker.Setup(f => f.ShouldCrawlPage(It.Is<PageToCrawl>(p => p.Uri == fakeLink3), It.IsAny<CrawlContext>())).Returns(new CrawlDecision{Allow = false});
+            _fakeCrawlDecisionMaker.Setup(f => f.ShouldCrawlPageLinks(It.IsAny<CrawledPage>(), It.IsAny<CrawlContext>())).Returns(new CrawlDecision { Allow = true });
+
+            _unitUnderTest = new PoliteWebCrawler(_dummyConfiguration, _fakeCrawlDecisionMaker.Object, _dummyThreadManager, _dummyScheduler, _fakeHttpRequester.Object, _fakeHyperLinkParser.Object, _fakeMemoryManager.Object, _fakeDomainRateLimiter.Object, _fakeRobotsDotTextFinder.Object);
+            _unitUnderTest.Crawl(_rootUri);
+
+            // Each distinct URI should be checked only once (root page + 3 extracted links = exactly 4 calls), even though the parser returns the same 3 links for every crawled page.
+            _fakeCrawlDecisionMaker.Verify(f => f.ShouldCrawlPage(It.IsAny<PageToCrawl>(), It.IsAny<CrawlContext>()), Times.Exactly(4));
+            _fakeHyperLinkParser.VerifyAll();
+            _fakeCrawlDecisionMaker.VerifyAll();
+        }
+
         [Test]
         public void Crawl_CanExtractRetryAfterTimeFromHeaders()
         {
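
For context on the Times.Exactly(4) expectation in the added test: the crawler is expected to schedule each discovered URI at most once, even though the fake parser reports the same three links for every crawled page. The sketch below is illustrative only, not Abot's actual Scheduler code; it assumes a simple HashSet-based seen-set and uses http://a.com/ as the root URI, matching the fake links in this fixture.

// Illustrative sketch only (not Abot's Scheduler): counts crawl decisions when
// the same 3 links are extracted from each of the 3 crawled pages.
using System;
using System.Collections.Generic;

static class SeenUriSketch
{
    static void Main()
    {
        var root = new Uri("http://a.com/");      // assumed root URI for this sketch
        var seen = new HashSet<Uri> { root };     // the root is already scheduled
        int decisions = 1;                        // one ShouldCrawlPage call for the root

        var linksPerPage = new[]
        {
            new Uri("http://a.com/someUri"),
            new Uri("http://a.com/someOtherUri"),
            new Uri("http://a.com/anotherOne")
        };

        // The parser reports the same 3 links for the root page, page1, and page2.
        for (int page = 0; page < 3; page++)
            foreach (var link in linksPerPage)
                if (seen.Add(link))               // only never-seen URIs get scheduled
                    decisions++;                  // one decision per distinct URI

        Console.WriteLine(decisions);             // 4 = root + 3 distinct links
    }
}

Under this counting, page1 and page2 only re-report URIs already in the seen-set, so no extra ShouldCrawlPage calls occur. Note also that fakeLink3 is checked once and rejected (Allow = false), so it is never fetched, which is why the test sets up no MakeRequest stub for it.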