Commit c4d0241

! Validation for the root URI in case it is redirected to another domain, with "www." for instance
1 parent bce5da7 commit c4d0241
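In practice the change is observable on the returned CrawlResult: when the configured root is redirected (for example from the bare domain to its "www." host), the crawl context now keeps both URIs. A minimal sketch of the intended behavior, assuming the usual PoliteWebCrawler entry point and a site that actually issues such a redirect (both assumptions are for illustration only):

    using System;
    using Abot.Crawler;
    using Abot.Poco;

    class RootRedirectExample
    {
        static void Main()
        {
            // Assumption: http://domain.com/ redirects to http://www.domain.com/.
            Uri originalRoot = new Uri("http://domain.com/");

            PoliteWebCrawler crawler = new PoliteWebCrawler();
            CrawlResult result = crawler.Crawl(originalRoot);

            // After this commit the context exposes both the configured root and the redirect target.
            Console.WriteLine(result.CrawlContext.OriginalRootUri); // http://domain.com/
            Console.WriteLine(result.CrawlContext.RootUri);         // http://www.domain.com/ (if redirected)
        }
    }

Links on either authority are then treated as internal, so the crawl no longer stops at the first redirected root page.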

3 files changed: +123 -19 lines changed

Abot.Tests.Unit/Crawler/WebCrawlerTest.cs

Lines changed: 44 additions & 0 deletions
@@ -1151,6 +1151,50 @@ public void Crawl_CanExtractRetryAfterDateFromHeaders()
             _fakeCrawlDecisionMaker.Verify(f => f.ShouldRecrawlPage(It.IsAny<CrawledPage>(), It.IsAny<CrawlContext>()), Times.Exactly(2));
         }
 
+        [Test]
+        public void Crawl_ChangeRootUriIfRedirected()
+        {
+            _fakeCrawlDecisionMaker.Setup(f => f.ShouldCrawlPage(It.IsAny<PageToCrawl>(), It.IsAny<CrawlContext>())).Returns(new CrawlDecision { Allow = true });
+            _dummyConfiguration.IsHttpRequestAutoRedirectsEnabled = false;
+
+            // Setup a root page that was redirected.
+            Uri redirectedUri = new Uri("http://www.domain.com/");
+            CrawledPage page = new CrawledPage(_rootUri) {
+                WebException = new WebException(),
+                HttpWebResponse = new HttpWebResponseWrapper {
+                    StatusCode = HttpStatusCode.Redirect,
+                    Headers = new WebHeaderCollection { { "Location", redirectedUri.AbsoluteUri } }
+                }
+            };
+            _fakeHttpRequester.Setup(f => f.MakeRequest(It.IsAny<Uri>(), It.IsAny<Func<CrawledPage, CrawlDecision>>())).Returns(page);
+
+            CrawlResult result = _unitUnderTest.Crawl(_rootUri);
+            Assert.That(result.CrawlContext.RootUri.AbsoluteUri, Is.EqualTo(redirectedUri.AbsoluteUri));
+            Assert.That(result.CrawlContext.OriginalRootUri, Is.EqualTo(_rootUri));
+        }
+
+        [Test]
+        public void Crawl_ChangeRootUriIfRedirectedAutomatically()
+        {
+            _fakeCrawlDecisionMaker.Setup(f => f.ShouldCrawlPage(It.IsAny<PageToCrawl>(), It.IsAny<CrawlContext>())).Returns(new CrawlDecision { Allow = true });
+            _dummyConfiguration.IsHttpRequestAutoRedirectsEnabled = true;
+
+            // Setup a root page that was redirected.
+            Uri redirectedUri = new Uri("http://www.domain.com/");
+            CrawledPage page = new CrawledPage(_rootUri) {
+                WebException = new WebException(),
+                HttpWebResponse = new HttpWebResponseWrapper {
+                    StatusCode = HttpStatusCode.OK,
+                    ResponseUri = redirectedUri
+                }
+            };
+            _fakeHttpRequester.Setup(f => f.MakeRequest(It.IsAny<Uri>(), It.IsAny<Func<CrawledPage, CrawlDecision>>())).Returns(page);
+
+            CrawlResult result = _unitUnderTest.Crawl(_rootUri);
+            Assert.That(result.CrawlContext.RootUri.AbsoluteUri, Is.EqualTo(redirectedUri.AbsoluteUri));
+            Assert.That(result.CrawlContext.OriginalRootUri, Is.EqualTo(_rootUri));
+        }
+
         private void ThrowExceptionWhen_PageCrawlStarting(object sender, PageCrawlStartingArgs e)
         {
             throw new Exception("no!!!");

Abot/Crawler/WebCrawler.cs

Lines changed: 73 additions & 19 deletions
@@ -221,7 +221,7 @@ public virtual CrawlResult Crawl(Uri uri, CancellationTokenSource cancellationTo
             if (uri == null)
                 throw new ArgumentNullException("uri");
 
-            _crawlContext.RootUri = uri;
+            _crawlContext.RootUri = _crawlContext.OriginalRootUri = uri;
 
             if (cancellationTokenSource != null)
                 _crawlContext.CancellationTokenSource = cancellationTokenSource;
@@ -670,7 +670,11 @@ protected virtual void ProcessPage(PageToCrawl pageToCrawl)
                 //CrawledPage crawledPage = await CrawlThePage(pageToCrawl);
                 CrawledPage crawledPage = CrawlThePage(pageToCrawl);
 
-                if (IsRedirect(crawledPage))
+                // Validate the root uri in case of a redirection.
+                if (crawledPage.IsRoot)
+                    ValidateRootUriForRedirection(crawledPage);
+
+                if (IsRedirect(crawledPage) && !_crawlContext.CrawlConfiguration.IsHttpRequestAutoRedirectsEnabled)
                     ProcessRedirect(crawledPage);
 
                 if (PageSizeIsAboveMax(crawledPage))
@@ -720,21 +724,12 @@ protected virtual void ProcessRedirect(CrawledPage crawledPage)
 
             try
             {
-                var location = crawledPage.HttpWebResponse.Headers["Location"];
-
-                Uri locationUri;
-                if (!Uri.TryCreate(location, UriKind.Absolute, out locationUri))
-                {
-                    var site = crawledPage.Uri.Scheme + "://" + crawledPage.Uri.Host;
-                    location = site + location;
-                }
-
-                var uri = new Uri(location);
+                var uri = GetRedirectUri(crawledPage);
 
                 PageToCrawl page = new PageToCrawl(uri);
                 page.ParentUri = crawledPage.ParentUri;
                 page.CrawlDepth = crawledPage.CrawlDepth;
-                page.IsInternal = _isInternalDecisionMaker(uri, _crawlContext.RootUri);
+                page.IsInternal = IsInternalUri(uri);
                 page.IsRoot = false;
                 page.RedirectedFrom = crawledPage;
                 page.RedirectPosition = crawledPage.RedirectPosition + 1;
@@ -751,14 +746,26 @@ protected virtual void ProcessRedirect(CrawledPage crawledPage)
             catch {}
         }
 
+        protected virtual bool IsInternalUri(Uri uri)
+        {
+            return _isInternalDecisionMaker(uri, _crawlContext.RootUri) ||
+                _isInternalDecisionMaker(uri, _crawlContext.OriginalRootUri);
+        }
+
         protected virtual bool IsRedirect(CrawledPage crawledPage)
         {
-            return (!_crawlContext.CrawlConfiguration.IsHttpRequestAutoRedirectsEnabled &&
-                crawledPage.HttpWebResponse != null &&
-                ((int) crawledPage.HttpWebResponse.StatusCode >= 300 &&
-                (int) crawledPage.HttpWebResponse.StatusCode <= 399));
+            bool isRedirect = false;
+            if (crawledPage.HttpWebResponse != null) {
+                isRedirect = (_crawlContext.CrawlConfiguration.IsHttpRequestAutoRedirectsEnabled &&
+                    crawledPage.HttpWebResponse.ResponseUri != null &&
+                    crawledPage.HttpWebResponse.ResponseUri.AbsoluteUri != crawledPage.Uri.AbsoluteUri) ||
+                    (!_crawlContext.CrawlConfiguration.IsHttpRequestAutoRedirectsEnabled &&
+                    (int) crawledPage.HttpWebResponse.StatusCode >= 300 &&
+                    (int) crawledPage.HttpWebResponse.StatusCode <= 399);
+            }
+            return isRedirect;
         }
-
+
         protected virtual void ThrowIfCancellationRequested()
         {
             if (_crawlContext.CancellationTokenSource != null && _crawlContext.CancellationTokenSource.IsCancellationRequested)
@@ -927,7 +934,7 @@ protected virtual void SchedulePageLinks(CrawledPage crawledPage)
                 PageToCrawl page = new PageToCrawl(uri);
                 page.ParentUri = crawledPage.Uri;
                 page.CrawlDepth = crawledPage.CrawlDepth + 1;
-                page.IsInternal = _isInternalDecisionMaker(uri, _crawlContext.RootUri);
+                page.IsInternal = IsInternalUri(uri);
                 page.IsRoot = false;
 
                 if (ShouldSchedulePageLink(page))
@@ -1024,6 +1031,53 @@ protected virtual void WaitMinimumRetryDelay(PageToCrawl pageToCrawl)
                 Thread.Sleep(TimeSpan.FromMilliseconds(milliToWait));
         }
 
+        /// <summary>
+        /// Validate whether the root page was redirected. If it was, we assume that the root URI
+        /// should be changed to the URI it was redirected to.
+        /// </summary>
+        protected virtual void ValidateRootUriForRedirection(CrawledPage crawledRootPage)
+        {
+            if (!crawledRootPage.IsRoot) {
+                throw new ArgumentException("The crawled page must be the root page to be validated for redirection.");
+            }
+
+            if (IsRedirect(crawledRootPage)) {
+                _crawlContext.RootUri = GetRedirectUri(crawledRootPage);
+                _logger.InfoFormat("The root URI [{0}] was redirected to [{1}]. Pages from domains [{2}] and [{3}] will be considered internal.",
+                    _crawlContext.OriginalRootUri,
+                    _crawlContext.RootUri,
+                    _crawlContext.RootUri.Authority,
+                    _crawlContext.OriginalRootUri.Authority);
+            }
+        }
+
+        /// <summary>
+        /// Retrieve the URI where the specified crawled page was redirected.
+        /// </summary>
+        /// <remarks>
+        /// If HTTP auto redirects are disabled, this value is stored in the 'Location' header of the response.
+        /// If auto redirects are enabled, it is stored in the response's ResponseUri property.
+        /// </remarks>
+        protected virtual Uri GetRedirectUri(CrawledPage crawledPage)
+        {
+            Uri locationUri;
+            if (_crawlContext.CrawlConfiguration.IsHttpRequestAutoRedirectsEnabled) {
+                // For auto redirects, look for the response uri.
+                locationUri = crawledPage.HttpWebResponse.ResponseUri;
+            } else {
+                // For manual redirects, we need to look for the location header.
+                var location = crawledPage.HttpWebResponse.Headers["Location"];
+
+                if (!Uri.TryCreate(location, UriKind.Absolute, out locationUri))
+                {
+                    var site = crawledPage.Uri.Scheme + "://" + crawledPage.Uri.Host;
+                    location = site + location;
+                    locationUri = new Uri(location);
+                }
+            }
+            return locationUri;
+        }
+
         public virtual void Dispose()
         {
             if (_threadManager != null)
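The net effect of the new IsInternalUri helper is that candidate links are checked against both the redirected root and the original root. Below is a standalone sketch of that idea; it is not Abot's actual decision delegate, and the authority comparison is an assumption about what the default _isInternalDecisionMaker does:

    using System;

    static class InternalUriSketch
    {
        // Hypothetical stand-in for the default _isInternalDecisionMaker: same authority == internal.
        static bool SameAuthority(Uri candidate, Uri root)
        {
            return string.Equals(candidate.Authority, root.Authority, StringComparison.OrdinalIgnoreCase);
        }

        // Mirrors IsInternalUri: a link is internal if it matches either RootUri or OriginalRootUri.
        public static bool IsInternal(Uri uri, Uri rootUri, Uri originalRootUri)
        {
            return SameAuthority(uri, rootUri) || SameAuthority(uri, originalRootUri);
        }
    }

With rootUri = http://www.domain.com/ and originalRootUri = http://domain.com/, links to either host pass the check, which is consistent with the log message emitted by ValidateRootUriForRedirection.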

Abot/Poco/CrawlContext.cs

Lines changed: 6 additions & 0 deletions
@@ -21,6 +21,12 @@ public CrawlContext()
         /// </summary>
         public Uri RootUri { get; set; }
 
+        /// <summary>
+        /// The root URI of the crawl as it was originally specified. If the root URI was redirected to another URI,
+        /// the redirect target is stored in RootUri.
+        /// </summary>
+        public Uri OriginalRootUri { get; set; }
+
         /// <summary>
         /// total number of pages that have been crawled
         /// </summary>
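As a brief usage illustration of the new property (a sketch only; the CrawlContext instance is assumed to come from a finished crawl, e.g. CrawlResult.CrawlContext):

    using System;
    using Abot.Poco;

    static class RootRedirectReport
    {
        // Reports whether the root URI changed during the crawl because of a redirect.
        public static void Print(CrawlContext context)
        {
            bool rootWasRedirected = !context.RootUri.Equals(context.OriginalRootUri);
            if (rootWasRedirected)
                Console.WriteLine("Root redirected from " + context.OriginalRootUri + " to " + context.RootUri);
        }
    }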
