@@ -221,7 +221,7 @@ public virtual CrawlResult Crawl(Uri uri, CancellationTokenSource cancellationTo
221
221
if ( uri == null )
222
222
throw new ArgumentNullException ( "uri" ) ;
223
223
224
- _crawlContext . RootUri = uri ;
224
+ _crawlContext . RootUri = _crawlContext . OriginalRootUri = uri ;
225
225
226
226
if ( cancellationTokenSource != null )
227
227
_crawlContext . CancellationTokenSource = cancellationTokenSource ;
@@ -670,7 +670,11 @@ protected virtual void ProcessPage(PageToCrawl pageToCrawl)
670
670
//CrawledPage crawledPage = await CrawlThePage(pageToCrawl);
671
671
CrawledPage crawledPage = CrawlThePage ( pageToCrawl ) ;
672
672
673
- if ( IsRedirect ( crawledPage ) )
673
+ // Validate the root uri in case of a redirection.
674
+ if ( crawledPage . IsRoot )
675
+ ValidateRootUriForRedirection ( crawledPage ) ;
676
+
677
+ if ( IsRedirect ( crawledPage ) && ! _crawlContext . CrawlConfiguration . IsHttpRequestAutoRedirectsEnabled )
674
678
ProcessRedirect ( crawledPage ) ;
675
679
676
680
if ( PageSizeIsAboveMax ( crawledPage ) )
@@ -720,21 +724,12 @@ protected virtual void ProcessRedirect(CrawledPage crawledPage)
720
724
721
725
try
722
726
{
723
- var location = crawledPage . HttpWebResponse . Headers [ "Location" ] ;
724
-
725
- Uri locationUri ;
726
- if ( ! Uri . TryCreate ( location , UriKind . Absolute , out locationUri ) )
727
- {
728
- var site = crawledPage . Uri . Scheme + "://" + crawledPage . Uri . Host ;
729
- location = site + location ;
730
- }
731
-
732
- var uri = new Uri ( location ) ;
727
+ var uri = GetRedirectUri ( crawledPage ) ;
733
728
734
729
PageToCrawl page = new PageToCrawl ( uri ) ;
735
730
page . ParentUri = crawledPage . ParentUri ;
736
731
page . CrawlDepth = crawledPage . CrawlDepth ;
737
- page . IsInternal = _isInternalDecisionMaker ( uri , _crawlContext . RootUri ) ;
732
+ page . IsInternal = IsInternalUri ( uri ) ;
738
733
page . IsRoot = false ;
739
734
page . RedirectedFrom = crawledPage ;
740
735
page . RedirectPosition = crawledPage . RedirectPosition + 1 ;
@@ -751,14 +746,26 @@ protected virtual void ProcessRedirect(CrawledPage crawledPage)
751
746
catch { }
752
747
}
753
748
749
/// <summary>
/// Decides whether a uri counts as internal to the crawl by checking it against both the
/// current root uri and the original root uri held by the crawl context.
/// </summary>
/// <param name="uri">The uri to classify.</param>
/// <returns>
/// True when the internal decision maker accepts the uri for either of the two root uris.
/// </returns>
protected virtual bool IsInternalUri(Uri uri)
{
    // The original root uri is only consulted when the current root uri does not match.
    if (_isInternalDecisionMaker(uri, _crawlContext.RootUri))
    {
        return true;
    }

    return _isInternalDecisionMaker(uri, _crawlContext.OriginalRootUri);
}
754
+
754
755
/// <summary>
/// Tells whether the response of the crawled page represents a redirect.
/// </summary>
/// <remarks>
/// With auto redirects enabled, a page counts as redirected when the response carries a
/// ResponseUri whose absolute uri differs from the absolute uri of the crawled page.
/// With auto redirects disabled, a page counts as redirected when the response status code
/// is in the 300-399 range. A page without a response is never a redirect.
/// </remarks>
/// <param name="crawledPage">The crawled page to inspect.</param>
/// <returns>True when the page was redirected, false otherwise.</returns>
protected virtual bool IsRedirect(CrawledPage crawledPage)
{
    var response = crawledPage.HttpWebResponse;
    if (response == null)
    {
        return false;
    }

    if (_crawlContext.CrawlConfiguration.IsHttpRequestAutoRedirectsEnabled)
    {
        // The redirect was already followed; detect it by comparing requested and final uris.
        var finalUri = response.ResponseUri;
        return finalUri != null && finalUri.AbsoluteUri != crawledPage.Uri.AbsoluteUri;
    }

    // Redirects are not followed automatically; rely on the HTTP status code.
    int statusCode = (int)response.StatusCode;
    return statusCode >= 300 && statusCode <= 399;
}
761
-
768
+
762
769
protected virtual void ThrowIfCancellationRequested ( )
763
770
{
764
771
if ( _crawlContext . CancellationTokenSource != null && _crawlContext . CancellationTokenSource . IsCancellationRequested )
@@ -927,7 +934,7 @@ protected virtual void SchedulePageLinks(CrawledPage crawledPage)
927
934
PageToCrawl page = new PageToCrawl ( uri ) ;
928
935
page . ParentUri = crawledPage . Uri ;
929
936
page . CrawlDepth = crawledPage . CrawlDepth + 1 ;
930
- page . IsInternal = _isInternalDecisionMaker ( uri , _crawlContext . RootUri ) ;
937
+ page . IsInternal = IsInternalUri ( uri ) ;
931
938
page . IsRoot = false ;
932
939
933
940
if ( ShouldSchedulePageLink ( page ) )
@@ -1024,6 +1031,53 @@ protected virtual void WaitMinimumRetryDelay(PageToCrawl pageToCrawl)
1024
1031
Thread . Sleep ( TimeSpan . FromMilliseconds ( milliToWait ) ) ;
1025
1032
}
1026
1033
1034
/// <summary>
/// Checks whether the root page was redirected. When it was, the root uri of the crawl context
/// is replaced by the uri the root page was redirected to, and the change is logged.
/// </summary>
/// <param name="crawledRootPage">The crawled page to check; its IsRoot flag must be set.</param>
/// <exception cref="ArgumentException">Thrown when the page is not flagged as the root page.</exception>
protected virtual void ValidateRootUriForRedirection(CrawledPage crawledRootPage)
{
    if (!crawledRootPage.IsRoot)
    {
        throw new ArgumentException("The crawled page must be the root page to be validated for redirection.");
    }

    if (!IsRedirect(crawledRootPage))
    {
        return;
    }

    _crawlContext.RootUri = GetRedirectUri(crawledRootPage);

    // Read both roots back from the context so the log reflects what is actually stored there.
    Uri originalRoot = _crawlContext.OriginalRootUri;
    Uri currentRoot = _crawlContext.RootUri;
    _logger.InfoFormat("The root URI [{0}] was redirected to [{1}]. Pages from domains [{2}] and [{3}] will be considered internal.",
        originalRoot,
        currentRoot,
        currentRoot.Authority,
        originalRoot.Authority);
}
1053
+
1054
/// <summary>
/// Retrieve the URI where the specified crawled page was redirected.
/// </summary>
/// <remarks>
/// If HTTP auto redirections is enabled, the value is taken from the response's ResponseUri property.
/// If auto redirections is disabled, the value is taken from the 'Location' header of the response.
/// A relative 'Location' value is resolved against the uri of the crawled page, so the port of the
/// site is kept and path-relative, query-only and protocol-relative references resolve correctly.
/// When the header is missing, empty or cannot be resolved, the root of the crawled page's site
/// (scheme and authority, including any non-default port) is returned.
/// </remarks>
/// <param name="crawledPage">The crawled page whose response describes the redirect.</param>
/// <returns>The uri the page was redirected to.</returns>
protected virtual Uri GetRedirectUri(CrawledPage crawledPage)
{
    if (_crawlContext.CrawlConfiguration.IsHttpRequestAutoRedirectsEnabled)
    {
        // For auto redirects, the final uri is exposed by the response itself.
        return crawledPage.HttpWebResponse.ResponseUri;
    }

    // For manual redirects, we need to look for the location header.
    var location = crawledPage.HttpWebResponse.Headers["Location"];

    Uri locationUri;
    if (Uri.TryCreate(location, UriKind.Absolute, out locationUri))
    {
        return locationUri;
    }

    // Relative reference: resolve it against the requested uri instead of gluing strings together,
    // which would lose the port and break values that do not start with a slash.
    if (!string.IsNullOrEmpty(location) && Uri.TryCreate(crawledPage.Uri, location, out locationUri))
    {
        return locationUri;
    }

    // No usable Location header: fall back to the root of the site that answered the request.
    return new Uri(crawledPage.Uri.GetLeftPart(UriPartial.Authority));
}
1080
+
1027
1081
public virtual void Dispose ( )
1028
1082
{
1029
1083
if ( _threadManager != null )
0 commit comments