
Commit 4e8fea0

Merge pull request sjdirect#88 from Coveo/disposablesAndTweaks

Disposables, url fragment and tweaks

2 parents: 5466620 + 369d6fb

10 files changed: 211 additions, 59 deletions

Abot.Tests.Unit/Crawler/WebCrawlerTest.cs

Lines changed: 44 additions & 0 deletions
@@ -1151,6 +1151,50 @@ public void Crawl_CanExtractRetryAfterDateFromHeaders()
         _fakeCrawlDecisionMaker.Verify(f => f.ShouldRecrawlPage(It.IsAny<CrawledPage>(), It.IsAny<CrawlContext>()), Times.Exactly(2));
     }
 
+    [Test]
+    public void Crawl_ChangeRootUriIfRedirected()
+    {
+        _fakeCrawlDecisionMaker.Setup(f => f.ShouldCrawlPage(It.IsAny<PageToCrawl>(), It.IsAny<CrawlContext>())).Returns(new CrawlDecision { Allow = true });
+        _dummyConfiguration.IsHttpRequestAutoRedirectsEnabled = false;
+
+        // Set up a root page that was redirected.
+        Uri redirectedUri = new Uri("http://www.domain.com/");
+        CrawledPage page = new CrawledPage(_rootUri) {
+            WebException = new WebException(),
+            HttpWebResponse = new HttpWebResponseWrapper {
+                StatusCode = HttpStatusCode.Redirect,
+                Headers = new WebHeaderCollection { { "Location", redirectedUri.AbsoluteUri } }
+            }
+        };
+        _fakeHttpRequester.Setup(f => f.MakeRequest(It.IsAny<Uri>(), It.IsAny<Func<CrawledPage, CrawlDecision>>())).Returns(page);
+
+        CrawlResult result = _unitUnderTest.Crawl(_rootUri);
+        Assert.That(result.CrawlContext.RootUri.AbsoluteUri, Is.EqualTo(redirectedUri.AbsoluteUri));
+        Assert.That(result.CrawlContext.OriginalRootUri, Is.EqualTo(_rootUri));
+    }
+
+    [Test]
+    public void Crawl_ChangeRootUriIfRedirectedAutomatically()
+    {
+        _fakeCrawlDecisionMaker.Setup(f => f.ShouldCrawlPage(It.IsAny<PageToCrawl>(), It.IsAny<CrawlContext>())).Returns(new CrawlDecision { Allow = true });
+        _dummyConfiguration.IsHttpRequestAutoRedirectsEnabled = true;
+
+        // Set up a root page that was redirected.
+        Uri redirectedUri = new Uri("http://www.domain.com/");
+        CrawledPage page = new CrawledPage(_rootUri) {
+            WebException = new WebException(),
+            HttpWebResponse = new HttpWebResponseWrapper {
+                StatusCode = HttpStatusCode.OK,
+                ResponseUri = redirectedUri
+            }
+        };
+        _fakeHttpRequester.Setup(f => f.MakeRequest(It.IsAny<Uri>(), It.IsAny<Func<CrawledPage, CrawlDecision>>())).Returns(page);
+
+        CrawlResult result = _unitUnderTest.Crawl(_rootUri);
+        Assert.That(result.CrawlContext.RootUri.AbsoluteUri, Is.EqualTo(redirectedUri.AbsoluteUri));
+        Assert.That(result.CrawlContext.OriginalRootUri, Is.EqualTo(_rootUri));
+    }
+
     private void ThrowExceptionWhen_PageCrawlStarting(object sender, PageCrawlStartingArgs e)
     {
         throw new Exception("no!!!");
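
These two tests pin down the new redirect behavior: whether a redirect is handled manually (via the Location header) or automatically (via ResponseUri), the crawl context ends up with the post-redirect root while preserving the address originally requested. A minimal consumer-side sketch of reading both values, assuming the standard PoliteWebCrawler entry point (the URL is illustrative):

    using System;
    using Abot.Crawler;
    using Abot.Poco;

    class RedirectedRootExample
    {
        static void Main()
        {
            var crawler = new PoliteWebCrawler();
            CrawlResult result = crawler.Crawl(new Uri("http://www.domain.com/"));

            // RootUri reflects the root after any redirect; OriginalRootUri
            // (introduced by this change) keeps the address that was requested.
            Console.WriteLine("Crawled root:   " + result.CrawlContext.RootUri);
            Console.WriteLine("Requested root: " + result.CrawlContext.OriginalRootUri);
        }
    }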

Abot/Core/CrawlDecisionMaker.cs

Lines changed: 1 addition & 1 deletion
@@ -135,7 +135,7 @@ public virtual CrawlDecision ShouldDownloadPageContent(CrawledPage crawledPage,
         return new CrawlDecision { Allow = true };
     }
 
-    public CrawlDecision ShouldRecrawlPage(CrawledPage crawledPage, CrawlContext crawlContext)
+    public virtual CrawlDecision ShouldRecrawlPage(CrawledPage crawledPage, CrawlContext crawlContext)
    {
         if (crawledPage == null)
             return new CrawlDecision { Allow = false, Reason = "Null crawled page" };
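
Making ShouldRecrawlPage virtual brings it in line with the other decision methods (ShouldDownloadPageContent above is already virtual) and opens it up to overriding. A hypothetical subclass, just to show the extension point this enables:

    // Hypothetical subclass; the base class and signature come from the diff above.
    public class NoRecrawlDecisionMaker : CrawlDecisionMaker
    {
        public override CrawlDecision ShouldRecrawlPage(CrawledPage crawledPage, CrawlContext crawlContext)
        {
            // Never schedule a retry, regardless of Retry-After headers.
            return new CrawlDecision { Allow = false, Reason = "Recrawl disabled" };
        }
    }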

Abot/Core/CsQueryHyperLinkParser.cs

Lines changed: 0 additions & 2 deletions
@@ -13,7 +13,6 @@ namespace Abot.Core
     public class CSQueryHyperlinkParser : HyperLinkParser
     {
         Func<string, string> _cleanURLFunc;
-        bool _isRespectMetaRobotsNoFollowEnabled;
         bool _isRespectAnchorRelNoFollowEnabled;
 
         public CSQueryHyperlinkParser()
@@ -23,7 +22,6 @@ public CSQueryHyperlinkParser()
         public CSQueryHyperlinkParser(bool isRespectMetaRobotsNoFollowEnabled, bool isRespectAnchorRelNoFollowEnabled, Func<string, string> cleanURLFunc = null)
             : base(isRespectMetaRobotsNoFollowEnabled)
         {
-            _isRespectMetaRobotsNoFollowEnabled = isRespectMetaRobotsNoFollowEnabled;
             _isRespectAnchorRelNoFollowEnabled = isRespectAnchorRelNoFollowEnabled;
             _cleanURLFunc = cleanURLFunc;
         }
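
The removed field duplicated state the parser already passes to its base constructor; the same cleanup appears in HapHyperLinkParser below, leaving the base HyperLinkParser (which now exposes the flag as a protected field) as the single owner of that state.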

Abot/Core/HapHyperLinkParser.cs

Lines changed: 5 additions & 4 deletions
@@ -13,7 +13,6 @@ namespace Abot.Core
     public class HapHyperLinkParser : HyperLinkParser
     {
         Func<string, string> _cleanURLFunc;
-        bool _isRespectMetaRobotsNoFollowEnabled;
         bool _isRespectAnchorRelNoFollowEnabled;
 
         protected override string ParserType
@@ -26,10 +25,12 @@ public HapHyperLinkParser()
         {
         }
 
-        public HapHyperLinkParser(bool isRespectMetaRobotsNoFollowEnabled, bool isRespectAnchorRelNoFollowEnabled, Func<string, string> cleanURLFunc = null)
-            :base(isRespectMetaRobotsNoFollowEnabled)
+        public HapHyperLinkParser(bool isRespectMetaRobotsNoFollowEnabled,
+            bool isRespectAnchorRelNoFollowEnabled,
+            Func<string, string> cleanURLFunc = null,
+            bool removeUrlFragment = true)
+            :base(isRespectMetaRobotsNoFollowEnabled, removeUrlFragment)
         {
-            _isRespectMetaRobotsNoFollowEnabled = isRespectMetaRobotsNoFollowEnabled;
             _isRespectAnchorRelNoFollowEnabled = isRespectAnchorRelNoFollowEnabled;
             _cleanURLFunc = cleanURLFunc;
         }
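
The new removeUrlFragment parameter defaults to true, so existing callers keep the long-standing fragment-stripping behavior; passing false is what opts a parser into keeping fragments. A hypothetical call site using named arguments:

    // Hypothetical construction; parameter names are from the constructor above.
    var parser = new HapHyperLinkParser(
        isRespectMetaRobotsNoFollowEnabled: false,
        isRespectAnchorRelNoFollowEnabled: false,
        cleanURLFunc: null,
        removeUrlFragment: false);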

Abot/Core/HyperLinkParser.cs

Lines changed: 13 additions & 7 deletions
@@ -22,17 +22,19 @@ public interface IHyperLinkParser
     public abstract class HyperLinkParser : IHyperLinkParser
     {
         protected ILog _logger = LogManager.GetLogger("AbotLogger");
-        protected bool IsRespectMetaRobotsNoFollowEnabled { get; set; }
+        protected bool _isRespectMetaRobotsNoFollowEnabled;
+        protected bool _removeUrlFragment;
 
         public HyperLinkParser()
-            :this(false)
+            :this(false, true)
         {
 
         }
 
-        public HyperLinkParser(bool isRespectMetaRobotsNoFollowEnabled)
+        public HyperLinkParser(bool isRespectMetaRobotsNoFollowEnabled, bool removeUrlFragment = true)
         {
-            IsRespectMetaRobotsNoFollowEnabled = isRespectMetaRobotsNoFollowEnabled;
+            _isRespectMetaRobotsNoFollowEnabled = isRespectMetaRobotsNoFollowEnabled;
+            _removeUrlFragment = removeUrlFragment;
         }
 
         /// <summary>
@@ -96,10 +98,14 @@ protected virtual List<Uri> GetUris(CrawledPage crawledPage, IEnumerable<string>
             {
                 try
                 {
-                    href = hrefValue.Split('#')[0];
+                    // Remove the url fragment part of the url if needed.
+                    // This is the part after the # and is often not useful.
+                    href = _removeUrlFragment
+                        ? hrefValue.Split('#')[0]
+                        : hrefValue;
                     Uri newUri = new Uri(uriToUse, href);
 
-                    if (!uris.Contains(newUri))
+                    if (!uris.Exists(u => u.AbsoluteUri == newUri.AbsoluteUri))
                         uris.Add(newUri);
                 }
                 catch (Exception e)
@@ -114,7 +120,7 @@ protected virtual List<Uri> GetUris(CrawledPage crawledPage, IEnumerable<string>
 
         protected virtual bool HasRobotsNoFollow(CrawledPage crawledPage)
         {
-            if (!IsRespectMetaRobotsNoFollowEnabled)
+            if (!_isRespectMetaRobotsNoFollowEnabled)
                 return false;
 
             string robotsMeta = robotsMeta = GetMetaRobotsValue(crawledPage);
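
Two behavior changes land here: fragment stripping becomes optional, and de-duplication compares AbsoluteUri strings instead of Uri instances (Uri equality ignores the fragment, so keeping fragments also requires the string comparison). A standalone sketch of the same technique, outside Abot's API:

    using System;
    using System.Collections.Generic;

    class FragmentExample
    {
        static void Main()
        {
            var uriToUse = new Uri("http://www.domain.com/docs/");
            var uris = new List<Uri>();
            bool removeUrlFragment = true;

            foreach (string hrefValue in new[] { "page.html#intro", "page.html#usage", "page.html" })
            {
                // Same expression as the parser: drop everything after '#' when enabled.
                string href = removeUrlFragment ? hrefValue.Split('#')[0] : hrefValue;
                Uri newUri = new Uri(uriToUse, href);

                // Compare absolute strings so two URLs differing only by
                // fragment stay distinct when fragments are kept.
                if (!uris.Exists(u => u.AbsoluteUri == newUri.AbsoluteUri))
                    uris.Add(newUri);
            }

            Console.WriteLine(uris.Count); // 1: all three hrefs collapse to page.html
        }
    }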

Abot/Core/PageRequester.cs

Lines changed: 14 additions & 4 deletions
@@ -9,7 +9,7 @@
 
 namespace Abot.Core
 {
-    public interface IPageRequester
+    public interface IPageRequester : IDisposable
     {
         /// <summary>
         /// Make an http web request to the url and download its content
@@ -34,7 +34,7 @@ public class PageRequester : IPageRequester
 
         protected CrawlConfiguration _config;
         protected IWebContentExtractor _extractor;
-        protected CookieContainer container = new CookieContainer();
+        protected CookieContainer _cookieContainer = new CookieContainer();
 
         public PageRequester(CrawlConfiguration config)
             : this(config, null)
@@ -211,7 +211,7 @@ protected virtual HttpWebRequest BuildRequestObject(Uri uri)
             request.Timeout = _config.HttpRequestTimeoutInSeconds * 1000;
 
             if (_config.IsSendingCookiesEnabled)
-                request.CookieContainer = container;
+                request.CookieContainer = _cookieContainer;
 
             if (_config.IsAlwaysLogin)
             {
@@ -227,8 +227,18 @@ protected virtual void ProcessResponseObject(HttpWebResponse response)
             if (response != null && _config.IsSendingCookiesEnabled)
             {
                 CookieCollection cookies = response.Cookies;
-                container.Add(cookies);
+                _cookieContainer.Add(cookies);
             }
         }
+
+        public void Dispose()
+        {
+            if (_extractor != null)
+            {
+                _extractor.Dispose();
+            }
+            _cookieContainer = null;
+            _config = null;
+        }
     }
 }
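
With IPageRequester now disposable, a caller can scope the requester, and through it the content extractor and cookie container, to a using block. A sketch assuming a default-populated CrawlConfiguration; MakeRequest(Uri) is the single-argument overload of the interface method documented above:

    using System;
    using Abot.Core;
    using Abot.Poco;

    class RequesterExample
    {
        static void Main()
        {
            var config = new CrawlConfiguration();

            using (IPageRequester requester = new PageRequester(config))
            {
                CrawledPage page = requester.MakeRequest(new Uri("http://www.domain.com/"));
                if (page.HttpWebResponse != null)
                    Console.WriteLine(page.HttpWebResponse.StatusCode);
            } // Dispose() runs here, releasing the extractor and dropping the cookie container.
        }
    }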

Abot/Core/Scheduler.cs

Lines changed: 13 additions & 1 deletion
@@ -7,7 +7,7 @@ namespace Abot.Core
     /// <summary>
     /// Handles managing the priority of what pages need to be crawled
     /// </summary>
-    public interface IScheduler
+    public interface IScheduler : IDisposable
     {
         /// <summary>
         /// Count of remaining items that are currently scheduled
@@ -93,5 +93,17 @@ public void Clear()
         {
             _pagesToCrawlRepo.Clear();
         }
+
+        public void Dispose()
+        {
+            if (_crawledUrlRepo != null)
+            {
+                _crawledUrlRepo.Dispose();
+            }
+            if (_pagesToCrawlRepo != null)
+            {
+                _pagesToCrawlRepo.Dispose();
+            }
+        }
     }
 }
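
One caveat worth noting: Dispose unconditionally disposes both repositories, so a repository instance injected into the scheduler is torn down with it; callers that share a repository across schedulers would need to account for that.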

Abot/Core/WebContentExtractor.cs

Lines changed: 21 additions & 20 deletions
@@ -9,7 +9,7 @@
 
 namespace Abot.Core
 {
-    public interface IWebContentExtractor
+    public interface IWebContentExtractor : IDisposable
     {
         PageContent GetContent(WebResponse response);
     }
@@ -19,15 +19,20 @@ public class WebContentExtractor : IWebContentExtractor
     {
         static ILog _logger = LogManager.GetLogger("AbotLogger");
 
-        public PageContent GetContent(WebResponse response)
+        public virtual PageContent GetContent(WebResponse response)
         {
             using (MemoryStream memoryStream = GetRawData(response))
             {
                 String charset = GetCharsetFromHeaders(response);
 
-                if (charset == null)
-                    charset = GetCharsetFromBody(memoryStream);
+                if (charset == null) {
+                    memoryStream.Seek(0, SeekOrigin.Begin);
 
+                    // Do not wrap in a using statement, to prevent closing of this stream.
+                    StreamReader srr = new StreamReader(memoryStream, Encoding.ASCII);
+                    String body = srr.ReadToEnd();
+                    charset = GetCharsetFromBody(body);
+                }
                 memoryStream.Seek(0, SeekOrigin.Begin);
 
                 Encoding e = GetEncoding(charset);
@@ -47,7 +52,7 @@ public PageContent GetContent(WebResponse response)
             }
         }
 
-        private string GetCharsetFromHeaders(WebResponse webResponse)
+        protected string GetCharsetFromHeaders(WebResponse webResponse)
         {
             string charset = null;
             String ctype = webResponse.Headers["content-type"];
@@ -60,21 +65,14 @@ private string GetCharsetFromHeaders(WebResponse webResponse)
             return charset;
         }
 
-        private string GetCharsetFromBody(MemoryStream rawdata)
+        protected string GetCharsetFromBody(string body)
         {
             String charset = null;
-
-            MemoryStream ms = rawdata;
-            ms.Seek(0, SeekOrigin.Begin);
-
-            //Do not wrapp in closing statement to prevent closing of this stream
-            StreamReader srr = new StreamReader(ms, Encoding.ASCII);
-            String meta = srr.ReadToEnd();
-
-            if (meta != null)
+
+            if (body != null)
             {
                 //find expression from : http://stackoverflow.com/questions/3458217/how-to-use-regular-expression-to-match-the-charset-string-in-html
-                Match match = Regex.Match(meta, @"<meta(?!\s*(?:name|value)\s*=)(?:[^>]*?content\s*=[\s""']*)?([^>]*?)[\s""';]*charset\s*=[\s""']*([^\s""'/>]*)", RegexOptions.IgnoreCase);
+                Match match = Regex.Match(body, @"<meta(?!\s*(?:name|value)\s*=)(?:[^>]*?content\s*=[\s""']*)?([^>]*?)[\s""';]*charset\s*=[\s""']*([^\s""'/>]*)", RegexOptions.IgnoreCase);
                 if (match.Success)
                 {
                     charset = string.IsNullOrWhiteSpace(match.Groups[2].Value) ? null : match.Groups[2].Value;
@@ -83,9 +81,8 @@ private string GetCharsetFromBody(MemoryStream rawdata)
 
             return charset;
         }
-
-
-        private Encoding GetEncoding(string charset)
+
+        protected Encoding GetEncoding(string charset)
         {
             Encoding e = Encoding.UTF8;
             if (charset != null)
@@ -125,6 +122,10 @@ private MemoryStream GetRawData(WebResponse webResponse)
 
             return rawData;
         }
-    }
 
+        public virtual void Dispose()
+        {
+            // Nothing to do.
+        }
+    }
 }
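
The charset sniffing is unchanged in substance, but having GetCharsetFromBody take a string (and making the helpers protected, with GetContent virtual) lets subclasses reuse or replace each step. A standalone check of the same regex against a made-up HTML snippet:

    using System;
    using System.Text.RegularExpressions;

    class CharsetExample
    {
        static void Main()
        {
            // Illustrative body; the pattern is the one used in GetCharsetFromBody.
            string body = "<html><head><meta http-equiv=\"Content-Type\" content=\"text/html; charset=iso-8859-1\"></head></html>";

            Match match = Regex.Match(body,
                @"<meta(?!\s*(?:name|value)\s*=)(?:[^>]*?content\s*=[\s""']*)?([^>]*?)[\s""';]*charset\s*=[\s""']*([^\s""'/>]*)",
                RegexOptions.IgnoreCase);

            Console.WriteLine(match.Success ? match.Groups[2].Value : "(none)"); // iso-8859-1
        }
    }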
