Skip to content

Commit eb719d8

Browse files
committed
A few modifications/additions to the prior push (merged pull request). Added IsRespectUrlNamedAnchorOrHashbangEnabled as a config value and negated the _removeUrlFragment logic. Added a constructor parameter to support IsRespectUrlNamedAnchorOrHashbangEnabled in CsQueryHyperLinkParser.cs, for parity with HapHyperLinkParser.cs. Added a few unit tests to the HyperlinkParserTest.cs file.
1 parent 4e8fea0 commit eb719d8

14 files changed

+71
-21
lines changed

Abot.Demo/App.config

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,7 @@
5353
isHttpRequestAutomaticDecompressionEnabled="false"
5454
isSendingCookiesEnabled="false"
5555
isSslCertificateValidationEnabled="false"
56+
isRespectUrlNamedAnchorOrHashbangEnabled="false"
5657
minAvailableMemoryRequiredInMb="0"
5758
maxMemoryUsageInMb="0"
5859
maxMemoryUsageCacheTimeInSeconds="0"

Abot.Tests.Integration/App.config

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,7 @@
6464
isHttpRequestAutomaticDecompressionEnabled="false"
6565
isSendingCookiesEnabled="false"
6666
isSslCertificateValidationEnabled="false"
67+
isRespectUrlNamedAnchorOrHashbangEnabled="false"
6768
minAvailableMemoryRequiredInMb="0"
6869
maxMemoryUsageInMb="0"
6970
maxMemoryUsageCacheTimeInSeconds="0"

Abot.Tests.Unit/App.config

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -63,7 +63,8 @@
6363
isHttpRequestAutoRedirectsEnabled="true"
6464
isHttpRequestAutomaticDecompressionEnabled="true"
6565
isSendingCookiesEnabled="true"
66-
isSslCertificateValidationEnabled="false"
66+
isSslCertificateValidationEnabled="false"
67+
isRespectUrlNamedAnchorOrHashbangEnabled="true"
6768
minAvailableMemoryRequiredInMb="25"
6869
maxMemoryUsageInMb="26"
6970
maxMemoryUsageCacheTimeInSeconds="27"

Abot.Tests.Unit/Core/AbotConfigurationSectionHandlerTest.cs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@ public void GetSetion_FillsConfigValuesFromAppConfigFile()
3030
Assert.AreEqual(true, _uut.CrawlBehavior.IsHttpRequestAutomaticDecompressionEnabled);
3131
Assert.AreEqual(true, _uut.CrawlBehavior.IsSendingCookiesEnabled);
3232
Assert.AreEqual(false, _uut.CrawlBehavior.IsSslCertificateValidationEnabled);
33+
Assert.AreEqual(true, _uut.CrawlBehavior.IsRespectUrlNamedAnchorOrHashbangEnabled);
3334
Assert.AreEqual(25, _uut.CrawlBehavior.MinAvailableMemoryRequiredInMb);
3435
Assert.AreEqual(26, _uut.CrawlBehavior.MaxMemoryUsageInMb);
3536
Assert.AreEqual(27, _uut.CrawlBehavior.MaxMemoryUsageCacheTimeInSeconds);
@@ -77,6 +78,7 @@ public void Convert_CovertsFromSectionObjectToDtoObject()
7778
Assert.AreEqual(true, _uut.CrawlBehavior.IsHttpRequestAutomaticDecompressionEnabled);
7879
Assert.AreEqual(true, _uut.CrawlBehavior.IsSendingCookiesEnabled);
7980
Assert.AreEqual(false, _uut.CrawlBehavior.IsSslCertificateValidationEnabled);
81+
Assert.AreEqual(true, _uut.CrawlBehavior.IsRespectUrlNamedAnchorOrHashbangEnabled);
8082
Assert.AreEqual(result.MinAvailableMemoryRequiredInMb, _uut.CrawlBehavior.MinAvailableMemoryRequiredInMb);
8183
Assert.AreEqual(result.MaxMemoryUsageInMb, _uut.CrawlBehavior.MaxMemoryUsageInMb);
8284
Assert.AreEqual(result.MaxMemoryUsageCacheTimeInSeconds, _uut.CrawlBehavior.MaxMemoryUsageCacheTimeInSeconds);

Abot.Tests.Unit/Core/CsQueryHyperLinkParserTest.cs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,9 +7,9 @@ namespace Abot.Tests.Unit.Core
77
[TestFixture]
88
public class CsQueryHyperLinkParserTest : HyperLinkParserTest
99
{
10-
protected override HyperLinkParser GetInstance(bool isRespectMetaRobotsNoFollowEnabled, bool isRespectAnchorRelNoFollowEnabled, Func<string, string> cleanUrlDelegate = null)
10+
protected override HyperLinkParser GetInstance(bool isRespectMetaRobotsNoFollowEnabled, bool isRespectAnchorRelNoFollowEnabled, Func<string, string> cleanUrlDelegate = null, bool isRespectUrlNamedAnchorOrHashbangEnabled = false)
1111
{
12-
return new CSQueryHyperlinkParser(isRespectMetaRobotsNoFollowEnabled, isRespectAnchorRelNoFollowEnabled, cleanUrlDelegate);
12+
return new CSQueryHyperlinkParser(isRespectMetaRobotsNoFollowEnabled, isRespectAnchorRelNoFollowEnabled, cleanUrlDelegate, isRespectUrlNamedAnchorOrHashbangEnabled);
1313
}
1414

1515
[Test]

Abot.Tests.Unit/Core/HapHyperLinkParserTest.cs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,9 +7,9 @@ namespace Abot.Tests.Unit.Core
77
[TestFixture]
88
public class HapHyperLinkParserTest : HyperLinkParserTest
99
{
10-
protected override HyperLinkParser GetInstance(bool isRespectMetaRobotsNoFollowEnabled, bool isRespectAnchorRelNoFollowEnabled, Func<string, string> cleanUrlDelegate = null)
10+
protected override HyperLinkParser GetInstance(bool isRespectMetaRobotsNoFollowEnabled, bool isRespectAnchorRelNoFollowEnabled, Func<string, string> cleanUrlDelegate = null, bool isRespectUrlNamedAnchorOrHashbangEnabled = false)
1111
{
12-
return new HapHyperLinkParser(isRespectMetaRobotsNoFollowEnabled, isRespectAnchorRelNoFollowEnabled, cleanUrlDelegate);
12+
return new HapHyperLinkParser(isRespectMetaRobotsNoFollowEnabled, isRespectAnchorRelNoFollowEnabled, cleanUrlDelegate, isRespectUrlNamedAnchorOrHashbangEnabled);
1313
}
1414

1515
[Test]

Abot.Tests.Unit/Core/HyperlinkParserTest.cs

Lines changed: 21 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -16,13 +16,13 @@ public abstract class HyperLinkParserTest
1616
Uri _uri = new Uri("http://a.com/");
1717
CrawledPage _crawledPage;
1818

19-
protected abstract HyperLinkParser GetInstance(bool isRespectMetaRobotsNoFollowEnabled, bool isRespectAnchorRelNoFollowEnabled, Func<string, string> cleanUrlDelegate = null);
19+
protected abstract HyperLinkParser GetInstance(bool isRespectMetaRobotsNoFollowEnabled, bool isRespectAnchorRelNoFollowEnabled, Func<string, string> cleanUrlDelegate = null, bool isRespectUrlNamedAnchorOrHashbangEnabled = false);
2020

2121
[SetUp]
2222
public void Setup()
2323
{
2424
_crawledPage = new CrawledPage(_uri){ HttpWebRequest = (HttpWebRequest)WebRequest.Create(_uri) };
25-
_unitUnderTest = GetInstance(false, false);
25+
_unitUnderTest = GetInstance(false, false, null, false);
2626
}
2727

2828
[Test]
@@ -189,14 +189,30 @@ public void GetLinks_DuplicateLinks_ReturnsOnlyOne()
189189
}
190190

191191
[Test]
192-
public void GetLinks_NamedAnchors_Ignores()
192+
public void GetLinks_NamedAnchorsOrHashbangs_Ignores()
193193
{
194-
_crawledPage.Content.Text = "<a href=\"/aaa/a.html\" ></a><a href=\"/aaa/a.html#top\" ></a><a href=\"/aaa/a.html#bottom\" /></a>";
194+
_crawledPage.Content.Text = "<a href=\"/aaa/a.html\" ></a><a href=\"/aaa/a.html#top\" ></a><a href=\"/aaa/a.html#bottom\" /></a><a href=\"/aaa/a.html/#someaction/someid\" /></a>";
195195

196196
IEnumerable<Uri> result = _unitUnderTest.GetLinks(_crawledPage);
197197

198-
Assert.AreEqual(1, result.Count());
198+
Assert.AreEqual(2, result.Count());
199+
Assert.AreEqual("http://a.com/aaa/a.html", result.ElementAt(0).AbsoluteUri);
200+
Assert.AreEqual("http://a.com/aaa/a.html/", result.ElementAt(1).AbsoluteUri);
201+
}
202+
203+
[Test]
204+
public void GetLinks_NamedAnchorsOrHashbangs_Enabled_ReturnsLinks()
205+
{
206+
_unitUnderTest = GetInstance(false, false, null, true);
207+
_crawledPage.Content.Text = "<a href=\"/aaa/a.html\" ></a><a href=\"/aaa/a.html#top\" ></a><a href=\"/aaa/a.html#bottom\" /></a><a href=\"/aaa/a.html/#someaction/someid\" /></a>";
208+
209+
IEnumerable<Uri> result = _unitUnderTest.GetLinks(_crawledPage);
210+
211+
Assert.AreEqual(4, result.Count());
199212
Assert.AreEqual("http://a.com/aaa/a.html", result.ElementAt(0).AbsoluteUri);
213+
Assert.AreEqual("http://a.com/aaa/a.html#top", result.ElementAt(1).AbsoluteUri);
214+
Assert.AreEqual("http://a.com/aaa/a.html#bottom", result.ElementAt(2).AbsoluteUri);
215+
Assert.AreEqual("http://a.com/aaa/a.html/#someaction/someid", result.ElementAt(3).AbsoluteUri);
200216
}
201217

202218
[Test]

Abot.Tests.Unit/Poco/CrawlConfigurationTest.cs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@ public void Constructor_ValidUri_CreatesInstance()
3636
Assert.AreEqual(false, unitUnderTest.IsHttpRequestAutomaticDecompressionEnabled);
3737
Assert.AreEqual(false, unitUnderTest.IsSendingCookiesEnabled);
3838
Assert.AreEqual(true, unitUnderTest.IsSslCertificateValidationEnabled);
39+
Assert.AreEqual(false, unitUnderTest.IsRespectUrlNamedAnchorOrHashbangEnabled);
3940
Assert.AreEqual(0, unitUnderTest.MaxMemoryUsageCacheTimeInSeconds);
4041
Assert.AreEqual(0, unitUnderTest.MaxMemoryUsageInMb);
4142
Assert.AreEqual(0, unitUnderTest.MinAvailableMemoryRequiredInMb);

Abot/Core/AbotConfigurationSectionHandler.cs

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -242,6 +242,12 @@ public bool IsSendingCookiesEnabled
242242
get { return (bool)this["isSendingCookiesEnabled"]; }
243243
}
244244

245+
[ConfigurationProperty("isRespectUrlNamedAnchorOrHashbangEnabled", IsRequired = false)]
246+
public bool IsRespectUrlNamedAnchorOrHashbangEnabled
247+
{
248+
get { return (bool)this["isRespectUrlNamedAnchorOrHashbangEnabled"]; }
249+
}
250+
245251
[ConfigurationProperty("minAvailableMemoryRequiredInMb", IsRequired = false)]
246252
public int MinAvailableMemoryRequiredInMb
247253
{

Abot/Core/CsQueryHyperLinkParser.cs

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -19,8 +19,18 @@ public CSQueryHyperlinkParser()
1919
{
2020
}
2121

22-
public CSQueryHyperlinkParser(bool isRespectMetaRobotsNoFollowEnabled, bool isRespectAnchorRelNoFollowEnabled, Func<string, string> cleanURLFunc = null)
23-
: base(isRespectMetaRobotsNoFollowEnabled)
22+
/// <summary>
23+
/// Constructor
24+
/// </summary>
25+
/// <param name="isRespectMetaRobotsNoFollowEnabled">Whether parser should ignore pages with meta no robots</param>
26+
/// <param name="isRespectAnchorRelNoFollowEnabled">Whether parser should ignore links with rel no follow</param>
27+
/// <param name="cleanURLFunc">Function to clean the url</param>
28+
/// <param name="isRespectUrlNamedAnchorOrHashbangEnabled">Whether parser should consider named anchor and/or hashbang '#' character as part of the url</param>
29+
public CSQueryHyperlinkParser( bool isRespectMetaRobotsNoFollowEnabled,
30+
bool isRespectAnchorRelNoFollowEnabled,
31+
Func<string, string> cleanURLFunc = null,
32+
bool isRespectUrlNamedAnchorOrHashbangEnabled = false)
33+
: base(isRespectMetaRobotsNoFollowEnabled, isRespectUrlNamedAnchorOrHashbangEnabled)
2434
{
2535
_isRespectAnchorRelNoFollowEnabled = isRespectAnchorRelNoFollowEnabled;
2636
_cleanURLFunc = cleanURLFunc;

0 commit comments

Comments
 (0)