Skip to content

Commit 58eb440

Browse files
committed
Moved the cleanURLFunc to the base class, where it can clean the full absolute link; fixes sjdirect#101
1 parent b4c8b2f commit 58eb440

File tree

4 files changed

+14
-13
lines changed

4 files changed

+14
-13
lines changed

Abot.Tests.Unit/Core/HyperlinkParserTest.cs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -427,8 +427,8 @@ public void GetLinks_CleanUrlDelegateSet_ReturnsCleanLinks()
427427

428428
Assert.IsNotNull(result);
429429
Assert.AreEqual(2, result.Count());
430-
Assert.AreEqual("http://a.com/xxx/x.html", result.ElementAt(0).AbsoluteUri);
431-
Assert.AreEqual("http://a.com/yyy/y.html", result.ElementAt(1).AbsoluteUri);
430+
Assert.AreEqual("http://x.com/xxx/x.html", result.ElementAt(0).AbsoluteUri);
431+
Assert.AreEqual("http://x.com/yyy/y.html", result.ElementAt(1).AbsoluteUri);
432432
}
433433

434434
[Test] //https://github.com/sjdirect/abot/issues/15

Abot/Core/CsQueryHyperLinkParser.cs

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,6 @@ namespace Abot.Core
1212
[Serializable]
1313
public class CSQueryHyperlinkParser : HyperLinkParser
1414
{
15-
Func<string, string> _cleanURLFunc;
1615
bool _isRespectAnchorRelNoFollowEnabled;
1716

1817
public CSQueryHyperlinkParser()
@@ -30,10 +29,9 @@ public CSQueryHyperlinkParser( bool isRespectMetaRobotsNoFollowEnabled,
3029
bool isRespectAnchorRelNoFollowEnabled,
3130
Func<string, string> cleanURLFunc = null,
3231
bool isRespectUrlNamedAnchorOrHashbangEnabled = false)
33-
: base(isRespectMetaRobotsNoFollowEnabled, isRespectUrlNamedAnchorOrHashbangEnabled)
32+
: base(isRespectMetaRobotsNoFollowEnabled, isRespectUrlNamedAnchorOrHashbangEnabled, cleanURLFunc)
3433
{
3534
_isRespectAnchorRelNoFollowEnabled = isRespectAnchorRelNoFollowEnabled;
36-
_cleanURLFunc = cleanURLFunc;
3735
}
3836

3937
protected override string ParserType
@@ -49,7 +47,7 @@ protected override IEnumerable<string> GetHrefValues(CrawledPage crawledPage)
4947
IEnumerable<string> hrefValues = crawledPage.CsQueryDocument.Select("a, area")
5048
.Elements
5149
.Where(e => !HasRelNoFollow(e))
52-
.Select(y => _cleanURLFunc != null ? _cleanURLFunc(y.GetAttribute("href")) : y.GetAttribute("href"))
50+
.Select(y => y.GetAttribute("href"))
5351
.Where(a => !string.IsNullOrWhiteSpace(a));
5452

5553
IEnumerable<string> canonicalHref = crawledPage.CsQueryDocument.

Abot/Core/HapHyperLinkParser.cs

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,6 @@ namespace Abot.Core
1212
[Serializable]
1313
public class HapHyperLinkParser : HyperLinkParser
1414
{
15-
Func<string, string> _cleanURLFunc;
1615
bool _isRespectAnchorRelNoFollowEnabled;
1716

1817
protected override string ParserType
@@ -36,10 +35,9 @@ public HapHyperLinkParser(bool isRespectMetaRobotsNoFollowEnabled,
3635
bool isRespectAnchorRelNoFollowEnabled,
3736
Func<string, string> cleanURLFunc = null,
3837
bool isRespectUrlNamedAnchorOrHashbangEnabled = false)
39-
:base(isRespectMetaRobotsNoFollowEnabled, isRespectUrlNamedAnchorOrHashbangEnabled)
38+
:base(isRespectMetaRobotsNoFollowEnabled, isRespectUrlNamedAnchorOrHashbangEnabled, cleanURLFunc)
4039
{
4140
_isRespectAnchorRelNoFollowEnabled = isRespectAnchorRelNoFollowEnabled;
42-
_cleanURLFunc = cleanURLFunc;
4341
}
4442

4543
protected override IEnumerable<string> GetHrefValues(CrawledPage crawledPage)
@@ -94,7 +92,7 @@ protected virtual List<string> GetLinks(HtmlNodeCollection nodes)
9492
if (HasRelNoFollow(node))
9593
continue;
9694

97-
hrefValue = _cleanURLFunc != null ? _cleanURLFunc(node.Attributes["href"].Value) : node.Attributes["href"].Value;
95+
hrefValue = node.Attributes["href"].Value;
9896
if (!string.IsNullOrWhiteSpace(hrefValue))
9997
{
10098
hrefValue = DeEntitize(hrefValue);

Abot/Core/HyperLinkParser.cs

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -23,18 +23,20 @@ public abstract class HyperLinkParser : IHyperLinkParser
2323
{
2424
protected ILog _logger = LogManager.GetLogger("AbotLogger");
2525
protected bool _isRespectMetaRobotsNoFollowEnabled;
26-
protected bool _isRespectUrlNamedAnchorOrHashbangEnabled;
26+
protected bool _isRespectUrlNamedAnchorOrHashbangEnabled;
27+
protected Func<string, string> _cleanURLFunc;
2728

2829
public HyperLinkParser()
29-
:this(false, true)
30+
:this(false, true, null)
3031
{
3132

3233
}
3334

34-
public HyperLinkParser(bool isRespectMetaRobotsNoFollowEnabled, bool isRespectUrlNamedAnchorOrHashbangEnabled = false)
35+
public HyperLinkParser(bool isRespectMetaRobotsNoFollowEnabled, bool isRespectUrlNamedAnchorOrHashbangEnabled, Func<string, string> cleanURLFunc)
3536
{
3637
_isRespectMetaRobotsNoFollowEnabled = isRespectMetaRobotsNoFollowEnabled;
3738
_isRespectUrlNamedAnchorOrHashbangEnabled = isRespectUrlNamedAnchorOrHashbangEnabled;
39+
_cleanURLFunc = cleanURLFunc;
3840
}
3941

4042
/// <summary>
@@ -105,6 +107,9 @@ protected virtual List<Uri> GetUris(CrawledPage crawledPage, IEnumerable<string>
105107
: hrefValue.Split('#')[0];
106108
Uri newUri = new Uri(uriToUse, href);
107109

110+
if (_cleanURLFunc != null)
111+
newUri = new Uri(_cleanURLFunc(newUri.AbsoluteUri));
112+
108113
if (!uris.Exists(u => u.AbsoluteUri == newUri.AbsoluteUri))
109114
uris.Add(newUri);
110115
}

0 commit comments

Comments
 (0)