Skip to content

Commit bce5da7

Browse files
committed
Changed the WebContentExtractor method signatures to protected / virtual so the class can easily be inherited and built upon.
Added a new parameter to the HyperLinkParser that controls whether the fragment part of the URL is removed, instead of removing it every time.
1 parent d208b97 commit bce5da7

File tree

4 files changed

+34
-33
lines changed

4 files changed

+34
-33
lines changed

Abot/Core/CsQueryHyperLinkParser.cs

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,6 @@ namespace Abot.Core
1313
public class CSQueryHyperlinkParser : HyperLinkParser
1414
{
1515
Func<string, string> _cleanURLFunc;
16-
bool _isRespectMetaRobotsNoFollowEnabled;
1716
bool _isRespectAnchorRelNoFollowEnabled;
1817

1918
public CSQueryHyperlinkParser()
@@ -23,7 +22,6 @@ public CSQueryHyperlinkParser()
2322
public CSQueryHyperlinkParser(bool isRespectMetaRobotsNoFollowEnabled, bool isRespectAnchorRelNoFollowEnabled, Func<string, string> cleanURLFunc = null)
2423
: base(isRespectMetaRobotsNoFollowEnabled)
2524
{
26-
_isRespectMetaRobotsNoFollowEnabled = isRespectMetaRobotsNoFollowEnabled;
2725
_isRespectAnchorRelNoFollowEnabled = isRespectAnchorRelNoFollowEnabled;
2826
_cleanURLFunc = cleanURLFunc;
2927
}

Abot/Core/HapHyperLinkParser.cs

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,6 @@ namespace Abot.Core
1313
public class HapHyperLinkParser : HyperLinkParser
1414
{
1515
Func<string, string> _cleanURLFunc;
16-
bool _isRespectMetaRobotsNoFollowEnabled;
1716
bool _isRespectAnchorRelNoFollowEnabled;
1817

1918
protected override string ParserType
@@ -26,10 +25,12 @@ public HapHyperLinkParser()
2625
{
2726
}
2827

29-
public HapHyperLinkParser(bool isRespectMetaRobotsNoFollowEnabled, bool isRespectAnchorRelNoFollowEnabled, Func<string, string> cleanURLFunc = null)
30-
:base(isRespectMetaRobotsNoFollowEnabled)
28+
public HapHyperLinkParser(bool isRespectMetaRobotsNoFollowEnabled,
29+
bool isRespectAnchorRelNoFollowEnabled,
30+
Func<string, string> cleanURLFunc = null,
31+
bool removeUrlFragment = true)
32+
:base(isRespectMetaRobotsNoFollowEnabled, removeUrlFragment)
3133
{
32-
_isRespectMetaRobotsNoFollowEnabled = isRespectMetaRobotsNoFollowEnabled;
3334
_isRespectAnchorRelNoFollowEnabled = isRespectAnchorRelNoFollowEnabled;
3435
_cleanURLFunc = cleanURLFunc;
3536
}

Abot/Core/HyperLinkParser.cs

Lines changed: 13 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -22,17 +22,19 @@ public interface IHyperLinkParser
2222
public abstract class HyperLinkParser : IHyperLinkParser
2323
{
2424
protected ILog _logger = LogManager.GetLogger("AbotLogger");
25-
protected bool IsRespectMetaRobotsNoFollowEnabled { get; set; }
25+
protected bool _isRespectMetaRobotsNoFollowEnabled;
26+
protected bool _removeUrlFragment;
2627

2728
public HyperLinkParser()
28-
:this(false)
29+
:this(false, true)
2930
{
3031

3132
}
3233

33-
public HyperLinkParser(bool isRespectMetaRobotsNoFollowEnabled)
34+
public HyperLinkParser(bool isRespectMetaRobotsNoFollowEnabled, bool removeUrlFragment = true)
3435
{
35-
IsRespectMetaRobotsNoFollowEnabled = isRespectMetaRobotsNoFollowEnabled;
36+
_isRespectMetaRobotsNoFollowEnabled = isRespectMetaRobotsNoFollowEnabled;
37+
_removeUrlFragment = removeUrlFragment;
3638
}
3739

3840
/// <summary>
@@ -96,10 +98,14 @@ protected virtual List<Uri> GetUris(CrawledPage crawledPage, IEnumerable<string>
9698
{
9799
try
98100
{
99-
href = hrefValue.Split('#')[0];
101+
// Remove the url fragment part of the url if needed.
102+
// This is the part after the # and is often not useful.
103+
href = _removeUrlFragment
104+
? hrefValue.Split('#')[0]
105+
: hrefValue;
100106
Uri newUri = new Uri(uriToUse, href);
101107

102-
if (!uris.Contains(newUri))
108+
if (!uris.Exists(u => u.AbsoluteUri == newUri.AbsoluteUri))
103109
uris.Add(newUri);
104110
}
105111
catch (Exception e)
@@ -114,7 +120,7 @@ protected virtual List<Uri> GetUris(CrawledPage crawledPage, IEnumerable<string>
114120

115121
protected virtual bool HasRobotsNoFollow(CrawledPage crawledPage)
116122
{
117-
if (!IsRespectMetaRobotsNoFollowEnabled)
123+
if (!_isRespectMetaRobotsNoFollowEnabled)
118124
return false;
119125

120126
string robotsMeta = robotsMeta = GetMetaRobotsValue(crawledPage);

Abot/Core/WebContentExtractor.cs

Lines changed: 16 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -19,15 +19,20 @@ public class WebContentExtractor : IWebContentExtractor
1919
{
2020
static ILog _logger = LogManager.GetLogger("AbotLogger");
2121

22-
public PageContent GetContent(WebResponse response)
22+
public virtual PageContent GetContent(WebResponse response)
2323
{
2424
using (MemoryStream memoryStream = GetRawData(response))
2525
{
2626
String charset = GetCharsetFromHeaders(response);
2727

28-
if (charset == null)
29-
charset = GetCharsetFromBody(memoryStream);
28+
if (charset == null) {
29+
memoryStream.Seek(0, SeekOrigin.Begin);
3030

31+
// Do not wrap in closing statement to prevent closing of this stream.
32+
StreamReader srr = new StreamReader(memoryStream, Encoding.ASCII);
33+
String body = srr.ReadToEnd();
34+
charset = GetCharsetFromBody(body);
35+
}
3136
memoryStream.Seek(0, SeekOrigin.Begin);
3237

3338
Encoding e = GetEncoding(charset);
@@ -47,7 +52,7 @@ public PageContent GetContent(WebResponse response)
4752
}
4853
}
4954

50-
private string GetCharsetFromHeaders(WebResponse webResponse)
55+
protected string GetCharsetFromHeaders(WebResponse webResponse)
5156
{
5257
string charset = null;
5358
String ctype = webResponse.Headers["content-type"];
@@ -60,21 +65,14 @@ private string GetCharsetFromHeaders(WebResponse webResponse)
6065
return charset;
6166
}
6267

63-
private string GetCharsetFromBody(MemoryStream rawdata)
68+
protected string GetCharsetFromBody(string body)
6469
{
6570
String charset = null;
66-
67-
MemoryStream ms = rawdata;
68-
ms.Seek(0, SeekOrigin.Begin);
69-
70-
//Do not wrapp in closing statement to prevent closing of this stream
71-
StreamReader srr = new StreamReader(ms, Encoding.ASCII);
72-
String meta = srr.ReadToEnd();
73-
74-
if (meta != null)
71+
72+
if (body != null)
7573
{
7674
//find expression from : http://stackoverflow.com/questions/3458217/how-to-use-regular-expression-to-match-the-charset-string-in-html
77-
Match match = Regex.Match(meta, @"<meta(?!\s*(?:name|value)\s*=)(?:[^>]*?content\s*=[\s""']*)?([^>]*?)[\s""';]*charset\s*=[\s""']*([^\s""'/>]*)", RegexOptions.IgnoreCase);
75+
Match match = Regex.Match(body, @"<meta(?!\s*(?:name|value)\s*=)(?:[^>]*?content\s*=[\s""']*)?([^>]*?)[\s""';]*charset\s*=[\s""']*([^\s""'/>]*)", RegexOptions.IgnoreCase);
7876
if (match.Success)
7977
{
8078
charset = string.IsNullOrWhiteSpace(match.Groups[2].Value) ? null : match.Groups[2].Value;
@@ -83,9 +81,8 @@ private string GetCharsetFromBody(MemoryStream rawdata)
8381

8482
return charset;
8583
}
86-
87-
88-
private Encoding GetEncoding(string charset)
84+
85+
protected Encoding GetEncoding(string charset)
8986
{
9087
Encoding e = Encoding.UTF8;
9188
if (charset != null)
@@ -126,10 +123,9 @@ private MemoryStream GetRawData(WebResponse webResponse)
126123
return rawData;
127124
}
128125

129-
public void Dispose()
126+
public virtual void Dispose()
130127
{
131128
// Nothing to do
132129
}
133130
}
134-
135131
}

0 commit comments

Comments
 (0)