
Commit 56b75d4

Merge branch 'master' of github.com:sjdirect/abot
2 parents: 3bdff16 + 2bcd341

File tree

3 files changed: +38 -17 lines changed


Abot/Core/PageRequester.cs

Lines changed: 24 additions & 15 deletions
```diff
@@ -6,6 +6,7 @@
 using System.Net;
 using System.Reflection;
 using System.Threading.Tasks;
+using log4net.Core;
 
 namespace Abot.Core
 {
@@ -103,24 +104,32 @@ public virtual CrawledPage MakeRequest(Uri uri, Func<CrawledPage, CrawlDecision>
             }
             finally
             {
-                crawledPage.HttpWebRequest = request;
-                crawledPage.RequestCompleted = DateTime.Now;
-                if (response != null)
+                try
                 {
-                    crawledPage.HttpWebResponse = new HttpWebResponseWrapper(response);
-                    CrawlDecision shouldDownloadContentDecision = shouldDownloadContent(crawledPage);
-                    if (shouldDownloadContentDecision.Allow)
+                    crawledPage.HttpWebRequest = request;
+                    crawledPage.RequestCompleted = DateTime.Now;
+                    if (response != null)
                     {
-                        crawledPage.DownloadContentStarted = DateTime.Now;
-                        crawledPage.Content = _extractor.GetContent(response);
-                        crawledPage.DownloadContentCompleted = DateTime.Now;
+                        crawledPage.HttpWebResponse = new HttpWebResponseWrapper(response);
+                        CrawlDecision shouldDownloadContentDecision = shouldDownloadContent(crawledPage);
+                        if (shouldDownloadContentDecision.Allow)
+                        {
+                            crawledPage.DownloadContentStarted = DateTime.Now;
+                            crawledPage.Content = _extractor.GetContent(response);
+                            crawledPage.DownloadContentCompleted = DateTime.Now;
+                        }
+                        else
+                        {
+                            _logger.DebugFormat("Links on page [{0}] not crawled, [{1}]", crawledPage.Uri.AbsoluteUri, shouldDownloadContentDecision.Reason);
+                        }
+
+                        response.Close();//Should already be closed by _extractor but just being safe
                     }
-                    else
-                    {
-                        _logger.DebugFormat("Links on page [{0}] not crawled, [{1}]", crawledPage.Uri.AbsoluteUri, shouldDownloadContentDecision.Reason);
-                    }
-
-                    response.Close();//Should already be closed by _extractor but just being safe
+                }
+                catch (Exception e)
+                {
+                    _logger.DebugFormat("Error occurred finalizing requesting url [{0}]", uri.AbsoluteUri);
+                    _logger.Debug(e);
                 }
             }
 
```
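The substance of this hunk: the finalization work that used to run bare inside the `finally` block (recording the request, wrapping the response, extracting content, closing the response) is now wrapped in its own try/catch, so an error thrown during finalization is logged instead of escaping the `finally` block, where it would replace whatever exception the request itself had raised. Below is a minimal stand-alone sketch of that pattern, using simulated exceptions rather than Abot's real request and extraction code:

```csharp
using System;

// Minimal stand-alone sketch (not Abot's actual types) of the "guarded finally" pattern
// this hunk introduces: finalization work that may itself throw is wrapped in try/catch
// so it is logged rather than allowed to replace the original exception from the try block.
class FinallyGuardSketch
{
    static void Main()
    {
        try
        {
            MakeRequest(new Uri("http://example.com/"));
        }
        catch (Exception e)
        {
            // The original request failure still reaches the caller.
            Console.WriteLine("Request failed: " + e.Message);
        }
    }

    static void MakeRequest(Uri uri)
    {
        try
        {
            // Simulates the request itself failing.
            throw new InvalidOperationException("simulated request failure");
        }
        finally
        {
            try
            {
                // Simulates finalization work that could also throw
                // (wrapping the response, extracting content, closing streams, ...).
                throw new ApplicationException("simulated finalization failure");
            }
            catch (Exception e)
            {
                // Log and swallow so the exception above is not masked.
                Console.WriteLine("Error occurred finalizing request for [" + uri.AbsoluteUri + "]: " + e.Message);
            }
        }
    }
}
```

Run as-is, it logs the simulated finalization failure and still surfaces the original request failure to the caller, which is the behavior the guarded `finally` preserves.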

Abot/Poco/HttpWebResponseWrapper.cs

Lines changed: 13 additions & 1 deletion
```diff
@@ -39,14 +39,26 @@ public HttpWebResponseWrapper(HttpWebResponse response)
             this.Cookies = response.Cookies;
             this.IsFromCache = response.IsFromCache;
             this.IsMutuallyAuthenticated = response.IsMutuallyAuthenticated;
-            this.LastModified = response.LastModified;
+            this.LastModified = GetLastModified(response);
             this.Method = response.Method;
             this.ProtocolVersion = response.ProtocolVersion;
             this.ResponseUri = response.ResponseUri;
             this.Server = response.Server;
             this.StatusDescription = response.StatusDescription;
         }
 
+        private static DateTime GetLastModified(HttpWebResponse response)
+        {
+            try
+            {
+                return response.LastModified;
+            }
+            catch (ProtocolViolationException)
+            {
+                return DateTime.MinValue;
+            }
+        }
+
         /// <summary>Constructs a response based on custom parameters.</summary>
         /// <remarks>Recieves parameters neccesarily set for Abot to work.</remarks>
         public HttpWebResponseWrapper(HttpStatusCode statusCode, string contentType, byte[] content, NameValueCollection headers)
```
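The new GetLastModified helper exists because the HttpWebResponse.LastModified getter parses the Last-Modified header on access and can throw a ProtocolViolationException when the server sends a value it cannot parse, which is exactly what the catch above guards against; the wrapper now falls back to DateTime.MinValue instead of letting its constructor fail. A minimal sketch of the same fallback pattern, with a hypothetical ParseLastModified helper standing in for the property getter:

```csharp
using System;
using System.Globalization;
using System.Net;

// Minimal sketch of the fallback pattern GetLastModified introduces. ParseLastModified is a
// hypothetical stand-in for the HttpWebResponse.LastModified getter, which can throw
// ProtocolViolationException when the Last-Modified header is malformed.
class LastModifiedSketch
{
    static void Main()
    {
        Console.WriteLine(ParseLastModified("Tue, 15 Nov 1994 12:45:26 GMT")); // parses normally
        Console.WriteLine(ParseLastModified("not-a-date"));                    // falls back to DateTime.MinValue
    }

    static DateTime ParseLastModified(string headerValue)
    {
        try
        {
            if (!DateTime.TryParse(headerValue, CultureInfo.InvariantCulture,
                                   DateTimeStyles.AssumeUniversal, out DateTime parsed))
                throw new ProtocolViolationException("Malformed Last-Modified header");
            return parsed;
        }
        catch (ProtocolViolationException)
        {
            // Same sentinel the commit uses: "no usable Last-Modified value".
            return DateTime.MinValue;
        }
    }
}
```

DateTime.MinValue acts as a sentinel meaning "no usable Last-Modified value", so callers comparing against it can distinguish a real timestamp from a malformed header.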

README.md

Lines changed: 1 addition & 1 deletion
```diff
@@ -287,7 +287,7 @@ crawler.ShouldDownloadPageContent((crawledPage, crawlContext) =>
 crawler.ShouldCrawlPageLinks((crawledPage, crawlContext) =>
 {
     CrawlDecision decision = new CrawlDecision{ Allow = true };
-    if (crawledPage.PageSizeInBytes < 100)
+    if (crawledPage.Content.Bytes.Length < 100)
         return new CrawlDecision { Allow = false, Reason = "Just crawl links in pages that have at least 100 bytes" };
 
     return decision;
```
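Assembled from the diff context above, the updated ShouldCrawlPageLinks example in the README reads as below; the closing of the delegate falls outside the hunk and is assumed here:

```csharp
crawler.ShouldCrawlPageLinks((crawledPage, crawlContext) =>
{
    CrawlDecision decision = new CrawlDecision{ Allow = true };

    // Decision is now based on the downloaded content bytes rather than PageSizeInBytes.
    if (crawledPage.Content.Bytes.Length < 100)
        return new CrawlDecision { Allow = false, Reason = "Just crawl links in pages that have at least 100 bytes" };

    return decision;
});
```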
