Skip to content

Commit 64f1d66

Browse files
committed
Added optional HTML sanitization of readable contents.
1 parent f077f77 commit 64f1d66

File tree

3 files changed

+60
-2
lines changed

3 files changed

+60
-2
lines changed

README.md

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,33 @@ Note about CORS: by design, the server will allow any origin to access it, so br
2323
Usage
2424
-----
2525

26+
### `GET /get`
27+
28+
#### Required parameters
29+
30+
- `url`: The URL to retrieve readable contents from, e.g. `https://nicolas.perriault.net/code/2013/get-your-frontend-javascript-code-covered/`.
31+
32+
#### Optional parameters
33+
34+
- `sanitize`: A *boolean string* to enable HTML sanitization (valid truthy boolean strings: "1", "on", "true", "yes", "y"; everything else will be considered falsy).
35+
36+
**Note:** Enabling content sanitization loses Readability.js-specific HTML semantics, but it is probably safer for users if you plan to publish the retrieved contents on a public website.
37+
38+
#### Example
39+
40+
Content sanitization enabled:
41+
42+
$ curl http://0.0.0.0:3000/get\?sanitize=y\&url\=https://nicolas.perriault.net/code/2013/get-your-frontend-javascript-code-covered/
43+
{
44+
"byline":"Nicolas Perriault —",
45+
"content":"<p><strong>So finally you&#39;re <a href=\"https://nicolas.perriault.net/code/2013/testing-frontend-javascript-code-using-mocha-chai-and-sinon/\">testing",
46+
"length":2867,
47+
"title":"Get your Frontend JavaScript Code Covered | Code",
48+
"uri":"https://nicolas.perriault.net/code/2013/get-your-frontend-javascript-code-covered/"
49+
}
50+
51+
Content sanitization disabled (default):
52+
2653
$ curl http://0.0.0.0:3000/get\?url\=https://nicolas.perriault.net/code/2013/get-your-frontend-javascript-code-covered/
2754
{
2855
"byline":"Nicolas Perriault —",

index.js

Lines changed: 31 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,38 @@
11
var scrape = require("./scrape");
22
var express = require("express");
33
var pkgInfo = require("./package.json");
4+
var html2md = require("html-md");
5+
var markdown = require("markdown");
46

57
var app = express();
68

9+
/**
 * Casts a query string argument into an actual boolean.
 *
 * Only the strings "1", "on", "true", "yes" and "y" (compared
 * case-insensitively) are truthy. Everything else is falsy, including a
 * missing value and any non-string value: a repeated or bracketed parameter
 * (e.g. `?sanitize=1&sanitize=0`) can be parsed into an array or object
 * rather than a string.
 *
 * @param {String} queryParam The raw query string argument.
 * @return {Boolean}
 */
function boolArg(queryParam) {
  // Non-strings have no toLowerCase(); treat them as falsy instead of throwing.
  if (typeof queryParam !== "string") return false;
  return ["1", "on", "true", "yes", "y"].indexOf(queryParam.toLowerCase()) !== -1;
}
18+
19+
/**
20+
* Takes a result object and replace native html contents with a safer sanitized
21+
* version.
22+
* @param {Object} resultObject
23+
* @return {Object}
24+
*/
25+
function sanitizeResult(resultObject) {
26+
try {
27+
var sanitized = markdown.parse(html2md(resultObject.content));
28+
resultObject.content = sanitized;
29+
resultObject.length = sanitized.length;
30+
return resultObject;
31+
} catch (err) {
32+
throw {error: "Failed HTML sanitization:" + (err || "Unknown reason.")};
33+
}
34+
}
35+
736
app.use(function(req, res, next) {
837
res.header("Content-Type", "application/json");
938
res.header("Access-Control-Allow-Origin", "*");
@@ -21,12 +50,12 @@ app.get("/", function(req, res) {
2150
});
2251

2352
app.get("/get", function(req, res) {
24-
var url = req.query.url;
53+
var url = req.query.url, sanitize = boolArg(req.query.sanitize);
2554
if (!url) {
2655
return res.status(400).json({error: "Missing url parameter"});
2756
}
2857
scrape(url).then(function(result) {
29-
res.json(result);
58+
res.json(sanitize ? sanitizeResult(result) : result);
3059
}).catch(function(err) {
3160
res.status(500).json(err);
3261
});

package.json

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,8 @@
1717
"dependencies": {
1818
"bluebird": "^2.9.12",
1919
"express": "^4.11.2",
20+
"html-md": "^3.0.2",
21+
"markdown": "^0.5.0",
2022
"phantomjs": "^1.9.15"
2123
}
2224
}

0 commit comments

Comments
 (0)