Skip to content

Commit

Permalink
filtering: added --max-depth=<int> for maximum crawling depth (for pa…
Browse files Browse the repository at this point in the history
…ges, not assets) and --single-page moved to basic options
  • Loading branch information
janreges committed Dec 10, 2024
1 parent 7325a4b commit 2dbff75
Show file tree
Hide file tree
Showing 4 changed files with 38 additions and 3 deletions.
6 changes: 5 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -256,6 +256,9 @@ For a clearer list, I recommend going to the documentation: https://crawler.site

* `--url=<url>` Required. HTTP or HTTPS URL address of the website to be crawled. Use quotation marks
if the URL contains query parameters
* `--single-page` Load only one page to which the URL is given (and its assets), but do not follow other pages.
* `--max-depth=<int>` Maximum crawling depth (for pages, not assets). Default is `0` (no limit). `1` means
`/about` or `/about/`, `2` means `/about/contacts` etc.
* `--device=<val>` Device type for choosing a predefined User-Agent. Ignored when `--user-agent` is
defined. Supported values: `desktop`, `mobile`, `tablet`. Default is `desktop`.
* `--user-agent=<val>` Custom User-Agent header. Use quotation marks. If specified, it takes precedence over
Expand Down Expand Up @@ -295,7 +298,8 @@ It is particularly useful to disable JavaScript in the case of exporting website
(without HTTP server), where it is almost impossible to get the website to work from any location on the disk only
through the file:// protocol.

* `--single-page` Load only one page to which the URL is given (and its assets), but do not follow other pages.
* `--disable-all-assets` Disables crawling of all assets and files and only crawls pages in href attributes.
Shortcut for calling all other `--disable-*` flags.
* `--disable-javascript` Disables JavaScript downloading and removes all JavaScript code from HTML,
including `onclick` and other `on*` handlers.
* `--disable-styles` Disables CSS file downloading and at the same time removes all style definitions
Expand Down
9 changes: 9 additions & 0 deletions src/Crawler/ContentProcessor/HtmlProcessor.php
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ class HtmlProcessor extends BaseProcessor implements ContentProcessor
public static array $htmlPagesExtensions = ['htm', 'html', 'shtml', 'php', 'phtml', 'ashx', 'xhtml', 'asp', 'aspx', 'jsp', 'jspx', 'do', 'cfm', 'cgi', 'pl'];

private readonly bool $singlePageOnly;
private readonly int $maxDepth;
private readonly bool $filesEnabled;
private readonly bool $imagesEnabled;
private readonly bool $scriptsEnabled;
Expand All @@ -41,6 +42,7 @@ public function __construct(Crawler $crawler)
parent::__construct($crawler);

$this->singlePageOnly = $this->options->singlePage;
$this->maxDepth = $this->options->maxDepth;
$this->filesEnabled = !$this->options->disableFiles;
$this->imagesEnabled = !$this->options->disableImages;
$this->scriptsEnabled = !$this->options->disableJavascript;
Expand Down Expand Up @@ -213,6 +215,13 @@ private function findHrefUrls(string $html, ParsedUrl $sourceUrl, FoundUrls $fou
preg_match_all('/href\\\\["\'][:=]\\\\["\'](https?:\/\/[^"\'\\\\]+)\\\\["\']/i', $html, $matches);
$foundUrlsTxt = array_merge($foundUrlsTxt, $matches[1] ?? []);

if ($this->maxDepth > 0) {
$foundUrlsTxt = array_filter($foundUrlsTxt, function ($url) use ($sourceUrl) {
$parsedUrl = ParsedUrl::parse($url, $sourceUrl);
return $parsedUrl->getDepth() <= $this->maxDepth;
});
}

if (!$this->filesEnabled) {
$foundUrlsTxt = array_filter($foundUrlsTxt, function ($url) use ($regexForHtmlExtensions) {
return preg_match('/\.[a-z0-9]{1,10}(|\?.*)$/i', $url) === 0 || preg_match($regexForHtmlExtensions, $url) === 1;
Expand Down
6 changes: 4 additions & 2 deletions src/Crawler/CoreOptions.php
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,8 @@ class CoreOptions

// basic settings
public string $url;
public bool $singlePage = false;
public int $maxDepth = 0;
public DeviceType $device = DeviceType::DESKTOP;
public ?string $userAgent = null;
public int $timeout = 5;
Expand Down Expand Up @@ -57,7 +59,6 @@ class CoreOptions
public ?int $consoleWidth = null;

// resource filtering
public bool $singlePage = false;
public bool $disableJavascript = false;
public bool $disableStyles = false;
public bool $disableFonts = false;
Expand Down Expand Up @@ -165,6 +166,8 @@ public static function getOptions(): Options
self::GROUP_BASIC_SETTINGS,
'Basic settings', [
new Option('--url', '-u', 'url', Type::URL, false, 'Required URL. Enclose in quotes if URL contains query parameters.', null, false),
new Option('--single-page', '-sp', 'singlePage', Type::BOOL, false, 'Load only one page to which the URL is given (and its assets), but do not follow other pages.', false, false),
new Option('--max-depth', '-md', 'maxDepth', Type::INT, false, 'Maximum crawling depth (for pages, not assets). Default is `0` (no limit). `1` means `/about` or `/about/`, `2` means `/about/contacts` etc.', 0, false),
new Option('--device', '-d', 'device', Type::STRING, false, 'Device type for User-Agent selection. Values `desktop`, `tablet`, `mobile`. Ignored with `--user-agent`.', 'desktop', false),
new Option('--user-agent', '-ua', 'userAgent', Type::STRING, false, 'Override User-Agent selected by --device.', null, true),
new Option('--timeout', '-t', 'timeout', Type::INT, false, 'Request timeout (in sec).', 5, false),
Expand Down Expand Up @@ -193,7 +196,6 @@ public static function getOptions(): Options
$options->addGroup(new Group(
self::GROUP_RESOURCE_FILTERING,
'Resource filtering', [
new Option('--single-page', '-sp', 'singlePage', Type::BOOL, false, 'Load only one page to which the URL is given (and its assets), but do not follow other pages.', false, false),
new Option('--disable-javascript', '-dj', 'disableJavascript', Type::BOOL, false, 'Disables JavaScript downloading and removes all JavaScript code from HTML, including onclick and other on* handlers.', false, false),
new Option('--disable-styles', '-ds', 'disableStyles', Type::BOOL, false, 'Disables CSS file downloading and at the same time removes all style definitions by <style> tag or inline by style attributes.', false, false),
new Option('--disable-fonts', '-dfo', 'disableFonts', Type::BOOL, false, 'Disables font downloading and also removes all font/font-face definitions from CSS.', false, false),
Expand Down
20 changes: 20 additions & 0 deletions src/Crawler/ParsedUrl.php
Original file line number Diff line number Diff line change
Expand Up @@ -345,4 +345,24 @@ public function getBaseName(): ?string
return $result;
}

/**
* Get depth of the URL path. Examples:
* / -> 0
* /about -> 1
* /about/ -> 1
* /about/me -> 2
* /about/me/ -> 2
* /about/me/contact -> 3
* /about/me/contact/ -> 3
* /about/me/contact/.. -> 2
* /about/me/contact/../.. -> 1
* ...
*
* @return int
*/
public function getDepth(): int
{
return max(substr_count(rtrim($this->path, '/'), '/') - substr_count($this->path, '/..'), 0);
}

}

0 comments on commit 2dbff75

Please sign in to comment.