diff --git a/README.md b/README.md index 2473cb4..c20e386 100644 --- a/README.md +++ b/README.md @@ -256,6 +256,9 @@ For a clearer list, I recommend going to the documentation: https://crawler.site * `--url=` Required. HTTP or HTTPS URL address of the website to be crawled.Use quotation marks if the URL contains query parameters +* `--single-page` Load only one page to which the URL is given (and its assets), but do not follow other pages. +* `--max-depth=` Maximum crawling depth (for pages, not assets). Default is `0` (no limit). `1` means + `/about` or `/about/`, `2` means `/about/contacts` etc. * `--device=` Device type for choosing a predefined User-Agent. Ignored when `--user-agent` is defined. Supported values: `desktop`, `mobile`, `tablet`. Defaults is `desktop`. * `--user-agent=` Custom User-Agent header. Use quotation marks. If specified, it takes precedence over @@ -295,7 +298,8 @@ It is particularly useful to disable JavaScript in the case of exporting website (without HTTP server), where it is almost impossible to get the website to work from any location on the disk only through the file:// protocol. -* `--single-page` Load only one page to which the URL is given (and its assets), but do not follow other pages. +* `--disable-all-assets` Disables crawling of all assets and files and only crawls pages in href attributes. + Shortcut for calling all other `--disable-*` flags. * `--disable-javascript` Disables JavaScript downloading and removes all JavaScript code from HTML, including `onclick` and other `on*` handlers. * `--disable-styles` Disables CSS file downloading and at the same time removes all style definitions diff --git a/src/Crawler/ContentProcessor/HtmlProcessor.php b/src/Crawler/ContentProcessor/HtmlProcessor.php index 050a4fa..ce354d2 100644 --- a/src/Crawler/ContentProcessor/HtmlProcessor.php +++ b/src/Crawler/ContentProcessor/HtmlProcessor.php @@ -27,6 +27,7 @@ class HtmlProcessor extends BaseProcessor implements ContentProcessor public static array $htmlPagesExtensions = ['htm', 'html', 'shtml', 'php', 'phtml', 'ashx', 'xhtml', 'asp', 'aspx', 'jsp', 'jspx', 'do', 'cfm', 'cgi', 'pl']; private readonly bool $singlePageOnly; + private readonly int $maxDepth; private readonly bool $filesEnabled; private readonly bool $imagesEnabled; private readonly bool $scriptsEnabled; @@ -41,6 +42,7 @@ public function __construct(Crawler $crawler) parent::__construct($crawler); $this->singlePageOnly = $this->options->singlePage; + $this->maxDepth = $this->options->maxDepth; $this->filesEnabled = !$this->options->disableFiles; $this->imagesEnabled = !$this->options->disableImages; $this->scriptsEnabled = !$this->options->disableJavascript; @@ -213,6 +215,13 @@ private function findHrefUrls(string $html, ParsedUrl $sourceUrl, FoundUrls $fou preg_match_all('/href\\\\["\'][:=]\\\\["\'](https?:\/\/[^"\'\\\\]+)\\\\["\']/i', $html, $matches); $foundUrlsTxt = array_merge($foundUrlsTxt, $matches[1] ?? []); + if ($this->maxDepth > 0) { + $foundUrlsTxt = array_filter($foundUrlsTxt, function ($url) use ($sourceUrl) { + $parsedUrl = ParsedUrl::parse($url, $sourceUrl); + return $parsedUrl->getDepth() <= $this->maxDepth; + }); + } + if (!$this->filesEnabled) { $foundUrlsTxt = array_filter($foundUrlsTxt, function ($url) use ($regexForHtmlExtensions) { return preg_match('/\.[a-z0-9]{1,10}(|\?.*)$/i', $url) === 0 || preg_match($regexForHtmlExtensions, $url) === 1; diff --git a/src/Crawler/CoreOptions.php b/src/Crawler/CoreOptions.php index 602d33b..48dde6e 100644 --- a/src/Crawler/CoreOptions.php +++ b/src/Crawler/CoreOptions.php @@ -29,6 +29,8 @@ class CoreOptions // basic settings public string $url; + public bool $singlePage = false; + public int $maxDepth = 0; public DeviceType $device = DeviceType::DESKTOP; public ?string $userAgent = null; public int $timeout = 5; @@ -57,7 +59,6 @@ class CoreOptions public ?int $consoleWidth = null; // resource filtering - public bool $singlePage = false; public bool $disableJavascript = false; public bool $disableStyles = false; public bool $disableFonts = false; @@ -165,6 +166,8 @@ public static function getOptions(): Options self::GROUP_BASIC_SETTINGS, 'Basic settings', [ new Option('--url', '-u', 'url', Type::URL, false, 'Required URL. Enclose in quotes if URL contains query parameters.', null, false), + new Option('--single-page', '-sp', 'singlePage', Type::BOOL, false, 'Load only one page to which the URL is given (and its assets), but do not follow other pages.', false, false), + new Option('--max-depth', '-md', 'maxDepth', Type::INT, false, 'Maximum crawling depth (for pages, not assets). Default is `0` (no limit). `1` means `/about` or `/about/`, `2` means `/about/contacts` etc.', 0, false), new Option('--device', '-d', 'device', Type::STRING, false, 'Device type for User-Agent selection. Values `desktop`, `tablet`, `mobile`. Ignored with `--user-agent`.', 'desktop', false), new Option('--user-agent', '-ua', 'userAgent', Type::STRING, false, 'Override User-Agent selected by --device.', null, true), new Option('--timeout', '-t', 'timeout', Type::INT, false, 'Request timeout (in sec).', 5, false), @@ -193,7 +196,6 @@ public static function getOptions(): Options $options->addGroup(new Group( self::GROUP_RESOURCE_FILTERING, 'Resource filtering', [ - new Option('--single-page', '-sp', 'singlePage', Type::BOOL, false, 'Load only one page to which the URL is given (and its assets), but do not follow other pages.', false, false), new Option('--disable-javascript', '-dj', 'disableJavascript', Type::BOOL, false, 'Disables JavaScript downloading and removes all JavaScript code from HTML, including onclick and other on* handlers.', false, false), new Option('--disable-styles', '-ds', 'disableStyles', Type::BOOL, false, 'Disables CSS file downloading and at the same time removes all style definitions by