From 3d39a1a1775356af470daf99f1e0c999d279ad4f Mon Sep 17 00:00:00 2001
From: superpenguin612 <74030080+superpenguin612@users.noreply.github.com>
Date: Mon, 1 Jul 2024 13:13:27 -0400
Subject: [PATCH 1/6] Check original URL against depth tree when visited link is a redirect

---
 src/Crawler.php                        | 6 +++---
 src/Handlers/CrawlRequestFulfilled.php | 3 ++-
 src/UrlParsers/LinkUrlParser.php       | 4 ++--
 src/UrlParsers/SitemapUrlParser.php    | 4 ++--
 src/UrlParsers/UrlParser.php           | 2 +-
 5 files changed, 10 insertions(+), 9 deletions(-)

diff --git a/src/Crawler.php b/src/Crawler.php
index 547d3b0..c9eba3a 100644
--- a/src/Crawler.php
+++ b/src/Crawler.php
@@ -447,7 +447,7 @@ public function startCrawling(UriInterface|string $baseUrl)
         }
     }

-    public function addToDepthTree(UriInterface $url, UriInterface $parentUrl, ?Node $node = null): ?Node
+    public function addToDepthTree(UriInterface $url, UriInterface $parentUrl, ?UriInterface $originalUrl = null, ?Node $node = null): ?Node
     {
         if (is_null($this->maximumDepth)) {
             return new Node((string) $url);
@@ -457,7 +457,7 @@ public function addToDepthTree(UriInterface $url, UriInterface $parentUrl, ?Node

         $returnNode = null;

-        if ($node->getValue() === (string) $parentUrl) {
+        if ($node->getValue() === (string) $parentUrl || $node->getValue() === (string) $originalUrl) {
             $newNode = new Node((string) $url);

             $node->addChild($newNode);
@@ -466,7 +466,7 @@ public function addToDepthTree(UriInterface $url, UriInterface $parentUrl, ?Node
         }

         foreach ($node->getChildren() as $currentNode) {
-            $returnNode = $this->addToDepthTree($url, $parentUrl, $currentNode);
+            $returnNode = $this->addToDepthTree($url, $parentUrl, $originalUrl, $currentNode);

             if (! is_null($returnNode)) {
                 break;
diff --git a/src/Handlers/CrawlRequestFulfilled.php b/src/Handlers/CrawlRequestFulfilled.php
index 4f45b49..7df172a 100644
--- a/src/Handlers/CrawlRequestFulfilled.php
+++ b/src/Handlers/CrawlRequestFulfilled.php
@@ -62,8 +62,9 @@ public function __invoke(ResponseInterface $response, $index)
         }

         $baseUrl = $this->getBaseUrl($response, $crawlUrl);
+        $originalUrl = $crawlUrl->url;

-        $this->urlParser->addFromHtml($body, $baseUrl);
+        $this->urlParser->addFromHtml($body, $baseUrl, $originalUrl);

         usleep($this->crawler->getDelayBetweenRequests());
     }
diff --git a/src/UrlParsers/LinkUrlParser.php b/src/UrlParsers/LinkUrlParser.php
index 69684ca..db409fc 100644
--- a/src/UrlParsers/LinkUrlParser.php
+++ b/src/UrlParsers/LinkUrlParser.php
@@ -21,7 +21,7 @@ public function __construct(Crawler $crawler)
         $this->crawler = $crawler;
     }

-    public function addFromHtml(string $html, UriInterface $foundOnUrl): void
+    public function addFromHtml(string $html, UriInterface $foundOnUrl, ?UriInterface $originalUrl = null): void
     {
         $allLinks = $this->extractLinksFromHtml($html, $foundOnUrl);

@@ -29,7 +29,7 @@ public function addFromHtml(string $html, UriInterface $foundOnUrl): void
             ->filter(fn (Url $url) => $this->hasCrawlableScheme($url))
             ->map(fn (Url $url) => $this->normalizeUrl($url))
             ->filter(function (Url $url) use ($foundOnUrl) {
-                if (! $node = $this->crawler->addToDepthTree($url, $foundOnUrl)) {
+                if (! $node = $this->crawler->addToDepthTree($url, $foundOnUrl, $originalUrl)) {
                     return false;
                 }

diff --git a/src/UrlParsers/SitemapUrlParser.php b/src/UrlParsers/SitemapUrlParser.php
index 04bf47a..d98e9e9 100644
--- a/src/UrlParsers/SitemapUrlParser.php
+++ b/src/UrlParsers/SitemapUrlParser.php
@@ -20,7 +20,7 @@ public function __construct(Crawler $crawler)
         $this->crawler = $crawler;
     }

-    public function addFromHtml(string $html, UriInterface $foundOnUrl): void
+    public function addFromHtml(string $html, UriInterface $foundOnUrl, ?UriInterface $originalUrl = null): void
     {
         $allLinks = $this->extractLinksFromHtml($html, $foundOnUrl);

@@ -28,7 +28,7 @@ public function addFromHtml(string $html, UriInterface $foundOnUrl): void
             ->filter(fn (Url $url) => $this->hasCrawlableScheme($url))
             ->map(fn (Url $url) => $this->normalizeUrl($url))
             ->filter(function (Url $url) use ($foundOnUrl) {
-                if (! $node = $this->crawler->addToDepthTree($url, $foundOnUrl)) {
+                if (! $node = $this->crawler->addToDepthTree($url, $foundOnUrl, $originalUrl)) {
                     return false;
                 }

diff --git a/src/UrlParsers/UrlParser.php b/src/UrlParsers/UrlParser.php
index 24ed582..e46a120 100644
--- a/src/UrlParsers/UrlParser.php
+++ b/src/UrlParsers/UrlParser.php
@@ -9,5 +9,5 @@ interface UrlParser
 {
     public function __construct(Crawler $crawler);

-    public function addFromHtml(string $html, UriInterface $foundOnUrl): void;
+    public function addFromHtml(string $html, UriInterface $foundOnUrl, ?UriInterface $originalUrl = null): void;
 }

From b7678ff15ef1eabf2464e260c4c2dfca7ab50fc6 Mon Sep 17 00:00:00 2001
From: superpenguin612 <74030080+superpenguin612@users.noreply.github.com>
Date: Mon, 1 Jul 2024 13:31:10 -0400
Subject: [PATCH 2/6] Fix link to $originalUrl

---
 src/UrlParsers/SitemapUrlParser.php | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/UrlParsers/SitemapUrlParser.php b/src/UrlParsers/SitemapUrlParser.php
index d98e9e9..5081040 100644
--- a/src/UrlParsers/SitemapUrlParser.php
+++ b/src/UrlParsers/SitemapUrlParser.php
@@ -27,7 +27,7 @@ public function addFromHtml(string $html, UriInterface $foundOnUrl, ?UriInterfac
         collect($allLinks)
             ->filter(fn (Url $url) => $this->hasCrawlableScheme($url))
             ->map(fn (Url $url) => $this->normalizeUrl($url))
-            ->filter(function (Url $url) use ($foundOnUrl) {
+            ->filter(function (Url $url) use ($foundOnUrl, $originalUrl) {
                 if (! $node = $this->crawler->addToDepthTree($url, $foundOnUrl, $originalUrl)) {
                     return false;
                 }

From 90aa275613dba406297401ff439e7748ac6ef078 Mon Sep 17 00:00:00 2001
From: superpenguin612 <74030080+superpenguin612@users.noreply.github.com>
Date: Mon, 1 Jul 2024 13:36:42 -0400
Subject: [PATCH 3/6] Fix link to $originalUrl on linkurlparser

---
 src/UrlParsers/LinkUrlParser.php | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/UrlParsers/LinkUrlParser.php b/src/UrlParsers/LinkUrlParser.php
index db409fc..603d172 100644
--- a/src/UrlParsers/LinkUrlParser.php
+++ b/src/UrlParsers/LinkUrlParser.php
@@ -28,7 +28,7 @@ public function addFromHtml(string $html, UriInterface $foundOnUrl, ?UriInterfac
         collect($allLinks)
             ->filter(fn (Url $url) => $this->hasCrawlableScheme($url))
             ->map(fn (Url $url) => $this->normalizeUrl($url))
-            ->filter(function (Url $url) use ($foundOnUrl) {
+            ->filter(function (Url $url) use ($foundOnUrl, $originalUrl) {
                 if (! $node = $this->crawler->addToDepthTree($url, $foundOnUrl, $originalUrl)) {
                     return false;
                 }

From 858a9a7952a4a33eb83202c6d18de2e431bada03 Mon Sep 17 00:00:00 2001
From: David Racovan
Date: Wed, 10 Jul 2024 13:37:48 -0400
Subject: [PATCH 4/6] Add max depth + redirects test

---
 tests/CrawlerTest.php  | 12 ++++++++++++
 tests/server/server.js |  4 ++++
 2 files changed, 16 insertions(+)

diff --git a/tests/CrawlerTest.php b/tests/CrawlerTest.php
index b762356..ccfd953 100644
--- a/tests/CrawlerTest.php
+++ b/tests/CrawlerTest.php
@@ -364,6 +364,18 @@ public function shouldCrawl(UriInterface $url): bool
     assertCrawledUrlCount(3);
 });

+it('should handle redirects correctly when max depth is specified', function () {
+    createCrawler([
+        RequestOptions::ALLOW_REDIRECTS => [
+            'track_redirects' => true,
+        ],
+    ])
+        ->setMaximumDepth(5)
+        ->startCrawling('http://localhost:8080/redirect-home/');
+
+    expect(['url' => 'http://localhost:8080/link1', 'foundOn' => 'http://localhost:8080/'])->toBeCrawledOnce();
+});
+
 it('respects the requested delay between requests', function () {
     $baseUrl = 'http://localhost:8080';

diff --git a/tests/server/server.js b/tests/server/server.js
index b0dd455..bfc8e3e 100644
--- a/tests/server/server.js
+++ b/tests/server/server.js
@@ -70,6 +70,10 @@ app.get('/meta-nofollow', function (request, response) {
     response.end('\n\nno follow it');
 });

+app.get('/redirect-home/', function (request, response) {
+    response.redirect(301, '/');
+});
+
 app.get('/dir1/internal-redirect-entry/', function (request, response) {
     response.end('trapped trap-start');
 });

From 03ad37176b285a9889c865591b1c1ad72d9216e9 Mon Sep 17 00:00:00 2001
From: superpenguin612 <74030080+superpenguin612@users.noreply.github.com>
Date: Mon, 15 Jul 2024 10:37:57 -0400
Subject: [PATCH 5/6] Move new parameter to end of signature

---
 src/Crawler.php | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/Crawler.php b/src/Crawler.php
index c9eba3a..4782725 100644
--- a/src/Crawler.php
+++ b/src/Crawler.php
@@ -447,7 +447,7 @@ public function startCrawling(UriInterface|string $baseUrl)
         }
     }

-    public function addToDepthTree(UriInterface $url, UriInterface $parentUrl, ?UriInterface $originalUrl = null, ?Node $node = null): ?Node
+    public function addToDepthTree(UriInterface $url, UriInterface $parentUrl, ?Node $node = null, ?UriInterface $originalUrl = null): ?Node
     {
         if (is_null($this->maximumDepth)) {
             return new Node((string) $url);

From 1cca0008ddcd97a46fee776f4e948b950c2c32d6 Mon Sep 17 00:00:00 2001
From: superpenguin612 <74030080+superpenguin612@users.noreply.github.com>
Date: Tue, 16 Jul 2024 07:21:50 -0400
Subject: [PATCH 6/6] Fix method calls

---
 src/Crawler.php                     | 2 +-
 src/UrlParsers/LinkUrlParser.php    | 2 +-
 src/UrlParsers/SitemapUrlParser.php | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/Crawler.php b/src/Crawler.php
index 4782725..3d54d46 100644
--- a/src/Crawler.php
+++ b/src/Crawler.php
@@ -466,7 +466,7 @@ public function addToDepthTree(UriInterface $url, UriInterface $parentUrl, ?Node
         }

         foreach ($node->getChildren() as $currentNode) {
-            $returnNode = $this->addToDepthTree($url, $parentUrl, $originalUrl, $currentNode);
+            $returnNode = $this->addToDepthTree($url, $parentUrl, $currentNode, $originalUrl);

             if (! is_null($returnNode)) {
                 break;
diff --git a/src/UrlParsers/LinkUrlParser.php b/src/UrlParsers/LinkUrlParser.php
index 603d172..a48dcf4 100644
--- a/src/UrlParsers/LinkUrlParser.php
+++ b/src/UrlParsers/LinkUrlParser.php
@@ -29,7 +29,7 @@ public function addFromHtml(string $html, UriInterface $foundOnUrl, ?UriInterfac
             ->filter(fn (Url $url) => $this->hasCrawlableScheme($url))
             ->map(fn (Url $url) => $this->normalizeUrl($url))
             ->filter(function (Url $url) use ($foundOnUrl, $originalUrl) {
-                if (! $node = $this->crawler->addToDepthTree($url, $foundOnUrl, $originalUrl)) {
+                if (! $node = $this->crawler->addToDepthTree($url, $foundOnUrl, null, $originalUrl)) {
                     return false;
                 }
diff --git a/src/UrlParsers/SitemapUrlParser.php b/src/UrlParsers/SitemapUrlParser.php
index 5081040..ef614f2 100644
--- a/src/UrlParsers/SitemapUrlParser.php
+++ b/src/UrlParsers/SitemapUrlParser.php
@@ -28,7 +28,7 @@ public function addFromHtml(string $html, UriInterface $foundOnUrl, ?UriInterfac
             ->filter(fn (Url $url) => $this->hasCrawlableScheme($url))
             ->map(fn (Url $url) => $this->normalizeUrl($url))
             ->filter(function (Url $url) use ($foundOnUrl, $originalUrl) {
-                if (! $node = $this->crawler->addToDepthTree($url, $foundOnUrl, $originalUrl)) {
+                if (! $node = $this->crawler->addToDepthTree($url, $foundOnUrl, null, $originalUrl)) {
                     return false;
                 }
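
Note appended for context (not part of the patch series): the sketch below shows how the change is meant to be exercised once all six patches are applied. It assumes the final signature from patch 5/6, addToDepthTree(UriInterface $url, UriInterface $parentUrl, ?Node $node = null, ?UriInterface $originalUrl = null), and reuses the /redirect-home/ route that patch 4/6 adds to the test server. The crawler setup mirrors the new Pest test; a CrawlObserver (normally registered via setCrawlObserver()) is omitted for brevity.

<?php

use GuzzleHttp\RequestOptions;
use Spatie\Crawler\Crawler;

// Crawl a URL that 301-redirects to '/'. With track_redirects enabled, the
// fulfilled-request handler can pass the pre-redirect URL ($crawlUrl->url) to
// the URL parsers as $originalUrl, so links found on the redirect target can
// still be attached to the depth tree, which only knows the URL as it was
// originally queued (the pre-redirect URL).
Crawler::create([
    RequestOptions::ALLOW_REDIRECTS => [
        'track_redirects' => true,
    ],
])
    ->setMaximumDepth(5)
    ->startCrawling('http://localhost:8080/redirect-home/');

// After patch 6/6 the parsers call the crawler like this:
//
//     $node = $this->crawler->addToDepthTree($url, $foundOnUrl, null, $originalUrl);
//
// and addToDepthTree() (patch 1/6) attaches a child node when the current
// node's value matches either the effective parent URL or the original URL.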