Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Check original URL against depth tree when visited link is a redirect #467

Merged
merged 6 commits into from
Jul 16, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions src/Crawler.php
Original file line number Diff line number Diff line change
Expand Up @@ -447,7 +447,7 @@ public function startCrawling(UriInterface|string $baseUrl)
}
}

public function addToDepthTree(UriInterface $url, UriInterface $parentUrl, ?Node $node = null): ?Node
public function addToDepthTree(UriInterface $url, UriInterface $parentUrl, ?Node $node = null, ?UriInterface $originalUrl = null): ?Node
{
if (is_null($this->maximumDepth)) {
return new Node((string) $url);
Expand All @@ -457,7 +457,7 @@ public function addToDepthTree(UriInterface $url, UriInterface $parentUrl, ?Node

$returnNode = null;

if ($node->getValue() === (string) $parentUrl) {
if ($node->getValue() === (string) $parentUrl || $node->getValue() === (string) $originalUrl) {
$newNode = new Node((string) $url);

$node->addChild($newNode);
Expand All @@ -466,7 +466,7 @@ public function addToDepthTree(UriInterface $url, UriInterface $parentUrl, ?Node
}

foreach ($node->getChildren() as $currentNode) {
$returnNode = $this->addToDepthTree($url, $parentUrl, $currentNode);
$returnNode = $this->addToDepthTree($url, $parentUrl, $currentNode, $originalUrl);

if (! is_null($returnNode)) {
break;
Expand Down
3 changes: 2 additions & 1 deletion src/Handlers/CrawlRequestFulfilled.php
Original file line number Diff line number Diff line change
Expand Up @@ -62,8 +62,9 @@ public function __invoke(ResponseInterface $response, $index)
}

$baseUrl = $this->getBaseUrl($response, $crawlUrl);
$originalUrl = $crawlUrl->url;

$this->urlParser->addFromHtml($body, $baseUrl);
$this->urlParser->addFromHtml($body, $baseUrl, $originalUrl);

usleep($this->crawler->getDelayBetweenRequests());
}
Expand Down
6 changes: 3 additions & 3 deletions src/UrlParsers/LinkUrlParser.php
Original file line number Diff line number Diff line change
Expand Up @@ -21,15 +21,15 @@ public function __construct(Crawler $crawler)
$this->crawler = $crawler;
}

public function addFromHtml(string $html, UriInterface $foundOnUrl): void
public function addFromHtml(string $html, UriInterface $foundOnUrl, ?UriInterface $originalUrl = null): void
{
$allLinks = $this->extractLinksFromHtml($html, $foundOnUrl);

collect($allLinks)
->filter(fn (Url $url) => $this->hasCrawlableScheme($url))
->map(fn (Url $url) => $this->normalizeUrl($url))
->filter(function (Url $url) use ($foundOnUrl) {
if (! $node = $this->crawler->addToDepthTree($url, $foundOnUrl)) {
->filter(function (Url $url) use ($foundOnUrl, $originalUrl) {
if (! $node = $this->crawler->addToDepthTree($url, $foundOnUrl, null, $originalUrl)) {
return false;
}

Expand Down
6 changes: 3 additions & 3 deletions src/UrlParsers/SitemapUrlParser.php
Original file line number Diff line number Diff line change
Expand Up @@ -20,15 +20,15 @@ public function __construct(Crawler $crawler)
$this->crawler = $crawler;
}

public function addFromHtml(string $html, UriInterface $foundOnUrl): void
public function addFromHtml(string $html, UriInterface $foundOnUrl, ?UriInterface $originalUrl = null): void
{
$allLinks = $this->extractLinksFromHtml($html, $foundOnUrl);

collect($allLinks)
->filter(fn (Url $url) => $this->hasCrawlableScheme($url))
->map(fn (Url $url) => $this->normalizeUrl($url))
->filter(function (Url $url) use ($foundOnUrl) {
if (! $node = $this->crawler->addToDepthTree($url, $foundOnUrl)) {
->filter(function (Url $url) use ($foundOnUrl, $originalUrl) {
if (! $node = $this->crawler->addToDepthTree($url, $foundOnUrl, null, $originalUrl)) {
return false;
}

Expand Down
2 changes: 1 addition & 1 deletion src/UrlParsers/UrlParser.php
Original file line number Diff line number Diff line change
Expand Up @@ -9,5 +9,5 @@ interface UrlParser
{
public function __construct(Crawler $crawler);

public function addFromHtml(string $html, UriInterface $foundOnUrl): void;
public function addFromHtml(string $html, UriInterface $foundOnUrl, ?UriInterface $originalUrl = null): void;
}
12 changes: 12 additions & 0 deletions tests/CrawlerTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -364,6 +364,18 @@ public function shouldCrawl(UriInterface $url): bool
assertCrawledUrlCount(3);
});

it('should handle redirects correctly when max depth is specified', function () {
    // Enable Guzzle redirect tracking so the crawler can relate the final
    // response back to the originally queued (pre-redirect) URL when it
    // builds the depth tree.
    $crawler = createCrawler([
        RequestOptions::ALLOW_REDIRECTS => [
            'track_redirects' => true,
        ],
    ]);

    $crawler->setMaximumDepth(5);

    // /redirect-home/ issues a 301 to the site root; links found there must
    // still be registered against the depth tree despite the redirect.
    $crawler->startCrawling('http://localhost:8080/redirect-home/');

    expect(['url' => 'http://localhost:8080/link1', 'foundOn' => 'http://localhost:8080/'])->toBeCrawledOnce();
});

it('respects the requested delay between requests', function () {
$baseUrl = 'http://localhost:8080';

Expand Down
4 changes: 4 additions & 0 deletions tests/server/server.js
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,10 @@ app.get('/meta-nofollow', function (request, response) {
response.end('<html><head>\n<meta name="robots" content="index, nofollow">\n</head><body><a href="/meta-nofollow-target">no follow it</a></body></html>');
});

// Permanently (301) redirect to the site root. Used by the crawler tests to
// verify that links discovered after a redirect are attributed correctly.
app.get('/redirect-home/', (request, response) => response.redirect(301, '/'));

// Entry page linking into the internal-redirect loop fixtures; the anchor
// HTML below is part of the test contract and must not change.
app.get('/dir1/internal-redirect-entry/', (request, response) => {
    response.end('<a href="../loop-generator/internal-redirect/trapped/">trapped</a> <a href="../../dir1/internal-redirect/trap/">trap-start</a>');
});
Expand Down