feat: custom link parser (#458)
* feat: allow custom url parsers

Previously named LinkAdder

* fix: removed incorrect type check

* feat: added sitemap url parser

* tests: added sitemap url parsing test

* chore: added readme documentation for custom url parsers

* Update README.md

* tests: added more consistent tests with sitemap index and sitemap children

* Fix styling

* chore: added documentation for the built-in SitemapUrlParser

---------

Co-authored-by: Freek Van der Herten <[email protected]>
Co-authored-by: Velka-DEV <[email protected]>
3 people authored Jan 2, 2024
1 parent 25c378a commit f4958b4
Showing 14 changed files with 777 additions and 272 deletions.
20 changes: 20 additions & 0 deletions README.md
@@ -153,6 +153,26 @@ This package comes with three `CrawlProfiles` out of the box:
- `CrawlInternalUrls`: this profile will only crawl the internal urls on the pages of a host.
- `CrawlSubdomains`: this profile will only crawl the internal urls and the urls of its subdomains on the pages of a host.

### Custom link extraction

You can customize how links are extracted from a page by passing a custom `UrlParser` to the crawler.

```php
Crawler::create()
->setUrlParserClass(<class that implements \Spatie\Crawler\UrlParsers\UrlParser>::class)
...
```

By default, the `LinkUrlParser` is used. This parser will extract all links from the `href` attribute of `a` tags.

There is also a built-in `SitemapUrlParser` that will extract and crawl all links from a sitemap. It supports sitemap index files as well.

```php
Crawler::create()
->setUrlParserClass(SitemapUrlParser::class)
...
```
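
A custom parser only needs to implement the `\Spatie\Crawler\UrlParsers\UrlParser` interface: it receives the `Crawler` instance in its constructor and is responsible for adding the urls it finds to the crawl queue. As a rough, hypothetical sketch (the class name is made up; the built-in parsers additionally consult the depth tree and robots rules before queueing, which is omitted here), a parser that only follows links inside `<nav>` elements could look like this:

```php
use Psr\Http\Message\UriInterface;
use Spatie\Crawler\Crawler;
use Spatie\Crawler\CrawlUrl;
use Spatie\Crawler\Url;
use Spatie\Crawler\UrlParsers\UrlParser;
use Symfony\Component\DomCrawler\Crawler as DomCrawler;

// Hypothetical example: only queue links that appear inside <nav> elements.
class NavigationUrlParser implements UrlParser
{
    public function __construct(protected Crawler $crawler)
    {
    }

    public function addFromHtml(string $html, UriInterface $foundOnUrl): void
    {
        $domCrawler = new DomCrawler($html, $foundOnUrl);

        foreach ($domCrawler->filterXPath('//nav//a')->links() as $link) {
            // Url combines the resolved uri with the text of the link.
            $url = new Url($link->getUri(), $link->getNode()->textContent);

            $this->crawler->addToCrawlQueue(
                CrawlUrl::create($url, $foundOnUrl, linkText: $url->linkText())
            );
        }
    }
}
```

You would then register it with `->setUrlParserClass(NavigationUrlParser::class)` as shown above.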

### Ignoring robots.txt and robots meta

By default, the crawler will respect robots data. It is possible to disable these checks like so:
8 changes: 4 additions & 4 deletions src/CrawlObservers/CrawlObserver.php
@@ -21,8 +21,8 @@ public function willCrawl(UriInterface $url, ?string $linkText): void
public function crawled(
UriInterface $url,
ResponseInterface $response,
UriInterface $foundOnUrl = null,
string $linkText = null,
?UriInterface $foundOnUrl = null,
?string $linkText = null,
): void {
}

@@ -32,8 +32,8 @@ public function crawled(
public function crawlFailed(
UriInterface $url,
RequestException $requestException,
UriInterface $foundOnUrl = null,
string $linkText = null,
?UriInterface $foundOnUrl = null,
?string $linkText = null,
): void {
}

4 changes: 2 additions & 2 deletions src/CrawlUrl.php
@@ -16,9 +16,9 @@ class CrawlUrl

public static function create(
UriInterface $url,
UriInterface $foundOnUrl = null,
?UriInterface $foundOnUrl = null,
$id = null,
string $linkText = null,
?string $linkText = null,
): static {
$static = new static($url, $foundOnUrl, linkText: $linkText);

19 changes: 18 additions & 1 deletion src/Crawler.php
@@ -19,6 +19,7 @@
use Spatie\Crawler\Exceptions\InvalidCrawlRequestHandler;
use Spatie\Crawler\Handlers\CrawlRequestFailed;
use Spatie\Crawler\Handlers\CrawlRequestFulfilled;
use Spatie\Crawler\UrlParsers\LinkUrlParser;
use Spatie\Robots\RobotsTxt;
use Tree\Node\Node;

@@ -62,6 +63,8 @@ class Crawler

protected string $crawlRequestFailedClass;

protected string $urlParserClass;

protected int $delayBetweenRequests = 0;

protected array $allowedMimeTypes = [];
@@ -102,6 +105,8 @@ public function __construct(
$this->crawlRequestFulfilledClass = CrawlRequestFulfilled::class;

$this->crawlRequestFailedClass = CrawlRequestFailed::class;

$this->urlParserClass = LinkUrlParser::class;
}

public function getDefaultScheme(): string
@@ -345,6 +350,18 @@ public function setCrawlFailedHandlerClass(string $crawlRequestFailedClass): self
return $this;
}

public function setUrlParserClass(string $urlParserClass): self
{
$this->urlParserClass = $urlParserClass;

return $this;
}

public function getUrlParserClass(): string
{
return $this->urlParserClass;
}

public function setBrowsershot(Browsershot $browsershot)
{
$this->browsershot = $browsershot;
@@ -430,7 +447,7 @@ public function startCrawling(UriInterface|string $baseUrl)
}
}

public function addToDepthTree(UriInterface $url, UriInterface $parentUrl, Node $node = null): ?Node
public function addToDepthTree(UriInterface $url, UriInterface $parentUrl, ?Node $node = null): ?Node
{
if (is_null($this->maximumDepth)) {
return new Node((string) $url);
9 changes: 5 additions & 4 deletions src/Handlers/CrawlRequestFulfilled.php
@@ -13,16 +13,17 @@
use Spatie\Crawler\CrawlerRobots;
use Spatie\Crawler\CrawlProfiles\CrawlSubdomains;
use Spatie\Crawler\CrawlUrl;
use Spatie\Crawler\LinkAdder;
use Spatie\Crawler\ResponseWithCachedBody;
use Spatie\Crawler\UrlParsers\UrlParser;

class CrawlRequestFulfilled
{
protected LinkAdder $linkAdder;
protected UrlParser $urlParser;

public function __construct(protected Crawler $crawler)
{
$this->linkAdder = new LinkAdder($this->crawler);
$urlParserClass = $this->crawler->getUrlParserClass();
$this->urlParser = new $urlParserClass($this->crawler);
}

public function __invoke(ResponseInterface $response, $index)
@@ -62,7 +63,7 @@ public function __invoke(ResponseInterface $response, $index)

$baseUrl = $this->getBaseUrl($response, $crawlUrl);

$this->linkAdder->addFromHtml($body, $baseUrl);
$this->urlParser->addFromHtml($body, $baseUrl);

usleep($this->crawler->getDelayBetweenRequests());
}
2 changes: 1 addition & 1 deletion src/ResponseWithCachedBody.php
@@ -20,7 +20,7 @@ public static function fromGuzzlePsr7Response(ResponseInterface $response): static
);
}

public function setCachedBody(string $body = null): void
public function setCachedBody(?string $body = null): void
{
$this->cachedBody = $body;
}
9 changes: 6 additions & 3 deletions src/LinkAdder.php → src/UrlParsers/LinkUrlParser.php
@@ -1,15 +1,18 @@
<?php

namespace Spatie\Crawler;
namespace Spatie\Crawler\UrlParsers;

use Illuminate\Support\Collection;
use InvalidArgumentException;
use Psr\Http\Message\UriInterface;
use Spatie\Crawler\Crawler;
use Spatie\Crawler\CrawlUrl;
use Spatie\Crawler\Url;
use Symfony\Component\DomCrawler\Crawler as DomCrawler;
use Symfony\Component\DomCrawler\Link;
use Tree\Node\Node;

class LinkAdder
class LinkUrlParser implements UrlParser
{
protected Crawler $crawler;

@@ -66,7 +69,7 @@ protected function extractLinksFromHtml(string $html, UriInterface $foundOnUrl):

return new Url($link->getUri(), $linkText);
} catch (InvalidArgumentException $exception) {
return;
return null;
}
})
->filter();
89 changes: 89 additions & 0 deletions src/UrlParsers/SitemapUrlParser.php
@@ -0,0 +1,89 @@
<?php

namespace Spatie\Crawler\UrlParsers;

use Illuminate\Support\Collection;
use InvalidArgumentException;
use Psr\Http\Message\UriInterface;
use Spatie\Crawler\Crawler;
use Spatie\Crawler\CrawlUrl;
use Spatie\Crawler\Url;
use Symfony\Component\DomCrawler\Crawler as DomCrawler;
use Tree\Node\Node;

class SitemapUrlParser implements UrlParser
{
protected Crawler $crawler;

public function __construct(Crawler $crawler)
{
$this->crawler = $crawler;
}

public function addFromHtml(string $html, UriInterface $foundOnUrl): void
{
$allLinks = $this->extractLinksFromHtml($html, $foundOnUrl);

collect($allLinks)
->filter(fn (Url $url) => $this->hasCrawlableScheme($url))
->map(fn (Url $url) => $this->normalizeUrl($url))
->filter(function (Url $url) use ($foundOnUrl) {
if (! $node = $this->crawler->addToDepthTree($url, $foundOnUrl)) {
return false;
}

return $this->shouldCrawl($node);
})
->filter(fn (Url $url) => ! str_contains($url->getPath(), '/tel:'))
->each(function (Url $url) use ($foundOnUrl) {
$crawlUrl = CrawlUrl::create($url, $foundOnUrl, linkText: $url->linkText());

$this->crawler->addToCrawlQueue($crawlUrl);
});
}

protected function extractLinksFromHtml(string $html, UriInterface $foundOnUrl): ?Collection
{
$domCrawler = new DomCrawler($html, $foundOnUrl);

return collect($domCrawler->filterXPath('//loc')
->each(function (DomCrawler $node) {
try {
$linkText = $node->text();

if ($linkText) {
$linkText = substr($linkText, 0, 4000);
}

return new Url($linkText, $linkText);
} catch (InvalidArgumentException $exception) {
return null;
}
}));
}

protected function hasCrawlableScheme(UriInterface $uri): bool
{
return in_array($uri->getScheme(), ['http', 'https']);
}

protected function normalizeUrl(UriInterface $url): UriInterface
{
return $url->withFragment('');
}

protected function shouldCrawl(Node $node): bool
{
if ($this->crawler->mustRespectRobots() && ! $this->crawler->getRobotsTxt()->allows($node->getValue(), $this->crawler->getUserAgent())) {
return false;
}

$maximumDepth = $this->crawler->getMaximumDepth();

if (is_null($maximumDepth)) {
return true;
}

return $node->getDepth() <= $maximumDepth;
}
}
13 changes: 13 additions & 0 deletions src/UrlParsers/UrlParser.php
@@ -0,0 +1,13 @@
<?php

namespace Spatie\Crawler\UrlParsers;

use Psr\Http\Message\UriInterface;
use Spatie\Crawler\Crawler;

interface UrlParser
{
public function __construct(Crawler $crawler);

public function addFromHtml(string $html, UriInterface $foundOnUrl): void;
}
8 changes: 4 additions & 4 deletions tests/CrawlObserverCollectionTest.php
@@ -20,17 +20,17 @@
public function crawled(
UriInterface $url,
ResponseInterface $response,
UriInterface $foundOnUrl = null,
string $linkText = null,
?UriInterface $foundOnUrl = null,
?string $linkText = null,
): void {
$this->crawled = true;
}

public function crawlFailed(
UriInterface $url,
RequestException $requestException,
UriInterface $foundOnUrl = null,
string $linkText = null,
?UriInterface $foundOnUrl = null,
?string $linkText = null,
): void {
$this->failed = true;
}
46 changes: 46 additions & 0 deletions tests/SitemapUrlParserTest.php
@@ -0,0 +1,46 @@
<?php

use Spatie\Crawler\Test\TestClasses\Log;
use Spatie\Crawler\UrlParsers\SitemapUrlParser;

beforeEach(function () {
skipIfTestServerIsNotRunning();

Log::reset();
});

it('should extract child sitemaps from sitemap index', function () {
createCrawler()
->setUrlParserClass(SitemapUrlParser::class)
->startCrawling('http://localhost:8080/sitemap_index.xml');

expect(['url' => 'http://localhost:8080/sitemap1.xml', 'foundOn' => 'http://localhost:8080/sitemap_index.xml'])
->toBeCrawledOnce();

expect(['url' => 'http://localhost:8080/sitemap2.xml', 'foundOn' => 'http://localhost:8080/sitemap_index.xml'])
->toBeCrawledOnce();
});

it('should extract urls from sitemaps through sitemap index', function () {
createCrawler()
->setUrlParserClass(SitemapUrlParser::class)
->startCrawling('http://localhost:8080/sitemap_index.xml');

expect(['url' => 'http://localhost:8080/', 'foundOn' => 'http://localhost:8080/sitemap1.xml'])
->toBeCrawledOnce();

expect(['url' => 'http://localhost:8080/link1', 'foundOn' => 'http://localhost:8080/sitemap1.xml'])
->toBeCrawledOnce();

expect(['url' => 'http://localhost:8080/link1-next', 'foundOn' => 'http://localhost:8080/sitemap2.xml'])
->toBeCrawledOnce();

expect(['url' => 'http://localhost:8080/link1-prev', 'foundOn' => 'http://localhost:8080/sitemap2.xml'])
->toBeCrawledOnce();

expect(['url' => 'http://localhost:8080/link2', 'foundOn' => 'http://localhost:8080/sitemap2.xml'])
->toBeCrawledOnce();

expect(['url' => 'http://localhost:8080/link3', 'foundOn' => 'http://localhost:8080/sitemap2.xml'])
->toBeCrawledOnce();
});
10 changes: 5 additions & 5 deletions tests/TestClasses/CrawlLogger.php
@@ -34,22 +34,22 @@ public function willCrawl(UriInterface $url, ?string $linkText): void
public function crawled(
UriInterface $url,
ResponseInterface $response,
UriInterface $foundOnUrl = null,
string $linkText = null,
?UriInterface $foundOnUrl = null,
?string $linkText = null,
): void {
$this->logCrawl($url, $foundOnUrl, $linkText);
}

public function crawlFailed(
UriInterface $url,
RequestException $requestException,
UriInterface $foundOnUrl = null,
string $linkText = null,
?UriInterface $foundOnUrl = null,
?string $linkText = null,
): void {
$this->logCrawl($url, $foundOnUrl, $linkText);
}

protected function logCrawl(UriInterface $url, ?UriInterface $foundOnUrl, string $linkText = null)
protected function logCrawl(UriInterface $url, ?UriInterface $foundOnUrl, ?string $linkText = null)
{
$logText = "{$this->observerId}hasBeenCrawled: {$url}";
