-
-
Notifications
You must be signed in to change notification settings - Fork 359
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
* feat: allow custom url parsers Previously named LinkAdder * fix: removed incorrect type check * feat: added sitemap url parser * tests: added sitemap url parsing test * chore: added readme documentation for custom url parsers * Update README.md * tests: added more consistent tests with sitemapindex and sitemap childs * Fix styling * chore: added documentation for the built-in SitemapUrlParser --------- Co-authored-by: Freek Van der Herten <[email protected]> Co-authored-by: Velka-DEV <[email protected]>
- Loading branch information
1 parent
25c378a
commit f4958b4
Showing
14 changed files
with
777 additions
and
272 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,89 @@ | ||
<?php | ||
|
||
namespace Spatie\Crawler\UrlParsers; | ||
|
||
use Illuminate\Support\Collection; | ||
use InvalidArgumentException; | ||
use Psr\Http\Message\UriInterface; | ||
use Spatie\Crawler\Crawler; | ||
use Spatie\Crawler\CrawlUrl; | ||
use Spatie\Crawler\Url; | ||
use Symfony\Component\DomCrawler\Crawler as DomCrawler; | ||
use Tree\Node\Node; | ||
|
||
class SitemapUrlParser implements UrlParser
{
    /** Crawler that receives every discovered sitemap URL. */
    protected Crawler $crawler;

    public function __construct(Crawler $crawler)
    {
        $this->crawler = $crawler;
    }

    /**
     * Extract `<loc>` entries from a sitemap (or sitemap-index) document and
     * add every crawlable URL to the crawler's queue.
     *
     * Despite the parameter name, $html is expected to be sitemap XML; the
     * DomCrawler XPath below matches `<loc>` nodes regardless of nesting.
     *
     * @param string       $html       Raw response body of the sitemap.
     * @param UriInterface $foundOnUrl URL the sitemap was fetched from.
     */
    public function addFromHtml(string $html, UriInterface $foundOnUrl): void
    {
        $allLinks = $this->extractLinksFromHtml($html, $foundOnUrl);

        collect($allLinks)
            // Malformed <loc> values yield null from extractLinksFromHtml();
            // drop them here, otherwise the typed closures below would raise
            // a TypeError when handed null instead of a Url.
            ->filter()
            ->filter(fn (Url $url) => $this->hasCrawlableScheme($url))
            ->map(fn (Url $url) => $this->normalizeUrl($url))
            ->filter(function (Url $url) use ($foundOnUrl) {
                // Register the URL in the depth tree; a falsy node means the
                // URL falls outside the crawl profile and must be skipped.
                if (! $node = $this->crawler->addToDepthTree($url, $foundOnUrl)) {
                    return false;
                }

                return $this->shouldCrawl($node);
            })
            ->filter(fn (Url $url) => ! str_contains($url->getPath(), '/tel:'))
            ->each(function (Url $url) use ($foundOnUrl) {
                $crawlUrl = CrawlUrl::create($url, $foundOnUrl, linkText: $url->linkText());

                $this->crawler->addToCrawlQueue($crawlUrl);
            });
    }

    /**
     * Collect one Url per `<loc>` node found in the document.
     *
     * Entries whose text cannot be turned into a Url produce null in the
     * returned collection (callers must filter them out).
     *
     * @return Collection|null Collection of Url|null entries.
     */
    protected function extractLinksFromHtml(string $html, UriInterface $foundOnUrl): ?Collection
    {
        $domCrawler = new DomCrawler($html, $foundOnUrl);

        return collect($domCrawler->filterXPath('//loc')
            ->each(function (DomCrawler $node) {
                try {
                    $linkText = $node->text();

                    if ($linkText) {
                        // Cap very long URLs; 4000 chars mirrors the limit
                        // used elsewhere in this package.
                        $linkText = substr($linkText, 0, 4000);
                    }

                    return new Url($linkText, $linkText);
                } catch (InvalidArgumentException $exception) {
                    // Unparseable URL: signal with null, caller filters it.
                    return null;
                }
            }));
    }

    /** Only http(s) URLs are crawlable. */
    protected function hasCrawlableScheme(UriInterface $uri): bool
    {
        return in_array($uri->getScheme(), ['http', 'https']);
    }

    /** Strip the fragment so `page#a` and `page#b` dedupe to one crawl. */
    protected function normalizeUrl(UriInterface $url): UriInterface
    {
        return $url->withFragment('');
    }

    /**
     * Decide whether a depth-tree node should be crawled, honouring
     * robots.txt (when enabled) and the configured maximum depth.
     */
    protected function shouldCrawl(Node $node): bool
    {
        if ($this->crawler->mustRespectRobots() && ! $this->crawler->getRobotsTxt()->allows($node->getValue(), $this->crawler->getUserAgent())) {
            return false;
        }

        $maximumDepth = $this->crawler->getMaximumDepth();

        if (is_null($maximumDepth)) {
            return true;
        }

        return $node->getDepth() <= $maximumDepth;
    }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,13 @@ | ||
<?php | ||
|
||
namespace Spatie\Crawler\UrlParsers; | ||
|
||
use Psr\Http\Message\UriInterface; | ||
use Spatie\Crawler\Crawler; | ||
|
||
/**
 * Contract for classes that extract URLs from a fetched response body
 * and feed them to the crawler (e.g. LinkUrlParser, SitemapUrlParser).
 */
interface UrlParser
{
    /** Parsers are constructed with the crawler they report URLs to. */
    public function __construct(Crawler $crawler);

    /**
     * Parse the response body and queue any URLs found on the crawler.
     *
     * @param string       $html       Raw response body.
     * @param UriInterface $foundOnUrl URL the body was fetched from.
     */
    public function addFromHtml(string $html, UriInterface $foundOnUrl): void;
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,46 @@ | ||
<?php | ||
|
||
use Spatie\Crawler\Test\TestClasses\Log; | ||
use Spatie\Crawler\UrlParsers\SitemapUrlParser; | ||
|
||
beforeEach(function () {
    // These tests need the local test server; skip gracefully when absent.
    skipIfTestServerIsNotRunning();

    // Clear the crawl log so each test asserts only its own crawl results.
    Log::reset();
});
|
||
it('should extract child sitemaps from sitemap index', function () {
    // Crawl a sitemap index using the sitemap-aware parser.
    createCrawler()
        ->setUrlParserClass(SitemapUrlParser::class)
        ->startCrawling('http://localhost:8080/sitemap_index.xml');

    // Both child sitemaps listed in the index must be crawled exactly once.
    expect(['url' => 'http://localhost:8080/sitemap1.xml', 'foundOn' => 'http://localhost:8080/sitemap_index.xml'])
        ->toBeCrawledOnce();

    expect(['url' => 'http://localhost:8080/sitemap2.xml', 'foundOn' => 'http://localhost:8080/sitemap_index.xml'])
        ->toBeCrawledOnce();
});
|
||
// Fixed typo in the test description: "trough" -> "through".
it('should extract urls from sitemaps through sitemap index', function () {
    // Crawl the index; the parser should follow it into the child sitemaps
    // and queue every page URL they contain.
    createCrawler()
        ->setUrlParserClass(SitemapUrlParser::class)
        ->startCrawling('http://localhost:8080/sitemap_index.xml');

    // URLs from sitemap1.xml.
    expect(['url' => 'http://localhost:8080/', 'foundOn' => 'http://localhost:8080/sitemap1.xml'])
        ->toBeCrawledOnce();

    expect(['url' => 'http://localhost:8080/link1', 'foundOn' => 'http://localhost:8080/sitemap1.xml'])
        ->toBeCrawledOnce();

    // URLs from sitemap2.xml.
    expect(['url' => 'http://localhost:8080/link1-next', 'foundOn' => 'http://localhost:8080/sitemap2.xml'])
        ->toBeCrawledOnce();

    expect(['url' => 'http://localhost:8080/link1-prev', 'foundOn' => 'http://localhost:8080/sitemap2.xml'])
        ->toBeCrawledOnce();

    expect(['url' => 'http://localhost:8080/link2', 'foundOn' => 'http://localhost:8080/sitemap2.xml'])
        ->toBeCrawledOnce();

    expect(['url' => 'http://localhost:8080/link3', 'foundOn' => 'http://localhost:8080/sitemap2.xml'])
        ->toBeCrawledOnce();
});
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.