diff --git a/internal/pkg/crawl/config.go b/internal/pkg/crawl/config.go index ee9003bb..20fb21da 100644 --- a/internal/pkg/crawl/config.go +++ b/internal/pkg/crawl/config.go @@ -246,7 +246,10 @@ func GenerateCrawlConfig(config *config.Config) (*Crawl, error) { c.DomainsCrawl = config.DomainsCrawl c.DisableAssetsCapture = config.DisableAssetsCapture c.DisabledHTMLTags = config.DisableHTMLTag - c.ExcludedHosts = config.ExcludeHosts + + // We exclude some hosts by default + c.ExcludedHosts = utils.DedupeStrings(append(config.ExcludeHosts, "archive.org", "archive-it.org")) + c.IncludedHosts = config.IncludeHosts c.CaptureAlternatePages = config.CaptureAlternatePages c.ExcludedStrings = config.ExcludeString