From c3339fd353deeb68a6c3a6afc40620241a72f150 Mon Sep 17 00:00:00 2001 From: Corentin Barreau Date: Tue, 24 Sep 2024 10:34:46 +0200 Subject: [PATCH] add: --disable-ipv4 & --disable-ipv6 --- cmd/get.go | 6 ++++-- config/config.go | 10 +++++++--- go.mod | 2 +- go.sum | 4 ++-- internal/pkg/crawl/config.go | 15 ++++++++++----- internal/pkg/crawl/crawl.go | 2 ++ 6 files changed, 26 insertions(+), 13 deletions(-) diff --git a/cmd/get.go b/cmd/get.go index 03a92867..68010c9e 100644 --- a/cmd/get.go +++ b/cmd/get.go @@ -54,14 +54,16 @@ func getCMDsFlags(getCmd *cobra.Command) { getCmd.PersistentFlags().Int("crawl-time-limit", 0, "Number of seconds until the crawl will automatically set itself into the finished state.") getCmd.PersistentFlags().Int("crawl-max-time-limit", 0, "Number of seconds until the crawl will automatically panic itself. Default to crawl-time-limit + (crawl-time-limit / 10)") getCmd.PersistentFlags().StringSlice("exclude-string", []string{}, "Discard any (discovered) URLs containing this string.") - getCmd.PersistentFlags().Bool("random-local-ip", false, "Use random local IP for requests. (will be ignored if a proxy is set)") getCmd.PersistentFlags().Int("min-space-required", 20, "Minimum space required in GB to continue the crawl.") getCmd.PersistentFlags().Bool("handover", false, "Use the handover mechanism that dispatch URLs via a buffer before enqueuing on disk. (UNSTABLE)") getCmd.PersistentFlags().Bool("ultrasafe-queue", false, "Don't use committed batch writes to the WAL and instead fsync() after each write.") - // Proxy flags + // Network flags getCmd.PersistentFlags().String("proxy", "", "Proxy to use when requesting pages.") getCmd.PersistentFlags().StringSlice("bypass-proxy", []string{}, "Domains that should not be proxied.") + getCmd.PersistentFlags().Bool("random-local-ip", false, "Use random local IP for requests. (will be ignored if a proxy is set)") + getCmd.PersistentFlags().Bool("disable-ipv4", false, "Disable IPv4 for requests.") + getCmd.PersistentFlags().Bool("disable-ipv6", false, "Disable IPv6 for requests.") // WARC flags getCmd.PersistentFlags().String("warc-prefix", "ZENO", "Prefix to use when naming the WARC files.") diff --git a/config/config.go b/config/config.go index fc30ffd1..a2261a72 100644 --- a/config/config.go +++ b/config/config.go @@ -20,7 +20,6 @@ type Config struct { Cookies string `mapstructure:"cookies"` APIPort string `mapstructure:"api-port"` PrometheusPrefix string `mapstructure:"prometheus-prefix"` - Proxy string `mapstructure:"proxy"` WARCPrefix string `mapstructure:"warc-prefix"` WARCOperator string `mapstructure:"warc-operator"` CDXDedupeServer string `mapstructure:"warc-cdx-dedupe-server"` @@ -39,7 +38,6 @@ type Config struct { ExcludeHosts []string `mapstructure:"exclude-host"` IncludeHosts []string `mapstructure:"include-host"` ExcludeString []string `mapstructure:"exclude-string"` - DomainsBypassProxy []string `mapstructure:"bypass-proxy"` ElasticSearchURLs []string `mapstructure:"es-url"` WorkersCount int `mapstructure:"workers"` MaxConcurrentAssets int `mapstructure:"max-concurrent-assets"` @@ -65,7 +63,6 @@ type Config struct { Prometheus bool `mapstructure:"prometheus"` DomainsCrawl bool `mapstructure:"domains-crawl"` CaptureAlternatePages bool `mapstructure:"capture-alternate-pages"` - RandomLocalIP bool `mapstructure:"random-local-ip"` WARCOnDisk bool `mapstructure:"warc-on-disk"` DisableLocalDedupe bool `mapstructure:"disable-local-dedupe"` CertValidation bool `mapstructure:"cert-validation"` @@ -77,6 +74,13 @@ type Config struct { NoBatchWriteWAL bool `mapstructure:"ultrasafe-queue"` Handover bool `mapstructure:"handover"` + // Network + Proxy string `mapstructure:"proxy"` + DomainsBypassProxy []string `mapstructure:"bypass-proxy"` + RandomLocalIP bool `mapstructure:"random-local-ip"` + DisableIPv4 bool `mapstructure:"disable-ipv4"` + DisableIPv6 bool `mapstructure:"disable-ipv6"` + // Dependencies NoYTDLP bool `mapstructure:"no-ytdlp"` YTDLPPath string `mapstructure:"ytdlp-path"` diff --git a/go.mod b/go.mod index 38ff2958..03514c6d 100644 --- a/go.mod +++ b/go.mod @@ -4,7 +4,7 @@ go 1.22.4 require ( git.archive.org/wb/gocrawlhq v1.2.10 - github.com/CorentinB/warc v0.8.48 + github.com/CorentinB/warc v0.8.49 github.com/PuerkitoBio/goquery v1.9.3 github.com/asaskevich/govalidator v0.0.0-20230301143203-a9d515a09cc2 github.com/clbanning/mxj/v2 v2.7.0 diff --git a/go.sum b/go.sum index 09622212..3e463fe4 100644 --- a/go.sum +++ b/go.sum @@ -1,7 +1,7 @@ git.archive.org/wb/gocrawlhq v1.2.10 h1:E5F57S5tF6gltYe6k7VsKogt7U8/fY37ZVhihJq7wrM= git.archive.org/wb/gocrawlhq v1.2.10/go.mod h1:JQIKgebFmpbxmEalNRjID3RwCxHkslt3PHAnum82KtM= -github.com/CorentinB/warc v0.8.48 h1:znLXHYSfC9aF0dt/CZTXckoamB73tL4FxNa+k0ey/Hk= -github.com/CorentinB/warc v0.8.48/go.mod h1:BikG8yz1B262Bk8JQVaFv6XFGimnd59IfPOuKeev84s= +github.com/CorentinB/warc v0.8.49 h1:wHuHXhyllWaEzgTChvG4MPr/VEDVAQ3u5M/52bZ1Hcg= +github.com/CorentinB/warc v0.8.49/go.mod h1:tq5f9jrDdLZ/jraEk21vb5cWinBETcVF9OEkPrbTaNg= github.com/PuerkitoBio/goquery v1.9.3 h1:mpJr/ikUA9/GNJB/DBZcGeFDXUtosHRyRrwh7KGdTG0= github.com/PuerkitoBio/goquery v1.9.3/go.mod h1:1ndLHPdTz+DyQPICCWYlYQMPl0oXZj0G6D4LCYA6u4U= github.com/andybalholm/brotli v1.1.0 h1:eLKJA0d02Lf0mVpIDgYnqXcUn0GqVmEFny3VuID1U3M= diff --git a/internal/pkg/crawl/config.go b/internal/pkg/crawl/config.go index 2d383460..c29b1ae9 100644 --- a/internal/pkg/crawl/config.go +++ b/internal/pkg/crawl/config.go @@ -64,7 +64,6 @@ type Crawl struct { CaptureAlternatePages bool DomainsCrawl bool Headless bool - RandomLocalIP bool MinSpaceRequired int // Cookie-related settings @@ -72,9 +71,12 @@ type Crawl struct { KeepCookies bool CookieJar http.CookieJar - // proxy settings - Proxy string - BypassProxy []string + // Network settings + Proxy string + BypassProxy []string + RandomLocalIP bool + DisableIPv4 bool + DisableIPv6 bool // API settings API bool @@ -275,9 +277,12 @@ func GenerateCrawlConfig(config *config.Config) (*Crawl, error) { c.CookieFile = config.Cookies c.KeepCookies = config.KeepCookies - // Proxy settings + // Network settings c.Proxy = config.Proxy c.BypassProxy = config.DomainsBypassProxy + c.RandomLocalIP = config.RandomLocalIP + c.DisableIPv4 = config.DisableIPv4 + c.DisableIPv6 = config.DisableIPv6 // Crawl HQ settings c.UseHQ = config.HQ diff --git a/internal/pkg/crawl/crawl.go b/internal/pkg/crawl/crawl.go index a7549136..b5ee8137 100644 --- a/internal/pkg/crawl/crawl.go +++ b/internal/pkg/crawl/crawl.go @@ -108,6 +108,8 @@ func (c *Crawl) Start() (err error) { TempDir: c.WARCTempDir, FullOnDisk: c.WARCFullOnDisk, RandomLocalIP: c.RandomLocalIP, + DisableIPv4: c.DisableIPv4, + DisableIPv6: c.DisableIPv6, } c.Client, err = warc.NewWARCWritingHTTPClient(HTTPClientSettings)