Skip to content

Commit

Permalink
add: --disable-ipv4 & --disable-ipv6
Browse files Browse the repository at this point in the history
  • Loading branch information
CorentinB committed Sep 24, 2024
1 parent 4b34542 commit c3339fd
Show file tree
Hide file tree
Showing 6 changed files with 26 additions and 13 deletions.
6 changes: 4 additions & 2 deletions cmd/get.go
Original file line number Diff line number Diff line change
Expand Up @@ -54,14 +54,16 @@ func getCMDsFlags(getCmd *cobra.Command) {
getCmd.PersistentFlags().Int("crawl-time-limit", 0, "Number of seconds until the crawl will automatically set itself into the finished state.")
getCmd.PersistentFlags().Int("crawl-max-time-limit", 0, "Number of seconds until the crawl will automatically panic itself. Default to crawl-time-limit + (crawl-time-limit / 10)")
getCmd.PersistentFlags().StringSlice("exclude-string", []string{}, "Discard any (discovered) URLs containing this string.")
getCmd.PersistentFlags().Bool("random-local-ip", false, "Use random local IP for requests. (will be ignored if a proxy is set)")
getCmd.PersistentFlags().Int("min-space-required", 20, "Minimum space required in GB to continue the crawl.")
getCmd.PersistentFlags().Bool("handover", false, "Use the handover mechanism that dispatch URLs via a buffer before enqueuing on disk. (UNSTABLE)")
getCmd.PersistentFlags().Bool("ultrasafe-queue", false, "Don't use committed batch writes to the WAL and instead fsync() after each write.")

// Proxy flags
// Network flags
getCmd.PersistentFlags().String("proxy", "", "Proxy to use when requesting pages.")
getCmd.PersistentFlags().StringSlice("bypass-proxy", []string{}, "Domains that should not be proxied.")
getCmd.PersistentFlags().Bool("random-local-ip", false, "Use random local IP for requests. (will be ignored if a proxy is set)")
getCmd.PersistentFlags().Bool("disable-ipv4", false, "Disable IPv4 for requests.")
getCmd.PersistentFlags().Bool("disable-ipv6", false, "Disable IPv6 for requests.")

// WARC flags
getCmd.PersistentFlags().String("warc-prefix", "ZENO", "Prefix to use when naming the WARC files.")
Expand Down
10 changes: 7 additions & 3 deletions config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,6 @@ type Config struct {
Cookies string `mapstructure:"cookies"`
APIPort string `mapstructure:"api-port"`
PrometheusPrefix string `mapstructure:"prometheus-prefix"`
Proxy string `mapstructure:"proxy"`
WARCPrefix string `mapstructure:"warc-prefix"`
WARCOperator string `mapstructure:"warc-operator"`
CDXDedupeServer string `mapstructure:"warc-cdx-dedupe-server"`
Expand All @@ -39,7 +38,6 @@ type Config struct {
ExcludeHosts []string `mapstructure:"exclude-host"`
IncludeHosts []string `mapstructure:"include-host"`
ExcludeString []string `mapstructure:"exclude-string"`
DomainsBypassProxy []string `mapstructure:"bypass-proxy"`
ElasticSearchURLs []string `mapstructure:"es-url"`
WorkersCount int `mapstructure:"workers"`
MaxConcurrentAssets int `mapstructure:"max-concurrent-assets"`
Expand All @@ -65,7 +63,6 @@ type Config struct {
Prometheus bool `mapstructure:"prometheus"`
DomainsCrawl bool `mapstructure:"domains-crawl"`
CaptureAlternatePages bool `mapstructure:"capture-alternate-pages"`
RandomLocalIP bool `mapstructure:"random-local-ip"`
WARCOnDisk bool `mapstructure:"warc-on-disk"`
DisableLocalDedupe bool `mapstructure:"disable-local-dedupe"`
CertValidation bool `mapstructure:"cert-validation"`
Expand All @@ -77,6 +74,13 @@ type Config struct {
NoBatchWriteWAL bool `mapstructure:"ultrasafe-queue"`
Handover bool `mapstructure:"handover"`

// Network
Proxy string `mapstructure:"proxy"`
DomainsBypassProxy []string `mapstructure:"bypass-proxy"`
RandomLocalIP bool `mapstructure:"random-local-ip"`
DisableIPv4 bool `mapstructure:"disable-ipv4"`
DisableIPv6 bool `mapstructure:"disable-ipv6"`

// Dependencies
NoYTDLP bool `mapstructure:"no-ytdlp"`
YTDLPPath string `mapstructure:"ytdlp-path"`
Expand Down
2 changes: 1 addition & 1 deletion go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ go 1.22.4

require (
git.archive.org/wb/gocrawlhq v1.2.10
github.com/CorentinB/warc v0.8.48
github.com/CorentinB/warc v0.8.49
github.com/PuerkitoBio/goquery v1.9.3
github.com/asaskevich/govalidator v0.0.0-20230301143203-a9d515a09cc2
github.com/clbanning/mxj/v2 v2.7.0
Expand Down
4 changes: 2 additions & 2 deletions go.sum
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
git.archive.org/wb/gocrawlhq v1.2.10 h1:E5F57S5tF6gltYe6k7VsKogt7U8/fY37ZVhihJq7wrM=
git.archive.org/wb/gocrawlhq v1.2.10/go.mod h1:JQIKgebFmpbxmEalNRjID3RwCxHkslt3PHAnum82KtM=
github.com/CorentinB/warc v0.8.48 h1:znLXHYSfC9aF0dt/CZTXckoamB73tL4FxNa+k0ey/Hk=
github.com/CorentinB/warc v0.8.48/go.mod h1:BikG8yz1B262Bk8JQVaFv6XFGimnd59IfPOuKeev84s=
github.com/CorentinB/warc v0.8.49 h1:wHuHXhyllWaEzgTChvG4MPr/VEDVAQ3u5M/52bZ1Hcg=
github.com/CorentinB/warc v0.8.49/go.mod h1:tq5f9jrDdLZ/jraEk21vb5cWinBETcVF9OEkPrbTaNg=
github.com/PuerkitoBio/goquery v1.9.3 h1:mpJr/ikUA9/GNJB/DBZcGeFDXUtosHRyRrwh7KGdTG0=
github.com/PuerkitoBio/goquery v1.9.3/go.mod h1:1ndLHPdTz+DyQPICCWYlYQMPl0oXZj0G6D4LCYA6u4U=
github.com/andybalholm/brotli v1.1.0 h1:eLKJA0d02Lf0mVpIDgYnqXcUn0GqVmEFny3VuID1U3M=
Expand Down
15 changes: 10 additions & 5 deletions internal/pkg/crawl/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -64,17 +64,19 @@ type Crawl struct {
CaptureAlternatePages bool
DomainsCrawl bool
Headless bool
RandomLocalIP bool
MinSpaceRequired int

// Cookie-related settings
CookieFile string
KeepCookies bool
CookieJar http.CookieJar

// proxy settings
Proxy string
BypassProxy []string
// Network settings
Proxy string
BypassProxy []string
RandomLocalIP bool
DisableIPv4 bool
DisableIPv6 bool

// API settings
API bool
Expand Down Expand Up @@ -275,9 +277,12 @@ func GenerateCrawlConfig(config *config.Config) (*Crawl, error) {
c.CookieFile = config.Cookies
c.KeepCookies = config.KeepCookies

// Proxy settings
// Network settings
c.Proxy = config.Proxy
c.BypassProxy = config.DomainsBypassProxy
c.RandomLocalIP = config.RandomLocalIP
c.DisableIPv4 = config.DisableIPv4
c.DisableIPv6 = config.DisableIPv6

// Crawl HQ settings
c.UseHQ = config.HQ
Expand Down
2 changes: 2 additions & 0 deletions internal/pkg/crawl/crawl.go
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,8 @@ func (c *Crawl) Start() (err error) {
TempDir: c.WARCTempDir,
FullOnDisk: c.WARCFullOnDisk,
RandomLocalIP: c.RandomLocalIP,
DisableIPv4: c.DisableIPv4,
DisableIPv6: c.DisableIPv6,
}

c.Client, err = warc.NewWARCWritingHTTPClient(HTTPClientSettings)
Expand Down

0 comments on commit c3339fd

Please sign in to comment.