
Commit

Add: --random-local-ip & --include-host
CorentinB committed Apr 25, 2024
1 parent d1f81f2 commit 60e711f
Showing 10 changed files with 38 additions and 14 deletions.
10 changes: 10 additions & 0 deletions cmd/cmd.go
@@ -138,6 +138,11 @@ var GlobalFlags = []cli.Flag{
Usage: "Exclude a specific host from the crawl, note that it will not exclude the domain if it is encountered as an asset for another web page.",
Destination: &config.App.Flags.ExcludedHosts,
},
&cli.StringSliceFlag{
Name: "include-host",
Usage: "Only crawl specific hosts, note that it will not include the domain if it is encountered as an asset for another web page.",
Destination: &config.App.Flags.IncludedHosts,
},
&cli.IntFlag{
Name: "max-concurrent-per-domain",
Value: 16,
@@ -303,6 +308,11 @@ var GlobalFlags = []cli.Flag{
Usage: "Discard any (discovered) URLs containing this string.",
Destination: &config.App.Flags.ExcludedStrings,
},
&cli.BoolFlag{
Name: "random-local-ip",
Usage: "Use random local IP for requests. (will be ignored if a proxy is set)",
Destination: &config.App.Flags.RandomLocalIP,
},
}

var Commands []*cli.Command
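
For context, a minimal standalone sketch (not part of this commit) of how the two new flags are expected to behave with urfave/cli v2, which the diff's Destination fields imply: repeated --include-host values accumulate into the cli.StringSlice destination, and --random-local-ip is a plain boolean toggle. The app name and printout below are illustrative only.

package main

import (
	"fmt"
	"log"
	"os"

	"github.com/urfave/cli/v2"
)

func main() {
	var includedHosts cli.StringSlice
	var randomLocalIP bool

	app := &cli.App{
		Name: "zeno-flags-sketch", // hypothetical name, for illustration only
		Flags: []cli.Flag{
			&cli.StringSliceFlag{Name: "include-host", Destination: &includedHosts},
			&cli.BoolFlag{Name: "random-local-ip", Destination: &randomLocalIP},
		},
		Action: func(*cli.Context) error {
			// e.g. --include-host example.com --include-host example.org --random-local-ip
			fmt.Println(includedHosts.Value(), randomLocalIP)
			return nil
		},
	}

	if err := app.Run(os.Args); err != nil {
		log.Fatal(err)
	}
}
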
1 change: 1 addition & 0 deletions cmd/utils.go
@@ -73,6 +73,7 @@ func InitCrawlWithCMD(flags config.Flags) *crawl.Crawl {
c.DisableAssetsCapture = flags.DisableAssetsCapture
c.DisabledHTMLTags = flags.DisabledHTMLTags.Value()
c.ExcludedHosts = flags.ExcludedHosts.Value()
c.IncludedHosts = flags.IncludedHosts.Value()
c.CaptureAlternatePages = flags.CaptureAlternatePages
c.ExcludedStrings = flags.ExcludedStrings.Value()

2 changes: 2 additions & 0 deletions config/config.go
@@ -16,6 +16,7 @@ type Flags struct {

DisabledHTMLTags cli.StringSlice
ExcludedHosts cli.StringSlice
IncludedHosts cli.StringSlice
DomainsCrawl bool
CaptureAlternatePages bool
HTTPTimeout int
@@ -25,6 +26,7 @@
RateLimitDelay int
CrawlTimeLimit int
MaxCrawlTimeLimit int
RandomLocalIP bool

Proxy string
BypassProxy cli.StringSlice
6 changes: 3 additions & 3 deletions go.mod
@@ -1,10 +1,10 @@
module github.com/internetarchive/Zeno

go 1.22
go 1.22.2

require (
git.archive.org/wb/gocrawlhq v1.2.4
github.com/CorentinB/warc v0.8.36
github.com/CorentinB/warc v0.8.38
github.com/PuerkitoBio/goquery v1.9.1
github.com/asaskevich/govalidator v0.0.0-20230301143203-a9d515a09cc2
github.com/beeker1121/goque v2.1.0+incompatible
@@ -77,7 +77,7 @@ require (
github.com/pkg/errors v0.9.1 // indirect
github.com/pmezard/go-difflib v1.0.0 // indirect
github.com/prometheus/client_model v0.6.0 // indirect
github.com/prometheus/common v0.50.0 // indirect
github.com/prometheus/common v0.53.0 // indirect
github.com/prometheus/procfs v0.13.0 // indirect
github.com/quic-go/quic-go v0.41.0 // indirect
github.com/refraction-networking/utls v1.6.3 // indirect
8 changes: 4 additions & 4 deletions go.sum
@@ -3,8 +3,8 @@ cloud.google.com/go v0.34.0/go.mod h1:aQUYkXzVsufM+DwF1aE+0xfcU+56JwCaLick0ClmMT
git.archive.org/wb/gocrawlhq v1.2.4 h1:Z/w1UwFfvq1m03IT0ZMvV6m18DiOgYEGxR5JVgJkQ/s=
git.archive.org/wb/gocrawlhq v1.2.4/go.mod h1:WiuNIB4Toqe8twVvwRu0fTSNC3KXFqA8/mAeaZ3GICE=
github.com/BurntSushi/toml v0.3.1/go.mod h1:xHWCNGjB5oqiDr8zfno3MHue2Ht5sIBksp03qcyfWMU=
github.com/CorentinB/warc v0.8.36 h1:7u/d81hee73y41UdXPVADsurUMI5ehSi6bYFRVvQCts=
github.com/CorentinB/warc v0.8.36/go.mod h1:/QOyyAq7FYGmaPKoXvrY1GXjzI/Br3w2WSMEeF0uil0=
github.com/CorentinB/warc v0.8.38 h1:83jzwW7erY39lZCVM0Cf0r24M5BePo/WNNJddWUV2B4=
github.com/CorentinB/warc v0.8.38/go.mod h1:Q9SHKf7pwcqzIWcxlzCtAWN8sKH+Q1BZxq1mSHJ9ttY=
github.com/PuerkitoBio/goquery v1.9.1 h1:mTL6XjbJTZdpfL+Gwl5U2h1l9yEkJjhmlTeV9VPW7UI=
github.com/PuerkitoBio/goquery v1.9.1/go.mod h1:cW1n6TmIMDoORQU5IU/P1T3tGFunOeXEpGP2WHRwkbY=
github.com/Shopify/sarama v1.19.0/go.mod h1:FVkBWblsNy7DGZRfXLU0O9RCGt5g3g3yEuWXgklEdEo=
@@ -233,8 +233,8 @@ github.com/prometheus/client_model v0.0.0-20190115171406-56726106282f/go.mod h1:
github.com/prometheus/client_model v0.6.0 h1:k1v3CzpSRUTrKMppY35TLwPvxHqBu0bYgxZzqGIgaos=
github.com/prometheus/client_model v0.6.0/go.mod h1:NTQHnmxFpouOD0DpvP4XujX3CdOAGQPoaGhyTchlyt8=
github.com/prometheus/common v0.2.0/go.mod h1:TNfzLD0ON7rHzMJeJkieUDPYmFC7Snx/y86RQel1bk4=
github.com/prometheus/common v0.50.0 h1:YSZE6aa9+luNa2da6/Tik0q0A5AbR+U003TItK57CPQ=
github.com/prometheus/common v0.50.0/go.mod h1:wHFBCEVWVmHMUpg7pYcOm2QUR/ocQdYSJVQJKnHc3xQ=
github.com/prometheus/common v0.53.0 h1:U2pL9w9nmJwJDa4qqLQ3ZaePJ6ZTwt7cMD3AG3+aLCE=
github.com/prometheus/common v0.53.0/go.mod h1:BrxBKv3FWBIGXw89Mg1AeBq7FSyRzXWI3l3e7W3RN5U=
github.com/prometheus/procfs v0.0.0-20181005140218-185b4288413d/go.mod h1:c3At6R/oaqEKCNdg8wHV1ftS6bRYblBhIjjI8uT2IGk=
github.com/prometheus/procfs v0.0.0-20190117184657-bf6a532e95b1/go.mod h1:c3At6R/oaqEKCNdg8wHV1ftS6bRYblBhIjjI8uT2IGk=
github.com/prometheus/procfs v0.13.0 h1:GqzLlQyfsPbaEHaQkO7tbDlriv/4o5Hudv6OXHGKX7o=
6 changes: 3 additions & 3 deletions internal/pkg/crawl/crawl.go
@@ -50,6 +50,7 @@ type Crawl struct {
Logger logrus.Logger
DisabledHTMLTags []string
ExcludedHosts []string
IncludedHosts []string
ExcludedStrings []string
UserAgent string
Job string
@@ -68,6 +69,7 @@
Headless bool
Seencheck bool
Workers int
RandomLocalIP bool

// Cookie-related settings
CookieFile string
@@ -216,9 +218,6 @@ func (c *Crawl) Start() (err error) {
// Init WARC rotator settings
rotatorSettings := c.initWARCRotatorSettings()

// Change WARC pool size
rotatorSettings.WARCWriterPoolSize = c.WARCPoolSize

dedupeOptions := warc.DedupeOptions{LocalDedupe: !c.DisableLocalDedupe, SizeThreshold: c.WARCDedupSize}
if c.CDXDedupeServer != "" {
dedupeOptions = warc.DedupeOptions{LocalDedupe: !c.DisableLocalDedupe, CDXDedupe: true, CDXURL: c.CDXDedupeServer, CDXCookie: c.WARCCustomCookie, SizeThreshold: c.WARCDedupSize}
@@ -233,6 +232,7 @@
VerifyCerts: c.CertValidation,
TempDir: c.WARCTempDir,
FullOnDisk: c.WARCFullOnDisk,
RandomLocalIP: c.RandomLocalIP,
}

c.Client, err = warc.NewWARCWritingHTTPClient(HTTPClientSettings)
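
The new RandomLocalIP setting is handed straight to the warc HTTP client; the actual mechanics live in github.com/CorentinB/warc, not in this repository. As a rough illustration of the general technique only (explicitly not the library's code), binding each outgoing connection to a randomly chosen local address can be done with a custom dialer:

package sketch

import (
	"context"
	"math/rand"
	"net"
	"net/http"
)

// clientWithRandomLocalIP is a hypothetical helper: it picks one of the given
// local IPs at random for every new connection. Sketch only; error handling
// and IPv4/IPv6 selection are omitted.
func clientWithRandomLocalIP(localIPs []net.IP) *http.Client {
	transport := &http.Transport{
		DialContext: func(ctx context.Context, network, addr string) (net.Conn, error) {
			d := net.Dialer{
				LocalAddr: &net.TCPAddr{IP: localIPs[rand.Intn(len(localIPs))]},
			}
			return d.DialContext(ctx, network, addr)
		},
	}
	return &http.Client{Transport: transport}
}

This also matches the flag's usage note: when a proxy is configured, connections are established by the proxy, so the local-IP selection has no effect.
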
5 changes: 3 additions & 2 deletions internal/pkg/crawl/outlinks.go
@@ -64,8 +64,9 @@ func (c *Crawl) queueOutlinks(outlinks []*url.URL, item *frontier.Item, wg *sync
for _, outlink := range outlinks {
outlink := outlink

// If the host of the outlink is in the host exclusion list, we ignore it
if utils.StringInSlice(outlink.Host, c.ExcludedHosts) {
// If the host of the outlink is in the host exclusion list, or the host is not in the host inclusion list
// if one is specified, we ignore the outlink
if utils.StringInSlice(outlink.Host, c.ExcludedHosts) || !c.checkIncludedHosts(outlink.Host) {
continue
}

11 changes: 10 additions & 1 deletion internal/pkg/crawl/utils.go
@@ -41,6 +41,15 @@ func (c *Crawl) crawlSpeedLimiter() {
}
}

func (c *Crawl) checkIncludedHosts(host string) bool {
// If no hosts are included, all hosts are included
if len(c.IncludedHosts) == 0 {
return true
}

return utils.StringInSlice(host, c.IncludedHosts)
}

func (c *Crawl) handleCrawlPause() {
for {
if float64(utils.GetFreeDiskSpace(c.JobPath).Avail)/float64(GB) <= 20 {
@@ -69,7 +78,7 @@ func (c *Crawl) seencheckURL(URL string, URLType string) bool {

func (c *Crawl) excludeHosts(URLs []*url.URL) (output []*url.URL) {
for _, URL := range URLs {
if utils.StringInSlice(URL.Host, c.ExcludedHosts) {
if utils.StringInSlice(URL.Host, c.ExcludedHosts) || !c.checkIncludedHosts(URL.Host) {
continue
} else {
output = append(output, URL)
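
A quick, hypothetical illustration of the intended semantics of checkIncludedHosts (not code from the commit, written as if called from inside the crawl package): an empty --include-host list means every host passes, otherwise only listed hosts do, and the exclusion list is still applied on top in excludeHosts, queueOutlinks, and Worker.

// Hypothetical usage from within the crawl package:
c := &Crawl{}
c.checkIncludedHosts("example.com") // true: no inclusion list, everything passes

c.IncludedHosts = []string{"example.com"}
c.checkIncludedHosts("example.com") // true: explicitly included
c.checkIncludedHosts("example.org") // false: an inclusion list exists and this host is not on it
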
1 change: 1 addition & 0 deletions internal/pkg/crawl/warc.go
@@ -15,6 +15,7 @@ func (c *Crawl) initWARCRotatorSettings() *warc.RotatorSettings {
rotatorSettings.Compression = "GZIP"
rotatorSettings.Prefix = c.WARCPrefix
rotatorSettings.WarcinfoContent.Set("software", fmt.Sprintf("Zeno %s", utils.GetVersion().Version))
rotatorSettings.WARCWriterPoolSize = c.WARCPoolSize

if len(c.WARCOperator) > 0 {
rotatorSettings.WarcinfoContent.Set("operator", c.WARCOperator)
2 changes: 1 addition & 1 deletion internal/pkg/crawl/worker.go
@@ -33,7 +33,7 @@ func (c *Crawl) Worker() {
}

// If the host of the item is in the host exclusion list, we skip it
if utils.StringInSlice(item.Host, c.ExcludedHosts) {
if utils.StringInSlice(item.Host, c.ExcludedHosts) || !c.checkIncludedHosts(item.Host) {
continue
}

