Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add headless / headfull capabilities to Zeno #55

Open
wants to merge 16 commits into
base: main
Choose a base branch
from
Open
13 changes: 12 additions & 1 deletion cmd/cmd.go
Original file line number Diff line number Diff line change
Expand Up @@ -55,9 +55,20 @@ var GlobalFlags = []cli.Flag{
},
&cli.BoolFlag{
Name: "headless",
Usage: "Use headless browsers instead of standard GET requests.",
Usage: "Use headless browsers instead of standard GET requests. If --headfull is set, this flag will be ignored.",
Destination: &config.App.Flags.Headless,
},
&cli.BoolFlag{
Name: "headfull",
Usage: "Use headfull browsers instead of standard GET requests. If --headless is set, this flag will take precedence.",
Destination: &config.App.Flags.Headfull,
},
&cli.Uint64Flag{
Name: "headless-wait-after-load",
Usage: "Wait the given number of seconds after the page has loaded before closing the page.",
Value: 0,
Destination: &config.App.Flags.HeadlessWaitAfterLoad,
},
&cli.BoolFlag{
Name: "local-seencheck",
Usage: "Simple local seencheck to avoid re-crawling of URIs.",
Expand Down
15 changes: 14 additions & 1 deletion cmd/utils.go
Original file line number Diff line number Diff line change
Expand Up @@ -102,8 +102,8 @@ func InitCrawlWithCMD(flags config.Flags) *crawl.Crawl {
version := utils.GetVersion()
c.UserAgent = "Mozilla/5.0 (compatible; archive.org_bot +http://archive.org/details/archive.org_bot) Zeno/" + version.Version[:7] + " warc/" + version.WarcVersion
}
c.Headless = flags.Headless

c.Headless = flags.Headless
c.CookieFile = flags.CookieFile
c.KeepCookies = flags.KeepCookies

Expand All @@ -121,5 +121,18 @@ func InitCrawlWithCMD(flags config.Flags) *crawl.Crawl {
c.HQBatchSize = int(flags.HQBatchSize)
c.HQContinuousPull = flags.HQContinuousPull

// Headless settings
c.Headless = flags.Headless
c.Headfull = flags.Headfull
c.HeadlessWaitAfterLoad = flags.HeadlessWaitAfterLoad

// If Headfull is true, force Headless to true as well. In the context
// of Zeno, "headless" is considered a method of crawling, while
// "headfull" is considered a method of rendering for the browser, so
// enabling headfull implies enabling headless.
if c.Headfull {
c.Headless = true
}

return c
}
7 changes: 5 additions & 2 deletions config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@ type Flags struct {
Workers int
MaxConcurrentAssets int
MaxHops uint
Headless bool
Seencheck bool
JSON bool
LiveStats bool
Expand Down Expand Up @@ -56,7 +55,11 @@ type Flags struct {
DisableAssetsCapture bool
CertValidation bool

CloudflareStream bool
Headless bool
Headfull bool
HeadlessWaitAfterLoad uint64

Cloudflarestream bool
ElasticSearchURL string
ExcludedStrings cli.StringSlice
}
Expand Down
115 changes: 62 additions & 53 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -4,100 +4,109 @@ go 1.21

require (
git.archive.org/wb/gocrawlhq v1.2.4
github.com/CorentinB/warc v0.8.33
github.com/CorentinB/warc v0.8.34
github.com/PuerkitoBio/goquery v1.8.1
github.com/asaskevich/govalidator v0.0.0-20200907205600-7a23bdc65eef
github.com/asaskevich/govalidator v0.0.0-20230301143203-a9d515a09cc2
github.com/beeker1121/goque v2.1.0+incompatible
github.com/dustin/go-humanize v1.0.1
github.com/gin-contrib/pprof v1.4.0
github.com/gin-gonic/gin v1.9.0
github.com/google/uuid v1.3.0
github.com/gin-gonic/gin v1.9.1
github.com/go-rod/rod v0.114.5
github.com/go-rod/stealth v0.4.9
github.com/google/uuid v1.3.1
github.com/gosuri/uilive v0.0.4
github.com/gosuri/uitable v0.0.4
github.com/internetarchive/elogrus v0.0.0-20230725172814-093db31a64fc
github.com/lestrrat-go/file-rotatelogs v2.4.0+incompatible
github.com/olivere/elastic/v7 v7.0.32
github.com/orirawlings/persistent-cookiejar v0.3.2
github.com/paulbellamy/ratecounter v0.2.0
github.com/philippgille/gokv/leveldb v0.6.0
github.com/prometheus/client_golang v1.15.1
github.com/prometheus/client_golang v1.16.0
github.com/remeh/sizedwaitgroup v1.0.0
github.com/sirupsen/logrus v1.6.0
github.com/spf13/afero v1.6.0
github.com/stretchr/testify v1.8.1
github.com/telanflow/cookiejar v0.0.0-20190719062046-114449e86aa5
github.com/tidwall/gjson v1.14.0
github.com/sirupsen/logrus v1.9.3
github.com/spf13/afero v1.9.5
github.com/stretchr/testify v1.8.4
github.com/tidwall/gjson v1.16.0
github.com/tomnomnom/linkheader v0.0.0-20180905144013-02ca5825eb80
github.com/urfave/cli/v2 v2.2.0
github.com/urfave/cli/v2 v2.25.7
github.com/zeebo/xxh3 v1.0.2
go.uber.org/goleak v1.2.1
golang.org/x/net v0.10.0
golang.org/x/net v0.15.0
mvdan.cc/xurls/v2 v2.5.0
)

require (
github.com/andybalholm/brotli v1.0.5 // indirect
github.com/andybalholm/cascadia v1.3.1 // indirect
github.com/andybalholm/cascadia v1.3.2 // indirect
github.com/beorn7/perks v1.0.1 // indirect
github.com/bytedance/sonic v1.8.0 // indirect
github.com/bytedance/sonic v1.10.1 // indirect
github.com/cespare/xxhash/v2 v2.2.0 // indirect
github.com/chenzhuoyu/base64x v0.0.0-20221115062448-fe3a3abad311 // indirect
github.com/cpuguy83/go-md2man/v2 v2.0.0 // indirect
github.com/chenzhuoyu/base64x v0.0.0-20230717121745-296ad89f973d // indirect
github.com/chenzhuoyu/iasm v0.9.0 // indirect
github.com/cloudflare/circl v1.3.3 // indirect
github.com/cpuguy83/go-md2man/v2 v2.0.2 // indirect
github.com/davecgh/go-spew v1.1.1 // indirect
github.com/fastly/go-utils v0.0.0-20180712184237-d95a45783239 // indirect
github.com/fatih/color v1.9.0 // indirect
github.com/gaukas/godicttls v0.0.3 // indirect
github.com/fatih/color v1.15.0 // indirect
github.com/gabriel-vasile/mimetype v1.4.2 // indirect
github.com/gaukas/godicttls v0.0.4 // indirect
github.com/gin-contrib/sse v0.1.0 // indirect
github.com/go-playground/locales v0.14.1 // indirect
github.com/go-playground/universal-translator v0.18.1 // indirect
github.com/go-playground/validator/v10 v10.11.2 // indirect
github.com/go-playground/validator/v10 v10.15.4 // indirect
github.com/gobwas/httphead v0.1.0 // indirect
github.com/gobwas/pool v0.2.1 // indirect
github.com/gobwas/ws v1.2.1 // indirect
github.com/goccy/go-json v0.10.0 // indirect
github.com/gobwas/ws v1.3.0 // indirect
github.com/goccy/go-json v0.10.2 // indirect
github.com/golang/protobuf v1.5.3 // indirect
github.com/golang/snappy v0.0.1 // indirect
github.com/gomodule/redigo v1.8.8 // indirect
github.com/jehiah/go-strftime v0.0.0-20171201141054-1d33003b3869 // indirect
github.com/jonboulle/clockwork v0.2.2 // indirect
github.com/golang/snappy v0.0.4 // indirect
github.com/jonboulle/clockwork v0.4.0 // indirect
github.com/josharian/intern v1.0.0 // indirect
github.com/json-iterator/go v1.1.12 // indirect
github.com/klauspost/compress v1.16.5 // indirect
github.com/klauspost/cpuid/v2 v2.0.9 // indirect
github.com/klauspost/compress v1.16.7 // indirect
github.com/klauspost/cpuid/v2 v2.2.5 // indirect
github.com/klauspost/pgzip v1.2.6 // indirect
github.com/konsorten/go-windows-terminal-sequences v1.0.3 // indirect
github.com/leodido/go-urn v1.2.1 // indirect
github.com/lestrrat-go/strftime v1.0.3 // indirect
github.com/leodido/go-urn v1.2.4 // indirect
github.com/lestrrat-go/strftime v1.0.6 // indirect
github.com/mailru/easyjson v0.7.7 // indirect
github.com/mattn/go-colorable v0.1.4 // indirect
github.com/mattn/go-isatty v0.0.17 // indirect
github.com/mattn/go-runewidth v0.0.13 // indirect
github.com/mattn/go-colorable v0.1.13 // indirect
github.com/mattn/go-isatty v0.0.19 // indirect
github.com/mattn/go-runewidth v0.0.15 // indirect
github.com/matttproud/golang_protobuf_extensions v1.0.4 // indirect
github.com/mengzhuo/cookiestxt v1.0.3 // indirect
github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect
github.com/modern-go/reflect2 v1.0.2 // indirect
github.com/pelletier/go-toml/v2 v2.0.6 // indirect
github.com/philippgille/gokv/encoding v0.0.0-20191011213304-eb77f15b9c61 // indirect
github.com/philippgille/gokv/util v0.0.0-20191011213304-eb77f15b9c61 // indirect
github.com/pelletier/go-toml/v2 v2.1.0 // indirect
github.com/philippgille/gokv/encoding v0.6.0 // indirect
github.com/philippgille/gokv/util v0.6.0 // indirect
github.com/pkg/errors v0.9.1 // indirect
github.com/pmezard/go-difflib v1.0.0 // indirect
github.com/prometheus/client_model v0.3.0 // indirect
github.com/prometheus/common v0.42.0 // indirect
github.com/prometheus/procfs v0.9.0 // indirect
github.com/refraction-networking/utls v1.3.2 // indirect
github.com/rivo/uniseg v0.2.0 // indirect
github.com/russross/blackfriday/v2 v2.0.1 // indirect
github.com/prometheus/client_model v0.4.0 // indirect
github.com/prometheus/common v0.44.0 // indirect
github.com/prometheus/procfs v0.11.1 // indirect
github.com/quic-go/quic-go v0.38.1 // indirect
github.com/refraction-networking/utls v1.5.3 // indirect
github.com/rivo/uniseg v0.4.4 // indirect
github.com/russross/blackfriday/v2 v2.1.0 // indirect
github.com/satori/go.uuid v1.2.0 // indirect
github.com/shurcooL/sanitized_anchor_name v1.0.0 // indirect
github.com/syndtr/goleveldb v1.0.0 // indirect
github.com/tebeka/strftime v0.1.5 // indirect
github.com/tidwall/match v1.1.1 // indirect
github.com/tidwall/pretty v1.2.0 // indirect
github.com/tidwall/pretty v1.2.1 // indirect
github.com/twitchyliquid64/golang-asm v0.15.1 // indirect
github.com/ugorji/go/codec v1.2.9 // indirect
golang.org/x/arch v0.0.0-20210923205945-b76863e36670 // indirect
golang.org/x/crypto v0.9.0 // indirect
golang.org/x/sync v0.2.0 // indirect
golang.org/x/sys v0.8.0 // indirect
golang.org/x/text v0.9.0 // indirect
google.golang.org/protobuf v1.30.0 // indirect
github.com/ugorji/go/codec v1.2.11 // indirect
github.com/xrash/smetrics v0.0.0-20201216005158-039620a65673 // indirect
github.com/ysmood/fetchup v0.2.3 // indirect
github.com/ysmood/goob v0.4.0 // indirect
github.com/ysmood/got v0.35.1 // indirect
github.com/ysmood/gson v0.7.3 // indirect
github.com/ysmood/leakless v0.8.0 // indirect
go4.org v0.0.0-20190313082347-94abd6928b1d // indirect
golang.org/x/arch v0.5.0 // indirect
golang.org/x/crypto v0.13.0 // indirect
golang.org/x/sync v0.3.0 // indirect
golang.org/x/sys v0.12.0 // indirect
golang.org/x/text v0.13.0 // indirect
google.golang.org/protobuf v1.31.0 // indirect
gopkg.in/retry.v1 v1.0.3 // indirect
gopkg.in/yaml.v3 v3.0.1 // indirect
)
Loading
Loading