Add state management for workers #65

Merged · 3 commits · Jun 29, 2024
1 change: 1 addition & 0 deletions .gitignore
@@ -1,4 +1,5 @@
jobs/*
jobs/
Zeno
*.txt
*.sh
5 changes: 3 additions & 2 deletions cmd/utils.go
@@ -10,7 +10,6 @@ import (
"github.com/internetarchive/Zeno/internal/pkg/frontier"
"github.com/internetarchive/Zeno/internal/pkg/utils"
"github.com/paulbellamy/ratecounter"
"github.com/remeh/sizedwaitgroup"
"github.com/sirupsen/logrus"
)

@@ -50,8 +49,10 @@ func InitCrawlWithCMD(flags config.Flags) *crawl.Crawl {
c.JobPath = path.Join("jobs", flags.Job)

c.Workers = flags.Workers
c.WorkerPool = sizedwaitgroup.New(c.Workers)
c.WorkerPool = make([]*crawl.Worker, 0)
c.WorkerStopTimeout = time.Second * 60 // Placeholder for WorkerStopTimeout
c.MaxConcurrentAssets = flags.MaxConcurrentAssets
c.WorkerStopSignal = make(chan bool)

c.Seencheck = flags.Seencheck
c.HTTPTimeout = flags.HTTPTimeout
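Note on the cmd/utils.go change: the sizedwaitgroup-based pool is replaced by an explicit slice of workers plus a stop channel and a stop timeout, which lets the crawl enumerate and stop individual workers. A minimal, self-contained sketch of that pattern follows; the worker type, its run loop, and the constants used here are illustrative assumptions, not code from this pull request.

```go
package main

import (
	"fmt"
	"sync"
	"time"
)

// worker is a stand-in for crawl.Worker; the real type added by this pull
// request lives elsewhere in the branch and may differ.
type worker struct {
	id uint
}

func (w *worker) run(stop <-chan bool, wg *sync.WaitGroup) {
	defer wg.Done()
	for {
		select {
		case <-stop: // a closed channel delivers to every worker at once
			fmt.Printf("worker %d: stopping\n", w.id)
			return
		default:
			time.Sleep(10 * time.Millisecond) // stand-in for capturing one item
		}
	}
}

func main() {
	const workers = 3

	pool := make([]*worker, 0)    // mirrors c.WorkerPool = make([]*crawl.Worker, 0)
	stopSignal := make(chan bool) // mirrors c.WorkerStopSignal = make(chan bool)

	var wg sync.WaitGroup
	for i := uint(0); i < workers; i++ {
		w := &worker{id: i}
		pool = append(pool, w)
		wg.Add(1)
		go w.run(stopSignal, &wg)
	}

	time.Sleep(50 * time.Millisecond)
	close(stopSignal) // broadcast stop, analogous to signalling WorkerStopSignal
	wg.Wait()
	fmt.Printf("stopped %d workers\n", len(pool))
}
```

Keeping the workers in a slice, rather than only counting them inside a wait group, is what makes the per-worker state endpoints added below possible.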
39 changes: 39 additions & 0 deletions internal/pkg/crawl/api.go
@@ -2,6 +2,7 @@ package crawl

import (
"os"
"strconv"
"time"

"github.com/gin-contrib/pprof"
@@ -11,6 +12,18 @@ import (
"github.com/prometheus/client_golang/prometheus/promhttp"
)

type APIWorkersState struct {
Workers []*APIWorkerState `json:"workers"`
}

type APIWorkerState struct {
WorkerID uint `json:"worker_id"`
Status string `json:"status"`
LastError string `json:"last_error"`
LastSeen string `json:"last_seen"`
Locked bool `json:"locked"`
}

func (crawl *Crawl) startAPI() {
gin.SetMode(gin.ReleaseMode)
gin.DefaultWriter = logInfo.Out
@@ -56,6 +69,32 @@ func (crawl *Crawl) startAPI() {
r.GET("/metrics", gin.WrapH(promhttp.Handler()))
}

r.GET("/workers", func(c *gin.Context) {
workersState := crawl.GetWorkerState(-1)
c.JSON(200, workersState)
})

r.GET("/worker/:worker_id", func(c *gin.Context) {
workerID := c.Param("worker_id")
workerIDInt, err := strconv.Atoi(workerID)
if err != nil {
c.JSON(400, gin.H{
"error": "Unsupported worker ID",
})
return
}

workersState := crawl.GetWorkerState(workerIDInt)
if workersState == nil {
c.JSON(404, gin.H{
"error": "Worker not found",
})
return
}

c.JSON(200, workersState)
})

err := r.Run(":" + crawl.APIPort)
if err != nil {
logError.Fatalf("unable to start API: %s", err.Error())
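A rough sketch of how the two new endpoints could be consumed, assuming the crawl API is reachable on localhost and that 9443 is the configured API port (the host and port are assumptions, not part of this diff). The struct definitions simply mirror the APIWorkersState and APIWorkerState types added above.

```go
package main

import (
	"encoding/json"
	"fmt"
	"net/http"
)

// Mirrors the types added in internal/pkg/crawl/api.go.
type APIWorkerState struct {
	WorkerID  uint   `json:"worker_id"`
	Status    string `json:"status"`
	LastError string `json:"last_error"`
	LastSeen  string `json:"last_seen"`
	Locked    bool   `json:"locked"`
}

type APIWorkersState struct {
	Workers []*APIWorkerState `json:"workers"`
}

func main() {
	// GET /workers returns the state of every worker.
	resp, err := http.Get("http://localhost:9443/workers") // port is an assumption
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()

	var state APIWorkersState
	if err := json.NewDecoder(resp.Body).Decode(&state); err != nil {
		panic(err)
	}
	for _, w := range state.Workers {
		fmt.Printf("worker %d: status=%s locked=%t last_seen=%s last_error=%q\n",
			w.WorkerID, w.Status, w.Locked, w.LastSeen, w.LastError)
	}

	// GET /worker/:worker_id returns one worker: 400 for a non-numeric ID,
	// 404 when GetWorkerState finds no matching worker.
	single, err := http.Get("http://localhost:9443/worker/0")
	if err != nil {
		panic(err)
	}
	defer single.Body.Close()
	fmt.Println("GET /worker/0:", single.Status)
}
```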
41 changes: 21 additions & 20 deletions internal/pkg/crawl/capture.go
@@ -219,7 +219,7 @@ func (c *Crawl) captureAsset(item *frontier.Item, cookies []*http.Cookie) error
}

// Capture captures the URL and returns the outlinks
func (c *Crawl) Capture(item *frontier.Item) {
func (c *Crawl) Capture(item *frontier.Item) error {
var (
resp *http.Response
waitGroup sync.WaitGroup
@@ -237,7 +237,7 @@ func (c *Crawl) Capture(item *frontier.Item) {
req, err := http.NewRequest("GET", utils.URLToString(item.URL), nil)
if err != nil {
logError.WithFields(c.genLogFields(err, item.URL, nil)).Error("error while preparing GET request")
return
return err
}

if item.Hop > 0 && item.ParentItem != nil {
@@ -307,14 +307,14 @@ func (c *Crawl) Capture(item *frontier.Item) {
// Execute request
resp, err = c.executeGET(item, req, false)
if err != nil && err.Error() == "URL from redirection has already been seen" {
return
return err
} else if err != nil && err.Error() == "URL is being rate limited, sending back to HQ" {
c.HQProducerChannel <- frontier.NewItem(item.URL, item.ParentItem, item.Type, item.Hop, "", true)
logError.WithFields(c.genLogFields(err, item.URL, nil)).Error("URL is being rate limited, sending back to HQ")
return
return err
} else if err != nil {
logError.WithFields(c.genLogFields(err, item.URL, nil)).Error("error while executing GET request")
return
return err
}
defer resp.Body.Close()

@@ -335,41 +335,41 @@ func (c *Crawl) Capture(item *frontier.Item) {
base, err := url.Parse(utils.URLToString(resp.Request.URL))
if err != nil {
logError.WithFields(c.genLogFields(err, item.URL, nil)).Error("error while parsing base URL")
return
return err
}

// If the response is a JSON document, we want to scrape it for links
if strings.Contains(resp.Header.Get("Content-Type"), "json") {
jsonBody, err := io.ReadAll(resp.Body)
if err != nil {
logError.WithFields(c.genLogFields(err, item.URL, nil)).Error("error while reading JSON body")
return
return err
}

outlinksFromJSON, err := getURLsFromJSON(string(jsonBody))
if err != nil {
logError.WithFields(c.genLogFields(err, item.URL, nil)).Error("error while getting URLs from JSON")
return
return err
}

waitGroup.Add(1)
go c.queueOutlinks(utils.MakeAbsolute(item.URL, utils.StringSliceToURLSlice(outlinksFromJSON)), item, &waitGroup)

return
return err
}

// If the response is an XML document, we want to scrape it for links
if strings.Contains(resp.Header.Get("Content-Type"), "xml") {
xmlBody, err := io.ReadAll(resp.Body)
if err != nil {
logError.WithFields(c.genLogFields(err, item.URL, nil)).Error("error while reading XML body")
return
return err
}

mv, err := mxj.NewMapXml(xmlBody)
if err != nil {
logError.WithFields(c.genLogFields(err, item.URL, nil)).Error("error while parsing XML body")
return
return err
}

for _, value := range mv.LeafValues() {
@@ -390,14 +390,14 @@ func (c *Crawl) Capture(item *frontier.Item) {
logError.WithFields(c.genLogFields(err, item.URL, nil)).Error("error while reading response body")
}

return
return err
}

// Turn the response into a doc that we will scrape for outlinks and assets.
doc, err := goquery.NewDocumentFromReader(resp.Body)
if err != nil {
logError.WithFields(c.genLogFields(err, item.URL, nil)).Error("error while creating goquery document")
return
return err
}

// Execute site-specific code on the document
@@ -406,7 +406,7 @@ func (c *Crawl) Capture(item *frontier.Item) {
cfstreamURLs, err := cloudflarestream.GetJSFiles(doc, base, *c.Client)
if err != nil {
logError.WithFields(c.genLogFields(err, item.URL, nil)).Error("error while getting JS files from cloudflarestream")
return
return err
}

// Seencheck the URLs we captured, we ignore the returned value here
@@ -464,26 +464,26 @@ func (c *Crawl) Capture(item *frontier.Item) {
outlinks, err := extractOutlinks(base, doc)
if err != nil {
logError.WithFields(c.genLogFields(err, item.URL, nil)).Error("error while extracting outlinks")
return
return err
}

waitGroup.Add(1)
go c.queueOutlinks(outlinks, item, &waitGroup)

if c.DisableAssetsCapture {
return
return err
}

// Extract and capture assets
assets, err := c.extractAssets(base, item, doc)
if err != nil {
logError.WithFields(c.genLogFields(err, item.URL, nil)).Error("error while extracting assets")
return
return err
}

// If we didn't find any assets, let's stop here
if len(assets) == 0 {
return
return err
}

// If --local-seencheck is enabled, then we check if the assets are in the
@@ -502,7 +502,7 @@ func (c *Crawl) Capture(item *frontier.Item) {
}

if len(seencheckedBatch) == 0 {
return
return err
}

assets = seencheckedBatch
@@ -522,7 +522,7 @@ func (c *Crawl) Capture(item *frontier.Item) {
}

if len(assets) == 0 {
return
return err
}
}

@@ -584,6 +584,7 @@ func (c *Crawl) Capture(item *frontier.Item) {
}

swg.Wait()
return err
}

func getURLsFromJSON(jsonString string) ([]string, error) {
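The signature change above means Capture now reports its outcome instead of only logging it. Below is a hypothetical sketch of how a worker loop could use that return value to feed the state exposed by /worker/:worker_id; the type name, its fields, and the run loop are assumptions for illustration, not the Worker type introduced elsewhere in this pull request.

```go
package crawl

import (
	"time"

	"github.com/internetarchive/Zeno/internal/pkg/frontier"
)

// exampleWorker is an illustrative stand-in for the Worker type added by this
// pull request; field names and the run loop are assumptions.
type exampleWorker struct {
	id        uint
	crawl     *Crawl
	lastError error
	lastSeen  time.Time
}

// run consumes items and records each capture's outcome, which is what the
// new error return value of Capture makes possible.
func (w *exampleWorker) run(items <-chan *frontier.Item) {
	for item := range items {
		w.lastSeen = time.Now()
		if err := w.crawl.Capture(item); err != nil {
			w.lastError = err // surfaced later as last_error in the workers API
		}
	}
}
```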