Skip to content

Commit

Permalink
Merge pull request #478 from pzaino/develop
Browse files Browse the repository at this point in the history
Added "critical" flag to scraping rule's element to allow users to define critical rules, improved general code quality
  • Loading branch information
pzaino authored Oct 25, 2024
2 parents 4b6bdc2 + e88a9ad commit 54eae47
Show file tree
Hide file tree
Showing 54 changed files with 865 additions and 311 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ go.work
app_config.go
config.yaml
config.sh
.env
.DS_Store

# build artefacts
Expand Down
44 changes: 44 additions & 0 deletions .golangci.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
# .golangci.yml
run:
timeout: 5m
tests: false

linters-settings:
gocyclo:
min-complexity: 45

staticcheck: {}

goconst:
min-len: 3
min-occurrences: 2

dupl:
threshold: 50

linters:
enable:
- govet # Use govet instead of maligned (it has 'fieldalignment' check)
- gocyclo
- gosec
- revive
- goconst
# - dupl
- unused # Replaces structcheck, varcheck, and deadcode
- ineffassign
- typecheck
- nakedret
- misspell
- dogsled

disable:
- lll # Line length linter, often too restrictive
- funlen # Function length linter, can be noisy for large projects

issues:
exclude-use-default: false
max-issues-per-linter: 0
max-same-issues: 0

output:
sort-results: true
7 changes: 4 additions & 3 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
repos:
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v4.6.0
rev: v5.0.0
hooks:
- id: trailing-whitespace
- id: end-of-file-fixer
Expand All @@ -14,6 +14,7 @@ repos:
# - id: go-imports
- id: no-go-testing
# - id: golangci-lint
# args: [ "--config", ".golangci.yml" ]
- id: go-unit-tests
- repo: local
hooks:
Expand All @@ -32,13 +33,13 @@ repos:
pass_filenames: false
language: golang
- repo: https://github.com/alessandrojcm/commitlint-pre-commit-hook
rev: v9.17.0
rev: v9.18.0
hooks:
- id: commitlint
stages: [commit-msg]
additional_dependencies: ['@commitlint/config-conventional']
- repo: https://github.com/gitleaks/gitleaks
rev: v8.18.4
rev: v8.21.1
hooks:
- id: gitleaks
#- repo: https://github.com/pre-commit/mirrors-eslint
Expand Down
8 changes: 8 additions & 0 deletions docker-build.sh
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,14 @@ version_to_integer() {
}

# Check for mandatory settings
# Load build settings from config.sh when present, otherwise from .env.
# The notice is printed only when neither file exists; in that case the
# variables are expected to have been exported by the caller, and the
# checks that follow will report any that are still missing.
if [ -f config.sh ]; then
    source config.sh
elif [ -f .env ]; then
    source .env
else
    echo "config.sh or .env not found! Proceeding with checking if the user has defined the Environment variables manually."
fi

# shellcheck disable=SC2153
if [ "${DOCKER_DB_HOST}" = "" ]; then
Expand Down
10 changes: 9 additions & 1 deletion go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -30,9 +30,17 @@ require golang.org/x/sync v0.8.0 // indirect

require (
github.com/antchfx/xpath v1.3.2 // indirect
github.com/beorn7/perks v1.0.1 // indirect
github.com/cespare/xxhash/v2 v2.3.0 // indirect
github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da // indirect
github.com/kr/pretty v0.3.0 // indirect
github.com/kr/pretty v0.3.1 // indirect
github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect
github.com/prometheus/client_golang v1.20.5 // indirect
github.com/prometheus/client_model v0.6.1 // indirect
github.com/prometheus/common v0.55.0 // indirect
github.com/prometheus/procfs v0.15.1 // indirect
github.com/qri-io/jsonpointer v0.1.1 // indirect
google.golang.org/protobuf v1.34.2 // indirect
gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c // indirect
gopkg.in/sourcemap.v1 v1.0.5 // indirect
)
Expand Down
20 changes: 20 additions & 0 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -25,8 +25,12 @@ github.com/armon/go-socks5 v0.0.0-20160902184237-e75332964ef5 h1:0CwZNZbxp69SHPd
github.com/armon/go-socks5 v0.0.0-20160902184237-e75332964ef5/go.mod h1:wHh0iHkYZB8zMSxRWpUBQtwG5a7fFgvEO+odwuTv2gs=
github.com/aws/aws-sdk-go v1.55.5 h1:KKUZBfBoyqy5d3swXyiC7Q76ic40rYcbqH7qjh59kzU=
github.com/aws/aws-sdk-go v1.55.5/go.mod h1:eRwEWoyTWFMVYVQzKMNHWP5/RV4xIUGMQfXQHfHkpNU=
github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM=
github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw=
github.com/blang/semver v3.5.1+incompatible h1:cQNTCjp13qL8KC3Nbxr/y2Bqb63oX6wdnnjpJbkM4JQ=
github.com/blang/semver v3.5.1+incompatible/go.mod h1:kRBLl5iJ+tD4TcOOxsy/0fnwebNt5EWlYSAyrTnjyyk=
github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UFvs=
github.com/cespare/xxhash/v2 v2.3.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs=
github.com/client9/misspell v0.3.4/go.mod h1:qj6jICC3Q7zFZvVWo7KLAzC3yx5G7kyvSDkc90ppPyw=
github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ33E=
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
Expand Down Expand Up @@ -71,6 +75,8 @@ github.com/kr/pretty v0.1.0/go.mod h1:dAy3ld7l9f0ibDNOQOHHMYYIIbhfbHSm3C4ZsoJORN
github.com/kr/pretty v0.2.1/go.mod h1:ipq/a2n7PKx3OHsz4KJII5eveXtPO4qwEXGdVfWzfnI=
github.com/kr/pretty v0.3.0 h1:WgNl7dwNpEZ6jJ9k1snq4pZsg7DOEN8hP9Xw0Tsjwk0=
github.com/kr/pretty v0.3.0/go.mod h1:640gp4NfQd8pI5XOwp5fnNeVWj67G7CFk/SaSQn7NBk=
github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE=
github.com/kr/pretty v0.3.1/go.mod h1:hoEshYVHaxMs3cyo3Yncou5ZscifuDolrwPKZanG3xk=
github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ=
github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI=
github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY=
Expand All @@ -83,10 +89,21 @@ github.com/likexian/whois v1.15.5 h1:gpPxyCTJtLtJDmakHCo//0ZjK/ocI01GCAd/WBJ2oH8
github.com/likexian/whois v1.15.5/go.mod h1:4b6o1QTCfjwrB5I3KeNQnn79QtuPUTsewsE+ys94I78=
github.com/mattn/go-sqlite3 v1.14.22 h1:2gZY6PC6kBnID23Tichd1K+Z0oS6nE/XwU+Vz/5o4kU=
github.com/mattn/go-sqlite3 v1.14.22/go.mod h1:Uh1q+B4BYcTPb+yiD3kU8Ct7aC0hY9fxUwlHK0RXw+Y=
github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 h1:C3w9PqII01/Oq1c1nUAm88MOHcQC9l5mIlSMApZMrHA=
github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822/go.mod h1:+n7T8mK8HuQTcFwEeznm/DIxMOiR9yIdICNftLE1DvQ=
github.com/oschwald/maxminddb-golang v1.13.1 h1:G3wwjdN9JmIK2o/ermkHM+98oX5fS+k5MbwsmL4MRQE=
github.com/oschwald/maxminddb-golang v1.13.1/go.mod h1:K4pgV9N/GcK694KSTmVSDTODk4IsCNThNdTmnaBZ/F8=
github.com/pkg/diff v0.0.0-20210226163009-20ebb0f2a09e/go.mod h1:pJLUxLENpZxwdsKMEsNbx1VGcRFpLqf3715MtcvvzbA=
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
github.com/prometheus/client_golang v1.20.5 h1:cxppBPuYhUnsO6yo/aoRol4L7q7UFfdm+bR9r+8l63Y=
github.com/prometheus/client_golang v1.20.5/go.mod h1:PIEt8X02hGcP8JWbeHyeZ53Y/jReSnHgO035n//V5WE=
github.com/prometheus/client_model v0.6.1 h1:ZKSh/rekM+n3CeS952MLRAdFwIKqeY8b62p8ais2e9E=
github.com/prometheus/client_model v0.6.1/go.mod h1:OrxVMOVHjw3lKMa8+x6HeMGkHMQyHDk9E3jmP2AmGiY=
github.com/prometheus/common v0.55.0 h1:KEi6DK7lXW/m7Ig5i47x0vRzuBsHuvJdi5ee6Y3G1dc=
github.com/prometheus/common v0.55.0/go.mod h1:2SECS4xJG1kd8XF9IcM1gMX6510RAEL65zxzNImwdc8=
github.com/prometheus/procfs v0.15.1 h1:YagwOFzUgYfKKHX6Dr+sHT7km/hxC76UB0learggepc=
github.com/prometheus/procfs v0.15.1/go.mod h1:fB45yRUv8NstnjriLhBQLuOUt+WW4BsoGhij/e3PBqk=
github.com/qri-io/jsonpointer v0.1.1 h1:prVZBZLL6TW5vsSB9fFHFAMBLI4b0ri5vribQlTJiBA=
github.com/qri-io/jsonpointer v0.1.1/go.mod h1:DnJPaYgiKu56EuDp8TU5wFLdZIcAnb/uH9v37ZaMV64=
github.com/qri-io/jsonschema v0.2.1 h1:NNFoKms+kut6ABPf6xiKNM5214jzxAhDBrPHCJ97Wg0=
Expand All @@ -95,6 +112,7 @@ github.com/robertkrimen/otto v0.4.0 h1:/c0GRrK1XDPcgIasAsnlpBT5DelIeB9U/Z/JCQsgr
github.com/robertkrimen/otto v0.4.0/go.mod h1:uW9yN1CYflmUQYvAMS0m+ZiNo3dMzRUDQJX0jWbzgxw=
github.com/rogpeppe/go-internal v1.6.1 h1:/FiVV8dS/e+YqF2JvO3yXRFbBLTIuSDkuC7aBOAvL+k=
github.com/rogpeppe/go-internal v1.6.1/go.mod h1:xXDCJY+GAPziupqXw64V24skbSoqbTEfhy4qGm1nDQc=
github.com/rogpeppe/go-internal v1.9.0/go.mod h1:WtVeX8xhTBvf0smdhujwtBcq4Qrzq/fJaraNFVN+nFs=
github.com/sergi/go-diff v1.0.0 h1:Kpca3qRNrduNnOQeazBd0ysaKrUJiIuISHxogkT9RPQ=
github.com/sergi/go-diff v1.0.0/go.mod h1:0CfEIISq7TuYL3j771MWULgwwjU+GofnZX9QAmXWZgo=
github.com/spaolacci/murmur3 v1.1.0 h1:7c1g84S4BPRrfL5Xrdp6fOJ206sU9y293DDHaoy0bLI=
Expand Down Expand Up @@ -218,6 +236,8 @@ google.golang.org/genproto v0.0.0-20190626174449-989357319d63/go.mod h1:z3L6/3dT
google.golang.org/grpc v1.19.0/go.mod h1:mqu4LbDTu4XGKhr4mRzUsmM4RtVoemTSY81AxZiDr8c=
google.golang.org/grpc v1.20.1/go.mod h1:10oTOabMzJvdu6/UiuZezV6QK5dSlG84ov/aaiqXj38=
google.golang.org/grpc v1.21.1/go.mod h1:oYelfM1adQP15Ek0mdvEgi9Df8B9CZIaU1084ijfRaM=
google.golang.org/protobuf v1.34.2 h1:6xV6lTsCfpGD21XK49h7MhtcApnLqkfYgPcdHftf6hg=
google.golang.org/protobuf v1.34.2/go.mod h1:qYOHts0dSfpeUzUFpOMr/WGzszTmLH+DiWniOlNbLDw=
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
gopkg.in/check.v1 v1.0.0-20180628173108-788fd7840127/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk=
Expand Down
84 changes: 80 additions & 4 deletions main.go
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,9 @@ import (
"syscall"
"time"

"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/push"

cmn "github.com/pzaino/thecrowler/pkg/common"
cfg "github.com/pzaino/thecrowler/pkg/config"
crowler "github.com/pzaino/thecrowler/pkg/crawler"
Expand All @@ -56,6 +59,31 @@ var (
config cfg.Config // Configuration "object"
configMutex sync.Mutex // Mutex to protect the configuration
GRulesEngine rules.RuleEngine // Global rules engine

// Prometheus metrics
totalPages = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Name: "crowler_total_pages",
Help: "Total number of pages crawled.",
},
[]string{"pipeline_id", "source"},
)
totalLinks = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Name: "crowler_total_links",
Help: "Total number of links collected.",
},
[]string{"pipeline_id", "source"},
)
totalErrors = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Name: "crowler_total_errors",
Help: "Total number of errors encountered.",
},
[]string{"pipeline_id", "source"},
)
// TODO: Define more prometheus metrics here...

)

// WorkBlock is a struct that holds all the necessary information to instantiate a new
Expand Down Expand Up @@ -378,9 +406,9 @@ func logStatus(PipelineStatus *[]crowler.Status) {
status := (*PipelineStatus)[idx]
if status.PipelineRunning == 0 {
continue
} else {
runningPipelines++
}
runningPipelines++

var totalRunningTime time.Duration
if status.EndTime.IsZero() {
totalRunningTime = time.Since(status.StartTime)
Expand Down Expand Up @@ -410,6 +438,9 @@ func logStatus(PipelineStatus *[]crowler.Status) {
report += fmt.Sprintf(" Last Page Delay: %f\n", status.LastDelay)
report += sepPLine + "\n"

// Update the metrics
updateMetrics(status)

// Reset the status if the pipeline has completed (display only the last report)
if status.PipelineRunning == 2 || status.PipelineRunning == 3 {
status.PipelineRunning = 0
Expand All @@ -421,6 +452,40 @@ func logStatus(PipelineStatus *[]crowler.Status) {
}
}

// updateMetrics publishes the counters of a single pipeline to the Prometheus
// Pushgateway configured in config.Prometheus.
//
// It does nothing when config.Prometheus.Enabled is false. Otherwise it sets
// the totalPages, totalLinks and totalErrors gauges for the
// (pipeline_id, source) label pair taken from status, pushes all three gauges
// under the job "crowler_engine" grouped by pipeline_id and, when
// status.PipelineRunning is 2 or 3, deletes that pipeline's group from the
// Pushgateway. Push and delete failures are logged and otherwise ignored.
func updateMetrics(status crowler.Status) {
	if !config.Prometheus.Enabled {
		return
	}

	// Computed once: the push and the delete below address the same
	// Pushgateway and use the same grouping value.
	pipelineID := fmt.Sprintf("%d", status.PipelineID)
	gatewayURL := "http://" + config.Prometheus.Host + ":" + strconv.Itoa(config.Prometheus.Port)

	// Update the metrics
	labels := prometheus.Labels{
		"pipeline_id": pipelineID,
		"source":      status.Source,
	}
	totalPages.With(labels).Set(float64(status.TotalPages))
	totalLinks.With(labels).Set(float64(status.TotalLinks))
	totalErrors.With(labels).Set(float64(status.TotalErrors))

	// Push metrics. Every gauge updated above is added as a collector;
	// a gauge that is not listed here never reaches the Pushgateway.
	//
	// NOTE(review): the gauges carry a "pipeline_id" label and the same name
	// is used as grouping key here. The client_golang push package appears to
	// reject pushes where a metric label duplicates a grouping label; confirm
	// against the push package documentation and rename one of the two if so.
	if err := push.New(gatewayURL, "crowler_engine").
		Collector(totalPages).
		Collector(totalLinks).
		Collector(totalErrors).
		Grouping("pipeline_id", pipelineID).
		Push(); err != nil {
		cmn.DebugMsg(cmn.DbgLvlError, "Could not push metrics: %v", err)
	}

	// Delete metrics if pipeline is complete
	if status.PipelineRunning == 2 || status.PipelineRunning == 3 {
		if err := push.New(gatewayURL, "crowler_engine").
			Grouping("pipeline_id", pipelineID).
			Delete(); err != nil {
			cmn.DebugMsg(cmn.DbgLvlError, "Could not delete metrics: %v", err)
		}
	}
}

func StatusStr(condition int) string {
switch condition {
case 0:
Expand Down Expand Up @@ -512,6 +577,13 @@ func initAll(configFile *string, config *cfg.Config,
cmn.DebugMsg(cmn.DbgLvlInfo, "Crawling rules loaded: %d", RulesEngine.CountCrawlingRules())
cmn.DebugMsg(cmn.DbgLvlInfo, "Plugins loaded: %d", RulesEngine.CountPlugins())

// Initialize the prometheus metrics
if config.Prometheus.Enabled {
prometheus.MustRegister(totalPages)
prometheus.MustRegister(totalLinks)
prometheus.MustRegister(totalErrors)
}

// Start the crawler
crowler.StartCrawler(*config)

Expand Down Expand Up @@ -729,8 +801,12 @@ func configCheckHandler(w http.ResponseWriter, r *http.Request) {
func closeResources(db cdb.Handler, sel chan crowler.SeleniumInstance) {
// Close the database connection
if db != nil {
db.Close()
cmn.DebugMsg(cmn.DbgLvlInfo, "Database connection closed.")
err := db.Close()
if err != nil {
cmn.DebugMsg(cmn.DbgLvlError, "closing database connection: %v", err)
} else {
cmn.DebugMsg(cmn.DbgLvlInfo, "Database connection closed.")
}
}
// Stop the Selenium services
close(sel)
Expand Down
15 changes: 8 additions & 7 deletions pkg/common/common.go
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ const (
errIPNotAllowed = "ip address is not allowed"
)

// GetEngineID returns the engine ID
func GetEngineID() string {
// Retrieve process PID
pid := os.Getpid()
Expand Down Expand Up @@ -81,7 +82,7 @@ func UpdateLoggerConfig(logType string) {
logType = strings.ToLower(strings.TrimSpace(logType))
if logType == "file" {
// Set log to log to a file
logFile, err := os.OpenFile("log.txt", os.O_RDWR|os.O_CREATE|os.O_APPEND, 0666)
logFile, err := os.OpenFile("log.txt", os.O_RDWR|os.O_CREATE|os.O_APPEND, 0600)
if err == nil {
log.SetOutput(logFile)
}
Expand Down Expand Up @@ -129,7 +130,7 @@ func GetFileExt(filePath string) string {
return fileType
}

// Create a function that checks if a path is correct and if it exists
// IsPathCorrect checks if the given path exists
func IsPathCorrect(path string) bool {
if _, err := os.Stat(path); os.IsNotExist(err) {
return false
Expand Down Expand Up @@ -224,7 +225,7 @@ func SafeTransport(timeout int, sslmode string) *http.Transport {
}

func dialContextWithIPCheck(timeout time.Duration) func(ctx context.Context, network, addr string) (net.Conn, error) {
return func(ctx context.Context, network, addr string) (net.Conn, error) {
return func(_ context.Context, network, addr string) (net.Conn, error) {
c, err := net.DialTimeout(network, addr, timeout)
if err != nil {
return nil, err
Expand Down Expand Up @@ -259,7 +260,7 @@ func dialTLSWithIPCheck(timeout time.Duration) func(ctx context.Context, network

//// ----- ENV related shared functions ----- ////

// interpolateEnvVars replaces occurrences of `${VAR}` or `$VAR` in the input string
// InterpolateEnvVars replaces occurrences of `${VAR}` or `$VAR` in the input string
// with the value of the VAR environment variable.
func InterpolateEnvVars(input string) string {
envVarPattern := regexp.MustCompile(`\$\{?(\w+)\}?`)
Expand All @@ -274,7 +275,7 @@ func InterpolateEnvVars(input string) string {
})
}

// Convert a string to an integer
// StringToInt converts a string to an integer
func StringToInt(s string) int {
i, err := strconv.Atoi(s)
if err != nil {
Expand All @@ -283,7 +284,7 @@ func StringToInt(s string) int {
return i
}

// Convert a string to a float
// StringToFloat converts a string to a float
func StringToFloat(s string) float64 {
f, err := strconv.ParseFloat(s, 64)
if err != nil {
Expand All @@ -292,7 +293,7 @@ func StringToFloat(s string) float64 {
return f
}

// Convert String to FLoat32
// StringToFloat32 converts a string to a float32
func StringToFloat32(s string) float32 {
f, err := strconv.ParseFloat(s, 32)
if err != nil {
Expand Down
2 changes: 1 addition & 1 deletion pkg/common/genericAPIclient.go
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ func FetchRemoteFile(url string, timeout int, sslmode string) (string, error) {
if err != nil {
return "", fmt.Errorf("failed to fetch file from %s: %v", url, err)
}
defer resp.Body.Close()
defer resp.Body.Close() //nolint:errcheck // Don't lint for error not checked, this is a defer statement

if resp.StatusCode != http.StatusOK {
return "", fmt.Errorf("received non-200 response from %s: %d", url, resp.StatusCode)
Expand Down
Loading

0 comments on commit 54eae47

Please sign in to comment.