Skip to content

Commit

Permalink
Merge pull request #458 from pzaino/develop
Browse files Browse the repository at this point in the history
Introducing a new config-schema and a new ruleset-schema, rules-based external detection, and multi-type values for environment settings and plugin parameters
  • Loading branch information
pzaino authored Oct 4, 2024
2 parents 7e8d1db + 7cab208 commit fe1f88f
Show file tree
Hide file tree
Showing 30 changed files with 4,007 additions and 1,471 deletions.
43 changes: 25 additions & 18 deletions config.default
Original file line number Diff line number Diff line change
Expand Up @@ -2,45 +2,51 @@
# that should work for most small to medium deployments. You can use this as a
# starting point for your own configuration file.

---
database:
type: postgres
host: ${POSTGRES_DB_HOST}
host: "${POSTGRES_DB_HOST}"
port: 5432
user: ${CROWLER_DB_USER}
password: ${CROWLER_DB_PASSWORD}
user: "${CROWLER_DB_USER}"
password: "${CROWLER_DB_PASSWORD}"
dbname: SitesIndex
sslmode: disable
max_conns: 1000
max_idle_conns: 100

# The CROWler calls websites' entry-point URLs "sources", as in "source of information"
crawler:
source_screenshot: true
interval: random(random(5,15), random(45,75))
interval: random(random(2,5), random(5,10))
workers: 5
delay: random(3,75)
delay: random(4,35)
timeout: 10
maintenance: 60
collect_html: false
collect_content: false
collect_images: false

api:
port: 8080
host: 0.0.0.0 # Replace this with the network interface IP you want to use for the API (0.0.0.0 means respond on all available IPs)
host: 0.0.0.0
timeout: 60
write_timeout: 60
read_timeout: 60
rate_limit: "1010,1010" # Rate limit for the API, in requests per second
enable_console: true # Enable the console for the API (this enables extra endpoints to check system status and to add/remove/update CROWler sources, i.e., website entry-point URLs)
readheader_timeout: 60
rate_limit: '1010,1010'
enable_console: true
return_404: false

selenium:
- type: chrome # This is the type of browser you want to use for this selenium instance, you can use chrome or firefox
path: "" # If you have deployed CROWLER_VDIs then leave this empty
port: 4444 # The port where the selenium instance will be listening
host: crowler_vdi # Replace this with the network name or the IP of your crowler_vdi container
use_service: false # If you are using CROWLER_VDIs, then set this to false
sslmode: disable # If you are using CROWLER_VDIs locally (intranet/vpn/private lan), then set this to disable
- name: crowler_vdi_01
type: chrome
path: ''
port: 4444
host: crowler_vdi_1
use_service: false
sslmode: disable

image_storage:
type: local
path: /app/data/images
path: "/app/data/images"

network_info:
netlookup:
Expand All @@ -55,8 +61,9 @@ network_info:
service_scout:
enabled: true
timeout: 1200
geolocation:
geo_localization:
enabled: false
path: ''
timeout: 15

debug_level: 0
9 changes: 9 additions & 0 deletions doc/features.md
Original file line number Diff line number Diff line change
Expand Up @@ -118,6 +118,15 @@ The **CROWler** is a comprehensive web crawling and scraping tool designed to pe
- *Benefits*: Provides insights into the security measures implemented by a website.

- **SSL/TLS Analysis**: Analyzes SSL/TLS certificates and configurations to identify security risks and compliance issues.
- The CROWler can detect and analyze the following:
- Certificate information
- Certificate chain (and order)
- Expiry date
- Key length
- Signature algorithm
- Cipher suites
- Protocols
- Vulnerabilities (e.g., Heartbleed, POODLE, DROWN)
- *Benefits*: Helps ensure secure communication between clients and servers.

- **3rd party Integration**: Integrates with third-party services like Shodan, VirusTotal, and others to gather additional information about web assets.
Expand Down
8 changes: 1 addition & 7 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,6 @@ require (
github.com/oschwald/maxminddb-golang v1.13.1
github.com/qri-io/jsonschema v0.2.1
github.com/robertkrimen/otto v0.4.0
github.com/stretchr/testify v1.9.0
)

require (
Expand All @@ -37,12 +36,7 @@ require (
gopkg.in/sourcemap.v1 v1.0.5 // indirect
)

require (
github.com/davecgh/go-spew v1.1.1 // indirect
github.com/pmezard/go-difflib v1.0.0 // indirect
golang.org/x/time v0.6.0
gopkg.in/yaml.v3 v3.0.1 // indirect
)
require golang.org/x/time v0.6.0

require (
github.com/google/go-cmp v0.6.0
Expand Down
5 changes: 5 additions & 0 deletions main.go
Original file line number Diff line number Diff line change
Expand Up @@ -438,12 +438,17 @@ func initAll(configFile *string, config *cfg.Config,
db *cdb.Handler, seleniumInstances *chan crowler.SeleniumInstance,
RulesEngine *rules.RuleEngine, lmt **rate.Limiter) error {
var err error

// Reload the configuration file
*config, err = cfg.LoadConfig(*configFile)
if err != nil {
return fmt.Errorf("loading configuration file: %s", err)
}

// Reset Key-Value Store
cmn.KVStore = nil
cmn.KVStore = cmn.NewKeyValueStore()

// Reconnect to the database
*db, err = cdb.NewHandler(*config)
if err != nil {
Expand Down
14 changes: 14 additions & 0 deletions pkg/common/common.go
Original file line number Diff line number Diff line change
Expand Up @@ -139,6 +139,20 @@ func IsPathCorrect(path string) bool {

//// ----- HTTP related shared functions ----- ////

// URLToHost extracts the host from a URL
// URLToHost extracts the host (including the port, if present) from a
// URL string. It strips the scheme (everything up to and including the
// first "://") and the path (everything from the first "/" onward),
// then trims surrounding whitespace.
//
// NOTE: this is a lightweight string-based extraction and performs no
// validation; for full URL parsing use net/url.Parse.
func URLToHost(url string) string {
	host := url
	// Drop the scheme prefix, e.g. "https://" or "ftp://".
	if _, rest, ok := strings.Cut(host, "://"); ok {
		host = rest
	}
	// Keep only the authority part, dropping the path.
	// (This also removes any trailing "/", so no further trimming of
	// slashes is needed.)
	if authority, _, ok := strings.Cut(host, "/"); ok {
		host = authority
	}
	return strings.TrimSpace(host)
}

// HostToIP returns the IP address of a given host
func HostToIP(host string) []string {
ips, err := net.LookupIP(host)
Expand Down
58 changes: 58 additions & 0 deletions pkg/common/common_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -622,3 +622,61 @@ func TestStringToFloat32(t *testing.T) {
})
}
}

// TestURLToHost verifies that URLToHost strips schemes and paths from
// URL strings while preserving host:port authority components.
func TestURLToHost(t *testing.T) {
	cases := []struct {
		name string
		in   string
		want string
	}{
		{"URLToHost Test case 1", "http://example.com/path", "example.com"},
		{"URLToHost Test case 2", "https://example.com/path/to/resource", "example.com"},
		{"URLToHost Test case 3", "ftp://example.com/resource", "example.com"},
		{"URLToHost Test case 4", "example.com/path", "example.com"},
		{"URLToHost Test case 5", "example.com", "example.com"},
		{"URLToHost Test case 6", "http://example.com/", "example.com"},
		{"URLToHost Test case 7", "http://example.com", "example.com"},
		{"URLToHost Test case 8", "http://example.com:8080/path", "example.com:8080"},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			if got := URLToHost(tc.in); got != tc.want {
				t.Errorf("Expected host %q, but got %q", tc.want, got)
			}
		})
	}
}
1 change: 1 addition & 0 deletions pkg/common/json_test.go
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
// Package common package is used to store common functions and variables
package common

import (
Expand Down
Loading

0 comments on commit fe1f88f

Please sign in to comment.