From 6c8070e92d4a992a20054c309fb1a7b2cdd889d0 Mon Sep 17 00:00:00 2001
From: Paolo Fabio Zaino
Date: Wed, 2 Oct 2024 14:15:18 +0100
Subject: [PATCH 01/12] Completed external_detection configuration schema
---
schemas/crowler-config-schema.json | 241 +++++++++++++++++++++++++----
1 file changed, 213 insertions(+), 28 deletions(-)
diff --git a/schemas/crowler-config-schema.json b/schemas/crowler-config-schema.json
index 93c8d9f..67bd5dc 100644
--- a/schemas/crowler-config-schema.json
+++ b/schemas/crowler-config-schema.json
@@ -558,6 +558,7 @@
"description": "This is the VDI configuration section, it's used to configure the VDI and tell the CROWler's Engine how to connect to it. It is the configuration for the selenium driver, to scale the CROWler web crawling capabilities, you can add multiple VDIs in an array format.",
"type": "array",
"items": {
+ "title": "CROWler VDI Configuration Items",
"type": "object",
"properties": {
"name": {
@@ -826,6 +827,7 @@
"description": "This is the list of proxies that the CROWler will use to collect HTTP headers.",
"type": "array",
"items": {
+ "title": "CROWler HTTP Headers collection Proxies Items",
"type": "object",
"properties": {
"host": {
@@ -1265,6 +1267,7 @@
"description": "This is the rulesets load configuration section, it is used to tell the CROWler where and how to load all the Rulesets we want to use to crawl, interact, scrape info and detect stuff on the provided Sources to crawl.",
"type": "array",
"items": {
+ "title": "CROWler Rulesets locations Configuration parameters",
"type": "object",
"properties": {
"path": {
@@ -1375,6 +1378,7 @@
"description": "This is the list of locations from where the CROWler will load and register all available plugins.",
"type": "array",
"items": {
+ "title": "CROWler Plugins Location Configuration",
"type": "object",
"properties": {
"path": {
@@ -1464,40 +1468,218 @@
"external_detection": {
"title": "CROWler External Detection Services Configuration",
"description": "This is the External Detection configuration section, it is used to tell the CROWler's Engine which external detection services we want to use and provide credentials to access them. External detection services are provided by VirusTotal, URLHaus, PhishTank, GoogleSafeBrowsing, AbuseIPDB, OpenPhish, Cuckoo, HybridAnalysis, CiscoUmbrella, AlienVault, IPVoid, Shodan, Censys, SSLLabs. They can be accessed using the CROWler's Detection Rules.",
- "type": "array",
- "items": {
- "type": "object",
- "properties": {
- "name": {
- "type": "string",
- "description": "This is the name of the external detection service. For example, VirusTotal, URLHaus, PhishTank, GoogleSafeBrowsing, AbuseIPDB, OpenPhish, Cuckoo, HybridAnalysis, CiscoUmbrella, AlienVault, IPVoid, Shodan, Censys, SSLLabs."
+ "type": "object",
+ "properties": {
+ "abuse_ipdb": {
+ "title": "AbuseIPDB Configuration",
+ "description": "This is the AbuseIPDB configuration section, it is used to tell the CROWler's Engine how to connect to AbuseIPDB and use its services. AbuseIPDB will be accessed through CROWler's Detection Rules.",
+ "type": "object",
+ "properties": {
+ "api_key": {
+ "title": "AbuseIPDB API Key",
+ "description": "This is the API key that the CROWler will use to connect to AbuseIPDB.",
+ "type": "string"
+ }
},
- "enabled": {
- "type": "boolean",
- "description": "This is a flag that tells the CROWler to use the external detection service."
+ "additionalProperties": false,
+ "required": [
+ "api_key"
+ ]
+ },
+ "alien_vault": {
+ "title": "Alien Vault Configuration",
+ "description": "This is the Alien Vault configuration section, it is used to tell the CROWler's Engine how to connect to Alien Vault and use its services. Alien Vault will be accessed through CROWler's Detection Rules.",
+ "type": "object",
+ "properties": {
+ "api_key": {
+ "title": "Alien Vault API Key",
+ "description": "This is the API key that the CROWler will use to connect to Alien Vault.",
+ "type": "string"
+ }
},
- "api_key": {
- "type": "string",
- "description": "This is the API key for the external detection service."
+ "additionalProperties": false,
+ "required": [
+ "api_key"
+ ]
+ },
+ "censys": {
+ "title": "Censys Configuration",
+ "description": "This is the Censys configuration section, it is used to tell the CROWler's Engine how to connect to Censys and use its services. Censys will be accessed through CROWler's Detection Rules.",
+ "type": "object",
+ "properties": {
+ "api_id": {
+ "title": "Censys API ID",
+ "description": "This is the API ID that the CROWler will use to connect to Censys.",
+ "type": "string"
+ },
+ "api_secret": {
+ "title": "Censys API Secret",
+ "description": "This is the API secret that the CROWler will use to connect to Censys.",
+ "type": "string"
+ }
},
- "timeout": {
- "type": "integer",
- "minimum": 10,
- "description": "This is the timeout in seconds for the external detection service."
+ "additionalProperties": false,
+ "required": [
+ "api_id",
+ "api_secret"
+ ]
+ },
+ "cisco_umbrella": {
+ "title": "Cisco Umbrella Configuration",
+ "description": "This is the Cisco Umbrella configuration section, it is used to tell the CROWler's Engine how to connect to Cisco Umbrella and use its services. Cisco Umbrella will be accessed through CROWler's Detection Rules.",
+ "type": "object",
+ "properties": {
+ "api_key": {
+ "title": "Cisco Umbrella API Key",
+ "description": "This is the API key that the CROWler will use to connect to Cisco Umbrella.",
+ "type": "string"
+ }
},
- "delay": {
- "type": "string",
- "description": "This is the delay time (in seconds) between requests for the external detection service."
- }
+ "additionalProperties": false,
+ "required": [
+ "api_key"
+ ]
},
- "additionalProperties": false,
- "required": [
- "name",
- "api_key"
- ]
+ "cuckoo": {
+ "title": "Cuckoo Configuration",
+ "description": "This is the Cuckoo configuration section, it is used to tell the CROWler's Engine how to connect to Cuckoo and use its services. Cuckoo will be accessed through CROWler's Detection Rules.",
+ "type": "object",
+ "properties": {
+ },
+ "additionalProperties": false
+ },
+ "google_safe_browsing": {
+ "title": "Google Safe Browsing Configuration",
+ "description": "This is the Google Safe Browsing configuration section, it is used to tell the CROWler's Engine how to connect to Google Safe Browsing and use its services. Google Safe Browsing will be accessed through CROWler's Detection Rules.",
+ "type": "object",
+ "properties": {
+ "company_id": {
+ "title": "Google Safe Browsing Company ID",
+ "description": "This is the company ID that the CROWler will use to connect to Google Safe Browsing.",
+ "type": "string"
+ },
+ "api_key": {
+ "title": "Google Safe Browsing API Key",
+ "description": "This is the API key that the CROWler will use to connect to Google Safe Browsing.",
+ "type": "string"
+ }
+ },
+ "additionalProperties": false,
+ "required": [
+ "api_key"
+ ]
+ },
+ "hybrid_analysis": {
+ "title": "Hybrid Analysis Configuration",
+ "description": "This is the Hybrid Analysis configuration section, it is used to tell the CROWler's Engine how to connect to Hybrid Analysis and use its services. Hybrid Analysis will be accessed through CROWler's Detection Rules.",
+ "type": "object",
+ "properties": {
+ "api_key": {
+ "title": "Hybrid Analysis API Key",
+ "description": "This is the API key that the CROWler will use to connect to Hybrid Analysis.",
+ "type": "string"
+ }
+ },
+ "additionalProperties": false,
+ "required": [
+ "api_key"
+ ]
+ },
+ "ipvoid": {
+ "title": "IPVoid Configuration",
+ "description": "This is the IPVoid configuration section, it is used to tell the CROWler's Engine how to connect to IPVoid and use its services. IPVoid will be accessed through CROWler's Detection Rules.",
+ "type": "object",
+ "properties": {
+ "api_key": {
+ "title": "IPVoid API Key",
+ "description": "This is the API key that the CROWler will use to connect to IPVoid.",
+ "type": "string"
+ }
+ },
+ "additionalProperties": false,
+ "required": [
+ "api_key"
+ ]
+ },
+ "open_phish": {
+ "title": "OpenPhish Configuration",
+ "description": "This is the OpenPhish configuration section, it is used to tell the CROWler's Engine how to connect to OpenPhish and use its services. OpenPhish will be accessed through CROWler's Detection Rules.",
+ "type": "object",
+ "properties": {
+ "api_key": {
+ "title": "OpenPhish API Key",
+ "description": "This is the API key that the CROWler will use to connect to OpenPhish.",
+ "type": "string"
+ }
+ },
+ "additionalProperties": false,
+ "required": [
+ "api_key"
+ ]
+ },
+ "phish_tank": {
+ "title": "PhishTank Configuration",
+ "description": "This is the PhishTank configuration section, it is used to tell the CROWler's Engine how to connect to PhishTank and use its services. PhishTank will be accessed through CROWler's Detection Rules.",
+ "type": "object",
+ "properties": {
+ },
+ "additionalProperties": false
+ },
+ "shodan": {
+ "title": "Shodan Configuration",
+ "description": "This is the Shodan configuration section, it is used to tell the CROWler's Engine how to connect to Shodan and use its services. Shodan will be accessed through CROWler's Detection Rules.",
+ "type": "object",
+ "properties": {
+ "api_key": {
+ "title": "Shodan API Key",
+ "description": "This is the API key that the CROWler will use to connect to Shodan.",
+ "type": "string"
+ }
+ },
+ "additionalProperties": false,
+ "required": [
+ "api_key"
+ ]
+ },
+ "ssllabs": {
+ "title": "SSLLabs Configuration",
+ "description": "This is the SSLLabs configuration section, it is used to tell the CROWler's Engine how to connect to SSLLabs and use its services. SSLLabs will be accessed through CROWler's Detection Rules.",
+ "type": "object",
+ "properties": {
+ "api_key": {
+ "title": "SSLLabs API Key",
+ "description": "This is the API key that the CROWler will use to connect to SSLLabs.",
+ "type": "string"
+ }
+ },
+ "additionalProperties": false
+ },
+ "url_haus": {
+ "title": "URLHaus Configuration",
+ "description": "This is the URLHaus configuration section, it is used to tell the CROWler's Engine how to connect to URLHaus and use its services. URLHaus will be accessed through CROWler's Detection Rules.",
+ "type": "object",
+ "properties": {
+ },
+ "additionalProperties": false
+ },
+ "virus_total": {
+ "title": "VirusTotal Configuration",
+ "description": "This is the VirusTotal configuration section, it is used to tell the CROWler's Engine how to connect to VirusTotal and use its services. VirusTotal will be accessed through CROWler's Detection Rules.",
+ "type": "object",
+ "properties": {
+ "api_key": {
+ "title": "VirusTotal API Key",
+ "description": "This is the API key that the CROWler will use to connect to VirusTotal.",
+ "type": "string"
+ }
+ },
+ "additionalProperties": false,
+ "required": [
+ "api_key"
+ ]
+ }
},
- "minItems": 1,
- "uniqueItems": true
+ "additionalProperties": false
},
"os": {
@@ -1524,9 +1706,11 @@
"allOf": [
{ "$ref": "#/properties/remote" },
{
+ "title": "Remote Configuration Mode requirements",
"type": "object",
"required": ["remote"],
"not": {
+ "title": "Local Configuration exclusions",
"required": ["database", "crawler", "api", "selenium", "network_info"]
}
}
@@ -1545,6 +1729,7 @@
},
"required": ["database", "crawler", "api", "selenium", "network_info"],
"not": {
+ "title": "Local Configuration Mode exclusions",
"required": ["remote"]
}
}
From b16f64fbc59f4cf944851f767a75bce65e4b425a Mon Sep 17 00:00:00 2001
From: Paolo Fabio Zaino
Date: Wed, 2 Oct 2024 14:59:45 +0100
Subject: [PATCH 02/12] Fixed a dependency issue, some missing titles in the
config schema and a database test
---
go.mod | 8 +-------
pkg/crawler/crawler.go | 5 +++++
pkg/database/database_test.go | 29 ++++++++++++++++++++++-------
schemas/ruleset-schema.json | 27 +++++++++++++++++++++------
4 files changed, 49 insertions(+), 20 deletions(-)
diff --git a/go.mod b/go.mod
index 598ff57..3121bb7 100644
--- a/go.mod
+++ b/go.mod
@@ -17,7 +17,6 @@ require (
github.com/oschwald/maxminddb-golang v1.13.1
github.com/qri-io/jsonschema v0.2.1
github.com/robertkrimen/otto v0.4.0
- github.com/stretchr/testify v1.9.0
)
require (
@@ -37,12 +36,7 @@ require (
gopkg.in/sourcemap.v1 v1.0.5 // indirect
)
-require (
- github.com/davecgh/go-spew v1.1.1 // indirect
- github.com/pmezard/go-difflib v1.0.0 // indirect
- golang.org/x/time v0.6.0
- gopkg.in/yaml.v3 v3.0.1 // indirect
-)
+require golang.org/x/time v0.6.0
require (
github.com/google/go-cmp v0.6.0
diff --git a/pkg/crawler/crawler.go b/pkg/crawler/crawler.go
index 00d9f87..b7f5dd9 100644
--- a/pkg/crawler/crawler.go
+++ b/pkg/crawler/crawler.go
@@ -97,6 +97,11 @@ type ProcessContext struct {
var indexPageMutex sync.Mutex // Mutex to ensure that only one goroutine is indexing a page at a time
+// GetContextID returns a unique context ID for the ProcessContext
+func (ctx *ProcessContext) GetContextID() string {
+ return fmt.Sprintf("%d-%d", ctx.SelID, ctx.source.ID)
+}
+
// CrawlWebsite is responsible for crawling a website, it's the main entry point
// and it's called from the main.go when there is a Source to crawl.
func CrawlWebsite(args Pars, sel SeleniumInstance, releaseSelenium chan<- SeleniumInstance) {
diff --git a/pkg/database/database_test.go b/pkg/database/database_test.go
index 84a6770..250750c 100644
--- a/pkg/database/database_test.go
+++ b/pkg/database/database_test.go
@@ -5,7 +5,6 @@ import (
"testing"
cfg "github.com/pzaino/thecrowler/pkg/config"
- "github.com/stretchr/testify/assert"
)
func TestBuildConnectionString(t *testing.T) {
@@ -42,16 +41,19 @@ func TestBuildConnectionString(t *testing.T) {
for _, test := range tests {
t.Run(test.name, func(t *testing.T) {
result := buildConnectionString(test.config)
- assert.Equal(t, test.expected, result)
+ if result != test.expected {
+ t.Errorf("expected '%s', got '%s'", test.expected, result)
+ }
})
}
}
+
func TestNewHandler(t *testing.T) {
// Test cases
tests := []struct {
name string
config cfg.Config
- expectedType Handler
+ expectedType interface{}
expectedErr error
}{
{
@@ -90,11 +92,24 @@ func TestNewHandler(t *testing.T) {
for _, test := range tests {
t.Run(test.name, func(t *testing.T) {
handler, err := NewHandler(test.config)
- assert.Equal(t, test.expectedErr, err)
+
+ if (err != nil && test.expectedErr == nil) || (err == nil && test.expectedErr != nil) || (err != nil && err.Error() != test.expectedErr.Error()) {
+ t.Errorf("expected error '%v', got '%v'", test.expectedErr, err)
+ }
+
if test.expectedType != nil {
- assert.IsType(t, test.expectedType, handler)
- } else {
- assert.Nil(t, handler)
+ switch test.expectedType.(type) {
+ case *PostgresHandler:
+ if _, ok := handler.(*PostgresHandler); !ok {
+ t.Errorf("expected type *PostgresHandler, got %T", handler)
+ }
+ case *SQLiteHandler:
+ if _, ok := handler.(*SQLiteHandler); !ok {
+ t.Errorf("expected type *SQLiteHandler, got %T", handler)
+ }
+ }
+ } else if handler != nil {
+ t.Errorf("expected nil handler, got %T", handler)
}
})
}
diff --git a/schemas/ruleset-schema.json b/schemas/ruleset-schema.json
index 189e1c2..dbfab68 100644
--- a/schemas/ruleset-schema.json
+++ b/schemas/ruleset-schema.json
@@ -1,13 +1,20 @@
{
"$schema": "http://json-schema.org/draft-07/schema#",
- "type": "object",
+ "$id": "https://github.com/pzaino/thecrowler/main/schemas/ruleset-schema.json",
+ "title": "CROWler Ruleset Schema",
+ "version": "1.0.4",
"description": "The CROWler ruleset schema defines the structure of a ruleset file, which contains rules for scraping, action execution, detection, and crawling.",
+ "type": "object",
"items": {
"type": "object",
"properties": {
"format_version": {
"type": "string",
- "description": "Version of the ruleset format, to ensure compatibility."
+ "description": "Version of the ruleset format, to ensure compatibility.",
+ "pattern": "^\\d+\\.\\d+\\.\\d+$",
+ "examples": [
+ "1.0.4"
+ ]
},
"author": {
"type": "string",
@@ -15,8 +22,8 @@
},
"created_at": {
"type": "string",
- "format": "date-time",
- "description": "Creation date of the ruleset."
+ "description": "Creation date of the ruleset.",
+ "pattern": "(?:(?:(?:(\\d{4})[-\\/\\.](\\d{2})[-\\/\\.](\\d{2}))|(?:(\\d{2})[-\\/\\.](\\d{2})[-\\/\\.](\\d{4})))\\s*(?:T\\s*)?)?(?:(\\d{1,2}):(\\d{2})(?::(\\d{2}))?\\s*([AaPp][Mm])?)?"
},
"description": {
"type": "string",
@@ -24,7 +31,11 @@
},
"ruleset_name": {
"type": "string",
- "description": "A unique name identifying the ruleset."
+ "description": "A unique name identifying the ruleset.",
+ "examples": [
+ "My Ruleset",
+ "https://example.com"
+ ]
},
"rule_groups": {
"type": "array",
@@ -33,7 +44,11 @@
"properties": {
"group_name": {
"type": "string",
- "description": "A unique name identifying the group of rules."
+ "description": "A unique name identifying the group of rules.",
+ "examples": [
+ "My Group",
+ "https://example.com"
+ ]
},
"valid_from": {
"type": "string",
From 8165f890984d50607be766c4d4aa866ba8c3cedb Mon Sep 17 00:00:00 2001
From: Paolo Fabio Zaino
Date: Wed, 2 Oct 2024 15:17:37 +0100
Subject: [PATCH 03/12] Reviewed and fixed a few issues found in config.default
---
config.default | 43 +++++++++++++++++++++++++------------------
1 file changed, 25 insertions(+), 18 deletions(-)
diff --git a/config.default b/config.default
index b8015d5..2fd017c 100644
--- a/config.default
+++ b/config.default
@@ -2,45 +2,51 @@
# that should work for most small to medium deployments. You can use this as a
# starting point for your own configuration file.
+---
database:
type: postgres
- host: ${POSTGRES_DB_HOST}
+ host: "${POSTGRES_DB_HOST}"
port: 5432
- user: ${CROWLER_DB_USER}
- password: ${CROWLER_DB_PASSWORD}
+ user: "${CROWLER_DB_USER}"
+ password: "${CROWLER_DB_PASSWORD}"
dbname: SitesIndex
sslmode: disable
+ max_conns: 1000
+ max_idle_conns: 100
-# The CROWler calls web-site's entry-point URLs "sources" as in "source of information"
crawler:
source_screenshot: true
- interval: random(random(5,15), random(45,75))
+ interval: random(random(2,5), random(5,10))
workers: 5
- delay: random(3,75)
+ delay: random(4,35)
timeout: 10
maintenance: 60
+ collect_html: false
+ collect_content: false
+ collect_images: false
api:
port: 8080
- host: 0.0.0.0 # Replace this with the network interface IP you want to use for the API (0.0.0.0 means respond on all available IPs)
+ host: 0.0.0.0
timeout: 60
write_timeout: 60
- read_timeout: 60
- rate_limit: "1010,1010" # Rate limit for the API, in requests per second
- enable_console: true # Enable the console for the API (this enables extra endpoint to check system status and add/remove/update crowler sources aka websites entry-point URLs)
+ readheader_timeout: 60
+ rate_limit: '1010,1010'
+ enable_console: true
return_404: false
selenium:
- - type: chrome # This is the type of browser you want to use for this selenium instance, you can use chrome or firefox
- path: "" # If you have deployed CROWLER_VDIs then leave this empty
- port: 4444 # The port where the selenium instance will be listening
- host: crowler_vdi # Replace this with the network name or the IP of your crowler_vdi container
- use_service: false # If you are using CROWLER_VDIs, then set this to false
- sslmode: disable # If you are using CROWLER_VDIs locally (intranet/vpn/private lan), then set this to disable
+ - name: crowler_vdi_01
+ type: chrome
+ path: ''
+ port: 4444
+ host: crowler_vdi_1
+ use_service: false
+ sslmode: disable
image_storage:
type: local
- path: /app/data/images
+ path: "/app/data/images"
network_info:
netlookup:
@@ -55,8 +61,9 @@ network_info:
service_scout:
enabled: true
timeout: 1200
- geolocation:
+ geo_localization:
enabled: false
+ path: ''
timeout: 15
debug_level: 0
From f7be5b2d7128b0dbb4759d965ad97d37064d1d00 Mon Sep 17 00:00:00 2001
From: Paolo Fabio Zaino
Date: Wed, 2 Oct 2024 15:23:50 +0100
Subject: [PATCH 04/12] Paired config schema yaml with the json version
---
schemas/crowler-config-schema.yaml | 207 +++++++++++++++++++++++++----
1 file changed, 180 insertions(+), 27 deletions(-)
diff --git a/schemas/crowler-config-schema.yaml b/schemas/crowler-config-schema.yaml
index f2857b9..4067232 100644
--- a/schemas/crowler-config-schema.yaml
+++ b/schemas/crowler-config-schema.yaml
@@ -427,6 +427,18 @@ properties:
title: "CROWler General/Search API Requests Rate Limit"
description: "This is the rate limit for the General/Search API. It is the maximum number of requests that the CROWler General API will accept per second. You can use the ExprTerpreter language to set the rate limit."
type: "string"
+ readheader_timeout:
+ title: "CROWler General/Search API Readheader Timeout"
+ type: "integer"
+ minimum: "10"
+ description: "This is the readheader timeout (in seconds) for the General/Search API. It is the maximum amount of time that the CROWler will wait for the General/Search API to respond."
+ examples:
+ - "30"
+ write_timeout:
+ title: "CROWler Engine General/Search API Write Timeout"
+ type: "integer"
+ minimum: "10"
+ description: "This is the write timeout (in seconds) for the General/Search API. It is the maximum amount of time that the CROWler will wait for the control API to respond."
enable_console:
title: "CROWler General/Search API Enable Admin Console"
description: "This is a flag that tells the CROWler General API to enable the 'admin console' via the API. In other words, you'll get more endpoints to manage the CROWler via the General API instead of having to use local commands to do admin tasks."
@@ -444,6 +456,7 @@ properties:
description: "This is the VDI configuration section, it's used to configure the VDI and tell the CROWler's Engine how to connect to it. It is the configuration for the selenium driver, to scale the CROWler web crawling capabilities, you can add multiple VDIs in an array format."
type: "array"
items:
+ title: "CROWler VDI Configuration Items"
type: "object"
properties:
name:
@@ -657,6 +670,7 @@ properties:
description: "This is the list of proxies that the CROWler will use to collect HTTP headers."
type: "array"
items:
+ title: "CROWler HTTP Headers collection Proxies Items"
type: "object"
properties:
host:
@@ -994,6 +1008,7 @@ properties:
description: "This is the rulesets load configuration section, it is used to tell the CROWler where and how to load all the Rulesets we want to use to crawl, interact, scrape info and detect stuff on the provided Sources to crawl."
type: "array"
items:
+ title: "CROWler Rulesets locations Configuration parameters"
type: "object"
properties:
path:
@@ -1081,6 +1096,7 @@ properties:
description: "This is the list of locations from where the CROWler will load and register all available plugins."
type: "array"
items:
+ title: "CROWler Plugins Location Configuration"
type: "object"
properties:
path:
@@ -1149,32 +1165,166 @@ properties:
external_detection:
title: "CROWler External Detection Services Configuration"
description: "This is the External Detection configuration section, it is used to tell the CROWler's Engine which external detection services we want to use and provide credentials to access them. External detection services are provided by VirusTotal, URLHaus, PhishTank, GoogleSafeBrowsing, AbuseIPDB, OpenPhish, Cuckoo, HybridAnalysis, CiscoUmbrella, AlienVault, IPVoid, Shodan, Censys, SSLLabs. They can be accessed using the CROWler's Detection Rules."
- type: "array"
- items:
- type: "object"
- properties:
- name:
- type: "string"
- description: "This is the name of the external detection service. For example, VirusTotal, URLHaus, PhishTank, GoogleSafeBrowsing, AbuseIPDB, OpenPhish, Cuckoo, HybridAnalysis, CiscoUmbrella, AlienVault, IPVoid, Shodan, Censys, SSLLabs."
- enabled:
- type: "boolean"
- description: "This is a flag that tells the CROWler to use the external detection service."
- api_key:
- type: "string"
- description: "This is the API key for the external detection service."
- timeout:
- type: "integer"
- minimum: "10"
- description: "This is the timeout in seconds for the external detection service."
- delay:
- type: "string"
- description: "This is the delay time (in seconds) between requests for the external detection service."
- additionalProperties: "false"
- required:
- - "name"
- - "api_key"
- minItems: "1"
- uniqueItems: "true"
+ type: "object"
+ properties:
+ abuse_ipdb:
+ title: "AbuseIPDB Configuration"
+ description: "This is the AbuseIPDB configuration section, it is used to tell the CROWler's Engine how to connect to AbuseIPDB and use its services. AbuseIPDB will be accessed through CROWler's Detection Rules."
+ type: "object"
+ properties:
+ api_key:
+ title: "AbuseIPDB API Key"
+ description: "This is the API key that the CROWler will use to connect to AbuseIPDB."
+ type: "string"
+ additionalProperties: "false"
+ required:
+ - "api_key"
+ alien_vault:
+ title: "Alien Vault Configuration"
+ description: "This is the Alien Vault configuration section, it is used to tell the CROWler's Engine how to connect to Alien Vault and use its services. Alien Vault will be accessed through CROWler's Detection Rules."
+ type: "object"
+ properties:
+ api_key:
+ title: "Alien Vault API Key"
+ description: "This is the API key that the CROWler will use to connect to Alien Vault."
+ type: "string"
+ additionalProperties: "false"
+ required:
+ - "api_key"
+ censys:
+ title: "Censys Configuration"
+ description: "This is the Censys configuration section, it is used to tell the CROWler's Engine how to connect to Censys and use its services. Censys will be accessed through CROWler's Detection Rules."
+ type: "object"
+ properties:
+ api_id:
+ title: "Censys API ID"
+ description: "This is the API ID that the CROWler will use to connect to Censys."
+ type: "string"
+ api_secret:
+ title: "Censys API Secret"
+ description: "This is the API secret that the CROWler will use to connect to Censys."
+ type: "string"
+ additionalProperties: "false"
+ required:
+ - "api_id"
+ - "api_secret"
+ cisco_umbrella:
+ title: "Cisco Umbrella Configuration"
+ description: "This is the Cisco Umbrella configuration section, it is used to tell the CROWler's Engine how to connect to Cisco Umbrella and use its services. Cisco Umbrella will be accessed through CROWler's Detection Rules."
+ type: "object"
+ properties:
+ api_key:
+ title: "Cisco Umbrella API Key"
+ description: "This is the API key that the CROWler will use to connect to Cisco Umbrella."
+ type: "string"
+ additionalProperties: "false"
+ required:
+ - "api_key"
+ cuckoo:
+ title: "Cuckoo Configuration"
+ description: "This is the Cuckoo configuration section, it is used to tell the CROWler's Engine how to connect to Cuckoo and use its services. Cuckoo will be accessed through CROWler's Detection Rules."
+ type: "object"
+ properties:
+ additionalProperties: "false"
+ google_safe_browsing:
+ title: "Google Safe Browsing Configuration"
+ description: "This is the Google Safe Browsing configuration section, it is used to tell the CROWler's Engine how to connect to Google Safe Browsing and use its services. Google Safe Browsing will be accessed through CROWler's Detection Rules."
+ type: "object"
+ properties:
+ company_id:
+ title: "Google Safe Browsing Company ID"
+ description: "This is the company ID that the CROWler will use to connect to Google Safe Browsing."
+ type: "string"
+ api_key:
+ title: "Google Safe Browsing API Key"
+ description: "This is the API key that the CROWler will use to connect to Google Safe Browsing."
+ type: "string"
+ additionalProperties: "false"
+ required:
+ - "api_key"
+ hybrid_analysis:
+ title: "Hybrid Analysis Configuration"
+ description: "This is the Hybrid Analysis configuration section, it is used to tell the CROWler's Engine how to connect to Hybrid Analysis and use its services. Hybrid Analysis will be accessed through CROWler's Detection Rules."
+ type: "object"
+ properties:
+ api_key:
+ title: "Hybrid Analysis API Key"
+ description: "This is the API key that the CROWler will use to connect to Hybrid Analysis."
+ type: "string"
+ additionalProperties: "false"
+ required:
+ - "api_key"
+ ipvoid:
+ title: "IPVoid Configuration"
+ description: "This is the IPVoid configuration section, it is used to tell the CROWler's Engine how to connect to IPVoid and use its services. IPVoid will be accessed through CROWler's Detection Rules."
+ type: "object"
+ properties:
+ api_key:
+ title: "IPVoid API Key"
+ description: "This is the API key that the CROWler will use to connect to IPVoid."
+ type: "string"
+ additionalProperties: "false"
+ required:
+ - "api_key"
+ open_phish:
+ title: "OpenPhish Configuration"
+ description: "This is the OpenPhish configuration section, it is used to tell the CROWler's Engine how to connect to OpenPhish and use its services. OpenPhish will be accessed through CROWler's Detection Rules."
+ type: "object"
+ properties:
+ api_key:
+ title: "OpenPhish API Key"
+ description: "This is the API key that the CROWler will use to connect to OpenPhish."
+ type: "string"
+ additionalProperties: "false"
+ required:
+ - "api_key"
+ phish_tank:
+ title: "PhishTank Configuration"
+ description: "This is the PhishTank configuration section, it is used to tell the CROWler's Engine how to connect to PhishTank and use its services. PhishTank will be accessed through CROWler's Detection Rules."
+ type: "object"
+ properties:
+ additionalProperties: "false"
+ shodan:
+ title: "Shodan Configuration"
+ description: "This is the Shodan configuration section, it is used to tell the CROWler's Engine how to connect to Shodan and use its services. Shodan will be accessed through CROWler's Detection Rules."
+ type: "object"
+ properties:
+ api_key:
+ title: "Shodan API Key"
+ description: "This is the API key that the CROWler will use to connect to Shodan."
+ type: "string"
+ additionalProperties: "false"
+ required:
+ - "api_key"
+ ssllabs:
+ title: "SSLLabs Configuration"
+ description: "This is the SSLLabs configuration section, it is used to tell the CROWler's Engine how to connect to SSLLabs and use its services. SSLLabs will be accessed through CROWler's Detection Rules."
+ type: "object"
+ properties:
+ api_key:
+ title: "SSLLabs API Key"
+ description: "This is the API key that the CROWler will use to connect to SSLLabs."
+ type: "string"
+ additionalProperties: "false"
+ url_haus:
+ title: "URLHaus Configuration"
+ description: "This is the URLHaus configuration section, it is used to tell the CROWler's Engine how to connect to URLHaus and use its services. URLHaus will be accessed through CROWler's Detection Rules."
+ type: "object"
+ properties:
+ additionalProperties: "false"
+ virus_total:
+ title: "VirusTotal Configuration"
+ description: "This is the VirusTotal configuration section, it is used to tell the CROWler's Engine how to connect to VirusTotal and use its services. VirusTotal will be accessed through CROWler's Detection Rules."
+ type: "object"
+ properties:
+ api_key:
+ title: "VirusTotal API Key"
+ description: "This is the API key that the CROWler will use to connect to VirusTotal."
+ type: "string"
+ additionalProperties: "false"
+ required:
+ - "api_key"
+ additionalProperties: "false"
os:
title: "CROWler (internal) Platform OS Configuration"
description: "This is the operating system that the CROWler will use to run. For example, linux, windows or macos. This field is set automatically by the CROWler itself, so no need to set it manually."
@@ -1191,10 +1341,12 @@ oneOf:
description: "Configuration where the 'remote' field must be provided. In this case, the local configuration sections like 'database', 'crawler', and others should not be populated."
allOf:
- $ref: "#/properties/remote"
- - type: "object"
+ - title: "Remote Configuration Mode requirements"
+ type: "object"
required:
- "remote"
not:
+ title: "Local Configuration exclusions"
required:
- "database"
- "crawler"
@@ -1222,6 +1374,7 @@ oneOf:
- "selenium"
- "network_info"
not:
+ title: "Local Configuration Mode exclusions"
required:
- "remote"
dependencies:
From 3976a47a5cce6aae48e267cc165ff0ec6e91a4fb Mon Sep 17 00:00:00 2001
From: Paolo Fabio Zaino
Date: Thu, 3 Oct 2024 00:46:27 +0100
Subject: [PATCH 05/12] Improvements on handling external detection via
rulesets
---
pkg/common/common.go | 14 +++
pkg/config/config.go | 29 ++++---
pkg/config/types.go | 37 ++++++--
pkg/crawler/crawler.go | 75 +---------------
pkg/detection/3rd_party.go | 2 +-
pkg/detection/detection.go | 135 ++++++++++++++++++++++++++++-
pkg/detection/types.go | 2 +
pkg/httpinfo/httpinfo.go | 7 +-
pkg/ruleset/detectionrule.go | 23 +++++
pkg/ruleset/types.go | 6 ++
schemas/crowler-config-schema.json | 15 ++++
schemas/ruleset-schema.json | 58 +++++++++++--
12 files changed, 299 insertions(+), 104 deletions(-)
diff --git a/pkg/common/common.go b/pkg/common/common.go
index ce14583..4aac1c4 100644
--- a/pkg/common/common.go
+++ b/pkg/common/common.go
@@ -139,6 +139,20 @@ func IsPathCorrect(path string) bool {
//// ----- HTTP related shared functions ----- ////
+// URLToHost extracts the host from a URL
+func URLToHost(url string) string {
+ host := url
+ if strings.Contains(host, "://") {
+ host = host[strings.Index(host, "://")+3:]
+ }
+ if strings.Contains(host, "/") {
+ host = host[:strings.Index(host, "/")]
+ }
+ host = strings.TrimSuffix(host, "/")
+ host = strings.TrimSpace(host)
+ return host
+}
+
// HostToIP returns the IP address of a given host
func HostToIP(host string) []string {
ips, err := net.LookupIP(host)
diff --git a/pkg/config/config.go b/pkg/config/config.go
index 950cb67..a153f29 100644
--- a/pkg/config/config.go
+++ b/pkg/config/config.go
@@ -328,7 +328,7 @@ func NewConfig() *Config {
}},
},
},
- ExternalDetection: []ExternalDetectionConfig{},
+ ExternalDetection: ExternalDetectionConfig{},
OS: runtime.GOOS,
DebugLevel: 0,
}
@@ -860,16 +860,23 @@ func (c *Config) validatePlugins() {
}
func (c *Config) validateExternalDetection() {
- // Check ExternalDetection
- for i := range c.ExternalDetection {
- if c.ExternalDetection[i].Timeout < 1 {
- c.ExternalDetection[i].Timeout = 60
- }
- if strings.TrimSpace(c.ExternalDetection[i].Delay) == "" {
- c.ExternalDetection[i].Delay = "1"
- } else {
- c.ExternalDetection[i].Delay = strings.TrimSpace(c.ExternalDetection[i].Delay)
- }
+ // Check ExternalDetection Timeout
+ if c.ExternalDetection.Timeout < 1 {
+ c.ExternalDetection.Timeout = 60
+ }
+ // Check ExternalDetection Delay
+ if strings.TrimSpace(c.ExternalDetection.Delay) == "" {
+ c.ExternalDetection.Delay = "1"
+ } else {
+ c.ExternalDetection.Delay = strings.TrimSpace(c.ExternalDetection.Delay)
+ }
+ // Check ExternalDetection MaxRetries
+ if c.ExternalDetection.MaxRetries < 0 {
+ c.ExternalDetection.MaxRetries = 0
+ }
+ // Check ExternalDetection MaxRequests
+ if c.ExternalDetection.MaxRequests < 1 {
+ c.ExternalDetection.MaxRequests = 1
}
}
diff --git a/pkg/config/types.go b/pkg/config/types.go
index 4bf9ac9..cde1e7e 100644
--- a/pkg/config/types.go
+++ b/pkg/config/types.go
@@ -350,7 +350,7 @@ type Config struct {
Plugins PluginsConfig `yaml:"plugins"` // Plugins configuration
- ExternalDetection []ExternalDetectionConfig `yaml:"external_detection"`
+ ExternalDetection ExternalDetectionConfig `yaml:"external_detection"`
OS string `yaml:"os"` // Operating system name
DebugLevel int `yaml:"debug_level"` // Debug level for logging
@@ -362,12 +362,35 @@ type PluginsConfig struct {
}
type ExternalDetectionConfig struct {
- Name string `yaml:"name"` // Name of the external detection service
- Enabled bool `yaml:"enabled"` // Whether to enable the external detection service or not
- APIKey string `yaml:"api_key"` // API key for the external detection service
- Timeout int `yaml:"timeout"` // Timeout for the external detection service (in seconds)
- Delay string `yaml:"delay"` // Delay between requests (in seconds)
- FullSite bool `yaml:"full_site"` // Send each collected URL to the external detection service
+ Timeout int `yaml:"timeout"` // Timeout for external detection (in seconds)
+ MaxRequests int `yaml:"max_requests"` // Maximum number of requests
+ MaxRetries int `yaml:"max_retries"` // Maximum number of retries
+ Delay string `yaml:"delay"` // Delay between requests (in seconds)
+ AbuseIPDB ExtDetectProviderConfig `yaml:"abuse_ipdb"`
+ AlienVault ExtDetectProviderConfig `yaml:"alien_vault"`
+ Censys ExtDetectProviderConfig `yaml:"censys"`
+ CiscoUmbrella ExtDetectProviderConfig `yaml:"cisco_umbrella"`
+ Cuckoo ExtDetectProviderConfig `yaml:"cuckoo"`
+ GreyNoise ExtDetectProviderConfig `yaml:"grey_noise"`
+ GoogleSafeBrowsing ExtDetectProviderConfig `yaml:"google_safe_browsing"`
+ HybridAnalysis ExtDetectProviderConfig `yaml:"hybrid_analysis"`
+ IPQualityScore ExtDetectProviderConfig `yaml:"ip_quality_score"`
+ IPVoid ExtDetectProviderConfig `yaml:"ipvoid"`
+ OpenPhish ExtDetectProviderConfig `yaml:"open_phish"`
+ PhishTank ExtDetectProviderConfig `yaml:"phish_tank"`
+ Shodan ExtDetectProviderConfig `yaml:"shodan"`
+ VirusTotal ExtDetectProviderConfig `yaml:"virus_total"`
+ URLHaus ExtDetectProviderConfig `yaml:"url_haus"`
+}
+
+type ExtDetectProviderConfig struct {
+ Provider string `yaml:"provider"`
+ Host string `yaml:"host"`
+ APIKeyLabel string `yaml:"api_key_label"`
+ APIKey string `yaml:"api_key"`
+ APIID string `yaml:"api_id"`
+ APISecret string `yaml:"api_secret"`
+ APIToken string `yaml:"api_token"`
}
/////////////////////////////////////////////////
diff --git a/pkg/crawler/crawler.go b/pkg/crawler/crawler.go
index b7f5dd9..351c476 100644
--- a/pkg/crawler/crawler.go
+++ b/pkg/crawler/crawler.go
@@ -365,17 +365,13 @@ func (ctx *ProcessContext) CrawlInitialURL(sel SeleniumInstance) (selenium.WebDr
HSSLInfo: nil,
WD: &(ctx.wd),
RE: ctx.re,
+ Config: &ctx.config,
}
detectedTech := detect.DetectTechnologies(&detectCtx)
if detectedTech != nil {
pageInfo.DetectedTech = (*detectedTech)
}
- // Use external Detection if enabled
- if len(ctx.config.ExternalDetection) > 0 {
- pageInfo.ExtDetectionResults = UseExternalDetection(ctx, ctx.source.URL)
- }
-
if !ctx.config.Crawler.CollectHTML {
// If we don't need to collect HTML content, clear it
pageInfo.HTML = ""
@@ -406,74 +402,6 @@ func (ctx *ProcessContext) CrawlInitialURL(sel SeleniumInstance) (selenium.WebDr
return pageSource, nil
}
-// UseExternalDetection is responsible for using external detection services
-func UseExternalDetection(ctx *ProcessContext, url string) []map[string]interface{} {
- var results []map[string]interface{}
- for _, extDet := range ctx.config.ExternalDetection {
- if extDet.Enabled {
- // Call the external detection service
- switch strings.ToLower(strings.TrimSpace(extDet.Name)) {
- case "virustotal": // VirusTotal
- vtResults := detect.ScanWithVirusTotal(extDet.APIKey, url)
- if vtResults != nil {
- results = append(results, vtResults)
- }
- case "shodan": // Shodan
- shodanResults := detect.ScanWithShodan(extDet.APIKey, url)
- if shodanResults != nil {
- results = append(results, shodanResults)
- }
- case "urlhaus": // URLScan
- uhResults := detect.ScanWithURLHaus(url)
- if uhResults != nil {
- results = append(results, uhResults)
- }
- case "hybridanalysis": // HybridAnalysis
- haResults := detect.ScanWithHybridAnalysis(extDet.APIKey, url)
- if haResults != nil {
- results = append(results, haResults)
- }
- case "alienvalut": // AlienVault
- avResults := detect.ScanWithAlienVault(extDet.APIKey, url)
- if avResults != nil {
- results = append(results, avResults)
- }
- case "phishtank": // PhishTank
- ptResults := detect.ScanWithPhishTank(extDet.APIKey, url)
- if ptResults != nil {
- results = append(results, ptResults)
- }
- case "googlesafebrowsing": // Google Safe Browsing
- gsbResults := detect.ScanWithGoogleSafeBrowsing(extDet.APIKey, url)
- if gsbResults != nil {
- results = append(results, gsbResults)
- }
- case "openphish": // OpenPhish
- ofResults := detect.ScanWithOpenPhish(extDet.APIKey, url)
- if ofResults != nil {
- results = append(results, ofResults)
- }
- case "cuckoo": // Cuckoo
- ckResults := detect.ScanWithCuckoo(extDet.APIKey, url)
- if ckResults != nil {
- results = append(results, ckResults)
- }
- case "ciscoumbrella": // Cisco Umbrella
- cuResults := detect.ScanWithCiscoUmbrella(extDet.APIKey, url)
- if cuResults != nil {
- results = append(results, cuResults)
- }
- case "threatcrowd": // ThreatCrowd
- tcResults := detect.ScanWidthThreatCrowd(url)
- if tcResults != nil {
- results = append(results, tcResults)
- }
- }
- }
- }
- return results
-}
-
// Collects the performance metrics logs from the browser
func collectNavigationMetrics(wd *selenium.WebDriver, pageInfo *PageInfo) {
// Retrieve Navigation Timing metrics
@@ -1949,6 +1877,7 @@ func processJob(processCtx *ProcessContext, id int, url string, skippedURLs []Li
HSSLInfo: nil,
WD: &processCtx.wd,
RE: processCtx.re,
+ Config: &processCtx.config,
}
detectedTech := detect.DetectTechnologies(&detectCtx)
if detectedTech != nil {
diff --git a/pkg/detection/3rd_party.go b/pkg/detection/3rd_party.go
index 1a55a90..bb6f36d 100644
--- a/pkg/detection/3rd_party.go
+++ b/pkg/detection/3rd_party.go
@@ -162,7 +162,7 @@ func ScanWithURLHaus(url string) map[string]interface{} {
}
// ScanWidthThreatCrowd scans a URL with ThreatCrowd.
-func ScanWidthThreatCrowd(url string) map[string]interface{} {
+func ScanWithThreatCrowd(url string) map[string]interface{} {
reqInfo := &trdPRequest{
Provider: "ThreatCrowd",
Method: "GET",
diff --git a/pkg/detection/detection.go b/pkg/detection/detection.go
index 45a8e44..0a52a92 100644
--- a/pkg/detection/detection.go
+++ b/pkg/detection/detection.go
@@ -11,6 +11,7 @@ import (
"github.com/PuerkitoBio/goquery"
cmn "github.com/pzaino/thecrowler/pkg/common"
+ cfg "github.com/pzaino/thecrowler/pkg/config"
ruleset "github.com/pzaino/thecrowler/pkg/ruleset"
"github.com/tebeka/selenium"
@@ -22,10 +23,11 @@ const (
// detectionEntityDetails is used internally to represent the details of an entity detection
type detectionEntityDetails struct {
- entityType string
- matchedPatterns []string
- confidence float32
- pluginResult map[string]interface{}
+ entityType string
+ matchedPatterns []string
+ confidence float32
+ pluginResult map[string]interface{}
+ externalDetection map[string]interface{}
}
// IsEmpty checks if the detectionEntityDetails is empty
@@ -145,6 +147,14 @@ func DetectTechnologies(dtCtx *DetectionContext) *map[string]DetectedEntity {
processImpliedTechnologies(&detectedTech, &Patterns)
}
+ // Process external detection
+ if len(detectedTech) > 0 {
+ ExternalDetection := ruleset.GetAllExternalDetectionsMap(&Patterns)
+ if len(ExternalDetection) > 0 {
+ detectTechnologiesByExternalDetection(dtCtx.TargetURL, dtCtx.Config, &ExternalDetection, &detectedTech)
+ }
+ }
+
// Transform the detectedTech map into a map of strings
detectedTechStr := make(map[string]DetectedEntity)
if len(detectedTech) == 0 {
@@ -546,6 +556,123 @@ func detectTechnologiesWithPlugins(wd *selenium.WebDriver, re *ruleset.RuleEngin
}
}
+func detectTechnologiesByExternalDetection(url string, conf *cfg.Config, ExternalDetection *map[string][]ruleset.ExternalDetection, detectedTech *map[string]detectionEntityDetails) {
+ // Iterate through all the external detection services and check for possible technologies
+ for ObjName := range *ExternalDetection {
+ for _, externalDetection := range (*ExternalDetection)[ObjName] {
+ // Send Current URL to the configured external detection services
+ var result map[string]interface{}
+ switch externalDetection.Provider {
+ case "abuse_ipdb":
+ // Resolve IP of current URL
+ host := cmn.URLToHost(url)
+ ips := cmn.HostToIP(host)
+ result = make(map[string]interface{})
+ // AbuseIPDB
+ for _, ip := range ips {
+ rval := ScanWithAbuseIPDB(conf.ExternalDetection.AbuseIPDB.APIKey, ip)
+ if rval != nil {
+ // add rval rows to result
+ for k, v := range rval {
+ result[k] = v
+ }
+ }
+ }
+ case "ipvoid":
+ // Resolve IP of current URL
+ host := cmn.URLToHost(url)
+ ips := cmn.HostToIP(host)
+ result = make(map[string]interface{})
+ // IPVoid
+ for _, ip := range ips {
+ rval := ScanWithIPVoid(conf.ExternalDetection.IPVoid.APIKey, ip)
+ if rval != nil {
+ // add rval rows to result
+ for k, v := range rval {
+ result[k] = v
+ }
+ }
+ }
+ case "censys":
+ // Resolve IP of current URL
+ host := cmn.URLToHost(url)
+ ips := cmn.HostToIP(host)
+ result = make(map[string]interface{})
+ // Censys
+ for _, ip := range ips {
+ rval := ScanWithCensys(conf.ExternalDetection.Censys.APIID, conf.ExternalDetection.Censys.APISecret, ip)
+ if rval != nil {
+ // add rval rows to result
+ for k, v := range rval {
+ result[k] = v
+ }
+ }
+ }
+ case "ssllabs":
+ // SSL Labs
+ result = ScanWithSSLLabs(url)
+ case "url_haus":
+ // URL Haus
+ result = ScanWithURLHaus(url)
+ case "threat_crowd":
+ // Threat Crowd
+ result = ScanWithThreatCrowd(url)
+ case "cuckoo_url":
+ // Cuckoo URL
+ result = ScanWithCuckoo(conf.ExternalDetection.Cuckoo.Host, url)
+ case "virus_total":
+ // Virus Total
+ result = ScanWithVirusTotal(conf.ExternalDetection.VirusTotal.APIKey, url)
+ case "phish_tank":
+ // Phish Tank
+ result = ScanWithPhishTank(conf.ExternalDetection.PhishTank.APIKey, url)
+ case "google_safe_browsing":
+ // Google Safe Browsing
+ result = ScanWithGoogleSafeBrowsing(conf.ExternalDetection.GoogleSafeBrowsing.APIKey, url)
+ case "open_phish":
+ // Open Phish
+ result = ScanWithOpenPhish(conf.ExternalDetection.OpenPhish.APIKey, url)
+ case "hybrid_analysis":
+ // Hybrid Analysis
+ result = ScanWithHybridAnalysis(conf.ExternalDetection.HybridAnalysis.APIKey, url)
+ case "cisco_umbrella":
+ // Cisco Umbrella
+ result = ScanWithCiscoUmbrella(conf.ExternalDetection.CiscoUmbrella.APIKey, url)
+ case "alien_vault":
+ // Alien Vault
+ result = ScanWithAlienVault(conf.ExternalDetection.AlienVault.APIKey, url)
+ case "shodan":
+ // Shodan
+ result = ScanWithShodan(conf.ExternalDetection.Shodan.APIKey, url)
+ case "virus_total_file":
+ // Virus Total File
+ result = ScanWithVirusTotalFile(conf.ExternalDetection.VirusTotal.APIKey, url)
+ case "hybrid_analysis_file":
+ // Hybrid Analysis File
+ result = ScanWithHybridAnalysisFile(conf.ExternalDetection.HybridAnalysis.APIKey, url)
+ case "cuckoo_file":
+ // Cuckoo File
+ result = ScanWithCuckooFile(conf.ExternalDetection.Cuckoo.Host, url)
+ default:
+ cmn.DebugMsg(cmn.DbgLvlError, "unknown external detection service: %s", externalDetection.Provider)
+ continue
+ }
+ cmn.DebugMsg(cmn.DbgLvlDebug3, "External Detection: %s", externalDetection.Provider)
+ // Add the external detection result to detectedTech
+ if result != nil {
+ // transform the result to a JSON object
+ jsonResult, err := json.Marshal(result)
+ if err != nil {
+ cmn.DebugMsg(cmn.DbgLvlError, "marshalling external detection result: %s", err)
+ continue
+ }
+ resultStr := string(jsonResult)
+ updateDetectedTechCustom(detectedTech, ObjName, 10, externalDetection.Provider, resultStr)
+ }
+ }
+ }
+}
+
func detectTechByURL(url string, URLSignatures *map[string][]ruleset.URLMicroSignature, detectedTech *map[string]detectionEntityDetails) {
for ObjName := range *URLSignatures {
for _, signature := range (*URLSignatures)[ObjName] {
diff --git a/pkg/detection/types.go b/pkg/detection/types.go
index ac4b7d7..800b5e6 100644
--- a/pkg/detection/types.go
+++ b/pkg/detection/types.go
@@ -5,6 +5,7 @@ import (
"net/http"
cmn "github.com/pzaino/thecrowler/pkg/common"
+ cfg "github.com/pzaino/thecrowler/pkg/config"
ruleset "github.com/pzaino/thecrowler/pkg/ruleset"
"github.com/tebeka/selenium"
)
@@ -18,6 +19,7 @@ type DetectionContext struct {
HSSLInfo *SSLInfo `json:"ssl_info"` // (optional) the SSL information of the target website
ResponseBody *string `json:"response_body"` // (optional) the body of the HTTP response
RE *ruleset.RuleEngine // (required) the RuleEngine to use for the detection process
+ Config *cfg.Config // (required) the configuration to use for the detection process
}
// DetectedEntity is a struct to store the detected entity (technology, asset, etc.)
diff --git a/pkg/httpinfo/httpinfo.go b/pkg/httpinfo/httpinfo.go
index 21b5303..68399a1 100644
--- a/pkg/httpinfo/httpinfo.go
+++ b/pkg/httpinfo/httpinfo.go
@@ -112,7 +112,8 @@ func ExtractHTTPInfo(config Config, re *ruleset.RuleEngine, htmlContent string)
}
// Analyze response body for additional information
- detectedItems, err := analyzeResponse(resp, info, sslInfo, re, &htmlContent)
+ // TODO: We need to receive cfg.Config as a parameter to pass it to the detection engine
+ detectedItems, err := analyzeResponse(resp, info, sslInfo, re, &htmlContent, nil)
if err != nil {
return nil, err
}
@@ -274,7 +275,7 @@ func handleRedirect(req *http.Request, _ []*http.Request, config Config, transpo
// Note: In the future this needs to be moved in http_rules logic
func analyzeResponse(resp *http.Response, info *HTTPDetails,
sslInfo *SSLInfo, re *ruleset.RuleEngine,
- htmlContent *string) (map[string]detect.DetectedEntity, error) {
+ htmlContent *string, conf *cfg.Config) (map[string]detect.DetectedEntity, error) {
// Get the response headers
header := &(*info).ResponseHeaders
@@ -304,6 +305,8 @@ func analyzeResponse(resp *http.Response, info *HTTPDetails,
Header: header,
HSSLInfo: &sslInfoDetect,
ResponseBody: &responseBody,
+ RE: re,
+ Config: conf,
}
x := detect.DetectTechnologies(&detectCtx)
if x != nil {
diff --git a/pkg/ruleset/detectionrule.go b/pkg/ruleset/detectionrule.go
index 9d10f15..3faf907 100644
--- a/pkg/ruleset/detectionrule.go
+++ b/pkg/ruleset/detectionrule.go
@@ -44,6 +44,11 @@ func (d *DetectionRule) GetPluginCalls() []PluginCall {
return d.PluginCalls
}
+// GetExternalDetections returns the external detections for the specified detection rule.
+func (d *DetectionRule) GetExternalDetections() []ExternalDetection {
+ return d.ExternalDetections
+}
+
// GetAllHTTPHeaderFields returns the HTTP header fields for the specified detection rule.
func (d *DetectionRule) GetAllHTTPHeaderFields() []HTTPHeaderField {
return d.HTTPHeaderFields
@@ -241,6 +246,24 @@ func GetAllPluginCallsMap(d *[]DetectionRule) map[string][]PluginCall {
return pluginCalls
}
+// GetAllExternalDetectionsMap returns a map of all external detections for the specified detection rules.
+func GetAllExternalDetectionsMap(d *[]DetectionRule) map[string][]ExternalDetection {
+ externalDetections := make(map[string][]ExternalDetection)
+ for _, rule := range *d {
+ if rule.ExternalDetections == nil {
+ continue
+ }
+ // Check if the key already exists
+ if _, ok := externalDetections[strings.ToLower(rule.ObjectName)]; ok {
+ // Append the new external detections to the existing ones
+ externalDetections[strings.ToLower(rule.ObjectName)] = append(externalDetections[strings.ToLower(rule.ObjectName)], rule.ExternalDetections...)
+ continue
+ }
+ externalDetections[strings.ToLower(rule.ObjectName)] = rule.ExternalDetections
+ }
+ return externalDetections
+}
+
///// --------------------- HTTPHeaderField ------------------------------- /////
// GetKey returns the key of the HTTP header field.
diff --git a/pkg/ruleset/types.go b/pkg/ruleset/types.go
index c291eae..b9c3e8c 100644
--- a/pkg/ruleset/types.go
+++ b/pkg/ruleset/types.go
@@ -154,6 +154,7 @@ type DetectionRule struct {
MetaTags []MetaTag `yaml:"meta_tags,omitempty"`
Implies []string `yaml:"implies,omitempty"`
PluginCalls []PluginCall `yaml:"plugin_calls,omitempty"`
+ ExternalDetections []ExternalDetection `yaml:"external_detection,omitempty"`
}
// PluginCall represents a call to a plugin
@@ -168,6 +169,11 @@ type PluginParams struct {
ArgValue string `yaml:"parameter_value"`
}
+// ExternalDetection represents a call to an external detection service
+type ExternalDetection struct {
+ Provider string `yaml:"provider"`
+}
+
// HTTPHeaderField represents a pattern for matching HTTP header fields
type HTTPHeaderField struct {
Key string `yaml:"key"`
diff --git a/schemas/crowler-config-schema.json b/schemas/crowler-config-schema.json
index 67bd5dc..99e7df4 100644
--- a/schemas/crowler-config-schema.json
+++ b/schemas/crowler-config-schema.json
@@ -535,6 +535,21 @@
"description": "This is the rate limit for the General/Search API. It is the maximum number of requests that the CROWler General API will accept per second. You can use the ExprTerpreter language to set the rate limit.",
"type": "string"
},
+ "readheader_timeout": {
+ "title": "CROWler General/Search API Readheader Timeout",
+ "type": "integer",
+ "minimum": 10,
+ "description": "This is the readheader timeout (in seconds) for the General/Search API. It is the maximum amount of time that the CROWler will wait for the General/Search API to respond.",
+ "examples": [
+ 30
+ ]
+ },
+ "write_timeout": {
+ "title": "CROWler Engine General/Search API Write Timeout",
+ "type": "integer",
+ "minimum": 10,
+ "description": "This is the write timeout (in seconds) for the General/Search API. It is the maximum amount of time that the CROWler will wait for the control API to respond."
+ },
"enable_console": {
"title": "CROWler General/Search API Enable Admin Console",
"description": "This is a flag that tells the CROWler General API to enable the 'admin console' via the API. In other words, you'll get more endpoints to manage the CROWler via the General API instead of having to use local commands to do admin tasks.",
diff --git a/schemas/ruleset-schema.json b/schemas/ruleset-schema.json
index dbfab68..c59217d 100644
--- a/schemas/ruleset-schema.json
+++ b/schemas/ruleset-schema.json
@@ -257,8 +257,9 @@
"type": "object",
"properties": {
"rule_name": {
- "type": "string",
- "description": "A unique name identifying the action rule."
+ "title": "Rule Name",
+ "description": "A unique name identifying the action rule.",
+ "type": "string"
},
"url": {
"type": "string",
@@ -443,12 +444,14 @@
"type": "object",
"properties": {
"rule_name": {
- "type": "string",
- "description": "A unique name identifying the detection rule."
+ "title": "Rule Name",
+ "description": "A unique name identifying the detection rule.",
+ "type": "string"
},
"object_name": {
- "type": "string",
- "description": "The name of the object or technology to identify. This will also be the JSON key in the output. This is also the field to use for the 'implies' field if you want to imply other objects."
+ "title": "Object Name",
+ "description": "The name of the object or technology to identify. This will also be the JSON key in the output. This is also the field to use for the 'implies' field if you want to imply other objects.",
+ "type": "string"
},
"http_header_fields": {
"type": "array",
@@ -602,6 +605,49 @@
}
},
"description": "Optional. Call a plugin to detect the technology."
+ },
+ "external_detection": {
+ "type": "array",
+ "items": {
+ "type": "object",
+ "properties": {
+ "provider": {
+ "title": "Provider",
+ "description": "The name of the supported external detection provider.",
+ "type": "string",
+ "enum": [
+ "abuse_ipdb",
+ "alien_vault",
+ "censys",
+ "cisco_umbrella",
+ "grey_noise",
+ "google_safe_browsing",
+ "hybrid_analysis",
+ "ip_quality_score",
+ "ipvoid",
+ "malware_domain_list",
+ "shodan",
+ "virus_total",
+ "url_haus"
+ ],
+ "examples": [
+ "abuse_ipdb",
+ "alien_vault",
+ "censys",
+ "cisco_umbrella",
+ "grey_noise",
+ "google_safe_browsing",
+ "hybrid_analysis",
+ "ip_quality_score",
+ "ipvoid",
+ "malware_domain_list",
+ "shodan",
+ "virus_total",
+ "url_haus"
+ ]
+ }
+ }
+ }
}
},
"required": [
From acc4912e55421a88920c4dafa8f1bcc58d35186b Mon Sep 17 00:00:00 2001
From: Paolo Fabio Zaino
Date: Thu, 3 Oct 2024 13:01:40 +0100
Subject: [PATCH 06/12] Improved documentation
---
doc/features.md | 9 +++++++++
1 file changed, 9 insertions(+)
diff --git a/doc/features.md b/doc/features.md
index e0c9728..0c4e303 100644
--- a/doc/features.md
+++ b/doc/features.md
@@ -118,6 +118,15 @@ The **CROWler** is a comprehensive web crawling and scraping tool designed to pe
- *Benefits*: Provides insights into the security measures implemented by a website.
- **SSL/TLS Analysis**: Analyzes SSL/TLS certificates and configurations to identify security risks and compliance issues.
+ - The CROWler can detect and analyze the following:
+ - Certificate information
+ - Certificate chain (and order)
+ - Expiry date
+ - Key length
+ - Signature algorithm
+ - Cipher suites
+ - Protocols
+ - Vulnerabilities (e.g., Heartbleed, POODLE, DROWN)
- *Benefits*: Helps ensure secure communication between clients and servers.
- **3rd party Integration**: Integrates with third-party services like Shodan, VirusTotal, and others to gather additional information about web assets.
From c9afeae7b5e898c13464add8dd88fbce78939d26 Mon Sep 17 00:00:00 2001
From: Paolo Fabio Zaino
Date: Thu, 3 Oct 2024 15:09:25 +0100
Subject: [PATCH 07/12] Added new in-memory Key-Value DB to handle Rulesets
generated ENV variables and more
---
main.go | 5 +
pkg/common/common_test.go | 58 ++++++
pkg/common/json_test.go | 1 +
pkg/common/kl.go | 199 +++++++++++++++++++
pkg/common/kl_test.go | 402 ++++++++++++++++++++++++++++++++++++++
5 files changed, 665 insertions(+)
create mode 100644 pkg/common/kl.go
create mode 100644 pkg/common/kl_test.go
diff --git a/main.go b/main.go
index 100700f..b87eb8f 100644
--- a/main.go
+++ b/main.go
@@ -438,12 +438,17 @@ func initAll(configFile *string, config *cfg.Config,
db *cdb.Handler, seleniumInstances *chan crowler.SeleniumInstance,
RulesEngine *rules.RuleEngine, lmt **rate.Limiter) error {
var err error
+
// Reload the configuration file
*config, err = cfg.LoadConfig(*configFile)
if err != nil {
return fmt.Errorf("loading configuration file: %s", err)
}
+ // Reset Key-Value Store
+ cmn.KVStore = nil
+ cmn.KVStore = cmn.NewKeyValueStore()
+
// Reconnect to the database
*db, err = cdb.NewHandler(*config)
if err != nil {
diff --git a/pkg/common/common_test.go b/pkg/common/common_test.go
index 07d4c23..e801fbd 100644
--- a/pkg/common/common_test.go
+++ b/pkg/common/common_test.go
@@ -622,3 +622,61 @@ func TestStringToFloat32(t *testing.T) {
})
}
}
+
+func TestURLToHost(t *testing.T) {
+ tests := []struct {
+ name string
+ url string
+ expected string
+ }{
+ {
+ name: "URLToHost Test case 1",
+ url: "http://example.com/path",
+ expected: "example.com",
+ },
+ {
+ name: "URLToHost Test case 2",
+ url: "https://example.com/path/to/resource",
+ expected: "example.com",
+ },
+ {
+ name: "URLToHost Test case 3",
+ url: "ftp://example.com/resource",
+ expected: "example.com",
+ },
+ {
+ name: "URLToHost Test case 4",
+ url: "example.com/path",
+ expected: "example.com",
+ },
+ {
+ name: "URLToHost Test case 5",
+ url: "example.com",
+ expected: "example.com",
+ },
+ {
+ name: "URLToHost Test case 6",
+ url: "http://example.com/",
+ expected: "example.com",
+ },
+ {
+ name: "URLToHost Test case 7",
+ url: "http://example.com",
+ expected: "example.com",
+ },
+ {
+ name: "URLToHost Test case 8",
+ url: "http://example.com:8080/path",
+ expected: "example.com:8080",
+ },
+ }
+
+ for _, test := range tests {
+ t.Run(test.name, func(t *testing.T) {
+ result := URLToHost(test.url)
+ if result != test.expected {
+ t.Errorf("Expected host %q, but got %q", test.expected, result)
+ }
+ })
+ }
+}
diff --git a/pkg/common/json_test.go b/pkg/common/json_test.go
index 64004f4..dbb822e 100644
--- a/pkg/common/json_test.go
+++ b/pkg/common/json_test.go
@@ -1,3 +1,4 @@
+// Package common is used to store common functions and variables
package common
import (
diff --git a/pkg/common/kl.go b/pkg/common/kl.go
new file mode 100644
index 0000000..254247f
--- /dev/null
+++ b/pkg/common/kl.go
@@ -0,0 +1,199 @@
+// Copyright 2023 Paolo Fabio Zaino
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package common is used to store common functions and variables
+package common
+
+import (
+ "errors"
+ "fmt"
+ "sync"
+)
+
+var (
+ // KVStore is the global key-value store
+ KVStore *KeyValueStore
+)
+
+// Properties defines the additional attributes for each key-value entry.
+type Properties struct {
+ Persistent bool // Whether the entry should be persistent
+ Source string // The source of the key-value entry
+ CtxID string // Context ID for more specific identification
+}
+
+// Entry represents a key-value pair along with its properties.
+type Entry struct {
+ Value string
+ Properties Properties
+}
+
+// KeyValueStore stores key-value pairs with properties and ensures thread safety.
+type KeyValueStore struct {
+ store map[string]Entry
+ mutex sync.RWMutex
+}
+
+// NewKeyValueStore initializes the key-value store.
+func NewKeyValueStore() *KeyValueStore {
+ return &KeyValueStore{
+ store: make(map[string]Entry),
+ }
+}
+
+// createKeyWithCtx combines the key and CtxID to create a unique key.
+func createKeyWithCtx(key string, ctxID string) string {
+ return fmt.Sprintf("%s:%s", key, ctxID)
+}
+
+// Set sets a value along with its properties for a given key and context.
+func (kv *KeyValueStore) Set(key string, value string, properties Properties) {
+ fullKey := createKeyWithCtx(key, properties.CtxID)
+ kv.mutex.Lock()
+ defer kv.mutex.Unlock()
+ kv.store[fullKey] = Entry{
+ Value: value,
+ Properties: properties,
+ }
+}
+
+// Size returns the number of key-value pairs in the store.
+func (kv *KeyValueStore) Size() int {
+ kv.mutex.RLock()
+ defer kv.mutex.RUnlock()
+ return len(kv.store)
+}
+
+// Get retrieves the value and properties for a given key and context.
+func (kv *KeyValueStore) Get(key string, ctxID string) (string, Properties, error) {
+ fullKey := createKeyWithCtx(key, ctxID)
+ kv.mutex.RLock()
+ defer kv.mutex.RUnlock()
+
+ entry, exists := kv.store[fullKey]
+ if !exists {
+ return "", Properties{}, errors.New("key not found for context")
+ }
+ return entry.Value, entry.Properties, nil
+}
+
+// GetBySource retrieves the value and properties for a given key and source.
+func (kv *KeyValueStore) GetBySource(key string, source string) (string, Properties, error) {
+ kv.mutex.RLock()
+ defer kv.mutex.RUnlock()
+
+ for fullKey, entry := range kv.store {
+ if entry.Properties.Source == source && fullKey[:len(key)] == key {
+ return entry.Value, entry.Properties, nil
+ }
+ }
+ return "", Properties{}, errors.New("key not found for the specified source")
+}
+
+// GetWithCtx retrieves the value for a given key, considering both Source and CtxID if provided.
+func (kv *KeyValueStore) GetWithCtx(key string, source string, ctxID string) (string, Properties, error) {
+ fullKey := createKeyWithCtx(key, ctxID)
+ kv.mutex.RLock()
+ defer kv.mutex.RUnlock()
+
+ entry, exists := kv.store[fullKey]
+ if !exists {
+ return "", Properties{}, errors.New("key not found")
+ }
+
+ if source != "" && entry.Properties.Source != source {
+ return "", Properties{}, errors.New("source mismatch")
+ }
+
+ return entry.Value, entry.Properties, nil
+}
+
+// Delete removes a key-value pair by key and context.
+func (kv *KeyValueStore) Delete(key string, ctxID string) error {
+ fullKey := createKeyWithCtx(key, ctxID)
+ kv.mutex.Lock()
+ defer kv.mutex.Unlock()
+
+ if _, exists := kv.store[fullKey]; !exists {
+ return errors.New("key not found for context")
+ }
+
+ delete(kv.store, fullKey)
+ return nil
+}
+
+// DeleteNonPersistent removes all key-value pairs that are not persistent.
+func (kv *KeyValueStore) DeleteNonPersistent() {
+ kv.mutex.Lock()
+ defer kv.mutex.Unlock()
+
+ for key, entry := range kv.store {
+ if !entry.Properties.Persistent {
+ delete(kv.store, key)
+ }
+ }
+}
+
+// DeleteAll clears all key-value pairs from the store.
+func (kv *KeyValueStore) DeleteAll() {
+ kv.mutex.Lock()
+ defer kv.mutex.Unlock()
+
+ kv.store = make(map[string]Entry)
+}
+
+// Keys returns a slice of all keys in the store (ignoring context).
+func (kv *KeyValueStore) Keys() []string {
+ kv.mutex.RLock()
+ defer kv.mutex.RUnlock()
+
+ keys := make([]string, 0, len(kv.store))
+ for key := range kv.store {
+ keys = append(keys, key)
+ }
+ return keys
+}
+
+/*
+func main() {
+ kvStore := NewKeyValueStore()
+
+ // Simulate concurrent writing with different contexts
+ var wg sync.WaitGroup
+ wg.Add(2)
+
+ // Thread 1 with CtxID "123"
+ go func() {
+ defer wg.Done()
+ kvStore.Set("username", "admin", "123")
+ fmt.Println("Thread 1 finished writing")
+ }()
+
+ // Thread 2 with CtxID "456"
+ go func() {
+ defer wg.Done()
+ kvStore.Set("username", "user2", "456")
+ fmt.Println("Thread 2 finished writing")
+ }()
+
+ wg.Wait()
+
+ // Retrieve values from different contexts
+ user1, _ := kvStore.Get("username", "123")
+ user2, _ := kvStore.Get("username", "456")
+
+ fmt.Println("User 1:", user1)
+ fmt.Println("User 2:", user2)
+}
+*/
diff --git a/pkg/common/kl_test.go b/pkg/common/kl_test.go
new file mode 100644
index 0000000..95affcc
--- /dev/null
+++ b/pkg/common/kl_test.go
@@ -0,0 +1,402 @@
+// Package common package is used to store common functions and variables
+package common
+
+import (
+ "errors"
+ "fmt"
+ "testing"
+)
+
+func TestNewKeyValueStore(t *testing.T) {
+ kvStore := NewKeyValueStore()
+
+ if kvStore == nil {
+ t.Fatalf("Expected non-nil KeyValueStore, got nil")
+ }
+
+ if kvStore.store == nil {
+ t.Fatalf("Expected non-nil store map, got nil")
+ }
+
+ if len(kvStore.store) != 0 {
+ t.Fatalf("Expected empty store map, got %d elements", len(kvStore.store))
+ }
+}
+
+func TestCreateKeyWithCtx(t *testing.T) {
+ tests := []struct {
+ key string
+ ctxID string
+ expected string
+ }{
+ {"username", "123", "username:123"},
+ {"email", "456", "email:456"},
+ {"", "789", ":789"},
+ {"password", "", "password:"},
+ {"", "", ":"},
+ }
+
+ for _, tt := range tests {
+ t.Run(fmt.Sprintf("%s_%s", tt.key, tt.ctxID), func(t *testing.T) {
+ result := createKeyWithCtx(tt.key, tt.ctxID)
+ if result != tt.expected {
+ t.Errorf("expected %s, got %s", tt.expected, result)
+ }
+ })
+ }
+}
+
+func TestKeyValueStore_Set(t *testing.T) {
+ kvStore := NewKeyValueStore()
+
+ tests := []struct {
+ key string
+ value string
+ properties Properties
+ }{
+ {"username", "admin", Properties{Persistent: true, Source: "test", CtxID: "123"}},
+ {"email", "admin@example.com", Properties{Persistent: false, Source: "test", CtxID: "456"}},
+ {"password", "secret", Properties{Persistent: true, Source: "test", CtxID: ""}},
+ }
+
+ for _, tt := range tests {
+ t.Run(fmt.Sprintf("%s_%s", tt.key, tt.properties.CtxID), func(t *testing.T) {
+ kvStore.Set(tt.key, tt.value, tt.properties)
+
+ fullKey := createKeyWithCtx(tt.key, tt.properties.CtxID)
+ kvStore.mutex.RLock()
+ entry, exists := kvStore.store[fullKey]
+ kvStore.mutex.RUnlock()
+
+ if !exists {
+ t.Fatalf("Expected key %s to exist", fullKey)
+ }
+
+ if entry.Value != tt.value {
+ t.Errorf("Expected value %s, got %s", tt.value, entry.Value)
+ }
+
+ if entry.Properties != tt.properties {
+ t.Errorf("Expected properties %+v, got %+v", tt.properties, entry.Properties)
+ }
+ })
+ }
+}
+
+func TestKeyValueStore_Get(t *testing.T) {
+ kvStore := NewKeyValueStore()
+
+ // Prepopulate the store with some entries
+ kvStore.Set("username", "admin", Properties{Persistent: true, Source: "test", CtxID: "123"})
+ kvStore.Set("email", "admin@example.com", Properties{Persistent: false, Source: "test", CtxID: "456"})
+ kvStore.Set("password", "secret", Properties{Persistent: true, Source: "test", CtxID: ""})
+
+ tests := []struct {
+ key string
+ ctxID string
+ expected string
+ err error
+ }{
+ {"username", "123", "admin", nil},
+ {"email", "456", "admin@example.com", nil},
+ {"password", "", "secret", nil},
+ {"username", "999", "", errors.New("key not found for context")},
+ {"nonexistent", "123", "", errors.New("key not found for context")},
+ }
+
+ for _, tt := range tests {
+ t.Run(fmt.Sprintf("%s_%s", tt.key, tt.ctxID), func(t *testing.T) {
+ value, _, err := kvStore.Get(tt.key, tt.ctxID)
+
+ if err != nil && err.Error() != tt.err.Error() {
+ t.Fatalf("Expected error %v, got %v", tt.err, err)
+ }
+
+ if value != tt.expected {
+ t.Errorf("Expected value %s, got %s", tt.expected, value)
+ }
+ })
+ }
+}
+
+func TestKeyValueStore_GetBySource(t *testing.T) {
+ kvStore := NewKeyValueStore()
+
+ // Prepopulate the store with some entries
+ kvStore.Set("username", "admin", Properties{Persistent: true, Source: "testSource", CtxID: "123"})
+ kvStore.Set("email", "admin@example.com", Properties{Persistent: false, Source: "testSource", CtxID: "456"})
+ kvStore.Set("password", "secret", Properties{Persistent: true, Source: "anotherSource", CtxID: ""})
+
+ tests := []struct {
+ key string
+ source string
+ expected string
+ err error
+ }{
+ {"username", "testSource", "admin", nil},
+ {"email", "testSource", "admin@example.com", nil},
+ {"password", "anotherSource", "secret", nil},
+ {"username", "nonexistentSource", "", errors.New("key not found for the specified source")},
+ {"nonexistent", "testSource", "", errors.New("key not found for the specified source")},
+ }
+
+ for i, tt := range tests {
+ t.Run(fmt.Sprintf("%s_%s", tt.key, tt.source), func(t *testing.T) {
+ if i < kvStore.Size() {
+ value, _, err := kvStore.GetBySource(tt.key, tt.source)
+
+ if err != nil && err.Error() != tt.err.Error() {
+ t.Fatalf("Expected error %v, got %v", tt.err, err)
+ }
+
+ if value != tt.expected {
+ t.Errorf("Expected value %s, got %s", tt.expected, value)
+ }
+ }
+ })
+ }
+}
+
+func TestKeyValueStore_Size(t *testing.T) {
+ kvStore := NewKeyValueStore()
+
+ // Test empty store
+ if size := kvStore.Size(); size != 0 {
+ t.Fatalf("Expected size 0, got %d", size)
+ }
+
+ // Add some entries
+ kvStore.Set("username", "admin", Properties{Persistent: true, Source: "test", CtxID: "123"})
+ kvStore.Set("email", "admin@example.com", Properties{Persistent: false, Source: "test", CtxID: "456"})
+ kvStore.Set("password", "secret", Properties{Persistent: true, Source: "test", CtxID: ""})
+
+ // Test size after adding entries
+ if size := kvStore.Size(); size != 3 {
+ t.Fatalf("Expected size 3, got %d", size)
+ }
+
+ // Delete an entry and test size
+ err := kvStore.Delete("username", "123")
+ if err != nil {
+ t.Fatalf("Error deleting key: %v", err)
+ }
+ if size := kvStore.Size(); size != 2 {
+ t.Fatalf("Expected size 2, got %d", size)
+ }
+
+ // Clear all entries and test size
+ kvStore.DeleteAll()
+ if size := kvStore.Size(); size != 0 {
+ t.Fatalf("Expected size 0, got %d", size)
+ }
+}
+
+func TestKeyValueStore_GetWithCtx(t *testing.T) {
+ kvStore := NewKeyValueStore()
+
+ // Prepopulate the store with some entries
+ kvStore.Set("username", "admin", Properties{Persistent: true, Source: "testSource", CtxID: "123"})
+ kvStore.Set("email", "admin@example.com", Properties{Persistent: false, Source: "testSource", CtxID: "456"})
+ kvStore.Set("password", "secret", Properties{Persistent: true, Source: "anotherSource", CtxID: ""})
+
+ tests := []struct {
+ key string
+ source string
+ ctxID string
+ expected string
+ err error
+ }{
+ {"username", "testSource", "123", "admin", nil},
+ {"email", "testSource", "456", "admin@example.com", nil},
+ {"password", "anotherSource", "", "secret", nil},
+ {"username", "wrongSource", "123", "", errors.New("source mismatch")},
+ {"username", "testSource", "999", "", errors.New("key not found")},
+ {"nonexistent", "testSource", "123", "", errors.New("key not found")},
+ }
+
+ for _, tt := range tests {
+ t.Run(fmt.Sprintf("%s_%s_%s", tt.key, tt.source, tt.ctxID), func(t *testing.T) {
+ value, _, err := kvStore.GetWithCtx(tt.key, tt.source, tt.ctxID)
+
+ if err != nil && err.Error() != tt.err.Error() {
+ t.Fatalf("Expected error %v, got %v", tt.err, err)
+ }
+
+ if value != tt.expected {
+ t.Errorf("Expected value %s, got %s", tt.expected, value)
+ }
+ })
+ }
+}
+
+func TestKeyValueStore_Delete(t *testing.T) {
+ kvStore := NewKeyValueStore()
+
+ // Prepopulate the store with some entries
+ kvStore.Set("username", "admin", Properties{Persistent: true, Source: "test", CtxID: "123"})
+ kvStore.Set("email", "admin@example.com", Properties{Persistent: false, Source: "test", CtxID: "456"})
+ kvStore.Set("password", "secret", Properties{Persistent: true, Source: "test", CtxID: ""})
+
+ tests := []struct {
+ key string
+ ctxID string
+ expected error
+ }{
+ {"username", "123", nil},
+ {"email", "456", nil},
+ {"password", "", nil},
+ {"nonexistent", "123", errors.New("key not found for context")},
+ {"username", "999", errors.New("key not found for context")},
+ }
+
+ for _, tt := range tests {
+ t.Run(fmt.Sprintf("%s_%s", tt.key, tt.ctxID), func(t *testing.T) {
+ err := kvStore.Delete(tt.key, tt.ctxID)
+
+ if err != nil && err.Error() != tt.expected.Error() {
+ t.Fatalf("Expected error %v, got %v", tt.expected, err)
+ }
+
+ if err == nil {
+ // Verify the key has been deleted
+ _, _, getErr := kvStore.Get(tt.key, tt.ctxID)
+ if getErr == nil {
+ t.Errorf("Expected key %s to be deleted, but it still exists", tt.key)
+ }
+ }
+ })
+ }
+}
+
+func TestKeyValueStore_DeleteNonPersistent(t *testing.T) {
+ kvStore := NewKeyValueStore()
+
+ // Prepopulate the store with some entries
+ kvStore.Set("username", "admin", Properties{Persistent: true, Source: "test", CtxID: "123"})
+ kvStore.Set("email", "admin@example.com", Properties{Persistent: false, Source: "test", CtxID: "456"})
+ kvStore.Set("password", "secret", Properties{Persistent: true, Source: "test", CtxID: ""})
+ kvStore.Set("session", "xyz", Properties{Persistent: false, Source: "test", CtxID: "789"})
+
+ // Ensure the store has the expected number of entries before deletion
+ if size := kvStore.Size(); size != 4 {
+ t.Fatalf("Expected size 4, got %d", size)
+ }
+
+ // Perform the deletion of non-persistent entries
+ kvStore.DeleteNonPersistent()
+
+ // Ensure the store has the expected number of entries after deletion
+ if size := kvStore.Size(); size != 2 {
+ t.Fatalf("Expected size 2, got %d", size)
+ }
+
+ // Verify that only persistent entries remain
+ tests := []struct {
+ key string
+ ctxID string
+ expected string
+ err error
+ }{
+ {"username", "123", "admin", nil},
+ {"password", "", "secret", nil},
+ {"email", "456", "", errors.New("key not found for context")},
+ {"session", "789", "", errors.New("key not found for context")},
+ }
+
+ for _, tt := range tests {
+ t.Run(fmt.Sprintf("%s_%s", tt.key, tt.ctxID), func(t *testing.T) {
+ value, _, err := kvStore.Get(tt.key, tt.ctxID)
+
+ if err != nil && err.Error() != tt.err.Error() {
+ t.Fatalf("Expected error %v, got %v", tt.err, err)
+ }
+
+ if value != tt.expected {
+ t.Errorf("Expected value %s, got %s", tt.expected, value)
+ }
+ })
+ }
+}
+
+func TestKeyValueStore_DeleteAll(t *testing.T) {
+ kvStore := NewKeyValueStore()
+
+ // Prepopulate the store with some entries
+ kvStore.Set("username", "admin", Properties{Persistent: true, Source: "test", CtxID: "123"})
+ kvStore.Set("email", "admin@example.com", Properties{Persistent: false, Source: "test", CtxID: "456"})
+ kvStore.Set("password", "secret", Properties{Persistent: true, Source: "test", CtxID: ""})
+
+ // Ensure the store has the expected number of entries before deletion
+ if size := kvStore.Size(); size != 3 {
+ t.Fatalf("Expected size 3, got %d", size)
+ }
+
+ // Perform the deletion of all entries
+ kvStore.DeleteAll()
+
+ // Ensure the store is empty after deletion
+ if size := kvStore.Size(); size != 0 {
+ t.Fatalf("Expected size 0, got %d", size)
+ }
+
+ // Verify that all entries have been deleted
+ tests := []struct {
+ key string
+ ctxID string
+ err error
+ }{
+ {"username", "123", errors.New("key not found for context")},
+ {"email", "456", errors.New("key not found for context")},
+ {"password", "", errors.New("key not found for context")},
+ }
+
+ for _, tt := range tests {
+ t.Run(fmt.Sprintf("%s_%s", tt.key, tt.ctxID), func(t *testing.T) {
+ _, _, err := kvStore.Get(tt.key, tt.ctxID)
+
+ if err == nil || err.Error() != tt.err.Error() {
+ t.Fatalf("Expected error %v, got %v", tt.err, err)
+ }
+ })
+ }
+}
+
+func TestKeyValueStore_Keys(t *testing.T) {
+ kvStore := NewKeyValueStore()
+
+ // Test with an empty store
+ keys := kvStore.Keys()
+ if len(keys) != 0 {
+ t.Fatalf("Expected 0 keys, got %d", len(keys))
+ }
+
+ // Add some entries
+ kvStore.Set("username", "admin", Properties{Persistent: true, Source: "test", CtxID: "123"})
+ kvStore.Set("email", "admin@example.com", Properties{Persistent: false, Source: "test", CtxID: "456"})
+ kvStore.Set("password", "secret", Properties{Persistent: true, Source: "test", CtxID: ""})
+
+ // Test with a populated store
+ keys = kvStore.Keys()
+ expectedKeys := []string{
+ createKeyWithCtx("username", "123"),
+ createKeyWithCtx("email", "456"),
+ createKeyWithCtx("password", ""),
+ }
+
+ if len(keys) != len(expectedKeys) {
+ t.Fatalf("Expected %d keys, got %d", len(expectedKeys), len(keys))
+ }
+
+ for _, expectedKey := range expectedKeys {
+ found := false
+ for _, key := range keys {
+ if key == expectedKey {
+ found = true
+ break
+ }
+ }
+ if !found {
+ t.Errorf("Expected key %s not found in keys", expectedKey)
+ }
+ }
+}
From 24e522c99bc906bd43ce0e85f326b8961edb913c Mon Sep 17 00:00:00 2001
From: Paolo Fabio Zaino
Date: Thu, 3 Oct 2024 18:25:44 +0100
Subject: [PATCH 08/12] Completed 50% of the work required to handle ENV
 settings in a ruleset
---
pkg/common/kl_test.go | 402 ---------------------
pkg/common/{kl.go => kvstore.go} | 133 ++++---
pkg/common/kvstore_test.go | 598 +++++++++++++++++++++++++++++++
pkg/ruleset/rulesengine_test.go | 7 +-
pkg/ruleset/ruleset.go | 19 +
pkg/ruleset/types.go | 47 ++-
schemas/ruleset-schema.json | 17 +
7 files changed, 745 insertions(+), 478 deletions(-)
delete mode 100644 pkg/common/kl_test.go
rename pkg/common/{kl.go => kvstore.go} (59%)
create mode 100644 pkg/common/kvstore_test.go
diff --git a/pkg/common/kl_test.go b/pkg/common/kl_test.go
deleted file mode 100644
index 95affcc..0000000
--- a/pkg/common/kl_test.go
+++ /dev/null
@@ -1,402 +0,0 @@
-// Package common package is used to store common functions and variables
-package common
-
-import (
- "errors"
- "fmt"
- "testing"
-)
-
-func TestNewKeyValueStore(t *testing.T) {
- kvStore := NewKeyValueStore()
-
- if kvStore == nil {
- t.Fatalf("Expected non-nil KeyValueStore, got nil")
- }
-
- if kvStore.store == nil {
- t.Fatalf("Expected non-nil store map, got nil")
- }
-
- if len(kvStore.store) != 0 {
- t.Fatalf("Expected empty store map, got %d elements", len(kvStore.store))
- }
-}
-
-func TestCreateKeyWithCtx(t *testing.T) {
- tests := []struct {
- key string
- ctxID string
- expected string
- }{
- {"username", "123", "username:123"},
- {"email", "456", "email:456"},
- {"", "789", ":789"},
- {"password", "", "password:"},
- {"", "", ":"},
- }
-
- for _, tt := range tests {
- t.Run(fmt.Sprintf("%s_%s", tt.key, tt.ctxID), func(t *testing.T) {
- result := createKeyWithCtx(tt.key, tt.ctxID)
- if result != tt.expected {
- t.Errorf("expected %s, got %s", tt.expected, result)
- }
- })
- }
-}
-
-func TestKeyValueStore_Set(t *testing.T) {
- kvStore := NewKeyValueStore()
-
- tests := []struct {
- key string
- value string
- properties Properties
- }{
- {"username", "admin", Properties{Persistent: true, Source: "test", CtxID: "123"}},
- {"email", "admin@example.com", Properties{Persistent: false, Source: "test", CtxID: "456"}},
- {"password", "secret", Properties{Persistent: true, Source: "test", CtxID: ""}},
- }
-
- for _, tt := range tests {
- t.Run(fmt.Sprintf("%s_%s", tt.key, tt.properties.CtxID), func(t *testing.T) {
- kvStore.Set(tt.key, tt.value, tt.properties)
-
- fullKey := createKeyWithCtx(tt.key, tt.properties.CtxID)
- kvStore.mutex.RLock()
- entry, exists := kvStore.store[fullKey]
- kvStore.mutex.RUnlock()
-
- if !exists {
- t.Fatalf("Expected key %s to exist", fullKey)
- }
-
- if entry.Value != tt.value {
- t.Errorf("Expected value %s, got %s", tt.value, entry.Value)
- }
-
- if entry.Properties != tt.properties {
- t.Errorf("Expected properties %+v, got %+v", tt.properties, entry.Properties)
- }
- })
- }
-}
-
-func TestKeyValueStore_Get(t *testing.T) {
- kvStore := NewKeyValueStore()
-
- // Prepopulate the store with some entries
- kvStore.Set("username", "admin", Properties{Persistent: true, Source: "test", CtxID: "123"})
- kvStore.Set("email", "admin@example.com", Properties{Persistent: false, Source: "test", CtxID: "456"})
- kvStore.Set("password", "secret", Properties{Persistent: true, Source: "test", CtxID: ""})
-
- tests := []struct {
- key string
- ctxID string
- expected string
- err error
- }{
- {"username", "123", "admin", nil},
- {"email", "456", "admin@example.com", nil},
- {"password", "", "secret", nil},
- {"username", "999", "", errors.New("key not found for context")},
- {"nonexistent", "123", "", errors.New("key not found for context")},
- }
-
- for _, tt := range tests {
- t.Run(fmt.Sprintf("%s_%s", tt.key, tt.ctxID), func(t *testing.T) {
- value, _, err := kvStore.Get(tt.key, tt.ctxID)
-
- if err != nil && err.Error() != tt.err.Error() {
- t.Fatalf("Expected error %v, got %v", tt.err, err)
- }
-
- if value != tt.expected {
- t.Errorf("Expected value %s, got %s", tt.expected, value)
- }
- })
- }
-}
-
-func TestKeyValueStore_GetBySource(t *testing.T) {
- kvStore := NewKeyValueStore()
-
- // Prepopulate the store with some entries
- kvStore.Set("username", "admin", Properties{Persistent: true, Source: "testSource", CtxID: "123"})
- kvStore.Set("email", "admin@example.com", Properties{Persistent: false, Source: "testSource", CtxID: "456"})
- kvStore.Set("password", "secret", Properties{Persistent: true, Source: "anotherSource", CtxID: ""})
-
- tests := []struct {
- key string
- source string
- expected string
- err error
- }{
- {"username", "testSource", "admin", nil},
- {"email", "testSource", "admin@example.com", nil},
- {"password", "anotherSource", "secret", nil},
- {"username", "nonexistentSource", "", errors.New("key not found for the specified source")},
- {"nonexistent", "testSource", "", errors.New("key not found for the specified source")},
- }
-
- for i, tt := range tests {
- t.Run(fmt.Sprintf("%s_%s", tt.key, tt.source), func(t *testing.T) {
- if i < kvStore.Size() {
- value, _, err := kvStore.GetBySource(tt.key, tt.source)
-
- if err != nil && err.Error() != tt.err.Error() {
- t.Fatalf("Expected error %v, got %v", tt.err, err)
- }
-
- if value != tt.expected {
- t.Errorf("Expected value %s, got %s", tt.expected, value)
- }
- }
- })
- }
-}
-
-func TestKeyValueStore_Size(t *testing.T) {
- kvStore := NewKeyValueStore()
-
- // Test empty store
- if size := kvStore.Size(); size != 0 {
- t.Fatalf("Expected size 0, got %d", size)
- }
-
- // Add some entries
- kvStore.Set("username", "admin", Properties{Persistent: true, Source: "test", CtxID: "123"})
- kvStore.Set("email", "admin@example.com", Properties{Persistent: false, Source: "test", CtxID: "456"})
- kvStore.Set("password", "secret", Properties{Persistent: true, Source: "test", CtxID: ""})
-
- // Test size after adding entries
- if size := kvStore.Size(); size != 3 {
- t.Fatalf("Expected size 3, got %d", size)
- }
-
- // Delete an entry and test size
- err := kvStore.Delete("username", "123")
- if err != nil {
- t.Fatalf("Error deleting key: %v", err)
- }
- if size := kvStore.Size(); size != 2 {
- t.Fatalf("Expected size 2, got %d", size)
- }
-
- // Clear all entries and test size
- kvStore.DeleteAll()
- if size := kvStore.Size(); size != 0 {
- t.Fatalf("Expected size 0, got %d", size)
- }
-}
-
-func TestKeyValueStore_GetWithCtx(t *testing.T) {
- kvStore := NewKeyValueStore()
-
- // Prepopulate the store with some entries
- kvStore.Set("username", "admin", Properties{Persistent: true, Source: "testSource", CtxID: "123"})
- kvStore.Set("email", "admin@example.com", Properties{Persistent: false, Source: "testSource", CtxID: "456"})
- kvStore.Set("password", "secret", Properties{Persistent: true, Source: "anotherSource", CtxID: ""})
-
- tests := []struct {
- key string
- source string
- ctxID string
- expected string
- err error
- }{
- {"username", "testSource", "123", "admin", nil},
- {"email", "testSource", "456", "admin@example.com", nil},
- {"password", "anotherSource", "", "secret", nil},
- {"username", "wrongSource", "123", "", errors.New("source mismatch")},
- {"username", "testSource", "999", "", errors.New("key not found")},
- {"nonexistent", "testSource", "123", "", errors.New("key not found")},
- }
-
- for _, tt := range tests {
- t.Run(fmt.Sprintf("%s_%s_%s", tt.key, tt.source, tt.ctxID), func(t *testing.T) {
- value, _, err := kvStore.GetWithCtx(tt.key, tt.source, tt.ctxID)
-
- if err != nil && err.Error() != tt.err.Error() {
- t.Fatalf("Expected error %v, got %v", tt.err, err)
- }
-
- if value != tt.expected {
- t.Errorf("Expected value %s, got %s", tt.expected, value)
- }
- })
- }
-}
-
-func TestKeyValueStore_Delete(t *testing.T) {
- kvStore := NewKeyValueStore()
-
- // Prepopulate the store with some entries
- kvStore.Set("username", "admin", Properties{Persistent: true, Source: "test", CtxID: "123"})
- kvStore.Set("email", "admin@example.com", Properties{Persistent: false, Source: "test", CtxID: "456"})
- kvStore.Set("password", "secret", Properties{Persistent: true, Source: "test", CtxID: ""})
-
- tests := []struct {
- key string
- ctxID string
- expected error
- }{
- {"username", "123", nil},
- {"email", "456", nil},
- {"password", "", nil},
- {"nonexistent", "123", errors.New("key not found for context")},
- {"username", "999", errors.New("key not found for context")},
- }
-
- for _, tt := range tests {
- t.Run(fmt.Sprintf("%s_%s", tt.key, tt.ctxID), func(t *testing.T) {
- err := kvStore.Delete(tt.key, tt.ctxID)
-
- if err != nil && err.Error() != tt.expected.Error() {
- t.Fatalf("Expected error %v, got %v", tt.expected, err)
- }
-
- if err == nil {
- // Verify the key has been deleted
- _, _, getErr := kvStore.Get(tt.key, tt.ctxID)
- if getErr == nil {
- t.Errorf("Expected key %s to be deleted, but it still exists", tt.key)
- }
- }
- })
- }
-}
-
-func TestKeyValueStore_DeleteNonPersistent(t *testing.T) {
- kvStore := NewKeyValueStore()
-
- // Prepopulate the store with some entries
- kvStore.Set("username", "admin", Properties{Persistent: true, Source: "test", CtxID: "123"})
- kvStore.Set("email", "admin@example.com", Properties{Persistent: false, Source: "test", CtxID: "456"})
- kvStore.Set("password", "secret", Properties{Persistent: true, Source: "test", CtxID: ""})
- kvStore.Set("session", "xyz", Properties{Persistent: false, Source: "test", CtxID: "789"})
-
- // Ensure the store has the expected number of entries before deletion
- if size := kvStore.Size(); size != 4 {
- t.Fatalf("Expected size 4, got %d", size)
- }
-
- // Perform the deletion of non-persistent entries
- kvStore.DeleteNonPersistent()
-
- // Ensure the store has the expected number of entries after deletion
- if size := kvStore.Size(); size != 2 {
- t.Fatalf("Expected size 2, got %d", size)
- }
-
- // Verify that only persistent entries remain
- tests := []struct {
- key string
- ctxID string
- expected string
- err error
- }{
- {"username", "123", "admin", nil},
- {"password", "", "secret", nil},
- {"email", "456", "", errors.New("key not found for context")},
- {"session", "789", "", errors.New("key not found for context")},
- }
-
- for _, tt := range tests {
- t.Run(fmt.Sprintf("%s_%s", tt.key, tt.ctxID), func(t *testing.T) {
- value, _, err := kvStore.Get(tt.key, tt.ctxID)
-
- if err != nil && err.Error() != tt.err.Error() {
- t.Fatalf("Expected error %v, got %v", tt.err, err)
- }
-
- if value != tt.expected {
- t.Errorf("Expected value %s, got %s", tt.expected, value)
- }
- })
- }
-}
-
-func TestKeyValueStore_DeleteAll(t *testing.T) {
- kvStore := NewKeyValueStore()
-
- // Prepopulate the store with some entries
- kvStore.Set("username", "admin", Properties{Persistent: true, Source: "test", CtxID: "123"})
- kvStore.Set("email", "admin@example.com", Properties{Persistent: false, Source: "test", CtxID: "456"})
- kvStore.Set("password", "secret", Properties{Persistent: true, Source: "test", CtxID: ""})
-
- // Ensure the store has the expected number of entries before deletion
- if size := kvStore.Size(); size != 3 {
- t.Fatalf("Expected size 3, got %d", size)
- }
-
- // Perform the deletion of all entries
- kvStore.DeleteAll()
-
- // Ensure the store is empty after deletion
- if size := kvStore.Size(); size != 0 {
- t.Fatalf("Expected size 0, got %d", size)
- }
-
- // Verify that all entries have been deleted
- tests := []struct {
- key string
- ctxID string
- err error
- }{
- {"username", "123", errors.New("key not found for context")},
- {"email", "456", errors.New("key not found for context")},
- {"password", "", errors.New("key not found for context")},
- }
-
- for _, tt := range tests {
- t.Run(fmt.Sprintf("%s_%s", tt.key, tt.ctxID), func(t *testing.T) {
- _, _, err := kvStore.Get(tt.key, tt.ctxID)
-
- if err == nil || err.Error() != tt.err.Error() {
- t.Fatalf("Expected error %v, got %v", tt.err, err)
- }
- })
- }
-}
-
-func TestKeyValueStore_Keys(t *testing.T) {
- kvStore := NewKeyValueStore()
-
- // Test with an empty store
- keys := kvStore.Keys()
- if len(keys) != 0 {
- t.Fatalf("Expected 0 keys, got %d", len(keys))
- }
-
- // Add some entries
- kvStore.Set("username", "admin", Properties{Persistent: true, Source: "test", CtxID: "123"})
- kvStore.Set("email", "admin@example.com", Properties{Persistent: false, Source: "test", CtxID: "456"})
- kvStore.Set("password", "secret", Properties{Persistent: true, Source: "test", CtxID: ""})
-
- // Test with a populated store
- keys = kvStore.Keys()
- expectedKeys := []string{
- createKeyWithCtx("username", "123"),
- createKeyWithCtx("email", "456"),
- createKeyWithCtx("password", ""),
- }
-
- if len(keys) != len(expectedKeys) {
- t.Fatalf("Expected %d keys, got %d", len(expectedKeys), len(keys))
- }
-
- for _, expectedKey := range expectedKeys {
- found := false
- for _, key := range keys {
- if key == expectedKey {
- found = true
- break
- }
- }
- if !found {
- t.Errorf("Expected key %s not found in keys", expectedKey)
- }
- }
-}
diff --git a/pkg/common/kl.go b/pkg/common/kvstore.go
similarity index 59%
rename from pkg/common/kl.go
rename to pkg/common/kvstore.go
index 254247f..6cfe77d 100644
--- a/pkg/common/kl.go
+++ b/pkg/common/kvstore.go
@@ -18,24 +18,25 @@ package common
import (
"errors"
"fmt"
+ "reflect"
"sync"
)
-var (
- // KVStore is the global key-value store
- KVStore *KeyValueStore
-)
+// KVStore is the global key-value store
+var KVStore *KeyValueStore
// Properties defines the additional attributes for each key-value entry.
type Properties struct {
- Persistent bool // Whether the entry should be persistent
- Source string // The source of the key-value entry
+ Persistent bool `yaml:"persistent"` // Whether the entry should be persistent
+ Static bool `yaml:"static"` // Whether the entry should be static
+ Source string `yaml:"source"` // The source of the key-value entry
CtxID string // Context ID for more specific identification
+ Type string // The type of the stored value (e.g., "string", "[]string")
}
// Entry represents a key-value pair along with its properties.
type Entry struct {
- Value string
+ Value interface{}
Properties Properties
}
@@ -52,44 +53,69 @@ func NewKeyValueStore() *KeyValueStore {
}
}
+// NewKVStoreProperty initializes a new Properties object.
+func NewKVStoreProperty(persistent bool, static bool, source string, ctxID string, valueType string) Properties {
+ return Properties{
+ Persistent: persistent,
+ Static: static,
+ Source: source,
+ CtxID: ctxID,
+ Type: valueType, // Store the original type
+ }
+}
+
// createKeyWithCtx combines the key and CtxID to create a unique key.
func createKeyWithCtx(key string, ctxID string) string {
return fmt.Sprintf("%s:%s", key, ctxID)
}
-// Set sets a value along with its properties for a given key and context.
-func (kv *KeyValueStore) Set(key string, value string, properties Properties) {
+// Set sets a value (either string or []string) along with its properties for a given key and context.
+func (kv *KeyValueStore) Set(key string, value interface{}, properties Properties) error {
+ // Store the type of the value in the properties
+ properties.Type = reflect.TypeOf(value).String()
+
fullKey := createKeyWithCtx(key, properties.CtxID)
kv.mutex.Lock()
defer kv.mutex.Unlock()
- kv.store[fullKey] = Entry{
- Value: value,
- Properties: properties,
+ setEntry := true
+ if properties.Static {
+ properties.Persistent = true
+ // Search for existing key with the same context
+ if entry, exists := kv.store[fullKey]; exists {
+ setEntry = false
+ // If the existing entry is not static, update it
+ if !entry.Properties.Static {
+ kv.store[fullKey] = Entry{
+ Value: value,
+ Properties: properties,
+ }
+ }
+ }
}
+ if setEntry {
+ kv.store[fullKey] = Entry{
+ Value: value,
+ Properties: properties,
+ }
+ }
+ return nil
}
-// Size returns the number of key-value pairs in the store.
-func (kv *KeyValueStore) Size() int {
- kv.mutex.RLock()
- defer kv.mutex.RUnlock()
- return len(kv.store)
-}
-
-// Get retrieves the value and properties for a given key and context.
-func (kv *KeyValueStore) Get(key string, ctxID string) (string, Properties, error) {
+// Get retrieves the value (which could be string or []string) and properties for a given key and context.
+func (kv *KeyValueStore) Get(key string, ctxID string) (interface{}, Properties, error) {
fullKey := createKeyWithCtx(key, ctxID)
kv.mutex.RLock()
defer kv.mutex.RUnlock()
entry, exists := kv.store[fullKey]
if !exists {
- return "", Properties{}, errors.New("key not found for context")
+ return nil, Properties{}, errors.New("key not found for context")
}
return entry.Value, entry.Properties, nil
}
// GetBySource retrieves the value and properties for a given key and source.
-func (kv *KeyValueStore) GetBySource(key string, source string) (string, Properties, error) {
+func (kv *KeyValueStore) GetBySource(key string, source string) (interface{}, Properties, error) {
kv.mutex.RLock()
defer kv.mutex.RUnlock()
@@ -98,22 +124,22 @@ func (kv *KeyValueStore) GetBySource(key string, source string) (string, Propert
return entry.Value, entry.Properties, nil
}
}
- return "", Properties{}, errors.New("key not found for the specified source")
+ return nil, Properties{}, errors.New("key not found for the specified source")
}
// GetWithCtx retrieves the value for a given key, considering both Source and CtxID if provided.
-func (kv *KeyValueStore) GetWithCtx(key string, source string, ctxID string) (string, Properties, error) {
+func (kv *KeyValueStore) GetWithCtx(key string, source string, ctxID string) (interface{}, Properties, error) {
fullKey := createKeyWithCtx(key, ctxID)
kv.mutex.RLock()
defer kv.mutex.RUnlock()
entry, exists := kv.store[fullKey]
if !exists {
- return "", Properties{}, errors.New("key not found")
+ return nil, Properties{}, errors.New("key not found")
}
if source != "" && entry.Properties.Source != source {
- return "", Properties{}, errors.New("source mismatch")
+ return nil, Properties{}, errors.New("source mismatch")
}
return entry.Value, entry.Properties, nil
@@ -133,6 +159,14 @@ func (kv *KeyValueStore) Delete(key string, ctxID string) error {
return nil
}
+// Size returns the number of key-value pairs in the store.
+func (kv *KeyValueStore) Size() int {
+ kv.mutex.RLock()
+ defer kv.mutex.RUnlock()
+
+ return len(kv.store)
+}
+
// DeleteNonPersistent removes all key-value pairs that are not persistent.
func (kv *KeyValueStore) DeleteNonPersistent() {
kv.mutex.Lock()
@@ -153,8 +187,8 @@ func (kv *KeyValueStore) DeleteAll() {
kv.store = make(map[string]Entry)
}
-// Keys returns a slice of all keys in the store (ignoring context).
-func (kv *KeyValueStore) Keys() []string {
+// AllKeys returns a slice of all keys in the store (ignoring context).
+func (kv *KeyValueStore) AllKeys() []string {
kv.mutex.RLock()
defer kv.mutex.RUnlock()
@@ -165,35 +199,16 @@ func (kv *KeyValueStore) Keys() []string {
return keys
}
-/*
-func main() {
- kvStore := NewKeyValueStore()
-
- // Simulate concurrent writing with different contexts
- var wg sync.WaitGroup
- wg.Add(2)
-
- // Thread 1 with CtxID "123"
- go func() {
- defer wg.Done()
- kvStore.Set("username", "admin", "123")
- fmt.Println("Thread 1 finished writing")
- }()
-
- // Thread 2 with CtxID "456"
- go func() {
- defer wg.Done()
- kvStore.Set("username", "user2", "456")
- fmt.Println("Thread 2 finished writing")
- }()
-
- wg.Wait()
-
- // Retrieve values from different contexts
- user1, _ := kvStore.Get("username", "123")
- user2, _ := kvStore.Get("username", "456")
+// Keys returns a slice of all keys in the store for a given context.
+func (kv *KeyValueStore) Keys(ctxID string) []string {
+ kv.mutex.RLock()
+ defer kv.mutex.RUnlock()
- fmt.Println("User 1:", user1)
- fmt.Println("User 2:", user2)
+ keys := make([]string, 0, len(kv.store))
+ for key := range kv.store {
+ if key[len(key)-len(ctxID):] == ctxID {
+ keys = append(keys, key)
+ }
+ }
+ return keys
}
-*/
diff --git a/pkg/common/kvstore_test.go b/pkg/common/kvstore_test.go
new file mode 100644
index 0000000..3b495cd
--- /dev/null
+++ b/pkg/common/kvstore_test.go
@@ -0,0 +1,598 @@
+// Package common package is used to store common functions and variables
+package common
+
+import (
+ "errors"
+ "fmt"
+ "reflect"
+ "testing"
+)
+
+func TestNewKeyValueStore(t *testing.T) {
+ kvStore := NewKeyValueStore()
+
+ if kvStore == nil {
+ t.Fatalf("Expected non-nil KeyValueStore, got nil")
+ }
+
+ if kvStore.store == nil {
+ t.Fatalf("Expected non-nil store map, got nil")
+ }
+
+ if len(kvStore.store) != 0 {
+ t.Fatalf("Expected empty store map, got %d elements", len(kvStore.store))
+ }
+}
+
+func TestCreateKeyWithCtx(t *testing.T) {
+ tests := []struct {
+ key string
+ ctxID string
+ expected string
+ }{
+ {"username", "123", "username:123"},
+ {"email", "456", "email:456"},
+ {"", "789", ":789"},
+ {"password", "", "password:"},
+ {"", "", ":"},
+ }
+
+ for _, tt := range tests {
+ t.Run(fmt.Sprintf("%s_%s", tt.key, tt.ctxID), func(t *testing.T) {
+ result := createKeyWithCtx(tt.key, tt.ctxID)
+ if result != tt.expected {
+ t.Errorf("expected %s, got %s", tt.expected, result)
+ }
+ })
+ }
+}
+
+func TestKeyValueStore_Set(t *testing.T) {
+ kvStore := NewKeyValueStore()
+
+ tests := []struct {
+ key string
+ value interface{} // Can be either string or []string
+ properties Properties
+ expectedType string // Expected type for the stored value
+ }{
+ {"username", "admin", Properties{Persistent: true, Source: "test", CtxID: "123"}, "string"},
+ {"email", []string{"admin@example.com", "user@example.com"}, Properties{Persistent: false, Source: "test", CtxID: "456"}, "[]string"},
+ {"password", "secret", Properties{Persistent: true, Source: "test", CtxID: ""}, "string"},
+ }
+
+ for _, tt := range tests {
+ t.Run(fmt.Sprintf("%s_%s", tt.key, tt.properties.CtxID), func(t *testing.T) {
+ err := kvStore.Set(tt.key, tt.value, tt.properties)
+ if err != nil {
+ t.Fatalf("Error setting key: %v", err)
+ }
+
+ fullKey := createKeyWithCtx(tt.key, tt.properties.CtxID)
+ kvStore.mutex.RLock()
+ entry, exists := kvStore.store[fullKey]
+ kvStore.mutex.RUnlock()
+
+ if !exists {
+ t.Fatalf("Expected key %s to exist", fullKey)
+ }
+
+ if !reflect.DeepEqual(entry.Value, tt.value) {
+ t.Errorf("Expected value %v, got %v", tt.value, entry.Value)
+ }
+
+ // Check if the stored type matches the expected type
+ if entry.Properties.Type != tt.expectedType {
+ t.Errorf("Expected type %v, got %v", tt.expectedType, entry.Properties.Type)
+ }
+
+ if entry.Properties.Source != tt.properties.Source || entry.Properties.Persistent != tt.properties.Persistent {
+ t.Errorf("Expected properties %+v, got %+v", tt.properties, entry.Properties)
+ }
+ })
+ }
+}
+
+func TestKeyValueStore_Get(t *testing.T) {
+ kvStore := NewKeyValueStore()
+
+ // Prepopulate the store with some entries
+ err := kvStore.Set("username", "admin", Properties{Persistent: true, Source: "test", CtxID: "123", Type: "string"})
+ if err != nil {
+ t.Fatalf("Error setting key: %v", err)
+ }
+ err = kvStore.Set("email", []string{"admin@example.com", "user@example.com"}, Properties{Persistent: false, Source: "test", CtxID: "456", Type: "[]string"})
+ if err != nil {
+ t.Fatalf("Error setting key: %v", err)
+ }
+ err = kvStore.Set("password", "secret", Properties{Persistent: true, Source: "test", CtxID: "", Type: "string"})
+ if err != nil {
+ t.Fatalf("Error setting key: %v", err)
+ }
+
+ tests := []struct {
+ key string
+ ctxID string
+ expected interface{} // Can be string or []string
+ expectedType string // Expected type for the stored value
+ err error
+ }{
+ {"username", "123", "admin", "string", nil},
+ {"email", "456", []string{"admin@example.com", "user@example.com"}, "[]string", nil},
+ {"password", "", "secret", "string", nil},
+ {"username", "999", nil, "", errors.New("key not found for context")},
+ {"nonexistent", "123", nil, "", errors.New("key not found for context")},
+ }
+
+ for _, tt := range tests {
+ t.Run(fmt.Sprintf("%s_%s", tt.key, tt.ctxID), func(t *testing.T) {
+ value, props, err := kvStore.Get(tt.key, tt.ctxID)
+
+ if err != nil && err.Error() != tt.err.Error() {
+ t.Fatalf("Expected error %v, got %v", tt.err, err)
+ }
+
+ if !reflect.DeepEqual(value, tt.expected) {
+ t.Errorf("Expected value %v, got %v", tt.expected, value)
+ }
+
+ // Check if the stored type matches the expected type
+ if props.Type != tt.expectedType {
+ t.Errorf("Expected type %v, got %v", tt.expectedType, props.Type)
+ }
+ })
+ }
+}
+
+func TestKeyValueStore_GetWithCtx(t *testing.T) {
+ kvStore := NewKeyValueStore()
+
+ // Prepopulate the store with some entries
+ err := kvStore.Set("username", "admin", Properties{Persistent: true, Source: "testSource", CtxID: "123", Type: "string"})
+ if err != nil {
+ t.Fatalf("Error setting key: %v", err)
+ }
+ err = kvStore.Set("email", "admin@example.com", Properties{Persistent: false, Source: "testSource", CtxID: "456", Type: "string"})
+ if err != nil {
+ t.Fatalf("Error setting key: %v", err)
+ }
+ err = kvStore.Set("password", "secret", Properties{Persistent: true, Source: "anotherSource", CtxID: "", Type: "string"})
+ if err != nil {
+ t.Fatalf("Error setting key: %v", err)
+ }
+
+ tests := []struct {
+ key string
+ source string
+ ctxID string
+ expected interface{}
+ expectedType string // Expected type for the stored value
+ err error
+ }{
+ {"username", "testSource", "123", "admin", "string", nil},
+ {"email", "testSource", "456", "admin@example.com", "string", nil},
+ {"password", "anotherSource", "", "secret", "string", nil},
+ {"username", "wrongSource", "123", nil, "", errors.New("source mismatch")},
+ {"username", "testSource", "999", nil, "", errors.New("key not found")},
+ {"nonexistent", "testSource", "123", nil, "", errors.New("key not found")},
+ }
+
+ for _, tt := range tests {
+ t.Run(fmt.Sprintf("%s_%s_%s", tt.key, tt.source, tt.ctxID), func(t *testing.T) {
+ value, props, err := kvStore.GetWithCtx(tt.key, tt.source, tt.ctxID)
+
+ // Check error
+ if err != nil && err.Error() != tt.err.Error() {
+ t.Fatalf("Expected error %v, got %v", tt.err, err)
+ }
+
+ // Use reflect.DeepEqual to compare the values
+ if !reflect.DeepEqual(value, tt.expected) {
+ t.Errorf("Expected value %v, got %v", tt.expected, value)
+ }
+
+ // Check if the stored type matches the expected type
+ if props.Type != tt.expectedType {
+ t.Errorf("Expected type %v, got %v", tt.expectedType, props.Type)
+ }
+ })
+ }
+}
+
+func TestKeyValueStore_GetBySource(t *testing.T) {
+ kvStore := NewKeyValueStore()
+
+ // Prepopulate the store with some entries
+ err := kvStore.Set("username", "admin", Properties{Persistent: true, Source: "testSource", CtxID: "123"})
+ if err != nil {
+ t.Fatalf("Error setting key: %v", err)
+ }
+ err = kvStore.Set("email", []string{"admin@example.com", "user@example.com"}, Properties{Persistent: false, Source: "testSource", CtxID: "456"})
+ if err != nil {
+ t.Fatalf("Error setting key: %v", err)
+ }
+ err = kvStore.Set("password", "secret", Properties{Persistent: true, Source: "anotherSource", CtxID: ""})
+ if err != nil {
+ t.Fatalf("Error setting key: %v", err)
+ }
+
+ tests := []struct {
+ key string
+ source string
+ expected interface{} // Can be string or []string
+ err error
+ }{
+ {"username", "testSource", "admin", nil},
+ {"email", "testSource", []string{"admin@example.com", "user@example.com"}, nil},
+ {"password", "anotherSource", "secret", nil},
+ {"username", "nonexistentSource", nil, errors.New("key not found for the specified source")},
+ {"nonexistent", "testSource", nil, errors.New("key not found for the specified source")},
+ }
+
+ for i, tt := range tests {
+ t.Run(fmt.Sprintf("%s_%s", tt.key, tt.source), func(t *testing.T) {
+ if i < kvStore.Size() {
+ value, _, err := kvStore.GetBySource(tt.key, tt.source)
+
+ if err != nil && err.Error() != tt.err.Error() {
+ t.Fatalf("Expected error %v, got %v", tt.err, err)
+ }
+
+ if !reflect.DeepEqual(value, tt.expected) {
+ t.Errorf("Expected value %v, got %v", tt.expected, value)
+ }
+ }
+ })
+ }
+}
+
+func TestKeyValueStore_Size(t *testing.T) {
+ kvStore := NewKeyValueStore()
+
+ // Test empty store
+ if size := kvStore.Size(); size != 0 {
+ t.Fatalf("Expected size 0, got %d", size)
+ }
+
+ // Add some entries
+ err := kvStore.Set("username", "admin", Properties{Persistent: true, Source: "test", CtxID: "123"})
+ if err != nil {
+ t.Fatalf("Error setting key: %v", err)
+ }
+ err = kvStore.Set("email", "admin@example.com", Properties{Persistent: false, Source: "test", CtxID: "456"})
+ if err != nil {
+ t.Fatalf("Error setting key: %v", err)
+ }
+ err = kvStore.Set("password", "secret", Properties{Persistent: true, Source: "test", CtxID: ""})
+ if err != nil {
+ t.Fatalf("Error setting key: %v", err)
+ }
+
+ // Test size after adding entries
+ if size := kvStore.Size(); size != 3 {
+ t.Fatalf("Expected size 3, got %d", size)
+ }
+
+ // Delete an entry and test size
+ err = kvStore.Delete("username", "123")
+ if err != nil {
+ t.Fatalf("Error deleting key: %v", err)
+ }
+ if size := kvStore.Size(); size != 2 {
+ t.Fatalf("Expected size 2, got %d", size)
+ }
+
+ // Clear all entries and test size
+ kvStore.DeleteAll()
+ if size := kvStore.Size(); size != 0 {
+ t.Fatalf("Expected size 0, got %d", size)
+ }
+}
+
+func TestKeyValueStore_Delete(t *testing.T) {
+ kvStore := NewKeyValueStore()
+
+ // Prepopulate the store with some entries
+ err := kvStore.Set("username", "admin", Properties{Persistent: true, Source: "test", CtxID: "123"})
+ if err != nil {
+ t.Fatalf("Error setting key: %v", err)
+ }
+ err = kvStore.Set("email", "admin@example.com", Properties{Persistent: false, Source: "test", CtxID: "456"})
+ if err != nil {
+ t.Fatalf("Error setting key: %v", err)
+ }
+ err = kvStore.Set("password", "secret", Properties{Persistent: true, Source: "test", CtxID: ""})
+ if err != nil {
+ t.Fatalf("Error setting key: %v", err)
+ }
+
+ tests := []struct {
+ key string
+ ctxID string
+ expected error
+ }{
+ {"username", "123", nil},
+ {"email", "456", nil},
+ {"password", "", nil},
+ {"nonexistent", "123", errors.New("key not found for context")},
+ {"username", "999", errors.New("key not found for context")},
+ }
+
+ for _, tt := range tests {
+ t.Run(fmt.Sprintf("%s_%s", tt.key, tt.ctxID), func(t *testing.T) {
+ err := kvStore.Delete(tt.key, tt.ctxID)
+
+ if err != nil && err.Error() != tt.expected.Error() {
+ t.Fatalf("Expected error %v, got %v", tt.expected, err)
+ }
+
+ if err == nil {
+ // Verify the key has been deleted
+ _, _, getErr := kvStore.Get(tt.key, tt.ctxID)
+ if getErr == nil {
+ t.Errorf("Expected key %s to be deleted, but it still exists", tt.key)
+ }
+ }
+ })
+ }
+}
+
+func TestKeyValueStore_DeleteNonPersistent(t *testing.T) {
+ kvStore := NewKeyValueStore()
+
+ // Prepopulate the store with some entries
+ err := kvStore.Set("username", "admin", Properties{Persistent: true, Source: "test", CtxID: "123"})
+ if err != nil {
+ t.Fatalf("Error setting key: %v", err)
+ }
+ err = kvStore.Set("email", "admin@example.com", Properties{Persistent: false, Source: "test", CtxID: "456"})
+ if err != nil {
+ t.Fatalf("Error setting key: %v", err)
+ }
+ err = kvStore.Set("password", "secret", Properties{Persistent: true, Source: "test", CtxID: ""})
+ if err != nil {
+ t.Fatalf("Error setting key: %v", err)
+ }
+ err = kvStore.Set("session", "xyz", Properties{Persistent: false, Source: "test", CtxID: "789"})
+ if err != nil {
+ t.Fatalf("Error setting key: %v", err)
+ }
+
+ // Ensure the store has the expected number of entries before deletion
+ if size := kvStore.Size(); size != 4 {
+ t.Fatalf("Expected size 4, got %d", size)
+ }
+
+ // Perform the deletion of non-persistent entries
+ kvStore.DeleteNonPersistent()
+
+ // Ensure the store has the expected number of entries after deletion
+ if size := kvStore.Size(); size != 2 {
+ t.Fatalf("Expected size 2, got %d", size)
+ }
+
+ // Verify that only persistent entries remain
+ tests := []struct {
+ key string
+ ctxID string
+ expected interface{}
+ err error
+ }{
+ {"username", "123", "admin", nil},
+ {"password", "", "secret", nil},
+ {"email", "456", nil, errors.New("key not found for context")}, // Changed to nil for deleted keys
+ {"session", "789", nil, errors.New("key not found for context")}, // Changed to nil for deleted keys
+ }
+
+ for _, tt := range tests {
+ t.Run(fmt.Sprintf("%s_%s", tt.key, tt.ctxID), func(t *testing.T) {
+ value, _, err := kvStore.Get(tt.key, tt.ctxID)
+
+ // Check if the error matches
+ if err != nil && err.Error() != tt.err.Error() {
+ t.Fatalf("Expected error %v, got %v", tt.err, err)
+ }
+
+ // Only check the value if there's no error (i.e., the key exists)
+ if err == nil && !reflect.DeepEqual(value, tt.expected) {
+ t.Errorf("Expected value %v, got %v", tt.expected, value)
+ }
+ })
+ }
+}
+
+func TestKeyValueStore_DeleteAll(t *testing.T) {
+ kvStore := NewKeyValueStore()
+
+ // Prepopulate the store with some entries
+ err := kvStore.Set("username", "admin", Properties{Persistent: true, Source: "test", CtxID: "123"})
+ if err != nil {
+ t.Fatalf("Error setting key: %v", err)
+ }
+ err = kvStore.Set("email", "admin@example.com", Properties{Persistent: false, Source: "test", CtxID: "456"})
+ if err != nil {
+ t.Fatalf("Error setting key: %v", err)
+ }
+ err = kvStore.Set("password", "secret", Properties{Persistent: true, Source: "test", CtxID: ""})
+ if err != nil {
+ t.Fatalf("Error setting key: %v", err)
+ }
+
+ // Ensure the store has the expected number of entries before deletion
+ if size := kvStore.Size(); size != 3 {
+ t.Fatalf("Expected size 3, got %d", size)
+ }
+
+ // Perform the deletion of all entries
+ kvStore.DeleteAll()
+
+ // Ensure the store is empty after deletion
+ if size := kvStore.Size(); size != 0 {
+ t.Fatalf("Expected size 0, got %d", size)
+ }
+
+ // Verify that all entries have been deleted
+ tests := []struct {
+ key string
+ ctxID string
+ err error
+ }{
+ {"username", "123", errors.New("key not found for context")},
+ {"email", "456", errors.New("key not found for context")},
+ {"password", "", errors.New("key not found for context")},
+ }
+
+ for _, tt := range tests {
+ t.Run(fmt.Sprintf("%s_%s", tt.key, tt.ctxID), func(t *testing.T) {
+ _, _, err := kvStore.Get(tt.key, tt.ctxID)
+
+ if err == nil || err.Error() != tt.err.Error() {
+ t.Fatalf("Expected error %v, got %v", tt.err, err)
+ }
+ })
+ }
+}
+
+func TestKeyValueStore_AllKeys(t *testing.T) {
+ kvStore := NewKeyValueStore()
+
+ // Test with an empty store
+ keys := kvStore.AllKeys()
+ if len(keys) != 0 {
+ t.Fatalf("Expected 0 keys, got %d", len(keys))
+ }
+
+ // Add some entries
+ err := kvStore.Set("username", "admin", Properties{Persistent: true, Source: "test", CtxID: "123"})
+ if err != nil {
+ t.Fatalf("Error setting key: %v", err)
+ }
+ err = kvStore.Set("email", "admin@example.com", Properties{Persistent: false, Source: "test", CtxID: "456"})
+ if err != nil {
+ t.Fatalf("Error setting key: %v", err)
+ }
+ err = kvStore.Set("password", "secret", Properties{Persistent: true, Source: "test", CtxID: ""})
+ if err != nil {
+ t.Fatalf("Error setting key: %v", err)
+ }
+
+ // Test with a populated store
+ keys = kvStore.AllKeys()
+ expectedKeys := []string{
+ createKeyWithCtx("username", "123"),
+ createKeyWithCtx("email", "456"),
+ createKeyWithCtx("password", ""),
+ }
+
+ if len(keys) != len(expectedKeys) {
+ t.Fatalf("Expected %d keys, got %d", len(expectedKeys), len(keys))
+ }
+
+ for _, expectedKey := range expectedKeys {
+ found := false
+ for _, key := range keys {
+ if key == expectedKey {
+ found = true
+ break
+ }
+ }
+ if !found {
+ t.Errorf("Expected key %s not found in keys", expectedKey)
+ }
+ }
+}
+
+func TestNewKVStoreProperty(t *testing.T) {
+ tests := []struct {
+ persistent bool
+ static bool
+ source string
+ ctxID string
+ expected Properties
+ }{
+ {true, false, "source1", "ctx1", Properties{Persistent: true, Static: false, Source: "source1", CtxID: "ctx1", Type: "string"}},
+ {false, true, "source2", "ctx2", Properties{Persistent: false, Static: true, Source: "source2", CtxID: "ctx2", Type: "string"}},
+ {true, true, "source3", "ctx3", Properties{Persistent: true, Static: true, Source: "source3", CtxID: "ctx3", Type: "string"}},
+ {false, false, "source4", "ctx4", Properties{Persistent: false, Static: false, Source: "source4", CtxID: "ctx4", Type: "string"}},
+ }
+
+ for _, tt := range tests {
+ t.Run(fmt.Sprintf("%v_%v_%s_%s_%s", tt.persistent, tt.static, tt.source, tt.ctxID, "string"), func(t *testing.T) {
+ result := NewKVStoreProperty(tt.persistent, tt.static, tt.source, tt.ctxID, "string")
+ if result != tt.expected {
+ t.Errorf("expected %+v, got %+v", tt.expected, result)
+ }
+ })
+ }
+}
+
+func TestKeyValueStore_Keys(t *testing.T) {
+ kvStore := NewKeyValueStore()
+
+ // Test with an empty store
+ keys := kvStore.Keys("123")
+ if len(keys) != 0 {
+ t.Fatalf("Expected 0 keys, got %d", len(keys))
+ }
+
+ // Add some entries
+ err := kvStore.Set("username", "admin", Properties{Persistent: true, Source: "test", CtxID: "123"})
+ if err != nil {
+ t.Fatalf("Error setting key: %v", err)
+ }
+ err = kvStore.Set("email", "admin@example.com", Properties{Persistent: false, Source: "test", CtxID: "456"})
+ if err != nil {
+ t.Fatalf("Error setting key: %v", err)
+ }
+ err = kvStore.Set("password", "secret", Properties{Persistent: true, Source: "test", CtxID: "123"})
+ if err != nil {
+ t.Fatalf("Error setting key: %v", err)
+ }
+
+ // Test with a populated store for a specific context
+ keys = kvStore.Keys("123")
+ expectedKeys := []string{
+ createKeyWithCtx("username", "123"),
+ createKeyWithCtx("password", "123"),
+ }
+
+ if len(keys) != len(expectedKeys) {
+ t.Fatalf("Expected %d keys, got %d", len(expectedKeys), len(keys))
+ }
+
+ for _, expectedKey := range expectedKeys {
+ found := false
+ for _, key := range keys {
+ if key == expectedKey {
+ found = true
+ break
+ }
+ }
+ if !found {
+ t.Errorf("Expected key %s not found in keys", expectedKey)
+ }
+ }
+
+ // Test with a different context
+ keys = kvStore.Keys("456")
+ expectedKeys = []string{
+ createKeyWithCtx("email", "456"),
+ }
+
+ if len(keys) != len(expectedKeys) {
+ t.Fatalf("Expected %d keys, got %d", len(expectedKeys), len(keys))
+ }
+
+ for _, expectedKey := range expectedKeys {
+ found := false
+ for _, key := range keys {
+ if key == expectedKey {
+ found = true
+ break
+ }
+ }
+ if !found {
+ t.Errorf("Expected key %s not found in keys", expectedKey)
+ }
+ }
+}
diff --git a/pkg/ruleset/rulesengine_test.go b/pkg/ruleset/rulesengine_test.go
index d4225d4..eeb8202 100644
--- a/pkg/ruleset/rulesengine_test.go
+++ b/pkg/ruleset/rulesengine_test.go
@@ -116,7 +116,12 @@ func TestRuleEngineMarshalJSON(t *testing.T) {
"CreatedAt": "0001-01-01T00:00:00Z",
"Description": "",
"Name": "",
- "RuleGroups": null
+ "RuleGroups": null,
+ "Env": null,
+ "LoggingConf": {
+ "LogLevel": "",
+ "LogFile": ""
+ }
}
],
"schema": null
diff --git a/pkg/ruleset/ruleset.go b/pkg/ruleset/ruleset.go
index 92898d0..f07ca1b 100644
--- a/pkg/ruleset/ruleset.go
+++ b/pkg/ruleset/ruleset.go
@@ -19,6 +19,8 @@ package ruleset
import (
"fmt"
"strings"
+
+ cmn "github.com/pzaino/thecrowler/pkg/common"
)
///// ------------------------ RULESET ---------------------------------- /////
@@ -33,6 +35,23 @@ func NewRuleset(name string) Ruleset {
}
}
+func (rs *Ruleset) SetEnv(CtxID string) {
+ if rs.Env != nil {
+ for i := 0; i < len(rs.Env); i++ {
+ // Retrieve the environment variable key, value and properties
+ key := rs.Env[i].Key
+ value := rs.Env[i].Value
+ properties := rs.Env[i].Properties
+ // Set the environment variable
+ envProperties := cmn.NewKVStoreProperty(properties.Persistent, properties.Static, properties.Source, CtxID, properties.Type)
+ err := cmn.KVStore.Set(key, value, envProperties)
+ if err != nil {
+ cmn.DebugMsg(cmn.DbgLvlError, fmt.Sprintf("setting environment variable %s: %s", key, err.Error()))
+ }
+ }
+ }
+}
+
/// --- Checks --- ///
// IsValid checks if the Ruleset is valid.
diff --git a/pkg/ruleset/types.go b/pkg/ruleset/types.go
index b9c3e8c..2016a08 100644
--- a/pkg/ruleset/types.go
+++ b/pkg/ruleset/types.go
@@ -58,26 +58,41 @@ type CustomTime struct {
// Ruleset represents the top-level structure of the rules YAML file
type Ruleset struct {
- FormatVersion string `yaml:"format_version"`
- Author string `yaml:"author"`
- CreatedAt CustomTime `yaml:"created_at"`
- Description string `yaml:"description"`
- Name string `yaml:"ruleset_name"`
- RuleGroups []RuleGroup `yaml:"rule_groups"`
+ FormatVersion string `yaml:"format_version"`
+ Author string `yaml:"author"`
+ CreatedAt CustomTime `yaml:"created_at"`
+ Description string `yaml:"description"`
+ Name string `yaml:"ruleset_name"`
+ RuleGroups []RuleGroup `yaml:"rule_groups"`
+ Env []EnvSetting `yaml:"environment_settings,omitempty"`
+ LoggingConf LoggingConfiguration `yaml:"logging_configuration,omitempty"`
}
// RuleGroup represents a group of rules
type RuleGroup struct {
- GroupName string `yaml:"group_name"`
- ValidFrom CustomTime `yaml:"valid_from,omitempty"`
- ValidTo CustomTime `yaml:"valid_to,omitempty"`
- IsEnabled bool `yaml:"is_enabled"`
- ScrapingRules []ScrapingRule `yaml:"scraping_rules,omitempty"`
- ActionRules []ActionRule `yaml:"action_rules,omitempty"`
- DetectionRules []DetectionRule `yaml:"detection_rules,omitempty"`
- CrawlingRules []CrawlingRule `yaml:"crawling_rules,omitempty"`
- EnvironmentSettings EnvironmentSettings `yaml:"environment_settings,omitempty"`
- LoggingConfiguration LoggingConfiguration `yaml:"logging_configuration,omitempty"`
+ GroupName string `yaml:"group_name"`
+ ValidFrom CustomTime `yaml:"valid_from,omitempty"`
+ ValidTo CustomTime `yaml:"valid_to,omitempty"`
+ IsEnabled bool `yaml:"is_enabled"`
+ ScrapingRules []ScrapingRule `yaml:"scraping_rules,omitempty"`
+ ActionRules []ActionRule `yaml:"action_rules,omitempty"`
+ DetectionRules []DetectionRule `yaml:"detection_rules,omitempty"`
+ CrawlingRules []CrawlingRule `yaml:"crawling_rules,omitempty"`
+}
+
+// EnvSetting represents the environment settings for the ruleset
+type EnvSetting struct {
+ Key string `yaml:"key"`
+ Value string `yaml:"value"`
+ Properties EnvProperties `yaml:"properties"`
+}
+
+// EnvProperties represents the properties for the environment settings
+type EnvProperties struct {
+ Persistent bool `yaml:"persistent"`
+ Static bool `yaml:"static"`
+ Type string `yaml:"type"`
+ Source string `yaml:"source"`
}
// PreCondition represents a pre-condition for a scraping rule
diff --git a/schemas/ruleset-schema.json b/schemas/ruleset-schema.json
index c59217d..989a74e 100644
--- a/schemas/ruleset-schema.json
+++ b/schemas/ruleset-schema.json
@@ -795,6 +795,23 @@
"value": {
"type": "string",
"description": "The value of the environment setting."
+ },
+ "properties": {
+ "type": "object",
+ "properties": {
+ "persistent": {
+ "type": "boolean",
+ "description": "Optional. Flag to indicate if the environment setting should be persistent after ruleset completes execution."
+ },
+ "static": {
+ "type": "boolean",
+ "description": "Optional. Flag to indicate if the environment setting should be static and not changeable. That means that the value will be set once and never changed."
+ },
+ "source": {
+ "type": "string",
+ "description": "Optional. The source of the environment setting. If not set manually then the current URL will be used as the source."
+ }
+ }
}
}
},
From 90259701dde29da37d89a03f7d7bfa6e941c18fb Mon Sep 17 00:00:00 2001
From: Paolo Fabio Zaino
Date: Thu, 3 Oct 2024 22:09:23 +0100
Subject: [PATCH 09/12] Added another 15% of the work required to support ENV
variables in the rulesets
---
pkg/common/kvstore.go | 63 ++++++++++++++++++---
pkg/common/kvstore_test.go | 60 +++++++++++++++++++-
pkg/crawler/action_rules.go | 12 +++-
pkg/crawler/crawler.go | 2 +
pkg/crawler/scraper.go | 6 ++
pkg/detection/detection.go | 5 +-
pkg/detection/types.go | 1 +
pkg/ruleset/rulesengine.go | 34 +++++++++---
pkg/ruleset/rulesengine_test.go | 19 +++----
pkg/ruleset/ruleset.go | 26 ++-------
pkg/ruleset/ruleset_test.go | 2 +-
pkg/ruleset/rulesgroup.go | 23 ++++++++
pkg/ruleset/types.go | 32 +++++------
schemas/ruleset-schema.json | 98 ++++++++++++++++-----------------
14 files changed, 266 insertions(+), 117 deletions(-)
diff --git a/pkg/common/kvstore.go b/pkg/common/kvstore.go
index 6cfe77d..797c5b5 100644
--- a/pkg/common/kvstore.go
+++ b/pkg/common/kvstore.go
@@ -19,6 +19,7 @@ import (
"errors"
"fmt"
"reflect"
+ "strings"
"sync"
)
@@ -145,8 +146,19 @@ func (kv *KeyValueStore) GetWithCtx(key string, source string, ctxID string) (in
return entry.Value, entry.Properties, nil
}
+// Size returns the number of key-value pairs in the store.
+func (kv *KeyValueStore) Size() int {
+ kv.mutex.RLock()
+ defer kv.mutex.RUnlock()
+
+ return len(kv.store)
+}
+
// Delete removes a key-value pair by key and context.
-func (kv *KeyValueStore) Delete(key string, ctxID string) error {
+// Flags can be used to specify whether to delete persistent entries.
+// If no flags are provided, only non-persistent entries are deleted.
+// If flags[0] is set to true, persistent entries are also deleted.
+func (kv *KeyValueStore) Delete(key string, ctxID string, flags ...bool) error {
fullKey := createKeyWithCtx(key, ctxID)
kv.mutex.Lock()
defer kv.mutex.Unlock()
@@ -155,16 +167,53 @@ func (kv *KeyValueStore) Delete(key string, ctxID string) error {
return errors.New("key not found for context")
}
- delete(kv.store, fullKey)
+ // Check if the entry should be removed
+ removeEntry := true
+ if len(flags) == 0 {
+ if kv.store[fullKey].Properties.Persistent {
+ removeEntry = false
+ }
+ } else {
+ if !flags[0] && kv.store[fullKey].Properties.Persistent {
+ removeEntry = false
+ }
+ }
+ if removeEntry {
+ delete(kv.store, fullKey)
+ }
return nil
}
-// Size returns the number of key-value pairs in the store.
-func (kv *KeyValueStore) Size() int {
- kv.mutex.RLock()
- defer kv.mutex.RUnlock()
+// DeleteByCID removes all key-value pairs for a given context.
+// Flags can be used to specify whether to delete persistent entries.
+// If no flags are provided, only non-persistent entries are deleted.
+// If flags[0] is set to true, persistent entries are also deleted.
+func (kv *KeyValueStore) DeleteByCID(ctxID string, flags ...bool) {
+ kv.mutex.Lock()
+ defer kv.mutex.Unlock()
- return len(kv.store)
+ for key := range kv.store {
+ if strings.HasSuffix(key, ctxID) {
+ removeEntry := true
+
+ // If no flags are provided, only delete non-persistent entries
+ if len(flags) == 0 {
+ if kv.store[key].Properties.Persistent {
+ removeEntry = false
+ }
+ } else {
+ // Handle persistent flag logic
+ if !flags[0] && kv.store[key].Properties.Persistent {
+ removeEntry = false
+ }
+ }
+
+ // Perform the deletion
+ if removeEntry {
+ delete(kv.store, key)
+ }
+ }
+ }
}
// DeleteNonPersistent removes all key-value pairs that are not persistent.
diff --git a/pkg/common/kvstore_test.go b/pkg/common/kvstore_test.go
index 3b495cd..26d6ffe 100644
--- a/pkg/common/kvstore_test.go
+++ b/pkg/common/kvstore_test.go
@@ -5,6 +5,7 @@ import (
"errors"
"fmt"
"reflect"
+ "strings"
"testing"
)
@@ -274,7 +275,7 @@ func TestKeyValueStore_Size(t *testing.T) {
}
// Delete an entry and test size
- err = kvStore.Delete("username", "123")
+ err = kvStore.Delete("username", "123", true)
if err != nil {
t.Fatalf("Error deleting key: %v", err)
}
@@ -320,7 +321,7 @@ func TestKeyValueStore_Delete(t *testing.T) {
for _, tt := range tests {
t.Run(fmt.Sprintf("%s_%s", tt.key, tt.ctxID), func(t *testing.T) {
- err := kvStore.Delete(tt.key, tt.ctxID)
+ err := kvStore.Delete(tt.key, tt.ctxID, true)
if err != nil && err.Error() != tt.expected.Error() {
t.Fatalf("Expected error %v, got %v", tt.expected, err)
@@ -596,3 +597,58 @@ func TestKeyValueStore_Keys(t *testing.T) {
}
}
}
+
+func TestKeyValueStore_DeleteByCID(t *testing.T) {
+ kvStore := NewKeyValueStore()
+
+ // Prepopulate the store with some entries
+ err := kvStore.Set("username", "admin", Properties{Persistent: true, Static: false, Source: "test", CtxID: "123"})
+ if err != nil {
+ t.Fatalf("Error setting key: %v", err)
+ }
+
+ err = kvStore.Set("email", "admin@example.com", Properties{Persistent: false, Static: false, Source: "test", CtxID: "123"})
+ if err != nil {
+ t.Fatalf("Error setting key: %v", err)
+ }
+
+ err = kvStore.Set("password", "secret", Properties{Persistent: true, Static: true, Source: "test", CtxID: "456"}) // Static (implicitly persistent)
+ if err != nil {
+ t.Fatalf("Error setting key: %v", err)
+ }
+
+ err = kvStore.Set("session", "xyz", Properties{Persistent: false, Static: false, Source: "test", CtxID: "456"})
+ if err != nil {
+ t.Fatalf("Error setting key: %v", err)
+ }
+
+ tests := []struct {
+ ctxID string
+ flags []bool
+ expectedRem int // Expected number of remaining entries for the given CtxID
+ }{
+ {"123", nil, 1}, // Only non-persistent entries should be deleted, 1 persistent remains
+ {"456", nil, 1}, // Only non-persistent entries should be deleted, 1 persistent remains
+ {"456", []bool{true}, 0}, // All entries should be deleted
+ }
+
+ for _, tt := range tests {
+ t.Run(fmt.Sprintf("DeleteByCID_%s", tt.ctxID), func(t *testing.T) {
+ kvStore.DeleteByCID(tt.ctxID, tt.flags...)
+
+ // Count the number of remaining entries for the given CtxID
+ remaining := 0
+ for key := range kvStore.store {
+ parts := strings.Split(key, ":")
+ if len(parts) == 2 && parts[1] == tt.ctxID {
+ remaining++
+ }
+ }
+
+ // Verify the number of remaining entries for the specific CtxID
+ if remaining != tt.expectedRem {
+ t.Errorf("Expected %d remaining entries for CtxID %s, got %d", tt.expectedRem, tt.ctxID, remaining)
+ }
+ })
+ }
+}
diff --git a/pkg/crawler/action_rules.go b/pkg/crawler/action_rules.go
index c54b9cf..3300ad3 100644
--- a/pkg/crawler/action_rules.go
+++ b/pkg/crawler/action_rules.go
@@ -54,15 +54,21 @@ func processURLRules(wd *selenium.WebDriver, ctx *ProcessContext, url string) {
if rs != nil {
cmn.DebugMsg(cmn.DbgLvlDebug, "Executing ruleset: %s", rs.Name)
// Execute all the rules in the ruleset
- executeActionRules(ctx, rs.GetAllEnabledActionRules(), wd)
+ executeActionRules(ctx, rs.GetAllEnabledActionRules(ctx.GetContextID(), true), wd)
+ // Clean up non-persistent rules
+ cmn.KVStore.DeleteByCID(ctx.GetContextID())
}
} else {
rg, err := ctx.re.GetRuleGroupByURL(url)
if err == nil {
if rg != nil {
cmn.DebugMsg(cmn.DbgLvlDebug, "Executing rule group: %s", rg.GroupName)
+ // Set the environment variables for the rule group
+ rg.SetEnv(ctx.GetContextID())
// Execute all the rules in the rule group
executeActionRules(ctx, rg.GetActionRules(), wd)
+ // Clean up non-persistent rules
+ cmn.KVStore.DeleteByCID(ctx.GetContextID())
}
}
}
@@ -1121,7 +1127,9 @@ func executePlannedRulesets(wd *selenium.WebDriver, ctx *ProcessContext, planned
cmn.DebugMsg(cmn.DbgLvlError, "getting ruleset: %v", err)
} else {
// Execute the ruleset
- executeActionRules(ctx, rs.GetAllEnabledActionRules(), wd)
+ executeActionRules(ctx, rs.GetAllEnabledActionRules(ctx.GetContextID(), true), wd)
+ // Clean up non-persistent rules
+ cmn.KVStore.DeleteByCID(ctx.GetContextID())
}
}
}
diff --git a/pkg/crawler/crawler.go b/pkg/crawler/crawler.go
index 351c476..c1635d3 100644
--- a/pkg/crawler/crawler.go
+++ b/pkg/crawler/crawler.go
@@ -359,6 +359,7 @@ func (ctx *ProcessContext) CrawlInitialURL(sel SeleniumInstance) (selenium.WebDr
// Detect technologies used on the page
detectCtx := detect.DetectionContext{
+ CtxID: ctx.GetContextID(),
TargetURL: ctx.source.URL,
ResponseBody: nil,
Header: nil,
@@ -1871,6 +1872,7 @@ func processJob(processCtx *ProcessContext, id int, url string, skippedURLs []Li
// Collect Detected Technologies
detectCtx := detect.DetectionContext{
+ CtxID: processCtx.GetContextID(),
TargetURL: url,
ResponseBody: nil,
Header: nil,
diff --git a/pkg/crawler/scraper.go b/pkg/crawler/scraper.go
index 9988a64..e10cec4 100644
--- a/pkg/crawler/scraper.go
+++ b/pkg/crawler/scraper.go
@@ -340,6 +340,9 @@ func ApplyRulesGroup(ctx *ProcessContext, ruleGroup *rs.RuleGroup, url string, w
// Initialize a map to hold the extracted data
extractedData := make(map[string]interface{})
+ // Set the environment variables
+ ruleGroup.SetEnv(ctx.GetContextID())
+
// Iterate over the rules in the rule group
for _, rule := range ruleGroup.ScrapingRules {
// Apply the rule to the web page
@@ -350,6 +353,9 @@ func ApplyRulesGroup(ctx *ProcessContext, ruleGroup *rs.RuleGroup, url string, w
}
}
+ // Remove non-persistent environment variables
+ cmn.KVStore.DeleteByCID(ctx.GetContextID())
+
return extractedData, nil
}
diff --git a/pkg/detection/detection.go b/pkg/detection/detection.go
index 0a52a92..1cc97e5 100644
--- a/pkg/detection/detection.go
+++ b/pkg/detection/detection.go
@@ -40,7 +40,7 @@ func DetectTechnologies(dtCtx *DetectionContext) *map[string]DetectedEntity {
cmn.DebugMsg(cmn.DbgLvlDebug, "Starting technologies detection...")
// micro-signatures
- Patterns := dtCtx.RE.GetAllEnabledDetectionRules()
+ Patterns := dtCtx.RE.GetAllEnabledDetectionRules(dtCtx.CtxID)
if len(Patterns) == 0 {
cmn.DebugMsg(cmn.DbgLvlDebug, "No detection rules enabled")
return nil
@@ -155,6 +155,9 @@ func DetectTechnologies(dtCtx *DetectionContext) *map[string]DetectedEntity {
}
}
+ // Delete non-persistent ENV entries
+ cmn.KVStore.DeleteByCID(dtCtx.CtxID)
+
// Transform the detectedTech map into a map of strings
detectedTechStr := make(map[string]DetectedEntity)
if len(detectedTech) == 0 {
diff --git a/pkg/detection/types.go b/pkg/detection/types.go
index 800b5e6..225c17b 100644
--- a/pkg/detection/types.go
+++ b/pkg/detection/types.go
@@ -12,6 +12,7 @@ import (
// DetectionContext is a struct to store the context of the detection process
type DetectionContext struct {
+ CtxID string `json:"ctx_id"` // (required) the ID of the detection context
TargetURL string `json:"target_url"` // (optional) the URL of the target website
TargetIP string `json:"target_ip"` // (optional) the IP address of the target website
WD *selenium.WebDriver // (optional) the Selenium WebDriver (required to run detection plugins)
diff --git a/pkg/ruleset/rulesengine.go b/pkg/ruleset/rulesengine.go
index 8bb494a..051201c 100644
--- a/pkg/ruleset/rulesengine.go
+++ b/pkg/ruleset/rulesengine.go
@@ -392,54 +392,74 @@ func (re *RuleEngine) GetAllCrawlingRules() []CrawlingRule {
}
// GetAllDetectionRules returns all the detection rules in the RuleEngine.
-func (re *RuleEngine) GetAllDetectionRules() []DetectionRule {
+func (re *RuleEngine) GetAllDetectionRules(CtxID string) []DetectionRule {
var detectionRules []DetectionRule
+ CID := strings.ToLower(strings.TrimSpace(CtxID))
for _, rg := range re.GetAllRuleGroups() {
+ if CID != "" {
+ rg.SetEnv(CID)
+ }
detectionRules = append(detectionRules, rg.DetectionRules...)
}
return detectionRules
}
// GetAllEnabledScrapingRules returns all the enabled scraping rules in the RuleEngine.
-func (re *RuleEngine) GetAllEnabledScrapingRules() []ScrapingRule {
+func (re *RuleEngine) GetAllEnabledScrapingRules(CtxID string) []ScrapingRule {
var scrapingRules []ScrapingRule
+ CID := strings.ToLower(strings.TrimSpace(CtxID))
for _, rg := range re.GetAllEnabledRuleGroups() {
+ if CID != "" {
+ rg.SetEnv(CID)
+ }
scrapingRules = append(scrapingRules, rg.ScrapingRules...)
}
return scrapingRules
}
// GetAllEnabledActionRules returns all the enabled action rules in the RuleEngine.
-func (re *RuleEngine) GetAllEnabledActionRules() []ActionRule {
+func (re *RuleEngine) GetAllEnabledActionRules(CtxID string) []ActionRule {
var actionRules []ActionRule
+ CID := strings.ToLower(strings.TrimSpace(CtxID))
for _, rg := range re.GetAllEnabledRuleGroups() {
+ if CID != "" {
+ rg.SetEnv(CID)
+ }
actionRules = append(actionRules, rg.ActionRules...)
}
return actionRules
}
// GetAllEnabledCrawlingRules returns all the enabled crawling rules in the RuleEngine.
-func (re *RuleEngine) GetAllEnabledCrawlingRules() []CrawlingRule {
+func (re *RuleEngine) GetAllEnabledCrawlingRules(CtxID string) []CrawlingRule {
var crawlingRules []CrawlingRule
+ CID := strings.ToLower(strings.TrimSpace(CtxID))
for _, rg := range re.GetAllEnabledRuleGroups() {
+ if CID != "" {
+ rg.SetEnv(CID)
+ }
crawlingRules = append(crawlingRules, rg.CrawlingRules...)
}
return crawlingRules
}
// GetAllEnabledDetectionRules returns all the enabled detection rules in the RuleEngine.
-func (re *RuleEngine) GetAllEnabledDetectionRules() []DetectionRule {
+func (re *RuleEngine) GetAllEnabledDetectionRules(CtxID string) []DetectionRule {
var detectionRules []DetectionRule
+ CID := strings.ToLower(strings.TrimSpace(CtxID))
for _, rg := range re.GetAllEnabledRuleGroups() {
+ if CID != "" {
+ rg.SetEnv(CID)
+ }
detectionRules = append(detectionRules, rg.DetectionRules...)
}
return detectionRules
}
// GetAllScrapingRulesByURL returns all the scraping rules for the specified URL.
-func (re *RuleEngine) GetAllScrapingRulesByURL(url string) []ScrapingRule {
+func (re *RuleEngine) GetAllScrapingRulesByURL(url, CtxID string) []ScrapingRule {
// find all scraping rules with the specified URL
- rules := re.GetAllEnabledScrapingRules()
+ rules := re.GetAllEnabledScrapingRules(CtxID)
var scrapingRules []ScrapingRule
// Prepare the URL for search
diff --git a/pkg/ruleset/rulesengine_test.go b/pkg/ruleset/rulesengine_test.go
index eeb8202..398a2a5 100644
--- a/pkg/ruleset/rulesengine_test.go
+++ b/pkg/ruleset/rulesengine_test.go
@@ -116,12 +116,7 @@ func TestRuleEngineMarshalJSON(t *testing.T) {
"CreatedAt": "0001-01-01T00:00:00Z",
"Description": "",
"Name": "",
- "RuleGroups": null,
- "Env": null,
- "LoggingConf": {
- "LogLevel": "",
- "LogFile": ""
- }
+ "RuleGroups": null
}
],
"schema": null
@@ -352,7 +347,7 @@ func TestGetAllDetectionRules(t *testing.T) {
},
}
- detectionRules := re.GetAllDetectionRules()
+ detectionRules := re.GetAllDetectionRules("")
if len(detectionRules) != 4 {
t.Errorf("Expected 4 detection rules, got %d", len(detectionRules))
@@ -403,7 +398,7 @@ func TestGetAllEnabledScrapingRules(t *testing.T) {
},
}
- scrapingRules := re.GetAllEnabledScrapingRules()
+ scrapingRules := re.GetAllEnabledScrapingRules("")
if len(scrapingRules) != 3 {
t.Errorf("Expected 2 enabled scraping rules, got %d", len(scrapingRules))
@@ -454,7 +449,7 @@ func TestGetAllEnabledActionRules(t *testing.T) {
},
}
- actionRules := re.GetAllEnabledActionRules()
+ actionRules := re.GetAllEnabledActionRules("")
if len(actionRules) != 3 {
t.Errorf("Expected 2 enabled action rules, got %d", len(actionRules))
@@ -505,7 +500,7 @@ func TestGetAllEnabledCrawlingRules(t *testing.T) {
},
}
- crawlingRules := re.GetAllEnabledCrawlingRules()
+ crawlingRules := re.GetAllEnabledCrawlingRules("")
if len(crawlingRules) != 3 {
for _, cr := range crawlingRules {
@@ -559,7 +554,7 @@ func TestGetAllEnabledDetectionRules(t *testing.T) {
},
}
- detectionRules := re.GetAllEnabledDetectionRules()
+ detectionRules := re.GetAllEnabledDetectionRules("")
if len(detectionRules) != 3 {
t.Errorf("Expected 3 detection rules, got %d", len(detectionRules))
@@ -630,7 +625,7 @@ func TestGetAllScrapingRulesByURL(t *testing.T) {
},
}
- scrapingRules := re.GetAllScrapingRulesByURL("https://example.com")
+ scrapingRules := re.GetAllScrapingRulesByURL("https://example.com", "")
if len(scrapingRules) != 2 {
t.Errorf("Expected 2 scraping rules, but got %d", len(scrapingRules))
diff --git a/pkg/ruleset/ruleset.go b/pkg/ruleset/ruleset.go
index f07ca1b..469f87c 100644
--- a/pkg/ruleset/ruleset.go
+++ b/pkg/ruleset/ruleset.go
@@ -19,8 +19,6 @@ package ruleset
import (
"fmt"
"strings"
-
- cmn "github.com/pzaino/thecrowler/pkg/common"
)
///// ------------------------ RULESET ---------------------------------- /////
@@ -35,23 +33,6 @@ func NewRuleset(name string) Ruleset {
}
}
-func (rs *Ruleset) SetEnv(CtxID string) {
- if rs.Env != nil {
- for i := 0; i < len(rs.Env); i++ {
- // Retrieve the environment variable key, value and properties
- key := rs.Env[i].Key
- value := rs.Env[i].Value
- properties := rs.Env[i].Properties
- // Set the environment variable
- envProperties := cmn.NewKVStoreProperty(properties.Persistent, properties.Static, properties.Source, CtxID, properties.Type)
- err := cmn.KVStore.Set(key, value, envProperties)
- if err != nil {
- cmn.DebugMsg(cmn.DbgLvlError, fmt.Sprintf("setting environment variable %s: %s", key, err.Error()))
- }
- }
- }
-}
-
/// --- Checks --- ///
// IsValid checks if the Ruleset is valid.
@@ -134,10 +115,15 @@ func (rs *Ruleset) GetAllEnabledScrapingRules() []ScrapingRule {
// GetAllEnabledActionRules returns a slice of Rule containing only the enabled action rules.
// It iterates over the RuleGroups in the SiteRules and appends the enabled action rules
// to the result slice.
-func (rs *Ruleset) GetAllEnabledActionRules() []ActionRule {
+func (rs *Ruleset) GetAllEnabledActionRules(CtxID string, flags ...bool) []ActionRule {
var enabledRules []ActionRule
for _, rg := range rs.GetAllEnabledRuleGroups() {
+ if len(flags) > 0 {
+ if flags[0] {
+ rg.SetEnv(CtxID)
+ }
+ }
enabledRules = append(enabledRules, rg.ActionRules...)
}
diff --git a/pkg/ruleset/ruleset_test.go b/pkg/ruleset/ruleset_test.go
index 4d59c21..6e04927 100644
--- a/pkg/ruleset/ruleset_test.go
+++ b/pkg/ruleset/ruleset_test.go
@@ -724,7 +724,7 @@ func TestRulesetGetAllEnabledActionRules(t *testing.T) {
},
}
- actualActionRules := ruleset.GetAllEnabledActionRules()
+ actualActionRules := ruleset.GetAllEnabledActionRules("")
if len(actualActionRules) != len(expectedActionRules) {
t.Errorf("Expected %d action rules, but got %d", len(expectedActionRules), len(actualActionRules))
diff --git a/pkg/ruleset/rulesgroup.go b/pkg/ruleset/rulesgroup.go
index ff640a0..fc991d0 100644
--- a/pkg/ruleset/rulesgroup.go
+++ b/pkg/ruleset/rulesgroup.go
@@ -27,6 +27,29 @@ import (
///// ---------------------- RuleGroup -------------------------------- /////
+/// --- Actions --- ///
+
+func (rg *RuleGroup) SetEnv(CtxID string) {
+ if rg.Env != nil {
+ for i := 0; i < len(rg.Env); i++ {
+ // Retrieve the environment variable key, value and properties
+ key := rg.Env[i].Key
+ value := rg.Env[i].Value
+ properties := rg.Env[i].Properties
+ // Set the environment variable
+ envProperties := cmn.NewKVStoreProperty(properties.Persistent, properties.Static, properties.Source, CtxID, properties.Type)
+ err := cmn.KVStore.Set(key, value, envProperties)
+ if err != nil {
+ cmn.DebugMsg(cmn.DbgLvlError, fmt.Sprintf("setting environment variable %s: %s", key, err.Error()))
+ }
+ // test
+ for _, k := range cmn.KVStore.AllKeys() {
+ cmn.DebugMsg(cmn.DbgLvlInfo, k)
+ }
+ }
+ }
+}
+
/// --- Checks --- ///
// IsValid checks if the provided RuleGroup is valid.
diff --git a/pkg/ruleset/types.go b/pkg/ruleset/types.go
index 2016a08..224a3c8 100644
--- a/pkg/ruleset/types.go
+++ b/pkg/ruleset/types.go
@@ -58,26 +58,26 @@ type CustomTime struct {
// Ruleset represents the top-level structure of the rules YAML file
type Ruleset struct {
- FormatVersion string `yaml:"format_version"`
- Author string `yaml:"author"`
- CreatedAt CustomTime `yaml:"created_at"`
- Description string `yaml:"description"`
- Name string `yaml:"ruleset_name"`
- RuleGroups []RuleGroup `yaml:"rule_groups"`
- Env []EnvSetting `yaml:"environment_settings,omitempty"`
- LoggingConf LoggingConfiguration `yaml:"logging_configuration,omitempty"`
+ FormatVersion string `yaml:"format_version"`
+ Author string `yaml:"author"`
+ CreatedAt CustomTime `yaml:"created_at"`
+ Description string `yaml:"description"`
+ Name string `yaml:"ruleset_name"`
+ RuleGroups []RuleGroup `yaml:"rule_groups"`
}
// RuleGroup represents a group of rules
type RuleGroup struct {
- GroupName string `yaml:"group_name"`
- ValidFrom CustomTime `yaml:"valid_from,omitempty"`
- ValidTo CustomTime `yaml:"valid_to,omitempty"`
- IsEnabled bool `yaml:"is_enabled"`
- ScrapingRules []ScrapingRule `yaml:"scraping_rules,omitempty"`
- ActionRules []ActionRule `yaml:"action_rules,omitempty"`
- DetectionRules []DetectionRule `yaml:"detection_rules,omitempty"`
- CrawlingRules []CrawlingRule `yaml:"crawling_rules,omitempty"`
+ GroupName string `yaml:"group_name"`
+ ValidFrom CustomTime `yaml:"valid_from,omitempty"`
+ ValidTo CustomTime `yaml:"valid_to,omitempty"`
+ IsEnabled bool `yaml:"is_enabled"`
+ ScrapingRules []ScrapingRule `yaml:"scraping_rules,omitempty"`
+ ActionRules []ActionRule `yaml:"action_rules,omitempty"`
+ DetectionRules []DetectionRule `yaml:"detection_rules,omitempty"`
+ CrawlingRules []CrawlingRule `yaml:"crawling_rules,omitempty"`
+ Env []EnvSetting `yaml:"environment_settings,omitempty"`
+ LoggingConf LoggingConfiguration `yaml:"logging_configuration,omitempty"`
}
// EnvSetting represents the environment settings for the ruleset
diff --git a/schemas/ruleset-schema.json b/schemas/ruleset-schema.json
index 989a74e..802798b 100644
--- a/schemas/ruleset-schema.json
+++ b/schemas/ruleset-schema.json
@@ -779,64 +779,64 @@
"fuzzing_parameters"
]
}
- }
- }
- }
- },
- "environment_settings": {
- "type": "array",
- "items": {
- "type": "object",
- "properties": {
- "key": {
- "type": "string",
- "description": "The name of the environment setting."
},
- "value": {
- "type": "string",
- "description": "The value of the environment setting."
+ "environment_settings": {
+ "type": "array",
+ "items": {
+ "type": "object",
+ "properties": {
+ "key": {
+ "type": "string",
+ "description": "The name of the environment setting."
+ },
+ "value": {
+ "type": "string",
+ "description": "The value of the environment setting."
+ },
+ "properties": {
+ "type": "object",
+ "properties": {
+ "persistent": {
+ "type": "boolean",
+ "description": "Optional. Flag to indicate if the environment setting should be persistent after ruleset completes execution."
+ },
+ "static": {
+ "type": "boolean",
+ "description": "Optional. Flag to indicate if the environment setting should be static and not changeable. That means that the value will be set once and never changed."
+ },
+ "source": {
+ "type": "string",
+ "description": "Optional. The source of the environment setting. If not set manually then the current URL will be used as the source."
+ }
+ }
+ }
+ }
+ },
+ "description": "Optional. Custom key value settings to use in the rules. Normally used to set environment variables for the rules."
},
- "properties": {
+ "logging_configuration": {
"type": "object",
"properties": {
- "persistent": {
- "type": "boolean",
- "description": "Optional. Flag to indicate if the environment setting should be persistent after ruleset completes execution."
- },
- "static": {
- "type": "boolean",
- "description": "Optional. Flag to indicate if the environment setting should be static and not changeable. That means that the value will be set once and never changed."
+ "log_level": {
+ "type": "string",
+ "enum": [
+ "DEBUG",
+ "INFO",
+ "WARNING",
+ "ERROR",
+ "CRITICAL"
+ ],
+ "description": "Optional. Specifies the logging level for actions and scraping activities."
},
- "source": {
+ "log_message": {
"type": "string",
- "description": "Optional. The source of the environment setting. If not set manually then the current URL will be used as the source."
+ "description": "Optional. The message you want to log if the rule matches something."
}
- }
+ },
+ "description": "rule log configuration (aka what you want to be logged when the rule execute)."
}
}
- },
- "description": "Optional. Custom key value settings to use in the rules. Normally used to set environment variables for the rules."
- },
- "logging_configuration": {
- "type": "object",
- "properties": {
- "log_level": {
- "type": "string",
- "enum": [
- "DEBUG",
- "INFO",
- "WARNING",
- "ERROR",
- "CRITICAL"
- ],
- "description": "Optional. Specifies the logging level for actions and scraping activities."
- },
- "log_message": {
- "type": "string",
- "description": "Optional. The message you want to log if the rule matches something."
- }
- },
- "description": "rule log configuration (aka what you want to be logged when the rule execute)."
+ }
}
},
"required": [
From bce1a306f834822f5522f4c8f4c190fbf66ee738 Mon Sep 17 00:00:00 2001
From: Paolo Fabio Zaino
Date: Fri, 4 Oct 2024 02:46:34 +0100
Subject: [PATCH 10/12] Added support for multi-data-type in the
environment-settings for each rulesgroup
---
pkg/common/kvstore.go | 77 +++++++++++++++++---
pkg/common/kvstore_test.go | 140 ++++++++++++++++++++++++++++++++++--
pkg/crawler/crawler.go | 4 +-
pkg/ruleset/rulesgroup.go | 12 ++--
pkg/ruleset/types.go | 108 +++++++++++++++++++++++++++-
schemas/ruleset-schema.json | 37 +++++++++-
6 files changed, 351 insertions(+), 27 deletions(-)
diff --git a/pkg/common/kvstore.go b/pkg/common/kvstore.go
index 797c5b5..a14cc92 100644
--- a/pkg/common/kvstore.go
+++ b/pkg/common/kvstore.go
@@ -16,6 +16,7 @@
package common
import (
+ "encoding/json"
"errors"
"fmt"
"reflect"
@@ -55,25 +56,32 @@ func NewKeyValueStore() *KeyValueStore {
}
// NewKVStoreProperty initializes a new Properties object.
-func NewKVStoreProperty(persistent bool, static bool, source string, ctxID string, valueType string) Properties {
+func NewKVStoreProperty(persistent bool, static bool, source string, ctxID string, Type string) Properties {
return Properties{
Persistent: persistent,
Static: static,
Source: source,
CtxID: ctxID,
- Type: valueType, // Store the original type
+ Type: Type,
}
}
// createKeyWithCtx combines the key and CtxID to create a unique key.
func createKeyWithCtx(key string, ctxID string) string {
- return fmt.Sprintf("%s:%s", key, ctxID)
+ return fmt.Sprintf("%s:%s", strings.TrimSpace(key), strings.TrimSpace(ctxID))
}
// Set sets a value (either string or []string) along with its properties for a given key and context.
func (kv *KeyValueStore) Set(key string, value interface{}, properties Properties) error {
+ key = strings.TrimSpace(key)
+ if key == "" {
+ return errors.New("key cannot be empty")
+ }
+
// Store the type of the value in the properties
- properties.Type = reflect.TypeOf(value).String()
+ if strings.TrimSpace(properties.Type) == "" {
+ properties.Type = reflect.TypeOf(value).String()
+ }
fullKey := createKeyWithCtx(key, properties.CtxID)
kv.mutex.Lock()
@@ -191,9 +199,9 @@ func (kv *KeyValueStore) Delete(key string, ctxID string, flags ...bool) error {
func (kv *KeyValueStore) DeleteByCID(ctxID string, flags ...bool) {
kv.mutex.Lock()
defer kv.mutex.Unlock()
-
+ ctxID = strings.TrimSpace(ctxID)
for key := range kv.store {
- if strings.HasSuffix(key, ctxID) {
+ if strings.HasSuffix(key, ":"+ctxID) {
removeEntry := true
// If no flags are provided, only delete non-persistent entries
@@ -228,6 +236,18 @@ func (kv *KeyValueStore) DeleteNonPersistent() {
}
}
+// DeleteNonPersistentByCID removes all key-value pairs for a given context that are not persistent.
+func (kv *KeyValueStore) DeleteNonPersistentByCID(ctxID string) {
+ kv.mutex.Lock()
+ defer kv.mutex.Unlock()
+ ctxID = strings.TrimSpace(ctxID)
+ for key := range kv.store {
+ if strings.HasSuffix(key, ":"+ctxID) && !kv.store[key].Properties.Persistent {
+ delete(kv.store, key)
+ }
+ }
+}
+
// DeleteAll clears all key-value pairs from the store.
func (kv *KeyValueStore) DeleteAll() {
kv.mutex.Lock()
@@ -236,19 +256,33 @@ func (kv *KeyValueStore) DeleteAll() {
kv.store = make(map[string]Entry)
}
-// AllKeys returns a slice of all keys in the store (ignoring context).
+// AllKeys returns a slice of all keys (without the CIDs) in the store (ignoring context).
func (kv *KeyValueStore) AllKeys() []string {
kv.mutex.RLock()
defer kv.mutex.RUnlock()
keys := make([]string, 0, len(kv.store))
for key := range kv.store {
+ // Remove the context ID from the key
+ key = key[:len(key)-len(kv.store[key].Properties.CtxID)-1]
keys = append(keys, key)
}
return keys
}
-// Keys returns a slice of all keys in the store for a given context.
+// AllKeysAndCIDs returns a slice of all keys, each with its context ID suffix, in the store.
+func (kv *KeyValueStore) AllKeysAndCIDs() []string {
+ kv.mutex.RLock()
+ defer kv.mutex.RUnlock()
+
+ keys := make([]string, 0, len(kv.store))
+ for key := range kv.store {
+ keys = append(keys, key)
+ }
+ return keys
+}
+
+// Keys returns a slice of all keys (without the CID) in the store for a given context.
func (kv *KeyValueStore) Keys(ctxID string) []string {
kv.mutex.RLock()
defer kv.mutex.RUnlock()
@@ -256,8 +290,35 @@ func (kv *KeyValueStore) Keys(ctxID string) []string {
keys := make([]string, 0, len(kv.store))
for key := range kv.store {
if key[len(key)-len(ctxID):] == ctxID {
+ // Remove the context ID from the key
+ key = key[:len(key)-len(ctxID)-1]
keys = append(keys, key)
}
}
return keys
}
+
+// ToJSON converts the key-value store to a JSON string.
+// It uses json.Marshal to convert the value to the correct JSON format.
+func (kv *KeyValueStore) ToJSON() string {
+ kv.mutex.RLock()
+ defer kv.mutex.RUnlock()
+
+ // Create a map to hold the JSON structure
+ jsonMap := make(map[string]interface{})
+
+ // Iterate through the key-value store and add entries to the map
+ for key, entry := range kv.store {
+ jsonMap[key] = entry.Value
+ }
+
+ // Marshal the map into a JSON string
+ jsonBytes, err := json.Marshal(jsonMap)
+ if err != nil {
+ // If there's an error in marshaling, return an empty JSON object
+ return "{}"
+ }
+
+ // Return the generated JSON string
+ return string(jsonBytes)
+}
diff --git a/pkg/common/kvstore_test.go b/pkg/common/kvstore_test.go
index 26d6ffe..ec21edd 100644
--- a/pkg/common/kvstore_test.go
+++ b/pkg/common/kvstore_test.go
@@ -454,11 +454,11 @@ func TestKeyValueStore_DeleteAll(t *testing.T) {
}
}
-func TestKeyValueStore_AllKeys(t *testing.T) {
+func TestKeyValueStore_AllKeysAndCIDs(t *testing.T) {
kvStore := NewKeyValueStore()
// Test with an empty store
- keys := kvStore.AllKeys()
+ keys := kvStore.AllKeysAndCIDs()
if len(keys) != 0 {
t.Fatalf("Expected 0 keys, got %d", len(keys))
}
@@ -478,7 +478,7 @@ func TestKeyValueStore_AllKeys(t *testing.T) {
}
// Test with a populated store
- keys = kvStore.AllKeys()
+ keys = kvStore.AllKeysAndCIDs()
expectedKeys := []string{
createKeyWithCtx("username", "123"),
createKeyWithCtx("email", "456"),
@@ -553,8 +553,8 @@ func TestKeyValueStore_Keys(t *testing.T) {
// Test with a populated store for a specific context
keys = kvStore.Keys("123")
expectedKeys := []string{
- createKeyWithCtx("username", "123"),
- createKeyWithCtx("password", "123"),
+ "username",
+ "password",
}
if len(keys) != len(expectedKeys) {
@@ -577,7 +577,7 @@ func TestKeyValueStore_Keys(t *testing.T) {
// Test with a different context
keys = kvStore.Keys("456")
expectedKeys = []string{
- createKeyWithCtx("email", "456"),
+ "email",
}
if len(keys) != len(expectedKeys) {
@@ -652,3 +652,131 @@ func TestKeyValueStore_DeleteByCID(t *testing.T) {
})
}
}
+
+func TestKeyValueStore_DeleteNonPersistentByCID(t *testing.T) {
+ kvStore := NewKeyValueStore()
+
+ // Prepopulate the store with some entries
+ err := kvStore.Set("username", "admin", Properties{Persistent: true, Source: "test", CtxID: "123"})
+ if err != nil {
+ t.Fatalf("Error setting key: %v", err)
+ }
+
+ err = kvStore.Set("email", "admin@example.com", Properties{Persistent: false, Source: "test", CtxID: "123"})
+ if err != nil {
+ t.Fatalf("Error setting key: %v", err)
+ }
+
+ err = kvStore.Set("password", "secret", Properties{Persistent: true, Source: "test", CtxID: "456"})
+ if err != nil {
+ t.Fatalf("Error setting key: %v", err)
+ }
+
+ err = kvStore.Set("session", "xyz", Properties{Persistent: false, Source: "test", CtxID: "456"})
+ if err != nil {
+ t.Fatalf("Error setting key: %v", err)
+ }
+
+ tests := []struct {
+ ctxID string
+ expectedRem int // Expected number of remaining entries for the given CtxID
+ }{
+ {"123", 1}, // Only persistent entry should remain
+ {"456", 1}, // Only persistent entry should remain
+ }
+
+ for _, tt := range tests {
+ t.Run(fmt.Sprintf("DeleteNonPersistentByCID_%s", tt.ctxID), func(t *testing.T) {
+ kvStore.DeleteNonPersistentByCID(tt.ctxID)
+
+ // Count the number of remaining entries for the given CtxID
+ remaining := 0
+ for key := range kvStore.store {
+ parts := strings.Split(key, ":")
+ if len(parts) == 2 && parts[1] == tt.ctxID {
+ remaining++
+ }
+ }
+
+ // Verify the number of remaining entries for the specific CtxID
+ if remaining != tt.expectedRem {
+ t.Errorf("Expected %d remaining entries for CtxID %s, got %d", tt.expectedRem, tt.ctxID, remaining)
+ }
+ })
+ }
+}
+
+func TestKeyValueStore_AllKeys(t *testing.T) {
+ kvStore := NewKeyValueStore()
+
+ // Test with an empty store
+ keys := kvStore.AllKeys()
+ if len(keys) != 0 {
+ t.Fatalf("Expected 0 keys, got %d", len(keys))
+ }
+
+ // Add some entries
+ err := kvStore.Set("username", "admin", Properties{Persistent: true, Source: "test", CtxID: "123"})
+ if err != nil {
+ t.Fatalf("Error setting key: %v", err)
+ }
+ err = kvStore.Set("email", "admin@example.com", Properties{Persistent: false, Source: "test", CtxID: "456"})
+ if err != nil {
+ t.Fatalf("Error setting key: %v", err)
+ }
+ err = kvStore.Set("password", "secret", Properties{Persistent: true, Source: "test", CtxID: ""})
+ if err != nil {
+ t.Fatalf("Error setting key: %v", err)
+ }
+
+ // Test with a populated store
+ keys = kvStore.AllKeys()
+ expectedKeys := []string{"username", "email", "password"}
+
+ if len(keys) != len(expectedKeys) {
+ t.Fatalf("Expected %d keys, got %d", len(expectedKeys), len(keys))
+ }
+
+ for _, expectedKey := range expectedKeys {
+ found := false
+ for _, key := range keys {
+ if key == expectedKey {
+ found = true
+ break
+ }
+ }
+ if !found {
+ t.Errorf("Expected key %s not found in keys", expectedKey)
+ }
+ }
+}
+
+func TestKeyValueStore_ToJSON(t *testing.T) {
+ kvStore := NewKeyValueStore()
+
+ // Prepopulate the store with string and []string entries
+ err := kvStore.Set("username", "admin", Properties{Persistent: true, Static: false, Source: "test", CtxID: "123", Type: "string"})
+ if err != nil {
+ t.Fatalf("Error setting key: %v", err)
+ }
+
+ err = kvStore.Set("email", []string{"admin@example.com", "user@example.com"}, Properties{Persistent: false, Static: false, Source: "test", CtxID: "456", Type: "[]string"})
+ if err != nil {
+ t.Fatalf("Error setting key: %v", err)
+ }
+
+ err = kvStore.Set("password", "secret", Properties{Persistent: true, Static: false, Source: "test", CtxID: "", Type: "string"})
+ if err != nil {
+ t.Fatalf("Error setting key: %v", err)
+ }
+
+ expectedJSON := `{"email:456":["admin@example.com","user@example.com"],"password:":"secret","username:123":"admin"}`
+
+ // Convert the store to JSON
+ jsonResult := kvStore.ToJSON()
+
+ // Check if the JSON output matches the expected result
+ if jsonResult != expectedJSON {
+ t.Errorf("Expected JSON %s, got %s", expectedJSON, jsonResult)
+ }
+}
diff --git a/pkg/crawler/crawler.go b/pkg/crawler/crawler.go
index c1635d3..3acffa6 100644
--- a/pkg/crawler/crawler.go
+++ b/pkg/crawler/crawler.go
@@ -1620,10 +1620,10 @@ func isExternalLink(sourceURL, linkURL string, domainLevel uint) bool {
return sourceParsed.String() != linkParsed.String()
}
- // Check if the link URL contains the source URL (if domainLevel is 1)
+ // Check if the link URL has the source URL as prefix (if domainLevel is 1)
if domainLevel == 1 {
cmn.DebugMsg(cmn.DbgLvlDebug3, "Restriction level 1, Source Domain: %s, Link Domain: %s", sourceURL, linkParsed.String())
- return !strings.Contains(linkParsed.String(), sourceURL)
+ return !strings.HasPrefix(linkParsed.String(), sourceURL)
}
// Get domain parts based on domainLevel
diff --git a/pkg/ruleset/rulesgroup.go b/pkg/ruleset/rulesgroup.go
index fc991d0..b7dff31 100644
--- a/pkg/ruleset/rulesgroup.go
+++ b/pkg/ruleset/rulesgroup.go
@@ -34,18 +34,16 @@ func (rg *RuleGroup) SetEnv(CtxID string) {
for i := 0; i < len(rg.Env); i++ {
// Retrieve the environment variable key, value and properties
key := rg.Env[i].Key
- value := rg.Env[i].Value
+ values := rg.Env[i].Values
properties := rg.Env[i].Properties
- // Set the environment variable
+
+ // Set the environment properties
envProperties := cmn.NewKVStoreProperty(properties.Persistent, properties.Static, properties.Source, CtxID, properties.Type)
- err := cmn.KVStore.Set(key, value, envProperties)
+ // Set the environment variable
+ err := cmn.KVStore.Set(key, values, envProperties)
if err != nil {
cmn.DebugMsg(cmn.DbgLvlError, fmt.Sprintf("setting environment variable %s: %s", key, err.Error()))
}
- // test
- for _, k := range cmn.KVStore.AllKeys() {
- cmn.DebugMsg(cmn.DbgLvlInfo, k)
- }
}
}
}
diff --git a/pkg/ruleset/types.go b/pkg/ruleset/types.go
index 224a3c8..fb31eae 100644
--- a/pkg/ruleset/types.go
+++ b/pkg/ruleset/types.go
@@ -17,6 +17,8 @@
package ruleset
import (
+ "encoding/json"
+ "reflect"
"sync"
"time"
@@ -83,7 +85,7 @@ type RuleGroup struct {
// EnvSetting represents the environment settings for the ruleset
type EnvSetting struct {
Key string `yaml:"key"`
- Value string `yaml:"value"`
+ Values interface{} `yaml:"values"`
Properties EnvProperties `yaml:"properties"`
}
@@ -95,6 +97,110 @@ type EnvProperties struct {
Source string `yaml:"source"`
}
+// UnmarshalJSON implements custom unmarshaling logic for EnvSetting
+func (e *EnvSetting) UnmarshalJSON(data []byte) error {
+ type Alias EnvSetting
+ aux := &struct {
+ Values json.RawMessage `json:"values"` // Read Values as raw JSON first
+ *Alias
+ }{
+ Alias: (*Alias)(e),
+ }
+
+ // Unmarshal the raw data
+ if err := json.Unmarshal(data, &aux); err != nil {
+ return err
+ }
+
+ // Now handle the "values" field, which can be multiple types
+ var value interface{}
+ if err := json.Unmarshal(aux.Values, &value); err != nil {
+ return err
+ }
+
+ // Detect and process the type of "values"
+ switch v := value.(type) {
+ case string:
+ e.Values = v
+ e.Properties.Type = "string"
+ case float64:
+ e.Values = v
+ e.Properties.Type = "number"
+ case bool:
+ e.Values = v
+ e.Properties.Type = "boolean"
+ case nil:
+ e.Values = v
+ e.Properties.Type = "null"
+ case []interface{}:
+ e.Values = processArray(v, e)
+ default:
+ e.Values = nil
+ e.Properties.Type = "unknown"
+ }
+
+ return nil
+}
+
+// Helper function to handle array processing and set the type in EnvProperties
+func processArray(arr []interface{}, e *EnvSetting) interface{} {
+ if len(arr) == 0 {
+ e.Properties.Type = "array"
+ return arr
+ }
+
+ // Check the type of the first element to guess the array type
+ switch arr[0].(type) {
+ case string:
+ e.Properties.Type = "[]string"
+ var stringArray []string
+ for _, elem := range arr {
+ stringArray = append(stringArray, elem.(string))
+ }
+ return stringArray
+ case float64:
+ e.Properties.Type = "[]float64"
+ var numberArray []float64
+ for _, elem := range arr {
+ numberArray = append(numberArray, elem.(float64))
+ }
+ return numberArray
+ case bool:
+ e.Properties.Type = "[]bool"
+ var boolArray []bool
+ for _, elem := range arr {
+ boolArray = append(boolArray, elem.(bool))
+ }
+ return boolArray
+ default:
+ e.Properties.Type = "[]unknown"
+ return arr
+ }
+}
+
+// MarshalJSON ensures the correct format when marshaling the "values" field
+func (e *EnvSetting) MarshalJSON() ([]byte, error) {
+ type Alias EnvSetting
+ aux := &struct {
+ Values interface{} `json:"values"`
+ *Alias
+ }{
+ Alias: (*Alias)(e),
+ }
+
+ // Handle different types of the "Values" field for correct JSON output
+ switch reflect.TypeOf(e.Values).Kind() {
+ case reflect.Slice, reflect.Array:
+ // If Values is a slice or array, keep it as-is
+ aux.Values = e.Values
+ default:
+ // Otherwise, marshal it as a primitive type (string, number, boolean, etc.)
+ aux.Values = e.Values
+ }
+
+ return json.Marshal(aux)
+}
+
// PreCondition represents a pre-condition for a scraping rule
type PreCondition struct {
URL string `yaml:"url"`
diff --git a/schemas/ruleset-schema.json b/schemas/ruleset-schema.json
index 802798b..8156f42 100644
--- a/schemas/ruleset-schema.json
+++ b/schemas/ruleset-schema.json
@@ -789,9 +789,40 @@
"type": "string",
"description": "The name of the environment setting."
},
- "value": {
- "type": "string",
- "description": "The value of the environment setting."
+ "values": {
+ "description": "A single or a set of values for the environment key.",
+ "anyOf": [
+ { "type": "string" },
+ {
+ "type": "array",
+ "items": { "type": "string" }
+ },
+ { "type": "number" },
+ { "type": "boolean" },
+ { "type": "object" },
+ { "type": "null" },
+ { "type": "integer" },
+ {
+ "type": "array",
+ "items": { "type": "number" }
+ },
+ {
+ "type": "array",
+ "items": { "type": "boolean" }
+ },
+ {
+ "type": "array",
+ "items": { "type": "object" }
+ },
+ {
+ "type": "array",
+ "items": { "type": "null" }
+ },
+ {
+ "type": "array",
+ "items": { "type": "integer" }
+ }
+ ]
},
"properties": {
"type": "object",
From 106af3284f372c89236c7207e55d99b04da2b49b Mon Sep 17 00:00:00 2001
From: Paolo Fabio Zaino
Date: Fri, 4 Oct 2024 18:07:02 +0100
Subject: [PATCH 11/12] Some fixing and improvements after testing last night
changes to the ruleset-schema and the handling of multi-type parameters for
both the environment settings and plugins
---
pkg/detection/detection.go | 3 +-
pkg/ruleset/types.go | 114 ++-
schemas/ruleset-schema.json | 1826 ++++++++++++++++++++---------------
schemas/ruleset-schema.yaml | 1311 +++++++++++++++----------
4 files changed, 1910 insertions(+), 1344 deletions(-)
diff --git a/pkg/detection/detection.go b/pkg/detection/detection.go
index 1cc97e5..cec80d9 100644
--- a/pkg/detection/detection.go
+++ b/pkg/detection/detection.go
@@ -491,7 +491,6 @@ func detectTechByMetaTags(responseBody string, signatures *map[string][]ruleset.
// detectTechnologiesWithPlugins runs plugins in the browser and collects the results
// to detect technologies
func detectTechnologiesWithPlugins(wd *selenium.WebDriver, re *ruleset.RuleEngine, plugins *map[string][]ruleset.PluginCall, detectedTech *map[string]detectionEntityDetails) {
- //const detectionType = "plugin"
// Iterate through all the plugins and check for possible technologies
for ObjName := range *plugins {
cmn.DebugMsg(cmn.DbgLvlDebug3, "Running plugins for: %s", ObjName)
@@ -513,7 +512,7 @@ func detectTechnologiesWithPlugins(wd *selenium.WebDriver, re *ruleset.RuleEngin
for _, arg := range args {
jsArgs = append(jsArgs, arg.ArgValue)
if strings.ToLower(strings.TrimSpace(arg.ArgName)) == "confidence" {
- confidence = cmn.StringToFloat32(arg.ArgValue)
+ confidence = cmn.StringToFloat32(strings.TrimSpace(arg.ArgValue.(string)))
}
}
}
diff --git a/pkg/ruleset/types.go b/pkg/ruleset/types.go
index fb31eae..28dc9dc 100644
--- a/pkg/ruleset/types.go
+++ b/pkg/ruleset/types.go
@@ -286,8 +286,118 @@ type PluginCall struct {
// PluginParams represents the parameters for a plugin call
type PluginParams struct {
- ArgName string `yaml:"parameter_name"`
- ArgValue string `yaml:"parameter_value"`
+ ArgName string `yaml:"parameter_name"`
+ ArgValue interface{} `yaml:"parameter_value"`
+ Properties PluginParamsProperties `yaml:"properties"`
+}
+
+// PluginParamsProperties represents the properties for the plugin parameters
+type PluginParamsProperties struct {
+ Type string `yaml:"type"`
+}
+
+// UnmarshalJSON implements custom unmarshaling logic for EnvSetting
+func (e *PluginParams) UnmarshalJSON(data []byte) error {
+ type Alias PluginParams
+ aux := &struct {
+ ArgValue json.RawMessage `json:"parameter_value"` // Read Values as raw JSON first
+ *Alias
+ }{
+ Alias: (*Alias)(e),
+ }
+
+ // Unmarshal the raw data
+ if err := json.Unmarshal(data, &aux); err != nil {
+ return err
+ }
+
+ // Now handle the "values" field, which can be multiple types
+ var value interface{}
+ if err := json.Unmarshal(aux.ArgValue, &value); err != nil {
+ return err
+ }
+
+ // Detect and process the type of "values"
+ switch v := value.(type) {
+ case string:
+ e.ArgValue = v
+ e.Properties.Type = "string"
+ case float64:
+ e.ArgValue = v
+ e.Properties.Type = "number"
+ case bool:
+ e.ArgValue = v
+ e.Properties.Type = "boolean"
+ case nil:
+ e.ArgValue = v
+ e.Properties.Type = "null"
+ case []interface{}:
+ e.ArgValue = processPlgArgArray(v, e)
+ default:
+ e.ArgValue = nil
+ e.Properties.Type = "unknown"
+ }
+
+ return nil
+}
+
+// Helper function to handle array processing and set the type in PluginParamsProperties
+func processPlgArgArray(arr []interface{}, e *PluginParams) interface{} {
+ if len(arr) == 0 {
+ e.Properties.Type = "array"
+ return arr
+ }
+
+ // Check the type of the first element to guess the array type
+ switch arr[0].(type) {
+ case string:
+ e.Properties.Type = "[]string"
+ var stringArray []string
+ for _, elem := range arr {
+ stringArray = append(stringArray, elem.(string))
+ }
+ return stringArray
+ case float64:
+ e.Properties.Type = "[]float64"
+ var numberArray []float64
+ for _, elem := range arr {
+ numberArray = append(numberArray, elem.(float64))
+ }
+ return numberArray
+ case bool:
+ e.Properties.Type = "[]bool"
+ var boolArray []bool
+ for _, elem := range arr {
+ boolArray = append(boolArray, elem.(bool))
+ }
+ return boolArray
+ default:
+ e.Properties.Type = "[]unknown"
+ return arr
+ }
+}
+
+// Custom MarshalJSON to ensure the correct format when marshaling the "parameter_value" field
+func (e *PluginParams) MarshalJSON() ([]byte, error) {
+ type Alias PluginParams
+ aux := &struct {
+ ArgValue interface{} `json:"parameter_value"`
+ *Alias
+ }{
+ Alias: (*Alias)(e),
+ }
+
+ // Handle different types of the "Values" field for correct JSON output
+ switch reflect.TypeOf(e.ArgValue).Kind() {
+ case reflect.Slice, reflect.Array:
+ // If Values is a slice or array, keep it as-is
+ aux.ArgValue = e.ArgValue
+ default:
+ // Otherwise, marshal it as a primitive type (string, number, boolean, etc.)
+ aux.ArgValue = e.ArgValue
+ }
+
+ return json.Marshal(aux)
}
// ExternalDetection represents a call to an external detection service
diff --git a/schemas/ruleset-schema.json b/schemas/ruleset-schema.json
index 8156f42..2516ef3 100644
--- a/schemas/ruleset-schema.json
+++ b/schemas/ruleset-schema.json
@@ -2,900 +2,1117 @@
"$schema": "http://json-schema.org/draft-07/schema#",
"$id": "https://github.com/pzaino/thecrowler/main/schemas/ruleset-schema.json",
"title": "CROWler Ruleset Schema",
- "version": "1.0.4",
+ "version": "1.0.5",
"description": "The CROWler ruleset schema defines the structure of a ruleset file, which contains rules for scraping, action execution, detection, and crawling.",
"type": "object",
- "items": {
- "type": "object",
- "properties": {
- "format_version": {
- "type": "string",
- "description": "Version of the ruleset format, to ensure compatibility.",
- "pattern": "^\\d+\\.\\d+\\.\\d+$",
- "examples": [
- "1.0.4"
- ]
- },
- "author": {
- "type": "string",
- "description": "The author or owner of the ruleset."
- },
- "created_at": {
- "type": "string",
- "description": "Creation date of the ruleset.",
- "pattern": "(?:(?:(?:(\\d{4})[-\\/\\.](\\d{2})[-\\/\\.](\\d{2}))|(?:(\\d{2})[-\\/\\.](\\d{2})[-\\/\\.](\\d{4})))\\s*(?:T\\s*)?)?(?:(\\d{1,2}):(\\d{2})(?::(\\d{2}))?\\s*([AaPp][Mm])?)?"
- },
- "description": {
- "type": "string",
- "description": "A brief description of what the ruleset does."
- },
- "ruleset_name": {
- "type": "string",
- "description": "A unique name identifying the ruleset.",
- "examples": [
- "My Ruleset",
- "https://example.com"
- ]
- },
- "rule_groups": {
- "type": "array",
- "items": {
- "type": "object",
- "properties": {
- "group_name": {
- "type": "string",
- "description": "A unique name identifying the group of rules.",
- "examples": [
- "My Group",
- "https://example.com"
- ]
- },
- "valid_from": {
- "type": "string",
- "format": "date-time",
- "description": "The start date from which the rule group becomes active."
- },
- "valid_to": {
- "type": "string",
- "format": "date-time",
- "description": "The end date until which the rule group remains active."
- },
- "is_enabled": {
- "type": "boolean",
- "description": "Flag to enable or disable the rule group."
- },
- "scraping_rules": {
- "type": "array",
- "items": {
- "type": "object",
- "properties": {
- "rule_name": {
- "type": "string",
- "description": "A unique name identifying the scraping rule."
- },
- "pre_conditions": {
- "type": "array",
- "description": "Conditions that must be met for the scraping to be executed.",
- "items": {
- "type": "object",
- "properties": {
- "path": {
- "type": "string",
- "description": "The specific path or pattern to match for scraping."
- },
- "url": {
- "type": "string",
- "format": "uri",
- "description": "Optional. The specific URL to which this rule applies. If omitted, the rule is considered applicable to any URL matching the path."
- }
+ "properties": {
+ "format_version": {
+ "title": "The Ruleset Format Version",
+ "description": "Version of the ruleset format, to ensure compatibility.",
+ "type": "string",
+ "pattern": "^\\d+\\.\\d+\\.\\d+$",
+ "examples": [
+ "1.0.5"
+ ]
+ },
+ "author": {
+ "title": "Author's name",
+ "description": "The name of the author or owner of the ruleset.",
+ "type": "string",
+ "examples": [
+ "John Doe",
+ "ZFP Systems Inc."
+ ]
+ },
+ "created_at": {
+ "title": "Creation Date",
+ "description": "The date of when this version of the RUleset was created. SUpports multiple date-time formats.",
+ "type": "string",
+ "pattern": "(?:(?:(?:(\\d{4})[-\/\\.](\\d{2})[-\/\\.](\\d{2}))|(?:(\\d{2})[-\/\\.](\\d{2})[-\/\\.](\\d{4})))\\s*(?:T\\s*)?)?(?:(\\d{1,2}):(\\d{2})(?::(\\d{2}))?\\s*([AaPp][Mm])?)?"
+ },
+ "description": {
+ "title": "Ruleset function description",
+ "description": "A brief description of what the ruleset does.",
+ "type": "string"
+ },
+ "ruleset_name": {
+ "title": "Ruleset Name",
+ "description": "A unique name identifying the ruleset.",
+ "type": "string",
+ "examples": [
+ "My Ruleset",
+ "https://example.com"
+ ]
+ },
+ "rule_groups": {
+ "title": "Rules Groups",
+ "description": "A list of rule groups, each containing mixes of scraping, action, detection, or crawling rules.",
+ "type": "array",
+ "items": {
+ "type": "object",
+ "properties": {
+ "group_name": {
+ "title": "Group's Name",
+ "description": "A unique name identifying the group of rules.",
+ "type": "string",
+ "examples": [
+ "My Group",
+ "https://example.com"
+ ]
+ },
+ "valid_from": {
+ "title": "Valid From",
+ "description": "The start date from which the rule group becomes active.",
+ "type": "string",
+ "format": "date-time"
+ },
+ "valid_to": {
+ "title": "Valid Till",
+ "description": "The end date until which the rule group remains active.",
+ "type": "string",
+ "format": "date-time"
+ },
+ "is_enabled": {
+ "title": "Is Enabled",
+ "description": "Flag to enable or disable the rule group.",
+ "type": "boolean"
+ },
+ "scraping_rules": {
+ "title": "Scraping Rules",
+ "description": "A list of rules to extract data from web pages.",
+ "type": "array",
+ "items": {
+ "type": "object",
+ "properties": {
+ "rule_name": {
+ "title": "Rule's Name",
+ "description": "A unique name identifying the scraping rule.",
+ "type": "string"
+ },
+ "pre_conditions": {
+ "title": "Pre-Conditions",
+ "description": "Conditions that must be met for the scraping to be executed.",
+ "type": "array",
+ "items": {
+ "type": "object",
+ "properties": {
+ "path": {
+ "type": "string",
+ "description": "The specific path or pattern to match for scraping."
+ },
+ "url": {
+ "type": "string",
+ "format": "uri",
+ "description": "Optional. The specific URL to which this rule applies. If omitted, the rule is considered applicable to any URL matching the path."
}
}
- },
- "elements": {
- "type": "array",
- "items": {
- "type": "object",
- "properties": {
- "key": {
- "type": "string"
- },
- "selectors": {
- "type": "array",
- "items": {
- "type": "object",
- "properties": {
- "selector_type": {
- "type": "string",
- "enum": [
- "css",
- "xpath",
- "id",
- "class_name",
- "class",
- "name",
- "tag_name",
- "element",
- "link_text",
- "partial_link_text",
- "regex",
- "plugin_call"
- ],
- "description": "The type of selector to use to find the element. To extract data using plugins, set this field to 'plugin_call'."
- },
- "selector": {
- "type": "string",
- "description": "The actual selector or pattern used to find the element based on the selector_type. This field is used for the plugin's name when the selector_type is 'plugin_call'."
- },
- "attribute": {
- "type": "object",
- "properties": {
- "name": {
- "type": "string",
- "description": "The name of the attribute to extract, e.g., 'class'."
- },
- "value": {
- "type": "string",
- "description": "Optional. The attribute's value of the element to extract, e.g., 'class_name'. "
- }
+ }
+ },
+ "elements": {
+ "title": "Page's Elements",
+ "description": "Defines multiple ways to find and interact with elements, allowing for CSS, XPath, and other strategies.",
+ "type": "array",
+ "items": {
+ "type": "object",
+ "properties": {
+ "key": {
+ "type": "string"
+ },
+ "selectors": {
+ "title": "Selectors",
+ "description": "Defines multiple ways to find and interact with elements, allowing for CSS, XPath, and other strategies. This field is ignored when using action_type like navigate_to_url, forward, back, refresh, close_window, accept_alert, dismiss_alert, get_alert_text, send_keys_to_alert, and take_screenshot.",
+ "type": "array",
+ "items": {
+ "type": "object",
+ "properties": {
+ "selector_type": {
+ "type": "string",
+ "enum": [
+ "css",
+ "xpath",
+ "id",
+ "class_name",
+ "class",
+ "name",
+ "tag_name",
+ "element",
+ "link_text",
+ "partial_link_text",
+ "regex",
+ "plugin_call"
+ ],
+ "description": "The type of selector to use to find the element. To extract data using plugins, set this field to 'plugin_call'."
+ },
+ "selector": {
+ "type": "string",
+ "description": "The actual selector or pattern used to find the element based on the selector_type. This field is used for the plugin's name when the selector_type is 'plugin_call'."
+ },
+ "attribute": {
+ "type": "object",
+ "properties": {
+ "name": {
+ "type": "string",
+ "description": "The name of the attribute to extract, e.g., 'class'."
},
- "description": "Optional. The attribute of the element to extract. This field is ignored when using CROWler plugins via plugin_call."
+ "value": {
+ "type": "string",
+ "description": "Optional. The attribute's value of the element to extract, e.g., 'class_name'. "
+ }
},
- "extract_all_occurrences": {
- "type": "boolean",
- "description": "Flag to extract all occurrences of the element, not just the first one. This flag has no effect when using CROWler plugins via plugin_call."
- }
+ "description": "Optional. The attribute of the element to extract. This field is ignored when using CROWler plugins via plugin_call."
},
- "required": [
- "selector_type",
- "selector"
- ]
- }
- }
- },
- "required": [
- "key",
- "selectors"
- ]
- },
- "description": "Defines multiple ways to find and interact with elements, allowing for CSS, XPath, and other strategies."
- },
- "extract_scripts": {
- "type": "boolean",
- "description": "Indicates whether the rule also has to extract scripts from a page and store them as separate web objects. This is useful for analyzing JavaScript code using 3rd party tools and vulnerability analysis."
- },
- "objects": {
- "type": "array",
- "items": {
- "rule_name": "string",
- "description": "A unique name identifying the detection rule."
- },
- "description": "Identifies specific technologies, requires correspondent detection rules."
- },
- "json_field_rename": {
- "type": "array",
- "items": {
- "type": "object",
- "properties": {
- "source_tag": {
- "type": "string",
- "description": "The JSON tag you want to rename."
- },
- "dest_tag": {
- "type": "string",
- "description": "The new name for the JSON tag."
- }
- }
- },
- "description": "Given that the CROWler scraper maps automatically HTML tags to JSON tags, you can use this feature to rename the json-html tag with whatever name you wish to use."
- },
- "wait_conditions": {
- "type": "array",
- "items": {
- "type": "object",
- "properties": {
- "condition_type": {
- "type": "string",
- "enum": [
- "element_presence",
- "element_visible",
- "plugin_call",
- "delay"
+ "extract_all_occurrences": {
+ "type": "boolean",
+ "description": "Flag to extract all occurrences of the element, not just the first one. This flag has no effect when using CROWler plugins via plugin_call."
+ }
+ },
+ "additionalProperties": false,
+ "required": [
+ "selector_type",
+ "selector"
]
- },
- "value": {
- "type": "string",
- "description": "a generic value to use with the condition, e.g., a delay in seconds, applicable for delay condition type. For delay type you can also use the CROWler exprterpreter to generate delay values at runtime, e.g., 'random(1, 3)' or 'random(random(1,3), random(5,8))'. If you're using plugin_call, then value field is ignored."
- },
- "selector": {
- "type": "string",
- "description": "The CSS selector for the element, applicable for element_presence and element_visible conditions. This field is used for the plugin's name when the condition_type is 'plugin_call'."
}
}
},
- "description": "Conditions to wait before being able to scrape the data. This to ensure page readiness. Do not use this field to wait after 'navigate_to_url' action type, it doesn't do that, instead it will wait to execute 'navigate_to_url'."
- },
- "post_processing": {
- "type": "array",
- "items": {
- "type": "object",
- "properties": {
- "step_type": {
- "type": "string",
- "enum": [
- "replace",
- "remove",
- "transform",
- "validate",
- "clean",
- "plugin_call"
- ],
- "description": "The type of post-processing step to perform on the scraped data. To use plugins to process the data, set this field to 'plugin_call' and place the plugin name in the 'details' object using a field called 'plugin_name'. Do not use 'transform' if you want to use a plugin to transform the output, use 'plugin_call' instead."
- },
- "details": {
- "type": "object",
- "description": "Detailed configuration for the post-processing step, structure depends on the step_type.",
- "additionalProperties": true
- }
+ "additionalProperties": false,
+ "required": [
+ "key",
+ "selectors"
+ ]
+ }
+ },
+ "extract_scripts": {
+ "title": "Extract Page's Scripts",
+ "description": "Indicates whether the rule also has to extract scripts from a page and store them as separate web objects. This is useful for analyzing JavaScript code using 3rd party tools and vulnerability analysis.",
+ "type": "boolean"
+ },
+ "objects": {
+ "title": "Objects",
+ "description": "Identifies specific technologies, requires correspondent detection rules.",
+ "type": "array",
+ "items": {
+ "rule_name": {
+ "type": "string",
+ "description": "A unique name identifying the detection rule.",
+ "examples": [
+ "My Object",
+ "https://example.com"
+ ]
+ }
+ }
+ },
+ "json_field_rename": {
+ "title": "JSON Fields Renaming",
+ "description": "Given that the CROWler scraper maps automatically HTML tags to JSON tags, you can use this feature to rename the json-html tag with whatever name you wish to use.",
+ "type": "array",
+ "items": {
+ "type": "object",
+ "properties": {
+ "source_tag": {
+ "type": "string",
+ "description": "The JSON tag you want to rename."
+ },
+ "dest_tag": {
+ "type": "string",
+ "description": "The new name for the JSON tag."
}
},
- "description": "Post-processing steps for the scraped data to transform, validate, or clean it. To use external APIs to process the data, use the 'transform' step type and, inside the 'details' object, specify the API endpoint and the required parameters. For example, in details, use { 'transform_type': 'api', 'api_url': 'https://api.example.com', 'timeout': 60, 'token': 'your-api-token' }.",
+ "additionalProperties": false,
"required": [
- "step_type",
- "details"
+ "source_tag",
+ "dest_tag"
]
}
},
- "required": [
- "rule_name",
- "elements"
- ]
- }
- },
- "action_rules": {
- "type": "array",
- "items": {
- "type": "object",
- "properties": {
- "rule_name": {
- "title": "Rule Name",
- "description": "A unique name identifying the action rule.",
- "type": "string"
- },
- "url": {
- "type": "string",
- "format": "uri",
- "description": "Optional. The specific URL to which this action applies or the URL to navigate to, applicable for navigate action. Do not use this field for 'navigate_to_url' action type, use instead the value field to specify the url to go to, url field is only to match the rule."
- },
- "wait_conditions": {
- "type": "array",
- "items": {
- "type": "object",
- "properties": {
- "condition_type": {
- "type": "string",
- "enum": [
- "element_presence",
- "element_visible",
- "plugin_call",
- "delay"
- ]
- },
- "value": {
- "type": "string",
- "description": "a generic value to use with the condition, e.g., a delay in seconds, applicable for delay condition type. For delay type you can also use the CROWler exprterpreter to generate delay values at runtime, e.g., 'random(1, 3)' or 'random(random(1,3), random(5,8))'."
- },
- "selector": {
- "type": "string",
- "description": "The CSS selector for the element, applicable for element_presence and element_visible conditions. If you're using plugin_call, then this field is used for the plugin name."
- }
+ "wait_conditions": {
+ "title": "Wait Conditions",
+ "description": "Conditions to wait before being able to scrape the data. This to ensure page readiness. Do not use this field to wait after 'navigate_to_url' action type, it doesn't do that, instead it will wait to execute 'navigate_to_url'.",
+ "type": "array",
+ "items": {
+ "type": "object",
+ "properties": {
+ "condition_type": {
+ "type": "string",
+ "enum": [
+ "element_presence",
+ "element_visible",
+ "plugin_call",
+ "delay"
+ ]
+ },
+ "value": {
+ "type": "string",
+ "description": "a generic value to use with the condition, e.g., a delay in seconds, applicable for delay condition type. For delay type you can also use the CROWler exprterpreter to generate delay values at runtime, e.g., 'random(1, 3)' or 'random(random(1,3), random(5,8))'. If you're using plugin_call, then value field is ignored."
+ },
+ "selector": {
+ "type": "string",
+ "description": "The CSS selector for the element, applicable for element_presence and element_visible conditions. This field is used for the plugin's name when the condition_type is 'plugin_call'."
+ }
+ },
+ "additionalProperties": false
+ }
+ },
+ "post_processing": {
+ "type": "array",
+ "items": {
+ "type": "object",
+ "properties": {
+ "step_type": {
+ "type": "string",
+ "enum": [
+ "replace",
+ "remove",
+ "transform",
+ "validate",
+ "clean",
+ "plugin_call",
+ "external_api"
+ ],
+ "description": "The type of post-processing step to perform on the scraped data. To use plugins to process the data, set this field to 'plugin_call' and place the plugin name in the 'details' object using a field called 'plugin_name'. Do not use 'transform' if you want to use a plugin to transform the output, use 'plugin_call' instead."
+ },
+ "details": {
+ "type": "object",
+ "description": "Detailed configuration for the post-processing step, structure depends on the step_type.",
+ "additionalProperties": true
}
},
- "description": "Conditions to wait for, that must be met before the action is executed. These conditions are designed to ensure that the page or elements are ready (e.g., waiting for an element to appear, or a delay). Do not use this field to wait after an action is performed, as it only applies before the action is executed."
+ "additionalProperties": false
},
- "conditions": {
+ "description": "Post-processing steps for the scraped data to transform, validate, or clean it. To use external APIs to process the data, use the 'transform' step type and, inside the 'details' object, specify the API endpoint and the required parameters. For example, in details, use { 'transform_type': 'api', 'api_url': 'https://api.example.com', 'timeout': 60, 'token': 'your-api-token' }.",
+ "required": [
+ "step_type",
+ "details"
+ ]
+ }
+ },
+ "additionalProperties": false,
+ "required": [
+ "rule_name",
+ "elements"
+ ]
+ }
+ },
+ "action_rules": {
+ "title": "Action Rules",
+ "description": "A list of rules to interact with web pages.",
+ "type": "array",
+ "items": {
+ "type": "object",
+ "properties": {
+ "rule_name": {
+ "title": "Rule Name",
+ "description": "A unique name identifying the action rule.",
+ "type": "string"
+ },
+ "url": {
+ "type": "string",
+ "format": "uri",
+ "description": "Optional. The specific URL to which this action applies or the URL to navigate to, applicable for navigate action. Do not use this field for 'navigate_to_url' action type, use instead the value field to specify the url to go to, url field is only to match the rule."
+ },
+ "wait_conditions": {
+ "type": "array",
+ "items": {
"type": "object",
"properties": {
- "type": {
+ "condition_type": {
"type": "string",
"enum": [
- "element",
- "language",
- "plugin_call"
+ "element_presence",
+ "element_visible",
+ "plugin_call",
+ "delay"
]
},
+ "value": {
+ "type": "string",
+ "description": "a generic value to use with the condition, e.g., a delay in seconds, applicable for delay condition type. For delay type you can also use the CROWler exprterpreter to generate delay values at runtime, e.g., 'random(1, 3)' or 'random(random(1,3), random(5,8))'."
+ },
"selector": {
"type": "string",
- "description": "The CSS selector to check if a given element exists, applicable for 'element'. The language id to check if a page is in a certain language, applicable for 'language'. The plugin's name if you're using plugin_call."
+ "description": "The CSS selector for the element, applicable for element_presence and element_visible conditions. If you're using plugin_call, then this field is used for the plugin name."
}
- },
- "description": "Conditions that must be met for the action to be executed. For example, you can check if a certain element exists on the page before performing an action. See this as something to do after we waited for the wait_conditions and we verify that the page is ready to perform the action."
- },
- "action_type": {
- "type": "string",
- "enum": [
- "click",
- "input_text",
- "clear",
- "drag_and_drop",
- "mouse_hover",
- "right_click",
- "double_click",
- "click_and_hold",
- "release",
- "key_down",
- "key_up",
- "navigate_to_url",
- "forward",
- "back",
- "refresh",
- "switch_to_window",
- "switch_to_frame",
- "close_window",
- "accept_alert",
- "dismiss_alert",
- "get_alert_text",
- "send_keys_to_alert",
- "scroll_to_element",
- "scroll_by_amount",
- "take_screenshot",
- "custom"
- ],
- "description": "The type of action to perform, including advanced interactions and calls to plugins.If you want to use plugins then set this field to 'custom', set selector_type field to 'plugin_call', and place the plugin name in the selector field."
+ }
},
- "selectors": {
- "type": "array",
- "items": {
- "type": "object",
- "properties": {
- "selector_type": {
- "type": "string",
- "enum": [
- "css",
- "xpath",
- "id",
- "class_name",
- "class",
- "name",
- "tag_name",
- "element",
- "link_text",
- "partial_link_text",
- "plugin_call"
- ],
- "description": "The type of selector to use to find the element."
- },
- "selector": {
- "type": "string",
- "description": "The actual selector or pattern used to find the element based on the selector_type. This field is used for the plugin's name when the selector_type is 'plugin_call'."
- },
- "attribute": {
- "type": "object",
- "properties": {
- "name": {
- "type": "string",
- "description": "The name of the attribute to match for the selector match to be valid."
- },
- "value": {
- "type": "string",
- "description": "The value to of the attribute to match for the selector to be valid."
- }
- },
- "description": "Optional. The attribute of the element to match"
- },
- "value": {
- "type": "string",
- "description": "The value within the selector that we need to match for the action. (this is NOT the value to input!)"
- }
- },
- "required": [
- "selector_type",
- "selector"
+ "description": "Conditions to wait for, that must be met before the action is executed. These conditions are designed to ensure that the page or elements are ready (e.g., waiting for an element to appear, or a delay). Do not use this field to wait after an action is performed, as it only applies before the action is executed."
+ },
+ "conditions": {
+ "type": "object",
+ "properties": {
+ "type": {
+ "type": "string",
+ "enum": [
+ "element",
+ "language",
+ "plugin_call"
]
},
- "description": "Defines multiple ways to find and interact with elements, allowing for CSS, XPath, and other strategies. This field is ignored when using action_type like navigate_to_url, forward, back, refresh, close_window, accept_alert, dismiss_alert, get_alert_text, send_keys_to_alert, and take_screenshot."
- },
- "value": {
- "type": "string",
- "description": "The value to use with the action, e.g., text to input, applicable for input_text."
+ "selector": {
+ "type": "string",
+ "description": "The CSS selector to check if a given element exists, applicable for 'element'. The language id to check if a page is in a certain language, applicable for 'language'. The plugin's name if you're using plugin_call."
+ }
},
- "error_handling": {
+ "description": "Conditions that must be met for the action to be executed. For example, you can check if a certain element exists on the page before performing an action. See this as something to do after we waited for the wait_conditions and we verify that the page is ready to perform the action."
+ },
+ "action_type": {
+ "type": "string",
+ "enum": [
+ "click",
+ "input_text",
+ "clear",
+ "drag_and_drop",
+ "mouse_hover",
+ "right_click",
+ "double_click",
+ "click_and_hold",
+ "release",
+ "key_down",
+ "key_up",
+ "navigate_to_url",
+ "forward",
+ "back",
+ "refresh",
+ "switch_to_window",
+ "switch_to_frame",
+ "close_window",
+ "accept_alert",
+ "dismiss_alert",
+ "get_alert_text",
+ "send_keys_to_alert",
+ "scroll_to_element",
+ "scroll_by_amount",
+ "take_screenshot",
+ "custom"
+ ],
+ "description": "The type of action to perform, including advanced interactions and calls to plugins.If you want to use plugins then set this field to 'custom', set selector_type field to 'plugin_call', and place the plugin name in the selector field."
+ },
+ "selectors": {
+ "type": "array",
+ "items": {
"type": "object",
"properties": {
- "ignore": {
- "type": "boolean",
- "description": "Flag to ignore errors and continue with the next action."
+ "selector_type": {
+ "type": "string",
+ "enum": [
+ "css",
+ "xpath",
+ "id",
+ "class_name",
+ "class",
+ "name",
+ "tag_name",
+ "element",
+ "link_text",
+ "partial_link_text",
+ "plugin_call"
+ ],
+ "description": "The type of selector to use to find the element."
},
- "retry_count": {
- "type": "integer",
- "description": "The number of times to retry the action on failure."
+ "selector": {
+ "type": "string",
+ "description": "The actual selector or pattern used to find the element based on the selector_type. This field is used for the plugin's name when the selector_type is 'plugin_call'."
},
- "retry_delay": {
- "type": "integer",
- "description": "The delay between retries in seconds."
+ "attribute": {
+ "type": "object",
+ "properties": {
+ "name": {
+ "type": "string",
+ "description": "The name of the attribute to match for the selector match to be valid."
+ },
+ "value": {
+ "type": "string",
+ "description": "The value to of the attribute to match for the selector to be valid."
+ }
+ },
+ "description": "Optional. The attribute of the element to match"
+ },
+ "value": {
+ "type": "string",
+ "description": "The value within the selector that we need to match for the action. (this is NOT the value to input!)"
}
},
- "description": "Error handling strategies for the action."
+ "required": [
+ "selector_type",
+ "selector"
+ ]
},
- "post_processing": {
- "type": "array",
- "items": {
- "type": "object",
- "properties": {
- "step_type": {
- "type": "string",
- "enum": [
- "collect_cookies"
- ],
- "description": "The type of post-processing step to perform after an action rule has been successfully executed. At the moment the only valid post_processing step is 'collect_cookies'."
- }
+ "description": "Defines multiple ways to find and interact with elements, allowing for CSS, XPath, and other strategies. This field is ignored when using action_type like navigate_to_url, forward, back, refresh, close_window, accept_alert, dismiss_alert, get_alert_text, send_keys_to_alert, and take_screenshot."
+ },
+ "value": {
+ "type": "string",
+ "description": "The value to use with the action, e.g., text to input, applicable for input_text."
+ },
+ "error_handling": {
+ "type": "object",
+ "properties": {
+ "ignore": {
+ "type": "boolean",
+ "description": "Flag to ignore errors and continue with the next action."
+ },
+ "retry_count": {
+ "type": "integer",
+ "description": "The number of times to retry the action on failure."
+ },
+ "retry_delay": {
+ "type": "integer",
+ "description": "The delay between retries in seconds."
+ }
+ },
+ "description": "Error handling strategies for the action."
+ },
+ "post_processing": {
+ "type": "array",
+ "items": {
+ "type": "object",
+ "properties": {
+ "step_type": {
+ "type": "string",
+ "enum": [
+ "collect_cookies"
+ ],
+ "description": "The type of post-processing step to perform after an action rule has been successfully executed. At the moment the only valid post_processing step is 'collect_cookies'."
}
}
}
+ }
+ },
+ "additionalProperties": false,
+ "required": [
+ "rule_name",
+ "action_type"
+ ],
+ "anyOf": [
+ {
+ "required": [
+ "selectors"
+ ]
},
- "required": [
- "rule_name",
- "action_type"
- ]
- }
- },
- "detection_rules": {
- "type": "array",
- "items": {
- "type": "object",
- "properties": {
- "rule_name": {
- "title": "Rule Name",
- "description": "A unique name identifying the detection rule.",
- "type": "string"
- },
- "object_name": {
- "title": "Object Name",
- "description": "The name of the object or technology to identify. This will also be the JSON key in the output. This is also the field to use for the 'implies' field if you want to imply other objects.",
- "type": "string"
- },
- "http_header_fields": {
- "type": "array",
- "items": {
- "type": "object",
- "properties": {
- "key": {
- "type": "string",
- "description": "The name of the HTTP header field."
- },
- "value": {
- "type": "array",
- "items": {
- "type": "string"
- },
- "description": "The expected value of the HTTP header field. You can use Perl-Compatible Regular Expressions (PCRE) to write your signatures and patterns."
+ {
+ "required": [
+ "value"
+ ]
+ }
+ ]
+
+ }
+ },
+ "detection_rules": {
+ "title": "Detection Rules",
+ "description": "A list of rules to detect technologies and objects on web pages.",
+ "type": "array",
+ "items": {
+ "type": "object",
+ "properties": {
+ "rule_name": {
+ "title": "Rule Name",
+ "description": "A unique name identifying the detection rule.",
+ "type": "string"
+ },
+ "object_name": {
+ "title": "Object Name",
+ "description": "The name of the object or technology to identify. This will also be the JSON key in the output. This is also the field to use for the 'implies' field if you want to imply other objects.",
+ "type": "string"
+ },
+ "http_header_fields": {
+ "type": "array",
+ "items": {
+ "type": "object",
+ "properties": {
+ "key": {
+ "type": "string",
+ "description": "The name of the HTTP header field."
+ },
+ "value": {
+ "type": "array",
+ "items": {
+ "type": "string"
},
- "confidence": {
- "type": "number",
- "description": "Optional. The confidence level for the match, ranging from 0 to 10."
- }
+ "description": "The expected value of the HTTP header field. You can use Perl-Compatible Regular Expressions (PCRE) to write your signatures and patterns."
+ },
+ "confidence": {
+ "type": "number",
+ "description": "Optional. The confidence level for the match, ranging from 0 to 10."
}
- },
- "description": "Matching patterns for HTTP header fields to identify technology."
+ }
},
- "page_content_patterns": {
- "type": "array",
- "items": {
- "type": "object",
- "properties": {
- "key": {
- "type": "string",
- "description": "The name of the tag to find in the page content."
- },
- "attribute": {
- "type": "string",
- "description": "Optional. The attribute of the tag to match, e.g., 'src' for img tag etc. (leave empty if you want to match the tag's innerText only)."
- },
- "value": {
- "type": "array",
- "items": {
- "type": "string"
- },
- "description": "The pattern to match within the tag's attribute content. You can use Perl-Compatible Regular Expressions (PCRE) to write your signatures and patterns."
- },
- "text": {
- "type": "string",
- "description": "Optional. The text to match in the tag's innerText. You can use Perl-Compatible Regular Expressions (PCRE) to write your signatures and patterns."
- },
- "confidence": {
- "type": "number",
- "description": "Optional. The confidence level for the detection, decimal number ranging from 0 to 10 (or whatever set in the detection_configuration)."
- }
+ "description": "Matching patterns for HTTP header fields to identify technology."
+ },
+ "page_content_patterns": {
+ "type": "array",
+ "items": {
+ "type": "object",
+ "properties": {
+ "key": {
+ "type": "string",
+ "description": "The name of the tag to find in the page content."
},
- "description": "Phrases or character sequences within page content indicative of specific technology."
- },
- "description": "Patterns within the page content that match specific technologies."
- },
- "certificates_patterns": {
- "type": "array",
- "items": {
- "type": "object",
- "properties": {
- "key": {
- "type": "string",
- "description": "The name of the field in an SSL/TLS certificate to find."
- },
- "value": {
- "type": "array",
- "items": {
- "type": "string"
- },
- "description": "The pattern to match within the field's value. You can use Perl-Compatible Regular Expressions (PCRE) to write your signatures and patterns."
+ "attribute": {
+ "type": "string",
+ "description": "Optional. The attribute of the tag to match, e.g., 'src' for img tag etc. (leave empty if you want to match the tag's innerText only)."
+ },
+ "value": {
+ "type": "array",
+ "items": {
+ "type": "string"
},
- "confidence": {
- "type": "number",
- "description": "Optional. The confidence level for the detection, decimal number ranging from 0 to 10 (or whatever set in the detection_configuration)."
- }
+ "description": "The pattern to match within the tag's attribute content. You can use Perl-Compatible Regular Expressions (PCRE) to write your signatures and patterns."
+ },
+ "text": {
+ "type": "string",
+ "description": "Optional. The text to match in the tag's innerText. You can use Perl-Compatible Regular Expressions (PCRE) to write your signatures and patterns."
+ },
+ "confidence": {
+ "type": "number",
+ "description": "Optional. The confidence level for the detection, decimal number ranging from 0 to 10 (or whatever set in the detection_configuration)."
}
},
- "description": "Phrases or character sequences within certain certificate's fields indicative of specific technology."
+ "description": "Phrases or character sequences within page content indicative of specific technology."
},
- "url_micro_signatures": {
- "type": "array",
- "items": {
- "type": "object",
- "properties": {
- "value": {
- "type": "string",
- "description": "The micro-signature to match in the URL. You can use Perl-Compatible Regular Expressions (PCRE) to write your signatures and patterns."
- },
- "confidence": {
- "type": "number",
- "description": "Optional. The confidence level for the match, decimal number ranging from 0 to 10 (or whatever set in the detection_configuration)."
- }
+ "description": "Patterns within the page content that match specific technologies."
+ },
+ "certificates_patterns": {
+ "type": "array",
+ "items": {
+ "type": "object",
+ "properties": {
+ "key": {
+ "type": "string",
+ "description": "The name of the field in an SSL/TLS certificate to find."
},
- "description": "Micro-signatures in URLs that indicate a specific technology, like '/wp-admin' for WordPress."
- },
- "description": "URL patterns indicative of specific technologies."
- },
- "meta_tags": {
- "type": "array",
- "items": {
- "type": "object",
- "properties": {
- "name": {
- "type": "string",
- "description": "The name attribute of the meta tag."
+ "value": {
+ "type": "array",
+ "items": {
+ "type": "string"
},
- "content": {
- "type": "string",
- "description": "The content attribute of the meta tag, which holds the value to match. You can use Perl-Compatible Regular Expressions (PCRE) to write your signatures and patterns."
- }
+ "description": "The pattern to match within the field's value. You can use Perl-Compatible Regular Expressions (PCRE) to write your signatures and patterns."
+ },
+ "confidence": {
+ "type": "number",
+ "description": "Optional. The confidence level for the detection, decimal number ranging from 0 to 10 (or whatever set in the detection_configuration)."
}
- },
- "description": "Matching patterns for meta tags to identify technology."
- },
- "implies": {
- "type": "array",
- "items": {
- "type": "string"
- },
- "description": "Optional. A list of object names that this rule implies, e.g., if this rule matches, it implies that the object names in this list are also present."
+ }
},
- "plugin_calls": {
- "type": "array",
- "items": {
- "type": "object",
- "properties": {
- "plugin_name": {
- "type": "string",
- "description": "The name of the plugin to call."
- },
- "plugin_parameters": {
- "type": "array",
- "items": {
- "type": "object",
- "properties": {
- "parameter_name": {
- "type": "string",
- "description": "The name of the parameter to pass to the plugin."
- },
- "parameter_value": {
- "type": "string",
- "description": "The value of the parameter to pass to the plugin."
- }
- }
- },
- "description": "The parameters to pass to the plugin."
- }
+ "description": "Phrases or character sequences within certain certificate's fields indicative of specific technology."
+ },
+ "url_micro_signatures": {
+ "type": "array",
+ "items": {
+ "type": "object",
+ "properties": {
+ "value": {
+ "type": "string",
+ "description": "The micro-signature to match in the URL. You can use Perl-Compatible Regular Expressions (PCRE) to write your signatures and patterns."
+ },
+ "confidence": {
+ "type": "number",
+ "description": "Optional. The confidence level for the match, decimal number ranging from 0 to 10 (or whatever set in the detection_configuration)."
}
},
- "description": "Optional. Call a plugin to detect the technology."
+ "description": "Micro-signatures in URLs that indicate a specific technology, like '/wp-admin' for WordPress."
},
- "external_detection": {
- "type": "array",
- "items": {
- "type": "object",
- "properties": {
- "provider": {
- "title": "Provider",
- "description": "The name of the supported external detection provider.",
- "type": "string",
- "enum": [
- "abuse_ipdb",
- "alien_vault",
- "censys",
- "cisco_umbrella",
- "grey_noise",
- "google_safe_browsing",
- "hybrid_analysis",
- "ip_quality_score",
- "ipvoid",
- "malware_domain_list",
- "shodan",
- "virus_total",
- "url_haus"
- ],
- "examples": [
- "abuse_ipdb",
- "alien_vault",
- "censys",
- "cisco_umbrella",
- "grey_noise",
- "google_safe_browsing",
- "hybrid_analysis",
- "ip_quality_score",
- "ipvoid",
- "malware_domain_list",
- "shodan",
- "virus_total",
- "url_haus"
- ]
- }
+ "description": "URL patterns indicative of specific technologies."
+ },
+ "meta_tags": {
+ "type": "array",
+ "items": {
+ "type": "object",
+ "properties": {
+ "name": {
+ "type": "string",
+ "description": "The name attribute of the meta tag."
+ },
+ "content": {
+ "type": "string",
+ "description": "The content attribute of the meta tag, which holds the value to match. You can use Perl-Compatible Regular Expressions (PCRE) to write your signatures and patterns."
}
}
- }
- },
- "required": [
- "rule_name",
- "object_name"
- ],
- "anyOf": [
- {
- "required": [
- "http_header_fields"
- ]
},
- {
- "required": [
- "page_content_patterns"
- ]
- },
- {
- "required": [
- "certificates_patterns"
- ]
- },
- {
- "required": [
- "url_micro_signatures"
- ]
- },
- {
- "required": [
- "meta_tags"
- ]
- },
- {
- "required": [
- "implies"
- ]
- },
- {
- "required": [
- "plugin_calls"
- ]
+ "description": "Matching patterns for meta tags to identify technology."
+ },
+ "implies": {
+ "title": "Implies",
+              "description": "Optional. A list of rule names that this rule implies, e.g., if this rule matches, it implies that the rules in this list also match.",
+ "type": "array",
+ "items": {
+ "type": "string"
}
- ]
- }
- },
- "crawling_rules": {
- "type": "array",
- "items": {
- "type": "object",
- "properties": {
- "rule_name": {
- "type": "string",
- "description": "A unique name identifying the crawling rule."
- },
- "request_type": {
- "type": "string",
- "enum": [
- "GET",
- "POST"
- ],
- "description": "The type of request to perform for fuzzing."
- },
- "target_elements": {
- "type": "array",
- "items": {
- "type": "object",
- "properties": {
- "selector_type": {
- "type": "string",
- "enum": [
- "css",
- "xpath",
- "form"
- ]
- },
- "selector": {
- "type": "string",
- "description": "The actual selector or form name used to find and interact with the target elements for fuzzing."
- }
+ },
+ "plugin_calls": {
+ "title": "Plugin Calls",
+ "description": "Optional. Call a plugin to detect the technology.",
+ "type": "array",
+ "items": {
+ "type": "object",
+ "properties": {
+ "plugin_name": {
+ "type": "string",
+ "description": "The name of the plugin to call."
},
- "required": [
- "selector_type",
- "selector"
- ]
- },
- "description": "Specifies the elements to target for fuzzing, including forms."
- },
- "fuzzing_parameters": {
- "type": "array",
- "items": {
- "type": "object",
- "properties": {
- "parameter_name": {
- "type": "string",
- "description": "Name of the parameter to fuzz."
- },
- "fuzzing_type": {
- "type": "string",
- "enum": [
- "fixed_list",
- "pattern_based"
- ],
- "description": "The fuzzing strategy to use for the parameter."
- },
- "values": {
- "type": "array",
- "items": {
- "type": "string"
+ "plugin_parameters": {
+ "title": "Plugin's Parameters",
+ "description": "The parameters to pass to the plugin.",
+ "type": "array",
+ "items": {
+ "type": "object",
+ "properties": {
+ "parameter_name": {
+ "type": "string",
+ "description": "The name of the parameter to pass to the plugin."
+ },
+ "parameter_value": {
+ "description": "The value of the parameter to pass to the plugin.",
+ "anyOf": [
+ {
+ "title": "Object",
+ "type": "object"
+ },
+ {
+ "title": "String",
+ "type": "string"
+ },
+ {
+ "title": "Number",
+ "type": "number"
+ },
+ {
+ "title": "Boolean",
+ "type": "boolean"
+ },
+ {
+ "title": "Null",
+ "type": "null"
+ },
+ {
+ "title": "Integer",
+ "type": "integer"
+ },
+ {
+ "title": "Array of Strings",
+ "type": "array",
+ "items": {
+ "type": "string"
+ }
+ },
+ {
+ "title": "Array of Numbers",
+ "type": "array",
+ "items": {
+ "type": "number"
+ }
+ },
+ {
+ "title": "Array of Booleans",
+ "type": "array",
+ "items": {
+ "type": "boolean"
+ }
+ },
+ {
+ "title": "Array of Objects",
+ "type": "array",
+ "items": {
+ "type": "object"
+ }
+ },
+ {
+ "title": "Array of Nulls",
+ "type": "array",
+ "items": {
+ "type": "null"
+ }
+ },
+ {
+ "title": "Array of Integers",
+ "type": "array",
+ "items": {
+ "type": "integer"
+ }
+ }
+ ],
+ "examples": [
+ "my_api_key",
+ "my_db_password",
+ "700",
+ "true",
+ "1.76",
+ "['value1', 'value2']"
+ ]
+ }
},
- "description": "List of values to use for fuzzing, applicable if 'fuzzing_type' is 'fixed_list'."
+ "additionalProperties": false,
+ "required": [
+ "parameter_name",
+ "parameter_value"
+ ]
},
- "pattern": {
- "type": "string",
- "description": "A pattern to generate fuzzing values, applicable if 'fuzzing_type' is 'pattern_based'."
- }
- },
- "required": [
- "parameter_name",
- "fuzzing_type"
- ]
+ "examples": [
+ {
+ "parameter_name": "api_key",
+ "parameter_value": "my_api_key"
+ },
+ {
+ "parameter_name": "db_password",
+ "parameter_value": "my_db_password"
+ }
+ ]
+ }
},
- "description": "Defines the parameters to fuzz and the strategy for generating fuzz values."
+ "additionalProperties": false
}
},
- "required": [
- "rule_name",
- "request_type",
- "target_elements",
- "fuzzing_parameters"
- ]
- }
- },
- "environment_settings": {
- "type": "array",
- "items": {
- "type": "object",
- "properties": {
- "key": {
- "type": "string",
- "description": "The name of the environment setting."
- },
- "values": {
- "description": "A single or a set of values for the environment key.",
- "anyOf": [
- { "type": "string" },
- {
- "type": "array",
- "items": { "type": "string" }
- },
- { "type": "number" },
- { "type": "boolean" },
- { "type": "object" },
- { "type": "null" },
- { "type": "integer" },
- {
- "type": "array",
- "items": { "type": "number" }
- },
- {
- "type": "array",
- "items": { "type": "boolean" }
- },
- {
- "type": "array",
- "items": { "type": "object" }
- },
- {
- "type": "array",
- "items": { "type": "null" }
+ "external_detection": {
+ "type": "array",
+ "items": {
+ "type": "object",
+ "properties": {
+ "provider": {
+ "title": "Provider",
+ "description": "The name of the supported external detection provider.",
+ "type": "string",
+ "enum": [
+ "abuse_ipdb",
+ "alien_vault",
+ "censys",
+ "cisco_umbrella",
+ "grey_noise",
+ "google_safe_browsing",
+ "hybrid_analysis",
+ "ip_quality_score",
+ "ipvoid",
+ "malware_domain_list",
+ "shodan",
+ "virus_total",
+ "url_haus"
+ ],
+ "examples": [
+ "abuse_ipdb",
+ "alien_vault",
+ "censys",
+ "cisco_umbrella",
+ "grey_noise",
+ "google_safe_browsing",
+ "hybrid_analysis",
+ "ip_quality_score",
+ "ipvoid",
+ "malware_domain_list",
+ "shodan",
+ "virus_total",
+ "url_haus"
+ ]
+ }
+ }
+ }
+ }
+ },
+ "additionalProperties": false,
+ "required": [
+ "rule_name",
+ "object_name"
+ ],
+ "anyOf": [
+ {
+ "required": [
+ "http_header_fields"
+ ]
+ },
+ {
+ "required": [
+ "page_content_patterns"
+ ]
+ },
+ {
+ "required": [
+ "certificates_patterns"
+ ]
+ },
+ {
+ "required": [
+ "url_micro_signatures"
+ ]
+ },
+ {
+ "required": [
+ "meta_tags"
+ ]
+ },
+ {
+ "required": [
+ "implies"
+ ]
+ },
+ {
+ "required": [
+ "plugin_calls"
+ ]
+ }
+ ]
+ }
+ },
+ "crawling_rules": {
+ "title": "Crawling Rules",
+ "description": "A list of rules to crawl web pages and fuzz parameters.",
+ "type": "array",
+ "items": {
+ "type": "object",
+ "properties": {
+ "rule_name": {
+ "type": "string",
+ "description": "A unique name identifying the crawling rule."
+ },
+ "request_type": {
+ "type": "string",
+ "enum": [
+ "GET",
+ "POST"
+ ],
+ "description": "The type of request to perform for fuzzing."
+ },
+ "target_elements": {
+ "type": "array",
+ "items": {
+ "type": "object",
+ "properties": {
+ "selector_type": {
+ "type": "string",
+ "enum": [
+ "css",
+ "xpath",
+ "form"
+ ]
},
- {
- "type": "array",
- "items": { "type": "integer" }
+ "selector": {
+ "type": "string",
+ "description": "The actual selector or form name used to find and interact with the target elements for fuzzing."
}
+ },
+ "required": [
+ "selector_type",
+ "selector"
]
},
- "properties": {
+ "description": "Specifies the elements to target for fuzzing, including forms."
+ },
+ "fuzzing_parameters": {
+ "type": "array",
+ "items": {
"type": "object",
"properties": {
- "persistent": {
- "type": "boolean",
- "description": "Optional. Flag to indicate if the environment setting should be persistent after ruleset completes execution."
+ "parameter_name": {
+ "type": "string",
+ "description": "Name of the parameter to fuzz."
+ },
+ "fuzzing_type": {
+ "type": "string",
+ "enum": [
+ "fixed_list",
+ "pattern_based"
+ ],
+ "description": "The fuzzing strategy to use for the parameter."
},
- "static": {
- "type": "boolean",
- "description": "Optional. Flag to indicate if the environment setting should be static and not changeable. That means that the value will be set once and never changed."
+ "values": {
+ "type": "array",
+ "items": {
+ "type": "string"
+ },
+ "description": "List of values to use for fuzzing, applicable if 'fuzzing_type' is 'fixed_list'."
},
- "source": {
+ "pattern": {
"type": "string",
- "description": "Optional. The source of the environment setting. If not set manually then the current URL will be used as the source."
+ "description": "A pattern to generate fuzzing values, applicable if 'fuzzing_type' is 'pattern_based'."
}
- }
- }
+ },
+ "required": [
+ "parameter_name",
+ "fuzzing_type"
+ ]
+ },
+ "description": "Defines the parameters to fuzz and the strategy for generating fuzz values."
}
},
- "description": "Optional. Custom key value settings to use in the rules. Normally used to set environment variables for the rules."
- },
- "logging_configuration": {
+ "additionalProperties": false,
+ "required": [
+ "rule_name",
+ "request_type",
+ "target_elements",
+ "fuzzing_parameters"
+ ]
+ }
+ },
+ "environment_settings": {
+ "title": "Environment Settings",
+ "description": "Optional. Custom key value settings to use in the rules. Normally used to set environment variables for the rules.",
+ "type": "array",
+ "items": {
"type": "object",
"properties": {
- "log_level": {
+ "key": {
+ "title": "Setting's Name",
+ "description": "The name of the environment setting. It has to be unique within the Rulesgroup namespace.",
"type": "string",
- "enum": [
- "DEBUG",
- "INFO",
- "WARNING",
- "ERROR",
- "CRITICAL"
+ "examples": [
+ "API_KEY",
+ "DB_PASSWORD"
+ ]
+ },
+ "value": {
+ "title": "Setting's Value",
+ "description": "A single or a set of values for the environment key.",
+ "anyOf": [
+ {
+ "title": "Object",
+ "type": "object"
+ },
+ {
+ "title": "String",
+ "type": "string"
+ },
+ {
+ "title": "Number",
+ "type": "number"
+ },
+ {
+ "title": "Boolean",
+ "type": "boolean"
+ },
+ {
+ "title": "Null",
+ "type": "null"
+ },
+ {
+ "title": "Integer",
+ "type": "integer"
+ },
+ {
+ "title": "Array of Strings",
+ "type": "array",
+ "items": {
+ "type": "string"
+ }
+ },
+ {
+ "title": "Array of Numbers",
+ "type": "array",
+ "items": {
+ "type": "number"
+ }
+ },
+ {
+ "title": "Array of Booleans",
+ "type": "array",
+ "items": {
+ "type": "boolean"
+ }
+ },
+ {
+ "title": "Array of Objects",
+ "type": "array",
+ "items": {
+ "type": "object"
+ }
+ },
+ {
+ "title": "Array of Nulls",
+ "type": "array",
+ "items": {
+ "type": "null"
+ }
+ },
+ {
+ "title": "Array of Integers",
+ "type": "array",
+ "items": {
+ "type": "integer"
+ }
+ }
],
- "description": "Optional. Specifies the logging level for actions and scraping activities."
+ "examples": [
+ "my_api_key",
+ "my_db_password",
+ "my_secret_key",
+ "500",
+ "true",
+ "1.76",
+ "['value1', 'value2']"
+ ]
},
- "log_message": {
- "type": "string",
- "description": "Optional. The message you want to log if the rule matches something."
+ "properties": {
+ "title": "Setting's Properties",
+ "description": "Optional. Additional properties for the environment setting. These properties are used to define the behavior of the environment setting.",
+ "type": "object",
+ "properties": {
+ "persistent": {
+ "type": "boolean",
+ "description": "Optional. Flag to indicate if the environment setting should be persistent after ruleset completes execution."
+ },
+ "static": {
+ "type": "boolean",
+ "description": "Optional. Flag to indicate if the environment setting should be static and not changeable. That means that the value will be set once and never changed."
+ },
+ "source": {
+ "type": "string",
+ "description": "Optional. The source of the environment setting. If not set manually then the current URL will be used as the source."
+ }
+ },
+ "additionalProperties": false
}
},
- "description": "rule log configuration (aka what you want to be logged when the rule execute)."
+ "additionalProperties": false,
+ "required": [
+ "key",
+ "value"
+ ]
}
+ },
+ "logging_configuration": {
+ "type": "object",
+ "properties": {
+ "log_level": {
+ "type": "string",
+ "enum": [
+ "DEBUG",
+ "INFO",
+ "WARNING",
+ "ERROR",
+ "CRITICAL"
+ ],
+ "description": "Optional. Specifies the logging level for actions and scraping activities."
+ },
+ "log_message": {
+ "type": "string",
+ "description": "Optional. The message you want to log if the rule matches something."
+ }
+ },
+              "description": "Rule log configuration (aka what you want to be logged when the rule executes)."
}
- }
- }
- },
- "required": [
- "group_name",
- "is_enabled"
- ],
- "anyOf": [
- {
- "required": [
- "scraping_rules"
- ]
+ },
+ "additionalProperties": false
},
- {
- "required": [
- "action_rules"
- ]
- },
- {
- "required": [
- "detection_rules"
- ]
- },
- {
- "required": [
- "crawling_rules"
- ]
- }
- ]
+ "required": [
+ "group_name",
+ "is_enabled"
+ ],
+ "anyOf": [
+ {
+ "required": [
+ "scraping_rules"
+ ]
+ },
+ {
+ "required": [
+ "action_rules"
+ ]
+ },
+ {
+ "required": [
+ "detection_rules"
+ ]
+ },
+ {
+ "required": [
+ "crawling_rules"
+ ]
+ }
+ ]
+ }
},
"required": [
"ruleset_name",
@@ -904,6 +1121,5 @@
"created_at",
"author",
"description"
- ],
- "minItems": 1
+ ]
}
diff --git a/schemas/ruleset-schema.yaml b/schemas/ruleset-schema.yaml
index c870871..d3d06dc 100644
--- a/schemas/ruleset-schema.yaml
+++ b/schemas/ruleset-schema.yaml
@@ -1,574 +1,815 @@
-# TheCROWler scrapping and action rules YAML Schema (draft v1.0.4)
+# TheCROWler scrapping and action rules YAML Schema (draft v1.0.5)
# Copyright (c) 2022 Paolo Fabio Zaino, distributed under Apache 2.0 license
---
$schema: "http://json-schema.org/draft-07/schema#"
-type: "object"
+$id: "https://github.com/pzaino/thecrowler/main/schemas/ruleset-schema.json"
+title: "CROWler Ruleset Schema"
+version: "1.0.5"
description: "The CROWler ruleset schema defines the structure of a ruleset file, which contains rules for scraping, action execution, detection, and crawling."
-items:
- type: "object"
- properties:
- format_version:
- type: "string"
- description: "Version of the ruleset format, to ensure compatibility."
- author:
- type: "string"
- description: "The author or owner of the ruleset."
- created_at:
- type: "string"
- format: "date-time"
- description: "Creation date of the ruleset."
- description:
- type: "string"
- description: "A brief description of what the ruleset does."
- ruleset_name:
- type: "string"
- description: "A unique name identifying the ruleset."
- rule_groups:
- type: "array"
- items:
- type: "object"
- properties:
- group_name:
- type: "string"
- description: "A unique name identifying the group of rules."
- valid_from:
- type: "string"
- format: "date-time"
- description: "The start date from which the rule group becomes active."
- valid_to:
- type: "string"
- format: "date-time"
- description: "The end date until which the rule group remains active."
- is_enabled:
- type: "boolean"
- description: "Flag to enable or disable the rule group."
- scraping_rules:
- type: "array"
- items:
- type: "object"
- properties:
- rule_name:
- type: "string"
- description: "A unique name identifying the scraping rule."
- pre_conditions:
- type: "array"
- description: "Conditions that must be met for the scraping to be executed."
- items:
- type: "object"
- properties:
- path:
- type: "string"
- description: "The specific path or pattern to match for scraping."
- url:
- type: "string"
- format: "uri"
- description: "Optional. The specific URL to which this rule applies. If omitted, the rule is considered applicable to any URL matching the path."
- elements:
- type: "array"
- items:
- type: "object"
- properties:
- key:
- type: "string"
- selectors:
- type: "array"
- items:
- type: "object"
- properties:
- selector_type:
- type: "string"
- enum:
- - "css"
- - "xpath"
- - "id"
- - "class_name"
- - "class"
- - "name"
- - "tag_name"
- - "element"
- - "link_text"
- - "partial_link_text"
- - "regex"
- - "plugin_call"
- description: "The type of selector to use to find the element. To extract data using plugins, set this field to 'plugin_call'."
- selector:
- type: "string"
- description: "The actual selector or pattern used to find the element based on the selector_type. This field is used for the plugin's name when the selector_type is 'plugin_call'."
- attribute:
- type: "object"
- properties:
- name:
- type: "string"
- description: "The name of the attribute to extract, e.g., 'class'."
- value:
- type: "string"
- description: "Optional. The attribute's value of the element to extract, e.g., 'class_name'. "
- description: "Optional. The attribute of the element to extract. This field is ignored when using CROWler plugins via plugin_call."
- extract_all_occurrences:
- type: "boolean"
- description: "Flag to extract all occurrences of the element, not just the first one. This flag has no effect when using CROWler plugins via plugin_call."
- required:
- - "selector_type"
- - "selector"
- required:
- - "key"
- - "selectors"
- description: "Defines multiple ways to find and interact with elements, allowing for CSS, XPath, and other strategies."
- extract_scripts:
- type: "boolean"
- description: "Indicates whether the rule also has to extract scripts from a page and store them as separate web objects. This is useful for analyzing JavaScript code using 3rd party tools and vulnerability analysis."
- objects:
- type: "array"
- items:
- rule_name: "string"
- description: "A unique name identifying the detection rule."
- description: "Identifies specific technologies, requires correspondent detection rules."
- json_field_rename:
- type: "array"
- items:
- type: "object"
- properties:
- source_tag:
- type: "string"
- description: "The JSON tag you want to rename."
- dest_tag:
- type: "string"
- description: "The new name for the JSON tag."
- description: "Given that the CROWler scraper maps automatically HTML tags to JSON tags, you can use this feature to rename the json-html tag with whatever name you wish to use."
- wait_conditions:
- type: "array"
- items:
- type: "object"
- properties:
- condition_type:
- type: "string"
- enum:
- - "element_presence"
- - "element_visible"
- - "plugin_call"
- - "delay"
- value:
- type: "string"
- description: "a generic value to use with the condition, e.g., a delay in seconds, applicable for delay condition type. For delay type you can also use the CROWler exprterpreter to generate delay values at runtime, e.g., 'random(1, 3)' or 'random(random(1,3), random(5,8))'. If you're using plugin_call, then value field is ignored."
- selector:
- type: "string"
- description: "The CSS selector for the element, applicable for element_presence and element_visible conditions. This field is used for the plugin's name when the condition_type is 'plugin_call'."
- description: "Conditions to wait before being able to scrape the data. This to ensure page readiness. Do not use this field to wait after 'navigate_to_url' action type, it doesn't do that, instead it will wait to execute 'navigate_to_url'."
- post_processing:
- type: "array"
- items:
- type: "object"
- properties:
- step_type:
- type: "string"
- enum:
- - "replace"
- - "remove"
- - "transform"
- - "validate"
- - "clean"
- - "plugin_call"
- description: "The type of post-processing step to perform on the scraped data. To use plugins to process the data, set this field to 'plugin_call' and place the plugin name in the 'details' object using a field called 'plugin_name'. Do not use 'transform' if you want to use a plugin to transform the output, use 'plugin_call' instead."
- details:
+type: "object"
+properties:
+ format_version:
+ title: "The Ruleset Format Version"
+ description: "Version of the ruleset format, to ensure compatibility."
+ type: "string"
+ pattern: "^\\d+\\.\\d+\\.\\d+$"
+ examples:
+ - "1.0.5"
+ author:
+ title: "Author's name"
+ description: "The name of the author or owner of the ruleset."
+ type: "string"
+ examples:
+ - "John Doe"
+ - "ZFP Systems Inc."
+ created_at:
+ title: "Creation Date"
+    description: "The date when this version of the Ruleset was created. Supports multiple date-time formats."
+ type: "string"
+ pattern: "(?:(?:(?:(\\d{4})[-/\\.](\\d{2})[-/\\.](\\d{2}))|(?:(\\d{2})[-/\\.](\\d{2})[-/\\.](\\d{4})))\\s*(?:T\\s*)?)?(?:(\\d{1,2}):(\\d{2})(?::(\\d{2}))?\\s*([AaPp][Mm])?)?"
+ description:
+ title: "Ruleset function description"
+ description: "A brief description of what the ruleset does."
+ type: "string"
+ ruleset_name:
+ title: "Ruleset Name"
+ description: "A unique name identifying the ruleset."
+ type: "string"
+ examples:
+ - "My Ruleset"
+ - "https://example.com"
+ rule_groups:
+ title: "Rules Groups"
+ description: "A list of rule groups, each containing mixes of scraping, action, detection, or crawling rules."
+ type: "array"
+ items:
+ type: "object"
+ properties:
+ group_name:
+ title: "Group's Name"
+ description: "A unique name identifying the group of rules."
+ type: "string"
+ examples:
+ - "My Group"
+ - "https://example.com"
+ valid_from:
+ title: "Valid From"
+ description: "The start date from which the rule group becomes active."
+ type: "string"
+ format: "date-time"
+ valid_to:
+ title: "Valid Till"
+ description: "The end date until which the rule group remains active."
+ type: "string"
+ format: "date-time"
+ is_enabled:
+ title: "Is Enabled"
+ description: "Flag to enable or disable the rule group."
+ type: "boolean"
+ scraping_rules:
+ title: "Scraping Rules"
+ description: "A list of rules to extract data from web pages."
+ type: "array"
+ items:
+ type: "object"
+ properties:
+ rule_name:
+ title: "Rule's Name"
+ description: "A unique name identifying the scraping rule."
+ type: "string"
+ pre_conditions:
+ title: "Pre-Conditions"
+ description: "Conditions that must be met for the scraping to be executed."
+ type: "array"
+ items:
+ type: "object"
+ properties:
+ path:
+ type: "string"
+ description: "The specific path or pattern to match for scraping."
+ url:
+ type: "string"
+ format: "uri"
+ description: "Optional. The specific URL to which this rule applies. If omitted, the rule is considered applicable to any URL matching the path."
+ elements:
+ title: "Page's Elements"
+ description: "Defines multiple ways to find and interact with elements, allowing for CSS, XPath, and other strategies."
+ type: "array"
+ items:
+ type: "object"
+ properties:
+ key:
+ type: "string"
+ selectors:
+ title: "Selectors"
+ description: "Defines multiple ways to find and interact with elements, allowing for CSS, XPath, and other strategies. This field is ignored when using action_type like navigate_to_url, forward, back, refresh, close_window, accept_alert, dismiss_alert, get_alert_text, send_keys_to_alert, and take_screenshot."
+ type: "array"
+ items:
type: "object"
- description: "Detailed configuration for the post-processing step, structure depends on the step_type."
- additionalProperties: "true"
- description: "Post-processing steps for the scraped data to transform, validate, or clean it. To use external APIs to process the data, use the 'transform' step type and, inside the 'details' object, specify the API endpoint and the required parameters. For example, in details, use { 'transform_type': 'api', 'api_url': 'https://api.example.com', 'timeout': 60, 'token': 'your-api-token' }."
+ properties:
+ selector_type:
+ type: "string"
+ enum:
+ - "css"
+ - "xpath"
+ - "id"
+ - "class_name"
+ - "class"
+ - "name"
+ - "tag_name"
+ - "element"
+ - "link_text"
+ - "partial_link_text"
+ - "regex"
+ - "plugin_call"
+ description: "The type of selector to use to find the element. To extract data using plugins, set this field to 'plugin_call'."
+ selector:
+ type: "string"
+ description: "The actual selector or pattern used to find the element based on the selector_type. This field is used for the plugin's name when the selector_type is 'plugin_call'."
+ attribute:
+ type: "object"
+ properties:
+ name:
+ type: "string"
+ description: "The name of the attribute to extract, e.g., 'class'."
+ value:
+ type: "string"
+ description: "Optional. The attribute's value of the element to extract, e.g., 'class_name'. "
+ description: "Optional. The attribute of the element to extract. This field is ignored when using CROWler plugins via plugin_call."
+ extract_all_occurrences:
+ type: "boolean"
+ description: "Flag to extract all occurrences of the element, not just the first one. This flag has no effect when using CROWler plugins via plugin_call."
+                        additionalProperties: false
+ required:
+ - "selector_type"
+ - "selector"
+                  additionalProperties: false
required:
- - "step_type"
- - "details"
+ - "key"
+ - "selectors"
+ extract_scripts:
+ title: "Extract Page's Scripts"
+ description: "Indicates whether the rule also has to extract scripts from a page and store them as separate web objects. This is useful for analyzing JavaScript code using 3rd party tools and vulnerability analysis."
+ type: "boolean"
+ objects:
+ title: "Objects"
+ description: "Identifies specific technologies, requires correspondent detection rules."
+ type: "array"
+ items:
+ rule_name:
+ type: "string"
+ description: "A unique name identifying the detection rule."
+ examples:
+ - "My Object"
+ - "https://example.com"
+ json_field_rename:
+ title: "JSON Fields Renaming"
+ description: "Given that the CROWler scraper maps automatically HTML tags to JSON tags, you can use this feature to rename the json-html tag with whatever name you wish to use."
+ type: "array"
+ items:
+ type: "object"
+ properties:
+ source_tag:
+ type: "string"
+ description: "The JSON tag you want to rename."
+ dest_tag:
+ type: "string"
+ description: "The new name for the JSON tag."
+                  additionalProperties: false
+ required:
+ - "source_tag"
+ - "dest_tag"
+ wait_conditions:
+ title: "Wait Conditions"
+                description: "Conditions to wait before being able to scrape the data. This is to ensure page readiness. Do not use this field to wait after 'navigate_to_url' action type, it doesn't do that, instead it will wait to execute 'navigate_to_url'."
+ type: "array"
+ items:
+ type: "object"
+ properties:
+ condition_type:
+ type: "string"
+ enum:
+ - "element_presence"
+ - "element_visible"
+ - "plugin_call"
+ - "delay"
+ value:
+ type: "string"
+ description: "a generic value to use with the condition, e.g., a delay in seconds, applicable for delay condition type. For delay type you can also use the CROWler exprterpreter to generate delay values at runtime, e.g., 'random(1, 3)' or 'random(random(1,3), random(5,8))'. If you're using plugin_call, then value field is ignored."
+ selector:
+ type: "string"
+ description: "The CSS selector for the element, applicable for element_presence and element_visible conditions. This field is used for the plugin's name when the condition_type is 'plugin_call'."
+                  additionalProperties: false
+ post_processing:
+ type: "array"
+ items:
+ type: "object"
+ properties:
+ step_type:
+ type: "string"
+ enum:
+ - "replace"
+ - "remove"
+ - "transform"
+ - "validate"
+ - "clean"
+ - "plugin_call"
+ - "external_api"
+ description: "The type of post-processing step to perform on the scraped data. To use plugins to process the data, set this field to 'plugin_call' and place the plugin name in the 'details' object using a field called 'plugin_name'. Do not use 'transform' if you want to use a plugin to transform the output, use 'plugin_call' instead."
+ details:
+ type: "object"
+ description: "Detailed configuration for the post-processing step, structure depends on the step_type."
+                      additionalProperties: true
+                  additionalProperties: false
+ description: "Post-processing steps for the scraped data to transform, validate, or clean it. To use external APIs to process the data, use the 'transform' step type and, inside the 'details' object, specify the API endpoint and the required parameters. For example, in details, use { 'transform_type': 'api', 'api_url': 'https://api.example.com', 'timeout': 60, 'token': 'your-api-token' }."
+ required:
+ - "step_type"
+ - "details"
+            additionalProperties: false
+ required:
+ - "rule_name"
+ - "elements"
+ action_rules:
+ title: "Action Rules"
+ description: "A list of rules to interact with web pages."
+ type: "array"
+ items:
+ type: "object"
+ properties:
+ rule_name:
+ title: "Rule Name"
+ description: "A unique name identifying the action rule."
+ type: "string"
+ url:
+ type: "string"
+ format: "uri"
+ description: "Optional. The specific URL to which this action applies or the URL to navigate to, applicable for navigate action. Do not use this field for 'navigate_to_url' action type, use instead the value field to specify the url to go to, url field is only to match the rule."
+ wait_conditions:
+ type: "array"
+ items:
+ type: "object"
+ properties:
+ condition_type:
+ type: "string"
+ enum:
+ - "element_presence"
+ - "element_visible"
+ - "plugin_call"
+ - "delay"
+ value:
+ type: "string"
+ description: "a generic value to use with the condition, e.g., a delay in seconds, applicable for delay condition type. For delay type you can also use the CROWler exprterpreter to generate delay values at runtime, e.g., 'random(1, 3)' or 'random(random(1,3), random(5,8))'."
+ selector:
+ type: "string"
+ description: "The CSS selector for the element, applicable for element_presence and element_visible conditions. If you're using plugin_call, then this field is used for the plugin name."
+ description: "Conditions to wait for, that must be met before the action is executed. These conditions are designed to ensure that the page or elements are ready (e.g., waiting for an element to appear, or a delay). Do not use this field to wait after an action is performed, as it only applies before the action is executed."
+ conditions:
+ type: "object"
+ properties:
+ type:
+ type: "string"
+ enum:
+ - "element"
+ - "language"
+ - "plugin_call"
+ selector:
+ type: "string"
+ description: "The CSS selector to check if a given element exists, applicable for 'element'. The language id to check if a page is in a certain language, applicable for 'language'. The plugin's name if you're using plugin_call."
+ description: "Conditions that must be met for the action to be executed. For example, you can check if a certain element exists on the page before performing an action. See this as something to do after we waited for the wait_conditions and we verify that the page is ready to perform the action."
+ action_type:
+ type: "string"
+ enum:
+ - "click"
+ - "input_text"
+ - "clear"
+ - "drag_and_drop"
+ - "mouse_hover"
+ - "right_click"
+ - "double_click"
+ - "click_and_hold"
+ - "release"
+ - "key_down"
+ - "key_up"
+ - "navigate_to_url"
+ - "forward"
+ - "back"
+ - "refresh"
+ - "switch_to_window"
+ - "switch_to_frame"
+ - "close_window"
+ - "accept_alert"
+ - "dismiss_alert"
+ - "get_alert_text"
+ - "send_keys_to_alert"
+ - "scroll_to_element"
+ - "scroll_by_amount"
+ - "take_screenshot"
+ - "custom"
+                description: "The type of action to perform, including advanced interactions and calls to plugins. If you want to use plugins then set this field to 'custom', set selector_type field to 'plugin_call', and place the plugin name in the selector field."
+ selectors:
+ type: "array"
+ items:
+ type: "object"
+ properties:
+ selector_type:
+ type: "string"
+ enum:
+ - "css"
+ - "xpath"
+ - "id"
+ - "class_name"
+ - "class"
+ - "name"
+ - "tag_name"
+ - "element"
+ - "link_text"
+ - "partial_link_text"
+ - "plugin_call"
+ description: "The type of selector to use to find the element."
+ selector:
+ type: "string"
+ description: "The actual selector or pattern used to find the element based on the selector_type. This field is used for the plugin's name when the selector_type is 'plugin_call'."
+ attribute:
+ type: "object"
+ properties:
+ name:
+ type: "string"
+ description: "The name of the attribute to match for the selector match to be valid."
+ value:
+ type: "string"
+                          description: "The value of the attribute to match for the selector to be valid."
+ description: "Optional. The attribute of the element to match"
+ value:
+ type: "string"
+ description: "The value within the selector that we need to match for the action. (this is NOT the value to input!)"
+ required:
+ - "selector_type"
+ - "selector"
+ description: "Defines multiple ways to find and interact with elements, allowing for CSS, XPath, and other strategies. This field is ignored when using action_type like navigate_to_url, forward, back, refresh, close_window, accept_alert, dismiss_alert, get_alert_text, send_keys_to_alert, and take_screenshot."
+ value:
+ type: "string"
+ description: "The value to use with the action, e.g., text to input, applicable for input_text."
+ error_handling:
+ type: "object"
+ properties:
+ ignore:
+ type: "boolean"
+ description: "Flag to ignore errors and continue with the next action."
+ retry_count:
+ type: "integer"
+ description: "The number of times to retry the action on failure."
+ retry_delay:
+ type: "integer"
+ description: "The delay between retries in seconds."
+ description: "Error handling strategies for the action."
+ post_processing:
+ type: "array"
+ items:
+ type: "object"
+ properties:
+ step_type:
+ type: "string"
+ enum:
+ - "collect_cookies"
+ description: "The type of post-processing step to perform after an action rule has been successfully executed. At the moment the only valid post_processing step is 'collect_cookies'."
+            additionalProperties: false
+ required:
+ - "rule_name"
+ - "action_type"
+ anyOf:
+ -
required:
- - "rule_name"
- - "elements"
- action_rules:
- type: "array"
- items:
- type: "object"
- properties:
- rule_name:
- type: "string"
- description: "A unique name identifying the action rule."
- url:
- type: "string"
- format: "uri"
- description: "Optional. The specific URL to which this action applies or the URL to navigate to, applicable for navigate action. Do not use this field for 'navigate_to_url' action type, use instead the value field to specify the url to go to, url field is only to match the rule."
- wait_conditions:
- type: "array"
- items:
- type: "object"
- properties:
- condition_type:
+ - "selectors"
+ -
+ required:
+ - "value"
+ detection_rules:
+ title: "Detection Rules"
+ description: "A list of rules to detect technologies and objects on web pages."
+ type: "array"
+ items:
+ type: "object"
+ properties:
+ rule_name:
+ title: "Rule Name"
+ description: "A unique name identifying the detection rule."
+ type: "string"
+ object_name:
+ title: "Object Name"
+ description: "The name of the object or technology to identify. This will also be the JSON key in the output. This is also the field to use for the 'implies' field if you want to imply other objects."
+ type: "string"
+ http_header_fields:
+ type: "array"
+ items:
+ type: "object"
+ properties:
+ key:
+ type: "string"
+ description: "The name of the HTTP header field."
+ value:
+ type: "array"
+ items:
type: "string"
- enum:
- - "element_presence"
- - "element_visible"
- - "plugin_call"
- - "delay"
- value:
+ description: "The expected value of the HTTP header field. You can use Perl-Compatible Regular Expressions (PCRE) to write your signatures and patterns."
+ confidence:
+ type: "number"
+ description: "Optional. The confidence level for the match, ranging from 0 to 10."
+ description: "Matching patterns for HTTP header fields to identify technology."
+ page_content_patterns:
+ type: "array"
+ items:
+ type: "object"
+ properties:
+ key:
+ type: "string"
+ description: "The name of the tag to find in the page content."
+ attribute:
+ type: "string"
+ description: "Optional. The attribute of the tag to match, e.g., 'src' for img tag etc. (leave empty if you want to match the tag's innerText only)."
+ value:
+ type: "array"
+ items:
type: "string"
- description: "a generic value to use with the condition, e.g., a delay in seconds, applicable for delay condition type. For delay type you can also use the CROWler exprterpreter to generate delay values at runtime, e.g., 'random(1, 3)' or 'random(random(1,3), random(5,8))'."
- selector:
+ description: "The pattern to match within the tag's attribute content. You can use Perl-Compatible Regular Expressions (PCRE) to write your signatures and patterns."
+ text:
+ type: "string"
+ description: "Optional. The text to match in the tag's innerText. You can use Perl-Compatible Regular Expressions (PCRE) to write your signatures and patterns."
+ confidence:
+ type: "number"
+ description: "Optional. The confidence level for the detection, decimal number ranging from 0 to 10 (or whatever set in the detection_configuration)."
+ description: "Phrases or character sequences within page content indicative of specific technology."
+ description: "Patterns within the page content that match specific technologies."
+ certificates_patterns:
+ type: "array"
+ items:
+ type: "object"
+ properties:
+ key:
+ type: "string"
+ description: "The name of the field in an SSL/TLS certificate to find."
+ value:
+ type: "array"
+ items:
type: "string"
- description: "The CSS selector for the element, applicable for element_presence and element_visible conditions. If you're using plugin_call, then this field is used for the plugin name."
- description: "Conditions to wait for, that must be met before the action is executed. These conditions are designed to ensure that the page or elements are ready (e.g., waiting for an element to appear, or a delay). Do not use this field to wait after an action is performed, as it only applies before the action is executed."
- conditions:
+ description: "The pattern to match within the field's value. You can use Perl-Compatible Regular Expressions (PCRE) to write your signatures and patterns."
+ confidence:
+ type: "number"
+ description: "Optional. The confidence level for the detection, decimal number ranging from 0 to 10 (or whatever set in the detection_configuration)."
+ description: "Phrases or character sequences within certain certificate's fields indicative of specific technology."
+ url_micro_signatures:
+ type: "array"
+ items:
type: "object"
properties:
- type:
+ value:
type: "string"
- enum:
- - "element"
- - "language"
- - "plugin_call"
- selector:
+ description: "The micro-signature to match in the URL. You can use Perl-Compatible Regular Expressions (PCRE) to write your signatures and patterns."
+ confidence:
+ type: "number"
+ description: "Optional. The confidence level for the match, decimal number ranging from 0 to 10 (or whatever set in the detection_configuration)."
+ description: "Micro-signatures in URLs that indicate a specific technology, like '/wp-admin' for WordPress."
+ description: "URL patterns indicative of specific technologies."
+ meta_tags:
+ type: "array"
+ items:
+ type: "object"
+ properties:
+ name:
+ type: "string"
+ description: "The name attribute of the meta tag."
+ content:
type: "string"
- description: "The CSS selector to check if a given element exists, applicable for 'element'. The language id to check if a page is in a certain language, applicable for 'language'. The plugin's name if you're using plugin_call."
- description: "Conditions that must be met for the action to be executed. For example, you can check if a certain element exists on the page before performing an action. See this as something to do after we waited for the wait_conditions and we verify that the page is ready to perform the action."
- action_type:
+ description: "The content attribute of the meta tag, which holds the value to match. You can use Perl-Compatible Regular Expressions (PCRE) to write your signatures and patterns."
+ description: "Matching patterns for meta tags to identify technology."
+ implies:
+ title: "Implies"
+ description: "Optional. A list of rule's names that this rule implies, e.g., if this rule matches, it implies that the rules in this list also match."
+ type: "array"
+ items:
type: "string"
- enum:
- - "click"
- - "input_text"
- - "clear"
- - "drag_and_drop"
- - "mouse_hover"
- - "right_click"
- - "double_click"
- - "click_and_hold"
- - "release"
- - "key_down"
- - "key_up"
- - "navigate_to_url"
- - "forward"
- - "back"
- - "refresh"
- - "switch_to_window"
- - "switch_to_frame"
- - "close_window"
- - "accept_alert"
- - "dismiss_alert"
- - "get_alert_text"
- - "send_keys_to_alert"
- - "scroll_to_element"
- - "scroll_by_amount"
- - "take_screenshot"
- - "custom"
- description: "The type of action to perform, including advanced interactions and calls to plugins.If you want to use plugins then set this field to 'custom', set selector_type field to 'plugin_call', and place the plugin name in the selector field."
- selectors:
- type: "array"
- items:
- type: "object"
- properties:
- selector_type:
- type: "string"
- enum:
- - "css"
- - "xpath"
- - "id"
- - "class_name"
- - "class"
- - "name"
- - "tag_name"
- - "element"
- - "link_text"
- - "partial_link_text"
- - "plugin_call"
- description: "The type of selector to use to find the element."
- selector:
- type: "string"
- description: "The actual selector or pattern used to find the element based on the selector_type. This field is used for the plugin's name when the selector_type is 'plugin_call'."
- attribute:
+ plugin_calls:
+ title: "Plugin Calls"
+ description: "Optional. Call a plugin to detect the technology."
+ type: "array"
+ items:
+ type: "object"
+ properties:
+ plugin_name:
+ type: "string"
+ description: "The name of the plugin to call."
+ plugin_parameters:
+ title: "Plugin's Parameters"
+ description: "The parameters to pass to the plugin."
+ type: "array"
+ items:
type: "object"
properties:
- name:
- type: "string"
- description: "The name of the attribute to match for the selector match to be valid."
- value:
+ parameter_name:
type: "string"
- description: "The value to of the attribute to match for the selector to be valid."
- description: "Optional. The attribute of the element to match"
- value:
- type: "string"
- description: "The value within the selector that we need to match for the action. (this is NOT the value to input!)"
- required:
- - "selector_type"
- - "selector"
- description: "Defines multiple ways to find and interact with elements, allowing for CSS, XPath, and other strategies. This field is ignored when using action_type like navigate_to_url, forward, back, refresh, close_window, accept_alert, dismiss_alert, get_alert_text, send_keys_to_alert, and take_screenshot."
- value:
- type: "string"
- description: "The value to use with the action, e.g., text to input, applicable for input_text."
- error_handling:
+ description: "The name of the parameter to pass to the plugin."
+ parameter_value:
+ description: "The value of the parameter to pass to the plugin."
+ anyOf:
+ - title: "Object"
+ type: "object"
+ - title: "String"
+ type: "string"
+ - title: "Number"
+ type: "number"
+ - title: "Boolean"
+ type: "boolean"
+ - title: "Null"
+ type: "null"
+ - title: "Integer"
+ type: "integer"
+ - title: "Array of Strings"
+ type: "array"
+ items:
+ type: "string"
+ - title: "Array of Numbers"
+ type: "array"
+ items:
+ type: "number"
+ - title: "Array of Booleans"
+ type: "array"
+ items:
+ type: "boolean"
+ - title: "Array of Objects"
+ type: "array"
+ items:
+ type: "object"
+ - title: "Array of Nulls"
+ type: "array"
+ items:
+ type: "null"
+ - title: "Array of Integers"
+ type: "array"
+ items:
+ type: "integer"
+ examples:
+ - "my_api_key"
+ - "my_db_password"
+ - "700"
+ - "true"
+ - "1.76"
+ - "['value1', 'value2']"
+                        additionalProperties: false
+ required:
+ - "parameter_name"
+ - "parameter_value"
+ examples:
+ - parameter_name: "api_key"
+ parameter_value: "my_api_key"
+ - parameter_name: "db_password"
+ parameter_value: "my_db_password"
+                  additionalProperties: false
+ external_detection:
+ type: "array"
+ items:
type: "object"
properties:
- ignore:
- type: "boolean"
- description: "Flag to ignore errors and continue with the next action."
- retry_count:
- type: "integer"
- description: "The number of times to retry the action on failure."
- retry_delay:
- type: "integer"
- description: "The delay between retries in seconds."
- description: "Error handling strategies for the action."
- post_processing:
- type: "array"
- items:
- type: "object"
- properties:
- step_type:
- type: "string"
- enum:
- - "collect_cookies"
- description: "The type of post-processing step to perform after an action rule has been successfully executed. At the moment the only valid post_processing step is 'collect_cookies'."
+ provider:
+ title: "Provider"
+ description: "The name of the supported external detection provider."
+ type: "string"
+ enum:
+ - "abuse_ipdb"
+ - "alien_vault"
+ - "censys"
+ - "cisco_umbrella"
+ - "grey_noise"
+ - "google_safe_browsing"
+ - "hybrid_analysis"
+ - "ip_quality_score"
+ - "ipvoid"
+ - "malware_domain_list"
+ - "shodan"
+ - "virus_total"
+ - "url_haus"
+ examples:
+ - "abuse_ipdb"
+ - "alien_vault"
+ - "censys"
+ - "cisco_umbrella"
+ - "grey_noise"
+ - "google_safe_browsing"
+ - "hybrid_analysis"
+ - "ip_quality_score"
+ - "ipvoid"
+ - "malware_domain_list"
+ - "shodan"
+ - "virus_total"
+ - "url_haus"
+            additionalProperties: false
+ required:
+ - "rule_name"
+ - "object_name"
+ anyOf:
+ -
required:
- - "rule_name"
- - "action_type"
- detection_rules:
- type: "array"
- items:
- type: "object"
- properties:
- rule_name:
- type: "string"
- description: "A unique name identifying the detection rule."
- object_name:
- type: "string"
- description: "The name of the object or technology to identify. This will also be the JSON key in the output. This is also the field to use for the 'implies' field if you want to imply other objects."
- http_header_fields:
- type: "array"
- items:
- type: "object"
- properties:
- key:
+ - "http_header_fields"
+ -
+ required:
+ - "page_content_patterns"
+ -
+ required:
+ - "certificates_patterns"
+ -
+ required:
+ - "url_micro_signatures"
+ -
+ required:
+ - "meta_tags"
+ -
+ required:
+ - "implies"
+ -
+ required:
+ - "plugin_calls"
+ crawling_rules:
+ title: "Crawling Rules"
+ description: "A list of rules to crawl web pages and fuzz parameters."
+ type: "array"
+ items:
+ type: "object"
+ properties:
+ rule_name:
+ type: "string"
+ description: "A unique name identifying the crawling rule."
+ request_type:
+ type: "string"
+ enum:
+ - "GET"
+ - "POST"
+ description: "The type of request to perform for fuzzing."
+ target_elements:
+ type: "array"
+ items:
+ type: "object"
+ properties:
+ selector_type:
+ type: "string"
+ enum:
+ - "css"
+ - "xpath"
+ - "form"
+ selector:
+ type: "string"
+ description: "The actual selector or form name used to find and interact with the target elements for fuzzing."
+ required:
+ - "selector_type"
+ - "selector"
+ description: "Specifies the elements to target for fuzzing, including forms."
+ fuzzing_parameters:
+ type: "array"
+ items:
+ type: "object"
+ properties:
+ parameter_name:
+ type: "string"
+ description: "Name of the parameter to fuzz."
+ fuzzing_type:
+ type: "string"
+ enum:
+ - "fixed_list"
+ - "pattern_based"
+ description: "The fuzzing strategy to use for the parameter."
+ values:
+ type: "array"
+ items:
type: "string"
- description: "The name of the HTTP header field."
- value:
- type: "array"
- items:
- type: "string"
- description: "The expected value of the HTTP header field. You can use Perl-Compatible Regular Expressions (PCRE) to write your signatures and patterns."
- confidence:
- type: "number"
- description: "Optional. The confidence level for the match, ranging from 0 to 10."
- description: "Matching patterns for HTTP header fields to identify technology."
- page_content_patterns:
+ description: "List of values to use for fuzzing, applicable if 'fuzzing_type' is 'fixed_list'."
+ pattern:
+ type: "string"
+ description: "A pattern to generate fuzzing values, applicable if 'fuzzing_type' is 'pattern_based'."
+ required:
+ - "parameter_name"
+ - "fuzzing_type"
+ description: "Defines the parameters to fuzz and the strategy for generating fuzz values."
+ additionalProperties: "false"
+ required:
+ - "rule_name"
+ - "request_type"
+ - "target_elements"
+ - "fuzzing_parameters"
+ environment_settings:
+ title: "Environment Settings"
+ description: "Optional. Custom key value settings to use in the rules. Normally used to set environment variables for the rules."
+ type: "array"
+ items:
+ type: "object"
+ properties:
+ key:
+ title: "Setting's Name"
+ description: "The name of the environment setting. It has to be unique within the Rulesgroup namespace."
+ type: "string"
+ examples:
+ - "API_KEY"
+ - "DB_PASSWORD"
+ value:
+ title: "Setting's Value"
+ description: "A single or a set of values for the environment key."
+ anyOf:
+ - title: "Object"
+ type: "object"
+ - title: "String"
+ type: "string"
+ - title: "Number"
+ type: "number"
+ - title: "Boolean"
+ type: "boolean"
+ - title: "Null"
+ type: "null"
+ - title: "Integer"
+ type: "integer"
+ - title: "Array of Strings"
type: "array"
items:
- type: "object"
- properties:
- key:
- type: "string"
- description: "The name of the tag to find in the page content."
- attribute:
- type: "string"
- description: "Optional. The attribute of the tag to match, e.g., 'src' for img tag etc. (leave empty if you want to match the tag's innerText only)."
- value:
- type: "array"
- items:
- type: "string"
- description: "The pattern to match within the tag's attribute content. You can use Perl-Compatible Regular Expressions (PCRE) to write your signatures and patterns."
- text:
- type: "string"
- description: "Optional. The text to match in the tag's innerText. You can use Perl-Compatible Regular Expressions (PCRE) to write your signatures and patterns."
- confidence:
- type: "number"
- description: "Optional. The confidence level for the detection, decimal number ranging from 0 to 10 (or whatever set in the detection_configuration)."
- description: "Phrases or character sequences within page content indicative of specific technology."
- description: "Patterns within the page content that match specific technologies."
- certificates_patterns:
+ type: "string"
+ - title: "Array of Numbers"
type: "array"
items:
- type: "object"
- properties:
- key:
- type: "string"
- description: "The name of the field in an SSL/TLS certificate to find."
- value:
- type: "array"
- items:
- type: "string"
- description: "The pattern to match within the field's value. You can use Perl-Compatible Regular Expressions (PCRE) to write your signatures and patterns."
- confidence:
- type: "number"
- description: "Optional. The confidence level for the detection, decimal number ranging from 0 to 10 (or whatever set in the detection_configuration)."
- description: "Phrases or character sequences within certain certificate's fields indicative of specific technology."
- url_micro_signatures:
+ type: "number"
+ - title: "Array of Booleans"
type: "array"
items:
- type: "object"
- properties:
- value:
- type: "string"
- description: "The micro-signature to match in the URL. You can use Perl-Compatible Regular Expressions (PCRE) to write your signatures and patterns."
- confidence:
- type: "number"
- description: "Optional. The confidence level for the match, decimal number ranging from 0 to 10 (or whatever set in the detection_configuration)."
- description: "Micro-signatures in URLs that indicate a specific technology, like '/wp-admin' for WordPress."
- description: "URL patterns indicative of specific technologies."
- meta_tags:
+ type: "boolean"
+ - title: "Array of Objects"
type: "array"
items:
type: "object"
- properties:
- name:
- type: "string"
- description: "The name attribute of the meta tag."
- content:
- type: "string"
- description: "The content attribute of the meta tag, which holds the value to match. You can use Perl-Compatible Regular Expressions (PCRE) to write your signatures and patterns."
- description: "Matching patterns for meta tags to identify technology."
- implies:
+ - title: "Array of Nulls"
type: "array"
items:
- type: "string"
- description: "Optional. A list of object names that this rule implies, e.g., if this rule matches, it implies that the object names in this list are also present."
- plugin_calls:
+ type: "null"
+ - title: "Array of Integers"
type: "array"
items:
- type: "object"
- properties:
- plugin_name:
- type: "string"
- description: "The name of the plugin to call."
- plugin_parameters:
- type: "array"
- items:
- type: "object"
- properties:
- parameter_name:
- type: "string"
- description: "The name of the parameter to pass to the plugin."
- parameter_value:
- type: "string"
- description: "The value of the parameter to pass to the plugin."
- description: "The parameters to pass to the plugin."
- description: "Optional. Call a plugin to detect the technology."
- required:
- - "rule_name"
- - "object_name"
- anyOf:
- - required:
- - "http_header_fields"
- - required:
- - "page_content_patterns"
- - required:
- - "certificates_patterns"
- - required:
- - "url_micro_signatures"
- - required:
- - "meta_tags"
- - required:
- - "implies"
- - required:
- - "plugin_calls"
- crawling_rules:
- type: "array"
- items:
- type: "object"
+ type: "integer"
+ examples:
+ - "my_api_key"
+ - "my_db_password"
+ - "my_secret_key"
+ - "500"
+ - "true"
+ - "1.76"
+ - "['value1', 'value2']"
properties:
- rule_name:
- type: "string"
- description: "A unique name identifying the crawling rule."
- request_type:
- type: "string"
- enum:
- - "GET"
- - "POST"
- description: "The type of request to perform for fuzzing."
- target_elements:
- type: "array"
- items:
- type: "object"
- properties:
- selector_type:
- type: "string"
- enum:
- - "css"
- - "xpath"
- - "form"
- selector:
- type: "string"
- description: "The actual selector or form name used to find and interact with the target elements for fuzzing."
- required:
- - "selector_type"
- - "selector"
- description: "Specifies the elements to target for fuzzing, including forms."
- fuzzing_parameters:
- type: "array"
- items:
- type: "object"
- properties:
- parameter_name:
- type: "string"
- description: "Name of the parameter to fuzz."
- fuzzing_type:
- type: "string"
- enum:
- - "fixed_list"
- - "pattern_based"
- description: "The fuzzing strategy to use for the parameter."
- values:
- type: "array"
- items:
- type: "string"
- description: "List of values to use for fuzzing, applicable if 'fuzzing_type' is 'fixed_list'."
- pattern:
- type: "string"
- description: "A pattern to generate fuzzing values, applicable if 'fuzzing_type' is 'pattern_based'."
- required:
- - "parameter_name"
- - "fuzzing_type"
- description: "Defines the parameters to fuzz and the strategy for generating fuzz values."
- required:
- - "rule_name"
- - "request_type"
- - "target_elements"
- - "fuzzing_parameters"
- environment_settings:
- type: "array"
- items:
- type: "object"
- properties:
- key:
- type: "string"
- description: "The name of the environment setting."
- value:
- type: "string"
- description: "The value of the environment setting."
- description: "Optional. Custom key value settings to use in the rules. Normally used to set environment variables for the rules."
- logging_configuration:
- type: "object"
- properties:
- log_level:
- type: "string"
- enum:
- - "DEBUG"
- - "INFO"
- - "WARNING"
- - "ERROR"
- - "CRITICAL"
- description: "Optional. Specifies the logging level for actions and scraping activities."
- log_message:
- type: "string"
- description: "Optional. The message you want to log if the rule matches something."
- description: "rule log configuration (aka what you want to be logged when the rule execute)."
- required:
+ title: "Setting's Properties"
+ description: "Optional. Additional properties for the environment setting. These properties are used to define the behavior of the environment setting."
+ type: "object"
+ properties:
+ persistent:
+ type: "boolean"
+ description: "Optional. Flag to indicate if the environment setting should be persistent after ruleset completes execution."
+ static:
+ type: "boolean"
+ description: "Optional. Flag to indicate if the environment setting should be static and not changeable. That means that the value will be set once and never changed."
+ source:
+ type: "string"
+ description: "Optional. The source of the environment setting. If not set manually then the current URL will be used as the source."
+ additionalProperties: "false"
+ additionalProperties: "false"
+ required:
+ - "key"
+ - "value"
+ logging_configuration:
+ type: "object"
+ properties:
+ log_level:
+ type: "string"
+ enum:
+ - "DEBUG"
+ - "INFO"
+ - "WARNING"
+ - "ERROR"
+ - "CRITICAL"
+ description: "Optional. Specifies the logging level for actions and scraping activities."
+ log_message:
+ type: "string"
+ description: "Optional. The message you want to log if the rule matches something."
+ description: "rule log configuration (aka what you want to be logged when the rule execute)."
+ additionalProperties: "false"
+ required:
- "group_name"
- "is_enabled"
- anyOf:
- - required:
- - "scraping_rules"
- - required:
- - "action_rules"
- - required:
- - "detection_rules"
- - required:
- - "crawling_rules"
+ anyOf:
+ -
+ required:
+ - "scraping_rules"
+ -
+ required:
+ - "action_rules"
+ -
+ required:
+ - "detection_rules"
+ -
+ required:
+ - "crawling_rules"
required:
- - "ruleset_name"
- - "format_version"
- - "rule_groups"
- - "created_at"
- - "author"
- - "description"
-minItems: "1"
+- "ruleset_name"
+- "format_version"
+- "rule_groups"
+- "created_at"
+- "author"
+- "description"
From 7cab2082c19b4b51d914f19233fbfb5f88f85303 Mon Sep 17 00:00:00 2001
From: Paolo Fabio Zaino
Date: Fri, 4 Oct 2024 18:08:07 +0100
Subject: [PATCH 12/12] formatted the yaml ruleset schema document
---
schemas/ruleset-schema.yaml | 529 ++++++++++++++++++------------------
1 file changed, 258 insertions(+), 271 deletions(-)
diff --git a/schemas/ruleset-schema.yaml b/schemas/ruleset-schema.yaml
index d3d06dc..6e0fd7e 100644
--- a/schemas/ruleset-schema.yaml
+++ b/schemas/ruleset-schema.yaml
@@ -14,14 +14,14 @@ properties:
type: "string"
pattern: "^\\d+\\.\\d+\\.\\d+$"
examples:
- - "1.0.5"
+ - "1.0.5"
author:
title: "Author's name"
description: "The name of the author or owner of the ruleset."
type: "string"
examples:
- - "John Doe"
- - "ZFP Systems Inc."
+ - "John Doe"
+ - "ZFP Systems Inc."
created_at:
title: "Creation Date"
description: "The date of when this version of the RUleset was created. SUpports multiple date-time formats."
@@ -36,8 +36,8 @@ properties:
description: "A unique name identifying the ruleset."
type: "string"
examples:
- - "My Ruleset"
- - "https://example.com"
+ - "My Ruleset"
+ - "https://example.com"
rule_groups:
title: "Rules Groups"
description: "A list of rule groups, each containing mixes of scraping, action, detection, or crawling rules."
@@ -50,8 +50,8 @@ properties:
description: "A unique name identifying the group of rules."
type: "string"
examples:
- - "My Group"
- - "https://example.com"
+ - "My Group"
+ - "https://example.com"
valid_from:
title: "Valid From"
description: "The start date from which the rule group becomes active."
@@ -110,18 +110,18 @@ properties:
selector_type:
type: "string"
enum:
- - "css"
- - "xpath"
- - "id"
- - "class_name"
- - "class"
- - "name"
- - "tag_name"
- - "element"
- - "link_text"
- - "partial_link_text"
- - "regex"
- - "plugin_call"
+ - "css"
+ - "xpath"
+ - "id"
+ - "class_name"
+ - "class"
+ - "name"
+ - "tag_name"
+ - "element"
+ - "link_text"
+ - "partial_link_text"
+ - "regex"
+ - "plugin_call"
description: "The type of selector to use to find the element. To extract data using plugins, set this field to 'plugin_call'."
selector:
type: "string"
@@ -141,12 +141,12 @@ properties:
description: "Flag to extract all occurrences of the element, not just the first one. This flag has no effect when using CROWler plugins via plugin_call."
additionalProperties: "false"
required:
- - "selector_type"
- - "selector"
+ - "selector_type"
+ - "selector"
additionalProperties: "false"
required:
- - "key"
- - "selectors"
+ - "key"
+ - "selectors"
extract_scripts:
title: "Extract Page's Scripts"
description: "Indicates whether the rule also has to extract scripts from a page and store them as separate web objects. This is useful for analyzing JavaScript code using 3rd party tools and vulnerability analysis."
@@ -160,8 +160,8 @@ properties:
type: "string"
description: "A unique name identifying the detection rule."
examples:
- - "My Object"
- - "https://example.com"
+ - "My Object"
+ - "https://example.com"
json_field_rename:
title: "JSON Fields Renaming"
description: "Given that the CROWler scraper maps automatically HTML tags to JSON tags, you can use this feature to rename the json-html tag with whatever name you wish to use."
@@ -177,8 +177,8 @@ properties:
description: "The new name for the JSON tag."
additionalProperties: "false"
required:
- - "source_tag"
- - "dest_tag"
+ - "source_tag"
+ - "dest_tag"
wait_conditions:
title: "Wait Conditions"
description: "Conditions to wait before being able to scrape the data. This to ensure page readiness. Do not use this field to wait after 'navigate_to_url' action type, it doesn't do that, instead it will wait to execute 'navigate_to_url'."
@@ -189,10 +189,10 @@ properties:
condition_type:
type: "string"
enum:
- - "element_presence"
- - "element_visible"
- - "plugin_call"
- - "delay"
+ - "element_presence"
+ - "element_visible"
+ - "plugin_call"
+ - "delay"
value:
type: "string"
description: "a generic value to use with the condition, e.g., a delay in seconds, applicable for delay condition type. For delay type you can also use the CROWler exprterpreter to generate delay values at runtime, e.g., 'random(1, 3)' or 'random(random(1,3), random(5,8))'. If you're using plugin_call, then value field is ignored."
@@ -208,13 +208,13 @@ properties:
step_type:
type: "string"
enum:
- - "replace"
- - "remove"
- - "transform"
- - "validate"
- - "clean"
- - "plugin_call"
- - "external_api"
+ - "replace"
+ - "remove"
+ - "transform"
+ - "validate"
+ - "clean"
+ - "plugin_call"
+ - "external_api"
description: "The type of post-processing step to perform on the scraped data. To use plugins to process the data, set this field to 'plugin_call' and place the plugin name in the 'details' object using a field called 'plugin_name'. Do not use 'transform' if you want to use a plugin to transform the output, use 'plugin_call' instead."
details:
type: "object"
@@ -223,12 +223,12 @@ properties:
additionalProperties: "false"
description: "Post-processing steps for the scraped data to transform, validate, or clean it. To use external APIs to process the data, use the 'transform' step type and, inside the 'details' object, specify the API endpoint and the required parameters. For example, in details, use { 'transform_type': 'api', 'api_url': 'https://api.example.com', 'timeout': 60, 'token': 'your-api-token' }."
required:
- - "step_type"
- - "details"
+ - "step_type"
+ - "details"
additionalProperties: "false"
required:
- - "rule_name"
- - "elements"
+ - "rule_name"
+ - "elements"
action_rules:
title: "Action Rules"
description: "A list of rules to interact with web pages."
@@ -252,10 +252,10 @@ properties:
condition_type:
type: "string"
enum:
- - "element_presence"
- - "element_visible"
- - "plugin_call"
- - "delay"
+ - "element_presence"
+ - "element_visible"
+ - "plugin_call"
+ - "delay"
value:
type: "string"
description: "a generic value to use with the condition, e.g., a delay in seconds, applicable for delay condition type. For delay type you can also use the CROWler exprterpreter to generate delay values at runtime, e.g., 'random(1, 3)' or 'random(random(1,3), random(5,8))'."
@@ -269,9 +269,9 @@ properties:
type:
type: "string"
enum:
- - "element"
- - "language"
- - "plugin_call"
+ - "element"
+ - "language"
+ - "plugin_call"
selector:
type: "string"
description: "The CSS selector to check if a given element exists, applicable for 'element'. The language id to check if a page is in a certain language, applicable for 'language'. The plugin's name if you're using plugin_call."
@@ -279,32 +279,32 @@ properties:
action_type:
type: "string"
enum:
- - "click"
- - "input_text"
- - "clear"
- - "drag_and_drop"
- - "mouse_hover"
- - "right_click"
- - "double_click"
- - "click_and_hold"
- - "release"
- - "key_down"
- - "key_up"
- - "navigate_to_url"
- - "forward"
- - "back"
- - "refresh"
- - "switch_to_window"
- - "switch_to_frame"
- - "close_window"
- - "accept_alert"
- - "dismiss_alert"
- - "get_alert_text"
- - "send_keys_to_alert"
- - "scroll_to_element"
- - "scroll_by_amount"
- - "take_screenshot"
- - "custom"
+ - "click"
+ - "input_text"
+ - "clear"
+ - "drag_and_drop"
+ - "mouse_hover"
+ - "right_click"
+ - "double_click"
+ - "click_and_hold"
+ - "release"
+ - "key_down"
+ - "key_up"
+ - "navigate_to_url"
+ - "forward"
+ - "back"
+ - "refresh"
+ - "switch_to_window"
+ - "switch_to_frame"
+ - "close_window"
+ - "accept_alert"
+ - "dismiss_alert"
+ - "get_alert_text"
+ - "send_keys_to_alert"
+ - "scroll_to_element"
+ - "scroll_by_amount"
+ - "take_screenshot"
+ - "custom"
description: "The type of action to perform, including advanced interactions and calls to plugins.If you want to use plugins then set this field to 'custom', set selector_type field to 'plugin_call', and place the plugin name in the selector field."
selectors:
type: "array"
@@ -314,17 +314,17 @@ properties:
selector_type:
type: "string"
enum:
- - "css"
- - "xpath"
- - "id"
- - "class_name"
- - "class"
- - "name"
- - "tag_name"
- - "element"
- - "link_text"
- - "partial_link_text"
- - "plugin_call"
+ - "css"
+ - "xpath"
+ - "id"
+ - "class_name"
+ - "class"
+ - "name"
+ - "tag_name"
+ - "element"
+ - "link_text"
+ - "partial_link_text"
+ - "plugin_call"
description: "The type of selector to use to find the element."
selector:
type: "string"
@@ -343,8 +343,8 @@ properties:
type: "string"
description: "The value within the selector that we need to match for the action. (this is NOT the value to input!)"
required:
- - "selector_type"
- - "selector"
+ - "selector_type"
+ - "selector"
description: "Defines multiple ways to find and interact with elements, allowing for CSS, XPath, and other strategies. This field is ignored when using action_type like navigate_to_url, forward, back, refresh, close_window, accept_alert, dismiss_alert, get_alert_text, send_keys_to_alert, and take_screenshot."
value:
type: "string"
@@ -370,19 +370,17 @@ properties:
step_type:
type: "string"
enum:
- - "collect_cookies"
+ - "collect_cookies"
description: "The type of post-processing step to perform after an action rule has been successfully executed. At the moment the only valid post_processing step is 'collect_cookies'."
additionalProperties: "false"
required:
- - "rule_name"
- - "action_type"
+ - "rule_name"
+ - "action_type"
anyOf:
- -
- required:
- - "selectors"
- -
- required:
- - "value"
+ - required:
+ - "selectors"
+ - required:
+ - "value"
detection_rules:
title: "Detection Rules"
description: "A list of rules to detect technologies and objects on web pages."
@@ -510,58 +508,58 @@ properties:
parameter_value:
description: "The value of the parameter to pass to the plugin."
anyOf:
- - title: "Object"
- type: "object"
- - title: "String"
- type: "string"
- - title: "Number"
- type: "number"
- - title: "Boolean"
- type: "boolean"
- - title: "Null"
- type: "null"
- - title: "Integer"
- type: "integer"
- - title: "Array of Strings"
- type: "array"
- items:
+ - title: "Object"
+ type: "object"
+ - title: "String"
type: "string"
- - title: "Array of Numbers"
- type: "array"
- items:
+ - title: "Number"
type: "number"
- - title: "Array of Booleans"
- type: "array"
- items:
+ - title: "Boolean"
type: "boolean"
- - title: "Array of Objects"
- type: "array"
- items:
- type: "object"
- - title: "Array of Nulls"
- type: "array"
- items:
+ - title: "Null"
type: "null"
- - title: "Array of Integers"
- type: "array"
- items:
+ - title: "Integer"
type: "integer"
+ - title: "Array of Strings"
+ type: "array"
+ items:
+ type: "string"
+ - title: "Array of Numbers"
+ type: "array"
+ items:
+ type: "number"
+ - title: "Array of Booleans"
+ type: "array"
+ items:
+ type: "boolean"
+ - title: "Array of Objects"
+ type: "array"
+ items:
+ type: "object"
+ - title: "Array of Nulls"
+ type: "array"
+ items:
+ type: "null"
+ - title: "Array of Integers"
+ type: "array"
+ items:
+ type: "integer"
examples:
- - "my_api_key"
- - "my_db_password"
- - "700"
- - "true"
- - "1.76"
- - "['value1', 'value2']"
+ - "my_api_key"
+ - "my_db_password"
+ - "700"
+ - "true"
+ - "1.76"
+ - "['value1', 'value2']"
additionalProperties: "false"
required:
- - "parameter_name"
- - "parameter_value"
+ - "parameter_name"
+ - "parameter_value"
examples:
- - parameter_name: "api_key"
- parameter_value: "my_api_key"
- - parameter_name: "db_password"
- parameter_value: "my_db_password"
+ - parameter_name: "api_key"
+ parameter_value: "my_api_key"
+ - parameter_name: "db_password"
+ parameter_value: "my_db_password"
additionalProperties: "false"
external_detection:
type: "array"
@@ -573,59 +571,52 @@ properties:
description: "The name of the supported external detection provider."
type: "string"
enum:
- - "abuse_ipdb"
- - "alien_vault"
- - "censys"
- - "cisco_umbrella"
- - "grey_noise"
- - "google_safe_browsing"
- - "hybrid_analysis"
- - "ip_quality_score"
- - "ipvoid"
- - "malware_domain_list"
- - "shodan"
- - "virus_total"
- - "url_haus"
+ - "abuse_ipdb"
+ - "alien_vault"
+ - "censys"
+ - "cisco_umbrella"
+ - "grey_noise"
+ - "google_safe_browsing"
+ - "hybrid_analysis"
+ - "ip_quality_score"
+ - "ipvoid"
+ - "malware_domain_list"
+ - "shodan"
+ - "virus_total"
+ - "url_haus"
examples:
- - "abuse_ipdb"
- - "alien_vault"
- - "censys"
- - "cisco_umbrella"
- - "grey_noise"
- - "google_safe_browsing"
- - "hybrid_analysis"
- - "ip_quality_score"
- - "ipvoid"
- - "malware_domain_list"
- - "shodan"
- - "virus_total"
- - "url_haus"
+ - "abuse_ipdb"
+ - "alien_vault"
+ - "censys"
+ - "cisco_umbrella"
+ - "grey_noise"
+ - "google_safe_browsing"
+ - "hybrid_analysis"
+ - "ip_quality_score"
+ - "ipvoid"
+ - "malware_domain_list"
+ - "shodan"
+ - "virus_total"
+ - "url_haus"
additionalProperties: "false"
required:
- - "rule_name"
- - "object_name"
+ - "rule_name"
+ - "object_name"
anyOf:
- -
- required:
- - "http_header_fields"
- -
- required:
- - "page_content_patterns"
- -
- required:
- - "certificates_patterns"
- -
- required:
- - "url_micro_signatures"
- -
- required:
- - "meta_tags"
- -
- required:
- - "implies"
- -
- required:
- - "plugin_calls"
+ - required:
+ - "http_header_fields"
+ - required:
+ - "page_content_patterns"
+ - required:
+ - "certificates_patterns"
+ - required:
+ - "url_micro_signatures"
+ - required:
+ - "meta_tags"
+ - required:
+ - "implies"
+ - required:
+ - "plugin_calls"
crawling_rules:
title: "Crawling Rules"
description: "A list of rules to crawl web pages and fuzz parameters."
@@ -639,8 +630,8 @@ properties:
request_type:
type: "string"
enum:
- - "GET"
- - "POST"
+ - "GET"
+ - "POST"
description: "The type of request to perform for fuzzing."
target_elements:
type: "array"
@@ -650,15 +641,15 @@ properties:
selector_type:
type: "string"
enum:
- - "css"
- - "xpath"
- - "form"
+ - "css"
+ - "xpath"
+ - "form"
selector:
type: "string"
description: "The actual selector or form name used to find and interact with the target elements for fuzzing."
required:
- - "selector_type"
- - "selector"
+ - "selector_type"
+ - "selector"
description: "Specifies the elements to target for fuzzing, including forms."
fuzzing_parameters:
type: "array"
@@ -671,8 +662,8 @@ properties:
fuzzing_type:
type: "string"
enum:
- - "fixed_list"
- - "pattern_based"
+ - "fixed_list"
+ - "pattern_based"
description: "The fuzzing strategy to use for the parameter."
values:
type: "array"
@@ -683,15 +674,15 @@ properties:
type: "string"
description: "A pattern to generate fuzzing values, applicable if 'fuzzing_type' is 'pattern_based'."
required:
- - "parameter_name"
- - "fuzzing_type"
+ - "parameter_name"
+ - "fuzzing_type"
description: "Defines the parameters to fuzz and the strategy for generating fuzz values."
additionalProperties: "false"
required:
- - "rule_name"
- - "request_type"
- - "target_elements"
- - "fuzzing_parameters"
+ - "rule_name"
+ - "request_type"
+ - "target_elements"
+ - "fuzzing_parameters"
environment_settings:
title: "Environment Settings"
description: "Optional. Custom key value settings to use in the rules. Normally used to set environment variables for the rules."
@@ -704,56 +695,56 @@ properties:
description: "The name of the environment setting. It has to be unique within the Rulesgroup namespace."
type: "string"
examples:
- - "API_KEY"
- - "DB_PASSWORD"
+ - "API_KEY"
+ - "DB_PASSWORD"
value:
title: "Setting's Value"
description: "A single or a set of values for the environment key."
anyOf:
- - title: "Object"
- type: "object"
- - title: "String"
- type: "string"
- - title: "Number"
- type: "number"
- - title: "Boolean"
- type: "boolean"
- - title: "Null"
- type: "null"
- - title: "Integer"
- type: "integer"
- - title: "Array of Strings"
- type: "array"
- items:
+ - title: "Object"
+ type: "object"
+ - title: "String"
type: "string"
- - title: "Array of Numbers"
- type: "array"
- items:
+ - title: "Number"
type: "number"
- - title: "Array of Booleans"
- type: "array"
- items:
+ - title: "Boolean"
type: "boolean"
- - title: "Array of Objects"
- type: "array"
- items:
- type: "object"
- - title: "Array of Nulls"
- type: "array"
- items:
+ - title: "Null"
type: "null"
- - title: "Array of Integers"
- type: "array"
- items:
+ - title: "Integer"
type: "integer"
+ - title: "Array of Strings"
+ type: "array"
+ items:
+ type: "string"
+ - title: "Array of Numbers"
+ type: "array"
+ items:
+ type: "number"
+ - title: "Array of Booleans"
+ type: "array"
+ items:
+ type: "boolean"
+ - title: "Array of Objects"
+ type: "array"
+ items:
+ type: "object"
+ - title: "Array of Nulls"
+ type: "array"
+ items:
+ type: "null"
+ - title: "Array of Integers"
+ type: "array"
+ items:
+ type: "integer"
examples:
- - "my_api_key"
- - "my_db_password"
- - "my_secret_key"
- - "500"
- - "true"
- - "1.76"
- - "['value1', 'value2']"
+ - "my_api_key"
+ - "my_db_password"
+ - "my_secret_key"
+ - "500"
+ - "true"
+ - "1.76"
+ - "['value1', 'value2']"
properties:
title: "Setting's Properties"
description: "Optional. Additional properties for the environment setting. These properties are used to define the behavior of the environment setting."
@@ -771,19 +762,19 @@ properties:
additionalProperties: "false"
additionalProperties: "false"
required:
- - "key"
- - "value"
+ - "key"
+ - "value"
logging_configuration:
type: "object"
properties:
log_level:
type: "string"
enum:
- - "DEBUG"
- - "INFO"
- - "WARNING"
- - "ERROR"
- - "CRITICAL"
+ - "DEBUG"
+ - "INFO"
+ - "WARNING"
+ - "ERROR"
+ - "CRITICAL"
description: "Optional. Specifies the logging level for actions and scraping activities."
log_message:
type: "string"
@@ -791,25 +782,21 @@ properties:
description: "rule log configuration (aka what you want to be logged when the rule execute)."
additionalProperties: "false"
required:
- - "group_name"
- - "is_enabled"
+ - "group_name"
+ - "is_enabled"
anyOf:
- -
- required:
- - "scraping_rules"
- -
- required:
- - "action_rules"
- -
- required:
- - "detection_rules"
- -
- required:
- - "crawling_rules"
+ - required:
+ - "scraping_rules"
+ - required:
+ - "action_rules"
+ - required:
+ - "detection_rules"
+ - required:
+ - "crawling_rules"
required:
-- "ruleset_name"
-- "format_version"
-- "rule_groups"
-- "created_at"
-- "author"
-- "description"
+ - "ruleset_name"
+ - "format_version"
+ - "rule_groups"
+ - "created_at"
+ - "author"
+ - "description"