diff --git a/README.md b/README.md index 639d5a2..946131b 100644 --- a/README.md +++ b/README.md @@ -103,10 +103,12 @@ For more information on the features, see the [features](doc/features.md) page. ### What problem does it solves? The CROWler is designed to solve a set of problems about web crawling, content -discovery, and data extraction. It's designed to be able to crawl websites in a -respectful and efficient way. It's also designed to be able to crawl private -networks and intranets, so you can use it to create your own or company search -engine. +discovery, technology detection, and data extraction. + +While its main goal is to enable private, professional, and enterprise users to +quickly develop their content discovery solutions, it’s also designed to be +able to crawl private networks and intranets, so you can use it to create your +own or your company’s search engine. On top of that it can also be used as the "base" for a more complex cyber security tool, as it can be used to gather information about a website, its network, its diff --git a/pkg/config/config.go b/pkg/config/config.go index 66371ec..3ff4826 100644 --- a/pkg/config/config.go +++ b/pkg/config/config.go @@ -169,6 +169,7 @@ func NewConfig() *Config { SourceScreenshot: false, FullSiteScreenshot: false, MaxDepth: 0, + MaxLinks: 0, Delay: "0", MaxSources: 4, BrowsingMode: "recursive", @@ -1464,37 +1465,64 @@ func combineCrawlerCfg(dstCfg *Crawler, srcCfgIface interface{}) { srcCfg := srcCfgIface.(map[string]interface{}) if srcCfg["workers"] != nil { - dstCfg.Workers = srcCfg["workers"].(int) + if val, ok := srcCfg["workers"].(float64); ok { + dstCfg.Workers = int(val) + } } if srcCfg["interval"] != nil { - dstCfg.Interval = srcCfg["interval"].(string) + if val, ok := srcCfg["interval"].(string); ok { + dstCfg.Interval = val + } } if srcCfg["timeout"] != nil { - dstCfg.Timeout = srcCfg["timeout"].(int) + if val, ok := srcCfg["timeout"].(float64); ok { + dstCfg.Timeout = int(val) + } } if srcCfg["max_depth"] != nil { - dstCfg.MaxDepth = srcCfg["max_depth"].(int) + if val, ok := srcCfg["max_depth"].(float64); ok { + dstCfg.MaxDepth = int(val) + } + } + if srcCfg["max_links"] != nil { + if val, ok := srcCfg["max_links"].(float64); ok { + dstCfg.MaxLinks = int(val) + } } if srcCfg["delay"] != nil { - dstCfg.Delay = srcCfg["delay"].(string) + if val, ok := srcCfg["delay"].(string); ok { + dstCfg.Delay = val + } } if srcCfg["browsing_mode"] != nil { - dstCfg.BrowsingMode = srcCfg["browsing_mode"].(string) + if val, ok := srcCfg["browsing_mode"].(string); ok { + dstCfg.BrowsingMode = val + } } if srcCfg["screenshot_section_wait"] != nil { - dstCfg.ScreenshotSectionWait = srcCfg["screenshot_section_wait"].(int) + if val, ok := srcCfg["screenshot_section_wait"].(float64); ok { + dstCfg.ScreenshotSectionWait = int(val) + } } if srcCfg["max_sources"] != nil { - dstCfg.MaxSources = srcCfg["max_sources"].(int) + if val, ok := srcCfg["max_sources"].(float64); ok { + dstCfg.MaxSources = int(val) + } } if srcCfg["screenshot_max_height"] != nil { - dstCfg.ScreenshotMaxHeight = srcCfg["screenshot_max_height"].(int) + if val, ok := srcCfg["screenshot_max_height"].(float64); ok { + dstCfg.ScreenshotMaxHeight = int(val) + } } if srcCfg["max_retries"] != nil { - dstCfg.MaxRetries = srcCfg["max_retries"].(int) + if val, ok := srcCfg["max_retries"].(float64); ok { + dstCfg.MaxRetries = int(val) + } } if srcCfg["max_redirects"] != nil { - dstCfg.MaxRedirects = srcCfg["max_redirects"].(int) + if val, ok := srcCfg["max_redirects"].(float64); ok { +
dstCfg.MaxRedirects = int(val) + } } } @@ -1504,25 +1532,39 @@ func combineVDICfg(dstCfg *[]Selenium, srcCfgIface interface{}) { for i, v := range srcCfgSlice { srcCfg := v.(map[string]interface{}) if srcCfg["type"] != nil { - (*dstCfg)[i].Type = srcCfg["type"].(string) + if val, ok := srcCfg["type"].(string); ok { + (*dstCfg)[i].Type = val + } } if srcCfg["service_type"] != nil { - (*dstCfg)[i].ServiceType = srcCfg["service_type"].(string) + if val, ok := srcCfg["service_type"].(string); ok { + (*dstCfg)[i].ServiceType = val + } } if srcCfg["path"] != nil { - (*dstCfg)[i].Path = srcCfg["path"].(string) + if val, ok := srcCfg["path"].(string); ok { + (*dstCfg)[i].Path = val + } } if srcCfg["driver_path"] != nil { - (*dstCfg)[i].DriverPath = srcCfg["driver_path"].(string) + if val, ok := srcCfg["driver_path"].(string); ok { + (*dstCfg)[i].DriverPath = val + } } if srcCfg["host"] != nil { - (*dstCfg)[i].Host = srcCfg["host"].(string) + if val, ok := srcCfg["host"].(string); ok { + (*dstCfg)[i].Host = val + } } if srcCfg["port"] != nil { - (*dstCfg)[i].Port = srcCfg["port"].(int) + if val, ok := srcCfg["port"].(float64); ok { // Handle float64 to int conversion + (*dstCfg)[i].Port = int(val) + } } if srcCfg["proxy_url"] != nil { - (*dstCfg)[i].ProxyURL = srcCfg["proxy_url"].(string) + if val, ok := srcCfg["proxy_url"].(string); ok { + (*dstCfg)[i].ProxyURL = val + } } } } @@ -1530,28 +1572,44 @@ func combineVDICfg(dstCfg *[]Selenium, srcCfgIface interface{}) { func combineFileStorageCfg(dstCfg *FileStorageAPI, srcCfgIface interface{}) { srcCfg := srcCfgIface.(map[string]interface{}) if srcCfg["type"] != nil { - dstCfg.Type = srcCfg["type"].(string) + if val, ok := srcCfg["type"].(string); ok { + dstCfg.Type = val + } } if srcCfg["host"] != nil { - dstCfg.Host = srcCfg["host"].(string) + if val, ok := srcCfg["host"].(string); ok { + dstCfg.Host = val + } } if srcCfg["path"] != nil { - dstCfg.Path = srcCfg["path"].(string) + if val, ok := srcCfg["path"].(string); ok { + dstCfg.Path = val + } } if srcCfg["port"] != nil { - dstCfg.Port = srcCfg["port"].(int) + if val, ok := srcCfg["port"].(float64); ok { // Handle float64 to int conversion + dstCfg.Port = int(val) + } } - if srcCfg["region"] != "" { - dstCfg.Region = srcCfg["region"].(string) + if srcCfg["region"] != nil { + if val, ok := srcCfg["region"].(string); ok { + dstCfg.Region = val + } } - if srcCfg["token"] != "" { - dstCfg.Token = srcCfg["token"].(string) + if srcCfg["token"] != nil { + if val, ok := srcCfg["token"].(string); ok { + dstCfg.Token = val + } } - if srcCfg["secret"] != "" { - dstCfg.Secret = srcCfg["secret"].(string) + if srcCfg["secret"] != nil { + if val, ok := srcCfg["secret"].(string); ok { + dstCfg.Secret = val + } } - if srcCfg["timeout"] != 0 { - dstCfg.Timeout = srcCfg["timeout"].(int) + if srcCfg["timeout"] != nil { + if val, ok := srcCfg["timeout"].(float64); ok { // Handle float64 to int conversion + dstCfg.Timeout = int(val) + } } } @@ -1559,146 +1617,271 @@ func combineHTTPHeadersCfg(dstCfg *HTTPConfig, srcCfgIface interface{}) { srcCfg := srcCfgIface.(map[string]interface{}) if srcCfg["timeout"] != nil { - dstCfg.Timeout = srcCfg["timeout"].(int) + if val, ok := srcCfg["timeout"].(float64); ok { // Handle float64 to int conversion + dstCfg.Timeout = int(val) + } } if srcCfg["ssl_discovery"] != nil { - dstCfg.SSLDiscovery = srcCfg["ssl_discovery"].(SSLScoutConfig) + if val, ok := srcCfg["ssl_discovery"].(SSLScoutConfig); ok { + dstCfg.SSLDiscovery = val + } } if srcCfg["proxies"] != nil { - dstCfg.Proxies = 
srcCfg["proxies"].([]SOCKSProxy) + if val, ok := srcCfg["proxies"].([]interface{}); ok { + // Converting interface{} slice to []SOCKSProxy + proxies := make([]SOCKSProxy, len(val)) + for i, p := range val { + if proxy, ok := p.(SOCKSProxy); ok { + proxies[i] = proxy + } + } + dstCfg.Proxies = proxies + } } } func combineNIDNSCfg(dstCfg *DNSConfig, srcCfgIface interface{}) { srcCfg := srcCfgIface.(map[string]interface{}) + if srcCfg["enabled"] != nil { - dstCfg.Enabled = srcCfg["enabled"].(bool) + if val, ok := srcCfg["enabled"].(bool); ok { + dstCfg.Enabled = val + } } if srcCfg["timeout"] != nil { - dstCfg.Timeout = srcCfg["timeout"].(int) + if val, ok := srcCfg["timeout"].(float64); ok { // Handle float64 to int conversion + dstCfg.Timeout = int(val) + } } if srcCfg["rate_limit"] != nil { - dstCfg.RateLimit = srcCfg["rate_limit"].(string) + if val, ok := srcCfg["rate_limit"].(string); ok { + dstCfg.RateLimit = val + } } } func combineNIWHOISCfg(dstCfg *WHOISConfig, srcCfgIface interface{}) { srcCfg := srcCfgIface.(map[string]interface{}) + if srcCfg["enabled"] != nil { - dstCfg.Enabled = srcCfg["enabled"].(bool) + if val, ok := srcCfg["enabled"].(bool); ok { + dstCfg.Enabled = val + } } if srcCfg["timeout"] != nil { - dstCfg.Timeout = srcCfg["timeout"].(int) + if val, ok := srcCfg["timeout"].(float64); ok { // Handle float64 to int conversion + dstCfg.Timeout = int(val) + } } if srcCfg["rate_limit"] != nil { - dstCfg.RateLimit = srcCfg["rate_limit"].(string) + if val, ok := srcCfg["rate_limit"].(string); ok { + dstCfg.RateLimit = val + } } } func combineNINetLookupCfg(dstCfg *NetLookupConfig, srcCfgIface interface{}) { srcCfg := srcCfgIface.(map[string]interface{}) + if srcCfg["enabled"] != nil { - dstCfg.Enabled = srcCfg["enabled"].(bool) + if val, ok := srcCfg["enabled"].(bool); ok { + dstCfg.Enabled = val + } } if srcCfg["timeout"] != nil { - dstCfg.Timeout = srcCfg["timeout"].(int) + if val, ok := srcCfg["timeout"].(float64); ok { // Handle float64 to int conversion + dstCfg.Timeout = int(val) + } } if srcCfg["rate_limit"] != nil { - dstCfg.RateLimit = srcCfg["rate_limit"].(string) + if val, ok := srcCfg["rate_limit"].(string); ok { + dstCfg.RateLimit = val + } } } func combineNIServiceScoutCfg(dstCfg *ServiceScoutConfig, srcCfgIface interface{}) { srcCfg := srcCfgIface.(map[string]interface{}) + if srcCfg["aggressive_scan"] != nil { - dstCfg.AggressiveScan = srcCfg["aggressive_scan"].(bool) + if val, ok := srcCfg["aggressive_scan"].(bool); ok { + dstCfg.AggressiveScan = val + } } if srcCfg["connect_scan"] != nil { - dstCfg.ConnectScan = srcCfg["connect_scan"].(bool) + if val, ok := srcCfg["connect_scan"].(bool); ok { + dstCfg.ConnectScan = val + } } if srcCfg["dns_servers"] != nil { - dstCfg.DNSServers = srcCfg["dns_servers"].([]string) + if val, ok := srcCfg["dns_servers"].([]interface{}); ok { + dnsServers := make([]string, len(val)) + for i, v := range val { + if str, ok := v.(string); ok { + dnsServers[i] = str + } + } + dstCfg.DNSServers = dnsServers + } } if srcCfg["data_length"] != nil { - dstCfg.DataLength = srcCfg["data_length"].(int) + if val, ok := srcCfg["data_length"].(float64); ok { // Handle float64 to int conversion + dstCfg.DataLength = int(val) + } } if srcCfg["enabled"] != nil { - dstCfg.Enabled = srcCfg["enabled"].(bool) + if val, ok := srcCfg["enabled"].(bool); ok { + dstCfg.Enabled = val + } } if srcCfg["exclude_hosts"] != nil { - dstCfg.ExcludeHosts = srcCfg["exclude_hosts"].([]string) + if val, ok := srcCfg["exclude_hosts"].([]interface{}); ok { + 
excludeHosts := make([]string, len(val)) + for i, v := range val { + if str, ok := v.(string); ok { + excludeHosts[i] = str + } + } + dstCfg.ExcludeHosts = excludeHosts + } } if srcCfg["host_timeout"] != nil { - dstCfg.HostTimeout = srcCfg["host_timeout"].(string) + if val, ok := srcCfg["host_timeout"].(string); ok { + dstCfg.HostTimeout = val + } } if srcCfg["idle_scan"] != nil { - combineSSIdleScanCfg(&dstCfg.IdleScan, srcCfg["idle_scan"].(SSIdleScan)) + if val, ok := srcCfg["idle_scan"].(SSIdleScan); ok { + combineSSIdleScanCfg(&dstCfg.IdleScan, val) + } } if srcCfg["ip_fragment"] != nil { - dstCfg.IPFragment = srcCfg["ip_fragment"].(bool) + if val, ok := srcCfg["ip_fragment"].(bool); ok { + dstCfg.IPFragment = val + } } if srcCfg["max_parallelism"] != nil { - dstCfg.MaxParallelism = srcCfg["max_parallelism"].(int) + if val, ok := srcCfg["max_parallelism"].(float64); ok { // Handle float64 to int conversion + dstCfg.MaxParallelism = int(val) + } } if srcCfg["max_port_number"] != nil { - dstCfg.MaxPortNumber = srcCfg["max_port_number"].(int) + if val, ok := srcCfg["max_port_number"].(float64); ok { // Handle float64 to int conversion + dstCfg.MaxPortNumber = int(val) + } } if srcCfg["max_retries"] != nil { - dstCfg.MaxRetries = srcCfg["max_retries"].(int) + if val, ok := srcCfg["max_retries"].(float64); ok { // Handle float64 to int conversion + dstCfg.MaxRetries = int(val) + } } if srcCfg["min_rate"] != nil { - dstCfg.MinRate = srcCfg["min_rate"].(string) + if val, ok := srcCfg["min_rate"].(string); ok { + dstCfg.MinRate = val + } } if srcCfg["no_dns_resolution"] != nil { - dstCfg.NoDNSResolution = srcCfg["no_dns_resolution"].(bool) + if val, ok := srcCfg["no_dns_resolution"].(bool); ok { + dstCfg.NoDNSResolution = val + } } if srcCfg["os_fingerprinting"] != nil { - dstCfg.OSFingerprinting = srcCfg["os_fingerprinting"].(bool) + if val, ok := srcCfg["os_fingerprinting"].(bool); ok { + dstCfg.OSFingerprinting = val + } } if srcCfg["ping_scan"] != nil { - dstCfg.PingScan = srcCfg["ping_scan"].(bool) + if val, ok := srcCfg["ping_scan"].(bool); ok { + dstCfg.PingScan = val + } } if srcCfg["proxies"] != nil { - dstCfg.Proxies = srcCfg["proxies"].([]string) + if val, ok := srcCfg["proxies"].([]interface{}); ok { + proxies := make([]string, len(val)) + for i, v := range val { + if str, ok := v.(string); ok { + proxies[i] = str + } + } + dstCfg.Proxies = proxies + } } if srcCfg["randomize_hosts"] != nil { - dstCfg.RandomizeHosts = srcCfg["randomize_hosts"].(bool) + if val, ok := srcCfg["randomize_hosts"].(bool); ok { + dstCfg.RandomizeHosts = val + } } if srcCfg["scan_delay"] != nil { - dstCfg.ScanDelay = srcCfg["scan_delay"].(string) + if val, ok := srcCfg["scan_delay"].(string); ok { + dstCfg.ScanDelay = val + } } if srcCfg["scan_flags"] != nil { - dstCfg.ScanFlags = srcCfg["scan_flags"].(string) + if val, ok := srcCfg["scan_flags"].(string); ok { + dstCfg.ScanFlags = val + } } if srcCfg["script_scan"] != nil { - dstCfg.ScriptScan = srcCfg["script_scan"].([]string) + if val, ok := srcCfg["script_scan"].([]interface{}); ok { + scriptScan := make([]string, len(val)) + for i, v := range val { + if str, ok := v.(string); ok { + scriptScan[i] = str + } + } + dstCfg.ScriptScan = scriptScan + } } if srcCfg["service_db"] != nil { - dstCfg.ServiceDB = srcCfg["service_db"].(string) + if val, ok := srcCfg["service_db"].(string); ok { + dstCfg.ServiceDB = val + } } if srcCfg["service_detection"] != nil { - dstCfg.ServiceDetection = srcCfg["service_detection"].(bool) + if val, ok := 
srcCfg["service_detection"].(bool); ok { + dstCfg.ServiceDetection = val + } } if srcCfg["source_port"] != nil { - dstCfg.SourcePort = srcCfg["source_port"].(int) + if val, ok := srcCfg["source_port"].(float64); ok { // Handle float64 to int conversion + dstCfg.SourcePort = int(val) + } } if srcCfg["spoof_ip"] != nil { - dstCfg.SpoofIP = srcCfg["spoof_ip"].(string) + if val, ok := srcCfg["spoof_ip"].(string); ok { + dstCfg.SpoofIP = val + } } if srcCfg["syn_scan"] != nil { - dstCfg.SynScan = srcCfg["syn_scan"].(bool) + if val, ok := srcCfg["syn_scan"].(bool); ok { + dstCfg.SynScan = val + } } if srcCfg["targets"] != nil { - dstCfg.Targets = srcCfg["targets"].([]string) + if val, ok := srcCfg["targets"].([]interface{}); ok { + targets := make([]string, len(val)) + for i, v := range val { + if str, ok := v.(string); ok { + targets[i] = str + } + } + dstCfg.Targets = targets + } } if srcCfg["timeout"] != nil { - dstCfg.Timeout = srcCfg["timeout"].(int) + if val, ok := srcCfg["timeout"].(float64); ok { // Handle float64 to int conversion + dstCfg.Timeout = int(val) + } } if srcCfg["timing_template"] != nil { - dstCfg.TimingTemplate = srcCfg["timing_template"].(string) + if val, ok := srcCfg["timing_template"].(string); ok { + dstCfg.TimingTemplate = val + } } if srcCfg["udp_scan"] != nil { - dstCfg.UDPScan = srcCfg["udp_scan"].(bool) + if val, ok := srcCfg["udp_scan"].(bool); ok { + dstCfg.UDPScan = val + } } } diff --git a/pkg/config/config_test.go b/pkg/config/config_test.go index f775e2e..0976316 100644 --- a/pkg/config/config_test.go +++ b/pkg/config/config_test.go @@ -1204,7 +1204,7 @@ func TestConfigString(t *testing.T) { } // Define the expected string representation of the config - expected := "Config{Remote: {https://example.com /api 8080 us-west-1 mytoken 0 }, Database: { 0 testuser testpassword 0 0 0 0}, Crawler: {0 0 0 false false 0 0 0 0 0 0 false false false false false false 0 false { 0 0 0 0 0}}, API: { 0 0 false false false 0 0 0 false}, Selenium: [{ chrome 4444 false false }], RulesetsSchemaPath: path/to/schema, Rulesets: [], ImageStorageAPI: { 0 0 }, FileStorageAPI: { 0 0 }, HTTPHeaders: {false 0 false {false false false false false false false false false false false false false false false false} []}, NetworkInfo: {{false 0 } {false 0 } {false 0 } {false 0 { 0} false false false false false false false false [] [] [] 0 0 0 false 0 false false 0 [] []} {false 0 } { }}, OS: linux, DebugLevel: 1}" + expected := "Config{Remote: {https://example.com /api 8080 us-west-1 mytoken 0 }, Database: { 0 testuser testpassword 0 0 0 0}, Crawler: {0 0 0 false false 0 0 0 0 0 0 0 false false false false false false 0 false { 0 0 0 0 0}}, API: { 0 0 false false false 0 0 0 false}, Selenium: [{ chrome 4444 false false }], RulesetsSchemaPath: path/to/schema, Rulesets: [], ImageStorageAPI: { 0 0 }, FileStorageAPI: { 0 0 }, HTTPHeaders: {false 0 false {false false false false false false false false false false false false false false false false} []}, NetworkInfo: {{false 0 } {false 0 } {false 0 } {false 0 { 0} false false false false false false false false [] [] [] 0 0 0 false 0 false false 0 [] []} {false 0 } { }}, OS: linux, DebugLevel: 1}" // Call the String method on the config result := config.String() diff --git a/pkg/config/types.go b/pkg/config/types.go index 0986ccf..12413dc 100644 --- a/pkg/config/types.go +++ b/pkg/config/types.go @@ -57,6 +57,7 @@ type Crawler struct { ScreenshotMaxHeight int `json:"screenshot_max_height" yaml:"screenshot_max_height"` // Maximum height of the 
screenshot ScreenshotSectionWait int `json:"screenshot_section_wait" yaml:"screenshot_section_wait"` // Time to wait before taking a screenshot of a section in seconds MaxDepth int `json:"max_depth" yaml:"max_depth"` // Maximum depth to crawl + MaxLinks int `json:"max_links" yaml:"max_links"` // Maximum number of links to crawl per Source MaxSources int `json:"max_sources" yaml:"max_sources"` // Maximum number of sources to crawl Delay string `json:"delay" yaml:"delay"` // Delay between requests (in seconds) BrowsingMode string `json:"browsing_mode" yaml:"browsing_mode"` // Browsing type (e.g., "recursive", "human", "fuzzing") diff --git a/pkg/crawler/action_rules.go b/pkg/crawler/action_rules.go index ffc6560..6595897 100644 --- a/pkg/crawler/action_rules.go +++ b/pkg/crawler/action_rules.go @@ -18,8 +18,6 @@ package crawler import ( "fmt" - "regexp" - "strconv" "strings" "time" @@ -132,7 +130,7 @@ func executeActionRule(ctx *ProcessContext, r *rules.ActionRule, wd *selenium.We if len(r.WaitConditions) != 0 { for _, wc := range r.WaitConditions { // Execute the wait condition - err := executeWaitCondition(ctx, &wc, wd) + err := WaitForCondition(ctx, wd, wc) if err != nil { return err } @@ -435,27 +433,6 @@ func executeActionScrollByAmount(r *rules.ActionRule, wd *selenium.WebDriver) er return err } -// executeWaitCondition is responsible for executing a "wait" condition -func executeWaitCondition(ctx *ProcessContext, r *rules.WaitCondition, wd *selenium.WebDriver) error { - // Execute the wait condition - switch strings.ToLower(strings.TrimSpace(r.ConditionType)) { - case "element": - return nil - case "delay": - return nil - case "plugin_call": - plugin, exists := ctx.re.JSPlugins.GetPlugin(r.Value) - if !exists { - return fmt.Errorf("plugin not found: %s", r.Value) - } - pluginCode := plugin.String() - _, err := (*wd).ExecuteScript(pluginCode, nil) - return err - default: - return fmt.Errorf("wait condition not supported: %s", r.ConditionType) - } -} - // executeActionClick is responsible for executing a "click" action func executeActionClick(ctx *ProcessContext, r *rules.ActionRule, wd *selenium.WebDriver, button int) error { var err error @@ -843,7 +820,7 @@ func findElementBySelectorType(ctx *ProcessContext, wd *selenium.WebDriver, sele var err error var selector rules.Selector for _, selector = range selectors { - wdf, err = findElementByType(ctx, wd, selector) + wdf, err = FindElementByType(ctx, wd, selector) if err == nil && wdf != nil { break } @@ -852,126 +829,6 @@ func findElementBySelectorType(ctx *ProcessContext, wd *selenium.WebDriver, sele return wdf, selector, err } -func findElementByType(ctx *ProcessContext, wd *selenium.WebDriver, selector rules.Selector) (selenium.WebElement, error) { - var elements []selenium.WebElement - var err error - selectorType := strings.TrimSpace(selector.SelectorType) - switch strings.ToLower(selectorType) { - case "css": - elements, err = (*wd).FindElements(selenium.ByCSSSelector, selector.Selector) - case "id": - elements, err = (*wd).FindElements(selenium.ByID, selector.Selector) - case "name": - elements, err = (*wd).FindElements(selenium.ByName, selector.Selector) - case "linktext", "link_text": - elements, err = (*wd).FindElements(selenium.ByLinkText, selector.Selector) - case "partiallinktext", "partial_link_text": - elements, err = (*wd).FindElements(selenium.ByPartialLinkText, selector.Selector) - case "tagname", "tag_name", "tag", "element": - elements, err = (*wd).FindElements(selenium.ByTagName, selector.Selector) - case "class", 
"classname", "class_name": - elements, err = (*wd).FindElements(selenium.ByClassName, selector.Selector) - case "js_path": - js := fmt.Sprintf("return document.querySelector(\"%s\");", selector.Selector) - res, err := (*wd).ExecuteScript(js, nil) - if err != nil { - return nil, fmt.Errorf("error executing JavaScript: %v", err) - } - if element, ok := res.(selenium.WebElement); ok { - elements = append(elements, element) - } else { - return nil, fmt.Errorf("no element found for JS Path: %s", selector.Selector) - } - case "xpath": - elements, err = (*wd).FindElements(selenium.ByXPATH, selector.Selector) - default: - return nil, fmt.Errorf("unsupported selector type: %s", selectorType) - } - if err != nil { - return nil, fmt.Errorf("error finding element: %v", err) - } - - // Check for the Value if provided - var element selenium.WebElement - for _, e := range elements { - matchL2 := false - if strings.TrimSpace(selector.Attribute.Name) != "" { - attrValue, _ := e.GetAttribute(strings.TrimSpace(selector.Attribute.Name)) - if strings.EqualFold(strings.TrimSpace(attrValue), strings.TrimSpace(selector.Attribute.Value)) { - matchL2 = true - } - } else { - matchL2 = true - } - matchL3 := false - if matchL2 && strings.TrimSpace(selector.Value) != "" { - if matchValue(ctx, e, selector) { - matchL3 = true - } - } else { - if matchL2 { - matchL3 = true - } - } - if matchL3 { - element = e - break - } - } - if element == nil { - return element, fmt.Errorf("element '%s' Not found", selector.Selector) - } - - return element, nil -} - -func matchValue(ctx *ProcessContext, wdf selenium.WebElement, selector rules.Selector) bool { - // Precompute the common value for comparison - wdfText, err := wdf.Text() - if err != nil { - return false - } - - // Check if the selector value is one of the special cases - selValue := strings.TrimSpace(selector.Value) - var rValue cmn.EnvValue - if strings.HasPrefix(selValue, "{{") && strings.HasSuffix(selValue, "}}") { - rValue, err = cmn.ProcessEnvTemplate(selValue, ctx.GetContextID()) - if err != nil { - selValue = "" - } else { - // We have a match, let's update the selector value - switch rValue.Type { - case "string": - selValue = rValue.Value.(string) - case "int": - selValue = strconv.Itoa(rValue.Value.(int)) - case "float": - selValue = fmt.Sprintf("%f", rValue.Value.(float64)) - case "bool": - selValue = fmt.Sprintf("%t", rValue.Value.(bool)) - case "[]string": - selValue = strings.Join(rValue.Value.([]string), "|") - case "[]int": - selValue = cmn.IntSliceToString(rValue.Value.([]int), "|") - case "[]float64": - selValue = cmn.Float64SliceToString(rValue.Value.([]float64), "|") - case "[]float32": - selValue = cmn.Float32SliceToString(rValue.Value.([]float32), "|") - case "[]bool": - selValue = cmn.BoolSliceToString(rValue.Value.([]bool), "|") - } - - } - } - - //cmn.DebugMsg(cmn.DbgLvlDebug3, "Selector Value Resolved: '%s'", selValue) - - // Use Regex to match the selValue against the wdfText - regEx := regexp.MustCompile(selValue) - return regEx.MatchString(wdfText) -} - func DefaultActionConfig(url string) cfg.SourceConfig { return cfg.SourceConfig{ FormatVersion: "1.0", diff --git a/pkg/crawler/common.go b/pkg/crawler/common.go new file mode 100644 index 0000000..a79517d --- /dev/null +++ b/pkg/crawler/common.go @@ -0,0 +1,279 @@ +// Copyright 2023 Paolo Fabio Zaino +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package crawler implements the crawling logic of the application. +// It's responsible for crawling a website and extracting information from it. +package crawler + +import ( + "fmt" + "regexp" + "strconv" + "strings" + "time" + + cmn "github.com/pzaino/thecrowler/pkg/common" + exi "github.com/pzaino/thecrowler/pkg/exprterpreter" + rs "github.com/pzaino/thecrowler/pkg/ruleset" + rules "github.com/pzaino/thecrowler/pkg/ruleset" + "github.com/tebeka/selenium" +) + +// FindElementByType finds an element by the provided selector type +// and returns it if found, otherwise it returns an error. +func FindElementByType(ctx *ProcessContext, wd *selenium.WebDriver, selector rules.Selector) (selenium.WebElement, error) { + var elements []selenium.WebElement + var err error + selectorType := strings.TrimSpace(selector.SelectorType) + switch strings.ToLower(selectorType) { + case "css": + elements, err = (*wd).FindElements(selenium.ByCSSSelector, selector.Selector) + case "id": + elements, err = (*wd).FindElements(selenium.ByID, selector.Selector) + case "name": + elements, err = (*wd).FindElements(selenium.ByName, selector.Selector) + case "linktext", "link_text": + elements, err = (*wd).FindElements(selenium.ByLinkText, selector.Selector) + case "partiallinktext", "partial_link_text": + elements, err = (*wd).FindElements(selenium.ByPartialLinkText, selector.Selector) + case "tagname", "tag_name", "tag", "element": + elements, err = (*wd).FindElements(selenium.ByTagName, selector.Selector) + case "class", "classname", "class_name": + elements, err = (*wd).FindElements(selenium.ByClassName, selector.Selector) + case "js_path": + js := fmt.Sprintf("return document.querySelector(\"%s\");", selector.Selector) + res, err := (*wd).ExecuteScript(js, nil) + if err != nil { + return nil, fmt.Errorf("error executing JavaScript: %v", err) + } + if element, ok := res.(selenium.WebElement); ok { + elements = append(elements, element) + } else { + return nil, fmt.Errorf("no element found for JS Path: %s", selector.Selector) + } + case "xpath": + elements, err = (*wd).FindElements(selenium.ByXPATH, selector.Selector) + default: + return nil, fmt.Errorf("unsupported selector type: %s", selectorType) + } + if err != nil { + return nil, fmt.Errorf("error finding element: %v", err) + } + + // Check for the Value if provided + var element selenium.WebElement + for _, e := range elements { + matchL2 := false + if strings.TrimSpace(selector.Attribute.Name) != "" { + attrValue, _ := e.GetAttribute(strings.TrimSpace(selector.Attribute.Name)) + matchValue := strings.TrimSpace(selector.Attribute.Value) + if matchValue != "" && matchValue != "*" && matchValue != ".*" { + if strings.EqualFold(strings.TrimSpace(attrValue), strings.TrimSpace(selector.Attribute.Value)) { + matchL2 = true + } + } else { + matchL2 = true + } + } else { + matchL2 = true + } + matchL3 := false + if matchL2 && strings.TrimSpace(selector.Value) != "" { + if matchValue(ctx, e, selector) { + matchL3 = true + } + } else { + if matchL2 { + matchL3 = true + } + } + if matchL3 { + element = e + break + } + } + if element == nil { + 
return element, fmt.Errorf("element '%s' Not found", selector.Selector) + } + + return element, nil +} + +// FindElementsByType finds all elements by the provided selector type +// and returns them, otherwise it returns an error. +func FindElementsByType(ctx *ProcessContext, wd *selenium.WebDriver, selector rules.Selector) ([]selenium.WebElement, error) { + var elements []selenium.WebElement + var err error + selectorType := strings.TrimSpace(selector.SelectorType) + switch strings.ToLower(selectorType) { + case "css": + cmn.DebugMsg(cmn.DbgLvlDebug3, "Finding elements by CSS Selector: '%s'", selector.Selector) + elements, err = (*wd).FindElements(selenium.ByCSSSelector, selector.Selector) + case "id": + elements, err = (*wd).FindElements(selenium.ByID, selector.Selector) + case "name": + elements, err = (*wd).FindElements(selenium.ByName, selector.Selector) + case "linktext", "link_text": + elements, err = (*wd).FindElements(selenium.ByLinkText, selector.Selector) + case "partiallinktext", "partial_link_text": + elements, err = (*wd).FindElements(selenium.ByPartialLinkText, selector.Selector) + case "tagname", "tag_name", "tag", "element": + elements, err = (*wd).FindElements(selenium.ByTagName, selector.Selector) + case "class", "classname", "class_name": + elements, err = (*wd).FindElements(selenium.ByClassName, selector.Selector) + case "js_path": + js := fmt.Sprintf("return document.querySelector(\"%s\");", selector.Selector) + res, err := (*wd).ExecuteScript(js, nil) + if err != nil { + return nil, fmt.Errorf("error executing JavaScript: %v", err) + } + if element, ok := res.(selenium.WebElement); ok { + elements = append(elements, element) + } else { + return nil, fmt.Errorf("no element found for JS Path: %s", selector.Selector) + } + case "plugin_call": + // Call the plugin + pluginName := strings.TrimSpace(selector.Selector) + plugin, exists := ctx.re.JSPlugins.GetPlugin(pluginName) + if !exists { + return nil, fmt.Errorf("plugin not found: %s", pluginName) + } + pluginCode := plugin.String() + result, err := (*wd).ExecuteScript(pluginCode, nil) + if err != nil { + return nil, fmt.Errorf("error executing plugin: %v", err) + } + // result should contain the elements, but it's an interface{} so we need to convert it + var ok bool + if elements, ok = result.([]selenium.WebElement); !ok { + return nil, fmt.Errorf("plugin did not return a list of elements") + } + case "xpath": + elements, err = (*wd).FindElements(selenium.ByXPATH, selector.Selector) + default: + return nil, fmt.Errorf("unsupported selector type: %s", selectorType) + } + if err != nil { + return nil, fmt.Errorf("error finding element: %v", err) + } + + // Check for the Value if provided + for i, e := range elements { + matchL2 := false + if strings.TrimSpace(selector.Attribute.Name) != "" { + attrValue, _ := e.GetAttribute(strings.TrimSpace(selector.Attribute.Name)) + matchValue := strings.TrimSpace(selector.Attribute.Value) + if matchValue != "" && matchValue != "*" && matchValue != ".*" { + if strings.EqualFold(strings.TrimSpace(attrValue), strings.TrimSpace(selector.Attribute.Value)) { + matchL2 = true + } + } else { + matchL2 = true + } + } else { + matchL2 = true + } + matchL3 := false + if matchL2 && strings.TrimSpace(selector.Value) != "" { + if matchValue(ctx, e, selector) { + matchL3 = true + } + } else { + if matchL2 { + matchL3 = true + } + } + if !matchL3 { + // Remove the element from the list + elements = append(elements[:i], elements[i+1:]...) 
+ } + } + if len(elements) == 0 { + return elements, fmt.Errorf("element '%s' Not found", selector.Selector) + } + + return elements, nil +} + +func matchValue(ctx *ProcessContext, wdf selenium.WebElement, selector rules.Selector) bool { + // Precompute the common value for comparison + wdfText, err := wdf.Text() + if err != nil { + return false + } + + // Check if the selector value is one of the special cases + selValue := strings.TrimSpace(selector.Value) + var rValue cmn.EnvValue + if strings.HasPrefix(selValue, "{{") && strings.HasSuffix(selValue, "}}") { + rValue, err = cmn.ProcessEnvTemplate(selValue, ctx.GetContextID()) + if err != nil { + selValue = "" + } else { + // We have a match, let's update the selector value + switch rValue.Type { + case "string": + selValue = rValue.Value.(string) + case "int": + selValue = strconv.Itoa(rValue.Value.(int)) + case "float": + selValue = fmt.Sprintf("%f", rValue.Value.(float64)) + case "bool": + selValue = fmt.Sprintf("%t", rValue.Value.(bool)) + case "[]string": + selValue = strings.Join(rValue.Value.([]string), "|") + case "[]int": + selValue = cmn.IntSliceToString(rValue.Value.([]int), "|") + case "[]float64": + selValue = cmn.Float64SliceToString(rValue.Value.([]float64), "|") + case "[]float32": + selValue = cmn.Float32SliceToString(rValue.Value.([]float32), "|") + case "[]bool": + selValue = cmn.BoolSliceToString(rValue.Value.([]bool), "|") + } + + } + } + + //cmn.DebugMsg(cmn.DbgLvlDebug3, "Selector Value Resolved: '%s'", selValue) + + // Use Regex to match the selValue against the wdfText + regEx := regexp.MustCompile(selValue) + return regEx.MatchString(wdfText) +} + +func WaitForCondition(ctx *ProcessContext, wd *selenium.WebDriver, r rs.WaitCondition) error { + // Execute the wait condition + switch strings.ToLower(strings.TrimSpace(r.ConditionType)) { + case "element": + return nil + case "delay": + delay := exi.GetFloat(r.Value) + if delay > 0 { + time.Sleep(time.Duration(delay) * time.Second) + } + return nil + case "plugin_call": + plugin, exists := ctx.re.JSPlugins.GetPlugin(r.Value) + if !exists { + return fmt.Errorf("plugin not found: %s", r.Value) + } + pluginCode := plugin.String() + _, err := (*wd).ExecuteScript(pluginCode, nil) + return err + default: + return fmt.Errorf("wait condition not supported: %s", r.ConditionType) + } +} diff --git a/pkg/crawler/crawler.go b/pkg/crawler/crawler.go index d936132..91f8974 100644 --- a/pkg/crawler/crawler.go +++ b/pkg/crawler/crawler.go @@ -252,6 +252,16 @@ func CrawlWebsite(args Pars, sel SeleniumInstance, releaseSelenium chan<- Seleni // Prepare for the next iteration processCtx.linksMutex.Lock() if len(processCtx.newLinks) > 0 { + // If MaxLinks is set, limit the number of new links + if processCtx.config.Crawler.MaxLinks > 0 && ((processCtx.Status.TotalPages + len(processCtx.newLinks)) > processCtx.config.Crawler.MaxLinks) { + linksToCrawl := processCtx.config.Crawler.MaxLinks - processCtx.Status.TotalPages + if linksToCrawl <= 0 { + // Remove all new links + processCtx.newLinks = []LinkItem{} + } else { + processCtx.newLinks = processCtx.newLinks[:linksToCrawl] + } + } newLinksFound = len(processCtx.newLinks) processCtx.Status.TotalLinks += newLinksFound allLinks = processCtx.newLinks @@ -1362,6 +1372,9 @@ func extractPageInfo(webPage *selenium.WebDriver, ctx *ProcessContext, docType s metaTags := []MetaTag{} scrapedList := []ScrapedItem{} + // Copy the current webPage object + webPageCopy := *webPage + // Get the HTML content of the page if docTypeIsHTML(objType) { 
htmlContent, _ = (*webPage).PageSource() @@ -1376,7 +1389,7 @@ func extractPageInfo(webPage *selenium.WebDriver, ctx *ProcessContext, docType s var url string url, err = (*webPage).CurrentURL() if err == nil { - scrapedData = processScrapingRules(webPage, ctx, url) + scrapedData = processScrapingRules(&webPageCopy, ctx, url) } if scrapedData != "" { // put ScrapedData into a map @@ -1695,6 +1708,10 @@ func worker(processCtx *ProcessContext, id int, jobs chan LinkItem) { // Loop over the jobs channel and process each job for url := range jobs { + if processCtx.config.Crawler.MaxLinks > 0 && (processCtx.Status.TotalPages >= processCtx.config.Crawler.MaxLinks) { + cmn.DebugMsg(cmn.DbgLvlDebug, "Worker %d: Stopping due reached max_links limit: %d\n", id, processCtx.Status.TotalPages) + break + } // Check if the URL should be skipped skip := skipURL(processCtx, id, url.Link) @@ -1753,6 +1770,10 @@ func worker(processCtx *ProcessContext, id int, jobs chan LinkItem) { processCtx.Status.LastDelay = delay time.Sleep(time.Duration(delay) * time.Second) } + if processCtx.config.Crawler.MaxLinks > 0 && (processCtx.Status.TotalPages >= processCtx.config.Crawler.MaxLinks) { + cmn.DebugMsg(cmn.DbgLvlDebug, "Worker %d: Stopping due reached max_links limit: %d\n", id, processCtx.Status.TotalPages) + break + } } } @@ -2433,11 +2454,9 @@ func ReturnSeleniumInstance(wg *sync.WaitGroup, pCtx *ProcessContext, sel *Selen if (*pCtx).Status.CrawlingRunning == 1 { QuitSelenium((&(*pCtx).wd)) if *(*pCtx).sel != nil { - //*(*pCtx).sel <- (*sel) releaseSelenium <- (*sel) } (*pCtx).Status.CrawlingRunning = 2 - //wg.Done() } } diff --git a/pkg/crawler/scraper.go b/pkg/crawler/scraper.go index e10cec4..8eeef4a 100644 --- a/pkg/crawler/scraper.go +++ b/pkg/crawler/scraper.go @@ -31,35 +31,17 @@ import ( rs "github.com/pzaino/thecrowler/pkg/ruleset" "github.com/tebeka/selenium" - "github.com/PuerkitoBio/goquery" - "github.com/antchfx/htmlquery" "golang.org/x/net/html" ) // ApplyRule applies the provided scraping rule to the provided web page. 
func ApplyRule(ctx *ProcessContext, rule *rs.ScrapingRule, webPage *selenium.WebDriver) map[string]interface{} { // Debug message - cmn.DebugMsg(cmn.DbgLvlInfo, "Applying rule: %v", rule.RuleName) + cmn.DebugMsg(cmn.DbgLvlInfo, "Applying scraping rule: %v", rule.RuleName) // Initialize a map to hold the extracted data extractedData := make(map[string]interface{}) - // Prepare content for goquery: - htmlContent, _ := (*webPage).PageSource() - doc, err := goquery.NewDocumentFromReader(strings.NewReader(htmlContent)) - if err != nil { - cmn.DebugMsg(cmn.DbgLvlError, "loading HTML content: %v", err) - return extractedData - } - - // Parse the HTML content - node, err := htmlquery.Parse(strings.NewReader(htmlContent)) - if err != nil { - // handle error - cmn.DebugMsg(cmn.DbgLvlError, "parsing HTML content: %v", err) - return extractedData - } - // Iterate over the elements to be extracted for _, elementSet := range rule.Elements { key := elementSet.Key @@ -68,32 +50,9 @@ func ApplyRule(ctx *ProcessContext, rule *rs.ScrapingRule, webPage *selenium.Web var allExtracted []string for _, element := range selectors { selectorType := strings.ToLower(strings.TrimSpace(element.SelectorType)) - selector := element.Selector getAllOccurrences := element.ExtractAllOccurrences - var extracted []string - switch selectorType { - case "css": - extracted = extractByCSS(doc, selector, getAllOccurrences) - case "xpath": - extracted = extractByXPath(node, selector, getAllOccurrences) - case "id": - extracted = extractByCSS(doc, "#"+selector, getAllOccurrences) - case "class", "class_name": - extracted = extractByCSS(doc, "."+selector, getAllOccurrences) - case "name": - extracted = extractByCSS(doc, "[name="+selector+"]", getAllOccurrences) - case "tag": - extracted = extractByCSS(doc, selector, getAllOccurrences) - case "link_text", "partial_link_text": - extracted = extractByCSS(doc, "a:contains('"+selector+"')", getAllOccurrences) - case "regex": - extracted = extractByRegex(htmlContent, selector, getAllOccurrences) - case "plugin_call": - extracted = extractByPlugin(ctx, webPage, selector) - default: - extracted = []string{} - } + extracted := extractContent(ctx, webPage, element, getAllOccurrences) if len(extracted) > 0 { allExtracted = append(allExtracted, extracted...) if !getAllOccurrences || selectorType == "plugin_call" { @@ -102,7 +61,26 @@ func ApplyRule(ctx *ProcessContext, rule *rs.ScrapingRule, webPage *selenium.Web } } if len(allExtracted) > 0 { - extractedData[key] = allExtracted + // Ensure that allExtracted is JSON valid + if len(allExtracted) == 1 { + // Check if it's a JSON string + if json.Valid([]byte(allExtracted[0])) { + var jsonData map[string]interface{} + if err := json.Unmarshal([]byte(allExtracted[0]), &jsonData); err == nil { + extractedData[key] = jsonData + continue + } + } + } else { + // Check if it's a JSON array + if json.Valid([]byte("[" + strings.Join(allExtracted, ",") + "]")) { + var jsonData []interface{} + if err := json.Unmarshal([]byte("["+strings.Join(allExtracted, ",")+"]"), &jsonData); err == nil { + extractedData[key] = jsonData + continue + } + } + } } } @@ -260,81 +238,32 @@ func lintScript(scriptContent string) []string { return errors } -// extractByCSS extracts the content from the provided document using the provided CSS selector. -func extractByCSS(doc *goquery.Document, selector string, all bool) []string { +// extractContent extracts the content from the provided document using the provided CSS selector. 
+func extractContent(ctx *ProcessContext, wd *selenium.WebDriver, selector rs.Selector, all bool) []string { var results []string + var elements []selenium.WebElement + var err error if all { - doc.Find(selector).Each(func(i int, s *goquery.Selection) { - results = append(results, s.Text()) - }) + elements, err = FindElementsByType(ctx, wd, selector) + if err != nil { + cmn.DebugMsg(cmn.DbgLvlError, "Error finding elements: %v", err) + return results + } } else { - if selection := doc.Find(selector).First(); selection.Length() > 0 { - results = append(results, selection.Text()) + element, err := FindElementByType(ctx, wd, selector) + if err != nil { + cmn.DebugMsg(cmn.DbgLvlError, "Error finding element: %v", err) + return results } + elements = append(elements, element) } - return results -} - -func extractByXPath(node *html.Node, selector string, all bool) []string { - var results []string - elements, err := htmlquery.QueryAll(node, selector) - if err != nil { - // handle error - return results - } - if all { - for _, element := range elements { - results = append(results, htmlquery.InnerText(element)) - } - } else if len(elements) > 0 { - results = append(results, htmlquery.InnerText(elements[0])) + for _, element := range elements { + text, _ := element.Text() + results = append(results, text) } return results } -func extractByRegex(content string, pattern string, all bool) []string { - re := regexp.MustCompile(pattern) - if all { - return re.FindAllString(content, -1) - } - if match := re.FindString(content); match != "" { - return []string{match} - } - - return []string{} -} - -func extractByPlugin(ctx *ProcessContext, wd *selenium.WebDriver, selector string) []string { - // Retrieve the JS plugin - plugin, exists := ctx.re.JSPlugins.GetPlugin(selector) - if !exists { - return []string{} - } - - // Execute the plugin - value, err := (*wd).ExecuteScript(plugin.String(), nil) - if err != nil { - cmn.DebugMsg(cmn.DbgLvlError, "Error executing JS plugin: %v", err) - return []string{} - } - // Transform value to a string - valueStr := fmt.Sprintf("%v", value) - - // Check if the valueSTr is a valid JSON - if json.Valid([]byte(valueStr)) { - return []string{valueStr} - } - - // Check if the result can be converted to a valid JSON - if !json.Valid([]byte(valueStr)) { - // transform the valueStr to a valid JSON - valueStr = fmt.Sprintf("{\"plugin_scrap\": \"%v\"}", valueStr) - } - - // It seems we were unable to retrieve the result from the JS output - return []string{valueStr} -} - // ApplyRulesGroup extracts the data from the provided web page using the provided a rule group. 
func ApplyRulesGroup(ctx *ProcessContext, ruleGroup *rs.RuleGroup, url string, webPage *selenium.WebDriver) (map[string]interface{}, error) { // Initialize a map to hold the extracted data diff --git a/pkg/crawler/scraping_rules.go b/pkg/crawler/scraping_rules.go index 3a32c85..c65596d 100644 --- a/pkg/crawler/scraping_rules.go +++ b/pkg/crawler/scraping_rules.go @@ -164,7 +164,7 @@ func executeScrapingRule(ctx *ProcessContext, r *rules.ScrapingRule, func executeWaitConditions(ctx *ProcessContext, conditions []rules.WaitCondition, wd *selenium.WebDriver) error { for _, wc := range conditions { - err := executeWaitCondition(ctx, &wc, wd) + err := WaitForCondition(ctx, wd, wc) if err != nil { return fmt.Errorf("executing wait condition: %v", err) } diff --git a/pkg/ruleset/common.go b/pkg/ruleset/common.go index ad09ecd..924c4f8 100644 --- a/pkg/ruleset/common.go +++ b/pkg/ruleset/common.go @@ -23,7 +23,9 @@ import ( "net/url" "os" "path/filepath" + "regexp" "strings" + "sync" "time" cmn "github.com/pzaino/thecrowler/pkg/common" @@ -526,26 +528,51 @@ func PreparePathForSearch(path string) (string, error) { return strings.ToLower(strings.TrimSpace(path)), nil } -func IsValidURL(urlStr string) bool { +// IsURL checks if the provided string is a URL or a pattern to match URLs. +// It returns false if it's not a URL or a pattern to match URLs, otherwise it returns true. +func IsURL(urlStr string) bool { urlStr = strings.TrimSpace(urlStr) if urlStr == "" { return false + } else if urlStr == "*" { + return true } - _, err := url.Parse(urlStr) - if err != nil { + var re *regexp.Regexp + var once sync.Once + + // Define a function to compile the regex only once + once.Do(func() { + // The pattern below is capable of matching regEx patterns used to match URLs + // I understand this might be mind-blowing for somebody, so trust me, it works. + urlHeaders := "(?i)[\\^]?[\\s]*(\\{0,2}http[s]?[\\[s\\]]?.*:|\\{0,2}ftp[s]?:|\\{0,2}www\\.|\\.[a-z]{2,})" + + re = regexp.MustCompile(urlHeaders) + }) + + // Check whether urlStr matches the pattern, i.e. whether it's a URL or a pattern to match URLs + //return re.MatchString(urlStr) + x := re.MatchString(urlStr) + cmn.DebugMsg(cmn.DbgLvlDebug3, "IsURL: %s -> %v", urlStr, x) + return x +} + +// CheckURL checks whether the provided URL matches the provided URL pattern.
+func CheckURL(urlStr, urlPattern string) bool { + if !IsURL(urlPattern) { return false } - // Check if it has a valid scheme - // "*" is a valid here as it's a wildcard - if urlStr != "*" && - !strings.HasPrefix(urlStr, "http://") && - !strings.HasPrefix(urlStr, "https://") && - !strings.HasPrefix(urlStr, "ftp://") && - !strings.HasPrefix(urlStr, "ftps://") { + if urlPattern == "*" { + return true + } + + // Check if the URL matches the pattern + matched, err := regexp.MatchString(urlPattern, urlStr) + if err != nil { + cmn.DebugMsg(cmn.DbgLvlError, "error matching URL: %v", err) return false } - return true + return matched } diff --git a/pkg/ruleset/rulesengine.go b/pkg/ruleset/rulesengine.go index 143a8a8..62a1c3e 100644 --- a/pkg/ruleset/rulesengine.go +++ b/pkg/ruleset/rulesengine.go @@ -21,6 +21,7 @@ import ( "encoding/json" "fmt" "net/url" + "regexp" "strings" "sync" "time" @@ -470,9 +471,13 @@ func (re *RuleEngine) GetAllScrapingRulesByURL(url, CtxID string) []ScrapingRule } for _, rule := range rules { - for i2 := 0; i2 < len(rule.PreConditions); i2++ { - if strings.HasPrefix(url, strings.ToLower(strings.TrimSpace(rule.PreConditions[i2].URL))) { - scrapingRules = append(scrapingRules, rule) + if CheckURL(url, rule.RuleName) { + scrapingRules = append(scrapingRules, rule) + } else { + for i2 := 0; i2 < len(rule.PreConditions); i2++ { + if CheckURL(url, rule.PreConditions[i2].URL) { + scrapingRules = append(scrapingRules, rule) + } } } } @@ -560,12 +565,12 @@ func (re *RuleEngine) GetRulesetByURL(urlStr string) (*Ruleset, error) { } for i := 0; i < len((*re).Rulesets); i++ { - rsName := strings.ToLower(strings.TrimSpace(re.Rulesets[i].Name)) - if rsName == "" || !IsValidURL(rsName) { + rsName := strings.TrimSpace(re.Rulesets[i].Name) + if rsName == "" || !IsURL(rsName) { continue } - //cmn.DebugMsg(cmn.DbgLvlDebug2, "Checking ruleset: '%s' == '%s'", rsName, parsedURL) - if strings.HasPrefix(parsedURL, rsName) || rsName == "*" || strings.Contains(parsedURL, rsName) { + cmn.DebugMsg(cmn.DbgLvlDebug2, "Checking ruleset: '%s' == '%s'", rsName, parsedURL) + if CheckURL(parsedURL, rsName) { return &re.Rulesets[i], nil } } @@ -610,11 +615,13 @@ func (re *RuleEngine) GetRuleGroupByURL(urlStr string) (*RuleGroup, error) { } for _, rg := range re.GetAllRuleGroups() { - rgName := strings.ToLower(strings.TrimSpace(rg.GroupName)) - if rgName == "" || !IsValidURL(rgName) { + rgName := strings.TrimSpace(rg.GroupName) + if rgName == "" || !IsURL(rgName) { continue } - if strings.HasPrefix(parsedURL, rgName) || rgName == "*" { + cmn.DebugMsg(cmn.DbgLvlDebug2, "Checking rule group: '%s' == '%s'", rgName, parsedURL) + re := regexp.MustCompile(rgName) + if re.MatchString(parsedURL) || rgName == "*" { if rg.IsValid() { return rg, nil } @@ -669,8 +676,11 @@ func (re *RuleEngine) GetActionRuleByURL(urlStr string) (*ActionRule, error) { for _, rg := range re.GetAllRuleGroups() { for _, r := range rg.ActionRules { - if strings.HasPrefix(parsedURL, strings.ToLower(strings.TrimSpace(r.URL))) { - return &r, nil + if IsURL(r.URL) { + reg := regexp.MustCompile(r.URL) + if reg.MatchString(parsedURL) { + return &r, nil + } } } } diff --git a/schemas/crowler-config-schema.json b/schemas/crowler-config-schema.json index 61df99e..6c52a8b 100644 --- a/schemas/crowler-config-schema.json +++ b/schemas/crowler-config-schema.json @@ -298,6 +298,16 @@ 5 ] }, + "max_links": { + "title": "CROWler Engine Crawling Maximum Number of Links", + "description": "This is the maximum number of links that the CROWler Engine will 
crawl per each Source. If zero, no limit.", + "type": "integer", + "minimum": 1, + "examples": [ + 3, + 5 + ] + }, "max_sources": { "title": "CROWler Engine Maximum Sources", "description": "This is the maximum number of sources that a single instance of the CROWler's engine will fetch atomically and atomically to enqueue in the jobs-queue and crawl.", @@ -571,7 +581,7 @@ "selenium": { "title": "CROWler VDI access Configuration", - "description": "This is the VDI configuration section, it's used to configure the VDI and tell the CROWler's Engine how to connect to it. It is the configuration for the selenium driver, to scale the CROWler web crawling capabilities, you can add multiple VDIs in an array format.", + "description": "This is the VDI configuration section, it's used to configure the VDI and tell the CROWler's Engine how to connect to it. It is the configuration for all the tools in the VDI image (for ex. selenium driver, Rbee etc.), to scale the CROWler web crawling capabilities, you can add multiple VDIs in an array format.", "type": "array", "items": { "title": "CROWler VDI Configuration Items", @@ -587,6 +597,16 @@ "description": "This is the location of the VDI image.", "type": "string" }, + "language": { + "title": "CROWler VDI Language", + "description": "This is the language oto set the VDI image to.", + "type": "string", + "examples": [ + "en", + "de", + "fr" + ] + }, "path": { "title": "CROWler Selenium Path", "description": "This is the path to the selenium driver (IF LOCAL). It is the path to the selenium driver that the CROWler will use to crawl websites. (deprecated)", @@ -597,9 +617,14 @@ "description": "This is the path to the selenium driver (IF REMOTE). It is the path to the selenium driver that the CROWler will use to crawl websites. (deprecated)", "type": "string" }, + "use_service": { + "title": "CROWler VDI Use Service (deprecated)", + "description": "This is a flag that tells the CROWler to access Selenium as service. (deprecated)", + "type": "boolean" + }, "type": { "title": "CROWler VDI Browser Type", - "description": "This is the type of selenium driver that the CROWler will use to crawl websites. For example, chrome or firefox.", + "description": "This is the type of web browser the CROWler will use to crawl websites. For example, chrome or firefox (normally a VDI image has only one web Browser to reduce space).", "type": "string", "enum": [ "chrome", @@ -609,7 +634,7 @@ }, "port": { "title": "CROWler VDI Port", - "description": "This is the port that the selenium driver will use to connect to the CROWler. It is the port that the selenium driver will use to connect to the CROWler.", + "description": "This is the VDI's API port.", "type": "integer", "minimum": 1, "maximum": 65535 @@ -622,17 +647,12 @@ }, "headless": { "title": "CROWler VDI Headless Mode", - "description": "This is a flag that tells the selenium driver to run in headless mode. This is useful for running the selenium driver in a headless environment. It's generally NOT recommended to enable headless mode for the selenium driver. (don't use headless unless you know what you're doing, headless browsing is mostly blocked these days!)", - "type": "boolean" - }, - "use_service": { - "title": "CROWler VDI Use Service (deprecated)", - "description": "This is a flag that tells the CROWler to access Selenium as service. (deprecated)", + "description": "This is a flag that tells the VDI to run in headless mode. This is useful for running the selenium driver in a headless environment. 
It's generally NOT recommended to enable headless mode. (don't use headless unless you know what you're doing, headless browsing is mostly blocked these days!)", "type": "boolean" }, "sslmode": { "title": "CROWler VDI SSL Mode", - "description": "This is the sslmode that the selenium driver will use to connect to the CROWler. It is the sslmode that the selenium driver will use to connect to the CROWler.", + "description": "This is the sslmode. If set to `enable` then the CROWler Engine will try to connect to the VDI using HTTPS. Make sure the VDI is configured to accept HTTPS connections.", "type": "string", "enum": [ "enable", @@ -648,6 +668,14 @@ "title": "CROWler VDI Downloaded files Path", "description": "This is the temporary download path for the VDI. It is the local path where the VDI will download files. This is useful for downloading files from websites (like pdf or zip etc.). The CROWler will use this path to temporarily store the downloaded files (before moving them to the storage files area).", "type": "string" + }, + "proxy_url": { + "title": "CROWler VDI Proxy Configuration", + "description": "This is the proxy configuration for the VDI. It is the proxy that the VDI will use to connect to the internet. This is useful for bypassing firewalls or accessing websites that are blocked in your country.", + "type": "string", + "examples": [ + "http://proxy:port" + ] } }, "additionalProperties": false, diff --git a/schemas/crowler-config-schema.yaml b/schemas/crowler-config-schema.yaml index 436a9d2..5937848 100644 --- a/schemas/crowler-config-schema.yaml +++ b/schemas/crowler-config-schema.yaml @@ -238,6 +238,14 @@ properties: examples: - "3" - "5" + max_links: + title: "CROWler Engine Maximum Links" + description: "This is the maximum number of links that the CROWler Engine will crawl per Source. if zero then no limit." + type: "integer" + minimum: "1" + examples: + - "10" + - "100" max_sources: title: "CROWler Engine Maximum Sources" description: "This is the maximum number of sources that a single instance of the CROWler's engine will fetch atomically and atomically to enqueue in the jobs-queue and crawl." @@ -468,6 +476,13 @@ properties: title: "CROWler VDI Location" description: "This is the location of the VDI image." type: "string" + language: + title: "CROWler VDI Language" + description: "This is the language to set the VDI image to." + type: "string" + examples: + - "en" + - "it" path: title: "CROWler Selenium Path" description: "This is the path to the selenium driver (IF LOCAL). It is the path to the selenium driver that the CROWler will use to crawl websites. (deprecated)" @@ -518,6 +533,11 @@ properties: title: "CROWler VDI Downloaded files Path" description: "This is the temporary download path for the VDI. It is the local path where the VDI will download files. This is useful for downloading files from websites (like pdf or zip etc.). The CROWler will use this path to temporarily store the downloaded files (before moving them to the storage files area)." type: "string" + proxy_url: + title: "CROWler VDI Proxy" + description: "This is the proxy configuration for the VDI. It is the proxy that the VDI will use to connect to the internet. This is useful for bypassing firewalls or accessing websites that are blocked in your country." 
+ type: "string" + examples: "http://proxy:8080" additionalProperties: "false" required: - "type" diff --git a/schemas/ruleset-schema.json b/schemas/ruleset-schema.json index f69aaa4..7ac8ae8 100644 --- a/schemas/ruleset-schema.json +++ b/schemas/ruleset-schema.json @@ -138,7 +138,7 @@ "element", "link_text", "partial_link_text", - "regex", + "js_path", "plugin_call" ], "description": "The type of selector to use to find the element. To extract data using plugins, set this field to 'plugin_call'." @@ -161,6 +161,10 @@ }, "description": "Optional. The attribute of the element to extract. This field is ignored when using CROWler plugins via plugin_call." }, + "value": { + "type": "string", + "description": "Optional. The value within the selector that we need to match for the action. (this is NOT the value to input!)" + }, "extract_all_occurrences": { "type": "boolean", "description": "Flag to extract all occurrences of the element, not just the first one. This flag has no effect when using CROWler plugins via plugin_call."