diff --git a/langstream-agents/langstream-agent-webcrawler/src/main/java/ai/langstream/agents/webcrawler/WebCrawlerSource.java b/langstream-agents/langstream-agent-webcrawler/src/main/java/ai/langstream/agents/webcrawler/WebCrawlerSource.java index f44e64fcc..835af7c1c 100644 --- a/langstream-agents/langstream-agent-webcrawler/src/main/java/ai/langstream/agents/webcrawler/WebCrawlerSource.java +++ b/langstream-agents/langstream-agent-webcrawler/src/main/java/ai/langstream/agents/webcrawler/WebCrawlerSource.java @@ -115,11 +115,13 @@ public void init(Map configuration) throws Exception { String userAgent = getString("user-agent", DEFAULT_USER_AGENT, configuration); int maxErrorCount = getInt("max-error-count", 5, configuration); int httpTimeout = getInt("http-timeout", 10000, configuration); + boolean allowNonHtmlContents = getBoolean("allow-non-html-contents", false, configuration); boolean handleCookies = getBoolean("handle-cookies", true, configuration); log.info("allowed-domains: {}", allowedDomains); log.info("forbidden-paths: {}", forbiddenPaths); + log.info("allow-non-html-contents: {}", allowNonHtmlContents); log.info("seed-urls: {}", seedUrls); log.info("max-urls: {}", maxUrls); log.info("max-depth: {}", maxDepth); @@ -133,6 +135,7 @@ public void init(Map configuration) throws Exception { WebCrawlerConfiguration webCrawlerConfiguration = WebCrawlerConfiguration.builder() .allowedDomains(allowedDomains) + .allowNonHtmlContents(allowNonHtmlContents) .maxUrls(maxUrls) .maxDepth(maxDepth) .forbiddenPaths(forbiddenPaths) diff --git a/langstream-agents/langstream-agent-webcrawler/src/main/java/ai/langstream/agents/webcrawler/crawler/WebCrawler.java b/langstream-agents/langstream-agent-webcrawler/src/main/java/ai/langstream/agents/webcrawler/crawler/WebCrawler.java index a45879520..a23724e6c 100644 --- a/langstream-agents/langstream-agent-webcrawler/src/main/java/ai/langstream/agents/webcrawler/crawler/WebCrawler.java +++ 
b/langstream-agents/langstream-agent-webcrawler/src/main/java/ai/langstream/agents/webcrawler/crawler/WebCrawler.java @@ -204,7 +204,7 @@ public boolean runCycle() throws Exception { // we did something return true; } catch (UnsupportedMimeTypeException notHtml) { - if (configuration.isAllowNonHtmlContent()) { + if (configuration.isAllowNonHtmlContents()) { log.info( "Url {} lead to a {} content-type document. allow-not-html-content is true, so we are processing it", current, diff --git a/langstream-agents/langstream-agent-webcrawler/src/main/java/ai/langstream/agents/webcrawler/crawler/WebCrawlerConfiguration.java b/langstream-agents/langstream-agent-webcrawler/src/main/java/ai/langstream/agents/webcrawler/crawler/WebCrawlerConfiguration.java index 1ec3e622f..a070038a6 100644 --- a/langstream-agents/langstream-agent-webcrawler/src/main/java/ai/langstream/agents/webcrawler/crawler/WebCrawlerConfiguration.java +++ b/langstream-agents/langstream-agent-webcrawler/src/main/java/ai/langstream/agents/webcrawler/crawler/WebCrawlerConfiguration.java @@ -39,7 +39,7 @@ public class WebCrawlerConfiguration { @Builder.Default private boolean handleCookies = true; @Builder.Default private boolean handleRobotsFile = true; @Builder.Default private boolean scanHtmlDocuments = true; - @Builder.Default private boolean allowNonHtmlContent = false; + @Builder.Default private boolean allowNonHtmlContents = false; @Builder.Default private Set allowedTags = Set.of("a"); diff --git a/langstream-agents/langstream-agent-webcrawler/src/test/java/ai/langstream/agents/webcrawler/crawler/WebCrawlerTest.java b/langstream-agents/langstream-agent-webcrawler/src/test/java/ai/langstream/agents/webcrawler/crawler/WebCrawlerTest.java index be0aed44f..c15a2f80d 100644 --- a/langstream-agents/langstream-agent-webcrawler/src/test/java/ai/langstream/agents/webcrawler/crawler/WebCrawlerTest.java +++ 
b/langstream-agents/langstream-agent-webcrawler/src/test/java/ai/langstream/agents/webcrawler/crawler/WebCrawlerTest.java @@ -345,7 +345,7 @@ void testBinaryContent(WireMockRuntimeInfo vmRuntimeInfo) throws Exception { WebCrawlerConfiguration configuration = WebCrawlerConfiguration.builder() .allowedDomains(Set.of(vmRuntimeInfo.getHttpBaseUrl())) - .allowNonHtmlContent(true) + .allowNonHtmlContents(true) .handleRobotsFile(false) .maxErrorCount(5) .build(); diff --git a/langstream-k8s-runtime/langstream-k8s-runtime-core/src/main/java/ai/langstream/runtime/impl/k8s/agents/WebCrawlerSourceAgentProvider.java b/langstream-k8s-runtime/langstream-k8s-runtime-core/src/main/java/ai/langstream/runtime/impl/k8s/agents/WebCrawlerSourceAgentProvider.java index e6b31dda6..e1a9d46bd 100644 --- a/langstream-k8s-runtime/langstream-k8s-runtime-core/src/main/java/ai/langstream/runtime/impl/k8s/agents/WebCrawlerSourceAgentProvider.java +++ b/langstream-k8s-runtime/langstream-k8s-runtime-core/src/main/java/ai/langstream/runtime/impl/k8s/agents/WebCrawlerSourceAgentProvider.java @@ -203,6 +203,15 @@ public static class Config { @JsonProperty("scan-html-documents") private boolean scanHtmlDocuments; + @ConfigProperty( + description = + """ + Whether to emit non-HTML documents to the pipeline (e.g. PDF files).
+ """, + defaultValue = "false") + @JsonProperty("allow-non-html-contents") + private boolean allowNonHtmlContents; + @ConfigProperty( description = """ diff --git a/langstream-runtime/langstream-runtime-impl/src/test/java/ai/langstream/kafka/WebCrawlerSourceIT.java b/langstream-runtime/langstream-runtime-impl/src/test/java/ai/langstream/kafka/WebCrawlerSourceIT.java index e2536bee1..5671fa4db 100644 --- a/langstream-runtime/langstream-runtime-impl/src/test/java/ai/langstream/kafka/WebCrawlerSourceIT.java +++ b/langstream-runtime/langstream-runtime-impl/src/test/java/ai/langstream/kafka/WebCrawlerSourceIT.java @@ -93,6 +93,7 @@ public void test(WireMockRuntimeInfo vmRuntimeInfo) throws Exception { output: "${globals.output-topic}" configuration:\s seed-urls: ["%s/index.html"] + allow-non-html-contents: true allowed-domains: ["%s"] state-storage: disk """