From a90c90284823bd1bed9ef946ec7929a5cb0ed00c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Nicol=C3=B2=20Boschi?=
Date: Thu, 22 Aug 2024 15:49:12 +0200
Subject: [PATCH] feat(webcrawler): improve unchanged pages sourcing (#131)

New flags:
- `only-main-content`: defaults to false; if enabled, script, style (and other
  non-semantic) tags are removed from the emitted document. This is particularly
  helpful for detecting actual semantic changes to the pages, as opposed to noise
  (script versioning, cache busting, etc.)
- `emit-content-diff`: list; by default every content-diff type is emitted. You
  can filter which content-diff types the source emits, if available. For
  example, to not emit content_unchanged, you can set
  `emit-content-diff: ['new', 'content_changed']`
---
 .../agents/webcrawler/WebCrawlerSource.java   |  54 ++--
 .../agents/webcrawler/crawler/WebCrawler.java |   6 +
 .../crawler/WebCrawlerConfiguration.java      |  24 +-
 .../webcrawler/WebCrawlerSourceTest.java      | 250 ++++++++++++++----
 .../agents/WebCrawlerSourceAgentProvider.java |  31 +++
 5 files changed, 287 insertions(+), 78 deletions(-)

diff --git a/langstream-agents/langstream-agent-webcrawler/src/main/java/ai/langstream/agents/webcrawler/WebCrawlerSource.java b/langstream-agents/langstream-agent-webcrawler/src/main/java/ai/langstream/agents/webcrawler/WebCrawlerSource.java
index 677e7dae3..200b24e5d 100644
--- a/langstream-agents/langstream-agent-webcrawler/src/main/java/ai/langstream/agents/webcrawler/WebCrawlerSource.java
+++ b/langstream-agents/langstream-agent-webcrawler/src/main/java/ai/langstream/agents/webcrawler/WebCrawlerSource.java
@@ -18,11 +18,7 @@
 import static ai.langstream.agents.webcrawler.crawler.WebCrawlerConfiguration.DEFAULT_USER_AGENT;
 import static ai.langstream.api.util.ConfigurationUtils.*;
 
-import ai.langstream.agents.webcrawler.crawler.Document;
-import ai.langstream.agents.webcrawler.crawler.StatusStorage;
-import ai.langstream.agents.webcrawler.crawler.WebCrawler;
-import ai.langstream.agents.webcrawler.crawler.WebCrawlerConfiguration;
-import ai.langstream.agents.webcrawler.crawler.WebCrawlerStatus;
+import ai.langstream.agents.webcrawler.crawler.*;
 import ai.langstream.ai.agents.commons.state.LocalDiskStateStorage;
 import ai.langstream.ai.agents.commons.state.S3StateStorage;
 import ai.langstream.ai.agents.commons.state.StateStorage;
@@ -159,20 +155,11 @@ public void init(Map<String, Object> configuration) throws Exception {
                                                 entry.getKey(), entry.getValue()))
                         .collect(Collectors.toUnmodifiableList());
 
-        log.info("allowed-domains: {}", allowedDomains);
-        log.info("forbidden-paths: {}", forbiddenPaths);
-        log.info("allow-non-html-contents: {}", allowNonHtmlContents);
-        log.info("seed-urls: {}", seedUrls);
-        log.info("max-urls: {}", maxUrls);
-        log.info("max-depth: {}", maxDepth);
-        log.info("handle-robots-file: {}", handleRobotsFile);
-        log.info("scan-html-documents: {}", scanHtmlDocuments);
-        log.info("user-agent: {}", userAgent);
-        log.info("max-unflushed-pages: {}", maxUnflushedPages);
-        log.info("min-time-between-requests: {}", minTimeBetweenRequests);
-        log.info("reindex-interval-seconds: {}", reindexIntervalSeconds);
-
-        WebCrawlerConfiguration webCrawlerConfiguration =
+        final boolean onlyMainContent = getBoolean("only-main-content", false, configuration);
+        final Set<String> excludeFromMainContentTags =
+                getSet("exclude-from-main-content-tags", configuration);
+
+        WebCrawlerConfiguration.WebCrawlerConfigurationBuilder builder =
                 WebCrawlerConfiguration.builder()
                         .allowedDomains(allowedDomains)
                         .allowNonHtmlContents(allowNonHtmlContents)
@@ -185,16 +172,41 @@ public void init(Map<String, Object> configuration) throws Exception {
                         .handleCookies(handleCookies)
                         .httpTimeout(httpTimeout)
                         .maxErrorCount(maxErrorCount)
-                        .build();
+                        .onlyMainContent(onlyMainContent);
+        if (!excludeFromMainContentTags.isEmpty()) {
+            builder.excludeFromMainContentTags(excludeFromMainContentTags);
+        }
+        WebCrawlerConfiguration webCrawlerConfiguration = builder.build();
+        log.info("configuration: {}", webCrawlerConfiguration);
 
         WebCrawlerStatus status = new WebCrawlerStatus();
         // this can be overwritten when the status is reloaded
         status.setLastIndexStartTimestamp(System.currentTimeMillis());
+
+        final List<String> emitContentDiff =
+                getList("emit-content-diff", configuration).stream()
+                        .map(String::toLowerCase)
+                        .toList();
+
         crawler =
                 new WebCrawler(
                         webCrawlerConfiguration,
                         status,
-                        foundDocuments::add,
+                        new DocumentVisitor() {
+                            @Override
+                            public void visit(Document document) {
+                                if (document.contentDiff() == null
+                                        || emitContentDiff.isEmpty()
+                                        || emitContentDiff.contains(
+                                                document.contentDiff().toString().toLowerCase())) {
+                                    foundDocuments.add(document);
+                                } else {
+                                    log.info(
+                                            "Discarding document with content diff {}",
+                                            document.contentDiff());
+                                }
+                            }
+                        },
                         this::sendDeletedDocument);
 
         sourceActivitySummaryTopic =
diff --git a/langstream-agents/langstream-agent-webcrawler/src/main/java/ai/langstream/agents/webcrawler/crawler/WebCrawler.java b/langstream-agents/langstream-agent-webcrawler/src/main/java/ai/langstream/agents/webcrawler/crawler/WebCrawler.java
index 3c085e8d8..ef4b33d75 100644
--- a/langstream-agents/langstream-agent-webcrawler/src/main/java/ai/langstream/agents/webcrawler/crawler/WebCrawler.java
+++ b/langstream-agents/langstream-agent-webcrawler/src/main/java/ai/langstream/agents/webcrawler/crawler/WebCrawler.java
@@ -270,6 +270,12 @@ public boolean runCycle() throws Exception {
                             }
                         });
             }
+            if (configuration.isOnlyMainContent()) {
+                for (String excludeFromMainContentTag :
+                        configuration.getExcludeFromMainContentTags()) {
+                    document.getElementsByTag(excludeFromMainContentTag).remove();
+                }
+            }
             onDocumentFound(current, document.html().getBytes(StandardCharsets.UTF_8), contentType);
         }
diff --git a/langstream-agents/langstream-agent-webcrawler/src/main/java/ai/langstream/agents/webcrawler/crawler/WebCrawlerConfiguration.java b/langstream-agents/langstream-agent-webcrawler/src/main/java/ai/langstream/agents/webcrawler/crawler/WebCrawlerConfiguration.java
index a070038a6..0ac3c606a 100644
--- a/langstream-agents/langstream-agent-webcrawler/src/main/java/ai/langstream/agents/webcrawler/crawler/WebCrawlerConfiguration.java
+++ b/langstream-agents/langstream-agent-webcrawler/src/main/java/ai/langstream/agents/webcrawler/crawler/WebCrawlerConfiguration.java
@@ -40,8 +40,28 @@ public class WebCrawlerConfiguration {
     @Builder.Default private boolean handleRobotsFile = true;
     @Builder.Default private boolean scanHtmlDocuments = true;
     @Builder.Default private boolean allowNonHtmlContents = false;
+    @Builder.Default private boolean onlyMainContent = false;
 
-    @Builder.Default private Set<String> allowedTags = Set.of("a");
+    @Builder.Default
+    private Set<String> excludeFromMainContentTags =
+            Set.of(
+                    "script",
+                    "style",
+                    "noscript",
+                    "iframe",
+                    "link",
+                    "base",
+                    "meta",
+                    "object",
+                    "embed",
+                    "applet",
+                    "audio",
+                    "video",
+                    "canvas",
+                    "template",
+                    "comment");
+
+    @Builder.Default private Set<String> allowedTagsForHtmlDocumentScan = Set.of("a");
 
     public boolean isAllowedUrl(String url) {
         final String domainOnly;
@@ -96,6 +116,6 @@ public boolean isAllowedUrl(String url) {
     }
 
     public boolean isAllowedTag(String tagName) {
-        return tagName != null && allowedTags.contains(tagName.toLowerCase());
+        return tagName != null && allowedTagsForHtmlDocumentScan.contains(tagName.toLowerCase());
     }
 }
diff --git a/langstream-agents/langstream-agent-webcrawler/src/test/java/ai/langstream/agents/webcrawler/WebCrawlerSourceTest.java b/langstream-agents/langstream-agent-webcrawler/src/test/java/ai/langstream/agents/webcrawler/WebCrawlerSourceTest.java
index 18f0fadcd..c710eb353 100644
--- a/langstream-agents/langstream-agent-webcrawler/src/test/java/ai/langstream/agents/webcrawler/WebCrawlerSourceTest.java
+++ b/langstream-agents/langstream-agent-webcrawler/src/test/java/ai/langstream/agents/webcrawler/WebCrawlerSourceTest.java
@@ -425,6 +425,71 @@ void testBasic(WireMockRuntimeInfo wmRuntimeInfo) throws Exception {
                         .build()));
     }
 
+    @Test
+    void testSkipUnchanged(WireMockRuntimeInfo wmRuntimeInfo) throws Exception {
+        stubFor(
+                get("/index.html")
+                        .willReturn(
+                                okForContentType(
+                                        "text/html",
+                                        """
+                                        <a href="secondPage.html">link</a>
+                                        """)));
+        stubFor(
+                get("/secondPage.html")
+                        .willReturn(
+                                okForContentType(
+                                        "text/html",
+                                        """
+                                        <a href="thirdPage.html">link</a>
+                                        <a href="index.html">link to home</a>
+                                        """)));
+        stubFor(
+                get("/thirdPage.html")
+                        .willReturn(
+                                okForContentType(
+                                        "text/html",
+                                        """
+                                        Hello!
+                                        """)));
+
+        String bucket = "langstream-test-" + UUID.randomUUID();
+        String url = wmRuntimeInfo.getHttpBaseUrl() + "/index.html";
+        String allowed = wmRuntimeInfo.getHttpBaseUrl();
+        Map<String, Object> additionalConfig =
+                Map.of(
+                        "reindex-interval-seconds",
+                        "2",
+                        "emit-content-diff",
+                        List.of("new", "content_changed"));
+
+        try (WebCrawlerSource agentSource =
+                buildAgentSource(bucket, allowed, Set.of(), url, additionalConfig); ) {
+            List<Record> read = agentSource.read();
+            AtomicInteger reindexCount = new AtomicInteger();
+            agentSource.setOnReindexStart(
+                    () -> {
+                        reindexCount.incrementAndGet();
+                    });
+            List<String> pages = new ArrayList<>();
+            while (reindexCount.get() < 3) {
+                for (Record r : read) {
+                    String docUrl = r.key().toString();
+                    pages.add(docUrl);
+                    if (reindexCount.get() == 0) {
+                        assertEquals("new", r.getHeader("content_diff").valueAsString());
+                    } else {
+                        fail("Should not have read anything after the first reindex");
+                    }
+                }
+                agentSource.commit(read);
+                read = agentSource.read();
+            }
+            // please note that JSoup normalised the HTML
+            assertEquals(3, pages.size());
+        }
+    }
+
     private static final String ROBOTS =
             """
             User-agent: *
@@ -457,59 +522,60 @@ void testWithRobots(WireMockRuntimeInfo wmRuntimeInfo) throws Exception {
         String url = wmRuntimeInfo.getHttpBaseUrl() + "/index.html";
         String allowed = wmRuntimeInfo.getHttpBaseUrl();
         Map<String, Object> additionalConfig = Map.of();
-        WebCrawlerSource agentSource =
-                buildAgentSource(bucket, allowed, Set.of(), url, additionalConfig);
+        try (WebCrawlerSource agentSource =
+                buildAgentSource(bucket, allowed, Set.of(), url, additionalConfig); ) {
 
-        WebCrawler crawler = agentSource.getCrawler();
-        WebCrawlerStatus status = crawler.getStatus();
+            WebCrawler crawler = agentSource.getCrawler();
+            WebCrawlerStatus status = crawler.getStatus();
 
-        List<Record> read = agentSource.read();
-        Set<String> urls = new HashSet<>();
-        Map<String, String> pages = new HashMap<>();
-        while (pages.size() != 2) {
-            log.info("read: {}", read);
-            log.info("Known urls: {}", status.getUrls().size());
-            for (Record r : read) {
-                String docUrl = r.key().toString();
-                String pageName = docUrl.substring(docUrl.lastIndexOf('/') + 1);
-                pages.put(pageName, new String((byte[]) r.value()));
-                assertTrue(urls.add(docUrl), "Read twice the same url: " + docUrl);
+            List<Record> read = agentSource.read();
+            Set<String> urls = new HashSet<>();
+            Map<String, String> pages = new HashMap<>();
+            while (pages.size() != 2) {
+                log.info("read: {}", read);
+                log.info("Known urls: {}", status.getUrls().size());
+                for (Record r : read) {
+                    String docUrl = r.key().toString();
+                    String pageName = docUrl.substring(docUrl.lastIndexOf('/') + 1);
+                    pages.put(pageName, new String((byte[]) r.value()));
+                    assertTrue(urls.add(docUrl), "Read twice the same url: " + docUrl);
+                }
+                agentSource.commit(read);
+                read = agentSource.read();
+            }
+            agentSource.close();
+            assertEquals(2, pages.size());
+            // please note that JSoup normalised the HTML
+            assertEquals(
+                    """
+                    <html>
+                     <head></head>
+                     <body>
+                      <a href="secondPage.html">link</a>
+                     </body>
+                    </html>
+                    """,
+                    pages.get("index.html"));
+            assertEquals(
+                    """
+                    <html>
+                     <head></head>
+                     <body>
+                      <a href="thirdPage.html">link</a> <a href="index.html">link to home</a>
+                     </body>
+                    </html>
+                    """,
+                    pages.get("secondPage.html"));
+
+            assertTrue(status.getRemainingUrls().isEmpty());
+            assertTrue(status.getPendingUrls().isEmpty());
+            assertFalse(status.getRobotsFiles().isEmpty());
+
+            // test reload robot rules from S3
+            try (WebCrawlerSource agentSource2 =
+                    buildAgentSource(bucket, allowed, Set.of(), url, additionalConfig); ) {
+                WebCrawler crawler2 = agentSource2.getCrawler();
+                assertEquals(crawler2.getStatus().getRobotsFiles(), status.getRobotsFiles());
             }
-            agentSource.commit(read);
-            read = agentSource.read();
         }
-        agentSource.close();
-        assertEquals(2, pages.size());
-        // please note that JSoup normalised the HTML
-        assertEquals(
-                """
-                <html>
-                 <head></head>
-                 <body>
-                  <a href="secondPage.html">link</a>
-                 </body>
-                </html>
-                """,
-                pages.get("index.html"));
-        assertEquals(
-                """
-                <html>
-                 <head></head>
-                 <body>
-                  <a href="thirdPage.html">link</a> <a href="index.html">link to home</a>
-                 </body>
-                </html>
-                """,
-                pages.get("secondPage.html"));
-
-        assertTrue(status.getRemainingUrls().isEmpty());
-        assertTrue(status.getPendingUrls().isEmpty());
-        assertFalse(status.getRobotsFiles().isEmpty());
-
-        // test reload robot rules from S3
-        WebCrawlerSource agentSource2 =
-                buildAgentSource(bucket, allowed, Set.of(), url, additionalConfig);
-        WebCrawler crawler2 = agentSource.getCrawler();
-        assertEquals(crawler2.getStatus().getRobotsFiles(), status.getRobotsFiles());
-        agentSource2.close();
     }
 
     private WebCrawlerSource buildAgentSource(
@@ -610,7 +676,7 @@ void testRecoverFromWrongJsonFile() throws Exception {
                                         json.length(),
                                         5 * 1024 * 1024)
                                 .build());
-        WebCrawlerSource agentSource =
+        try (WebCrawlerSource agentSource =
                 buildAgentSource(
                         bucket,
                         allowed,
@@ -622,10 +688,84 @@ void testRecoverFromWrongJsonFile() throws Exception {
                                 "scan-html-documents",
                                 "false",
                                 "max-urls",
-                                10000));
-        assertEquals(
-                "s3://%s/%s".formatted(bucket, objectName),
-                agentSource.buildAdditionalInfo().get("statusFileName"));
-        agentSource.close();
+                                10000)); ) {
+            assertEquals(
+                    "s3://%s/%s".formatted(bucket, objectName),
+                    agentSource.buildAdditionalInfo().get("statusFileName"));
+        }
+    }
+
+    @Test
+    void testOnlyMainContent(WireMockRuntimeInfo wmRuntimeInfo) throws Exception {
+        stubFor(
+                get("/index.html")
+                        .willReturn(
+                                okForContentType(
+                                        "text/html",
+                                        """
+                                        <html>
+                                        <head>
+                                            <title>Test</title>
+                                            <style>body { color: red; }</style>
+                                            <script>console.log("head");</script>
+                                        </head>
+                                        <body>
+                                            <a href="secondPage.html">link</a>
+                                            Hello World!
+                                            <script>console.log("body");</script>
+                                        </body>
+                                        </html>
+                                        """)));
+        stubFor(get("/secondPage.html").willReturn(okForContentType("text/html", "Second")));
+
+        String bucket = "langstream-test-" + UUID.randomUUID();
+        String url = wmRuntimeInfo.getHttpBaseUrl() + "/index.html";
+        String allowed = wmRuntimeInfo.getHttpBaseUrl();
+        Map<String, Object> additionalConfig =
+                Map.of("only-main-content", true, "handle-robots-file", "false");
+
+        try (WebCrawlerSource agentSource =
+                buildAgentSource(bucket, allowed, Set.of(), url, additionalConfig); ) {
+            List<Record> all = new ArrayList<>();
+            for (int i = 0; i < 2; i++) {
+                all.addAll(agentSource.read());
+            }
+            assertEquals(2, all.size());
+            for (Record record : all) {
+                String content = new String((byte[]) record.value(), StandardCharsets.UTF_8);
+                if (record.key().toString().contains("index.html")) {
+                    assertEquals(
+                            """
+                            <html>
+                             <head>
+                              <title>Test</title>
+                             </head>
+                             <body>
+                              <a href="secondPage.html">link</a> Hello World!
+ + """, + content); + } else { + + assertEquals( + """ + + + + Second + + """, + content); + } + } + } } } diff --git a/langstream-k8s-runtime/langstream-k8s-runtime-core/src/main/java/ai/langstream/runtime/impl/k8s/agents/WebCrawlerSourceAgentProvider.java b/langstream-k8s-runtime/langstream-k8s-runtime-core/src/main/java/ai/langstream/runtime/impl/k8s/agents/WebCrawlerSourceAgentProvider.java index 3f8ac8040..519601547 100644 --- a/langstream-k8s-runtime/langstream-k8s-runtime-core/src/main/java/ai/langstream/runtime/impl/k8s/agents/WebCrawlerSourceAgentProvider.java +++ b/langstream-k8s-runtime/langstream-k8s-runtime-core/src/main/java/ai/langstream/runtime/impl/k8s/agents/WebCrawlerSourceAgentProvider.java @@ -303,6 +303,25 @@ Whether to emit non HTML documents to the pipeline (i.e. PDF Files). @JsonProperty("handle-cookies") private boolean handleCookies; + @ConfigProperty( + description = + """ + Whether to remove non semantic tags from the content. (script, style..) + """, + defaultValue = "false") + @JsonProperty("only-main-content") + private boolean onlyMainContent; + + @ConfigProperty( + description = + """ + If only-main-content is enabled, this list of tags will be excluded from the main content. + """, + defaultValue = + "[\"script\", \"style\", \"noscript\", \"iframe\", \"link\", \"base\", \"meta\", \"object\", \"embed\", \"applet\", \"audio\", \"video\", \"canvas\", \"template\", \"comment\"]") + @JsonProperty("exclude-from-main-content-tags") + private List excludeFromMainContentTags; + @ConfigProperty( description = """ @@ -354,5 +373,17 @@ Use this property to disable the source activity summary (by leaving default to """) @JsonProperty("source-activity-summary-time-seconds-threshold") private int sourceActivitySummaryTimeSecondsThreshold; + + @ConfigProperty( + description = + """ + Filter content diff you want to emit. By default all content diff are emitted. + To emit only new content diff, set to 'new'. + To emit only changed content diff, set to 'content_changed'. + To skip emitting unchanged content diff, set to 'new', 'content_changed'. + """, + defaultValue = "true") + @JsonProperty("emit-content-diff") + private List emitContentDiff; } }