
Commit a90c902

feat(webcrawler): improve unchanged pages sourcing (#131)
New flags:
- `only-main-content`: defaults to false; when enabled, script, style,
  and other non-content tags are removed from the emitted document. This
  is particularly helpful for verifying actual semantic changes to the
  pages, as opposed to incidental churn (script versioning, cache
  busting, etc.). See the sketch after this list.
- `emit-content-diff`: a list, defaulting to all content diff types. You
  can filter which content diff types the source emits, if available.
  For example, to not emit content_unchanged, set `emit-content-diff:
  ['new', 'content_diff']`.
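
A minimal sketch of how `only-main-content` maps onto the configuration object, using only the two builder setters this commit adds (see the diff below); the extra "nav" tag is an illustrative assumption, not a shipped default:

import ai.langstream.agents.webcrawler.crawler.WebCrawlerConfiguration;

import java.util.Set;

public class OnlyMainContentExample {
    public static void main(String[] args) {
        // Mirrors the builder wiring added in init() below.
        WebCrawlerConfiguration configuration =
                WebCrawlerConfiguration.builder()
                        .onlyMainContent(true) // strip non-content tags before emitting
                        // Optional override of the default exclusion set; "nav" is
                        // an illustrative extra, not one of the shipped defaults.
                        .excludeFromMainContentTags(Set.of("script", "style", "nav"))
                        .build();
        System.out.println(configuration);
    }
}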
nicoloboschi authored Aug 22, 2024
1 parent 241d16e commit a90c902
Showing 5 changed files with 287 additions and 78 deletions.
@@ -18,11 +18,7 @@
 import static ai.langstream.agents.webcrawler.crawler.WebCrawlerConfiguration.DEFAULT_USER_AGENT;
 import static ai.langstream.api.util.ConfigurationUtils.*;
 
-import ai.langstream.agents.webcrawler.crawler.Document;
-import ai.langstream.agents.webcrawler.crawler.StatusStorage;
-import ai.langstream.agents.webcrawler.crawler.WebCrawler;
-import ai.langstream.agents.webcrawler.crawler.WebCrawlerConfiguration;
-import ai.langstream.agents.webcrawler.crawler.WebCrawlerStatus;
+import ai.langstream.agents.webcrawler.crawler.*;
 import ai.langstream.ai.agents.commons.state.LocalDiskStateStorage;
 import ai.langstream.ai.agents.commons.state.S3StateStorage;
 import ai.langstream.ai.agents.commons.state.StateStorage;
@@ -159,20 +155,11 @@ public void init(Map<String, Object> configuration) throws Exception {
                                                 entry.getKey(), entry.getValue()))
                         .collect(Collectors.toUnmodifiableList());
 
-        log.info("allowed-domains: {}", allowedDomains);
-        log.info("forbidden-paths: {}", forbiddenPaths);
-        log.info("allow-non-html-contents: {}", allowNonHtmlContents);
-        log.info("seed-urls: {}", seedUrls);
-        log.info("max-urls: {}", maxUrls);
-        log.info("max-depth: {}", maxDepth);
-        log.info("handle-robots-file: {}", handleRobotsFile);
-        log.info("scan-html-documents: {}", scanHtmlDocuments);
-        log.info("user-agent: {}", userAgent);
-        log.info("max-unflushed-pages: {}", maxUnflushedPages);
-        log.info("min-time-between-requests: {}", minTimeBetweenRequests);
-        log.info("reindex-interval-seconds: {}", reindexIntervalSeconds);
-
-        WebCrawlerConfiguration webCrawlerConfiguration =
+        final boolean onlyMainContent = getBoolean("only-main-content", false, configuration);
+        final Set<String> excludeFromMainContentTags =
+                getSet("exclude-from-main-content-tags", configuration);
+
+        WebCrawlerConfiguration.WebCrawlerConfigurationBuilder builder =
                 WebCrawlerConfiguration.builder()
                         .allowedDomains(allowedDomains)
                         .allowNonHtmlContents(allowNonHtmlContents)
@@ -185,16 +172,41 @@ public void init(Map<String, Object> configuration) throws Exception {
                         .handleCookies(handleCookies)
                         .httpTimeout(httpTimeout)
                         .maxErrorCount(maxErrorCount)
-                        .build();
+                        .onlyMainContent(onlyMainContent);
+        if (!excludeFromMainContentTags.isEmpty()) {
+            builder.excludeFromMainContentTags(excludeFromMainContentTags);
+        }
+        WebCrawlerConfiguration webCrawlerConfiguration = builder.build();
+        log.info("configuration: {}", webCrawlerConfiguration);
 
         WebCrawlerStatus status = new WebCrawlerStatus();
         // this can be overwritten when the status is reloaded
         status.setLastIndexStartTimestamp(System.currentTimeMillis());
+
+        final List<String> emitContentDiff =
+                getList("emit-content-diff", configuration).stream()
+                        .map(String::toLowerCase)
+                        .toList();
 
         crawler =
                 new WebCrawler(
                         webCrawlerConfiguration,
                         status,
-                        foundDocuments::add,
+                        new DocumentVisitor() {
+                            @Override
+                            public void visit(Document document) {
+                                if (document.contentDiff() == null
+                                        || emitContentDiff.isEmpty()
+                                        || emitContentDiff.contains(
+                                                document.contentDiff().toString().toLowerCase())) {
+                                    foundDocuments.add(document);
+                                } else {
+                                    log.info(
+                                            "Discarding document with content diff {}",
+                                            document.contentDiff());
+                                }
+                            }
+                        },
                         this::sendDeletedDocument);
 
         sourceActivitySummaryTopic =
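The DocumentVisitor above boils down to a small predicate. A standalone sketch, assuming the labels from the commit message (new, content_diff, content_unchanged) are the lower-cased forms that `Document.contentDiff()` produces:

import java.util.List;
import java.util.Locale;

public class EmitFilterExample {
    // Restates the emit-content-diff check from the visitor above.
    static boolean shouldEmit(String contentDiff, List<String> emitContentDiff) {
        // A null diff or an empty filter list both mean "emit everything".
        return contentDiff == null
                || emitContentDiff.isEmpty()
                || emitContentDiff.contains(contentDiff.toLowerCase(Locale.ROOT));
    }

    public static void main(String[] args) {
        List<String> filter = List.of("new", "content_diff");
        System.out.println(shouldEmit("new", filter));               // true
        System.out.println(shouldEmit("content_unchanged", filter)); // false: filtered out
        System.out.println(shouldEmit(null, filter));                // true: no diff info
    }
}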
@@ -270,6 +270,12 @@ public boolean runCycle() throws Exception {
                             }
                         });
             }
+            if (configuration.isOnlyMainContent()) {
+                for (String excludeFromMainContentTag :
+                        configuration.getExcludeFromMainContentTags()) {
+                    document.getElementsByTag(excludeFromMainContentTag).remove();
+                }
+            }
             onDocumentFound(current, document.html().getBytes(StandardCharsets.UTF_8), contentType);
         }
 
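The `getElementsByTag(...).remove()` calls above match Jsoup's DOM API, which suggests `document` here is an org.jsoup.nodes.Document. A minimal standalone sketch of the same stripping step, under that assumption:

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

import java.util.Set;

public class MainContentStripExample {
    public static void main(String[] args) {
        String html =
                "<html><head><script src=\"app.js?v=42\"></script></head>"
                        + "<body><p>Hello</p><style>p { color: red; }</style></body></html>";
        Document document = Jsoup.parse(html);
        // Same mechanism as the crawler loop above: once these tags are gone,
        // a bumped ?v= query string no longer registers as a content change.
        for (String tag : Set.of("script", "style")) {
            document.getElementsByTag(tag).remove();
        }
        System.out.println(document.html()); // no <script> or <style> left
    }
}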
@@ -40,8 +40,28 @@ public class WebCrawlerConfiguration {
     @Builder.Default private boolean handleRobotsFile = true;
     @Builder.Default private boolean scanHtmlDocuments = true;
     @Builder.Default private boolean allowNonHtmlContents = false;
+    @Builder.Default private boolean onlyMainContent = false;
 
-    @Builder.Default private Set<String> allowedTags = Set.of("a");
+    @Builder.Default
+    private Set<String> excludeFromMainContentTags =
+            Set.of(
+                    "script",
+                    "style",
+                    "noscript",
+                    "iframe",
+                    "link",
+                    "base",
+                    "meta",
+                    "object",
+                    "embed",
+                    "applet",
+                    "audio",
+                    "video",
+                    "canvas",
+                    "template",
+                    "comment");
+
+    @Builder.Default private Set<String> allowedTagsForHtmlDocumentScan = Set.of("a");
 
     public boolean isAllowedUrl(String url) {
         final String domainOnly;
@@ -96,6 +116,6 @@ public boolean isAllowedUrl(String url) {
     }
 
     public boolean isAllowedTag(String tagName) {
-        return tagName != null && allowedTags.contains(tagName.toLowerCase());
+        return tagName != null && allowedTagsForHtmlDocumentScan.contains(tagName.toLowerCase());
     }
 }
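
The last hunk is a pure rename (`allowedTags` becomes `allowedTagsForHtmlDocumentScan`), separating the link-scan tag set from the new main-content exclusion set; behavior is unchanged. A quick check, assuming the builder accepts all defaults:

import ai.langstream.agents.webcrawler.crawler.WebCrawlerConfiguration;

public class AllowedTagExample {
    public static void main(String[] args) {
        WebCrawlerConfiguration configuration = WebCrawlerConfiguration.builder().build();
        System.out.println(configuration.isAllowedTag("a"));   // true: links are still scanned
        System.out.println(configuration.isAllowedTag("A"));   // true: the check lower-cases first
        System.out.println(configuration.isAllowedTag("div")); // false: not in the scan set
    }
}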
(Diffs for the remaining two changed files are not shown.)
