
Commit a90c902

feat(webcrawler): improve unchanged pages sourcing (#131)
New flags:
- `only-main-content`: defaults to false; when enabled, script, style,
  and other non-content tags are removed from the emitted document. This
  is particularly helpful for verifying actual semantic changes to the
  pages, as opposed to incidental churn (script versioning, cache
  busting, etc.). See the sketch after this list.
- `emit-content-diff`: a list, defaulting to all content diff types. You
  can filter which content diff types the source emits, if available.
  For example, to not emit content_unchanged, set `emit-content-diff:
  ['new', 'content_diff']`.
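
A minimal sketch of how `only-main-content` maps onto the configuration object, using only the two builder setters this commit adds (see the diff below); the extra "nav" tag is an illustrative assumption, not a shipped default:

import ai.langstream.agents.webcrawler.crawler.WebCrawlerConfiguration;

import java.util.Set;

public class OnlyMainContentExample {
    public static void main(String[] args) {
        // Mirrors the builder wiring added in init() below.
        WebCrawlerConfiguration configuration =
                WebCrawlerConfiguration.builder()
                        .onlyMainContent(true) // strip non-content tags before emitting
                        // Optional override of the default exclusion set; "nav" is
                        // an illustrative extra, not one of the shipped defaults.
                        .excludeFromMainContentTags(Set.of("script", "style", "nav"))
                        .build();
        System.out.println(configuration);
    }
}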
nicoloboschi authored Aug 22, 2024
1 parent 241d16e commit a90c902
Showing 5 changed files with 287 additions and 78 deletions.
@@ -18,11 +18,7 @@
 import static ai.langstream.agents.webcrawler.crawler.WebCrawlerConfiguration.DEFAULT_USER_AGENT;
 import static ai.langstream.api.util.ConfigurationUtils.*;
 
-import ai.langstream.agents.webcrawler.crawler.Document;
-import ai.langstream.agents.webcrawler.crawler.StatusStorage;
-import ai.langstream.agents.webcrawler.crawler.WebCrawler;
-import ai.langstream.agents.webcrawler.crawler.WebCrawlerConfiguration;
-import ai.langstream.agents.webcrawler.crawler.WebCrawlerStatus;
+import ai.langstream.agents.webcrawler.crawler.*;
 import ai.langstream.ai.agents.commons.state.LocalDiskStateStorage;
 import ai.langstream.ai.agents.commons.state.S3StateStorage;
 import ai.langstream.ai.agents.commons.state.StateStorage;
@@ -159,20 +155,11 @@ public void init(Map<String, Object> configuration) throws Exception {
                                                 entry.getKey(), entry.getValue()))
                         .collect(Collectors.toUnmodifiableList());
 
-        log.info("allowed-domains: {}", allowedDomains);
-        log.info("forbidden-paths: {}", forbiddenPaths);
-        log.info("allow-non-html-contents: {}", allowNonHtmlContents);
-        log.info("seed-urls: {}", seedUrls);
-        log.info("max-urls: {}", maxUrls);
-        log.info("max-depth: {}", maxDepth);
-        log.info("handle-robots-file: {}", handleRobotsFile);
-        log.info("scan-html-documents: {}", scanHtmlDocuments);
-        log.info("user-agent: {}", userAgent);
-        log.info("max-unflushed-pages: {}", maxUnflushedPages);
-        log.info("min-time-between-requests: {}", minTimeBetweenRequests);
-        log.info("reindex-interval-seconds: {}", reindexIntervalSeconds);
-
-        WebCrawlerConfiguration webCrawlerConfiguration =
+        final boolean onlyMainContent = getBoolean("only-main-content", false, configuration);
+        final Set<String> excludeFromMainContentTags =
+                getSet("exclude-from-main-content-tags", configuration);
+
+        WebCrawlerConfiguration.WebCrawlerConfigurationBuilder builder =
                 WebCrawlerConfiguration.builder()
                         .allowedDomains(allowedDomains)
                         .allowNonHtmlContents(allowNonHtmlContents)
@@ -185,16 +172,41 @@ public void init(Map<String, Object> configuration) throws Exception {
                         .handleCookies(handleCookies)
                         .httpTimeout(httpTimeout)
                         .maxErrorCount(maxErrorCount)
-                        .build();
+                        .onlyMainContent(onlyMainContent);
+        if (!excludeFromMainContentTags.isEmpty()) {
+            builder.excludeFromMainContentTags(excludeFromMainContentTags);
+        }
+        WebCrawlerConfiguration webCrawlerConfiguration = builder.build();
+        log.info("configuration: {}", webCrawlerConfiguration);
 
         WebCrawlerStatus status = new WebCrawlerStatus();
         // this can be overwritten when the status is reloaded
         status.setLastIndexStartTimestamp(System.currentTimeMillis());
+
+        final List<String> emitContentDiff =
+                getList("emit-content-diff", configuration).stream()
+                        .map(String::toLowerCase)
+                        .toList();
 
         crawler =
                 new WebCrawler(
                         webCrawlerConfiguration,
                         status,
-                        foundDocuments::add,
+                        new DocumentVisitor() {
+                            @Override
+                            public void visit(Document document) {
+                                if (document.contentDiff() == null
+                                        || emitContentDiff.isEmpty()
+                                        || emitContentDiff.contains(
+                                                document.contentDiff().toString().toLowerCase())) {
+                                    foundDocuments.add(document);
+                                } else {
+                                    log.info(
+                                            "Discarding document with content diff {}",
+                                            document.contentDiff());
+                                }
+                            }
+                        },
                         this::sendDeletedDocument);
 
         sourceActivitySummaryTopic =
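The DocumentVisitor above boils down to a small predicate. A standalone sketch, assuming the labels from the commit message (new, content_diff, content_unchanged) are the lower-cased forms that `Document.contentDiff()` produces:

import java.util.List;
import java.util.Locale;

public class EmitFilterExample {
    // Restates the emit-content-diff check from the visitor above.
    static boolean shouldEmit(String contentDiff, List<String> emitContentDiff) {
        // A null diff or an empty filter list both mean "emit everything".
        return contentDiff == null
                || emitContentDiff.isEmpty()
                || emitContentDiff.contains(contentDiff.toLowerCase(Locale.ROOT));
    }

    public static void main(String[] args) {
        List<String> filter = List.of("new", "content_diff");
        System.out.println(shouldEmit("new", filter));               // true
        System.out.println(shouldEmit("content_unchanged", filter)); // false: filtered out
        System.out.println(shouldEmit(null, filter));                // true: no diff info
    }
}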
@@ -270,6 +270,12 @@ public boolean runCycle() throws Exception {
                             }
                         });
             }
+            if (configuration.isOnlyMainContent()) {
+                for (String excludeFromMainContentTag :
+                        configuration.getExcludeFromMainContentTags()) {
+                    document.getElementsByTag(excludeFromMainContentTag).remove();
+                }
+            }
             onDocumentFound(current, document.html().getBytes(StandardCharsets.UTF_8), contentType);
         }
 
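The `getElementsByTag(...).remove()` calls above match Jsoup's DOM API, which suggests `document` here is an org.jsoup.nodes.Document. A minimal standalone sketch of the same stripping step, under that assumption:

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

import java.util.Set;

public class MainContentStripExample {
    public static void main(String[] args) {
        String html =
                "<html><head><script src=\"app.js?v=42\"></script></head>"
                        + "<body><p>Hello</p><style>p { color: red; }</style></body></html>";
        Document document = Jsoup.parse(html);
        // Same mechanism as the crawler loop above: once these tags are gone,
        // a bumped ?v= query string no longer registers as a content change.
        for (String tag : Set.of("script", "style")) {
            document.getElementsByTag(tag).remove();
        }
        System.out.println(document.html()); // no <script> or <style> left
    }
}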
@@ -40,8 +40,28 @@ public class WebCrawlerConfiguration {
     @Builder.Default private boolean handleRobotsFile = true;
     @Builder.Default private boolean scanHtmlDocuments = true;
     @Builder.Default private boolean allowNonHtmlContents = false;
+    @Builder.Default private boolean onlyMainContent = false;
 
-    @Builder.Default private Set<String> allowedTags = Set.of("a");
+    @Builder.Default
+    private Set<String> excludeFromMainContentTags =
+            Set.of(
+                    "script",
+                    "style",
+                    "noscript",
+                    "iframe",
+                    "link",
+                    "base",
+                    "meta",
+                    "object",
+                    "embed",
+                    "applet",
+                    "audio",
+                    "video",
+                    "canvas",
+                    "template",
+                    "comment");
+
+    @Builder.Default private Set<String> allowedTagsForHtmlDocumentScan = Set.of("a");
 
     public boolean isAllowedUrl(String url) {
         final String domainOnly;
@@ -96,6 +116,6 @@ public boolean isAllowedUrl(String url) {
     }
 
     public boolean isAllowedTag(String tagName) {
-        return tagName != null && allowedTags.contains(tagName.toLowerCase());
+        return tagName != null && allowedTagsForHtmlDocumentScan.contains(tagName.toLowerCase());
     }
 }
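
The last hunk is a pure rename (`allowedTags` becomes `allowedTagsForHtmlDocumentScan`), separating the link-scan tag set from the new main-content exclusion set; behavior is unchanged. A quick check, assuming the builder accepts all defaults:

import ai.langstream.agents.webcrawler.crawler.WebCrawlerConfiguration;

public class AllowedTagExample {
    public static void main(String[] args) {
        WebCrawlerConfiguration configuration = WebCrawlerConfiguration.builder().build();
        System.out.println(configuration.isAllowedTag("a"));   // true: links are still scanned
        System.out.println(configuration.isAllowedTag("A"));   // true: the check lower-cases first
        System.out.println(configuration.isAllowedTag("div")); // false: not in the scan set
    }
}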
(Diffs for the remaining two changed files are not shown.)
