From 081764bd11f9fa795eaf394fb35bdc5ca6550285 Mon Sep 17 00:00:00 2001 From: Enrico Olivelli Date: Thu, 21 Sep 2023 16:25:20 +0200 Subject: [PATCH 1/3] [webcrawler] Do not fail in case of unknown properties in the JSON file (handle backward compability) --- .../ai/langstream/agents/webcrawler/WebCrawlerSource.java | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/langstream-agents/langstream-agent-webcrawler/src/main/java/ai/langstream/agents/webcrawler/WebCrawlerSource.java b/langstream-agents/langstream-agent-webcrawler/src/main/java/ai/langstream/agents/webcrawler/WebCrawlerSource.java index 3ade7cc6d..ddc17ae63 100644 --- a/langstream-agents/langstream-agent-webcrawler/src/main/java/ai/langstream/agents/webcrawler/WebCrawlerSource.java +++ b/langstream-agents/langstream-agent-webcrawler/src/main/java/ai/langstream/agents/webcrawler/WebCrawlerSource.java @@ -32,6 +32,7 @@ import ai.langstream.api.runner.code.Header; import ai.langstream.api.runner.code.Record; import ai.langstream.api.runner.code.SimpleRecord; +import com.fasterxml.jackson.databind.DeserializationFeature; import com.fasterxml.jackson.databind.ObjectMapper; import io.minio.BucketExistsArgs; import io.minio.GetObjectArgs; @@ -368,7 +369,9 @@ public String toString() { } private class S3StatusStorage implements StatusStorage { - private static final ObjectMapper MAPPER = new ObjectMapper(); + private static final ObjectMapper MAPPER = + new ObjectMapper() + .configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false); @Override public void storeStatus(Status status) throws Exception { From 17674ee59d035bb7c317bf4a1e52b332ee01d7aa Mon Sep 17 00:00:00 2001 From: Enrico Olivelli Date: Thu, 21 Sep 2023 16:57:38 +0200 Subject: [PATCH 2/3] Add tests --- .../agents/webcrawler/WebCrawlerSource.java | 15 ++++--- .../webcrawler/WebCrawlerSourceTest.java | 45 +++++++++++++++++++ 2 files changed, 54 insertions(+), 6 deletions(-) diff --git a/langstream-agents/langstream-agent-webcrawler/src/main/java/ai/langstream/agents/webcrawler/WebCrawlerSource.java b/langstream-agents/langstream-agent-webcrawler/src/main/java/ai/langstream/agents/webcrawler/WebCrawlerSource.java index ddc17ae63..deef68d42 100644 --- a/langstream-agents/langstream-agent-webcrawler/src/main/java/ai/langstream/agents/webcrawler/WebCrawlerSource.java +++ b/langstream-agents/langstream-agent-webcrawler/src/main/java/ai/langstream/agents/webcrawler/WebCrawlerSource.java @@ -32,7 +32,6 @@ import ai.langstream.api.runner.code.Header; import ai.langstream.api.runner.code.Record; import ai.langstream.api.runner.code.SimpleRecord; -import com.fasterxml.jackson.databind.DeserializationFeature; import com.fasterxml.jackson.databind.ObjectMapper; import io.minio.BucketExistsArgs; import io.minio.GetObjectArgs; @@ -58,6 +57,7 @@ import java.util.concurrent.BlockingQueue; import java.util.concurrent.LinkedBlockingQueue; import java.util.concurrent.atomic.AtomicInteger; +import lombok.Getter; import lombok.extern.slf4j.Slf4j; @Slf4j @@ -75,7 +75,7 @@ public class WebCrawlerSource extends AbstractAgentCode implements AgentSource { private MinioClient minioClient; private int reindexIntervalSeconds; - private String statusFileName; + @Getter private String statusFileName; private WebCrawler crawler; @@ -369,9 +369,7 @@ public String toString() { } private class S3StatusStorage implements StatusStorage { - private static final ObjectMapper MAPPER = - new ObjectMapper() - .configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false); + private static final ObjectMapper MAPPER = new ObjectMapper(); @Override public void storeStatus(Status status) throws Exception { @@ -397,7 +395,12 @@ public Status getCurrentStatus() throws Exception { .build()); byte[] content = result.readAllBytes(); log.info("Restoring status from {}, {} bytes", statusFileName, content.length); - return MAPPER.readValue(content, Status.class); + try { + return MAPPER.readValue(content, Status.class); + } catch (IOException e) { + log.error("Error parsing status file, restarting from scratch", e); + return null; + } } catch (ErrorResponseException e) { if (e.errorResponse().code().equals("NoSuchKey")) { log.info("No status file found, starting from scratch"); diff --git a/langstream-agents/langstream-agent-webcrawler/src/test/java/ai/langstream/agents/webcrawler/WebCrawlerSourceTest.java b/langstream-agents/langstream-agent-webcrawler/src/test/java/ai/langstream/agents/webcrawler/WebCrawlerSourceTest.java index d7c786bb0..baecde97e 100644 --- a/langstream-agents/langstream-agent-webcrawler/src/test/java/ai/langstream/agents/webcrawler/WebCrawlerSourceTest.java +++ b/langstream-agents/langstream-agent-webcrawler/src/test/java/ai/langstream/agents/webcrawler/WebCrawlerSourceTest.java @@ -36,7 +36,11 @@ import ai.langstream.api.runner.topics.TopicProducer; import com.github.tomakehurst.wiremock.junit5.WireMockRuntimeInfo; import com.github.tomakehurst.wiremock.junit5.WireMockTest; +import io.minio.MakeBucketArgs; import io.minio.MinioClient; +import io.minio.PutObjectArgs; +import java.io.ByteArrayInputStream; +import java.nio.charset.StandardCharsets; import java.nio.file.Path; import java.util.HashMap; import java.util.HashSet; @@ -422,4 +426,45 @@ public Path getCodeDirectory() { agentSource.start(); return (WebCrawlerSource) agentSource; } + + @Test + void testRecoverFromWrongJsonFile() throws Exception { + String bucket = "langstream-test-" + UUID.randomUUID(); + String url = "https://www.datastax.com/"; + String allowed = "https://www.datastax.com/"; + + String objectName = "test-global-agent-id.webcrawler.status.json"; + String json = + """ + { + "some-field": "some-value" + } + """; + minioClient.makeBucket(MakeBucketArgs.builder().bucket(bucket).build()); + minioClient.putObject( + PutObjectArgs.builder() + .bucket(bucket) + .contentType("application/json") + .object(objectName) + .stream( + new ByteArrayInputStream(json.getBytes(StandardCharsets.UTF_8)), + json.length(), + 5 * 1024 * 1024) + .build()); + WebCrawlerSource agentSource = + buildAgentSource( + bucket, + allowed, + Set.of(), + url, + Map.of( + "reindex-interval-seconds", + "3600", + "scan-html-documents", + "false", + "max-urls", + 10000)); + assertEquals(objectName, agentSource.getStatusFileName()); + agentSource.close(); + } } From ecc09ee8ec82dea4d6b48275e01c81d5e5ee809d Mon Sep 17 00:00:00 2001 From: Enrico Olivelli Date: Thu, 21 Sep 2023 17:20:45 +0200 Subject: [PATCH 3/3] set 50 as default max-depth --- .../java/ai/langstream/agents/webcrawler/WebCrawlerSource.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/langstream-agents/langstream-agent-webcrawler/src/main/java/ai/langstream/agents/webcrawler/WebCrawlerSource.java b/langstream-agents/langstream-agent-webcrawler/src/main/java/ai/langstream/agents/webcrawler/WebCrawlerSource.java index deef68d42..105071d77 100644 --- a/langstream-agents/langstream-agent-webcrawler/src/main/java/ai/langstream/agents/webcrawler/WebCrawlerSource.java +++ b/langstream-agents/langstream-agent-webcrawler/src/main/java/ai/langstream/agents/webcrawler/WebCrawlerSource.java @@ -108,7 +108,7 @@ public void init(Map configuration) throws Exception { allowedDomains = getSet("allowed-domains", configuration); forbiddenPaths = getSet("forbidden-paths", configuration); maxUrls = getInt("max-urls", 1000, configuration); - int maxDepth = getInt("max-depth", 10, configuration); + int maxDepth = getInt("max-depth", 50, configuration); handleRobotsFile = getBoolean("handle-robots-file", true, configuration); scanHtmlDocuments = getBoolean("scan-html-documents", true, configuration); seedUrls = getSet("seed-urls", configuration);