Skip to content

Commit

Permalink
[webcrawler-source] Expose the allow-non-html-contents parameter (#740)
Browse files (browse the repository at this point in the history)
  • Loading branch information
eolivelli authored Nov 30, 2023
1 parent 71723ff commit dd3c120
Show file tree
Hide file tree
Showing 6 changed files with 16 additions and 3 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -115,11 +115,13 @@ public void init(Map<String, Object> configuration) throws Exception {
String userAgent = getString("user-agent", DEFAULT_USER_AGENT, configuration);
int maxErrorCount = getInt("max-error-count", 5, configuration);
int httpTimeout = getInt("http-timeout", 10000, configuration);
boolean allowNonHtmlContents = getBoolean("allow-non-html-contents", false, configuration);

boolean handleCookies = getBoolean("handle-cookies", true, configuration);

log.info("allowed-domains: {}", allowedDomains);
log.info("forbidden-paths: {}", forbiddenPaths);
log.info("allow-non-html-contents: {}", allowNonHtmlContents);
log.info("seed-urls: {}", seedUrls);
log.info("max-urls: {}", maxUrls);
log.info("max-depth: {}", maxDepth);
Expand All @@ -133,6 +135,7 @@ public void init(Map<String, Object> configuration) throws Exception {
WebCrawlerConfiguration webCrawlerConfiguration =
WebCrawlerConfiguration.builder()
.allowedDomains(allowedDomains)
.allowNonHtmlContents(allowNonHtmlContents)
.maxUrls(maxUrls)
.maxDepth(maxDepth)
.forbiddenPaths(forbiddenPaths)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -204,7 +204,7 @@ public boolean runCycle() throws Exception {
// we did something
return true;
} catch (UnsupportedMimeTypeException notHtml) {
if (configuration.isAllowNonHtmlContent()) {
if (configuration.isAllowNonHtmlContents()) {
log.info(
"Url {} lead to a {} content-type document. allow-not-html-content is true, so we are processing it",
current,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ public class WebCrawlerConfiguration {
@Builder.Default private boolean handleCookies = true;
@Builder.Default private boolean handleRobotsFile = true;
@Builder.Default private boolean scanHtmlDocuments = true;
@Builder.Default private boolean allowNonHtmlContent = false;
@Builder.Default private boolean allowNonHtmlContents = false;

@Builder.Default private Set<String> allowedTags = Set.of("a");

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -345,7 +345,7 @@ void testBinaryContent(WireMockRuntimeInfo vmRuntimeInfo) throws Exception {
WebCrawlerConfiguration configuration =
WebCrawlerConfiguration.builder()
.allowedDomains(Set.of(vmRuntimeInfo.getHttpBaseUrl()))
.allowNonHtmlContent(true)
.allowNonHtmlContents(true)
.handleRobotsFile(false)
.maxErrorCount(5)
.build();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -203,6 +203,15 @@ public static class Config {
@JsonProperty("scan-html-documents")
private boolean scanHtmlDocuments;

/**
 * Bound to the {@code allow-non-html-contents} configuration key; the documented default is
 * {@code false}.
 */
@ConfigProperty(
description =
"""
Whether to emit non-HTML documents to the pipeline (e.g. PDF files).
""",
defaultValue = "false")
@JsonProperty("allow-non-html-contents")
private boolean allowNonHtmlContents;

@ConfigProperty(
description =
"""
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,7 @@ public void test(WireMockRuntimeInfo vmRuntimeInfo) throws Exception {
output: "${globals.output-topic}"
configuration:\s
seed-urls: ["%s/index.html"]
allow-non-html-contents: true
allowed-domains: ["%s"]
state-storage: disk
"""
Expand Down

0 comments on commit dd3c120

Please sign in to comment.