From 605d2877618680bb04640f38bd5e840f22464747 Mon Sep 17 00:00:00 2001
From: Shinsuke Sugaya
Date: Thu, 19 Dec 2024 16:50:59 +0900
Subject: [PATCH] remove metaData

---
 .../codelibs/fess/crawler/CrawlerThread.java  | 23 +++++-----
 .../crawler/builder/RequestDataBuilder.java   |  4 +-
 .../fess/crawler/entity/RequestData.java      | 42 ++++++++++---------
 .../impl/DefaultResponseProcessor.java        |  7 ++--
 .../impl/SitemapsResponseProcessor.java       |  2 +-
 .../service/impl/UrlQueueServiceImpl.java     | 10 ++---
 6 files changed, 47 insertions(+), 41 deletions(-)

diff --git a/fess-crawler/src/main/java/org/codelibs/fess/crawler/CrawlerThread.java b/fess-crawler/src/main/java/org/codelibs/fess/crawler/CrawlerThread.java
index 631cb07d..2530d331 100644
--- a/fess-crawler/src/main/java/org/codelibs/fess/crawler/CrawlerThread.java
+++ b/fess-crawler/src/main/java/org/codelibs/fess/crawler/CrawlerThread.java
@@ -151,8 +151,8 @@ public void run() {
                             log(logHelper, LogType.GET_CONTENT, crawlerContext, urlQueue);
                             // access an url
                             final long startTime = SystemUtil.currentTimeMillis();
-                            responseData = client.execute(
-                                    RequestDataBuilder.newRequestData().method(urlQueue.getMethod()).url(urlQueue.getUrl()).build());
+                            responseData = client.execute(RequestDataBuilder.newRequestData().method(urlQueue.getMethod())
+                                    .url(urlQueue.getUrl()).weight(urlQueue.getWeight()).build());
                             responseData.setExecutionTime(SystemUtil.currentTimeMillis() - startTime);
                             responseData.setParentUrl(urlQueue.getParentUrl());
                             responseData.setSessionId(crawlerContext.sessionId);
@@ -163,7 +163,7 @@
                             } else {
                                 log(logHelper, LogType.REDIRECT_LOCATION, crawlerContext, urlQueue, responseData);
                                 // redirect
-                                storeChildUrl(responseData.getRedirectLocation(), urlQueue.getUrl(), null,
+                                storeChildUrl(responseData.getRedirectLocation(), urlQueue.getUrl(), urlQueue.getWeight(),
                                         urlQueue.getDepth() == null ? 1 : urlQueue.getDepth() + 1);
                             }
                         }
@@ -234,7 +234,8 @@ protected void addSitemapsFromRobotsTxt(final UrlQueue<?> urlQueue) {
         if (sitemaps != null) {
             for (final String childUrl : sitemaps) {
                 try {
-                    storeChildUrl(childUrl, urlQueue.getUrl(), null, urlQueue.getDepth() == null ? 1 : urlQueue.getDepth() + 1);
+                    storeChildUrl(childUrl, urlQueue.getUrl(), urlQueue.getWeight(),
+                            urlQueue.getDepth() == null ? 1 : urlQueue.getDepth() + 1);
                 } catch (final Exception e) {
                     log(logHelper, LogType.PROCESS_CHILD_URL_BY_EXCEPTION, crawlerContext, urlQueue, childUrl, e);
                 }
@@ -253,7 +254,8 @@ protected boolean isContentUpdated(final CrawlerClient client, final UrlQueue<?>
             ResponseData responseData = null;
             try {
                 // head method
-                responseData = client.execute(RequestDataBuilder.newRequestData().head().url(urlQueue.getUrl()).build());
+                responseData = client
+                        .execute(RequestDataBuilder.newRequestData().head().url(urlQueue.getUrl()).weight(urlQueue.getWeight()).build());
                 if (responseData != null && responseData.getLastModified() != null
                         && responseData.getLastModified().getTime() <= urlQueue.getLastModified().longValue()
                         && responseData.getHttpStatusCode() == 200) {
@@ -301,8 +303,9 @@ protected void storeChildUrls(final Set<RequestData> childUrlList, final String
 
         // add url and filter
         final Set<String> urlSet = new HashSet<>();
-        final List<UrlQueue<?>> childList = childUrlList.stream().filter(d -> StringUtil.isNotBlank(d.getUrl())
-                && urlSet.add(d.getUrl() + "\n" + d.getMetaData()) && crawlerContext.urlFilter.match(d.getUrl())).map(d -> {
+        final List<UrlQueue<?>> childList = childUrlList.stream()
+                .filter(d -> StringUtil.isNotBlank(d.getUrl()) && urlSet.add(d.getUrl()) && crawlerContext.urlFilter.match(d.getUrl()))
+                .map(d -> {
                     final UrlQueue<?> uq = crawlerContainer.getComponent("urlQueue");
                     uq.setCreateTime(SystemUtil.currentTimeMillis());
                     uq.setDepth(depth);
@@ -310,13 +313,13 @@ protected void storeChildUrls(final Set<RequestData> childUrlList, final String
                     uq.setParentUrl(url);
                     uq.setSessionId(crawlerContext.sessionId);
                     uq.setUrl(d.getUrl());
-                    uq.setMetaData(d.getMetaData());
+                    uq.setWeight(d.getWeight());
                     return uq;
                 }).collect(Collectors.toList());
         urlQueueService.offerAll(crawlerContext.sessionId, childList);
     }
 
-    protected void storeChildUrl(final String childUrl, final String parentUrl, final String metaData, final int depth) {
+    protected void storeChildUrl(final String childUrl, final String parentUrl, final float weight, final int depth) {
         if (crawlerContext.getMaxDepth() >= 0 && depth > crawlerContext.getMaxDepth()) {
             return;
         }
@@ -331,7 +334,7 @@ protected void storeChildUrl(final String childUrl, final String parentUrl, fina
             uq.setParentUrl(parentUrl);
             uq.setSessionId(crawlerContext.sessionId);
             uq.setUrl(childUrl);
-            uq.setMetaData(metaData);
+            uq.setWeight(weight);
             childList.add(uq);
             urlQueueService.offerAll(crawlerContext.sessionId, childList);
         }
diff --git a/fess-crawler/src/main/java/org/codelibs/fess/crawler/builder/RequestDataBuilder.java b/fess-crawler/src/main/java/org/codelibs/fess/crawler/builder/RequestDataBuilder.java
index 3b7ba357..3dedb1b2 100644
--- a/fess-crawler/src/main/java/org/codelibs/fess/crawler/builder/RequestDataBuilder.java
+++ b/fess-crawler/src/main/java/org/codelibs/fess/crawler/builder/RequestDataBuilder.java
@@ -75,8 +75,8 @@ public RequestDataContext url(final String url) {
             return this;
         }
 
-        public RequestDataContext metaData(final String metaData) {
-            data.setMetaData(metaData);
+        public RequestDataContext weight(final float weight) {
+            data.setWeight(weight);
             return this;
         }
 
diff --git a/fess-crawler/src/main/java/org/codelibs/fess/crawler/entity/RequestData.java b/fess-crawler/src/main/java/org/codelibs/fess/crawler/entity/RequestData.java
index 24be1442..51d8277f 100644
--- a/fess-crawler/src/main/java/org/codelibs/fess/crawler/entity/RequestData.java
+++ b/fess-crawler/src/main/java/org/codelibs/fess/crawler/entity/RequestData.java
@@ -15,6 +15,9 @@
  */
 package org.codelibs.fess.crawler.entity;
 
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.Map;
 import java.util.Objects;
 
 import org.codelibs.fess.crawler.Constants;
@@ -32,7 +35,7 @@ public enum Method {
 
     private String url;
 
-    private String metaData;
+    private float weight = 1.0f;
 
     public Method getMethod() {
         return method;
@@ -62,36 +65,35 @@ public void setUrl(final String url) {
         this.url = url;
     }
 
-    public String getMetaData() {
-        return metaData;
+    public float getWeight() {
+        return weight;
     }
 
-    public void setMetaData(final String metaData) {
-        this.metaData = metaData;
-    }
-
-    @Override
-    public String toString() {
-        return "RequestData [method=" + method + ", url=" + url + "]";
+    public void setWeight(float weight) {
+        this.weight = weight;
     }
 
     @Override
     public int hashCode() {
-        return Objects.hash(metaData, method, url);
+        return Objects.hash(method, url, weight);
     }
 
     @Override
-    public boolean equals(final Object obj) {
-        if (this == obj) {
+    public boolean equals(Object obj) {
+        if (this == obj)
             return true;
-        }
-        if (obj == null || getClass() != obj.getClass()) {
+        if (obj == null)
             return false;
-        }
-        final RequestData other = (RequestData) obj;
-        if (!Objects.equals(metaData, other.metaData) || method != other.method || !Objects.equals(url, other.url)) {
+        if (getClass() != obj.getClass())
             return false;
-        }
-        return true;
+        RequestData other = (RequestData) obj;
+        return method == other.method && Objects.equals(url, other.url)
+                && Float.floatToIntBits(weight) == Float.floatToIntBits(other.weight);
+    }
+
+    @Override
+    public String toString() {
+        return "RequestData [method=" + method + ", url=" + url + ", weight=" + weight + "]";
     }
+
 }
diff --git a/fess-crawler/src/main/java/org/codelibs/fess/crawler/processor/impl/DefaultResponseProcessor.java b/fess-crawler/src/main/java/org/codelibs/fess/crawler/processor/impl/DefaultResponseProcessor.java
index 2df9941b..0d7b8838 100644
--- a/fess-crawler/src/main/java/org/codelibs/fess/crawler/processor/impl/DefaultResponseProcessor.java
+++ b/fess-crawler/src/main/java/org/codelibs/fess/crawler/processor/impl/DefaultResponseProcessor.java
@@ -173,8 +173,9 @@ protected void storeChildUrls(final CrawlerContext crawlerContext, final Set<RequestData>
 
         // add url and filter
         final Set<String> urlSet = new HashSet<>();
-        final List<UrlQueue<?>> childList = childUrlList.stream().filter(d -> StringUtil.isNotBlank(d.getUrl())
-                && urlSet.add(d.getUrl() + "\n" + d.getMetaData()) && crawlerContext.getUrlFilter().match(d.getUrl())).map(d -> {
+        final List<UrlQueue<?>> childList = childUrlList.stream()
+                .filter(d -> StringUtil.isNotBlank(d.getUrl()) && urlSet.add(d.getUrl()) && crawlerContext.getUrlFilter().match(d.getUrl()))
+                .map(d -> {
                     final UrlQueue<?> uq = crawlerContainer.getComponent("urlQueue");
                     uq.setCreateTime(SystemUtil.currentTimeMillis());
                     uq.setDepth(depth);
@@ -183,7 +184,7 @@ protected void storeChildUrls(final CrawlerContext crawlerContext, final Set<RequestData>
                     uq.setParentUrl(url);
                     uq.setSessionId(crawlerContext.sessionId);
                     uq.setUrl(d.getUrl());
-                    uq.setMetaData(d.getMetaData());
+                    uq.setWeight(d.getWeight());
                     return uq;
                 }).collect(Collectors.toList());
         if (!childList.isEmpty()) {
diff --git a/fess-crawler/src/main/java/org/codelibs/fess/crawler/processor/impl/SitemapsResponseProcessor.java b/fess-crawler/src/main/java/org/codelibs/fess/crawler/processor/impl/SitemapsResponseProcessor.java
--- a/fess-crawler/src/main/java/org/codelibs/fess/crawler/processor/impl/SitemapsResponseProcessor.java
+++ b/fess-crawler/src/main/java/org/codelibs/fess/crawler/processor/impl/SitemapsResponseProcessor.java
@@ … @@ public void process(final ResponseData responseData) {
             final Set<RequestData> requestDataSet = new LinkedHashSet<>();
             for (final Sitemap sitemap : sitemapSet.getSitemaps()) {
                 if (sitemap != null) {
-                    requestDataSet.add(RequestDataBuilder.newRequestData().get().url(sitemap.getLoc()).build());
+                    requestDataSet.add(RequestDataBuilder.newRequestData().get().url(sitemap.getLoc()).build()); // TODO priority
                 }
             }
             throw new ChildUrlsException(requestDataSet, this.getClass().getName() + "#process");
diff --git a/fess-crawler/src/main/java/org/codelibs/fess/crawler/service/impl/UrlQueueServiceImpl.java b/fess-crawler/src/main/java/org/codelibs/fess/crawler/service/impl/UrlQueueServiceImpl.java
index 6d69a931..3b482fed 100644
--- a/fess-crawler/src/main/java/org/codelibs/fess/crawler/service/impl/UrlQueueServiceImpl.java
+++ b/fess-crawler/src/main/java/org/codelibs/fess/crawler/service/impl/UrlQueueServiceImpl.java
@@ -75,7 +75,6 @@ public void add(final String sessionId, final String url) {
             urlQueue.setSessionId(sessionId);
             urlQueue.setMethod(Constants.GET_METHOD);
             urlQueue.setUrl(url);
-            urlQueue.setUrl(url);
             urlQueue.setDepth(0);
             urlQueue.setCreateTime(SystemUtil.currentTimeMillis());
             urlQueueList.add(urlQueue);
@@ -215,12 +214,13 @@ public void generateUrlQueues(final String previousSessionId, final String sessi
         for (final Map.Entry<String, AccessResultImpl<Long>> entry : arMap.entrySet()) {
             synchronized (urlQueueList) {
                 final UrlQueueImpl<Long> urlQueue = new UrlQueueImpl<>();
+                final AccessResultImpl<Long> value = entry.getValue();
                 urlQueue.setSessionId(sessionId);
-                urlQueue.setMethod(entry.getValue().getMethod());
-                urlQueue.setUrl(entry.getValue().getUrl());
-                urlQueue.setParentUrl(entry.getValue().getParentUrl());
+                urlQueue.setMethod(value.getMethod());
+                urlQueue.setUrl(value.getUrl());
+                urlQueue.setParentUrl(value.getParentUrl());
                 urlQueue.setDepth(0);
-                urlQueue.setLastModified(entry.getValue().getLastModified());
+                urlQueue.setLastModified(value.getLastModified());
                 urlQueue.setCreateTime(SystemUtil.currentTimeMillis());
                 urlQueueList.add(urlQueue);
             }
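For reference, a minimal usage sketch of the builder API after this change. It is not part of the patch: the WeightExample class name and the example URLs are illustrative only, and it assumes the patched fess-crawler classes are on the classpath, using only methods visible in the diff above (newRequestData(), get(), url(), weight(), build(), getWeight()).

    import org.codelibs.fess.crawler.builder.RequestDataBuilder;
    import org.codelibs.fess.crawler.entity.RequestData;

    public class WeightExample {
        public static void main(final String[] args) {
            // weight defaults to 1.0f when not set (see the new field in RequestData above)
            final RequestData plain = RequestDataBuilder.newRequestData().get().url("http://example.com/").build();

            // an explicit weight, e.g. copied from a parent UrlQueue as CrawlerThread now does
            final RequestData boosted =
                    RequestDataBuilder.newRequestData().get().url("http://example.com/").weight(2.0f).build();

            System.out.println(plain.getWeight());     // 1.0
            System.out.println(boosted.getWeight());   // 2.0

            // weight now participates in equals()/hashCode()/toString()
            System.out.println(plain.equals(boosted)); // false
            System.out.println(boosted);               // RequestData [method=GET, url=http://example.com/, weight=2.0]
        }
    }

Note that child-URL de-duplication in CrawlerThread and DefaultResponseProcessor now keys on the URL alone (urlSet.add(d.getUrl())), since the free-form metaData string that previously distinguished entries is gone.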