From 1460c7aa1d17039544d2b8fe441f8c7262ecac2b Mon Sep 17 00:00:00 2001 From: CodingPF Date: Sun, 5 Nov 2023 14:18:34 +0100 Subject: [PATCH 1/4] update configurationmanager to support configuration file name --- .../config/Log4JConfigurationFactory.java | 12 ++- .../mserver/base/config/MServerConfigDTO.java | 6 +- .../base/config/MServerConfigManager.java | 4 - .../mserver/crawler/CrawlerManager.java | 14 ++-- .../crawler/basic/IgnoreFilmFilter.java | 2 +- .../mserver/crawler/dw/DWTaskBase.java | 8 +- .../funk/json/FunkVideoDeserializer.java | 4 +- .../crawler/sr/tasks/SrFilmDetailTask.java | 1 - .../sr/tasks/SrRateLimitedDocumentTask.java | 7 +- .../crawler/zdf/tasks/ZdfTaskBase.java | 9 ++- .../mserver/ui/config/MServerConfigUI.java | 76 +++++++++++++------ .../CrawlerManagerImportFilmlistsTest.java | 17 ++--- .../crawler/CrawlerManagerLivestreamTest.java | 13 +--- .../mserver/crawler/CrawlerManagerTest.java | 17 +---- .../json/ArdTopicPageDeserializerTest.java | 2 - .../ZdfTopicsPageHtmlDeserializerTest.java | 1 - 16 files changed, 101 insertions(+), 92 deletions(-) diff --git a/src/main/java/de/mediathekview/mserver/base/config/Log4JConfigurationFactory.java b/src/main/java/de/mediathekview/mserver/base/config/Log4JConfigurationFactory.java index 8fed8a279..e63e9bb81 100644 --- a/src/main/java/de/mediathekview/mserver/base/config/Log4JConfigurationFactory.java +++ b/src/main/java/de/mediathekview/mserver/base/config/Log4JConfigurationFactory.java @@ -45,9 +45,19 @@ public class Log4JConfigurationFactory extends ConfigurationFactory { private static MServerLogSettingsDTO logSettings; + public Log4JConfigurationFactory(MServerLogSettingsDTO logSettings) { + Log4JConfigurationFactory.logSettings = logSettings; + } + + + public Log4JConfigurationFactory() { + if (Log4JConfigurationFactory.logSettings == null) { + Log4JConfigurationFactory.logSettings = new MServerConfigManager(MServerConfigManager.DEFAULT_CONFIG_FILE).getConfig().getLogSettings(); + } + } + static Configuration createConfiguration( final String name, final ConfigurationBuilder aBuilder) { - logSettings = new MServerConfigManager().getConfig().getLogSettings(); aBuilder.setConfigurationName(name); diff --git a/src/main/java/de/mediathekview/mserver/base/config/MServerConfigDTO.java b/src/main/java/de/mediathekview/mserver/base/config/MServerConfigDTO.java index aa35966a5..a338e0ece 100644 --- a/src/main/java/de/mediathekview/mserver/base/config/MServerConfigDTO.java +++ b/src/main/java/de/mediathekview/mserver/base/config/MServerConfigDTO.java @@ -15,7 +15,7 @@ public class MServerConfigDTO extends MServerBasicConfigDTO implements ConfigDTO private final Boolean writeFilmlistIdFileEnabled; private final String filmlistIdFilePath; /** ignore certain film by title **/ - private final String ignoreFilmlistPath; + private String ignoreFilmlistPath; /** add livestreams from external list **/ private final ImportLivestreamConfiguration importLivestreamConfiguration; /** add additional filmlist from external **/ @@ -147,6 +147,10 @@ public Map getFilmlistSavePaths() { public void setFilmlistSavePaths(final Map filmlistSavePaths) { this.filmlistSavePaths = filmlistSavePaths; } + + public void setIgnoreFilmlistPath(final String ignoreFilmlistPath) { + this.ignoreFilmlistPath = ignoreFilmlistPath; + } public MServerLogSettingsDTO getLogSettings() { return logSettings; diff --git a/src/main/java/de/mediathekview/mserver/base/config/MServerConfigManager.java b/src/main/java/de/mediathekview/mserver/base/config/MServerConfigManager.java index f308b5c8f..12a405655 100644 --- a/src/main/java/de/mediathekview/mserver/base/config/MServerConfigManager.java +++ b/src/main/java/de/mediathekview/mserver/base/config/MServerConfigManager.java @@ -14,10 +14,6 @@ public MServerConfigManager(final String fileName) { configFileName = fileName; } - public MServerConfigManager() { - this(DEFAULT_CONFIG_FILE); - } - /** * @param aSender The {@link Sender} for which the config will be loaded. * @return The Sender specific config. diff --git a/src/main/java/de/mediathekview/mserver/crawler/CrawlerManager.java b/src/main/java/de/mediathekview/mserver/crawler/CrawlerManager.java index 8c8619c77..245a3bf16 100644 --- a/src/main/java/de/mediathekview/mserver/crawler/CrawlerManager.java +++ b/src/main/java/de/mediathekview/mserver/crawler/CrawlerManager.java @@ -57,8 +57,8 @@ public class CrawlerManager extends AbstractManager { private static final String FILMLIST_JSON_COMPRESSED_DEFAULT_NAME = FILMLIST_JSON_DEFAULT_NAME + ".xz"; private static final Logger LOG = LogManager.getLogger(CrawlerManager.class); - private static CrawlerManager instance; private final MServerConfigDTO config; + private final MServerConfigManager rootConfig; private final ForkJoinPool forkJoinPool; private final Filmlist filmlist; private final IgnoreFilmFilter ingoreFilmFilter; @@ -70,9 +70,9 @@ public class CrawlerManager extends AbstractManager { private final Collection copyProgressListeners; private final Filmlist differenceList; - private CrawlerManager() { + public CrawlerManager(MServerConfigManager aMServerConfigManager) { super(); - final MServerConfigManager rootConfig = new MServerConfigManager(); + rootConfig = aMServerConfigManager; config = rootConfig.getConfig(); ingoreFilmFilter = new IgnoreFilmFilter(config.getIgnoreFilmslistPath()); executorService = Executors.newFixedThreadPool(config.getMaximumCpuThreads()); @@ -86,11 +86,8 @@ private CrawlerManager() { initializeCrawler(rootConfig); } - public static CrawlerManager getInstance() { - if (instance == null) { - instance = new CrawlerManager(); - } - return instance; + public MServerConfigManager getConfigManager() { + return rootConfig; } public void copyFilmlist() { @@ -166,6 +163,7 @@ public void importLivestreamFilmlist() { } public void importLivestreamFilmlist(final FilmlistFormats aFormat, final String aFilmlistLocation) { + LOG.debug("importLivestreamFilmlist {}", aFilmlistLocation); try { final Optional importedFilmlist; if (aFilmlistLocation.startsWith(HTTP)) { diff --git a/src/main/java/de/mediathekview/mserver/crawler/basic/IgnoreFilmFilter.java b/src/main/java/de/mediathekview/mserver/crawler/basic/IgnoreFilmFilter.java index 6436230a0..4e291571f 100644 --- a/src/main/java/de/mediathekview/mserver/crawler/basic/IgnoreFilmFilter.java +++ b/src/main/java/de/mediathekview/mserver/crawler/basic/IgnoreFilmFilter.java @@ -31,7 +31,7 @@ public IgnoreFilmFilter(String configFileNameAndPath) { } else { ignoreFilmTitles = read(configFileNameAndPath); } - LOG.debug("ignoreFilmList setup with {} entries", size()); + LOG.debug("ignoreFilmList {} setup with {} entries", configFileNameAndPath, size()); } catch (IOException e) { LOG.error("Could not read ignorefilmlist from {} ",configFileNameAndPath, e); } diff --git a/src/main/java/de/mediathekview/mserver/crawler/dw/DWTaskBase.java b/src/main/java/de/mediathekview/mserver/crawler/dw/DWTaskBase.java index 3a57e66fd..43b40f0fb 100644 --- a/src/main/java/de/mediathekview/mserver/crawler/dw/DWTaskBase.java +++ b/src/main/java/de/mediathekview/mserver/crawler/dw/DWTaskBase.java @@ -5,7 +5,6 @@ import com.google.gson.GsonBuilder; import de.mediathekview.mlib.daten.Sender; -import de.mediathekview.mserver.base.config.MServerConfigManager; import de.mediathekview.mserver.crawler.basic.AbstractCrawler; import de.mediathekview.mserver.crawler.basic.AbstractRestTask; import de.mediathekview.mserver.crawler.basic.CrawlerUrlDTO; @@ -25,9 +24,7 @@ public abstract class DWTaskBase extends AbstractRestTask { private static final Logger LOG = LogManager.getLogger(DWTaskBase.class); - private static final RateLimiter limiter = - RateLimiter.create( - new MServerConfigManager().getSenderConfig(Sender.DW).getMaximumRequestsPerSecond()); + private static RateLimiter limiter = null; private final transient GsonBuilder gsonBuilder; @@ -81,6 +78,9 @@ private Response executeRequest(final WebTarget aTarget) { request.header( ZdfConstants.HEADER_AUTHENTIFICATION, AUTHORIZATION_BEARER + authKey.get()); } + if (limiter == null) { + limiter = RateLimiter.create(crawler.getRuntimeConfig().getSenderConfig(Sender.DW).getMaximumRequestsPerSecond()); + } limiter.acquire(); return request.header(HEADER_ACCEPT_ENCODING, ENCODING_GZIP).get(); } diff --git a/src/main/java/de/mediathekview/mserver/crawler/funk/json/FunkVideoDeserializer.java b/src/main/java/de/mediathekview/mserver/crawler/funk/json/FunkVideoDeserializer.java index 5b2d5d3a1..f3de954b9 100644 --- a/src/main/java/de/mediathekview/mserver/crawler/funk/json/FunkVideoDeserializer.java +++ b/src/main/java/de/mediathekview/mserver/crawler/funk/json/FunkVideoDeserializer.java @@ -73,9 +73,7 @@ protected FilmInfoDto mapToElement(final JsonObject jsonObject) { } private MServerConfigDTO getRuntimeConfig() { - return crawler - .map(AbstractCrawler::getRuntimeConfig) - .orElseGet(() -> new MServerConfigManager().getConfig()); + return crawler.get().getRuntimeConfig(); } private String createNexxCloudUrl(final String entityId) { diff --git a/src/main/java/de/mediathekview/mserver/crawler/sr/tasks/SrFilmDetailTask.java b/src/main/java/de/mediathekview/mserver/crawler/sr/tasks/SrFilmDetailTask.java index 925b0c92b..baca014cf 100644 --- a/src/main/java/de/mediathekview/mserver/crawler/sr/tasks/SrFilmDetailTask.java +++ b/src/main/java/de/mediathekview/mserver/crawler/sr/tasks/SrFilmDetailTask.java @@ -18,7 +18,6 @@ import org.jsoup.nodes.Node; import org.jsoup.select.Elements; -import java.io.IOException; import java.io.InputStreamReader; import java.net.MalformedURLException; import java.net.URL; diff --git a/src/main/java/de/mediathekview/mserver/crawler/sr/tasks/SrRateLimitedDocumentTask.java b/src/main/java/de/mediathekview/mserver/crawler/sr/tasks/SrRateLimitedDocumentTask.java index cb9d954f5..7ef0414e3 100644 --- a/src/main/java/de/mediathekview/mserver/crawler/sr/tasks/SrRateLimitedDocumentTask.java +++ b/src/main/java/de/mediathekview/mserver/crawler/sr/tasks/SrRateLimitedDocumentTask.java @@ -14,9 +14,7 @@ public abstract class SrRateLimitedDocumentTask private static final long serialVersionUID = -4077182368484515410L; - private static final RateLimiter LIMITER = - RateLimiter.create( - new MServerConfigManager().getSenderConfig(Sender.SR).getMaximumRequestsPerSecond()); + private static RateLimiter LIMITER = null; SrRateLimitedDocumentTask( final AbstractCrawler crawler, @@ -26,6 +24,9 @@ public abstract class SrRateLimitedDocumentTask @Override protected void processElement(final D urlDTO) { + if (LIMITER== null) { + LIMITER = RateLimiter.create(crawler.getRuntimeConfig().getSenderConfig(Sender.SR).getMaximumRequestsPerSecond()); + } LIMITER.acquire(); super.processElement(urlDTO); } diff --git a/src/main/java/de/mediathekview/mserver/crawler/zdf/tasks/ZdfTaskBase.java b/src/main/java/de/mediathekview/mserver/crawler/zdf/tasks/ZdfTaskBase.java index 6c0316324..e2318e46b 100644 --- a/src/main/java/de/mediathekview/mserver/crawler/zdf/tasks/ZdfTaskBase.java +++ b/src/main/java/de/mediathekview/mserver/crawler/zdf/tasks/ZdfTaskBase.java @@ -4,7 +4,6 @@ import com.google.gson.Gson; import com.google.gson.GsonBuilder; import de.mediathekview.mlib.daten.Sender; -import de.mediathekview.mserver.base.config.MServerConfigManager; import de.mediathekview.mserver.crawler.basic.AbstractCrawler; import de.mediathekview.mserver.crawler.basic.AbstractRestTask; import de.mediathekview.mserver.crawler.basic.CrawlerUrlDTO; @@ -22,9 +21,7 @@ public abstract class ZdfTaskBase extends AbstractRestTask { private static final Logger LOG = LogManager.getLogger(ZdfTaskBase.class); - private static final RateLimiter limiter = - RateLimiter.create( - new MServerConfigManager().getSenderConfig(Sender.ZDF).getMaximumRequestsPerSecond()); + private static RateLimiter limiter = null; private final GsonBuilder gsonBuilder; @@ -76,6 +73,10 @@ private Response executeRequest(final WebTarget aTarget) { request.header( ZdfConstants.HEADER_AUTHENTIFICATION, AUTHORIZATION_BEARER + authKey.get()); } + if (limiter == null) { + limiter = RateLimiter.create(crawler.getRuntimeConfig().getSenderConfig(Sender.ZDF).getMaximumRequestsPerSecond()); + } + limiter.acquire(); return request.header(HEADER_ACCEPT_ENCODING, ENCODING_GZIP).get(); } diff --git a/src/main/java/de/mediathekview/mserver/ui/config/MServerConfigUI.java b/src/main/java/de/mediathekview/mserver/ui/config/MServerConfigUI.java index c57376524..76f648c1b 100644 --- a/src/main/java/de/mediathekview/mserver/ui/config/MServerConfigUI.java +++ b/src/main/java/de/mediathekview/mserver/ui/config/MServerConfigUI.java @@ -2,7 +2,7 @@ import de.mediathekview.mlib.messages.listener.LogMessageListener; import de.mediathekview.mlib.messages.listener.MessageListener; -import de.mediathekview.mserver.base.config.MServerConfigDTO; +import de.mediathekview.mserver.base.config.Log4JConfigurationFactory; import de.mediathekview.mserver.base.config.MServerConfigManager; import de.mediathekview.mserver.base.config.MServerLogSettingsDTO; import de.mediathekview.mserver.base.messages.ServerMessages; @@ -13,34 +13,27 @@ import org.apache.logging.log4j.Logger; import java.io.IOException; +import java.io.InputStream; +import java.net.MalformedURLException; import java.net.URISyntaxException; +import java.net.URL; import java.nio.file.Files; import java.nio.file.Path; import java.nio.file.Paths; +import java.nio.file.StandardCopyOption; import java.util.ArrayList; import java.util.List; public final class MServerConfigUI { - - private static final Logger LOG = LogManager.getLogger(MServerConfigUI.class); + // logger setup in start + private Logger LOG = null; private static final String CONFIG_FILE_NAME = "MServer-Config.yaml"; private static final String ARGUMENT_GCONF = "-gconf"; - private final LogMessageListener logMessageListener; - private final MServerConfigDTO config; + private LogMessageListener logMessageListener; private CrawlerManager manager; public MServerConfigUI() { super(); - config = new MServerConfigManager().getConfig(); - final MServerLogSettingsDTO logSettings = config.getLogSettings(); - logSettings.setLogActivateConsole(true); - - logMessageListener = new LogMessageListener(); - - final Level configLevel = logSettings.getLogLevelConsole(); - if (configLevel == null || !logLevelInfoOrLower(configLevel)) { - logSettings.setLogLevelConsole(Level.INFO); - } } public static void main(final String[] args) { @@ -79,10 +72,6 @@ private boolean interpretProgramArguments(final String[] aProgramAgruments) { if (ARGUMENT_GCONF.equals(aProgramAgruments[0])) { generateDefaultConfiguration(); - } else { - logMessageListener.consumeMessage( - ServerMessages.UI_UNKNOWN_ARGUMENT, aProgramAgruments[0], ARGUMENT_GCONF); - return false; } } @@ -99,9 +88,7 @@ void start() { try { manager.start(); manager.importFilmlist(); - if (Boolean.TRUE.equals(config.getImportLivestreamConfiguration().isActive())) { - manager.importLivestreamFilmlist(); - } + manager.importLivestreamFilmlist(); } finally { manager.filterFilmlist(); manager.saveFilmlist(); @@ -114,10 +101,53 @@ void start() { } void start(final String[] aProgramAgruments) { + MServerConfigManager aMServerConfigManager = null; + if (aProgramAgruments.length > 0 && !ARGUMENT_GCONF.equals(aProgramAgruments[0])) { + String configFileName = aProgramAgruments[0]; + if (configFileName.startsWith("http")) { + URL fileUrl; + try { + // get a copy of this file to use it as configuration file + fileUrl = new URL(configFileName); + String filename = Paths.get(fileUrl.getPath()).getFileName().toString(); + MServerConfigUI.getRemoteFileToLocal(configFileName, filename); + configFileName = filename; + } catch (MalformedURLException e) { + e.printStackTrace(); + } + } + aMServerConfigManager = new MServerConfigManager(configFileName); + } else { + aMServerConfigManager = new MServerConfigManager(MServerConfigManager.DEFAULT_CONFIG_FILE); + } + // here we set the correct configManager for all log4logger + // logsettings are stored static in our factory + new Log4JConfigurationFactory(aMServerConfigManager.getConfig().getLogSettings()); + LOG = LogManager.getLogger(MServerConfigUI.class); + logMessageListener = new LogMessageListener(); + if (interpretProgramArguments(aProgramAgruments)) { - manager = CrawlerManager.getInstance(); + manager = new CrawlerManager(aMServerConfigManager); + final MServerLogSettingsDTO logSettings = aMServerConfigManager.getConfig().getLogSettings(); + logSettings.setLogActivateConsole(true); + final Level configLevel = logSettings.getLogLevelConsole(); + if (configLevel == null || !logLevelInfoOrLower(configLevel)) { + logSettings.setLogLevelConsole(Level.INFO); + } addListeners(); start(); } } + + public static void getRemoteFileToLocal(String source, String target) { + try { + URL fileUrl = new URL(source); + try (InputStream in = fileUrl.openStream()) { + Path outputPath = Path.of(target); + Files.copy(in, outputPath, StandardCopyOption.REPLACE_EXISTING); + } + } catch (IOException e) { + e.printStackTrace(); // we do not have a logger yet + } + } } diff --git a/src/test/java/de/mediathekview/mserver/crawler/CrawlerManagerImportFilmlistsTest.java b/src/test/java/de/mediathekview/mserver/crawler/CrawlerManagerImportFilmlistsTest.java index 6718b4afc..82d2469df 100644 --- a/src/test/java/de/mediathekview/mserver/crawler/CrawlerManagerImportFilmlistsTest.java +++ b/src/test/java/de/mediathekview/mserver/crawler/CrawlerManagerImportFilmlistsTest.java @@ -6,6 +6,7 @@ import de.mediathekview.mlib.messages.MessageUtil; import de.mediathekview.mlib.messages.listener.MessageListener; import de.mediathekview.mserver.base.config.ImportFilmlistConfiguration; +import de.mediathekview.mserver.base.config.MServerConfigManager; import de.mediathekview.mserver.testhelper.FileReader; import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; @@ -24,7 +25,6 @@ import java.io.File; import java.io.IOException; -import java.lang.reflect.Field; import java.nio.file.Files; import java.nio.file.Path; import java.util.Comparator; @@ -34,7 +34,7 @@ public class CrawlerManagerImportFilmlistsTest implements MessageListener { - private static final Logger LOG = LogManager.getLogger(CrawlerManagerImportFilmlistsTest.class); + private Logger LOG; private static final String TEMP_FOLDER_NAME_PATTERN = "MSERVER_TEST_%d"; private static Path testFileFolderPath; @@ -87,16 +87,9 @@ void testSaveAndImport(final ImportFilmlistConfiguration initialList, final Impo } public CrawlerManager createEmptyCrawlerManager() { - // reset singelton CrawlerManager to have an empty filmlist - Field instance; - try { - instance = CrawlerManager.class.getDeclaredField("instance"); - instance.setAccessible(true); - instance.set(null, null); - } catch (Exception e) { - fail("Exception mooking crawler manager: " + e.getMessage()); - } // - return CrawlerManager.getInstance(); + CrawlerManager cm = new CrawlerManager(new MServerConfigManager(MServerConfigManager.DEFAULT_CONFIG_FILE)); + LOG = LogManager.getLogger(CrawlerManagerImportFilmlistsTest.class); + return cm; } @AfterAll diff --git a/src/test/java/de/mediathekview/mserver/crawler/CrawlerManagerLivestreamTest.java b/src/test/java/de/mediathekview/mserver/crawler/CrawlerManagerLivestreamTest.java index 6f3e9f8fd..7ec9c5fa5 100644 --- a/src/test/java/de/mediathekview/mserver/crawler/CrawlerManagerLivestreamTest.java +++ b/src/test/java/de/mediathekview/mserver/crawler/CrawlerManagerLivestreamTest.java @@ -6,6 +6,7 @@ import de.mediathekview.mlib.messages.MessageUtil; import de.mediathekview.mlib.messages.listener.MessageListener; import de.mediathekview.mserver.base.config.ImportFilmlistConfiguration; +import de.mediathekview.mserver.base.config.MServerConfigManager; import de.mediathekview.mserver.testhelper.FileReader; import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; @@ -24,7 +25,6 @@ import java.io.File; import java.io.IOException; -import java.lang.reflect.Field; import java.nio.file.Files; import java.nio.file.Path; import java.util.Comparator; @@ -74,16 +74,7 @@ public void testSaveAndImport(final FilmlistFormats format, final String filmlis } public CrawlerManager createEmptyCrawlerManager() { - // reset singelton CrawlerManager to have an empty filmlist - Field instance; - try { - instance = CrawlerManager.class.getDeclaredField("instance"); - instance.setAccessible(true); - instance.set(null, null); - } catch (Exception e) { - fail("Exception mooking crawler manager: " + e.getMessage()); - } // - return CrawlerManager.getInstance(); + return new CrawlerManager(new MServerConfigManager(MServerConfigManager.DEFAULT_CONFIG_FILE)); } @AfterAll diff --git a/src/test/java/de/mediathekview/mserver/crawler/CrawlerManagerTest.java b/src/test/java/de/mediathekview/mserver/crawler/CrawlerManagerTest.java index 375bbce05..a259fbcb8 100644 --- a/src/test/java/de/mediathekview/mserver/crawler/CrawlerManagerTest.java +++ b/src/test/java/de/mediathekview/mserver/crawler/CrawlerManagerTest.java @@ -6,6 +6,7 @@ import de.mediathekview.mlib.messages.MessageUtil; import de.mediathekview.mlib.messages.listener.MessageListener; import de.mediathekview.mserver.base.config.ImportFilmlistConfiguration; +import de.mediathekview.mserver.base.config.MServerConfigManager; import de.mediathekview.mserver.testhelper.FileReader; import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; @@ -18,7 +19,6 @@ import java.io.File; import java.io.IOException; -import java.lang.reflect.Field; import java.nio.file.Files; import java.nio.file.Path; import java.util.Arrays; @@ -27,12 +27,11 @@ import java.util.Date; import static org.assertj.core.api.Assertions.assertThat; -import static org.junit.Assert.fail; @RunWith(Parameterized.class) public class CrawlerManagerTest implements MessageListener { - private static final Logger LOG = LogManager.getLogger(CrawlerManagerTest.class); + private Logger LOG; private static final String TEMP_FOLDER_NAME_PATTERN = "MSERVER_TEST_%d"; private static Path testFileFolderPath; @@ -45,16 +44,8 @@ public CrawlerManagerTest(final String aFilmlistPath, final FilmlistFormats aFor filmlistPath = aFilmlistPath; expectedSize = aExpectedSize; format = aFormat; - // reset singelton CrawlerManager - Field instance; - try { - instance = CrawlerManager.class.getDeclaredField("instance"); - instance.setAccessible(true); - instance.set(null, null); - } catch (Exception e) { - fail("Exception mooking crawler manager: " + e.getMessage()); - } // - CRAWLER_MANAGER = CrawlerManager.getInstance(); + CRAWLER_MANAGER = new CrawlerManager(new MServerConfigManager(MServerConfigManager.DEFAULT_CONFIG_FILE)); + LOG = LogManager.getLogger(CrawlerManagerTest.class); } @Parameterized.Parameters(name = "Test {index} Filmlist for {0} with {1}") diff --git a/src/test/java/de/mediathekview/mserver/crawler/ard/json/ArdTopicPageDeserializerTest.java b/src/test/java/de/mediathekview/mserver/crawler/ard/json/ArdTopicPageDeserializerTest.java index 9d369c0b3..bcab74769 100644 --- a/src/test/java/de/mediathekview/mserver/crawler/ard/json/ArdTopicPageDeserializerTest.java +++ b/src/test/java/de/mediathekview/mserver/crawler/ard/json/ArdTopicPageDeserializerTest.java @@ -7,10 +7,8 @@ import org.hamcrest.Matchers; import org.junit.Test; -import java.net.URLEncoder; import java.util.Set; -import static java.nio.charset.StandardCharsets.UTF_8; import static org.hamcrest.CoreMatchers.equalTo; import static org.hamcrest.MatcherAssert.assertThat; diff --git a/src/test/java/de/mediathekview/mserver/crawler/zdf/parser/ZdfTopicsPageHtmlDeserializerTest.java b/src/test/java/de/mediathekview/mserver/crawler/zdf/parser/ZdfTopicsPageHtmlDeserializerTest.java index 2899d6399..cdb5e2496 100644 --- a/src/test/java/de/mediathekview/mserver/crawler/zdf/parser/ZdfTopicsPageHtmlDeserializerTest.java +++ b/src/test/java/de/mediathekview/mserver/crawler/zdf/parser/ZdfTopicsPageHtmlDeserializerTest.java @@ -1,7 +1,6 @@ package de.mediathekview.mserver.crawler.zdf.parser; import de.mediathekview.mserver.crawler.basic.CrawlerUrlDTO; -import de.mediathekview.mserver.crawler.zdf.ZdfConstants; import de.mediathekview.mserver.testhelper.FileReader; import org.hamcrest.Matchers; import org.jsoup.Jsoup; From 5a5e59cd91380e1bc2d4c5917b622b1f77fcc8f3 Mon Sep 17 00:00:00 2001 From: CodingPF Date: Wed, 8 Nov 2023 07:54:19 +0100 Subject: [PATCH 2/4] change to configurable pool size --- .../base/utils/CheckUrlAvailability.java | 27 ++++++++++++------- .../mserver/crawler/CrawlerManager.java | 8 +++++- 2 files changed, 24 insertions(+), 11 deletions(-) diff --git a/src/main/java/de/mediathekview/mserver/base/utils/CheckUrlAvailability.java b/src/main/java/de/mediathekview/mserver/base/utils/CheckUrlAvailability.java index 45dc0eadd..7a8df2725 100644 --- a/src/main/java/de/mediathekview/mserver/base/utils/CheckUrlAvailability.java +++ b/src/main/java/de/mediathekview/mserver/base/utils/CheckUrlAvailability.java @@ -1,6 +1,8 @@ package de.mediathekview.mserver.base.utils; +import java.util.concurrent.ForkJoinPool; + import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; @@ -14,16 +16,19 @@ public class CheckUrlAvailability { private static final Logger LOG = LogManager.getLogger(CheckUrlAvailability.class); + private final FileSizeDeterminer fsd; + private int numberOfThreads = 10; private Long minFileSize = 2048L; private int removedCounter = 0; private long timeoutInMS = 1*60*1000L; private boolean timeout = false; private long start = 0; - private FileSizeDeterminer fsd = new FileSizeDeterminer(); - - public CheckUrlAvailability(long minFileSize, long timeoutInSec) { + + public CheckUrlAvailability(final long minFileSize, final long timeoutInSec, final int numberOfThreads) { this.minFileSize = minFileSize; this.timeoutInMS = timeoutInSec*1000; + this.numberOfThreads = numberOfThreads; + fsd = new FileSizeDeterminer(30L, 30L, numberOfThreads); } public Filmlist getAvaiableFilmlist(final Filmlist importList) { @@ -32,7 +37,14 @@ public Filmlist getAvaiableFilmlist(final Filmlist importList) { Filmlist filteredFilmlist = new Filmlist(); filteredFilmlist.setCreationDate(importList.getCreationDate()); filteredFilmlist.setListId(importList.getListId()); - importList.getFilms().values().stream().parallel().filter(this::isAvailable).forEach(filteredFilmlist::add); + // + ForkJoinPool customThreadPool = new ForkJoinPool(numberOfThreads); + customThreadPool.submit(() -> importList.getFilms().values().parallelStream() + .filter(this::isAvailable) + .forEach(filteredFilmlist::add)) + .join(); + customThreadPool.shutdown(); + // LOG.debug("checked {} urls and removed {} in {} sec and timeout was reached: {}", importList.getFilms().size(), removedCounter, ((System.currentTimeMillis()-start)/1000), timeout); return filteredFilmlist; } @@ -49,7 +61,7 @@ private boolean isAvailable(Film pFilm) { if (pFilm.getThema().equalsIgnoreCase("Livestream")) { // do not remove livestreams return true; - } else if (ri.getCode() == 404) { + } else if (!(ri.getCode() >= 200 && ri.getCode() < 300)) { LOG.debug("Film response ({}): {} # {} # {} # {} ", ri.getCode(), normalUrl, pFilm.getSender(), pFilm.getThema(), pFilm.getTitel()); removedCounter++; return false; @@ -66,11 +78,6 @@ private boolean isAvailable(Film pFilm) { removedCounter++; return false; } - // just for debugging - if (ri.getCode() != 200) { - LOG.debug("Film not removed but status!=200 ({}): {} # {} # {} # {} ", ri.getCode(), normalUrl, pFilm.getSender(), pFilm.getThema(), pFilm.getTitel()); - } - return true; } diff --git a/src/main/java/de/mediathekview/mserver/crawler/CrawlerManager.java b/src/main/java/de/mediathekview/mserver/crawler/CrawlerManager.java index 245a3bf16..6ab02771c 100644 --- a/src/main/java/de/mediathekview/mserver/crawler/CrawlerManager.java +++ b/src/main/java/de/mediathekview/mserver/crawler/CrawlerManager.java @@ -192,7 +192,13 @@ public void importFilmlist(final ImportFilmlistConfiguration importFilmlistConfi } // if (importFilmlistConfiguration.isCheckImportListUrl() && importedFilmlist.isPresent() ) { - importedFilmlist = Optional.of(new CheckUrlAvailability(config.getCheckImportListUrlMinSize() ,config.getCheckImportListUrlTimeoutInSec()).getAvaiableFilmlist(importedFilmlist.get())); + importedFilmlist = Optional.of( + new CheckUrlAvailability( + config.getCheckImportListUrlMinSize(), + config.getCheckImportListUrlTimeoutInSec(), + config.getMaximumCpuThreads()) + .getAvaiableFilmlist(importedFilmlist.get()) + ); } // final Filmlist difflist = new Filmlist(UUID.randomUUID(), LocalDateTime.now()); From 402a0b898a8d6b5470605d37d0d04eb66378815f Mon Sep 17 00:00:00 2001 From: CodingPF Date: Wed, 8 Nov 2023 21:48:50 +0100 Subject: [PATCH 3/4] write new and old format json --- MServer-Config.yaml | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/MServer-Config.yaml b/MServer-Config.yaml index c7d8e76fb..752cf2d39 100644 --- a/MServer-Config.yaml +++ b/MServer-Config.yaml @@ -22,6 +22,7 @@ senderIncluded: #- KIKA - DW #- BR + #- PHOENIX # If set the server will be awake after the crawler run and restarts the run after the given amount. #schedules: @@ -52,7 +53,7 @@ filmlistSaveFormats: # The paths where which filmlist should be safed to. filmlistSavePaths: -# JSON: target/filmlists/filmliste.json + JSON: target/filmlists/filmliste.json OLD_JSON: target/filmlists/filmliste_old.json # JSON_COMPRESSED_XZ: target/filmlists/filmliste.json.xz # OLD_JSON_COMPRESSED_XZ: target/filmlists/filmliste_old.json.xz @@ -64,8 +65,8 @@ filmlistSavePaths: # The paths where which diff film list should be safed to. # If not set no difference film lists will be safed. filmlistDiffSavePaths: -# JSON: target/filmlists/filmliste_diff.json -# OLD_JSON: target/filmlists/filmliste_old_diff.json + JSON: target/filmlists/filmliste_diff.json + OLD_JSON: target/filmlists/filmliste_old_diff.json # JSON_COMPRESSED_XZ: target/filmlists/filmliste_diff.json.xz # OLD_JSON_COMPRESSED_XZ: target/filmlists/filmliste_old_diff.json.xz # JSON_COMPRESSED_GZIP: target/filmlists/filmliste_diff.json.gz @@ -76,12 +77,12 @@ filmlistDiffSavePaths: #Sets if a filmlist hash file should be written writeFilmlistHashFileEnabled: true #The filmlist hash file path -filmlistHashFilePath: target/filmlists/filmlist.hash +filmlistHashFilePath: target/filmlists/filmlist.hash.xx #Sets if a filmlist id file should be written writeFilmlistIdFileEnabled: true #The fimlist id file path -filmlistIdFilePath: target/filmlists/filmlist.id +filmlistIdFilePath: target/filmlists/filmlist.id.xx # import additional filmlist sources From 16e2afa712d1cccf50b12c7cd19783651baf9d6d Mon Sep 17 00:00:00 2001 From: CodingPF Date: Wed, 8 Nov 2023 21:52:56 +0100 Subject: [PATCH 4/4] align classpath config and file config --- MServer-Config.yaml | 2 +- src/main/resources/MServer-Config.yaml | 74 ++++++++++++++++++++------ 2 files changed, 60 insertions(+), 16 deletions(-) diff --git a/MServer-Config.yaml b/MServer-Config.yaml index 752cf2d39..d6bc5192d 100644 --- a/MServer-Config.yaml +++ b/MServer-Config.yaml @@ -182,7 +182,7 @@ copySettings: # En- / disables FTP copyEnabled: false - # The paths where to safe the film list files.SrfTopicOverviewTask + # The paths where to safe the film list files. # WARNING: You can only set the path for film list formats you listed in "filmlistSaveFormats". # Required if enabled copyTargetFilePaths: diff --git a/src/main/resources/MServer-Config.yaml b/src/main/resources/MServer-Config.yaml index a08211881..8bb5ac0b5 100644 --- a/src/main/resources/MServer-Config.yaml +++ b/src/main/resources/MServer-Config.yaml @@ -7,6 +7,9 @@ maximumCpuThreads: 16 # If set to 0 the server runs without a time limit. maximumServerDurationInMinutes: 0 +# Rate limiter +maximumRequestsPerSecond: 999.0 + # These Sender will NOT be crawled. # If no Sender are included the server will crawl all Sender but these. #senderExcluded: @@ -77,22 +80,40 @@ writeFilmlistIdFileEnabled: true #The fimlist id file path filmlistIdFilePath: target/filmlist.id -# Sets if a filmlist should be imported -filmlistImporEnabled: false - -# The format of the film list to import. -# Possible are: JSON, OLD_JSON, JSON_COMPRESSED_XZ, OLD_JSON_COMPRESSED_XZ, JSON_COMPRESSED_GZIP, OLD_JSON_COMPRESSED_BZIP, JSON_COMPRESSED_GZIP, OLD_JSON_COMPRESSED_BZIP -#filmlistImportFormat: OLD_JSON_COMPRESSED_XZ - -# The path/URL of the film list to import. -#filmlistImportLocation: http://verteiler1.mediathekview.de/Filmliste-akt.xz +# import additional filmlist sources +importFilmlistConfigurations : + - active: false + path: "someCrawlerlist.json" + format: OLD_JSON + createDiff: false + checkImportListUrl: false + - active: false + path: "someMoreCrawlerlist.json" + format: OLD_JSON + createDiff: false + checkImportListUrl: false + - active: false + path: "https://verteiler1.mediathekview.de/filme-org.xz" + format: OLD_JSON_COMPRESSED_XZ + createDiff: true + checkImportListUrl: true + +# film url is consider invalid if the size is below the minSize +checkImportListUrlMinSize: 5012 + +# abort url checking after x sec +checkImportListUrlTimeoutInSec: 1800 #### Default crawler configurations #### # The maximum amount of URLs to be processed per task. maximumUrlsPerTask: 50 # The maximum duration in minutes a crawler may run. -maximumCrawlDurationInMinutes: 60 +maximumCrawlDurationInMinutes: 120 + +# Enables the topics search +# maximumSubpages limits the depth of the topics search +topicsSearchEnabled: false # The maximum amount of sub pages to be crawled.
# Example: If a Sendung overview side has 10 pages with videos for this Sendung and @@ -113,9 +134,20 @@ socketTimeoutInSeconds: 60 senderConfigurations: ARD: # Actually the ARD has a maximum of 6 days in the past - maximumDaysForSendungVerpasstSection: 6 + maximumDaysForSendungVerpasstSection: 1 + #2,4,8 ok + maximumUrlsPerTask: 32 + #10,20,40 ok + maximumSubpages: 0 + ORF: + #2,4,8 ok + maximumUrlsPerTask: 40 ARTE_DE: - maximumDaysForSendungVerpasstSectionFuture: 21 + maximumUrlsPerTask: 1 + maximumDaysForSendungVerpasstSectionFuture: 0 + maximumRequestsPerSecond: 2.0 + ARTE_FR: + maximumDaysForSendungVerpasstSectionFuture: 0 # The maximum amount of URLs to be processed per task. # maximumUrlsPerTask: 25 # The maximum duration in minutes a crawler may run. @@ -125,9 +157,21 @@ senderConfigurations: # the amount set by this is 5 then the crawler crawls pages 1 to 5. # maximumSubpages: 3 KIKA: - socketTimeoutInSeconds: 120 - - + maximumSubpages: 2 + maximumRequestsPerSecond: 8.0 + SR: + maximumRequestsPerSecond: 2.0 + ZDF: + maximumRequestsPerSecond: 10.0 + FUNK: + maximumUrlsPerTask: 99 + DW: + maximumSubpages: 0 + +# configure string variables +crawlerApiParams: + FUNK_REQUEST_TOKEN: 137782e774d7cadc93dcbffbbde0ce9c + #### COPY #### copySettings: # En- / disables FTP