feat/met 5806 support embeddable resources profile (#675)

* MET-5806 add oembed detection for webresources * MET-5806 add tika detection for oEmbedded items * MET-5806 refactor unit tests * MET-5806 add complementary unit tests * MET-5806 correct file detection implementation and update unit tests * MET-5806 added code review comments * MET-5806 remove unused imports and optimizations * MET-5806 remove chain of responsibility pattern, start refactor * MET-5806 add xpath condition for oembed objects, pending fix unit tests * MET-5806 updated textprocessor and oembedprocessor with unit tests * MET-5806 add sonarqube recommendations * MET-5806 sonarcloud recommendations improvements * MET-5806 sonarcloud improvement recommendations part II * MET-5806 sonarcloud recommendations part III * MET-5806 Fix sonar issues --------- Co-authored-by: Simon Tzanakis <[email protected]>
europeana · Aug 15, 2024 · 09d064e · 09d064e
1 parent 6457c23
commit 09d064e
Show file tree

Hide file tree

Showing 32 changed files with 2,037 additions and 207 deletions.
diff --git a/...common/metis-common-utils/src/main/java/eu/europeana/metis/utils/RdfNamespaceContext.java b/...common/metis-common-utils/src/main/java/eu/europeana/metis/utils/RdfNamespaceContext.java
@@ -19,6 +19,8 @@ public class RdfNamespaceContext implements NamespaceContext {
   public static final String RDF_NAMESPACE_PREFIX = "rdf";
   public static final String EDM_NAMESPACE_PREFIX = "edm";
   public static final String ORE_NAMESPACE_PREFIX = "ore";
+  public static final String SVCS_NAMESPACE_PREFIX = "svcs";
+  public static final String DCTERMS_NAMESPACE_PREFIX = "dcterms";
 
   private static final Map<String, String> PREFIX_TO_NAMESPACE_MAP = new HashMap<>();
 
@@ -30,6 +32,8 @@ public class RdfNamespaceContext implements NamespaceContext {
     PREFIX_TO_NAMESPACE_MAP.put(RDF_NAMESPACE_PREFIX, "http://www.w3.org/1999/02/22-rdf-syntax-ns#");
     PREFIX_TO_NAMESPACE_MAP.put(ORE_NAMESPACE_PREFIX, "http://www.openarchives.org/ore/terms/");
     PREFIX_TO_NAMESPACE_MAP.put(EDM_NAMESPACE_PREFIX, "http://www.europeana.eu/schemas/edm/");
+    PREFIX_TO_NAMESPACE_MAP.put(SVCS_NAMESPACE_PREFIX,"http://rdfs.org/sioc/services#");
+    PREFIX_TO_NAMESPACE_MAP.put(DCTERMS_NAMESPACE_PREFIX, "http://purl.org/dc/terms/");
   }
 
   @Override

diff --git a/metis-media-service/pom.xml b/metis-media-service/pom.xml
@@ -130,5 +130,10 @@
       <groupId>org.wiremock</groupId>
       <artifactId>wiremock-standalone</artifactId>
     </dependency>
+    <dependency>
+      <groupId>org.mockito</groupId>
+      <artifactId>mockito-junit-jupiter</artifactId>
+      <scope>test</scope>
+    </dependency>
   </dependencies>
 </project>
diff --git a/...s-media-service/src/main/java/eu/europeana/metis/mediaprocessing/RdfDeserializerImpl.java b/...s-media-service/src/main/java/eu/europeana/metis/mediaprocessing/RdfDeserializerImpl.java
diff --git a/metis-media-service/src/main/java/eu/europeana/metis/mediaprocessing/RdfXpathConstants.java b/metis-media-service/src/main/java/eu/europeana/metis/mediaprocessing/RdfXpathConstants.java
@@ -0,0 +1,17 @@
+package eu.europeana.metis.mediaprocessing;
+
+/**
+ * Holder of XPath expressions (as plain strings) that address parts of an RDF record.
+ */
+public final class RdfXpathConstants {
+
+  /** Trailing XPath step that selects the rdf:resource attribute of an element. */
+  private static final String RESOURCE_ATTRIBUTE = "/@rdf:resource";
+
+  /** XPath of the rdf:RDF root element. */
+  public static final String RDF_NAMESPACE = "/rdf:RDF";
+
+  /** XPath of the ore:Aggregation elements directly below the root element. */
+  public static final String ORE_AGGREGATION = RDF_NAMESPACE + "/ore:Aggregation";
+
+  /** XPath of the rdf:resource attribute of edm:object inside an aggregation. */
+  public static final String EDM_OBJECT = ORE_AGGREGATION + "/edm:object" + RESOURCE_ATTRIBUTE;
+
+  /** XPath of the rdf:resource attribute of edm:isShownBy inside an aggregation. */
+  public static final String EDM_IS_SHOWN_BY = ORE_AGGREGATION + "/edm:isShownBy" + RESOURCE_ATTRIBUTE;
+
+  /** XPath of the rdf:resource attribute of edm:hasView inside an aggregation. */
+  public static final String EDM_HAS_VIEW = ORE_AGGREGATION + "/edm:hasView" + RESOURCE_ATTRIBUTE;
+
+  /** XPath of the rdf:resource attribute of edm:isShownAt inside an aggregation. */
+  public static final String EDM_IS_SHOWN_AT = ORE_AGGREGATION + "/edm:isShownAt" + RESOURCE_ATTRIBUTE;
+
+  private RdfXpathConstants() {
+    // Constants holder: not meant to be instantiated.
+  }
+}
diff --git a/...vice/src/main/java/eu/europeana/metis/mediaprocessing/extraction/AudioVideoProcessor.java b/...vice/src/main/java/eu/europeana/metis/mediaprocessing/extraction/AudioVideoProcessor.java
@@ -48,10 +48,13 @@
 class AudioVideoProcessor implements MediaProcessor {
 
   private static final Logger LOGGER = LoggerFactory.getLogger(AudioVideoProcessor.class);
+  public static final int FFPROBE_MAX_VERSION = 7;
+  public static final int FFPROBE_MIN_VERSION = 2;
 
   private static String globalFfprobeCommand;
 
   private final CommandExecutor commandExecutor;
+
   private final String ffprobeCommand;
 
   /**
@@ -97,7 +100,7 @@ static String discoverFfprobeCommand(CommandExecutor commandExecutor)
     int indexVersion = output.lastIndexOf("version ") + "version ".length();
     int version = Character.isDigit(output.charAt(indexVersion)) ?
         Integer.parseInt(String.valueOf(output.charAt(indexVersion))) : 0;
-    if (!(version >= 2 && version < 7)) {
+    if (!(version >= FFPROBE_MIN_VERSION && version < FFPROBE_MAX_VERSION)) {
       throw new MediaProcessorException("ffprobe version " + version + ".x not found");
     }
 

diff --git a/...service/src/main/java/eu/europeana/metis/mediaprocessing/extraction/Media3dProcessor.java b/...service/src/main/java/eu/europeana/metis/mediaprocessing/extraction/Media3dProcessor.java
@@ -29,4 +29,5 @@ public ResourceExtractionResult copyMetadata(Resource resource, String detectedM
   public boolean downloadResourceForFullProcessing() {
     return false;
   }
+
 }
diff --git a/...rvice/src/main/java/eu/europeana/metis/mediaprocessing/extraction/MediaExtractorImpl.java b/...rvice/src/main/java/eu/europeana/metis/mediaprocessing/extraction/MediaExtractorImpl.java
@@ -21,6 +21,7 @@
 import java.util.ArrayList;
 import java.util.Collections;
 import java.util.EnumSet;
+import java.util.List;
 import java.util.Optional;
 import java.util.Set;
 import org.apache.tika.io.TikaInputStream;
@@ -51,27 +52,27 @@ enum ProcessingMode {FULL, REDUCED, NONE}
   private final AudioVideoProcessor audioVideoProcessor;
   private final TextProcessor textProcessor;
   private final Media3dProcessor media3dProcessor;
+  private final OEmbedProcessor oEmbedProcessor;
 
   /**
    * Constructor meant for testing purposes.
    *
    * @param resourceDownloadClient The download client for resources.
    * @param mimeTypeDetectHttpClient The mime type detector for URLs.
    * @param tika A tika instance.
-   * @param imageProcessor An image processor.
-   * @param audioVideoProcessor An audio/video processor.
-   * @param textProcessor A text processor.
+   * @param mediaProcessorList the media processor list
    */
   MediaExtractorImpl(ResourceDownloadClient resourceDownloadClient,
-      MimeTypeDetectHttpClient mimeTypeDetectHttpClient, TikaWrapper tika, ImageProcessor imageProcessor,
-      AudioVideoProcessor audioVideoProcessor, TextProcessor textProcessor, Media3dProcessor media3dProcessor) {
+      MimeTypeDetectHttpClient mimeTypeDetectHttpClient, TikaWrapper tika,
+      List<MediaProcessor> mediaProcessorList) {
     this.resourceDownloadClient = resourceDownloadClient;
     this.mimeTypeDetectHttpClient = mimeTypeDetectHttpClient;
     this.tika = tika;
-    this.imageProcessor = imageProcessor;
-    this.audioVideoProcessor = audioVideoProcessor;
-    this.textProcessor = textProcessor;
-    this.media3dProcessor = media3dProcessor;
+    this.imageProcessor = (ImageProcessor) getMediaProcessor(mediaProcessorList, ImageProcessor.class);
+    this.audioVideoProcessor = (AudioVideoProcessor) getMediaProcessor(mediaProcessorList, AudioVideoProcessor.class);
+    this.textProcessor = (TextProcessor) getMediaProcessor(mediaProcessorList, TextProcessor.class);
+    this.media3dProcessor = (Media3dProcessor) getMediaProcessor(mediaProcessorList, Media3dProcessor.class);
+    this.oEmbedProcessor = (OEmbedProcessor) getMediaProcessor(mediaProcessorList, OEmbedProcessor.class);
   }
 
   /**
@@ -102,6 +103,16 @@ public MediaExtractorImpl(int redirectCount, int thumbnailGenerateTimeout,
     this.textProcessor = new TextProcessor(thumbnailGenerator,
         new PdfToImageConverter(new CommandExecutor(thumbnailGenerateTimeout)));
     this.media3dProcessor = new Media3dProcessor();
+    this.oEmbedProcessor = new OEmbedProcessor();
+  }
+
+  /**
+   * Looks up the first element of the given list that is an instance of the requested type.
+   *
+   * @param mediaProcessorList the candidate processors to search, in order
+   * @param type the class the returned element must be an instance of
+   * @param <T> the requested processor type
+   * @return the first matching element, already cast to the requested type, or null if the list
+   * contains no instance of that type
+   */
+  private <T> T getMediaProcessor(List<?> mediaProcessorList, Class<T> type) {
+    // Null elements never match (isInstance is false for null), so findFirst cannot see a null.
+    return mediaProcessorList.stream()
+                             .filter(type::isInstance)
+                             .map(type::cast)
+                             .findFirst()
+                             .orElse(null);
   }
 
   @Override
@@ -193,10 +204,10 @@ String detectType(Path path, String providedMimeType) throws IOException {
     }
   }
 
-  MediaProcessor chooseMediaProcessor(MediaType mediaType) {
+  MediaProcessor chooseMediaProcessor(MediaType mediaType, String detectedMimeType) {
     final MediaProcessor processor;
     switch (mediaType) {
-      case TEXT -> processor = textProcessor;
+      case TEXT, OTHER -> processor = chooseByDetectedMimeType(mediaType, detectedMimeType);
       case AUDIO, VIDEO -> processor = audioVideoProcessor;
       case IMAGE -> processor = imageProcessor;
       case THREE_D -> processor = media3dProcessor;
@@ -205,6 +216,20 @@ MediaProcessor chooseMediaProcessor(MediaType mediaType) {
     return processor;
   }
 
+  /**
+   * Selects a processor for a resource based on its detected mime type.
+   *
+   * @param mediaType the media type of the resource
+   * @param detectedMimeType the detected mime type of the resource, may be null
+   * @return the oEmbed processor when the media type is TEXT or OTHER and the mime type starts with
+   * text/xml, application/xml or application/json; otherwise the text processor when the media
+   * type is TEXT; null in all other cases, including a null mime type
+   */
+  MediaProcessor chooseByDetectedMimeType(MediaType mediaType, String detectedMimeType) {
+    if (detectedMimeType == null) {
+      return null;
+    }
+    final boolean isText = mediaType == MediaType.TEXT;
+    final boolean isXmlOrJson = detectedMimeType.startsWith("text/xml")
+        || detectedMimeType.startsWith("application/xml")
+        || detectedMimeType.startsWith("application/json");
+    if ((isText || mediaType == MediaType.OTHER) && isXmlOrJson) {
+      return oEmbedProcessor;
+    }
+    return isText ? textProcessor : null;
+  }
+
   void verifyAndCorrectContentAvailability(Resource resource, ProcessingMode mode,
       String detectedMimeType) throws MediaExtractionException, IOException {
 
@@ -255,19 +280,32 @@ ResourceExtractionResult performProcessing(Resource resource, ProcessingMode mod
     }
 
     // Choose the right media processor.
-    final MediaProcessor processor = chooseMediaProcessor(MediaType.getMediaType(detectedMimeType));
+    MediaProcessor processor = chooseMediaProcessor(MediaType.getMediaType(detectedMimeType), detectedMimeType);
 
-    // Process the resource depending on the mode.
-    final ResourceExtractionResult result;
+    ResourceExtractionResult result;
     if (processor == null) {
       result = null;
-    } else if (mode == ProcessingMode.FULL) {
+    } else {
+      result = getResourceExtractionResult(resource, mode, mainThumbnailAvailable, processor, detectedMimeType);
+    }
+    // No oEmbed detected try with text processing
+    if (processor instanceof OEmbedProcessor && result == null) {
+      processor = textProcessor;
+      result = getResourceExtractionResult(resource, mode, mainThumbnailAvailable, processor, detectedMimeType);
+    }
+    // Done
+    return result;
+  }
+
+  private static ResourceExtractionResult getResourceExtractionResult(Resource resource, ProcessingMode mode,
+      boolean mainThumbnailAvailable, MediaProcessor processor, String detectedMimeType) throws MediaExtractionException {
+    ResourceExtractionResult result;
+    // Process the resource depending on the mode.
+    if (mode == ProcessingMode.FULL) {
       result = processor.extractMetadata(resource, detectedMimeType, mainThumbnailAvailable);
     } else {
       result = processor.copyMetadata(resource, detectedMimeType);
     }
-
-    // Done
     return result;
   }
 
@@ -281,7 +319,7 @@ public void close() throws IOException {
    * @return true if and only if resources of the given type need to be downloaded before performing full processing.
    */
   boolean shouldDownloadForFullProcessing(String mimeType) {
-    return Optional.of(MediaType.getMediaType(mimeType)).map(this::chooseMediaProcessor)
+    return Optional.of(MediaType.getMediaType(mimeType)).map(mediaType -> chooseMediaProcessor(mediaType, mimeType))
                    .map(MediaProcessor::downloadResourceForFullProcessing).orElse(Boolean.FALSE);
   }
 }
diff --git a/...-service/src/main/java/eu/europeana/metis/mediaprocessing/extraction/OEmbedProcessor.java b/...-service/src/main/java/eu/europeana/metis/mediaprocessing/extraction/OEmbedProcessor.java
@@ -0,0 +1,123 @@
+package eu.europeana.metis.mediaprocessing.extraction;
+
+import static eu.europeana.metis.mediaprocessing.extraction.oembed.OEmbedValidation.checkValidWidthAndHeightDimensions;
+import static eu.europeana.metis.mediaprocessing.extraction.oembed.OEmbedValidation.getDurationFromModel;
+import static eu.europeana.metis.mediaprocessing.extraction.oembed.OEmbedValidation.getOEmbedModelFromJson;
+import static eu.europeana.metis.mediaprocessing.extraction.oembed.OEmbedValidation.getOEmbedModelFromXml;
+import static eu.europeana.metis.mediaprocessing.extraction.oembed.OEmbedValidation.isValidOEmbedPhotoOrVideo;
+
+import eu.europeana.metis.mediaprocessing.exception.MediaExtractionException;
+import eu.europeana.metis.mediaprocessing.extraction.oembed.OEmbedModel;
+import eu.europeana.metis.mediaprocessing.model.ImageResourceMetadata;
+import eu.europeana.metis.mediaprocessing.model.Resource;
+import eu.europeana.metis.mediaprocessing.model.ResourceExtractionResult;
+import eu.europeana.metis.mediaprocessing.model.ResourceExtractionResultImpl;
+import eu.europeana.metis.mediaprocessing.model.VideoResourceMetadata;
+import java.io.IOException;
+import java.nio.file.Files;
+import java.nio.file.Paths;
+import java.util.Locale;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Media processor for oEmbed responses: JSON or XML documents that describe an embeddable resource.
+ * A valid model of type "photo" yields image metadata and one of type "video" yields video metadata;
+ * anything else yields a null result.
+ */
+public class OEmbedProcessor implements MediaProcessor {
+
+  private static final Logger LOGGER = LoggerFactory.getLogger(OEmbedProcessor.class);
+
+  private static final String JSON_MIME_TYPE = "application/json";
+  private static final String APPLICATION_XML_MIME_TYPE = "application/xml";
+  private static final String TEXT_XML_MIME_TYPE = "text/xml";
+
+  /**
+   * Process a resource by extracting the oEmbed metadata from its downloaded content.
+   *
+   * @param resource The resource to process. Without a content path no extraction is attempted.
+   * @param detectedMimeType The mime type that was detected for this resource. It decides whether the
+   * content is parsed as JSON (application/json) or as XML (application/xml or text/xml).
+   * @param mainThumbnailAvailable Whether the main thumbnail for this record is available. Not used by
+   * this processor.
+   * @return The result of the processing, or null if the resource has no content or the content is not
+   * a valid oEmbed photo or video.
+   * @throws MediaExtractionException In case the content could not be read or the extraction failed.
+   */
+  @Override
+  public ResourceExtractionResult extractMetadata(Resource resource, String detectedMimeType, boolean mainThumbnailAvailable)
+      throws MediaExtractionException {
+
+    // The content of an oEmbed candidate needs to be downloaded in order to be examined.
+    if (resource.getContentPath() == null) {
+      return null;
+    }
+
+    try {
+      final OEmbedModel embedModel = readOEmbedModel(resource, detectedMimeType);
+      if (!isValidOEmbedPhotoOrVideo(embedModel)) {
+        LOGGER.warn("No oembed model found");
+        return null;
+      }
+      checkValidWidthAndHeightDimensions(embedModel, resource.getResourceUrl());
+      return getResourceExtractionResult(resource, detectedMimeType, embedModel);
+    } catch (IOException e) {
+      throw new MediaExtractionException("Unable to read OEmbedded resource", e);
+    }
+  }
+
+  /**
+   * Process a resource by copying the metadata from the input without performing any extraction.
+   * This processor has nothing to copy and always returns null.
+   *
+   * @param resource The resource to process.
+   * @param detectedMimeType The mime type that was detected for this resource.
+   * @return Always null.
+   * @throws MediaExtractionException Declared by the interface; not thrown by this implementation.
+   */
+  @Override
+  public ResourceExtractionResult copyMetadata(Resource resource, String detectedMimeType) throws MediaExtractionException {
+    return null;
+  }
+
+  /**
+   * @return Whether the processor needs the downloaded resource for full processing (always true).
+   */
+  @Override
+  public boolean downloadResourceForFullProcessing() {
+    return true;
+  }
+
+  /**
+   * Reads the resource content and parses it with the parser that matches the detected mime type.
+   *
+   * @return the parsed model, or null if the mime type is null or is neither JSON nor XML.
+   */
+  private static OEmbedModel readOEmbedModel(Resource resource, String detectedMimeType)
+      throws IOException, MediaExtractionException {
+    if (detectedMimeType == null) {
+      return null;
+    }
+    final boolean isJson = detectedMimeType.startsWith(JSON_MIME_TYPE);
+    // text/xml is accepted next to application/xml: MediaExtractorImpl routes both to this processor.
+    final boolean isXml = detectedMimeType.startsWith(APPLICATION_XML_MIME_TYPE)
+        || detectedMimeType.startsWith(TEXT_XML_MIME_TYPE);
+    if (!isJson && !isXml) {
+      return null;
+    }
+    final byte[] content = Files.readAllBytes(Paths.get(resource.getContentPath().toString()));
+    return isJson ? getOEmbedModelFromJson(content) : getOEmbedModelFromXml(content);
+  }
+
+  /**
+   * Builds the extraction result that corresponds to the type of the given oEmbed model.
+   *
+   * @return image metadata for type "photo", video metadata for type "video" (compared case
+   * insensitively), or null if the model or its type is absent or of any other type.
+   */
+  private ResourceExtractionResult getResourceExtractionResult(Resource resource, String detectedMimeType,
+      OEmbedModel oEmbedModel) throws MediaExtractionException {
+    if (oEmbedModel == null) {
+      return null;
+    }
+    final String type = oEmbedModel.getType();
+    if (type == null) {
+      return null;
+    }
+    return switch (type.toLowerCase(Locale.ROOT)) {
+      case "photo" -> {
+        final ImageResourceMetadata imageResourceMetadata = new ImageResourceMetadata(detectedMimeType,
+            resource.getResourceUrl(),
+            resource.getProvidedFileSize(), oEmbedModel.getWidth(), oEmbedModel.getHeight(), null, null, null);
+        yield new ResourceExtractionResultImpl(imageResourceMetadata);
+      }
+      case "video" -> {
+        final Double duration = getDurationFromModel(oEmbedModel);
+        final VideoResourceMetadata videoResourceMetadata = new VideoResourceMetadata(detectedMimeType,
+            resource.getResourceUrl(),
+            resource.getProvidedFileSize(), duration, null, oEmbedModel.getWidth(), oEmbedModel.getHeight(), null, null);
+        yield new ResourceExtractionResultImpl(videoResourceMetadata);
+      }
+      default -> null;
+    };
+  }
+}
-Original file line number
+Diff line change
@@ Expand Up @@
       public boolean downloadResourceForFullProcessing() {
         return false;
       }
     }