Skip to content

Commit

Permalink
MET-5806: Oembed for generic (non photo/video) resources. Stricter checks.
Browse files Browse the repository at this point in the history
  • Loading branch information
jochen-vermeulen committed Sep 4, 2024
1 parent 7d8c824 commit 50d4caf
Show file tree
Hide file tree
Showing 25 changed files with 353 additions and 417 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@
import static eu.europeana.metis.mediaprocessing.RdfXpathConstants.EDM_IS_SHOWN_AT;
import static eu.europeana.metis.mediaprocessing.RdfXpathConstants.EDM_IS_SHOWN_BY;
import static eu.europeana.metis.mediaprocessing.RdfXpathConstants.EDM_OBJECT;
import static eu.europeana.metis.mediaprocessing.RdfXpathConstants.EDM_WEBRESOURCE;
import static eu.europeana.metis.mediaprocessing.RdfXpathConstants.SVCS_SERVICE;

import eu.europeana.metis.mediaprocessing.exception.RdfDeserializationException;
import eu.europeana.metis.mediaprocessing.model.EnrichedRdf;
Expand All @@ -20,6 +22,7 @@
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Optional;
import java.util.Set;
import java.util.stream.Collectors;
Expand Down Expand Up @@ -50,33 +53,36 @@ class RdfDeserializerImpl implements RdfDeserializer {

private static final String OEMBED_NAMESPACE = "https://oembed.com/";
private static final String XPATH_HAS_SERVICE =
"svcs:has_service/@rdf:resource = /rdf:RDF/svcs:Service/@rdf:about" +
" and /rdf:RDF/svcs:Service/dcterms:conformsTo/@rdf:resource";
"svcs:has_service/@rdf:resource = " + SVCS_SERVICE + "/@rdf:about" +
" and " + SVCS_SERVICE + "/dcterms:conformsTo/@rdf:resource";
private static final String XPATH_WEB_RESOURCE =
"/rdf:RDF/edm:WebResource[" + XPATH_HAS_SERVICE + " = \"" + OEMBED_NAMESPACE + "\"";
EDM_WEBRESOURCE + "[" + XPATH_HAS_SERVICE + " = \"" + OEMBED_NAMESPACE + "\"";
private static final String OEMBED_XPATH_CONDITION_IS_SHOWN_BY =
EDM_IS_SHOWN_BY + "[" + EDM_IS_SHOWN_BY + " = " + XPATH_WEB_RESOURCE + "]/@rdf:about]";
private static final String OEMBED_XPATH_CONDITION_HAS_VIEW = EDM_HAS_VIEW
+ "[" + EDM_HAS_VIEW + "=" + XPATH_WEB_RESOURCE + "]/@rdf:about]";

private static final Set<UrlType> URL_TYPES_FOR_OEMBED = Set.of(UrlType.IS_SHOWN_BY, UrlType.HAS_VIEW);

private final UnmarshallingContextWrapper unmarshallingContext = new UnmarshallingContextWrapper();
private final XPathExpressionWrapper getObjectExpression = new XPathExpressionWrapper(
xPath -> xPath.compile(EDM_OBJECT));
private final XPathExpressionWrapper getHasViewExpression = new XPathExpressionWrapper(
xPath -> xPath.compile(EDM_HAS_VIEW + " | " + OEMBED_XPATH_CONDITION_HAS_VIEW));
xPath -> xPath.compile(EDM_HAS_VIEW));
private final XPathExpressionWrapper getIsShownAtExpression = new XPathExpressionWrapper(
xPath -> xPath.compile(EDM_IS_SHOWN_AT));
private final XPathExpressionWrapper getIsShownByExpression = new XPathExpressionWrapper(
xPath -> xPath.compile(EDM_IS_SHOWN_BY + " | " + OEMBED_XPATH_CONDITION_IS_SHOWN_BY));
xPath -> xPath.compile(EDM_IS_SHOWN_BY));

private static List<RdfResourceEntry> convertToResourceEntries(
Map<String, Set<UrlType>> urlWithTypes) {
Map<String, ResourceInfo> urlWithTypes) {
return urlWithTypes.entrySet().stream().map(RdfDeserializerImpl::convertToResourceEntry)
.toList();
}

private static RdfResourceEntry convertToResourceEntry(Map.Entry<String, Set<UrlType>> entry) {
return new RdfResourceEntry(entry.getKey(), entry.getValue());
private static RdfResourceEntry convertToResourceEntry(Map.Entry<String, ResourceInfo> entry) {
return new RdfResourceEntry(entry.getKey(), entry.getValue().urlTypes(),
entry.getValue().configuredForOembed());
}

private static <R> R performDeserialization(byte[] input, DeserializationOperation<R> operation)
Expand Down Expand Up @@ -113,7 +119,7 @@ public List<RdfResourceEntry> getRemainingResourcesForMediaExtraction(InputStrea

// Get all the resource entries.
final Document deserializedDocument = deserializeToDocument(inputStream);
final Map<String, Set<UrlType>> allResources = getResourceEntries(deserializedDocument,
final Map<String, ResourceInfo> allResources = getResourceEntries(deserializedDocument,
UrlType.URL_TYPES_FOR_MEDIA_EXTRACTION);

// Find the main thumbnail resource if it exists and remove it from the result.
Expand Down Expand Up @@ -152,7 +158,7 @@ private Optional<RdfResourceEntry> getMainThumbnailResourceForMediaExtraction(Do
throws RdfDeserializationException {

// Get the entries of the required types.
final Map<String, Set<UrlType>> resourceEntries = getResourceEntries(document,
final Map<String, ResourceInfo> resourceEntries = getResourceEntries(document,
Collections.singleton(UrlType.URL_TYPE_FOR_MAIN_THUMBNAIL_RESOURCE));

// If there is not exactly one, we return an empty optional.
Expand Down Expand Up @@ -295,15 +301,39 @@ public RDF deserializeToRdf(InputStream inputStream) throws RdfDeserializationEx
* @return the resource entries
* @throws RdfDeserializationException the rdf deserialization exception
*/
Map<String, Set<UrlType>> getResourceEntries(Document document,
Map<String, ResourceInfo> getResourceEntries(Document document,
Set<UrlType> allowedUrlTypes) throws RdfDeserializationException {

// Get the resources and their types.
final Map<String, Set<UrlType>> urls = new HashMap<>();
for (UrlType type : allowedUrlTypes) {
final Set<String> urlsForType = getUrls(document, type);
for (String url : urlsForType) {
urls.computeIfAbsent(url, k -> new HashSet<>()).add(type);
}
}
return urls;

// For each resource, check whether they are configured for oEmbed.
final Map<String, ResourceInfo> result = new HashMap<>(urls.size());
for (Entry<String, Set<UrlType>> entry :urls.entrySet()){
final boolean configuredForOembed =
URL_TYPES_FOR_OEMBED.stream().anyMatch(entry.getValue()::contains) &&
configuredForOembed(entry.getKey(), document);
result.put(entry.getKey(), new ResourceInfo(entry.getValue(), configuredForOembed));
}

// Done
return result;
}

/**
 * Determines whether the resource with the given URL is configured for oEmbed.
 *
 * <p>Not implemented yet: this stub always returns {@code false}, so no resource is currently
 * reported as configured for oEmbed. See the TODO below for the intended behavior.
 *
 * @param url the URL of the resource to check
 * @param document the document that is to be queried for the oEmbed configuration
 * @return currently always {@code false}
 */
private boolean configuredForOembed(String url, Document document) {
// TODO Perform XPath queries on the document. Only using the URL. We want to know whether there
// is a webresource for the url with a link to a service which conforms to oembed. This can be
// done in multiple steps if needed.
return false;
}

/**
 * Information collected for a single resource URL when gathering the resource entries of a
 * document.
 *
 * @param urlTypes the URL types under which the resource URL was found in the document
 * @param configuredForOembed whether the resource was determined to be configured for oEmbed
 */
record ResourceInfo(Set<UrlType> urlTypes, boolean configuredForOembed) {

}
}
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@ public final class RdfXpathConstants {
public static final String EDM_IS_SHOWN_BY = ORE_AGGREGATION + "/edm:isShownBy/@rdf:resource";
public static final String EDM_HAS_VIEW = ORE_AGGREGATION + "/edm:hasView/@rdf:resource";
public static final String EDM_IS_SHOWN_AT = ORE_AGGREGATION + "/edm:isShownAt/@rdf:resource";
public static final String SVCS_SERVICE = RDF_NAMESPACE + "/svcs:Service";
public static final String EDM_WEBRESOURCE = RDF_NAMESPACE + "/edm:WebResource";

private RdfXpathConstants() {}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
import java.io.InputStream;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.EnumSet;
import java.util.List;
Expand All @@ -44,7 +45,8 @@ enum ProcessingMode {FULL, REDUCED, NONE}
private static final Set<UrlType> URL_TYPES_FOR_REDUCED_PROCESSING = Collections
.singleton(UrlType.IS_SHOWN_AT);

private final ResourceDownloadClient resourceDownloadClient;
private final ResourceDownloadClient resourceDownloadClientOembed;
private final ResourceDownloadClient resourceDownloadClientNonOembed;
private final MimeTypeDetectHttpClient mimeTypeDetectHttpClient;
private final TikaWrapper tika;

Expand All @@ -65,7 +67,8 @@ enum ProcessingMode {FULL, REDUCED, NONE}
MediaExtractorImpl(ResourceDownloadClient resourceDownloadClient,
MimeTypeDetectHttpClient mimeTypeDetectHttpClient, TikaWrapper tika,
List<MediaProcessor> mediaProcessorList) {
this.resourceDownloadClient = resourceDownloadClient;
this.resourceDownloadClientNonOembed = resourceDownloadClient;
this.resourceDownloadClientOembed = resourceDownloadClient;
this.mimeTypeDetectHttpClient = mimeTypeDetectHttpClient;
this.tika = tika;
this.imageProcessor = (ImageProcessor) getMediaProcessor(mediaProcessorList, ImageProcessor.class);
Expand Down Expand Up @@ -93,8 +96,12 @@ public MediaExtractorImpl(int redirectCount, int thumbnailGenerateTimeout,
throws MediaProcessorException {
final ThumbnailGenerator thumbnailGenerator = new ThumbnailGenerator(
new CommandExecutor(thumbnailGenerateTimeout));
this.resourceDownloadClient = new ResourceDownloadClient(redirectCount,
this::shouldDownloadForFullProcessing, connectTimeout, responseTimeout, downloadTimeout);
this.resourceDownloadClientOembed = new ResourceDownloadClient(redirectCount,
type -> this.shouldDownloadForFullProcessing(type, true),
connectTimeout, responseTimeout, downloadTimeout);
this.resourceDownloadClientNonOembed = new ResourceDownloadClient(redirectCount,
type -> this.shouldDownloadForFullProcessing(type, false),
connectTimeout, responseTimeout, downloadTimeout);
this.mimeTypeDetectHttpClient = new MimeTypeDetectHttpClient(connectTimeout, responseTimeout,
downloadTimeout);
this.tika = new TikaWrapper();
Expand Down Expand Up @@ -126,26 +133,36 @@ public ResourceExtractionResult performMediaExtraction(RdfResourceEntry resource
}

// Download resource and then perform media extraction on it.
try (Resource resource = downloadBasedOnProcessingMode(resourceEntry, mode)) {
return performProcessing(resource, mode, mainThumbnailAvailable);
try (Resource resource = downloadBasedOnProcessingMode(resourceEntry, mode,
resourceEntry.isResourceConfiguredForOembed())) {
return performProcessing(resource, mode, mainThumbnailAvailable,
resourceEntry.isResourceConfiguredForOembed());
} catch (IOException | RuntimeException e) {
throw new MediaExtractionException(
String.format("Problem while processing %s", resourceEntry.getResourceUrl()), e);
}
}

/**
 * Selects the download client to use for a resource.
 *
 * <p>Note that the two client fields may refer to the same instance: the constructor that
 * accepts a ready-made download client assigns it to both.
 *
 * @param potentialOembedResource whether the resource is configured for oEmbed
 * @return the oEmbed download client if {@code potentialOembedResource} is {@code true},
 * otherwise the non-oEmbed download client
 */
private ResourceDownloadClient getResourceDownloadClient(boolean potentialOembedResource) {
return potentialOembedResource ? this.resourceDownloadClientOembed
: this.resourceDownloadClientNonOembed;
}

private Resource downloadBasedOnProcessingMode(RdfResourceEntry resourceEntry,
ProcessingMode mode) throws IOException {
ProcessingMode mode, boolean potentialOembedResource) throws IOException {

// Determine the download method to use (full download vs. quick ping)
final ResourceDownloadClient client = getResourceDownloadClient(potentialOembedResource);
return (mode == ProcessingMode.FULL)
? this.resourceDownloadClient.downloadBasedOnMimeType(resourceEntry)
: this.resourceDownloadClient.downloadWithoutContent(resourceEntry);
? client.downloadBasedOnMimeType(resourceEntry)
: client.downloadWithoutContent(resourceEntry);
}

ProcessingMode getMode(RdfResourceEntry resourceEntry) {
final ProcessingMode result;
if (URL_TYPES_FOR_FULL_PROCESSING.stream().anyMatch(resourceEntry.getUrlTypes()::contains)) {
if (resourceEntry.isResourceConfiguredForOembed()) {
result = ProcessingMode.FULL;
} else if (URL_TYPES_FOR_FULL_PROCESSING.stream().anyMatch(resourceEntry.getUrlTypes()::contains)) {
result = ProcessingMode.FULL;
} else if (URL_TYPES_FOR_REDUCED_PROCESSING.stream()
.anyMatch(resourceEntry.getUrlTypes()::contains)) {
Expand Down Expand Up @@ -204,40 +221,40 @@ String detectType(Path path, String providedMimeType) throws IOException {
}
}

MediaProcessor chooseMediaProcessor(MediaType mediaType, String detectedMimeType) {
final MediaProcessor processor;
switch (mediaType) {
case TEXT, OTHER -> processor = chooseByDetectedMimeType(mediaType, detectedMimeType);
case AUDIO, VIDEO -> processor = audioVideoProcessor;
case IMAGE -> processor = imageProcessor;
case THREE_D -> processor = media3dProcessor;
default -> processor = null;
}
return processor;
List<MediaProcessor> chooseMediaProcessor(MediaType mediaType, String detectedMimeType,
boolean potentialOembedResource) {
return switch (mediaType) {
case TEXT, OTHER -> chooseMediaProcessorTextAndOther(mediaType, detectedMimeType,
potentialOembedResource);
case AUDIO, VIDEO -> List.of(audioVideoProcessor);
case IMAGE -> List.of(imageProcessor);
case THREE_D -> List.of(media3dProcessor);
};
}

MediaProcessor chooseByDetectedMimeType(MediaType mediaType, String detectedMimeType) {
private List<MediaProcessor> chooseMediaProcessorTextAndOther(MediaType mediaType,
String detectedMimeType, boolean potentialOembedResource) {
if (detectedMimeType == null) {
return null;
} else if ((mediaType == MediaType.TEXT || mediaType == MediaType.OTHER) &&
(detectedMimeType.startsWith("text/xml") || detectedMimeType.startsWith("application/xml")
|| detectedMimeType.startsWith("application/json"))) {
return oEmbedProcessor;
} else if (potentialOembedResource && (detectedMimeType.startsWith("text/xml")
|| detectedMimeType.startsWith("application/xml") || detectedMimeType.startsWith("application/json"))) {
return List.of(oEmbedProcessor, textProcessor);
} else if (mediaType == MediaType.TEXT) {
return textProcessor;
return List.of(textProcessor);
} else {
return null;
return Collections.emptyList();
}
}

void verifyAndCorrectContentAvailability(Resource resource, ProcessingMode mode,
String detectedMimeType) throws MediaExtractionException, IOException {
String detectedMimeType, boolean potentialOembedResource)
throws MediaExtractionException, IOException {

// If the mime type changed and we need the content after all, we download it.
if (mode == ProcessingMode.FULL && shouldDownloadForFullProcessing(detectedMimeType)
&& !shouldDownloadForFullProcessing(resource.getProvidedMimeType())) {
final RdfResourceEntry downloadInput =
new RdfResourceEntry(resource.getResourceUrl(), new ArrayList<>(resource.getUrlTypes()));
if (mode == ProcessingMode.FULL && shouldDownloadForFullProcessing(detectedMimeType, potentialOembedResource)
&& !shouldDownloadForFullProcessing(resource.getProvidedMimeType(), potentialOembedResource)) {
final RdfResourceEntry downloadInput = new RdfResourceEntry(resource.getResourceUrl(),
new ArrayList<>(resource.getUrlTypes()), potentialOembedResource);

ThrowingConsumer<Resource, IOException> action = resourceWithContent -> {
if (resourceWithContent.hasContent()) {
Expand All @@ -246,22 +263,22 @@ void verifyAndCorrectContentAvailability(Resource resource, ProcessingMode mode,
}
}
};
try (final Resource resourceWithContent = this.resourceDownloadClient
try (final Resource resourceWithContent = getResourceDownloadClient(potentialOembedResource)
.downloadWithContent(downloadInput)) {
performThrowingAction(resourceWithContent, action);
}
}

// Verify that we have content when we need to.
if (mode == ProcessingMode.FULL && shouldDownloadForFullProcessing(detectedMimeType)
if (mode == ProcessingMode.FULL && shouldDownloadForFullProcessing(detectedMimeType, potentialOembedResource)
&& !resource.hasContent()) {
throw new MediaExtractionException(
"File content is not downloaded and mimeType does not support processing without a downloaded file.");
}
}

ResourceExtractionResult performProcessing(Resource resource, ProcessingMode mode,
boolean mainThumbnailAvailable) throws MediaExtractionException {
boolean mainThumbnailAvailable, boolean potentialOembedResource) throws MediaExtractionException {

// Sanity check - shouldn't be called for this mode.
if (mode == ProcessingMode.NONE) {
Expand All @@ -274,32 +291,30 @@ ResourceExtractionResult performProcessing(Resource resource, ProcessingMode mod
// Verify that we have content when we need to. This can happen if the resource doesn't come
// with the correct mime type. We correct this here.
try {
verifyAndCorrectContentAvailability(resource, mode, detectedMimeType);
verifyAndCorrectContentAvailability(resource, mode, detectedMimeType, potentialOembedResource);
} catch (IOException e) {
throw new MediaExtractionException("Content availability verification error.", e);
}

// Choose the right media processor.
MediaProcessor processor = chooseMediaProcessor(MediaType.getMediaType(detectedMimeType), detectedMimeType);

ResourceExtractionResult result;
if (processor == null) {
result = null;
} else {
result = getResourceExtractionResult(resource, mode, mainThumbnailAvailable, processor, detectedMimeType);
}
// No oEmbed detected try with text processing
if (processor instanceof OEmbedProcessor && result == null) {
processor = textProcessor;
result = getResourceExtractionResult(resource, mode, mainThumbnailAvailable, processor, detectedMimeType);
final List<MediaProcessor> processors = chooseMediaProcessor(
MediaType.getMediaType(detectedMimeType), detectedMimeType, potentialOembedResource);

// Go in order, the first result we get, we accept.
for (MediaProcessor processor: processors) {
final ResourceExtractionResult result = getResourceExtractionResult(resource, mode,
mainThumbnailAvailable, processor, detectedMimeType);
if (result != null) {
return result;
}
}
// Done
return result;
return null;
}

private static ResourceExtractionResult getResourceExtractionResult(Resource resource, ProcessingMode mode,
boolean mainThumbnailAvailable, MediaProcessor processor, String detectedMimeType) throws MediaExtractionException {
ResourceExtractionResult result;
private static ResourceExtractionResult getResourceExtractionResult(Resource resource,
ProcessingMode mode, boolean mainThumbnailAvailable, MediaProcessor processor,
String detectedMimeType) throws MediaExtractionException {
final ResourceExtractionResult result;
// Process the resource depending on the mode.
if (mode == ProcessingMode.FULL) {
result = processor.extractMetadata(resource, detectedMimeType, mainThumbnailAvailable);
Expand All @@ -311,15 +326,18 @@ private static ResourceExtractionResult getResourceExtractionResult(Resource res

@Override
public void close() throws IOException {
resourceDownloadClient.close();
resourceDownloadClientOembed.close();
resourceDownloadClientNonOembed.close();
mimeTypeDetectHttpClient.close();
}

/**
* @return true if and only if resources of the given type need to be downloaded before performing full processing.
*/
boolean shouldDownloadForFullProcessing(String mimeType) {
return Optional.of(MediaType.getMediaType(mimeType)).map(mediaType -> chooseMediaProcessor(mediaType, mimeType))
.map(MediaProcessor::downloadResourceForFullProcessing).orElse(Boolean.FALSE);
boolean shouldDownloadForFullProcessing(String mimeType, boolean potentialOembedResource) {
return Optional.of(MediaType.getMediaType(mimeType))
.map(mediaType -> chooseMediaProcessor(mediaType, mimeType, potentialOembedResource))
.stream().flatMap(Collection::stream)
.anyMatch(MediaProcessor::downloadResourceForFullProcessing);
}
}
Loading

0 comments on commit 50d4caf

Please sign in to comment.