Skip to content

Commit

Permalink
MET-6037: Ensure that the tiers for provider data only are returned during indexing. (#678)
Browse files Browse the repository at this point in the history

* MET-6037: Ensure that the tiers for provider data only are returned during indexing.

* MET-6037 update unit tests to support fix changes

* MET-6037 update rdf file used for unit tests

---------

Co-authored-by: Jorge Ortiz <[email protected]>
  • Loading branch information
jochen-vermeulen and jeortizquan authored Jul 26, 2024
1 parent ac24500 commit faeff27
Show file tree
Hide file tree
Showing 8 changed files with 220 additions and 22 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,10 @@ public interface Indexer extends Closeable {
*
* @param stringRdfRecord The record to index (can be parsed to RDF).
* @param indexingProperties The properties of this indexing operation.
* @param tierResultsConsumer The predicate deciding if the record should be published based on evaluated tier.
* @param tierResultsConsumer The predicate deciding whether the record should be published based
* on the evaluated tier. Note: the tier calculations that are provided
* to the consumer are for provider data only (i.e. mode
* {@link eu.europeana.indexing.tiers.metadata.ClassifierMode#PROVIDER_PROXIES}).
* @throws IndexingException In case a problem occurred during indexing.
*/
void index(String stringRdfRecord, IndexingProperties indexingProperties,
Expand Down Expand Up @@ -131,7 +134,9 @@ void index(String stringRdfRecord, IndexingProperties indexingProperties,
* @param recordContent The record to index (can be parsed to RDF).
* @param indexingProperties The properties of this indexing operation.
* @throws IndexingException In case a problem occurred during indexing.
* @return A pair with both content tier and metadata tier calculations results of the given record
* @return A pair with both content tier and metadata tier calculations results of the given
* record. The tier calculations are for provider data only (i.e. mode
* {@link eu.europeana.indexing.tiers.metadata.ClassifierMode#PROVIDER_PROXIES}).
*/
TierResults indexAndGetTierCalculations(InputStream recordContent,
IndexingProperties indexingProperties) throws IndexingException;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -33,29 +33,29 @@ private IndexerPreprocessor() {
*
* @param rdf the rdf
* @param properties the properties
* @return the tier results
* @return the tier results for the provider data (i.e. {@link ClassifierMode#PROVIDER_PROXIES}).
* @throws IndexingException the indexing exception
*/
public static TierResults preprocessRecord(RDF rdf, IndexingProperties properties)
throws IndexingException {

// Perform the tier classification
final RdfWrapper rdfWrapper = new RdfWrapper(rdf);
TierResults tierCalculationsResult = new TierResults();
if (properties.isPerformTierCalculation() && properties.getTypesEnabledForTierCalculation()
.contains(rdfWrapper.getEdmType())) {
tierCalculationsResult = new TierResults(mediaClassifier.classify(rdfWrapper),
metadataClassifier.classify(rdfWrapper));
RdfTierUtils.setTier(rdf, tierCalculationsResult.getMediaTier());
RdfTierUtils.setTier(rdf, tierCalculationsResult.getMetadataTier());

tierCalculationsResult = new TierResults(mediaClassifier.classify(rdfWrapper),
metadataClassifierEuropeana.classify(rdfWrapper));
RdfTierUtils.setTierEuropeana(rdf, tierCalculationsResult.getMediaTier());
RdfTierUtils.setTierEuropeana(rdf, tierCalculationsResult.getMetadataTier());
final TierResults tierCalculationsResultProvidedData = new TierResults(
mediaClassifier.classify(rdfWrapper), metadataClassifier.classify(rdfWrapper));
RdfTierUtils.setTier(rdf, tierCalculationsResultProvidedData.getMediaTier());
RdfTierUtils.setTier(rdf, tierCalculationsResultProvidedData.getMetadataTier());

final TierResults tierCalculationsResultEuropeana = new TierResults(
mediaClassifier.classify(rdfWrapper), metadataClassifierEuropeana.classify(rdfWrapper));
RdfTierUtils.setTierEuropeana(rdf, tierCalculationsResultEuropeana.getMediaTier());
RdfTierUtils.setTierEuropeana(rdf, tierCalculationsResultEuropeana.getMetadataTier());

return tierCalculationsResultProvidedData;
}

return tierCalculationsResult;
return new TierResults();
}

}
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ private ClassifierFactory() {
* @return A classifier for the metadata tier with Provider Proxies as default mode.
*/
public static TierClassifier<MetadataTier, MetadataTierBreakdown> getMetadataClassifier() {
return new MetadataClassifier(new LanguageClassifier(), new EnablingElementsClassifier(), new ContextualClassesClassifier());
return getMetadataClassifier(ClassifierMode.PROVIDER_PROXIES);
}

/**
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ public class ContextualClassesClassifier implements TierClassifierBreakdown<Cont
private final ClassifierMode classifierMode;

/**
* Instantiates a new Contextual classes classifier.
* Instantiates a new Contextual classes classifier for default mode {@link ClassifierMode#PROVIDER_PROXIES}.
*/
public ContextualClassesClassifier() {
this.classifierMode = ClassifierMode.PROVIDER_PROXIES;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ public class EnablingElementsClassifier implements TierClassifierBreakdown<Enabl
private final ClassifierMode classifierMode;

/**
* Instantiates a new Enabling elements' classifier.
* Instantiates a new Enabling elements' classifier for default mode {@link ClassifierMode#PROVIDER_PROXIES}.
*/
public EnablingElementsClassifier() {
this.classifierMode = ClassifierMode.PROVIDER_PROXIES;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ public class LanguageClassifier implements TierClassifierBreakdown<LanguageBreak
private final ClassifierMode classifierMode;

/**
* Instantiates a new Language classifier.
* Instantiates a new Language classifier for default mode {@link ClassifierMode#PROVIDER_PROXIES}.
*/
public LanguageClassifier() {
this.classifierMode = ClassifierMode.PROVIDER_PROXIES;
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
package eu.europeana.indexing;

import static org.junit.jupiter.api.Assertions.assertArrayEquals;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertFalse;
import static org.junit.jupiter.api.Assertions.assertTrue;

import eu.europeana.indexing.base.IndexingTestUtils;
import eu.europeana.indexing.exception.IndexingException;
Expand All @@ -18,6 +21,9 @@
*/
class IndexerPreprocessorTest {

private static final String CONTENT_TIER_URI = "http://www.europeana.eu/schemas/epf/contentTier";
private static final String METADATA_TIER_URI = "http://www.europeana.eu/schemas/epf/metadataTier";

/**
* Preprocess record.
*
Expand All @@ -26,16 +32,46 @@ class IndexerPreprocessorTest {
*/
@Test
void preprocessRecord() throws SerializationException, IndexingException {
// given
final RdfConversionUtils conversionUtils = new RdfConversionUtils();
final RDF inputRdf = conversionUtils.convertStringToRdf(
IndexingTestUtils.getResourceFileContent("europeana_record_to_sample_index_rdf.xml"));
IndexingTestUtils.getResourceFileContent("europeana_record_tier_calculation_rdf.xml"));
final IndexingProperties indexingProperties = new IndexingProperties(Date.from(Instant.now()),
true,
List.of(), true, true);

// when
TierResults results = IndexerPreprocessor.preprocessRecord(inputRdf, indexingProperties);

assertEquals("4", results.getMediaTier().toString());
assertEquals("B", results.getMetadataTier().toString());
// then
List<String> tierProvidedData = inputRdf.getAggregationList()
.stream()
.map(provideddata -> provideddata.getHasQualityAnnotationList()
.stream()
.map(q -> q.getQualityAnnotation().getHasBody()
.getResource()).toList())
.findFirst().orElse(null);

List<String> tierEuropeanaData = inputRdf.getEuropeanaAggregationList()
.stream()
.map(eudata -> eudata.getHasQualityAnnotationList()
.stream()
.map(q -> q.getQualityAnnotation().getHasBody().getResource())
.toList())
.findFirst().orElse(null);

// verify that the provider aggregation and the Europeana aggregation each carry their own tier calculations
assertArrayEquals(new String[]{CONTENT_TIER_URI + "1", METADATA_TIER_URI + "A"}, tierProvidedData.toArray());
assertArrayEquals(new String[]{CONTENT_TIER_URI + "1", METADATA_TIER_URI + "B"}, tierEuropeanaData.toArray());

// verify return of tier calculation
assertEquals("1", results.getMediaTier().toString());
assertEquals("A", results.getMetadataTier().toString());

// verify that the returned tiers match the provider aggregation and not the Europeana aggregation
assertTrue(tierProvidedData.contains(CONTENT_TIER_URI + results.getMediaTier().toString()) &&
tierProvidedData.contains(METADATA_TIER_URI + results.getMetadataTier().toString()));
assertFalse(tierEuropeanaData.contains(CONTENT_TIER_URI + results.getMediaTier().toString()) &&
tierEuropeanaData.contains(METADATA_TIER_URI + results.getMetadataTier().toString()));
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,157 @@
<?xml version="1.0" encoding="UTF-8"?>
<rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:dc="http://purl.org/dc/elements/1.1/"
xmlns:dcterms="http://purl.org/dc/terms/" xmlns:edm="http://www.europeana.eu/schemas/edm/"
xmlns:owl="http://www.w3.org/2002/07/owl#"
xmlns:skos="http://www.w3.org/2004/02/skos/core#"
xmlns:foaf="http://xmlns.com/foaf/0.1/" xmlns:ebucore="http://www.ebu.ch/metadata/ontologies/ebucore/ebucore#"
xmlns:ore="http://www.openarchives.org/ore/terms/"
>
<edm:ProvidedCHO rdf:about="/305/_nnhSX08"/>
<edm:WebResource rdf:about="http://mbc.malopolska.pl/Content/48386/d2j:big,0/0437_0001.djvu.jpg">
<edm:rights rdf:resource="http://rightsstatements.org/vocab/InC/1.0/"/>
<ebucore:hasMimeType>image/jpeg</ebucore:hasMimeType>
<ebucore:fileByteSize rdf:datatype="http://www.w3.org/2001/XMLSchema#long">112031</ebucore:fileByteSize>
<ebucore:width rdf:datatype="http://www.w3.org/2001/XMLSchema#integer">562</ebucore:width>
<ebucore:height rdf:datatype="http://www.w3.org/2001/XMLSchema#integer">761</ebucore:height>
<edm:hasColorSpace>sRGB</edm:hasColorSpace>
<edm:componentColor rdf:datatype="http://www.w3.org/2001/XMLSchema#hexBinary">#C0C0C0</edm:componentColor>
<edm:componentColor rdf:datatype="http://www.w3.org/2001/XMLSchema#hexBinary">#A9A9A9</edm:componentColor>
<edm:componentColor rdf:datatype="http://www.w3.org/2001/XMLSchema#hexBinary">#BDB76B</edm:componentColor>
<edm:componentColor rdf:datatype="http://www.w3.org/2001/XMLSchema#hexBinary">#BC8F8F</edm:componentColor>
<edm:componentColor rdf:datatype="http://www.w3.org/2001/XMLSchema#hexBinary">#D2B48C</edm:componentColor>
<edm:componentColor rdf:datatype="http://www.w3.org/2001/XMLSchema#hexBinary">#808080</edm:componentColor>
<ebucore:orientation rdf:datatype="http://www.w3.org/2001/XMLSchema#string">portrait</ebucore:orientation>
</edm:WebResource>
<edm:WebResource rdf:about="http://fbc.pionier.net.pl/id/oai:mbc.malopolska.pl:48386">
<ebucore:hasMimeType>text/html</ebucore:hasMimeType>
<ebucore:fileByteSize rdf:datatype="http://www.w3.org/2001/XMLSchema#long">1477</ebucore:fileByteSize>
</edm:WebResource>
<edm:TimeSpan rdf:about="#1923">
<skos:prefLabel xml:lang="zxx">1923</skos:prefLabel>
<dcterms:isPartOf rdf:resource="http://data.europeana.eu/timespan/20"></dcterms:isPartOf>
<edm:begin>1923-01-01</edm:begin>
<edm:end>1923-12-31</edm:end>
<skos:notation rdf:datatype="http://id.loc.gov/datatypes/edtf/EDTF-level1">1923</skos:notation>
</edm:TimeSpan>
<edm:TimeSpan rdf:about="http://data.europeana.eu/timespan/20">
<skos:prefLabel xml:lang="de">20. Jahrhundert</skos:prefLabel>
<skos:prefLabel xml:lang="fi">1900-luku</skos:prefLabel>
<skos:prefLabel xml:lang="ru">XX век</skos:prefLabel>
<skos:prefLabel xml:lang="pt">Século XX</skos:prefLabel>
<skos:prefLabel xml:lang="bg">20 век</skos:prefLabel>
<skos:prefLabel xml:lang="lt">XX amžius</skos:prefLabel>
<skos:prefLabel xml:lang="hr">20. stoljeće</skos:prefLabel>
<skos:prefLabel xml:lang="lv">20. gadsimts</skos:prefLabel>
<skos:prefLabel xml:lang="fr">XXe siècle</skos:prefLabel>
<skos:prefLabel xml:lang="hu">20. század</skos:prefLabel>
<skos:prefLabel xml:lang="sk">20. storočie</skos:prefLabel>
<skos:prefLabel xml:lang="sl">20. stoletje</skos:prefLabel>
<skos:prefLabel xml:lang="ga">20ú haois</skos:prefLabel>
<skos:prefLabel xml:lang="ca">Segle XX</skos:prefLabel>
<skos:prefLabel xml:lang="sv">1900-talet</skos:prefLabel>
<skos:prefLabel xml:lang="el">20ός αιώνας</skos:prefLabel>
<skos:prefLabel xml:lang="en">20th century</skos:prefLabel>
<skos:prefLabel xml:lang="it">XX secolo</skos:prefLabel>
<skos:prefLabel xml:lang="es">Siglo XX</skos:prefLabel>
<skos:prefLabel xml:lang="et">20. sajand</skos:prefLabel>
<skos:prefLabel xml:lang="cs">20. století</skos:prefLabel>
<skos:prefLabel xml:lang="eu">XX. mendea</skos:prefLabel>
<skos:prefLabel xml:lang="pl">XX wiek</skos:prefLabel>
<skos:prefLabel xml:lang="da">20. århundrede</skos:prefLabel>
<skos:prefLabel xml:lang="ro">Secolul al XX-lea</skos:prefLabel>
<skos:prefLabel xml:lang="nl">20e eeuw</skos:prefLabel>
<skos:altLabel xml:lang="sv">20:e århundradet</skos:altLabel>
<skos:altLabel xml:lang="sv">20:e seklet</skos:altLabel>
<skos:altLabel xml:lang="sv">1900-tal</skos:altLabel>
<skos:altLabel xml:lang="sv">1900-talet (århundrade)</skos:altLabel>
<skos:altLabel xml:lang="sv">1900-talet (sekel)</skos:altLabel>
<skos:altLabel xml:lang="ru">20 век</skos:altLabel>
<skos:altLabel xml:lang="pt">Século 20</skos:altLabel>
<skos:altLabel xml:lang="pt">Século vinte</skos:altLabel>
<skos:altLabel xml:lang="pt">Periodo 1901-2000</skos:altLabel>
<skos:altLabel xml:lang="pt">Ciclo (1901-2000)</skos:altLabel>
<skos:altLabel xml:lang="en">20th-century</skos:altLabel>
<skos:altLabel xml:lang="en">20th-century</skos:altLabel>
<skos:altLabel xml:lang="en">Twentieth century</skos:altLabel>
<skos:altLabel xml:lang="en">The past century</skos:altLabel>
<skos:altLabel xml:lang="en">History, 20th Century</skos:altLabel>
<skos:altLabel xml:lang="en">XX Century</skos:altLabel>
<skos:altLabel xml:lang="it">Novecento</skos:altLabel>
<skos:altLabel xml:lang="it">20° secolo</skos:altLabel>
<skos:altLabel xml:lang="it">'900</skos:altLabel>
<skos:altLabel xml:lang="it">Novecento</skos:altLabel>
<skos:altLabel xml:lang="fr">20e siècle</skos:altLabel>
<skos:altLabel xml:lang="es">Siglo 20</skos:altLabel>
<skos:altLabel xml:lang="es">El siglo pasado</skos:altLabel>
<skos:altLabel xml:lang="es">Siglo veinte</skos:altLabel>
<skos:altLabel xml:lang="es">Siglo XX después de Cristo</skos:altLabel>
<skos:altLabel xml:lang="es">Siglo XX d. C.</skos:altLabel>
<edm:begin>1901-01-01</edm:begin>
<edm:end>2000-12-31</edm:end>
<edm:isNextInSequence rdf:resource="http://data.europeana.eu/timespan/19"/>
<owl:sameAs rdf:resource="http://www.wikidata.org/entity/Q6927"/>
<owl:sameAs rdf:resource="http://id.loc.gov/authorities/names/sh2002012476"/>
<owl:sameAs rdf:resource="http://id.loc.gov/authorities/names/sh85139020"/>
<owl:sameAs rdf:resource="http://id.nlm.nih.gov/mesh/D049673"/>
<owl:sameAs rdf:resource="https://www.freebase.com/m/089_x"/>
<owl:sameAs rdf:resource="https://g.co/kg/m/089_x"/>
<owl:sameAs rdf:resource="http://id.nlm.nih.gov/mesh/K01.400.504.968"/>
<owl:sameAs rdf:resource="http://vocab.getty.edu/aat/300404514"/>
<owl:sameAs rdf:resource="http://id.worldcat.org/fast/1159810"/>
<owl:sameAs rdf:resource="http://dbpedia.org/resource/20th_century"/>
</edm:TimeSpan>
<ore:Aggregation rdf:about="/aggregation/provider/305/_nnhSX08">
<edm:aggregatedCHO rdf:resource="/305/_nnhSX08"/>
<edm:dataProvider rdf:resource="http://data.europeana.eu/organization/2897"></edm:dataProvider>
<edm:isShownAt rdf:resource="http://fbc.pionier.net.pl/id/oai:mbc.malopolska.pl:48386"/>
<edm:object rdf:resource="http://mbc.malopolska.pl/Content/48386/d2j:big,0/0437_0001.djvu.jpg"/>
<edm:provider rdf:resource="http://data.europeana.eu/organization/1566"></edm:provider>
<edm:rights rdf:resource="http://rightsstatements.org/vocab/InC/1.0/"/>
</ore:Aggregation>
<ore:Proxy rdf:about="/proxy/provider/305/_nnhSX08">
<dc:contributor xml:lang="pl">Beaupre, Antoni.(Red.)</dc:contributor>
<dc:date xml:lang="pl">1923</dc:date>
<dc:description xml:lang="pl">Wojewódzka Biblioteka Publiczna w Krakowie</dc:description>
<dc:format>image/vnd.djvu</dc:format>
<dc:language>pol</dc:language>
<dc:publisher xml:lang="pl">Spółka Wydawnicza "Czas"</dc:publisher>
<dc:rights xml:lang="pl">Biblioteka Książąt Czartoryskich</dc:rights>
<dc:subject xml:lang="pl">20 w.</dc:subject>
<dc:subject xml:lang="pl">gazety polskie</dc:subject>
<dc:title xml:lang="pl">Czas. 1923, nr 251 (10 XI)</dc:title>
<dc:type xml:lang="pl">czasopismo</dc:type>
<dcterms:extent>Dziennik polityczny i literacko-społeczny; organ konserwatystów. - Opis fiz.: 4 s. ; 61 cm.</dcterms:extent>
<edm:europeanaProxy>false</edm:europeanaProxy>
<ore:proxyFor rdf:resource="/305/_nnhSX08"/>
<ore:proxyIn rdf:resource="/aggregation/provider/305/_nnhSX08"/>
<edm:type>TEXT</edm:type>
</ore:Proxy>
<ore:Proxy rdf:about="/proxy/europeana/305/_nnhSX08">
<dc:date rdf:resource="#1923"></dc:date>
<dc:identifier>#nnhSX08</dc:identifier>
<dc:language>pol</dc:language>
<edm:europeanaProxy>true</edm:europeanaProxy>
<edm:year>1923</edm:year>
<ore:proxyFor rdf:resource="/305/_nnhSX08"/>
<ore:proxyIn rdf:resource="/aggregation/europeana/305/_nnhSX08"/>
<ore:lineage rdf:resource="/proxy/provider/305/_nnhSX08"/>
</ore:Proxy>
<edm:EuropeanaAggregation rdf:about="/aggregation/europeana/305/_nnhSX08">
<edm:aggregatedCHO rdf:resource="/305/_nnhSX08"/>
<edm:dataProvider xml:lang="en">Europeana Foundation</edm:dataProvider>
<edm:provider xml:lang="en">Europeana Foundation</edm:provider>
<edm:datasetName>305_local_26072024_1620</edm:datasetName>
<edm:country>Netherlands</edm:country>
<edm:preview rdf:resource="http://mbc.malopolska.pl/Content/48386/d2j:big,0/0437_0001.djvu.jpg"/>
<edm:language>nl</edm:language>
<edm:completeness>6</edm:completeness>
</edm:EuropeanaAggregation>
<foaf:Organization rdf:about="http://data.europeana.eu/organization/1566">
<skos:prefLabel xml:lang="en">Digital Libraries Federation</skos:prefLabel>
<skos:prefLabel xml:lang="pl">Federacja Bibliotek Cyfrowych</skos:prefLabel>
</foaf:Organization>
<foaf:Organization rdf:about="http://data.europeana.eu/organization/2897">
<skos:prefLabel xml:lang="en">Malopolska Digital Library</skos:prefLabel>
<skos:prefLabel xml:lang="pl">Małopolska Biblioteka Cyfrowa</skos:prefLabel>
</foaf:Organization>
</rdf:RDF>

0 comments on commit faeff27

Please sign in to comment.