From 5fd53ff2f41df9f829e838d4c5d8135c8968251f Mon Sep 17 00:00:00 2001 From: Simon Tzanakis Date: Thu, 19 Oct 2023 14:18:57 +0200 Subject: [PATCH] Debt/met 5132 dates normalization cleanup part 4 (#622) * MET-5132: Update naming of numeric spaces variant * MET-5132: Prepare first sample of tests * MET-5132: More tests * MET-5132: Restructure with enum MonthNameDateExtractor * MET-5132: Reusable DatePartsIndices * MET-5132: Cleanup * MET-5132: Add spaces clean and trim * MET-5132: Cleanup * MET-5132: First split the range part * MET-5132: Refactor PatternBcAdDateExtractor with tests * MET-5132: PatternBcAdRangeDateExtractor cleanup * MET-5132: Reuse range code for dates * MET-5132: Reuse range code for BriefRangeDateExtractor * MET-5132: Simplify hierarchy for ranges * MET-5132: Add sample tests for PatternLongNegativeYearDateExtractor and adapt edtf builder * MET-5132: Refactor LongNegativeYearDateExtractor with a separate range class reusing already existent code * MET-5132: Split century extraction to numeric, roman, roman range reusing code * MET-5132: Centralize sanitization operation for all extractors. 
* MET-5132: Add tests for generic properties * MET-5132: Reuse test code * MET-5132: Repackage * MET-5132: Split EdtfDateExtractor to handle ranges separately with code reuse * MET-5132: Centralize date qualification overwriting * MET-5132: Simplify date qualification overwriting * MET-5132: Update code after answers from rnd * MET-5132: Cleanup * MET-5132: PatternFormattedFullDateDateExtractor cleanup * MET-5132: Add millisecond support * MET-5132: Process review * MET-5132: Process review 2 --- .../DateNormalizationExtractorMatchId.java | 2 +- .../dates/edtf/AbstractEdtfDate.java | 11 +- .../dates/edtf/DateQualification.java | 56 ++- .../dates/edtf/InstantEdtfDate.java | 26 +- .../dates/edtf/InstantEdtfDateBuilder.java | 97 +++--- .../dates/edtf/IntervalEdtfDate.java | 33 +- .../dates/edtf/IntervalEdtfDateBuilder.java | 19 +- .../dates/extraction/DatePartsIndices.java | 33 ++ .../dates/extraction/DatesSeparator.java | 9 + .../extraction/DefaultDatesSeparator.java | 20 ++ .../dates/extraction/MonthMultilingual.java | 42 +-- .../dates/extraction/NumericPartsPattern.java | 61 ++-- .../dateextractors/AbstractDateExtractor.java | 82 ----- .../BriefRangeDateExtractor.java | 70 ---- .../dateextractors/CenturyDateExtractor.java | 150 -------- .../dateextractors/EdtfDateExtractor.java | 105 ------ .../NumericPartsRangeDateExtractor.java | 118 ------- .../PatternBcAdDateExtractor.java | 124 ------- .../PatternFormatedFullDateDateExtractor.java | 66 ---- .../PatternLongNegativeYearDateExtractor.java | 60 ---- .../PatternMonthNameDateExtractor.java | 92 ----- .../extractors/AbstractDateExtractor.java | 75 ++++ .../AbstractRangeDateExtractor.java | 69 ++++ .../extractors/BcAdDateExtractor.java | 70 ++++ .../extractors/BcAdRangeDateExtractor.java | 44 +++ .../extractors/BriefRangeDateExtractor.java | 125 +++++++ .../CenturyNumericDateExtractor.java | 87 +++++ .../extractors/CenturyRomanDateExtractor.java | 48 +++ .../CenturyRomanRangeDateExtractor.java | 46 +++ 
.../DateExtractor.java | 14 +- .../DcmiPeriodDateExtractor.java | 31 +- .../DecadeDateExtractor.java | 22 +- .../extractors/EdtfDateExtractor.java | 95 ++++++ .../extractors/EdtfRangeDateExtractor.java | 75 ++++ .../extractors/FullDateDateExtractor.java | 92 +++++ .../LongNegativeYearDateExtractor.java | 35 ++ .../LongNegativeYearRangeDateExtractor.java | 46 +++ .../extractors/MonthNameDateExtractor.java | 117 +++++++ .../NumericPartsDateExtractor.java | 44 +-- .../NumericPartsRangeDateExtractor.java | 93 +++++ .../extractors/RangeDateExtractor.java | 76 +++++ .../normalizers/DatesNormalizer.java | 150 ++++---- .../BriefRangeDateExtractorTest.java | 88 ----- .../CenturyDateExtractorTest.java | 216 ------------ .../DcmiPeriodDateExtractorTest.java | 119 ------- .../DecadeDateExtractorTest.java | 74 ---- .../dateextractors/EdtfDateExtractorTest.java | 319 ------------------ .../extractors/BcAdDateExtractorTest.java | 117 +++++++ .../BcAdRangeDateExtractorTest.java | 68 ++++ .../BriefRangeDateExtractorTest.java | 61 ++++ .../CenturyNumericDateExtractorTest.java | 60 ++++ .../CenturyRomaDateExtractorTest.java | 93 +++++ .../CenturyRomanRangeDateExtractorTest.java | 103 ++++++ .../extractors/DateExtractorTest.java | 66 ++++ .../DcmiPeriodDateExtractorTest.java | 90 +++++ .../extractors/DecadeDateExtractorTest.java | 63 ++++ .../extractors/EdtfDateExtractorTest.java | 167 +++++++++ .../EdtfRangeDateExtractorTest.java | 155 +++++++++ .../extractors/FullDateDateExtractorTest.java | 51 +++ .../LongNegativeYearDateExtractorTest.java | 43 +++ ...ongNegativeYearRangeDateExtractorTest.java | 50 +++ .../MonthNameDateExtractorTest.java | 173 ++++++++++ .../NumericPartsDateExtractorTest.java | 30 +- .../NumericPartsRangeDateExtractorTest.java | 22 +- .../NumericRangeDMYArgumentsProvider.java | 2 +- .../NumericRangeDMYXXArgumentsProvider.java | 2 +- .../NumericRangeYMDArgumentsProvider.java | 2 +- .../NumericRangeYMDXXArgumentsProvider.java | 2 +- 
.../normalizers/DatesNormalizerTest.java | 194 +++++------ 69 files changed, 3017 insertions(+), 2143 deletions(-) create mode 100644 metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/DatePartsIndices.java create mode 100644 metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/DatesSeparator.java create mode 100644 metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/DefaultDatesSeparator.java delete mode 100644 metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/dateextractors/AbstractDateExtractor.java delete mode 100644 metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/dateextractors/BriefRangeDateExtractor.java delete mode 100644 metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/dateextractors/CenturyDateExtractor.java delete mode 100644 metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/dateextractors/EdtfDateExtractor.java delete mode 100644 metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/dateextractors/NumericPartsRangeDateExtractor.java delete mode 100644 metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/dateextractors/PatternBcAdDateExtractor.java delete mode 100644 metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/dateextractors/PatternFormatedFullDateDateExtractor.java delete mode 100644 metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/dateextractors/PatternLongNegativeYearDateExtractor.java delete mode 100644 metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/dateextractors/PatternMonthNameDateExtractor.java create mode 100644 metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/extractors/AbstractDateExtractor.java create mode 100644 
metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/extractors/AbstractRangeDateExtractor.java create mode 100644 metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/extractors/BcAdDateExtractor.java create mode 100644 metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/extractors/BcAdRangeDateExtractor.java create mode 100644 metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/extractors/BriefRangeDateExtractor.java create mode 100644 metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/extractors/CenturyNumericDateExtractor.java create mode 100644 metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/extractors/CenturyRomanDateExtractor.java create mode 100644 metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/extractors/CenturyRomanRangeDateExtractor.java rename metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/{dateextractors => extractors}/DateExtractor.java (58%) rename metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/{dateextractors => extractors}/DcmiPeriodDateExtractor.java (75%) rename metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/{dateextractors => extractors}/DecadeDateExtractor.java (59%) create mode 100644 metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/extractors/EdtfDateExtractor.java create mode 100644 metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/extractors/EdtfRangeDateExtractor.java create mode 100644 metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/extractors/FullDateDateExtractor.java create mode 100644 metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/extractors/LongNegativeYearDateExtractor.java create mode 100644 
metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/extractors/LongNegativeYearRangeDateExtractor.java create mode 100644 metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/extractors/MonthNameDateExtractor.java rename metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/{dateextractors => extractors}/NumericPartsDateExtractor.java (76%) create mode 100644 metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/extractors/NumericPartsRangeDateExtractor.java create mode 100644 metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/extractors/RangeDateExtractor.java delete mode 100644 metis-normalization/src/test/java/eu/europeana/normalization/dates/extraction/dateextractors/BriefRangeDateExtractorTest.java delete mode 100644 metis-normalization/src/test/java/eu/europeana/normalization/dates/extraction/dateextractors/CenturyDateExtractorTest.java delete mode 100644 metis-normalization/src/test/java/eu/europeana/normalization/dates/extraction/dateextractors/DcmiPeriodDateExtractorTest.java delete mode 100644 metis-normalization/src/test/java/eu/europeana/normalization/dates/extraction/dateextractors/DecadeDateExtractorTest.java delete mode 100644 metis-normalization/src/test/java/eu/europeana/normalization/dates/extraction/dateextractors/EdtfDateExtractorTest.java create mode 100644 metis-normalization/src/test/java/eu/europeana/normalization/dates/extraction/extractors/BcAdDateExtractorTest.java create mode 100644 metis-normalization/src/test/java/eu/europeana/normalization/dates/extraction/extractors/BcAdRangeDateExtractorTest.java create mode 100644 metis-normalization/src/test/java/eu/europeana/normalization/dates/extraction/extractors/BriefRangeDateExtractorTest.java create mode 100644 metis-normalization/src/test/java/eu/europeana/normalization/dates/extraction/extractors/CenturyNumericDateExtractorTest.java create mode 100644 
metis-normalization/src/test/java/eu/europeana/normalization/dates/extraction/extractors/CenturyRomaDateExtractorTest.java create mode 100644 metis-normalization/src/test/java/eu/europeana/normalization/dates/extraction/extractors/CenturyRomanRangeDateExtractorTest.java create mode 100644 metis-normalization/src/test/java/eu/europeana/normalization/dates/extraction/extractors/DateExtractorTest.java create mode 100644 metis-normalization/src/test/java/eu/europeana/normalization/dates/extraction/extractors/DcmiPeriodDateExtractorTest.java create mode 100644 metis-normalization/src/test/java/eu/europeana/normalization/dates/extraction/extractors/DecadeDateExtractorTest.java create mode 100644 metis-normalization/src/test/java/eu/europeana/normalization/dates/extraction/extractors/EdtfDateExtractorTest.java create mode 100644 metis-normalization/src/test/java/eu/europeana/normalization/dates/extraction/extractors/EdtfRangeDateExtractorTest.java create mode 100644 metis-normalization/src/test/java/eu/europeana/normalization/dates/extraction/extractors/FullDateDateExtractorTest.java create mode 100644 metis-normalization/src/test/java/eu/europeana/normalization/dates/extraction/extractors/LongNegativeYearDateExtractorTest.java create mode 100644 metis-normalization/src/test/java/eu/europeana/normalization/dates/extraction/extractors/LongNegativeYearRangeDateExtractorTest.java create mode 100644 metis-normalization/src/test/java/eu/europeana/normalization/dates/extraction/extractors/MonthNameDateExtractorTest.java rename metis-normalization/src/test/java/eu/europeana/normalization/dates/extraction/{dateextractors => extractors}/NumericPartsDateExtractorTest.java (92%) rename metis-normalization/src/test/java/eu/europeana/normalization/dates/extraction/{dateextractors => extractors}/NumericPartsRangeDateExtractorTest.java (58%) rename metis-normalization/src/test/java/eu/europeana/normalization/dates/extraction/{dateextractors => 
extractors}/NumericRangeDMYArgumentsProvider.java (99%) rename metis-normalization/src/test/java/eu/europeana/normalization/dates/extraction/{dateextractors => extractors}/NumericRangeDMYXXArgumentsProvider.java (99%) rename metis-normalization/src/test/java/eu/europeana/normalization/dates/extraction/{dateextractors => extractors}/NumericRangeYMDArgumentsProvider.java (99%) rename metis-normalization/src/test/java/eu/europeana/normalization/dates/extraction/{dateextractors => extractors}/NumericRangeYMDXXArgumentsProvider.java (99%) diff --git a/metis-normalization/src/main/java/eu/europeana/normalization/dates/DateNormalizationExtractorMatchId.java b/metis-normalization/src/main/java/eu/europeana/normalization/dates/DateNormalizationExtractorMatchId.java index c9c714799b..2a6378e9b1 100644 --- a/metis-normalization/src/main/java/eu/europeana/normalization/dates/DateNormalizationExtractorMatchId.java +++ b/metis-normalization/src/main/java/eu/europeana/normalization/dates/DateNormalizationExtractorMatchId.java @@ -19,7 +19,7 @@ public enum DateNormalizationExtractorMatchId { NUMERIC_ALL_VARIANTS_XX("numeric date (various separators and unknown parts)"), NUMERIC_RANGE_ALL_VARIANTS("numeric date interval (various separators)"), NUMERIC_RANGE_ALL_VARIANTS_XX("numeric date interval (various separators and unknown parts)"), - YYYY_MM_DD_SPACES("numeric date (whitespace separators)"); + NUMERIC_SPACES_VARIANT("numeric date (whitespace separators)"); final String label; diff --git a/metis-normalization/src/main/java/eu/europeana/normalization/dates/edtf/AbstractEdtfDate.java b/metis-normalization/src/main/java/eu/europeana/normalization/dates/edtf/AbstractEdtfDate.java index d840ce71bf..9c79e906e7 100644 --- a/metis-normalization/src/main/java/eu/europeana/normalization/dates/edtf/AbstractEdtfDate.java +++ b/metis-normalization/src/main/java/eu/europeana/normalization/dates/edtf/AbstractEdtfDate.java @@ -1,5 +1,7 @@ package eu.europeana.normalization.dates.edtf; +import 
java.util.Set; + /** * An abstract class that contains the template that an EDTF date with compliance level 1 should implement. *

See more in the specification of EDTF

@@ -17,11 +19,18 @@ protected AbstractEdtfDate(String label) { this.label = label; } + /** + * Add the date qualification, mainly used for pre-sanitized values. + * + * @param dateQualification the date qualification + */ + public abstract void addQualification(DateQualification dateQualification); + public String getLabel() { return label; } - public abstract DateQualification getDateQualification(); + public abstract Set getDateQualifications(); public abstract boolean isOpen(); diff --git a/metis-normalization/src/main/java/eu/europeana/normalization/dates/edtf/DateQualification.java b/metis-normalization/src/main/java/eu/europeana/normalization/dates/edtf/DateQualification.java index 1c9c06c6ac..aef6749e42 100644 --- a/metis-normalization/src/main/java/eu/europeana/normalization/dates/edtf/DateQualification.java +++ b/metis-normalization/src/main/java/eu/europeana/normalization/dates/edtf/DateQualification.java @@ -1,6 +1,7 @@ package eu.europeana.normalization.dates.edtf; -import java.util.Arrays; +import java.util.EnumSet; +import java.util.Set; import java.util.regex.Pattern; /** @@ -8,32 +9,51 @@ * Specification */ public enum DateQualification { + UNCERTAIN, APPROXIMATE; - NO_QUALIFICATION(""), - UNCERTAIN("?"), - APPROXIMATE("~"), - UNCERTAIN_APPROXIMATE("%"); - - public static final Pattern CHECK_QUALIFICATION_PATTERN = Pattern.compile("^[^\\?~%]*([\\?~%]?)$"); - private final String character; - - DateQualification(String character) { - this.character = character; - } + private static final String UNCERTAIN_CHARACTER = "?"; + private static final String APPROXIMATE_CHARACTER = "~"; + private static final String UNCERTAIN_APPROXIMATE_CHARACTER = "%"; + private static final String CHARACTERS_REGEX = UNCERTAIN_CHARACTER + APPROXIMATE_CHARACTER + UNCERTAIN_APPROXIMATE_CHARACTER; + public static final Pattern PATTERN = Pattern.compile("^[^" + CHARACTERS_REGEX + "]*([" + CHARACTERS_REGEX + "])$"); /** - * Get the enum value based on the character provided. - *

It will return a matched enum value or {@link #NO_QUALIFICATION}.

+ * Get the enum values based on the character provided. + *

It will return an empty set or the set with the applicable qualifications.

* * @param character the provided character * @return the enum value */ - public static DateQualification fromCharacter(String character) { - return Arrays.stream(DateQualification.values()).filter(value -> value.character.equals(character)).findFirst().orElse( - NO_QUALIFICATION); + public static Set fromCharacter(String character) { + final Set dateQualifications = EnumSet.noneOf(DateQualification.class); + if (UNCERTAIN_APPROXIMATE_CHARACTER.equals(character)) { + dateQualifications.add(DateQualification.UNCERTAIN); + dateQualifications.add(DateQualification.APPROXIMATE); + } else if (UNCERTAIN_CHARACTER.equals(character)) { + dateQualifications.add(DateQualification.UNCERTAIN); + } else if (APPROXIMATE_CHARACTER.equals(character)) { + dateQualifications.add(DateQualification.APPROXIMATE); + } + return dateQualifications; } - public String getCharacter() { + /** + * Get the string representation based on the provided date qualifications. + * + * @param dateQualifications the date qualifications + * @return the string representation + */ + public static String getCharacterFromQualifications(Set dateQualifications) { + final String character; + if (dateQualifications.contains(UNCERTAIN) && dateQualifications.contains(APPROXIMATE)) { + character = UNCERTAIN_APPROXIMATE_CHARACTER; + } else if (dateQualifications.contains(UNCERTAIN)) { + character = UNCERTAIN_CHARACTER; + } else if (dateQualifications.contains(APPROXIMATE)) { + character = APPROXIMATE_CHARACTER; + } else { + character = ""; + } return character; } } diff --git a/metis-normalization/src/main/java/eu/europeana/normalization/dates/edtf/InstantEdtfDate.java b/metis-normalization/src/main/java/eu/europeana/normalization/dates/edtf/InstantEdtfDate.java index bb8fb4bc95..883515d19c 100644 --- a/metis-normalization/src/main/java/eu/europeana/normalization/dates/edtf/InstantEdtfDate.java +++ b/metis-normalization/src/main/java/eu/europeana/normalization/dates/edtf/InstantEdtfDate.java @@ -3,7 +3,6 @@ import 
static eu.europeana.normalization.dates.edtf.DateBoundaryType.DECLARED; import static eu.europeana.normalization.dates.edtf.DateBoundaryType.OPEN; import static eu.europeana.normalization.dates.edtf.DateBoundaryType.UNKNOWN; -import static eu.europeana.normalization.dates.edtf.DateQualification.NO_QUALIFICATION; import static eu.europeana.normalization.dates.edtf.InstantEdtfDateBuilder.THRESHOLD_4_DIGITS_YEAR; import static eu.europeana.normalization.dates.edtf.Iso8601Parser.ISO_8601_MINIMUM_YEAR_DIGITS; import static java.lang.Math.abs; @@ -19,7 +18,9 @@ import java.time.Year; import java.time.YearMonth; import java.time.temporal.TemporalAccessor; +import java.util.EnumSet; import java.util.Objects; +import java.util.Set; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -38,11 +39,13 @@ public final class InstantEdtfDate extends AbstractEdtfDate implements Comparabl private Month month; private LocalDate yearMonthDay; private YearPrecision yearPrecision; - private DateQualification dateQualification = NO_QUALIFICATION; + private Set dateQualifications = EnumSet.noneOf(DateQualification.class); private DateBoundaryType dateBoundaryType = DECLARED; /** * Restricted constructor by provided {@link InstantEdtfDateBuilder}. + *

All fields apart from {@link #dateQualifications} are strictly contained in the constructor. The date qualifications can + * be further extended to, for example, add an approximate qualification for a date that was sanitized.

* * @param instantEdtfDateBuilder the builder with all content verified */ @@ -51,13 +54,18 @@ public final class InstantEdtfDate extends AbstractEdtfDate implements Comparabl year = instantEdtfDateBuilder.getYearObj(); month = instantEdtfDateBuilder.getMonthObj(); yearMonthDay = instantEdtfDateBuilder.getYearMonthDayObj(); - dateQualification = instantEdtfDateBuilder.getDateQualification(); + dateQualifications = instantEdtfDateBuilder.getDateQualifications(); } private InstantEdtfDate(DateBoundaryType dateBoundaryType) { this.dateBoundaryType = dateBoundaryType; } + @Override + public void addQualification(DateQualification dateQualification) { + this.dateQualifications.add(dateQualification); + } + /** * Create an {@link DateBoundaryType#UNKNOWN} instant. * @@ -188,7 +196,7 @@ public Integer getCentury() { int centuryDivision = year.getValue() / YearPrecision.CENTURY.getDuration(); int centuryModulo = year.getValue() % YearPrecision.CENTURY.getDuration(); //For case 1900 it is 19th. For case 1901 it is 20th century - return centuryModulo == 0 ? centuryDivision : centuryDivision + 1; + return (centuryModulo == 0) ? 
centuryDivision : (centuryDivision + 1); } /** @@ -230,7 +238,7 @@ public String toString() { stringBuilder.append( ofNullable(yearMonthDay).map(LocalDate::getDayOfMonth).map(decimalFormat::format).map(d -> "-" + d).orElse("")); } - stringBuilder.append(dateQualification.getCharacter()); + stringBuilder.append(DateQualification.getCharacterFromQualifications(dateQualifications)); return stringBuilder.toString(); } @@ -256,13 +264,13 @@ public boolean equals(Object o) { } InstantEdtfDate that = (InstantEdtfDate) o; return yearPrecision == that.yearPrecision && Objects.equals(year, that.year) && Objects.equals(month, - that.month) && Objects.equals(yearMonthDay, that.yearMonthDay) && dateQualification == that.dateQualification + that.month) && Objects.equals(yearMonthDay, that.yearMonthDay) && dateQualifications == that.dateQualifications && dateBoundaryType == that.dateBoundaryType; } @Override public int hashCode() { - return Objects.hash(yearPrecision, year, month, yearMonthDay, dateQualification, dateBoundaryType); + return Objects.hash(yearPrecision, year, month, yearMonthDay, dateQualifications, dateBoundaryType); } public Year getYear() { @@ -281,8 +289,8 @@ public YearPrecision getYearPrecision() { return yearPrecision; } - public DateQualification getDateQualification() { - return dateQualification; + public Set getDateQualifications() { + return EnumSet.copyOf(dateQualifications); } public DateBoundaryType getDateBoundaryType() { diff --git a/metis-normalization/src/main/java/eu/europeana/normalization/dates/edtf/InstantEdtfDateBuilder.java b/metis-normalization/src/main/java/eu/europeana/normalization/dates/edtf/InstantEdtfDateBuilder.java index ba3a45bfca..e47ec5e69c 100644 --- a/metis-normalization/src/main/java/eu/europeana/normalization/dates/edtf/InstantEdtfDateBuilder.java +++ b/metis-normalization/src/main/java/eu/europeana/normalization/dates/edtf/InstantEdtfDateBuilder.java @@ -1,8 +1,6 @@ package eu.europeana.normalization.dates.edtf; -import 
static eu.europeana.normalization.dates.edtf.DateQualification.NO_QUALIFICATION; import static java.lang.String.format; -import static java.util.Optional.ofNullable; import eu.europeana.normalization.dates.YearPrecision; import eu.europeana.normalization.dates.extraction.DateExtractionException; @@ -14,14 +12,16 @@ import java.time.YearMonth; import java.time.temporal.ChronoField; import java.time.temporal.TemporalAccessor; +import java.util.EnumSet; import java.util.Objects; +import java.util.Set; import org.slf4j.Logger; import org.slf4j.LoggerFactory; /** * Builder class for {@link InstantEdtfDate}. *

During {@link #build()} it will verify all the parameters that have been requested. - * The {@link #build()}, if {@link #withFlexibleDateBuild(boolean)} was called with {@code true}, will also attempt a second time + * The {@link #build()}, if {@link #withAllowDayMonthSwap(boolean)} was called with {@code true}, will also attempt a second time * by switching month and day values if the original values were invalid. Furthermore, there are a set of constructors that can * start the builder and will perform a build with specific characteristics: *

    @@ -38,14 +38,13 @@ public class InstantEdtfDateBuilder { private Year yearObj; private Month monthObj; private LocalDate yearMonthDayObj; - private Integer year; + private final Integer year; private Integer month; private Integer day; - private TemporalAccessor temporalAccessor; - private YearPrecision yearPrecision; - private DateQualification dateQualification; - private boolean flexibleDateBuild = true; - private boolean longDatePrefixedWithY = false; + private YearPrecision yearPrecision = YearPrecision.YEAR; + private final Set dateQualifications = EnumSet.noneOf(DateQualification.class); + private boolean allowDayMonthSwap = true; + private boolean isMoreThanFourDigitsYear = false; /** * Constructor which initializes the builder with the minimum requirement of year value. @@ -65,7 +64,12 @@ public InstantEdtfDateBuilder(final Integer year) { * @param temporalAccessor the temporal accessor */ public InstantEdtfDateBuilder(TemporalAccessor temporalAccessor) { - this.temporalAccessor = temporalAccessor; + day = temporalAccessor.isSupported(ChronoField.DAY_OF_MONTH) ? + temporalAccessor.get(ChronoField.DAY_OF_MONTH) : null; + month = temporalAccessor.isSupported(ChronoField.MONTH_OF_YEAR) ? + temporalAccessor.get(ChronoField.MONTH_OF_YEAR) : null; + year = temporalAccessor.isSupported(ChronoField.YEAR) ? 
+ temporalAccessor.get(ChronoField.YEAR) : null; } /** @@ -75,10 +79,9 @@ public InstantEdtfDateBuilder(TemporalAccessor temporalAccessor) { * @throws DateExtractionException if something went wrong during date validation */ public InstantEdtfDate build() throws DateExtractionException { - InstantEdtfDate instantEdtfDate; - instantEdtfDate = buildInternal(); + InstantEdtfDate instantEdtfDate = buildInternal(); //Try once more if flexible date - if (instantEdtfDate == null && flexibleDateBuild) { + if (instantEdtfDate == null && isPositive(month) && isPositive(day) && allowDayMonthSwap) { swapMonthDay(); instantEdtfDate = buildInternal(); } @@ -92,14 +95,7 @@ public InstantEdtfDate build() throws DateExtractionException { private InstantEdtfDate buildInternal() { InstantEdtfDate instantEdtfDate = null; - //Setup defaults - yearPrecision = ofNullable(yearPrecision).orElse(YearPrecision.YEAR); - dateQualification = ofNullable(dateQualification).orElse(NO_QUALIFICATION); - try { - if (temporalAccessor != null) { - parseTemporalAccessor(); - } parseYear(); parseMonthDay(); validateDateNotInFuture(); @@ -111,34 +107,24 @@ private InstantEdtfDate buildInternal() { return instantEdtfDate; } - private void parseTemporalAccessor() { - LOGGER.debug("TemporalAccessor present. Overwriting values."); - day = temporalAccessor.isSupported(ChronoField.DAY_OF_MONTH) ? - temporalAccessor.get(ChronoField.DAY_OF_MONTH) : null; - month = temporalAccessor.isSupported(ChronoField.MONTH_OF_YEAR) ? - temporalAccessor.get(ChronoField.MONTH_OF_YEAR) : null; - year = temporalAccessor.isSupported(ChronoField.YEAR) ? 
- temporalAccessor.get(ChronoField.YEAR) : null; - } - private void parseYear() throws DateExtractionException { Objects.requireNonNull(year, "Year value can never be null"); - if (longDatePrefixedWithY && Math.abs(year) <= THRESHOLD_4_DIGITS_YEAR) { + if (isMoreThanFourDigitsYear && Math.abs(year) <= THRESHOLD_4_DIGITS_YEAR) { throw new DateExtractionException( - format("Prefixed year with 'Y' is enabled indicating that year should have absolute value greater than %s", - THRESHOLD_4_DIGITS_YEAR)); - } else if (!longDatePrefixedWithY && Math.abs(year) > THRESHOLD_4_DIGITS_YEAR) { + format("isLongerThanFourDigitsYear is %s indicating that year should have absolute value greater than %s", + true, THRESHOLD_4_DIGITS_YEAR)); + } else if (!isMoreThanFourDigitsYear && Math.abs(year) > THRESHOLD_4_DIGITS_YEAR) { throw new DateExtractionException( - format("Year absolute value greater than %s, should be prefixed with 'Y'", THRESHOLD_4_DIGITS_YEAR)); + format("Year absolute value is greater than %s, and isLongerThanFourDigitsYear is %s", THRESHOLD_4_DIGITS_YEAR, false)); } yearObj = Year.of(year * yearPrecision.getDuration()); } private void parseMonthDay() throws DateExtractionException { try { - if (month != null && month >= 1) { + if (isPositive(month)) { monthObj = Month.of(month); - if (day != null && day >= 1) { + if (isPositive(day)) { yearMonthDayObj = LocalDate.of(yearObj.getValue(), monthObj.getValue(), day); } } @@ -147,6 +133,10 @@ private void parseMonthDay() throws DateExtractionException { } } + private boolean isPositive(Integer value) { + return value != null && value > 0; + } + private void validateDateNotInFuture() throws DateExtractionException { try { final boolean isYearMonthDayInTheFuture = yearMonthDayObj != null && yearMonthDayObj.isAfter(LocalDate.now()); @@ -164,13 +154,12 @@ private void validateDateNotInFuture() throws DateExtractionException { private void validateStrict() throws DateExtractionException { //If it is not a long year, and we want 
to be strict we further validate - boolean notLongYearAndStrictBuild = !longDatePrefixedWithY && !flexibleDateBuild; - // TODO: 15/02/2023 Check this instruction. It used to be like that - // return edtfDatePart.isUnknown() || edtfDatePart.isUncertain() || edtfDatePart.getYearPrecision() != null; - // but do we actually need the check on unknown? - boolean isDateNonPrecise = dateQualification == DateQualification.UNCERTAIN || yearPrecision != null; + boolean isNotMoreThanFourDigitsYearAndStrictBuild = !isMoreThanFourDigitsYear && !allowDayMonthSwap; + boolean isDateNonPrecise = + dateQualifications.contains(DateQualification.UNCERTAIN) || (yearPrecision != null + && yearPrecision != YearPrecision.YEAR); boolean notCompleteDate = monthObj == null || yearMonthDayObj == null; - if (notLongYearAndStrictBuild && (isDateNonPrecise || notCompleteDate)) { + if (isNotMoreThanFourDigitsYearAndStrictBuild && (isDateNonPrecise || notCompleteDate)) { throw new DateExtractionException("Date is invalid according to our strict profile!"); } } @@ -217,22 +206,22 @@ public InstantEdtfDateBuilder withYearPrecision(YearPrecision yearPrecision) { /** * Add date qualification. * - * @param dateQualification the date qualification + * @param dateQualifications the date qualifications * @return the extended builder */ - public InstantEdtfDateBuilder withDateQualification(DateQualification dateQualification) { - this.dateQualification = dateQualification; + public InstantEdtfDateBuilder withDateQualification(Set dateQualifications) { + this.dateQualifications.addAll(dateQualifications); return this; } /** - * Opt in/out for flexible date building. + * Opt in/out for day month swap if original values failed validation. 
* - * @param flexibleDateBuild the boolean (dis|en)abling the flexibility + * @param allowDayMonthSwap the boolean (dis|en)abling the day and month swap * @return the extended builder */ - public InstantEdtfDateBuilder withFlexibleDateBuild(boolean flexibleDateBuild) { - this.flexibleDateBuild = flexibleDateBuild; + public InstantEdtfDateBuilder withAllowDayMonthSwap(boolean allowDayMonthSwap) { + this.allowDayMonthSwap = allowDayMonthSwap; return this; } @@ -241,8 +230,8 @@ public InstantEdtfDateBuilder withFlexibleDateBuild(boolean flexibleDateBuild) { * * @return the extended builder */ - public InstantEdtfDateBuilder withLongYearPrefixedWithY() { - this.longDatePrefixedWithY = true; + public InstantEdtfDateBuilder withMoreThanFourDigitsYear() { + this.isMoreThanFourDigitsYear = true; return this; } @@ -262,7 +251,7 @@ public YearPrecision getYearPrecision() { return yearPrecision; } - public DateQualification getDateQualification() { - return dateQualification; + public Set getDateQualifications() { + return EnumSet.copyOf(dateQualifications); } } diff --git a/metis-normalization/src/main/java/eu/europeana/normalization/dates/edtf/IntervalEdtfDate.java b/metis-normalization/src/main/java/eu/europeana/normalization/dates/edtf/IntervalEdtfDate.java index 28100b047d..f5c6659526 100644 --- a/metis-normalization/src/main/java/eu/europeana/normalization/dates/edtf/IntervalEdtfDate.java +++ b/metis-normalization/src/main/java/eu/europeana/normalization/dates/edtf/IntervalEdtfDate.java @@ -1,16 +1,28 @@ package eu.europeana.normalization.dates.edtf; +import static eu.europeana.normalization.dates.extraction.DefaultDatesSeparator.SLASH_DELIMITER; import static java.lang.String.format; +import java.util.EnumSet; +import java.util.Set; + /** * An EDTF date that represents a period of time specified by a start and end date with various degrees of precision */ public class IntervalEdtfDate extends AbstractEdtfDate { - public static final String DATE_INTERVAL_SEPARATOR = 
"/"; private InstantEdtfDate start; private InstantEdtfDate end; + + /** + * Restricted constructor by provided {@link InstantEdtfDateBuilder}. + *

    All fields apart from the internal {@link IntervalEdtfDate#addQualification(DateQualification)}(for each boundary) are + * strictly contained in the constructor. The date qualifications can be further extended to, for example, add an approximate + * qualification for a date that was sanitized.

    + * + * @param intervalEdtfDateBuilder the builder with all content verified + */ IntervalEdtfDate(IntervalEdtfDateBuilder intervalEdtfDateBuilder) { super(intervalEdtfDateBuilder.getLabel()); this.start = intervalEdtfDateBuilder.getStart(); @@ -18,13 +30,16 @@ public class IntervalEdtfDate extends AbstractEdtfDate { } @Override - public DateQualification getDateQualification() { - // TODO: 24/02/2023 To verify what this should return. - if (start.getDateQualification() == DateQualification.NO_QUALIFICATION) { - return end.getDateQualification(); - } else { - return start.getDateQualification(); - } + public void addQualification(DateQualification dateQualification) { + start.addQualification(dateQualification); + end.addQualification(dateQualification); + } + + @Override + public Set getDateQualifications() { + Set dateQualifications = EnumSet.copyOf(start.getDateQualifications()); + dateQualifications.addAll(end.getDateQualifications()); + return dateQualifications; } @Override @@ -60,6 +75,6 @@ public void setEnd(InstantEdtfDate end) { @Override public String toString() { - return format("%s%s%s", start.toString(), DATE_INTERVAL_SEPARATOR, end.toString()); + return format("%s%s%s", start.toString(), SLASH_DELIMITER.getStringRepresentation(), end.toString()); } } diff --git a/metis-normalization/src/main/java/eu/europeana/normalization/dates/edtf/IntervalEdtfDateBuilder.java b/metis-normalization/src/main/java/eu/europeana/normalization/dates/edtf/IntervalEdtfDateBuilder.java index 428a673eda..053a64fdc2 100644 --- a/metis-normalization/src/main/java/eu/europeana/normalization/dates/edtf/IntervalEdtfDateBuilder.java +++ b/metis-normalization/src/main/java/eu/europeana/normalization/dates/edtf/IntervalEdtfDateBuilder.java @@ -10,8 +10,8 @@ * Builder class for {@link IntervalEdtfDate}. *

    * During {@link #build()} it will verify all the parameters that have been requested. The {@link #build()}, if - * {@link #withFlexibleDateBuild(boolean)} was called with {@code true}, will also attempt a second time by switching start and - * end values if the original values were invalid. + * {@link #withAllowStartEndSwap(boolean)} was called with {@code true}, will also attempt a second time by switching + * start and end values if the original values were invalid. *

    */ public class IntervalEdtfDateBuilder { @@ -20,8 +20,7 @@ public class IntervalEdtfDateBuilder { private InstantEdtfDate start; private InstantEdtfDate end; private String label; - - private boolean flexibleDateBuild = false; + private boolean allowStartEndSwap = true; /** * Constructor which initializes the builder with the start and end date boundaries. @@ -47,13 +46,13 @@ public IntervalEdtfDateBuilder withLabel(String label) { } /** - * Opt in/out for flexible date building. + * Opt in/out for start end swap if original values failed validation. * - * @param flexibleDateBuild the boolean (dis|en)abling the flexibility + * @param allowStartEndSwap the boolean (dis|en)abling the start and end swap * @return the extended builder */ - public IntervalEdtfDateBuilder withFlexibleDateBuild(boolean flexibleDateBuild) { - this.flexibleDateBuild = flexibleDateBuild; + public IntervalEdtfDateBuilder withAllowStartEndSwap(boolean allowStartEndSwap) { + this.allowStartEndSwap = allowStartEndSwap; return this; } @@ -67,13 +66,11 @@ public IntervalEdtfDate build() throws DateExtractionException { IntervalEdtfDate intervalEdtfDate; intervalEdtfDate = buildInternal(); //Try once more if switching allowed - if (intervalEdtfDate == null && flexibleDateBuild) { - //Retry with swapping month and day + if (intervalEdtfDate == null && allowStartEndSwap) { switchStartWithEnd(); intervalEdtfDate = buildInternal(); } - //Still nothing, we are done. 
if (intervalEdtfDate == null) { throw new DateExtractionException("Could not instantiate date"); } diff --git a/metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/DatePartsIndices.java b/metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/DatePartsIndices.java new file mode 100644 index 0000000000..f2e639e383 --- /dev/null +++ b/metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/DatePartsIndices.java @@ -0,0 +1,33 @@ +package eu.europeana.normalization.dates.extraction; + +import org.apache.commons.lang3.tuple.ImmutableTriple; +import org.apache.commons.lang3.tuple.Triple; + +/** + * Enum containing triples of the group indices. + *

    The positions are Left = Year, Middle = Month, Right = Day

    + */ +public enum DatePartsIndices { + DMY_INDICES(ImmutableTriple.of(3, 2, 1)), + YMD_INDICES(ImmutableTriple.of(1, 2, 3)), + MDY_INDICES(ImmutableTriple.of(3, 1, 2)), + MY_INDICES(ImmutableTriple.of(2, 1, null)); + + private final Triple indicesTriple; + + DatePartsIndices(Triple indicesTriple) { + this.indicesTriple = indicesTriple; + } + + public Integer getYearIndex() { + return indicesTriple.getLeft(); + } + + public Integer getMonthIndex() { + return indicesTriple.getMiddle(); + } + + public Integer getDayIndex() { + return indicesTriple.getRight(); + } +} diff --git a/metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/DatesSeparator.java b/metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/DatesSeparator.java new file mode 100644 index 0000000000..584db54b6c --- /dev/null +++ b/metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/DatesSeparator.java @@ -0,0 +1,9 @@ +package eu.europeana.normalization.dates.extraction; + +/** + * Interface to get the separator between two dates + */ +public interface DatesSeparator { + + String getStringRepresentation(); +} diff --git a/metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/DefaultDatesSeparator.java b/metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/DefaultDatesSeparator.java new file mode 100644 index 0000000000..1d693adb28 --- /dev/null +++ b/metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/DefaultDatesSeparator.java @@ -0,0 +1,20 @@ +package eu.europeana.normalization.dates.extraction; + +/** + * Basic default enum for date separators + */ +public enum DefaultDatesSeparator implements DatesSeparator { + DASH_DELIMITER("-"), + SLASH_DELIMITER("/"); + + private final String stringRepresentation; + + DefaultDatesSeparator(String stringRepresentation) { + this.stringRepresentation = stringRepresentation; + } + + @Override + public String 
getStringRepresentation() { + return stringRepresentation; + } +} diff --git a/metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/MonthMultilingual.java b/metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/MonthMultilingual.java index ce0b26cde0..bbd7ec53f0 100644 --- a/metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/MonthMultilingual.java +++ b/metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/MonthMultilingual.java @@ -1,10 +1,15 @@ package eu.europeana.normalization.dates.extraction; +import static java.util.Collections.unmodifiableSet; + import java.time.Month; import java.time.format.TextStyle; +import java.util.Collections; import java.util.EnumMap; import java.util.HashSet; +import java.util.Locale; import java.util.Map; +import java.util.Map.Entry; import java.util.Set; /** @@ -13,7 +18,7 @@ */ public class MonthMultilingual { - private final Map> monthToAllLanguagesStringsMap; + private final EnumMap> monthToAllLanguagesStringsMap; /** * Default constructor. 
@@ -27,34 +32,33 @@ public MonthMultilingual() { for (Month month : Month.values()) { final HashSet languageValues = new HashSet<>(); for (EuropeanLanguage europeanLanguage : EuropeanLanguage.values()) { - languageValues.add(month.getDisplayName(TextStyle.SHORT, europeanLanguage.getLocale())); - languageValues.add(month.getDisplayName(TextStyle.SHORT_STANDALONE, europeanLanguage.getLocale())); - languageValues.add(month.getDisplayName(TextStyle.FULL, europeanLanguage.getLocale())); - languageValues.add(month.getDisplayName(TextStyle.FULL_STANDALONE, europeanLanguage.getLocale())); + languageValues.add(month.getDisplayName(TextStyle.SHORT, europeanLanguage.getLocale()) + .toLowerCase(europeanLanguage.getLocale())); + languageValues.add(month.getDisplayName(TextStyle.SHORT_STANDALONE, europeanLanguage.getLocale()) + .toLowerCase(europeanLanguage.getLocale())); + languageValues.add(month.getDisplayName(TextStyle.FULL, europeanLanguage.getLocale()) + .toLowerCase(europeanLanguage.getLocale())); + languageValues.add(month.getDisplayName(TextStyle.FULL_STANDALONE, europeanLanguage.getLocale()) + .toLowerCase(europeanLanguage.getLocale())); } - monthToAllLanguagesStringsMap.put(month, languageValues); + monthToAllLanguagesStringsMap.put(month, unmodifiableSet(languageValues)); } } - /** - * Get all languages string values for a month. - * - * @param month the month - * @return the set of all string representations - */ - public Set getMonthStrings(Month month) { - return monthToAllLanguagesStringsMap.get(month); + public Map> getMonthToAllLanguagesStringsMap() { + return Collections.unmodifiableMap(monthToAllLanguagesStringsMap); } /** - * Get the month index based on a month name in any supported language, full or short, standard or stand-alone. + * Get {@link Month} by name. 
* * @param monthName the month name - * @return the month index + * @return the month */ - public Integer getMonthIndexValue(String monthName) { - return monthToAllLanguagesStringsMap.entrySet().stream().filter(entry -> entry.getValue().contains(monthName)) - .findFirst().map(entry -> entry.getKey().getValue()).orElse(null); + public Month getMonth(String monthName) { + return monthToAllLanguagesStringsMap.entrySet().stream() + .filter(entry -> entry.getValue().contains(monthName.toLowerCase(Locale.ROOT))) + .findFirst().map(Entry::getKey).orElse(null); } } diff --git a/metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/NumericPartsPattern.java b/metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/NumericPartsPattern.java index c5defa979e..27beb5eebb 100644 --- a/metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/NumericPartsPattern.java +++ b/metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/NumericPartsPattern.java @@ -2,17 +2,17 @@ import static eu.europeana.normalization.dates.DateNormalizationExtractorMatchId.NUMERIC_ALL_VARIANTS; import static eu.europeana.normalization.dates.DateNormalizationExtractorMatchId.NUMERIC_ALL_VARIANTS_XX; -import static eu.europeana.normalization.dates.DateNormalizationExtractorMatchId.YYYY_MM_DD_SPACES; -import static eu.europeana.normalization.dates.extraction.NumericPartsPattern.DatePartsIndices.DMY_INDICES; -import static eu.europeana.normalization.dates.extraction.NumericPartsPattern.DatePartsIndices.YMD_INDICES; +import static eu.europeana.normalization.dates.DateNormalizationExtractorMatchId.NUMERIC_SPACES_VARIANT; +import static eu.europeana.normalization.dates.extraction.DatePartsIndices.DMY_INDICES; +import static eu.europeana.normalization.dates.extraction.DatePartsIndices.YMD_INDICES; import static eu.europeana.normalization.dates.extraction.NumericPartsPattern.NumericDateDelimiters.DASH_DOT_DELIMITERS; import 
static eu.europeana.normalization.dates.extraction.NumericPartsPattern.NumericDateDelimiters.DASH_DOT_SLASH_DELIMITERS; import static eu.europeana.normalization.dates.extraction.NumericPartsPattern.NumericDateDelimiters.DOT_SLASH_DELIMITERS; import static eu.europeana.normalization.dates.extraction.NumericPartsPattern.NumericDateDelimiters.SPACE_DELIMITER; -import static eu.europeana.normalization.dates.extraction.NumericPartsPattern.NumericRangeDateDelimiters.DASH_RANGE; -import static eu.europeana.normalization.dates.extraction.NumericPartsPattern.NumericRangeDateDelimiters.SLASH_RANGE; -import static eu.europeana.normalization.dates.extraction.NumericPartsPattern.NumericRangeDateDelimiters.SPACED_DASH_RANGE; -import static eu.europeana.normalization.dates.extraction.NumericPartsPattern.NumericRangeDateDelimiters.SPACE_RANGE; +import static eu.europeana.normalization.dates.extraction.NumericPartsPattern.NumericRangeQualifier.DASH_RANGE; +import static eu.europeana.normalization.dates.extraction.NumericPartsPattern.NumericRangeQualifier.SLASH_RANGE; +import static eu.europeana.normalization.dates.extraction.NumericPartsPattern.NumericRangeQualifier.SPACED_DASH_RANGE; +import static eu.europeana.normalization.dates.extraction.NumericPartsPattern.NumericRangeQualifier.SPACE_RANGE; import static java.util.Collections.unmodifiableSet; import static java.util.regex.Pattern.CASE_INSENSITIVE; import static java.util.regex.Pattern.compile; @@ -21,8 +21,6 @@ import java.util.EnumSet; import java.util.Set; import java.util.regex.Pattern; -import org.apache.commons.lang3.tuple.ImmutableTriple; -import org.apache.commons.lang3.tuple.Triple; /** * Enum with all the acceptable date patterns used for numeric dates. 
@@ -37,8 +35,8 @@ public enum NumericPartsPattern { YMD_XX(DASH_DOT_SLASH_DELIMITERS, YMD_INDICES, NUMERIC_ALL_VARIANTS_XX), DMY_XX(DASH_DOT_SLASH_DELIMITERS, DMY_INDICES, NUMERIC_ALL_VARIANTS_XX), - YMD_SPACES(SPACE_DELIMITER, YMD_INDICES, YYYY_MM_DD_SPACES), - DMY_SPACES(SPACE_DELIMITER, DMY_INDICES, YYYY_MM_DD_SPACES), + YMD_SPACES(SPACE_DELIMITER, YMD_INDICES, NUMERIC_SPACES_VARIANT), + DMY_SPACES(SPACE_DELIMITER, DMY_INDICES, NUMERIC_SPACES_VARIANT), YMD_SPACED_DASH_RANGE(SPACED_DASH_RANGE, YMD_INDICES, NUMERIC_ALL_VARIANTS), DMY_SPACED_DASH_RANGE(SPACED_DASH_RANGE, DMY_INDICES, NUMERIC_ALL_VARIANTS), @@ -88,9 +86,9 @@ public enum NumericPartsPattern { NumericPartsPattern(DateDelimiters dateDelimiters, DatePartsIndices dateFormatIndices, DateNormalizationExtractorMatchId dateNormalizationExtractorMatchId) { this.dateNormalizationExtractorMatchId = dateNormalizationExtractorMatchId; - this.yearIndex = dateFormatIndices.tripleIndices.getLeft(); - this.monthIndex = dateFormatIndices.tripleIndices.getMiddle(); - this.dayIndex = dateFormatIndices.tripleIndices.getRight(); + this.yearIndex = dateFormatIndices.getYearIndex(); + this.monthIndex = dateFormatIndices.getMonthIndex(); + this.dayIndex = dateFormatIndices.getDayIndex(); this.pattern = NumericPartsPattern.generatePattern(dateDelimiters.getDatesDelimiters(), dateNormalizationExtractorMatchId, dateFormatIndices); @@ -125,7 +123,7 @@ private static Pattern generatePattern(String dateDelimiters, year = "(\\d{2}(?:XX|UU|--|\\?\\?)|\\d{3}(?!\\?)[XU]|\\d{4})"; delimiterDigits = "(?:" + dateDelimiters + "(\\d{2}|XX|UU|(? 
*/ - public enum NumericRangeDateDelimiters implements DateDelimiters { + public enum NumericRangeQualifier implements DateDelimiters, DatesSeparator { //"[XU]" with "-" delimiter, "[\\-XU]" with "./" delimiters - SPACED_DASH_RANGE(" - ", DASH_DOT_SLASH_DELIMITERS, NumericRangeDateDelimiters.DEFAULT_UNSPECIFIED_CHARACTERS), + SPACED_DASH_RANGE(" - ", DASH_DOT_SLASH_DELIMITERS, NumericRangeQualifier.DEFAULT_UNSPECIFIED_CHARACTERS), //"[XU]" with "-" delimiter, "[\\-XU]" with "./" delimiters - PIPE_RANGE("\\|", DASH_DOT_SLASH_DELIMITERS, NumericRangeDateDelimiters.DEFAULT_UNSPECIFIED_CHARACTERS), + PIPE_RANGE("\\|", DASH_DOT_SLASH_DELIMITERS, NumericRangeQualifier.DEFAULT_UNSPECIFIED_CHARACTERS), //For space separator we don't accept unspecified boundaries //Does not exist in XX SPACE_RANGE(" ", DASH_DOT_SLASH_DELIMITERS, null), //"[XU]" DASH_RANGE("-", DOT_SLASH_DELIMITERS, "\\?|\\.\\."), //"[XU]" with "-" delimiter, "[\\-XU]" with "." delimiter - SLASH_RANGE("/", DASH_DOT_DELIMITERS, NumericRangeDateDelimiters.DEFAULT_UNSPECIFIED_CHARACTERS); + SLASH_RANGE("/", DASH_DOT_DELIMITERS, NumericRangeQualifier.DEFAULT_UNSPECIFIED_CHARACTERS); public static final String DEFAULT_UNSPECIFIED_CHARACTERS = "\\?|-|\\.\\."; @@ -215,13 +213,14 @@ public enum NumericRangeDateDelimiters implements DateDelimiters { private final String datesDelimiters; private final String unspecifiedCharacters; - NumericRangeDateDelimiters(String datesSeparator, NumericDateDelimiters datesDelimiters, String unspecifiedCharacters) { + NumericRangeQualifier(String datesSeparator, NumericDateDelimiters datesDelimiters, String unspecifiedCharacters) { this.datesSeparator = datesSeparator; this.datesDelimiters = datesDelimiters.getDatesDelimiters(); this.unspecifiedCharacters = unspecifiedCharacters; } - public String getDatesSeparator() { + @Override + public String getStringRepresentation() { return datesSeparator; } @@ -234,18 +233,4 @@ public String getUnspecifiedCharacters() { return 
unspecifiedCharacters; } } - - /** - * Simple internal enum that contains the indices order of a DMY and YMD date formatting. - */ - enum DatePartsIndices { - DMY_INDICES(ImmutableTriple.of(3, 2, 1)), - YMD_INDICES(ImmutableTriple.of(1, 2, 3)); - - private final Triple tripleIndices; - - DatePartsIndices(Triple tripleIndices) { - this.tripleIndices = tripleIndices; - } - } } diff --git a/metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/dateextractors/AbstractDateExtractor.java b/metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/dateextractors/AbstractDateExtractor.java deleted file mode 100644 index 0538114121..0000000000 --- a/metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/dateextractors/AbstractDateExtractor.java +++ /dev/null @@ -1,82 +0,0 @@ -package eu.europeana.normalization.dates.extraction.dateextractors; - -import static eu.europeana.normalization.dates.DateNormalizationResult.getNoMatchResult; -import static java.lang.String.format; - -import eu.europeana.normalization.dates.DateNormalizationResult; -import eu.europeana.normalization.dates.edtf.DateQualification; -import eu.europeana.normalization.dates.extraction.DateExtractionException; -import java.lang.invoke.MethodHandles; -import java.util.function.Supplier; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -/** - * Abstract class implementing interface {@link DateExtractor} with default functionality for all extractors - */ -public abstract class AbstractDateExtractor implements DateExtractor { - - private static final Logger LOGGER = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); - - /** - * Utility method for calling {@link #extract(String, DateQualification, boolean)} with allowSwitchesDuringValidation as true. - *

    It also captures relevant exceptions so that return is performed

    - * - * @param inputValue the input value - * @param dateQualification the date qualification requested - * @return the date normalization result - */ - @Override - public DateNormalizationResult extractDateProperty(String inputValue, DateQualification dateQualification) { - return getDateNormalizationResult(inputValue, dateQualification, true); - } - - /** - * Utility method for calling {@link #extract(String, DateQualification, boolean)} with allowSwitchesDuringValidation as false. - *

    It also captures relevant exceptions so that return is performed

    - * - * @param inputValue the input value - * @param dateQualification the date qualification requested - * @return the date normalization result - */ - @Override - public DateNormalizationResult extractGenericProperty(String inputValue, DateQualification dateQualification) { - return getDateNormalizationResult(inputValue, dateQualification, false); - } - - private DateNormalizationResult getDateNormalizationResult(String inputValue, DateQualification dateQualification, - boolean flexibleDateBuild) { - DateNormalizationResult dateNormalizationResult = getNoMatchResult(inputValue); - try { - dateNormalizationResult = extract(inputValue, dateQualification, flexibleDateBuild); - } catch (DateExtractionException e) { - LOGGER.debug(format("Date extraction failed %s: ", inputValue), e); - } - //Sanity check to avoid null return. - if (dateNormalizationResult == null) { - dateNormalizationResult = getNoMatchResult(inputValue); - } - return dateNormalizationResult; - } - - /** - * Default method to get the correct date qualification. - *

    If a requested date qualification is requested we then set that, overwriting any other that would otherwise be computed. - * The date qualification will be overwritten if the requested date qualification in non-null and - * non-{@link DateQualification#NO_QUALIFICATION}. Otherwise we compute it with the supplier provided.

    - * - * @param requestedDateQualification the requested date qualification - * @param dateQualificationSupplier the date qualification supplier - * @return the computed date qualification - */ - DateQualification computeDateQualification(DateQualification requestedDateQualification, - Supplier dateQualificationSupplier) { - final DateQualification dateQualification; - if (requestedDateQualification != null && requestedDateQualification != DateQualification.NO_QUALIFICATION) { - dateQualification = requestedDateQualification; - } else { - dateQualification = dateQualificationSupplier.get(); - } - return dateQualification; - } -} diff --git a/metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/dateextractors/BriefRangeDateExtractor.java b/metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/dateextractors/BriefRangeDateExtractor.java deleted file mode 100644 index 5c353611b8..0000000000 --- a/metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/dateextractors/BriefRangeDateExtractor.java +++ /dev/null @@ -1,70 +0,0 @@ -package eu.europeana.normalization.dates.extraction.dateextractors; - -import static eu.europeana.normalization.dates.YearPrecision.CENTURY; -import static eu.europeana.normalization.dates.edtf.DateQualification.NO_QUALIFICATION; -import static eu.europeana.normalization.dates.edtf.DateQualification.UNCERTAIN; - -import eu.europeana.normalization.dates.DateNormalizationExtractorMatchId; -import eu.europeana.normalization.dates.DateNormalizationResult; -import eu.europeana.normalization.dates.edtf.DateQualification; -import eu.europeana.normalization.dates.edtf.InstantEdtfDate; -import eu.europeana.normalization.dates.edtf.InstantEdtfDateBuilder; -import eu.europeana.normalization.dates.edtf.IntervalEdtfDateBuilder; -import eu.europeana.normalization.dates.extraction.DateExtractionException; -import eu.europeana.normalization.dates.sanitize.DateFieldSanitizer; -import 
java.time.Month; -import java.util.regex.Matcher; -import java.util.regex.Pattern; - -/** - * Extractor that matches a date range where the end year includes only the rightmost two digits. - *

    - * The end year in this extractor has to: - *

      - *
    • Be higher than 12 to avoid matching a month value from other extractors.
    • - *
    • Be higher than the two rightmost digits of the start year.
    • - *
    - *

    - *

    - * This pattern needs to be executed before the Edtf extractor because EDTF could potentially match yyyy/MM and yyyy-MM. - * Therefore in this extractor we check only the values that are higher than 12 to avoid a mismatch. - *

    - */ -public class BriefRangeDateExtractor extends AbstractDateExtractor { - - private final Pattern briefRangePattern = Pattern.compile("\\??(\\d{3,4})[\\-/](\\d{2})\\??"); - - public DateNormalizationResult extract(String inputValue, DateQualification requestedDateQualification, - boolean flexibleDateBuild) throws DateExtractionException { - final String sanitizedValue = DateFieldSanitizer.cleanSpacesAndTrim(inputValue); - final DateQualification dateQualification = computeDateQualification(requestedDateQualification, () -> - (sanitizedValue.startsWith("?") || sanitizedValue.endsWith("?")) ? UNCERTAIN : NO_QUALIFICATION); - - DateNormalizationResult dateNormalizationResult = DateNormalizationResult.getNoMatchResult(inputValue); - final Matcher matcher = briefRangePattern.matcher(sanitizedValue); - if (matcher.matches()) { - final int startYearFourDigits = Integer.parseInt(matcher.group(1)); - final int startYearLastTwoDigits = startYearFourDigits % CENTURY.getDuration(); - final int endYearTwoDigits = Integer.parseInt(matcher.group(2)); - final int endYearFourDigits = (startYearFourDigits / CENTURY.getDuration()) * CENTURY.getDuration() + endYearTwoDigits; - - if (endYearTwoDigits > Month.DECEMBER.getValue() && startYearLastTwoDigits < endYearTwoDigits) { - final InstantEdtfDate startDate = new InstantEdtfDateBuilder(startYearFourDigits) - .withDateQualification(dateQualification) - .withFlexibleDateBuild(flexibleDateBuild) - .build(); - - final InstantEdtfDate endDate = new InstantEdtfDateBuilder(endYearFourDigits) - .withDateQualification(dateQualification) - .withFlexibleDateBuild(flexibleDateBuild) - .build(); - - dateNormalizationResult = new DateNormalizationResult(DateNormalizationExtractorMatchId.BRIEF_DATE_RANGE, inputValue, - new IntervalEdtfDateBuilder(startDate, endDate).withFlexibleDateBuild(flexibleDateBuild).build()); - } - } - - return dateNormalizationResult; - } -} - diff --git 
a/metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/dateextractors/CenturyDateExtractor.java b/metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/dateextractors/CenturyDateExtractor.java deleted file mode 100644 index f8c8c46bda..0000000000 --- a/metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/dateextractors/CenturyDateExtractor.java +++ /dev/null @@ -1,150 +0,0 @@ -package eu.europeana.normalization.dates.extraction.dateextractors; - -import static eu.europeana.normalization.dates.YearPrecision.CENTURY; -import static eu.europeana.normalization.dates.edtf.DateQualification.NO_QUALIFICATION; -import static eu.europeana.normalization.dates.edtf.DateQualification.UNCERTAIN; -import static java.util.regex.Pattern.CASE_INSENSITIVE; -import static java.util.regex.Pattern.compile; - -import eu.europeana.normalization.dates.DateNormalizationExtractorMatchId; -import eu.europeana.normalization.dates.DateNormalizationResult; -import eu.europeana.normalization.dates.DateNormalizationResultStatus; -import eu.europeana.normalization.dates.edtf.AbstractEdtfDate; -import eu.europeana.normalization.dates.edtf.DateQualification; -import eu.europeana.normalization.dates.edtf.InstantEdtfDate; -import eu.europeana.normalization.dates.edtf.InstantEdtfDateBuilder; -import eu.europeana.normalization.dates.edtf.IntervalEdtfDateBuilder; -import eu.europeana.normalization.dates.extraction.DateExtractionException; -import eu.europeana.normalization.dates.extraction.RomanToNumber; -import eu.europeana.normalization.dates.sanitize.DateFieldSanitizer; -import java.lang.invoke.MethodHandles; -import java.util.Arrays; -import java.util.function.ToIntFunction; -import java.util.regex.Matcher; -import java.util.regex.Pattern; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -/** - * Extractor that matches a century with a decimal or Roman numerals - *

    The range of values this accepts are from 1-21 including.

    - *

    Examples of some cases: - *

      - *
    • - * Value = 18.. | Outcome = 18XX - * Value = 1st century | Outcome = 00XX - * Value = s. XXI | Outcome = 20XX - * Value = s.II-III | Outcome = 01XX/02XX - *
    • - *
    - *

    - *

    The Roman numerals may also be preceded by an abbreviation of century, for example ‘s. XIX’.

    - *

    Also supports ranges.

    - */ -public class CenturyDateExtractor extends AbstractDateExtractor { - - private static final Logger LOGGER = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); - private static final String NUMERIC_10_TO_21_ENDING_DOTS_REGEX = "(1\\d|2[0-1])\\.{2}"; - private static final String NUMERIC_1_TO_21_SUFFIXED_REGEX = "(2?1st|2nd|3rd|(?:1\\d|[4-9]|20)th)\\scentury"; - private static final String ROMAN_1_TO_21_REGEX = "(X?(?:IX|IV|VI{0,3}|I{1,3})|X|XXI?)"; - private static final String CENTURY_PREFIX = "(?:(?:s|sec|saec)\\s|(?:s|sec|saec)\\.\\s?)?"; - private static final String QUESTION_MARK = "\\??"; - - enum PatternCenturyDateOperation { - PATTERN_YYYY( - compile(QUESTION_MARK + NUMERIC_10_TO_21_ENDING_DOTS_REGEX + QUESTION_MARK, CASE_INSENSITIVE), - Integer::parseInt, DateNormalizationExtractorMatchId.CENTURY_NUMERIC), - PATTERN_ENGLISH( - compile(QUESTION_MARK + NUMERIC_1_TO_21_SUFFIXED_REGEX + QUESTION_MARK, CASE_INSENSITIVE), - century -> (Integer.parseInt(century.substring(0, century.length() - 2)) - 1), - DateNormalizationExtractorMatchId.CENTURY_NUMERIC), - PATTERN_ROMAN( - compile(QUESTION_MARK + CENTURY_PREFIX + ROMAN_1_TO_21_REGEX + QUESTION_MARK, CASE_INSENSITIVE), - century -> (RomanToNumber.romanToDecimal(century) - 1), - DateNormalizationExtractorMatchId.CENTURY_ROMAN), - PATTERN_ROMAN_RANGE( - compile(QUESTION_MARK + CENTURY_PREFIX + ROMAN_1_TO_21_REGEX + "\\s?-\\s?" 
+ ROMAN_1_TO_21_REGEX + QUESTION_MARK, - CASE_INSENSITIVE), century -> (RomanToNumber.romanToDecimal(century) - 1), - DateNormalizationExtractorMatchId.CENTURY_RANGE_ROMAN); - - private final Pattern pattern; - private final ToIntFunction centuryExtractorFunction; - private final DateNormalizationExtractorMatchId dateNormalizationExtractorMatchId; - - PatternCenturyDateOperation(Pattern pattern, ToIntFunction centuryExtractorFunction, - DateNormalizationExtractorMatchId dateNormalizationExtractorMatchId) { - this.pattern = pattern; - this.centuryExtractorFunction = centuryExtractorFunction; - this.dateNormalizationExtractorMatchId = dateNormalizationExtractorMatchId; - } - - public Pattern getPattern() { - return pattern; - } - - public ToIntFunction getCenturyExtractorFunction() { - return centuryExtractorFunction; - } - - public DateNormalizationExtractorMatchId getDateNormalizationExtractorMatchId() { - return dateNormalizationExtractorMatchId; - } - } - - @Override - public DateNormalizationResult extract(String inputValue, DateQualification requestedDateQualification, - boolean flexibleDateBuild) { - return Arrays.stream(PatternCenturyDateOperation.values()) - .map(operation -> { - try { - return extractInstance(inputValue, requestedDateQualification, operation, - flexibleDateBuild); - } catch (DateExtractionException e) { - LOGGER.warn("Failed instance extraction!", e); - } - return DateNormalizationResult.getNoMatchResult(inputValue); - }) - .filter(dateNormalizationResult -> dateNormalizationResult.getDateNormalizationResultStatus() - == DateNormalizationResultStatus.MATCHED).findFirst() - .orElse(DateNormalizationResult.getNoMatchResult(inputValue)); - } - - private DateNormalizationResult extractInstance(String inputValue, DateQualification requestedDateQualification, - PatternCenturyDateOperation patternCenturyDateOperation, - boolean allowSwitchMonthDay) throws DateExtractionException { - final String sanitizedValue = 
DateFieldSanitizer.cleanSpacesAndTrim(inputValue); - final DateQualification dateQualification = computeDateQualification(requestedDateQualification, () -> - (sanitizedValue.startsWith("?") || sanitizedValue.endsWith("?")) ? UNCERTAIN : NO_QUALIFICATION); - - final Matcher matcher = patternCenturyDateOperation.getPattern().matcher(sanitizedValue); - DateNormalizationResult dateNormalizationResult = DateNormalizationResult.getNoMatchResult(inputValue); - if (matcher.matches()) { - AbstractEdtfDate abstractEdtfDate; - InstantEdtfDateBuilder startDatePartBuilder = extractEdtfDatePart(patternCenturyDateOperation, matcher, 1); - InstantEdtfDate startEdtfDate = startDatePartBuilder.withDateQualification(dateQualification) - .withFlexibleDateBuild(allowSwitchMonthDay).build(); - - boolean isInterval = matcher.groupCount() == 2; - if (isInterval) { - InstantEdtfDateBuilder endDatePartBuilder = extractEdtfDatePart(patternCenturyDateOperation, matcher, 2); - InstantEdtfDate endEdtfDate = endDatePartBuilder.withDateQualification(dateQualification) - .withFlexibleDateBuild(allowSwitchMonthDay).build(); - abstractEdtfDate = new IntervalEdtfDateBuilder(startEdtfDate, endEdtfDate).withFlexibleDateBuild(allowSwitchMonthDay) - .build(); - } else { - abstractEdtfDate = startEdtfDate; - } - - dateNormalizationResult = new DateNormalizationResult(patternCenturyDateOperation.getDateNormalizationExtractorMatchId(), - inputValue, abstractEdtfDate); - } - return dateNormalizationResult; - } - - private InstantEdtfDateBuilder extractEdtfDatePart(PatternCenturyDateOperation patternCenturyDateOperation, - Matcher matcher, int group) { - final String century = matcher.group(group); - return new InstantEdtfDateBuilder(patternCenturyDateOperation.getCenturyExtractorFunction().applyAsInt(century)) - .withYearPrecision(CENTURY); - } -} diff --git a/metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/dateextractors/EdtfDateExtractor.java 
b/metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/dateextractors/EdtfDateExtractor.java deleted file mode 100644 index 84c617d165..0000000000 --- a/metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/dateextractors/EdtfDateExtractor.java +++ /dev/null @@ -1,105 +0,0 @@ -package eu.europeana.normalization.dates.extraction.dateextractors; - -import static eu.europeana.normalization.dates.edtf.DateBoundaryType.OPEN; -import static eu.europeana.normalization.dates.edtf.DateBoundaryType.UNKNOWN; -import static eu.europeana.normalization.dates.edtf.DateQualification.CHECK_QUALIFICATION_PATTERN; -import static eu.europeana.normalization.dates.edtf.DateQualification.NO_QUALIFICATION; -import static eu.europeana.normalization.dates.edtf.InstantEdtfDateBuilder.OVER_4_DIGITS_YEAR_PREFIX; -import static eu.europeana.normalization.dates.edtf.IntervalEdtfDate.DATE_INTERVAL_SEPARATOR; - -import eu.europeana.normalization.dates.DateNormalizationExtractorMatchId; -import eu.europeana.normalization.dates.DateNormalizationResult; -import eu.europeana.normalization.dates.edtf.AbstractEdtfDate; -import eu.europeana.normalization.dates.edtf.DateQualification; -import eu.europeana.normalization.dates.edtf.InstantEdtfDate; -import eu.europeana.normalization.dates.edtf.InstantEdtfDateBuilder; -import eu.europeana.normalization.dates.edtf.IntervalEdtfDate; -import eu.europeana.normalization.dates.edtf.IntervalEdtfDateBuilder; -import eu.europeana.normalization.dates.edtf.Iso8601Parser; -import eu.europeana.normalization.dates.extraction.DateExtractionException; -import java.time.temporal.TemporalAccessor; -import java.util.regex.Matcher; -import org.apache.commons.lang3.StringUtils; -import org.apache.commons.lang3.math.NumberUtils; - -/** - * The pattern for EDTF dates and compatible with ISO 8601 dates. - *

    This parser supports partial Level0 and Level1 from the Extended - * Date/Time Format (EDTF) Specification. It only validates the date part of a date and the time if existent it is discarded. - * Specifically from Level1, seasons and Unspecified digit(s) from the right are not supported - *

    - */ -public class EdtfDateExtractor extends AbstractDateExtractor { - - private static final Iso8601Parser ISO_8601_PARSER = new Iso8601Parser(); - - @Override - public DateNormalizationResult extract(String inputValue, DateQualification requestedDateQualification, - boolean flexibleDateBuild) throws DateExtractionException { - if (StringUtils.isEmpty(inputValue)) { - throw new DateExtractionException("Empty argument"); - } - final AbstractEdtfDate edtfDate; - if (inputValue.contains(DATE_INTERVAL_SEPARATOR)) { - edtfDate = extractInterval(inputValue, requestedDateQualification, flexibleDateBuild); - } else { - edtfDate = extractInstant(inputValue, requestedDateQualification, flexibleDateBuild); - } - return new DateNormalizationResult(DateNormalizationExtractorMatchId.EDTF, inputValue, edtfDate); - } - - protected IntervalEdtfDate extractInterval(String dateInput, DateQualification requestedDateQualification, - boolean allowSwitchMonthDay) throws DateExtractionException { - String startPart = dateInput.substring(0, dateInput.indexOf(DATE_INTERVAL_SEPARATOR)); - String endPart = dateInput.substring(dateInput.indexOf(DATE_INTERVAL_SEPARATOR) + 1); - final InstantEdtfDate start = extractInstant(startPart, requestedDateQualification, allowSwitchMonthDay); - final InstantEdtfDate end = extractInstant(endPart, requestedDateQualification, allowSwitchMonthDay); - - //Are both ends unknown or open, then it is not a date - if ((end.getDateBoundaryType() == UNKNOWN || end.getDateBoundaryType() == OPEN) && - (start.getDateBoundaryType() == UNKNOWN || start.getDateBoundaryType() == OPEN)) { - throw new DateExtractionException(dateInput); - } - return new IntervalEdtfDateBuilder(start, end).withFlexibleDateBuild(allowSwitchMonthDay).build(); - } - - protected InstantEdtfDate extractInstant(String dateInput, DateQualification requestedDateQualification, - boolean allowSwitchMonthDay) throws DateExtractionException { - final InstantEdtfDate instantEdtfDate; - if 
(UNKNOWN.getDeserializedRepresentation().equals(dateInput)) { - instantEdtfDate = InstantEdtfDate.getUnknownInstance(); - } else if (OPEN.getDeserializedRepresentation().equals(dateInput)) { - instantEdtfDate = InstantEdtfDate.getOpenInstance(); - } else if (dateInput.startsWith(String.valueOf(OVER_4_DIGITS_YEAR_PREFIX))) { - int year = NumberUtils.toInt(dateInput.substring(1)); - instantEdtfDate = new InstantEdtfDateBuilder(year).withLongYearPrefixedWithY() - .withDateQualification(requestedDateQualification).build(); - } else { - instantEdtfDate = extractInstantEdtfDate(dateInput, requestedDateQualification, allowSwitchMonthDay); - } - return instantEdtfDate; - } - - private static InstantEdtfDate extractInstantEdtfDate(String dateInput, DateQualification requestedDateQualification, - boolean allowSwitchMonthDay) throws DateExtractionException { - Matcher matcher = CHECK_QUALIFICATION_PATTERN.matcher(dateInput); - String dateInputStrippedModifier = dateInput; - DateQualification dateQualification = requestedDateQualification; - - boolean containsQualification = matcher.matches(); - if (containsQualification && (requestedDateQualification == null || requestedDateQualification == NO_QUALIFICATION)) { - final String modifier = matcher.group(1); - if (StringUtils.isNotEmpty(modifier)) { - dateQualification = DateQualification.fromCharacter(String.valueOf(modifier.charAt(0))); - dateInputStrippedModifier = dateInput.substring(0, dateInput.length() - 1); - } - } - - final TemporalAccessor temporalAccessor = ISO_8601_PARSER.parseDatePart(dateInputStrippedModifier); - return new InstantEdtfDateBuilder(temporalAccessor) - .withDateQualification(dateQualification) - .withFlexibleDateBuild(allowSwitchMonthDay) - .build(); - } - -} diff --git a/metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/dateextractors/NumericPartsRangeDateExtractor.java 
b/metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/dateextractors/NumericPartsRangeDateExtractor.java deleted file mode 100644 index ca9dd573c9..0000000000 --- a/metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/dateextractors/NumericPartsRangeDateExtractor.java +++ /dev/null @@ -1,118 +0,0 @@ -package eu.europeana.normalization.dates.extraction.dateextractors; - -import static eu.europeana.normalization.dates.DateNormalizationExtractorMatchId.NUMERIC_ALL_VARIANTS; -import static eu.europeana.normalization.dates.DateNormalizationExtractorMatchId.NUMERIC_ALL_VARIANTS_XX; -import static eu.europeana.normalization.dates.DateNormalizationExtractorMatchId.NUMERIC_RANGE_ALL_VARIANTS; -import static eu.europeana.normalization.dates.DateNormalizationExtractorMatchId.NUMERIC_RANGE_ALL_VARIANTS_XX; - -import eu.europeana.normalization.dates.DateNormalizationExtractorMatchId; -import eu.europeana.normalization.dates.DateNormalizationResult; -import eu.europeana.normalization.dates.DateNormalizationResultStatus; -import eu.europeana.normalization.dates.edtf.DateBoundaryType; -import eu.europeana.normalization.dates.edtf.DateQualification; -import eu.europeana.normalization.dates.edtf.InstantEdtfDate; -import eu.europeana.normalization.dates.edtf.IntervalEdtfDate; -import eu.europeana.normalization.dates.edtf.IntervalEdtfDateBuilder; -import eu.europeana.normalization.dates.extraction.DateExtractionException; -import eu.europeana.normalization.dates.extraction.NumericPartsPattern; -import eu.europeana.normalization.dates.extraction.NumericPartsPattern.NumericRangeDateDelimiters; -import eu.europeana.normalization.dates.sanitize.DateFieldSanitizer; - -/** - * Patterns for numeric date ranges with variations in the separators of date components. - *

    We reuse the already existent {@link NumericPartsDateExtractor} code for the boundaries.

    - */ -public class NumericPartsRangeDateExtractor extends AbstractDateExtractor { - - private static final NumericPartsDateExtractor NUMERIC_WITH_MISSING_PARTS_DATE_EXTRACTOR = new NumericPartsDateExtractor(); - - /** - * Extract the date normalization result for a range. - *

    - * The date is split in two boundaries using the {@link NumericRangeDateDelimiters#values()} as a separator. The result will - * contain the first split that is exactly splitting the original value in two parts(boundaries) and those two boundaries are - * valid parsable boundaries or null if none found. - *

    - * - * @param inputValue the range value to attempt parsing - * @return the date normalization result - */ - @Override - public DateNormalizationResult extract(String inputValue, DateQualification requestedDateQualification, - boolean flexibleDateBuild) throws DateExtractionException { - final String sanitizedValue = DateFieldSanitizer.cleanSpacesAndTrim(inputValue); - DateNormalizationResult startDateResult; - DateNormalizationResult endDateResult; - DateNormalizationResult rangeDate = DateNormalizationResult.getNoMatchResult(inputValue); - for (NumericRangeDateDelimiters numericRangeSpecialCharacters : NumericRangeDateDelimiters.values()) { - // Split with -1 limit does not discard empty splits - final String[] sanitizedDateSplitArray = sanitizedValue.split(numericRangeSpecialCharacters.getDatesSeparator(), -1); - // The sanitizedDateSplitArray has to be exactly in two, and then we can verify. - // This also guarantees that the separator used is not used for unknown characters. - if (sanitizedDateSplitArray.length == 2) { - // Try extraction and verify - startDateResult = extractDateNormalizationResult(sanitizedDateSplitArray[0], numericRangeSpecialCharacters, - requestedDateQualification, - flexibleDateBuild); - endDateResult = extractDateNormalizationResult(sanitizedDateSplitArray[1], numericRangeSpecialCharacters, - requestedDateQualification, flexibleDateBuild); - if (startDateResult.getDateNormalizationResultStatus() == DateNormalizationResultStatus.MATCHED - && endDateResult.getDateNormalizationResultStatus() == DateNormalizationResultStatus.MATCHED - && !areYearsAmbiguous((InstantEdtfDate) startDateResult.getEdtfDate(), (InstantEdtfDate) endDateResult.getEdtfDate(), - numericRangeSpecialCharacters)) { - - final DateNormalizationExtractorMatchId dateNormalizationExtractorMatchId = - getDateNormalizationExtractorId(startDateResult, endDateResult); - final IntervalEdtfDate intervalEdtfDate = new IntervalEdtfDateBuilder((InstantEdtfDate) 
startDateResult.getEdtfDate(), - (InstantEdtfDate) endDateResult.getEdtfDate()).withFlexibleDateBuild(flexibleDateBuild).build(); - rangeDate = new DateNormalizationResult(dateNormalizationExtractorMatchId, inputValue, intervalEdtfDate); - break; - } - } - } - return rangeDate; - } - - /** - * Captures the ambiguous case of "198-?". - * - * @param startDate the start date of a range - * @param endDate the end date of the range - * @param numericRangeSpecialCharacters the date separator of the range - * @return true if the range is ambiguous - */ - private boolean areYearsAmbiguous(InstantEdtfDate startDate, InstantEdtfDate endDate, - NumericRangeDateDelimiters numericRangeSpecialCharacters) { - boolean isAmbiguous = false; - if (numericRangeSpecialCharacters == NumericRangeDateDelimiters.DASH_RANGE) { - final boolean isStartDeclared = startDate.getDateBoundaryType() == DateBoundaryType.DECLARED; - final boolean isStartThreeDigit = - isStartDeclared && Integer.toString(startDate.getYear().getValue()).matches("\\d{3}"); - if (isStartThreeDigit && endDate.getDateBoundaryType() == DateBoundaryType.OPEN) { - isAmbiguous = true; - } - } - return isAmbiguous; - } - - private DateNormalizationResult extractDateNormalizationResult(String dateString, - NumericRangeDateDelimiters numericRangeSpecialCharacters, DateQualification requestedDateQualification, - boolean allowSwitchMonthDay) throws DateExtractionException { - final DateNormalizationResult dateNormalizationResult; - if (numericRangeSpecialCharacters.getUnspecifiedCharacters() != null && dateString.matches( - numericRangeSpecialCharacters.getUnspecifiedCharacters())) { - dateNormalizationResult = new DateNormalizationResult(NUMERIC_ALL_VARIANTS, dateString, InstantEdtfDate.getOpenInstance()); - } else { - dateNormalizationResult = NUMERIC_WITH_MISSING_PARTS_DATE_EXTRACTOR.extract(dateString, requestedDateQualification, - NumericPartsPattern.NUMERIC_RANGE_SET, allowSwitchMonthDay); - } - return 
dateNormalizationResult; - } - - private static DateNormalizationExtractorMatchId getDateNormalizationExtractorId(DateNormalizationResult startDate, - DateNormalizationResult endDate) { - final boolean isStartXX = startDate.getDateNormalizationExtractorMatchId() == NUMERIC_ALL_VARIANTS_XX; - final boolean isEndXX = endDate.getDateNormalizationExtractorMatchId() == NUMERIC_ALL_VARIANTS_XX; - return isStartXX || isEndXX ? NUMERIC_RANGE_ALL_VARIANTS_XX : NUMERIC_RANGE_ALL_VARIANTS; - } -} diff --git a/metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/dateextractors/PatternBcAdDateExtractor.java b/metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/dateextractors/PatternBcAdDateExtractor.java deleted file mode 100644 index 803d25e00a..0000000000 --- a/metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/dateextractors/PatternBcAdDateExtractor.java +++ /dev/null @@ -1,124 +0,0 @@ -package eu.europeana.normalization.dates.extraction.dateextractors; - -import eu.europeana.normalization.dates.DateNormalizationExtractorMatchId; -import eu.europeana.normalization.dates.DateNormalizationResult; -import eu.europeana.normalization.dates.edtf.DateQualification; -import eu.europeana.normalization.dates.edtf.InstantEdtfDate; -import eu.europeana.normalization.dates.edtf.InstantEdtfDateBuilder; -import eu.europeana.normalization.dates.edtf.IntervalEdtfDateBuilder; -import eu.europeana.normalization.dates.extraction.DateExtractionException; -import java.util.HashSet; -import java.util.regex.Matcher; -import java.util.regex.Pattern; -import java.util.stream.Collectors; - -/** - * A year with an indication of the era, for example ‘3000 BC’. Currently, the normalisation process recognizes ‘BC/AD’ and - * ‘AC/DC’, but the abbreviations used in other languages will be supported in the future. Or a date range where the start/end - * years contain an indication of the era. 
- */ -public class PatternBcAdDateExtractor extends AbstractDateExtractor { - - static final HashSet bcAbbreviations = new HashSet<>(); - - static { - bcAbbreviations.add("B\\.?C".toLowerCase()); - bcAbbreviations.add("A\\.?C".toLowerCase()); - bcAbbreviations.add("v\\.?Chr".toLowerCase()); - bcAbbreviations.add("vC".toLowerCase()); - bcAbbreviations.add("avant J\\.?-C".toLowerCase()); - bcAbbreviations.add("av[\\. ]J\\.?-C".toLowerCase()); - //bcAbbreviations.add("eKr"); removed due to ambiguity - bcAbbreviations.add("f\\.?Kr".toLowerCase()); - bcAbbreviations.add("π\\.*Χ".toLowerCase()); - } - - static final HashSet adAbbreviations = new HashSet<>(); - - static { - adAbbreviations.add("A\\.?D".toLowerCase()); - adAbbreviations.add("D\\.?C".toLowerCase()); - adAbbreviations.add("n\\.?Chr".toLowerCase()); - adAbbreviations.add("nC".toLowerCase()); - adAbbreviations.add("après J-C".toLowerCase()); - adAbbreviations.add("apres J-C".toLowerCase()); - adAbbreviations.add("ap[\\. ]J-C".toLowerCase()); - //adAbbreviations.add("eKr"); removed due to ambiguity - adAbbreviations.add("j\\.?Kr".toLowerCase()); - adAbbreviations.add("μ\\.?Χ".toLowerCase()); - } - - static final HashSet bcAbbreviationsPatterns = new HashSet<>(); - - static { - for (String abbrev : bcAbbreviations) { - bcAbbreviationsPatterns.add(Pattern.compile(abbrev, Pattern.CASE_INSENSITIVE)); - } - } - - Pattern patYyyy; - Pattern patRange; - - public PatternBcAdDateExtractor() { - String patYearBcAd = "(?\\d{2,4})\\s*(?"; - patYearBcAd += bcAbbreviations.stream().collect(Collectors.joining("|")); - patYearBcAd += adAbbreviations.stream().collect(Collectors.joining("|")); - patYearBcAd = patYearBcAd.substring(0, patYearBcAd.length() - 1) + ")\\.?"; - - patYyyy = Pattern.compile(patYearBcAd, Pattern.CASE_INSENSITIVE); - patRange = Pattern.compile( - patYearBcAd + "\\s*[\\-\\/]\\s*" + patYearBcAd.replace("", "").replace("", ""), - Pattern.CASE_INSENSITIVE); - } - - @Override - public DateNormalizationResult 
extract(String inputValue, DateQualification requestedDateQualification, - boolean flexibleDateBuild) throws DateExtractionException { - Matcher m = patYyyy.matcher(inputValue); - if (m.matches()) { - final InstantEdtfDateBuilder instantEdtfDateBuilder; - if (bcAbbreviations.contains(m.group("era").toLowerCase())) { - instantEdtfDateBuilder = new InstantEdtfDateBuilder(-Integer.parseInt(m.group("year"))); - } else { - instantEdtfDateBuilder = new InstantEdtfDateBuilder(Integer.parseInt(m.group("year"))); - } - return new DateNormalizationResult(DateNormalizationExtractorMatchId.BC_AD, inputValue, - instantEdtfDateBuilder.withDateQualification(requestedDateQualification).withFlexibleDateBuild( - flexibleDateBuild) - .build()); - } - m = patRange.matcher(inputValue); - if (m.matches()) { - final InstantEdtfDateBuilder startDatePartBuilder; - if (isBc(m.group("era"))) { - startDatePartBuilder = new InstantEdtfDateBuilder(-Integer.parseInt(m.group("year"))); - } else { - startDatePartBuilder = new InstantEdtfDateBuilder(Integer.parseInt(m.group("year"))); - } - InstantEdtfDate start = startDatePartBuilder.withDateQualification(requestedDateQualification) - .withFlexibleDateBuild(flexibleDateBuild).build(); - - final InstantEdtfDateBuilder endDatePartBuilder; - if (isBc(m.group("era2"))) { - endDatePartBuilder = new InstantEdtfDateBuilder(-Integer.parseInt(m.group("year2"))); - } else { - endDatePartBuilder = new InstantEdtfDateBuilder(Integer.parseInt(m.group("year2"))); - } - InstantEdtfDate end = endDatePartBuilder.withDateQualification(requestedDateQualification) - .withFlexibleDateBuild(flexibleDateBuild).build(); - - return new DateNormalizationResult(DateNormalizationExtractorMatchId.BC_AD, inputValue, - new IntervalEdtfDateBuilder(start, end).withFlexibleDateBuild(flexibleDateBuild).build()); - } - return DateNormalizationResult.getNoMatchResult(inputValue); - } - - private boolean isBc(String abbreviation) { - for (Pattern pat : bcAbbreviationsPatterns) { - if 
(pat.matcher(abbreviation).matches()) { - return true; - } - } - return false; - } -} diff --git a/metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/dateextractors/PatternFormatedFullDateDateExtractor.java b/metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/dateextractors/PatternFormatedFullDateDateExtractor.java deleted file mode 100644 index 3754efe377..0000000000 --- a/metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/dateextractors/PatternFormatedFullDateDateExtractor.java +++ /dev/null @@ -1,66 +0,0 @@ -package eu.europeana.normalization.dates.extraction.dateextractors; - -import eu.europeana.normalization.dates.DateNormalizationExtractorMatchId; -import eu.europeana.normalization.dates.DateNormalizationResult; -import eu.europeana.normalization.dates.edtf.DateQualification; -import eu.europeana.normalization.dates.edtf.InstantEdtfDate; -import eu.europeana.normalization.dates.edtf.InstantEdtfDateBuilder; -import eu.europeana.normalization.dates.extraction.DateExtractionException; -import eu.europeana.normalization.dates.extraction.MonthMultilingual; -import java.util.regex.Matcher; -import java.util.regex.Pattern; - -/** - * Patterns for date formats that are well-structured but do not follow a particular standard - */ -public class PatternFormatedFullDateDateExtractor extends AbstractDateExtractor { - - MonthMultilingual monthNames = new MonthMultilingual(); - - // "Thu Dec 31 01:00:00 CET 1863","31 Dec 1863" - // month day hour minute second year - Pattern patFormatedDate = Pattern - .compile("\\w{3} (\\w{3}) (\\d{2}) (\\d{2}):(\\d{2}):(\\d{2}) \\w{3,4} (\\d{1,4})"); - - // 2020-06-21 13:43:26 UTC - // year month day hour minute second - Pattern patFormatedDate2 = Pattern - .compile("(\\d{4})-(\\d{2})-(\\d{2}) (\\d{2}):(\\d{2}):(\\d{2}) \\w{3,4}\\s?(\\d{0,4})"); - - // 2018-03-27 09:08:34 - // year month day hour minute second - Pattern patFormatedDate3 = 
Pattern.compile("(\\d{4})-(\\d{2})-(\\d{2}) (\\d{2}):(\\d{2}):(\\d{2})(\\.\\d{1,3})?"); - - @Override - public DateNormalizationResult extract(String inputValue, DateQualification requestedDateQualification, - boolean flexibleDateBuild) throws DateExtractionException { - final DateQualification dateQualification = computeDateQualification(requestedDateQualification, - () -> DateQualification.NO_QUALIFICATION); - - Matcher m = patFormatedDate2.matcher(inputValue); - if (m.matches()) { - final InstantEdtfDate datePart = new InstantEdtfDateBuilder(Integer.parseInt(m.group(1))) - .withMonth(Integer.parseInt(m.group(2))) - .withDay(Integer.parseInt(m.group(3))) - .withDateQualification(dateQualification).withFlexibleDateBuild(flexibleDateBuild).build(); - return new DateNormalizationResult(DateNormalizationExtractorMatchId.FORMATTED_FULL_DATE, inputValue, datePart); - } - m = patFormatedDate.matcher(inputValue); - if (m.matches()) { - final InstantEdtfDate datePart = new InstantEdtfDateBuilder(Integer.parseInt(m.group(6))) - .withMonth(monthNames.getMonthIndexValue(m.group(1))) - .withDay(Integer.parseInt(m.group(2))) - .withDateQualification(dateQualification).withFlexibleDateBuild(flexibleDateBuild).build(); - return new DateNormalizationResult(DateNormalizationExtractorMatchId.FORMATTED_FULL_DATE, inputValue, datePart); - } - m = patFormatedDate3.matcher(inputValue); - if (m.matches()) { - final InstantEdtfDate datePart = new InstantEdtfDateBuilder(Integer.parseInt(m.group(1))) - .withMonth(Integer.parseInt(m.group(2))) - .withDay(Integer.parseInt(m.group(3))) - .withDateQualification(dateQualification).withFlexibleDateBuild(flexibleDateBuild).build(); - return new DateNormalizationResult(DateNormalizationExtractorMatchId.FORMATTED_FULL_DATE, inputValue, datePart); - } - return DateNormalizationResult.getNoMatchResult(inputValue); - } -} diff --git 
a/metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/dateextractors/PatternLongNegativeYearDateExtractor.java b/metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/dateextractors/PatternLongNegativeYearDateExtractor.java deleted file mode 100644 index 7e8b9dc942..0000000000 --- a/metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/dateextractors/PatternLongNegativeYearDateExtractor.java +++ /dev/null @@ -1,60 +0,0 @@ -package eu.europeana.normalization.dates.extraction.dateextractors; - -import static eu.europeana.normalization.dates.edtf.DateQualification.NO_QUALIFICATION; -import static eu.europeana.normalization.dates.edtf.DateQualification.UNCERTAIN; - -import eu.europeana.normalization.dates.DateNormalizationExtractorMatchId; -import eu.europeana.normalization.dates.DateNormalizationResult; -import eu.europeana.normalization.dates.edtf.DateQualification; -import eu.europeana.normalization.dates.edtf.InstantEdtfDate; -import eu.europeana.normalization.dates.edtf.InstantEdtfDateBuilder; -import eu.europeana.normalization.dates.edtf.IntervalEdtfDate; -import eu.europeana.normalization.dates.edtf.IntervalEdtfDateBuilder; -import eu.europeana.normalization.dates.extraction.DateExtractionException; -import java.util.regex.Matcher; -import java.util.regex.Pattern; - -/** - * A year before 1 AD with more than 4 digits. This pattern is typically used in archaeological contexts. The year may contain - * between 5 and 9 digits. Aso includes the pattern for ranges of this kind of years. 
- */ -public class PatternLongNegativeYearDateExtractor extends AbstractDateExtractor { - - Pattern patYyyyyy = Pattern.compile("\\s*(?\\?)?(?-\\d{5,9})(?\\?)?\\s*", - Pattern.CASE_INSENSITIVE); - Pattern patYyyyyyRange = Pattern.compile( - "\\s*(?\\?)?(?-\\d{5,9})\\s*/\\s*(?-\\d{5,9})(?\\?)?\\s*", - Pattern.CASE_INSENSITIVE); - - @Override - public DateNormalizationResult extract(String inputValue, DateQualification requestedDateQualification, - boolean flexibleDateBuild) throws DateExtractionException { - final DateQualification dateQualification; - - final Matcher m = patYyyyyy.matcher(inputValue); - if (m.matches()) { - dateQualification = - computeDateQualification(requestedDateQualification, - () -> (m.group("uncertain") != null || m.group("uncertain2") != null) ? UNCERTAIN : NO_QUALIFICATION); - - final InstantEdtfDate datePart = new InstantEdtfDateBuilder(Integer.parseInt(m.group("year"))).withDateQualification( - dateQualification).withFlexibleDateBuild(flexibleDateBuild).build(); - return new DateNormalizationResult(DateNormalizationExtractorMatchId.LONG_NEGATIVE_YEAR, inputValue, datePart); - } - final Matcher m2 = patYyyyyyRange.matcher(inputValue); - if (m2.matches()) { - dateQualification = - computeDateQualification(requestedDateQualification, - () -> (m2.group("uncertain") != null || m2.group("uncertain2") != null) ? 
UNCERTAIN : NO_QUALIFICATION); - - final InstantEdtfDate startDatePart = new InstantEdtfDateBuilder(Integer.parseInt(m2.group("year"))).withDateQualification( - dateQualification).withFlexibleDateBuild(flexibleDateBuild).build(); - final InstantEdtfDate endDatePart = new InstantEdtfDateBuilder(Integer.parseInt(m2.group("year2"))).withDateQualification( - dateQualification).withFlexibleDateBuild(flexibleDateBuild).build(); - IntervalEdtfDate intervalEdtfDate = new IntervalEdtfDateBuilder(startDatePart, endDatePart).withFlexibleDateBuild( - flexibleDateBuild).build(); - return new DateNormalizationResult(DateNormalizationExtractorMatchId.LONG_NEGATIVE_YEAR, inputValue, intervalEdtfDate); - } - return DateNormalizationResult.getNoMatchResult(inputValue); - } -} diff --git a/metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/dateextractors/PatternMonthNameDateExtractor.java b/metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/dateextractors/PatternMonthNameDateExtractor.java deleted file mode 100644 index d5e594cc6e..0000000000 --- a/metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/dateextractors/PatternMonthNameDateExtractor.java +++ /dev/null @@ -1,92 +0,0 @@ -package eu.europeana.normalization.dates.extraction.dateextractors; - -import eu.europeana.normalization.dates.DateNormalizationExtractorMatchId; -import eu.europeana.normalization.dates.DateNormalizationResult; -import eu.europeana.normalization.dates.edtf.DateQualification; -import eu.europeana.normalization.dates.edtf.InstantEdtfDate; -import eu.europeana.normalization.dates.edtf.InstantEdtfDateBuilder; -import eu.europeana.normalization.dates.extraction.DateExtractionException; -import eu.europeana.normalization.dates.extraction.MonthMultilingual; -import java.time.Month; -import java.util.HashMap; -import java.util.regex.Matcher; -import java.util.regex.Pattern; - -/** - * A date where the month is specified by its name or 
an abbreviation. Supports all the official languages of the European Union - */ -public class PatternMonthNameDateExtractor extends AbstractDateExtractor { - - HashMap patternDayMonthYear = new HashMap<>(12); - HashMap patternMonthDayYear = new HashMap<>(12); - HashMap patternMonthYear = new HashMap<>(12); - - public PatternMonthNameDateExtractor() { - MonthMultilingual months = new MonthMultilingual(); - for (Month month : Month.values()) { - String monthNamesPattern = null; - for (String m : months.getMonthStrings(month)) { - if (monthNamesPattern == null) { - monthNamesPattern = "(?"; - } else { - monthNamesPattern += "|"; - } - monthNamesPattern += m.replaceAll("\\.", "\\."); - } - monthNamesPattern += ")"; - - patternDayMonthYear - .put(month, - Pattern.compile( - "\\s*(?\\d\\d?)[ .,]([a-zA-Z]{0,2}[ .,])?" + monthNamesPattern - + "[ .,]([a-zA-Z]{0,2}[ .,])?(?\\d{4})\\s*", - Pattern.CASE_INSENSITIVE)); - patternMonthDayYear.put(month, Pattern.compile("\\s*" + monthNamesPattern - + "[ .,]([a-zA-Z]{0,2}[ .,])?(?\\d\\d?)[ .,][a-zA-Z]{0,2}[ .,](?\\d{4})\\s*", - Pattern.CASE_INSENSITIVE)); - patternMonthYear.put(month, - Pattern.compile("\\s*" + monthNamesPattern + "[ .,]([a-zA-Z]{0,2}[ .,])?(?\\d{4})\\s*", - Pattern.CASE_INSENSITIVE)); - } - } - - @Override - public DateNormalizationResult extract(String inputValue, DateQualification requestedDateQualification, - boolean flexibleDateBuild) throws DateExtractionException { - final DateQualification dateQualification = computeDateQualification(requestedDateQualification, - () -> DateQualification.NO_QUALIFICATION); - - for (Month month : Month.values()) { - Matcher m = patternDayMonthYear.get(month).matcher(inputValue); - if (m.matches()) { - final InstantEdtfDate datePart = new InstantEdtfDateBuilder(Integer.parseInt(m.group("year"))) - .withMonth(month.getValue()) - .withDay(Integer.parseInt(m.group("day"))) - .withDateQualification(dateQualification) - .withFlexibleDateBuild(flexibleDateBuild) - .build(); - return 
new DateNormalizationResult(DateNormalizationExtractorMatchId.MONTH_NAME, inputValue, datePart); - } - m = patternMonthDayYear.get(month).matcher(inputValue); - if (m.matches()) { - final InstantEdtfDate datePart = new InstantEdtfDateBuilder(Integer.parseInt(m.group("year"))) - .withMonth(month.getValue()) - .withDay(Integer.parseInt(m.group("day"))) - .withDateQualification(dateQualification) - .withFlexibleDateBuild(flexibleDateBuild) - .build(); - return new DateNormalizationResult(DateNormalizationExtractorMatchId.MONTH_NAME, inputValue, datePart); - } - m = patternMonthYear.get(month).matcher(inputValue); - if (m.matches()) { - final InstantEdtfDate datePart = new InstantEdtfDateBuilder(Integer.parseInt(m.group("year"))) - .withMonth(month.getValue()) - .withDateQualification(dateQualification) - .withFlexibleDateBuild(flexibleDateBuild) - .build(); - return new DateNormalizationResult(DateNormalizationExtractorMatchId.MONTH_NAME, inputValue, datePart); - } - } - return DateNormalizationResult.getNoMatchResult(inputValue); - } -} diff --git a/metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/extractors/AbstractDateExtractor.java b/metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/extractors/AbstractDateExtractor.java new file mode 100644 index 0000000000..ab6a047b70 --- /dev/null +++ b/metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/extractors/AbstractDateExtractor.java @@ -0,0 +1,75 @@ +package eu.europeana.normalization.dates.extraction.extractors; + +import static eu.europeana.normalization.dates.DateNormalizationResult.getNoMatchResult; +import static eu.europeana.normalization.dates.edtf.DateQualification.UNCERTAIN; +import static java.lang.String.format; + +import eu.europeana.normalization.dates.DateNormalizationResult; +import eu.europeana.normalization.dates.edtf.DateQualification; +import eu.europeana.normalization.dates.extraction.DateExtractionException; +import 
eu.europeana.normalization.dates.sanitize.DateFieldSanitizer; +import java.lang.invoke.MethodHandles; +import java.util.EnumSet; +import java.util.Set; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * Abstract class implementing interface {@link DateExtractor} with default functionality for all extractors + */ +public abstract class AbstractDateExtractor implements DateExtractor { + + private static final Logger LOGGER = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); + static final String OPTIONAL_QUESTION_MARK_REGEX = "\\??"; + + /** + * Reusable default checking of Date qualification on an input. + * + * @param inputValue the input value + * @return the date qualification + */ + public Set getQualification(String inputValue) { + final Set dateQualifications = EnumSet.noneOf(DateQualification.class); + if (inputValue.startsWith("?") || inputValue.endsWith("?")) { + dateQualifications.add(UNCERTAIN); + } + return dateQualifications; + } + + /** + * Utility method for calling {@link DateExtractor#extract(String, boolean)} with flexibleDateBuild as true. + *

    It also catches any extraction exception so that a no-match result is returned instead of the exception being propagated

    + * + * @param inputValue the input value + * @return the date normalization result + */ + @Override + public DateNormalizationResult extractDateProperty(String inputValue) { + return getDateNormalizationResult(inputValue, true); + } + + /** + * Utility method for calling {@link DateExtractor#extract(String, boolean)} with flexibleDateBuild as false. + *

    It also catches any extraction exception so that a no-match result is returned instead of the exception being propagated

    + * + * @param inputValue the input value + * @return the date normalization result + */ + @Override + public DateNormalizationResult extractGenericProperty(String inputValue) { + return getDateNormalizationResult(inputValue, false); + } + + private DateNormalizationResult getDateNormalizationResult(String inputValue, boolean flexibleDateBuild) { + final String sanitizedValue = DateFieldSanitizer.cleanSpacesAndTrim(inputValue); + DateNormalizationResult dateNormalizationResult; + try { + dateNormalizationResult = extract(sanitizedValue, flexibleDateBuild); + } catch (DateExtractionException e) { + LOGGER.debug(format("Date extraction failed %s: ", sanitizedValue), e); + dateNormalizationResult = getNoMatchResult(inputValue); + } + + return dateNormalizationResult; + } +} diff --git a/metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/extractors/AbstractRangeDateExtractor.java b/metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/extractors/AbstractRangeDateExtractor.java new file mode 100644 index 0000000000..92a354ef5b --- /dev/null +++ b/metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/extractors/AbstractRangeDateExtractor.java @@ -0,0 +1,69 @@ +package eu.europeana.normalization.dates.extraction.extractors; + +import eu.europeana.normalization.dates.DateNormalizationExtractorMatchId; +import eu.europeana.normalization.dates.DateNormalizationResult; +import eu.europeana.normalization.dates.edtf.InstantEdtfDate; +import eu.europeana.normalization.dates.edtf.IntervalEdtfDate; +import eu.europeana.normalization.dates.edtf.IntervalEdtfDateBuilder; +import eu.europeana.normalization.dates.extraction.DateExtractionException; +import eu.europeana.normalization.dates.extraction.DatesSeparator; +import eu.europeana.normalization.dates.sanitize.DateFieldSanitizer; +import java.util.Arrays; +import java.util.List; +import java.util.stream.Collectors; + +/** + * The abstract class adding 
the option to a reusable range extractor functionality. + *

    It is a generic way to capture ranges for all implementations. It is based on required methods being implemented from + * {@link RangeDateExtractor} interface.

    + * + * @param the object containing delimiters/separators for dates + */ +public abstract class AbstractRangeDateExtractor extends AbstractDateExtractor implements + RangeDateExtractor { + + public static final int KEEP_EMPTY_SPLITS_LIMIT_VALUE = -1; + /** + * The date split has to be exactly two. This also guarantees that the separator used is not used for unknown characters. + */ + public static final int VALID_SPLIT_SIZE = 2; + + /** + * Extract the date normalization result for a range. + *

    + * The date is split in two boundaries using the {@link T} to provide the separators. The result will contain the first split + * that is exactly splitting the original value in two parts(boundaries) and those two boundaries are valid parsable boundaries + * or null if none found. + *

    + * + * @param inputValue the range value to attempt parsing + * @param flexibleDateBuild the flag indicating if during creating of the dates we are flexible with validation + * @return the date normalization result + * @throws DateExtractionException if anything happened during the extraction of the date + */ + @Override + public DateNormalizationResult extract(String inputValue, boolean flexibleDateBuild) throws DateExtractionException { + DateNormalizationResult rangeDate = DateNormalizationResult.getNoMatchResult(inputValue); + for (T rangeDateQualifier : getRangeDateQualifiers()) { + final List sanitizedDateList = + Arrays.stream(inputValue.split(rangeDateQualifier.getStringRepresentation(), KEEP_EMPTY_SPLITS_LIMIT_VALUE)) + .map(DateFieldSanitizer::cleanSpacesAndTrim).collect( + Collectors.toList()); + if (sanitizedDateList.size() == VALID_SPLIT_SIZE) { + final DateNormalizationResultRangePair dateNormalizationResultRangePair = extractDateNormalizationResult( + sanitizedDateList.get(0), sanitizedDateList.get(1), rangeDateQualifier, flexibleDateBuild); + final DateNormalizationResult startResult = dateNormalizationResultRangePair.getStartDateNormalizationResult(); + final DateNormalizationResult endResult = dateNormalizationResultRangePair.getEndDateNormalizationResult(); + if (isRangeMatchSuccess(rangeDateQualifier, startResult, endResult)) { + final DateNormalizationExtractorMatchId dateNormalizationExtractorMatchId = + getDateNormalizationExtractorId(startResult, endResult); + final IntervalEdtfDate intervalEdtfDate = new IntervalEdtfDateBuilder((InstantEdtfDate) startResult.getEdtfDate(), + (InstantEdtfDate) endResult.getEdtfDate()).withAllowStartEndSwap(flexibleDateBuild).build(); + rangeDate = new DateNormalizationResult(dateNormalizationExtractorMatchId, inputValue, intervalEdtfDate); + break; + } + } + } + return rangeDate; + } +} diff --git 
a/metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/extractors/BcAdDateExtractor.java b/metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/extractors/BcAdDateExtractor.java new file mode 100644 index 0000000000..81e51a137e --- /dev/null +++ b/metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/extractors/BcAdDateExtractor.java @@ -0,0 +1,70 @@ +package eu.europeana.normalization.dates.extraction.extractors; + +import static eu.europeana.normalization.dates.DateNormalizationResult.getNoMatchResult; +import static java.util.regex.Pattern.compile; + +import eu.europeana.normalization.dates.DateNormalizationExtractorMatchId; +import eu.europeana.normalization.dates.DateNormalizationResult; +import eu.europeana.normalization.dates.edtf.InstantEdtfDateBuilder; +import eu.europeana.normalization.dates.extraction.DateExtractionException; +import eu.europeana.normalization.dates.extraction.EuropeanLanguage; +import java.text.DateFormatSymbols; +import java.util.HashSet; +import java.util.Set; +import java.util.regex.Matcher; +import java.util.regex.Pattern; +import java.util.stream.Collectors; +import java.util.stream.Stream; + +/** + * A year with an indication of the era in european languages + *

    + * Some examples: + *

      + *
    • 1989 BC
    • + *
    • 1989 AD
    • + *
    • 1989 π.Χ.
    • + *
    • 1989 μ.Χ.
    • + *
    + *

    + */ +public class BcAdDateExtractor extends AbstractDateExtractor { + + private static final String YEAR_REGEX = "(\\d{1,4})"; + private static final String DELIMITERS_REGEX = " "; + private static final Set adAbbreviations = new HashSet<>(); + private static final Pattern pattern; + + static { + final Set bcAbbreviations = new HashSet<>(); + for (EuropeanLanguage europeanLanguage : EuropeanLanguage.values()) { + final DateFormatSymbols symbols = DateFormatSymbols.getInstance(europeanLanguage.getLocale()); + bcAbbreviations.add(symbols.getEras()[0]); + adAbbreviations.add(symbols.getEras()[1]); + } + final String abbreviationsJoinedValues = Stream.concat(bcAbbreviations.stream(), adAbbreviations.stream()) + .map(Pattern::quote) + .collect(Collectors.joining("|", "(", ")")); + pattern = compile(String.join(DELIMITERS_REGEX, YEAR_REGEX, abbreviationsJoinedValues), Pattern.CASE_INSENSITIVE); + } + + @Override + public DateNormalizationResult extract(String inputValue, boolean allowDayMonthSwap) throws DateExtractionException { + DateNormalizationResult dateNormalizationResult = getNoMatchResult(inputValue); + + Matcher matcher = pattern.matcher(inputValue); + if (matcher.matches()) { + final int year = Integer.parseInt(matcher.group(1)); + //Year should not be 0 on an era + if (year != 0) { + final boolean isAd = adAbbreviations.contains(matcher.group(2)); + int yearSign = isAd ? 1 : -1; + int yearAdjusted = (isAd ? 
year : (year - 1)) * yearSign; + final InstantEdtfDateBuilder instantEdtfDateBuilder = new InstantEdtfDateBuilder(yearAdjusted); + dateNormalizationResult = new DateNormalizationResult(DateNormalizationExtractorMatchId.BC_AD, inputValue, + instantEdtfDateBuilder.build()); + } + } + return dateNormalizationResult; + } +} diff --git a/metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/extractors/BcAdRangeDateExtractor.java b/metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/extractors/BcAdRangeDateExtractor.java new file mode 100644 index 0000000000..171e8b92fe --- /dev/null +++ b/metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/extractors/BcAdRangeDateExtractor.java @@ -0,0 +1,44 @@ +package eu.europeana.normalization.dates.extraction.extractors; + +import eu.europeana.normalization.dates.DateNormalizationExtractorMatchId; +import eu.europeana.normalization.dates.DateNormalizationResult; +import eu.europeana.normalization.dates.DateNormalizationResultStatus; +import eu.europeana.normalization.dates.extraction.DateExtractionException; +import eu.europeana.normalization.dates.extraction.DefaultDatesSeparator; +import java.util.List; + +/** + * Extractor for BC and AD date ranges with variations in the separators of date components. + *

    We reuse the existing {@link BcAdDateExtractor} code for the boundaries.

    + */ +public class BcAdRangeDateExtractor extends AbstractRangeDateExtractor { + + private static final BcAdDateExtractor BC_AD_DATE_EXTRACTOR = new BcAdDateExtractor(); + + @Override + public DateNormalizationResultRangePair extractDateNormalizationResult(String startString, String endString, + DefaultDatesSeparator rangeDateDelimiters, + boolean allowDayMonthSwap) throws DateExtractionException { + return new DateNormalizationResultRangePair( + BC_AD_DATE_EXTRACTOR.extract(startString, allowDayMonthSwap), + BC_AD_DATE_EXTRACTOR.extract(endString, allowDayMonthSwap)); + } + + @Override + public List getRangeDateQualifiers() { + return List.of(DefaultDatesSeparator.values()); + } + + @Override + public boolean isRangeMatchSuccess(DefaultDatesSeparator rangeDateDelimiters, DateNormalizationResult startDateResult, + DateNormalizationResult endDateResult) { + return startDateResult.getDateNormalizationResultStatus() == DateNormalizationResultStatus.MATCHED + && endDateResult.getDateNormalizationResultStatus() == DateNormalizationResultStatus.MATCHED; + } + + @Override + public DateNormalizationExtractorMatchId getDateNormalizationExtractorId(DateNormalizationResult startDateResult, + DateNormalizationResult endDateResult) { + return DateNormalizationExtractorMatchId.BC_AD; + } +} diff --git a/metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/extractors/BriefRangeDateExtractor.java b/metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/extractors/BriefRangeDateExtractor.java new file mode 100644 index 0000000000..69353a8b7f --- /dev/null +++ b/metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/extractors/BriefRangeDateExtractor.java @@ -0,0 +1,125 @@ +package eu.europeana.normalization.dates.extraction.extractors; + +import static eu.europeana.normalization.dates.DateNormalizationResult.getNoMatchResult; +import static eu.europeana.normalization.dates.YearPrecision.CENTURY; + +import 
eu.europeana.normalization.dates.DateNormalizationExtractorMatchId; +import eu.europeana.normalization.dates.DateNormalizationResult; +import eu.europeana.normalization.dates.DateNormalizationResultStatus; +import eu.europeana.normalization.dates.edtf.DateQualification; +import eu.europeana.normalization.dates.edtf.InstantEdtfDate; +import eu.europeana.normalization.dates.edtf.InstantEdtfDateBuilder; +import eu.europeana.normalization.dates.extraction.DateExtractionException; +import eu.europeana.normalization.dates.extraction.DefaultDatesSeparator; +import java.time.Month; +import java.util.List; +import java.util.Set; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +/** + * Extractor that matches a date range where the end year includes only the rightmost two digits. + *

    + * The end year in this extractor has to: + *

      + *
    • Be higher than 12 (or lower than -12) to avoid matching a month value from other extractors.
    • + *
    • Be higher than the two rightmost digits of the start year.
    • + *
    + *

    + *

    + * This pattern needs to be executed before the Edtf extractor because EDTF could potentially match yyyy/MM and yyyy-MM. + * Therefore in this extractor we check only the values that are higher than 12 to avoid a mismatch. + *

    + */ +public class BriefRangeDateExtractor extends AbstractRangeDateExtractor { + + private static final Pattern YEAR_PATTERN = Pattern.compile( + OPTIONAL_QUESTION_MARK_REGEX + "(\\d{2,4})" + OPTIONAL_QUESTION_MARK_REGEX); + + @Override + public DateNormalizationResultRangePair extractDateNormalizationResult(String startString, + String endString, DefaultDatesSeparator rangeDateDelimiters, + boolean allowDayMonthSwap) throws DateExtractionException { + final DateNormalizationResult startDateNormalizationResult = + extractStartDateNormalizationResult(startString, allowDayMonthSwap); + final DateNormalizationResult endDateNormalizationResult = + extractEndDateNormalizationResult(startDateNormalizationResult, endString, allowDayMonthSwap); + return new DateNormalizationResultRangePair(startDateNormalizationResult, endDateNormalizationResult); + } + + private DateNormalizationResult extractStartDateNormalizationResult(String dateString, boolean allowDayMonthSwap) + throws DateExtractionException { + DateNormalizationResult dateNormalizationResult = getNoMatchResult(dateString); + final DateNormalizationResult startYearDateDateNormalizationResult = extractYear(dateString, allowDayMonthSwap); + + if (startYearDateDateNormalizationResult.getDateNormalizationResultStatus() == DateNormalizationResultStatus.MATCHED) { + int absoluteYear = Math.abs(((InstantEdtfDate) startYearDateDateNormalizationResult.getEdtfDate()).getYear().getValue()); + int startYearDigitsLength = (int) (Math.log10(absoluteYear) + 1); + if (startYearDigitsLength > 2) { + dateNormalizationResult = startYearDateDateNormalizationResult; + } + } + + return dateNormalizationResult; + } + + private DateNormalizationResult extractEndDateNormalizationResult(DateNormalizationResult startDateNormalizationResult, + String dateString, boolean allowDayMonthSwap) throws DateExtractionException { + DateNormalizationResult dateNormalizationResult = getNoMatchResult(dateString); + if 
(startDateNormalizationResult.getDateNormalizationResultStatus() == DateNormalizationResultStatus.MATCHED) { + final DateNormalizationResult endDateNormalizationResult = extractYear(dateString, allowDayMonthSwap); + + if (endDateNormalizationResult.getDateNormalizationResultStatus() == DateNormalizationResultStatus.MATCHED) { + final Set endDateQualifications = endDateNormalizationResult.getEdtfDate().getDateQualifications(); + + final int startYearFourDigits = ((InstantEdtfDate) startDateNormalizationResult.getEdtfDate()).getYear().getValue(); + final int startYearLastTwoDigits = startYearFourDigits % CENTURY.getDuration(); + final int endYear = ((InstantEdtfDate) endDateNormalizationResult.getEdtfDate()).getYear().getValue(); + + int absoluteEndYear = Math.abs(endYear); + int endYearDigitsLength = (int) (Math.log10(absoluteEndYear) + 1); + if (endYearDigitsLength == 2 && Math.abs(endYear) > Month.DECEMBER.getValue() && startYearLastTwoDigits < endYear) { + final int endYearFourDigits = (startYearFourDigits / CENTURY.getDuration()) * CENTURY.getDuration() + endYear; + final InstantEdtfDate endInstantEdtfDate = new InstantEdtfDateBuilder(endYearFourDigits).withDateQualification( + endDateQualifications).withAllowDayMonthSwap(allowDayMonthSwap).build(); + dateNormalizationResult = new DateNormalizationResult(DateNormalizationExtractorMatchId.BRIEF_DATE_RANGE, dateString, + endInstantEdtfDate); + } + } + } + + return dateNormalizationResult; + } + + private DateNormalizationResult extractYear(String inputValue, boolean allowDayMonthSwap) throws DateExtractionException { + DateNormalizationResult dateNormalizationResult = DateNormalizationResult.getNoMatchResult(inputValue); + final Matcher matcher = YEAR_PATTERN.matcher(inputValue); + if (matcher.matches()) { + final int year = Integer.parseInt(matcher.group(1)); + final InstantEdtfDate instantEdtfDate = new InstantEdtfDateBuilder(year).withDateQualification(getQualification(inputValue)) + 
.withAllowDayMonthSwap(allowDayMonthSwap).build(); + dateNormalizationResult = new DateNormalizationResult(DateNormalizationExtractorMatchId.BRIEF_DATE_RANGE, inputValue, + instantEdtfDate); + } + return dateNormalizationResult; + } + + @Override + public List getRangeDateQualifiers() { + return List.of(DefaultDatesSeparator.values()); + } + + @Override + public boolean isRangeMatchSuccess(DefaultDatesSeparator rangeDateDelimiters, DateNormalizationResult startDateResult, + DateNormalizationResult endDateResult) { + return startDateResult.getDateNormalizationResultStatus() == DateNormalizationResultStatus.MATCHED + && endDateResult.getDateNormalizationResultStatus() == DateNormalizationResultStatus.MATCHED; + } + + @Override + public DateNormalizationExtractorMatchId getDateNormalizationExtractorId(DateNormalizationResult startDateResult, + DateNormalizationResult endDateResult) { + return DateNormalizationExtractorMatchId.BRIEF_DATE_RANGE; + } +} + diff --git a/metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/extractors/CenturyNumericDateExtractor.java b/metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/extractors/CenturyNumericDateExtractor.java new file mode 100644 index 0000000000..9668dc7ffd --- /dev/null +++ b/metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/extractors/CenturyNumericDateExtractor.java @@ -0,0 +1,87 @@ +package eu.europeana.normalization.dates.extraction.extractors; + +import static eu.europeana.normalization.dates.YearPrecision.CENTURY; +import static java.util.regex.Pattern.CASE_INSENSITIVE; +import static java.util.regex.Pattern.compile; + +import eu.europeana.normalization.dates.DateNormalizationExtractorMatchId; +import eu.europeana.normalization.dates.DateNormalizationResult; +import eu.europeana.normalization.dates.edtf.InstantEdtfDate; +import eu.europeana.normalization.dates.edtf.InstantEdtfDateBuilder; +import 
eu.europeana.normalization.dates.extraction.DateExtractionException; +import java.util.function.ToIntFunction; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +/** + * Extractor that matches a century with a decimal numerals. + *

    The accepted range of values is from 1 to 21 inclusive.

    + *

    Examples of some cases: + *

      + *
    • + * Value = 18.. | Outcome = 18XX + * Value = 1st century | Outcome = 00XX + *
    • + *
    + *

    + */ +public class CenturyNumericDateExtractor extends AbstractDateExtractor { + + private static final String NUMERIC_10_TO_21_ENDING_DOTS_REGEX = "(1\\d|2[0-1])\\.{2}"; + private static final String NUMERIC_1_TO_21_SUFFIXED_REGEX = "(2?1st|2nd|3rd|(?:1\\d|[4-9]|20)th)\\scentury"; + + private enum CenturyNumericDatePattern { + PATTERN_YYYY( + compile(OPTIONAL_QUESTION_MARK_REGEX + NUMERIC_10_TO_21_ENDING_DOTS_REGEX + OPTIONAL_QUESTION_MARK_REGEX, + CASE_INSENSITIVE), + Integer::parseInt, DateNormalizationExtractorMatchId.CENTURY_NUMERIC), + PATTERN_ENGLISH( + compile(OPTIONAL_QUESTION_MARK_REGEX + NUMERIC_1_TO_21_SUFFIXED_REGEX + OPTIONAL_QUESTION_MARK_REGEX, CASE_INSENSITIVE), + century -> (Integer.parseInt(century.substring(0, century.length() - 2)) - 1), + DateNormalizationExtractorMatchId.CENTURY_NUMERIC); + + private final Pattern pattern; + private final ToIntFunction centuryExtractorFunction; + private final DateNormalizationExtractorMatchId dateNormalizationExtractorMatchId; + + CenturyNumericDatePattern(Pattern pattern, ToIntFunction centuryExtractorFunction, + DateNormalizationExtractorMatchId dateNormalizationExtractorMatchId) { + this.pattern = pattern; + this.centuryExtractorFunction = centuryExtractorFunction; + this.dateNormalizationExtractorMatchId = dateNormalizationExtractorMatchId; + } + + public Pattern getPattern() { + return pattern; + } + + public ToIntFunction getCenturyExtractorFunction() { + return centuryExtractorFunction; + } + + public DateNormalizationExtractorMatchId getDateNormalizationExtractorMatchId() { + return dateNormalizationExtractorMatchId; + } + } + + @Override + public DateNormalizationResult extract(String inputValue, boolean allowDayMonthSwap) throws DateExtractionException { + DateNormalizationResult dateNormalizationResult = DateNormalizationResult.getNoMatchResult(inputValue); + for (CenturyNumericDatePattern centerNumericDatePattern : CenturyNumericDatePattern.values()) { + final Matcher matcher = 
centerNumericDatePattern.getPattern().matcher(inputValue); + if (matcher.matches()) { + final String century = matcher.group(1); + InstantEdtfDateBuilder instantEdtfDateBuilder = new InstantEdtfDateBuilder( + centerNumericDatePattern.getCenturyExtractorFunction().applyAsInt(century)) + .withYearPrecision(CENTURY); + InstantEdtfDate instantEdtfDate = instantEdtfDateBuilder.withDateQualification(getQualification(inputValue)) + .withAllowDayMonthSwap(allowDayMonthSwap).build(); + dateNormalizationResult = + new DateNormalizationResult(centerNumericDatePattern.getDateNormalizationExtractorMatchId(), inputValue, + instantEdtfDate); + break; + } + } + return dateNormalizationResult; + } +} diff --git a/metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/extractors/CenturyRomanDateExtractor.java b/metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/extractors/CenturyRomanDateExtractor.java new file mode 100644 index 0000000000..6cc2ad3ae1 --- /dev/null +++ b/metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/extractors/CenturyRomanDateExtractor.java @@ -0,0 +1,48 @@ +package eu.europeana.normalization.dates.extraction.extractors; + +import static eu.europeana.normalization.dates.YearPrecision.CENTURY; +import static java.util.regex.Pattern.CASE_INSENSITIVE; +import static java.util.regex.Pattern.compile; + +import eu.europeana.normalization.dates.DateNormalizationExtractorMatchId; +import eu.europeana.normalization.dates.DateNormalizationResult; +import eu.europeana.normalization.dates.edtf.InstantEdtfDateBuilder; +import eu.europeana.normalization.dates.extraction.DateExtractionException; +import eu.europeana.normalization.dates.extraction.RomanToNumber; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +/** + * Extractor that matches a century with Roman numerals + *

    The accepted range of values is from 1 to 21 inclusive. + * The Roman numerals may also be preceded by an abbreviation of century, for example ‘s. XIX’.

    + *

    Examples of some cases: + *

      + *
    • + * Value = s. XX | Outcome = 19XX + * Value = s. XXI | Outcome = 20XX + *
    • + *
    + *

    + */ +public class CenturyRomanDateExtractor extends AbstractDateExtractor { + + private static final String CENTURY_PREFIX = "(?:(?:s|sec|saec)\\s|(?:s|sec|saec)\\.\\s?)?"; + private static final String ROMAN_1_TO_21_REGEX = "(X?(?:IX|IV|VI{0,3}|I{1,3})|X|XXI?)"; + private static final Pattern ROMAN_2_TO_21_PATTERN = compile( + OPTIONAL_QUESTION_MARK_REGEX + CENTURY_PREFIX + ROMAN_1_TO_21_REGEX + OPTIONAL_QUESTION_MARK_REGEX, CASE_INSENSITIVE); + + @Override + public DateNormalizationResult extract(String inputValue, boolean allowDayMonthSwap) throws DateExtractionException { + DateNormalizationResult dateNormalizationResult = DateNormalizationResult.getNoMatchResult(inputValue); + final Matcher matcher = ROMAN_2_TO_21_PATTERN.matcher(inputValue); + if (matcher.matches()) { + final int century = RomanToNumber.romanToDecimal(matcher.group(1)) - 1; + final InstantEdtfDateBuilder instantEdtfDateBuilder = + new InstantEdtfDateBuilder(century).withYearPrecision(CENTURY).withDateQualification(getQualification(inputValue)); + dateNormalizationResult = new DateNormalizationResult(DateNormalizationExtractorMatchId.CENTURY_ROMAN, + inputValue, instantEdtfDateBuilder.build()); + } + return dateNormalizationResult; + } +} diff --git a/metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/extractors/CenturyRomanRangeDateExtractor.java b/metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/extractors/CenturyRomanRangeDateExtractor.java new file mode 100644 index 0000000000..168d1a2a52 --- /dev/null +++ b/metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/extractors/CenturyRomanRangeDateExtractor.java @@ -0,0 +1,46 @@ +package eu.europeana.normalization.dates.extraction.extractors; + +import eu.europeana.normalization.dates.DateNormalizationExtractorMatchId; +import eu.europeana.normalization.dates.DateNormalizationResult; +import eu.europeana.normalization.dates.DateNormalizationResultStatus; 
+import eu.europeana.normalization.dates.extraction.DateExtractionException; +import eu.europeana.normalization.dates.extraction.DefaultDatesSeparator; +import java.util.ArrayList; +import java.util.EnumSet; +import java.util.List; + +/** + * Extractor for Roman century ranges. + *

    We reuse the existing {@link CenturyRomanDateExtractor} code for the boundaries.

    + */ +public class CenturyRomanRangeDateExtractor extends AbstractRangeDateExtractor { + + private static final CenturyRomanDateExtractor ROMAN_CENTURY_DATE_EXTRACTOR = new CenturyRomanDateExtractor(); + + @Override + public DateNormalizationResultRangePair extractDateNormalizationResult(String startString, String endString, + DefaultDatesSeparator rangeDateDelimiters, + boolean allowDayMonthSwap) throws DateExtractionException { + return new DateNormalizationResultRangePair( + ROMAN_CENTURY_DATE_EXTRACTOR.extract(startString, allowDayMonthSwap), + ROMAN_CENTURY_DATE_EXTRACTOR.extract(endString, allowDayMonthSwap)); + } + + @Override + public List getRangeDateQualifiers() { + return new ArrayList<>(EnumSet.of(DefaultDatesSeparator.DASH_DELIMITER)); + } + + @Override + public boolean isRangeMatchSuccess(DefaultDatesSeparator rangeDateDelimiters, DateNormalizationResult startDateResult, + DateNormalizationResult endDateResult) { + return startDateResult.getDateNormalizationResultStatus() == DateNormalizationResultStatus.MATCHED + && endDateResult.getDateNormalizationResultStatus() == DateNormalizationResultStatus.MATCHED; + } + + @Override + public DateNormalizationExtractorMatchId getDateNormalizationExtractorId(DateNormalizationResult startDateResult, + DateNormalizationResult endDateResult) { + return DateNormalizationExtractorMatchId.CENTURY_RANGE_ROMAN; + } +} diff --git a/metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/dateextractors/DateExtractor.java b/metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/extractors/DateExtractor.java similarity index 58% rename from metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/dateextractors/DateExtractor.java rename to metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/extractors/DateExtractor.java index 08ab31305e..0cbedfcfc5 100644 --- 
a/metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/dateextractors/DateExtractor.java +++ b/metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/extractors/DateExtractor.java @@ -1,11 +1,10 @@ -package eu.europeana.normalization.dates.extraction.dateextractors; +package eu.europeana.normalization.dates.extraction.extractors; import eu.europeana.normalization.dates.DateNormalizationResult; -import eu.europeana.normalization.dates.edtf.DateQualification; import eu.europeana.normalization.dates.extraction.DateExtractionException; /** - * The interface for all the implementation of date patterns + * The interface for date extractors. */ public interface DateExtractor { @@ -13,30 +12,27 @@ public interface DateExtractor { * Extractor of a date normalization operation. * * @param inputValue the value containing the date - * @param requestedDateQualification the overwriting value of date qualification, if any * @param flexibleDateBuild the flag indicating if during creating of the dates we are flexible with validation * @return the date normalization result * @throws DateExtractionException if anything happened during the extraction of the date */ - DateNormalizationResult extract(String inputValue, DateQualification requestedDateQualification, boolean flexibleDateBuild) + DateNormalizationResult extract(String inputValue, boolean flexibleDateBuild) throws DateExtractionException; /** * Extractor of a date normalization operation for date properties * * @param inputValue the value containing the date - * @param requestedDateQualification the overwriting value of date qualification, if any * @return the date normalization result */ - DateNormalizationResult extractDateProperty(String inputValue, DateQualification requestedDateQualification); + DateNormalizationResult extractDateProperty(String inputValue); /** * Extractor of a date normalization operation for generic properties * * @param inputValue the value containing 
the date - * @param requestedDateQualification the overwriting value of date qualification, if any * @return the date normalization result */ - DateNormalizationResult extractGenericProperty(String inputValue, DateQualification requestedDateQualification); + DateNormalizationResult extractGenericProperty(String inputValue); } diff --git a/metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/dateextractors/DcmiPeriodDateExtractor.java b/metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/extractors/DcmiPeriodDateExtractor.java similarity index 75% rename from metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/dateextractors/DcmiPeriodDateExtractor.java rename to metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/extractors/DcmiPeriodDateExtractor.java index c2d23bae6e..a3ddde3604 100644 --- a/metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/dateextractors/DcmiPeriodDateExtractor.java +++ b/metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/extractors/DcmiPeriodDateExtractor.java @@ -1,9 +1,8 @@ -package eu.europeana.normalization.dates.extraction.dateextractors; +package eu.europeana.normalization.dates.extraction.extractors; import eu.europeana.normalization.dates.DateNormalizationExtractorMatchId; import eu.europeana.normalization.dates.DateNormalizationResult; import eu.europeana.normalization.dates.edtf.DateBoundaryType; -import eu.europeana.normalization.dates.edtf.DateQualification; import eu.europeana.normalization.dates.edtf.InstantEdtfDate; import eu.europeana.normalization.dates.edtf.InstantEdtfDateBuilder; import eu.europeana.normalization.dates.edtf.IntervalEdtfDate; @@ -27,8 +26,8 @@ public class DcmiPeriodDateExtractor extends AbstractDateExtractor { private static final String NON_SPACE_NON_SEMICOLON = "[^\\s;]*"; private static final String NON_SPACE_NON_LINE_END = "[^\\s$]*"; private static 
final String VALUE_ENDING = "(?:;|$)"; - private static final String SPACE_VALUE_ENDING = "\\s*" + VALUE_ENDING; - private static final String EQUALS_SPACES_WRAPPED = "\\s*=\\s*"; + private static final String SPACE_VALUE_ENDING = "\\s?" + VALUE_ENDING; + private static final String EQUALS_SPACES_WRAPPED = "\\s?=\\s?"; private static final String DCMI_FIELD_REGEX = EQUALS_SPACES_WRAPPED + "(" + NON_SPACE_NON_SEMICOLON + "|" + NON_SPACE_NON_LINE_END + ")" + SPACE_VALUE_ENDING; private static final Pattern DCMI_PERIOD_SCHEME_PATTERN = Pattern.compile("scheme" + DCMI_FIELD_REGEX); @@ -48,22 +47,22 @@ public class DcmiPeriodDateExtractor extends AbstractDateExtractor { private static final Set W3C_DTF_SCHEME_VALUES = Set.of("W3C-DTF", "W3CDTF"); @Override - public DateNormalizationResult extract(String value, DateQualification requestedDateQualification, - boolean flexibleDateBuild) throws DateExtractionException { + public DateNormalizationResult extract(String value, boolean flexibleDateBuild) throws DateExtractionException { DateNormalizationResult dateNormalizationResult = DateNormalizationResult.getNoMatchResult(value); if (isValidScheme(value)) { Matcher matcher = DCMI_PERIOD_START_PATTERN.matcher(value); - InstantEdtfDate start = extractDate(matcher, requestedDateQualification, flexibleDateBuild); + final InstantEdtfDate start = extractDate(matcher, flexibleDateBuild); matcher = DCMI_PERIOD_END_PATTERN.matcher(value); - InstantEdtfDate end = extractDate(matcher, requestedDateQualification, flexibleDateBuild); + final InstantEdtfDate end = extractDate(matcher, flexibleDateBuild); String name = extractName(value); //At least one end has to be specified if (start.getDateBoundaryType() == DateBoundaryType.DECLARED || end.getDateBoundaryType() == DateBoundaryType.DECLARED) { - IntervalEdtfDate intervalEdtfDate = new IntervalEdtfDateBuilder(start, end).withLabel(name) - .withFlexibleDateBuild( - flexibleDateBuild) - .build(); + final IntervalEdtfDate intervalEdtfDate 
= + new IntervalEdtfDateBuilder(start, end) + .withLabel(name) + .withAllowStartEndSwap(flexibleDateBuild) + .build(); dateNormalizationResult = new DateNormalizationResult(DateNormalizationExtractorMatchId.DCMI_PERIOD, value, intervalEdtfDate); } @@ -97,17 +96,13 @@ private static boolean isValidScheme(String dcmiPeriod) { return isValidScheme; } - private InstantEdtfDate extractDate(Matcher matcher, DateQualification requestedDateQualification, - boolean allowSwitchMonthDay) throws DateExtractionException { + private InstantEdtfDate extractDate(Matcher matcher, boolean allowDayMonthSwap) throws DateExtractionException { InstantEdtfDate instantEdtfDate = null; if (matcher.find()) { final String fieldValue = matcher.group(1); if (StringUtils.isNotBlank(fieldValue)) { TemporalAccessor temporalAccessor = ISO_8601_PARSER.parseDatePart(fieldValue); - DateQualification dateQualification = computeDateQualification(requestedDateQualification, - () -> DateQualification.NO_QUALIFICATION); - instantEdtfDate = new InstantEdtfDateBuilder(temporalAccessor).withDateQualification(dateQualification) - .withFlexibleDateBuild(allowSwitchMonthDay).build(); + instantEdtfDate = new InstantEdtfDateBuilder(temporalAccessor).withAllowDayMonthSwap(allowDayMonthSwap).build(); } //if we find it again we declare invalid if (matcher.find()) { diff --git a/metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/dateextractors/DecadeDateExtractor.java b/metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/extractors/DecadeDateExtractor.java similarity index 59% rename from metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/dateextractors/DecadeDateExtractor.java rename to metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/extractors/DecadeDateExtractor.java index 94be631ddc..abd14233e1 100644 --- 
a/metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/dateextractors/DecadeDateExtractor.java +++ b/metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/extractors/DecadeDateExtractor.java @@ -1,16 +1,12 @@ -package eu.europeana.normalization.dates.extraction.dateextractors; +package eu.europeana.normalization.dates.extraction.extractors; import static eu.europeana.normalization.dates.YearPrecision.DECADE; -import static eu.europeana.normalization.dates.edtf.DateQualification.NO_QUALIFICATION; -import static eu.europeana.normalization.dates.edtf.DateQualification.UNCERTAIN; import eu.europeana.normalization.dates.DateNormalizationExtractorMatchId; import eu.europeana.normalization.dates.DateNormalizationResult; -import eu.europeana.normalization.dates.edtf.DateQualification; import eu.europeana.normalization.dates.edtf.InstantEdtfDate; import eu.europeana.normalization.dates.edtf.InstantEdtfDateBuilder; import eu.europeana.normalization.dates.extraction.DateExtractionException; -import eu.europeana.normalization.dates.sanitize.DateFieldSanitizer; import java.util.regex.Matcher; import java.util.regex.Pattern; @@ -35,22 +31,18 @@ */ public class DecadeDateExtractor extends AbstractDateExtractor { - private static final Pattern decadePattern = Pattern.compile("\\??(\\d{3})(?:[XU]\\??|\\?\\?)", Pattern.CASE_INSENSITIVE); + private static final Pattern decadePattern = Pattern.compile( + OPTIONAL_QUESTION_MARK_REGEX + "(\\d{3})(?:[XU]" + OPTIONAL_QUESTION_MARK_REGEX + "|\\?\\?)", Pattern.CASE_INSENSITIVE); @Override - public DateNormalizationResult extract(String inputValue, DateQualification requestedDateQualification, - boolean flexibleDateBuild) throws DateExtractionException { - final String sanitizedValue = DateFieldSanitizer.cleanSpacesAndTrim(inputValue); - final DateQualification dateQualification = computeDateQualification(requestedDateQualification, () -> - (sanitizedValue.startsWith("?") || 
sanitizedValue.endsWith("?")) ? UNCERTAIN : NO_QUALIFICATION); - + public DateNormalizationResult extract(String inputValue, boolean allowDayMonthSwap) throws DateExtractionException { DateNormalizationResult dateNormalizationResult = DateNormalizationResult.getNoMatchResult(inputValue); - final Matcher matcher = decadePattern.matcher(sanitizedValue); + final Matcher matcher = decadePattern.matcher(inputValue); if (matcher.matches()) { final InstantEdtfDate datePart = new InstantEdtfDateBuilder(Integer.parseInt(matcher.group(1))) .withYearPrecision(DECADE) - .withDateQualification(dateQualification) - .withFlexibleDateBuild(flexibleDateBuild) + .withDateQualification(getQualification(inputValue)) + .withAllowDayMonthSwap(allowDayMonthSwap) .build(); dateNormalizationResult = new DateNormalizationResult(DateNormalizationExtractorMatchId.DECADE, inputValue, datePart); } diff --git a/metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/extractors/EdtfDateExtractor.java b/metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/extractors/EdtfDateExtractor.java new file mode 100644 index 0000000000..cf0f67477d --- /dev/null +++ b/metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/extractors/EdtfDateExtractor.java @@ -0,0 +1,95 @@ +package eu.europeana.normalization.dates.extraction.extractors; + +import static eu.europeana.normalization.dates.edtf.InstantEdtfDateBuilder.OVER_4_DIGITS_YEAR_PREFIX; +import static eu.europeana.normalization.dates.edtf.InstantEdtfDateBuilder.THRESHOLD_4_DIGITS_YEAR; + +import eu.europeana.normalization.dates.DateNormalizationExtractorMatchId; +import eu.europeana.normalization.dates.DateNormalizationResult; +import eu.europeana.normalization.dates.edtf.DateQualification; +import eu.europeana.normalization.dates.edtf.InstantEdtfDate; +import eu.europeana.normalization.dates.edtf.InstantEdtfDateBuilder; +import eu.europeana.normalization.dates.edtf.Iso8601Parser; 
+import eu.europeana.normalization.dates.extraction.DateExtractionException; +import java.lang.invoke.MethodHandles; +import java.time.temporal.TemporalAccessor; +import java.util.EnumSet; +import java.util.Set; +import java.util.regex.Matcher; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * The pattern for EDTF dates and compatible with ISO 8601 dates. + *

    This parser partially supports Level0 and Level1 of the Extended + * Date/Time Format (EDTF) Specification. It only validates the date part of a date; the time, if present, is discarded. + * Specifically, from Level1, seasons and unspecified digit(s) from the right are not supported. + *

    + */ +public class EdtfDateExtractor extends AbstractDateExtractor { + + private static final Logger LOGGER = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); + private static final Iso8601Parser ISO_8601_PARSER = new Iso8601Parser(); + + @Override + public DateNormalizationResult extract(String inputValue, boolean allowDayMonthSwap) throws DateExtractionException { + final InstantEdtfDate instantEdtfDate = extractInstant(inputValue, allowDayMonthSwap); + return new DateNormalizationResult(DateNormalizationExtractorMatchId.EDTF, inputValue, instantEdtfDate); + } + + private InstantEdtfDate extractInstant(String dateInput, boolean allowDayMonthSwap) throws DateExtractionException { + final InstantEdtfDate instantEdtfDate; + final Integer moreThanFourDigitsYear = getMoreThanFourDigitsYear(dateInput); + if (moreThanFourDigitsYear != null) { + instantEdtfDate = new InstantEdtfDateBuilder(moreThanFourDigitsYear).withMoreThanFourDigitsYear().build(); + } else { + instantEdtfDate = extractInstantEdtfDate(dateInput, allowDayMonthSwap); + } + return instantEdtfDate; + } + + private static Integer getMoreThanFourDigitsYear(String dateInput) { + final boolean startsWithY = dateInput.startsWith(String.valueOf(OVER_4_DIGITS_YEAR_PREFIX)); + Integer longYear = null; + if (startsWithY) { + final String yearSubstring = dateInput.substring(1); + try { + //Try parsing year + longYear = Integer.parseInt(yearSubstring); + } catch (NumberFormatException er) { + LOGGER.debug("Not a valid integer at this stage"); + } + //If prefixed we have to be strict on the length + if (longYear != null && Math.abs(longYear) <= THRESHOLD_4_DIGITS_YEAR) { + longYear = null; + } + } + return longYear; + } + + @Override + public Set getQualification(String inputValue) { + final Matcher qualificationMatcher = DateQualification.PATTERN.matcher(inputValue); + Set dateQualifications = EnumSet.noneOf(DateQualification.class); + if (qualificationMatcher.matches()) { + final String modifier = 
qualificationMatcher.group(1); + dateQualifications = DateQualification.fromCharacter(String.valueOf(modifier.charAt(0))); + } + return dateQualifications; + } + + private InstantEdtfDate extractInstantEdtfDate(String inputValue, boolean allowDayMonthSwap) + throws DateExtractionException { + final Set dateQualifications = getQualification(inputValue); + String dateInputStrippedModifier = inputValue; + if (!dateQualifications.isEmpty()) { + dateInputStrippedModifier = inputValue.substring(0, inputValue.length() - 1); + } + + final TemporalAccessor temporalAccessor = ISO_8601_PARSER.parseDatePart(dateInputStrippedModifier); + return new InstantEdtfDateBuilder(temporalAccessor) + .withDateQualification(dateQualifications) + .withAllowDayMonthSwap(allowDayMonthSwap) + .build(); + } + +} diff --git a/metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/extractors/EdtfRangeDateExtractor.java b/metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/extractors/EdtfRangeDateExtractor.java new file mode 100644 index 0000000000..959ee3d04f --- /dev/null +++ b/metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/extractors/EdtfRangeDateExtractor.java @@ -0,0 +1,75 @@ +package eu.europeana.normalization.dates.extraction.extractors; + +import static eu.europeana.normalization.dates.DateNormalizationResult.getNoMatchResult; +import static eu.europeana.normalization.dates.edtf.DateBoundaryType.OPEN; +import static eu.europeana.normalization.dates.edtf.DateBoundaryType.UNKNOWN; + +import eu.europeana.normalization.dates.DateNormalizationExtractorMatchId; +import eu.europeana.normalization.dates.DateNormalizationResult; +import eu.europeana.normalization.dates.DateNormalizationResultStatus; +import eu.europeana.normalization.dates.edtf.InstantEdtfDate; +import eu.europeana.normalization.dates.extraction.DateExtractionException; +import eu.europeana.normalization.dates.extraction.DefaultDatesSeparator; +import 
java.util.ArrayList; +import java.util.EnumSet; +import java.util.List; + +/** + * Extractor for Edtf date ranges. + *

    We reuse the already existent {@link EdtfDateExtractor} code for the boundaries.

    + */ +public class EdtfRangeDateExtractor extends AbstractRangeDateExtractor { + + private static final EdtfDateExtractor EDTF_DATE_EXTRACTOR = new EdtfDateExtractor(); + + @Override + public List getRangeDateQualifiers() { + return new ArrayList<>(EnumSet.of(DefaultDatesSeparator.SLASH_DELIMITER)); + } + + @Override + public DateNormalizationResultRangePair extractDateNormalizationResult(String startString, String endString, + DefaultDatesSeparator rangeDateDelimiters, boolean allowDayMonthSwap) + throws DateExtractionException { + DateNormalizationResult startDateNormalizationResult = extractInstant(startString, allowDayMonthSwap); + DateNormalizationResult endDateNormalizationResult = extractInstant(endString, allowDayMonthSwap); + final InstantEdtfDate startInstantEdtfDate = (InstantEdtfDate) startDateNormalizationResult.getEdtfDate(); + final InstantEdtfDate endInstantEdtfDate = (InstantEdtfDate) endDateNormalizationResult.getEdtfDate(); + + //Are both ends unknown or open, then it is not a date + if ((startInstantEdtfDate.getDateBoundaryType() == UNKNOWN || startInstantEdtfDate.getDateBoundaryType() == OPEN) && + (endInstantEdtfDate.getDateBoundaryType() == UNKNOWN || endInstantEdtfDate.getDateBoundaryType() == OPEN)) { + startDateNormalizationResult = getNoMatchResult(startString); + endDateNormalizationResult = getNoMatchResult(endString); + } + + return new DateNormalizationResultRangePair(startDateNormalizationResult, endDateNormalizationResult); + } + + private DateNormalizationResult extractInstant(String dateInput, boolean allowDayMonthSwap) throws DateExtractionException { + final DateNormalizationResult dateNormalizationResult; + if (UNKNOWN.getDeserializedRepresentation().equals(dateInput)) { + dateNormalizationResult = new DateNormalizationResult(DateNormalizationExtractorMatchId.EDTF, dateInput, + InstantEdtfDate.getUnknownInstance()); + } else if (OPEN.getDeserializedRepresentation().equals(dateInput)) { + dateNormalizationResult = new 
DateNormalizationResult(DateNormalizationExtractorMatchId.EDTF, dateInput, + InstantEdtfDate.getOpenInstance()); + } else { + dateNormalizationResult = EDTF_DATE_EXTRACTOR.extract(dateInput, allowDayMonthSwap); + } + return dateNormalizationResult; + } + + @Override + public boolean isRangeMatchSuccess(DefaultDatesSeparator rangeDateDelimiters, DateNormalizationResult startDateResult, + DateNormalizationResult endDateResult) { + return startDateResult.getDateNormalizationResultStatus() == DateNormalizationResultStatus.MATCHED + && endDateResult.getDateNormalizationResultStatus() == DateNormalizationResultStatus.MATCHED; + } + + @Override + public DateNormalizationExtractorMatchId getDateNormalizationExtractorId(DateNormalizationResult startDateResult, + DateNormalizationResult endDateResult) { + return DateNormalizationExtractorMatchId.EDTF; + } +} diff --git a/metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/extractors/FullDateDateExtractor.java b/metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/extractors/FullDateDateExtractor.java new file mode 100644 index 0000000000..40df096398 --- /dev/null +++ b/metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/extractors/FullDateDateExtractor.java @@ -0,0 +1,92 @@ +package eu.europeana.normalization.dates.extraction.extractors; + +import static java.lang.String.format; +import static java.time.format.DateTimeFormatter.ofPattern; + +import eu.europeana.normalization.dates.DateNormalizationExtractorMatchId; +import eu.europeana.normalization.dates.DateNormalizationResult; +import eu.europeana.normalization.dates.edtf.InstantEdtfDate; +import eu.europeana.normalization.dates.edtf.InstantEdtfDateBuilder; +import eu.europeana.normalization.dates.extraction.DateExtractionException; +import eu.europeana.normalization.dates.extraction.EuropeanLanguage; +import java.lang.invoke.MethodHandles; +import java.time.LocalDateTime; +import 
java.time.format.DateTimeFormatter; +import java.time.format.DateTimeFormatterBuilder; +import java.time.format.DateTimeParseException; +import java.time.temporal.ChronoField; +import java.util.LinkedList; +import java.util.List; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * A full date pattern that does not follow a particular standard. + *

    If a timezone, with or without an offset, is present, it is discarded and the date part is taken as is, without any + * adjustment. For example, the date "Wed Nov 01 01:00:00 CEST 1989" will be parsed as "1989-11-01" and not as "1989-10-31".

    + *

    + * Examples: + *

      + *
    • Wed Nov 01 01:00:00 CEST 1989
    • + *
    • 1989-11-01 04:05:06 UTC+01
    • + *
    • 1989-11-01 01:02:03
    • + *
    + *

    + */ +public class FullDateDateExtractor extends AbstractDateExtractor { + + private static final Logger LOGGER = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); + private static final List DATE_TIME_FORMATTERS = new LinkedList<>(); + + public static final int MIN_MILLISECONDS_WIDTH = 0; + public static final int MAX_MILLISECONDS_WIDTH = 3; + + static { + DATE_TIME_FORMATTERS.add( + new DateTimeFormatterBuilder() + .append(ofPattern("EEE MMM dd HH:mm:ss zzz")) + .appendOptional(ofPattern("x")) + .append(ofPattern(" yyyy")) + .toFormatter() + ); + DATE_TIME_FORMATTERS.add( + new DateTimeFormatterBuilder() + .append(ofPattern("yyyy-MM-dd HH:mm:ss")) + .appendFraction(ChronoField.MILLI_OF_SECOND, MIN_MILLISECONDS_WIDTH, MAX_MILLISECONDS_WIDTH, true) + .optionalStart() + .append(ofPattern(" zzz")) + .appendOptional(ofPattern("x")) + .optionalEnd() + .toFormatter() + ); + } + + @Override + public DateNormalizationResult extract(String inputValue, boolean allowDayMonthSwap) throws DateExtractionException { + for (DateTimeFormatter dateTimeFormatter : DATE_TIME_FORMATTERS) { + final LocalDateTime localDateTime = parseDateWithLocales(inputValue, dateTimeFormatter); + if (localDateTime != null) { + final InstantEdtfDate instantEdtfDate = new InstantEdtfDateBuilder(localDateTime) + .withAllowDayMonthSwap(allowDayMonthSwap) + .build(); + return new DateNormalizationResult(DateNormalizationExtractorMatchId.FORMATTED_FULL_DATE, inputValue, instantEdtfDate); + } + } + return DateNormalizationResult.getNoMatchResult(inputValue); + } + + private static LocalDateTime parseDateWithLocales(String inputValue, DateTimeFormatter dateTimeFormatter) { + LocalDateTime localDateTime = null; + for (EuropeanLanguage europeanLanguage : EuropeanLanguage.values()) { + final DateTimeFormatter dateTimeFormatterWithLocale = dateTimeFormatter.withLocale(europeanLanguage.getLocale()); + try { + localDateTime = LocalDateTime.parse(inputValue, dateTimeFormatterWithLocale); + break; + 
} catch (DateTimeParseException e) { + LOGGER.debug(format("Parsing date failed with date time formatter: %s, and locale: %s", dateTimeFormatterWithLocale, + dateTimeFormatterWithLocale.getLocale()), e); + } + } + return localDateTime; + } +} diff --git a/metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/extractors/LongNegativeYearDateExtractor.java b/metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/extractors/LongNegativeYearDateExtractor.java new file mode 100644 index 0000000000..6cabab5821 --- /dev/null +++ b/metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/extractors/LongNegativeYearDateExtractor.java @@ -0,0 +1,35 @@ +package eu.europeana.normalization.dates.extraction.extractors; + +import eu.europeana.normalization.dates.DateNormalizationExtractorMatchId; +import eu.europeana.normalization.dates.DateNormalizationResult; +import eu.europeana.normalization.dates.edtf.InstantEdtfDate; +import eu.europeana.normalization.dates.edtf.InstantEdtfDateBuilder; +import eu.europeana.normalization.dates.extraction.DateExtractionException; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +/** + * A year before 1 AD with more than 4 digits. This pattern is typically used in archaeological contexts. The year may contain + * between 5 and 9 digits. Also includes the pattern for ranges of this kind of years. 
+ */ +public class LongNegativeYearDateExtractor extends AbstractDateExtractor { + + private static final Pattern YEAR_PATTERN = Pattern.compile("(-?\\d{5,9})"); + + @Override + public DateNormalizationResult extract(String inputValue, boolean allowDayMonthSwap) throws DateExtractionException { + DateNormalizationResult dateNormalizationResult = DateNormalizationResult.getNoMatchResult(inputValue); + final Matcher matcher = YEAR_PATTERN.matcher(inputValue); + if (matcher.matches()) { + final int year = Integer.parseInt(matcher.group(1)); + final InstantEdtfDate instantEdtfDate = + new InstantEdtfDateBuilder(year).withDateQualification(getQualification(inputValue)) + .withMoreThanFourDigitsYear() + .withAllowDayMonthSwap(allowDayMonthSwap).build(); + dateNormalizationResult = new DateNormalizationResult(DateNormalizationExtractorMatchId.LONG_NEGATIVE_YEAR, inputValue, + instantEdtfDate); + } + return dateNormalizationResult; + } + +} diff --git a/metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/extractors/LongNegativeYearRangeDateExtractor.java b/metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/extractors/LongNegativeYearRangeDateExtractor.java new file mode 100644 index 0000000000..b965a8cbdf --- /dev/null +++ b/metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/extractors/LongNegativeYearRangeDateExtractor.java @@ -0,0 +1,46 @@ +package eu.europeana.normalization.dates.extraction.extractors; + +import eu.europeana.normalization.dates.DateNormalizationExtractorMatchId; +import eu.europeana.normalization.dates.DateNormalizationResult; +import eu.europeana.normalization.dates.DateNormalizationResultStatus; +import eu.europeana.normalization.dates.extraction.DateExtractionException; +import eu.europeana.normalization.dates.extraction.DefaultDatesSeparator; +import java.util.ArrayList; +import java.util.EnumSet; +import java.util.List; + +/** + * A year before 1 AD with more than 4 
digits. This pattern is typically used in archaeological contexts. The year may contain + * between 5 and 9 digits. Also includes the pattern for ranges of this kind of years. + */ +public class LongNegativeYearRangeDateExtractor extends AbstractRangeDateExtractor { + + private static final LongNegativeYearDateExtractor LONG_NEGATIVE_YEAR_DATE_EXTRACTOR = new LongNegativeYearDateExtractor(); + + @Override + public List getRangeDateQualifiers() { + return new ArrayList<>(EnumSet.of(DefaultDatesSeparator.SLASH_DELIMITER)); + } + + @Override + public DateNormalizationResultRangePair extractDateNormalizationResult(String startString, String endString, + DefaultDatesSeparator rangeDateDelimiters, boolean allowDayMonthSwap) + throws DateExtractionException { + return new DateNormalizationResultRangePair( + LONG_NEGATIVE_YEAR_DATE_EXTRACTOR.extract(startString, allowDayMonthSwap), + LONG_NEGATIVE_YEAR_DATE_EXTRACTOR.extract(endString, allowDayMonthSwap)); + } + + @Override + public boolean isRangeMatchSuccess(DefaultDatesSeparator rangeDateDelimiters, DateNormalizationResult startDateResult, + DateNormalizationResult endDateResult) { + return startDateResult.getDateNormalizationResultStatus() == DateNormalizationResultStatus.MATCHED + && endDateResult.getDateNormalizationResultStatus() == DateNormalizationResultStatus.MATCHED; + } + + @Override + public DateNormalizationExtractorMatchId getDateNormalizationExtractorId(DateNormalizationResult startDateResult, + DateNormalizationResult endDateResult) { + return DateNormalizationExtractorMatchId.LONG_NEGATIVE_YEAR; + } +} diff --git a/metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/extractors/MonthNameDateExtractor.java b/metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/extractors/MonthNameDateExtractor.java new file mode 100644 index 0000000000..3df77433af --- /dev/null +++ 
b/metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/extractors/MonthNameDateExtractor.java @@ -0,0 +1,117 @@ +package eu.europeana.normalization.dates.extraction.extractors; + +import static eu.europeana.normalization.dates.DateNormalizationResult.getNoMatchResult; +import static eu.europeana.normalization.dates.extraction.DatePartsIndices.DMY_INDICES; +import static eu.europeana.normalization.dates.extraction.DatePartsIndices.MDY_INDICES; +import static eu.europeana.normalization.dates.extraction.DatePartsIndices.MY_INDICES; +import static java.util.regex.Pattern.compile; + +import eu.europeana.normalization.dates.DateNormalizationExtractorMatchId; +import eu.europeana.normalization.dates.DateNormalizationResult; +import eu.europeana.normalization.dates.DateNormalizationResultStatus; +import eu.europeana.normalization.dates.edtf.InstantEdtfDate; +import eu.europeana.normalization.dates.edtf.InstantEdtfDateBuilder; +import eu.europeana.normalization.dates.extraction.DateExtractionException; +import eu.europeana.normalization.dates.extraction.DatePartsIndices; +import eu.europeana.normalization.dates.extraction.MonthMultilingual; +import java.lang.invoke.MethodHandles; +import java.time.Month; +import java.util.Arrays; +import java.util.Optional; +import java.util.Set; +import java.util.regex.Matcher; +import java.util.regex.Pattern; +import java.util.stream.Collectors; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * Extractor that matches dates which contain months represented by their name, in all the 24 european languages. + * + *

    Examples of some cases: + *

      + *
    • 01 November 1989
    • + *
    • 01.November.1989
    • + *
    • 01,November,1989
    • + *
    • November 01 1989
    • + *
    • November 1989
    • + *
    + *

    + */ +public class MonthNameDateExtractor extends AbstractDateExtractor { + + private static final Logger LOGGER = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); + private static final String DELIMITERS_REGEX = "[ .,]"; + private static final String YEAR_REGEX = "(\\d{4})"; + private static final String DAY_REGEX = "(\\d{1,2})"; + + private static final MonthMultilingual monthMultilingual = new MonthMultilingual(); + private static final String MONTH_JOINED_VALUES = + monthMultilingual.getMonthToAllLanguagesStringsMap().values().stream().flatMap(Set::stream) + .map(Pattern::quote) + .collect(Collectors.joining("|", "(", ")")); + + private enum MonthNameDatePattern { + DAY_MONTH_YEAR_PATTERN(compilePattern(new String[]{DAY_REGEX, MONTH_JOINED_VALUES, YEAR_REGEX}), DMY_INDICES), + MONTH_DAY_YEAR_PATTERN(compilePattern(new String[]{MONTH_JOINED_VALUES, DAY_REGEX, YEAR_REGEX}), MDY_INDICES), + MONTH_YEAR_PATTERN(compilePattern(new String[]{MONTH_JOINED_VALUES, YEAR_REGEX}), MY_INDICES); + + private final Pattern pattern; + private final DatePartsIndices datePartsIndices; + + MonthNameDatePattern(Pattern pattern, DatePartsIndices datePartsIndices) { + this.pattern = pattern; + this.datePartsIndices = datePartsIndices; + } + + public Pattern getPattern() { + return pattern; + } + + public DatePartsIndices getDatePartsIndices() { + return datePartsIndices; + } + } + + private static Pattern compilePattern(String[] parts) { + return compile(String.join(DELIMITERS_REGEX, parts), Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CASE); + } + + @Override + public DateNormalizationResult extract(String inputValue, boolean allowDayMonthSwap) throws DateExtractionException { + return Arrays.stream(MonthNameDatePattern.values()) + .map(operation -> extract(operation, inputValue)) + .filter(dateNormalizationResult -> dateNormalizationResult.getDateNormalizationResultStatus() + == DateNormalizationResultStatus.MATCHED).findFirst() + 
.orElse(getNoMatchResult(inputValue)); + } + + private DateNormalizationResult extract(MonthNameDatePattern monthNameDatePattern, String inputValue) { + DateNormalizationResult dateNormalizationResult = getNoMatchResult(inputValue); + try { + final Matcher matcher = monthNameDatePattern.getPattern().matcher(inputValue); + if (matcher.matches()) { + final Month month = monthMultilingual.getMonth( + matcher.group(monthNameDatePattern.getDatePartsIndices().getMonthIndex())); + final InstantEdtfDateBuilder instantEdtfDateBuilder = new InstantEdtfDateBuilder( + Integer.parseInt(matcher.group(monthNameDatePattern.getDatePartsIndices().getYearIndex()))) + .withMonth(month.getValue()); + getDayIfPresent(monthNameDatePattern, matcher).ifPresent(instantEdtfDateBuilder::withDay); + final InstantEdtfDate instantEdtfDate = instantEdtfDateBuilder.build(); + dateNormalizationResult = new DateNormalizationResult(DateNormalizationExtractorMatchId.MONTH_NAME, inputValue, + instantEdtfDate); + } + } catch (DateExtractionException e) { + LOGGER.warn("Failed instance extraction!", e); + } + return dateNormalizationResult; + } + + private Optional getDayIfPresent(MonthNameDatePattern monthNameDatePattern, Matcher matcher) { + if (monthNameDatePattern.getDatePartsIndices().getDayIndex() != null) { + return Optional.of(Integer.parseInt(matcher.group(monthNameDatePattern.getDatePartsIndices().getDayIndex()))); + } + return Optional.empty(); + } +} + diff --git a/metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/dateextractors/NumericPartsDateExtractor.java b/metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/extractors/NumericPartsDateExtractor.java similarity index 76% rename from metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/dateextractors/NumericPartsDateExtractor.java rename to metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/extractors/NumericPartsDateExtractor.java 
index 92b972f8ed..f04d123039 100644 --- a/metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/dateextractors/NumericPartsDateExtractor.java +++ b/metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/extractors/NumericPartsDateExtractor.java @@ -1,6 +1,5 @@ -package eu.europeana.normalization.dates.extraction.dateextractors; +package eu.europeana.normalization.dates.extraction.extractors; -import static eu.europeana.normalization.dates.edtf.DateQualification.NO_QUALIFICATION; import static eu.europeana.normalization.dates.edtf.DateQualification.UNCERTAIN; import static java.util.Optional.ofNullable; import static java.util.regex.Pattern.compile; @@ -8,10 +7,11 @@ import eu.europeana.normalization.dates.DateNormalizationResult; import eu.europeana.normalization.dates.YearPrecision; import eu.europeana.normalization.dates.edtf.DateQualification; +import eu.europeana.normalization.dates.edtf.InstantEdtfDate; import eu.europeana.normalization.dates.edtf.InstantEdtfDateBuilder; import eu.europeana.normalization.dates.extraction.DateExtractionException; import eu.europeana.normalization.dates.extraction.NumericPartsPattern; -import eu.europeana.normalization.dates.sanitize.DateFieldSanitizer; +import java.util.EnumSet; import java.util.Locale; import java.util.Set; import java.util.regex.Matcher; @@ -39,9 +39,17 @@ public class NumericPartsDateExtractor extends AbstractDateExtractor { private static final String UNKNOWN_CHARACTERS_REGEX = "[XU?-]"; @Override - public DateNormalizationResult extract(String inputValue, DateQualification requestedDateQualification, - boolean flexibleDateBuild) throws DateExtractionException { - return extract(inputValue, requestedDateQualification, NumericPartsPattern.NUMERIC_SET, flexibleDateBuild); + public DateNormalizationResult extract(String inputValue, boolean allowDayMonthSwap) throws DateExtractionException { + return extract(inputValue, NumericPartsPattern.NUMERIC_SET, 
allowDayMonthSwap); + } + + @Override + public Set getQualification(String inputValue) { + final Set dateQualifications = EnumSet.noneOf(DateQualification.class); + if (STARTING_UNCERTAIN_PATTERN.matcher(inputValue).find() || ENDING_UNCERTAIN_PATTERN.matcher(inputValue).find()) { + dateQualifications.add(UNCERTAIN); + } + return dateQualifications; } /** @@ -49,27 +57,21 @@ public DateNormalizationResult extract(String inputValue, DateQualification requ * * @param inputValue the input value * @param numericPatternValues the patterns to check against - * @param allowSwitchMonthDay allow switching month and day values if month and day original values are not valid + * @param flexibleDateBuild allow switching month and day values if month and day original values are not valid * @return the date normalization result */ - protected DateNormalizationResult extract(String inputValue, DateQualification requestedDateQualification, - Set numericPatternValues, - boolean allowSwitchMonthDay) throws DateExtractionException { - final String sanitizedValue = DateFieldSanitizer.cleanSpacesAndTrim(inputValue); - final DateQualification dateQualification = computeDateQualification(requestedDateQualification, () -> - (STARTING_UNCERTAIN_PATTERN.matcher(sanitizedValue).find() || ENDING_UNCERTAIN_PATTERN.matcher(sanitizedValue).find()) - ? 
UNCERTAIN : NO_QUALIFICATION); - + protected DateNormalizationResult extract(String inputValue, Set numericPatternValues, + boolean flexibleDateBuild) throws DateExtractionException { DateNormalizationResult dateNormalizationResult = DateNormalizationResult.getNoMatchResult(inputValue); for (NumericPartsPattern numericWithMissingPartsPattern : numericPatternValues) { - final Matcher matcher = numericWithMissingPartsPattern.getPattern().matcher(sanitizedValue); + final Matcher matcher = numericWithMissingPartsPattern.getPattern().matcher(inputValue); if (matcher.matches()) { InstantEdtfDateBuilder instantEdtfDateBuilder = extractDateProperty(numericWithMissingPartsPattern, matcher); - dateNormalizationResult = new DateNormalizationResult( - numericWithMissingPartsPattern.getDateNormalizationExtractorMatchId(), inputValue, - instantEdtfDateBuilder.withDateQualification(dateQualification).withFlexibleDateBuild(allowSwitchMonthDay) - .build()); - break; + final InstantEdtfDate instantEdtfDate = instantEdtfDateBuilder.withDateQualification(getQualification(inputValue)) + .withAllowDayMonthSwap(flexibleDateBuild).build(); + dateNormalizationResult = new DateNormalizationResult( + numericWithMissingPartsPattern.getDateNormalizationExtractorMatchId(), inputValue, instantEdtfDate); + break; } } diff --git a/metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/extractors/NumericPartsRangeDateExtractor.java b/metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/extractors/NumericPartsRangeDateExtractor.java new file mode 100644 index 0000000000..bf3077a08c --- /dev/null +++ b/metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/extractors/NumericPartsRangeDateExtractor.java @@ -0,0 +1,93 @@ +package eu.europeana.normalization.dates.extraction.extractors; + +import static eu.europeana.normalization.dates.DateNormalizationExtractorMatchId.NUMERIC_ALL_VARIANTS; +import static 
eu.europeana.normalization.dates.DateNormalizationExtractorMatchId.NUMERIC_ALL_VARIANTS_XX; +import static eu.europeana.normalization.dates.DateNormalizationExtractorMatchId.NUMERIC_RANGE_ALL_VARIANTS; +import static eu.europeana.normalization.dates.DateNormalizationExtractorMatchId.NUMERIC_RANGE_ALL_VARIANTS_XX; + +import eu.europeana.normalization.dates.DateNormalizationExtractorMatchId; +import eu.europeana.normalization.dates.DateNormalizationResult; +import eu.europeana.normalization.dates.DateNormalizationResultStatus; +import eu.europeana.normalization.dates.edtf.DateBoundaryType; +import eu.europeana.normalization.dates.edtf.InstantEdtfDate; +import eu.europeana.normalization.dates.extraction.DateExtractionException; +import eu.europeana.normalization.dates.extraction.NumericPartsPattern; +import eu.europeana.normalization.dates.extraction.NumericPartsPattern.NumericRangeQualifier; +import java.util.List; + +/** + * Patterns for numeric date ranges with variations in the separators of date components. + *

    We reuse the existing {@link NumericPartsDateExtractor} code for the boundaries.

    + */ +public class NumericPartsRangeDateExtractor extends AbstractRangeDateExtractor { + + private static final NumericPartsDateExtractor NUMERIC_WITH_MISSING_PARTS_DATE_EXTRACTOR = new NumericPartsDateExtractor(); + + @Override + public boolean isRangeMatchSuccess(NumericRangeQualifier numericRangeQualifier, DateNormalizationResult startDateResult, + DateNormalizationResult endDateResult) { + return startDateResult.getDateNormalizationResultStatus() == DateNormalizationResultStatus.MATCHED + && endDateResult.getDateNormalizationResultStatus() == DateNormalizationResultStatus.MATCHED + && !areYearsAmbiguous((InstantEdtfDate) startDateResult.getEdtfDate(), (InstantEdtfDate) endDateResult.getEdtfDate(), + numericRangeQualifier); + } + + /** + * Captures the ambiguous case of "198-?". + * + * @param startDate the start date of a range + * @param endDate the end date of the range + * @param numericRangeQualifier the numeric range qualifier + * @return true if the range is ambiguous + */ + private boolean areYearsAmbiguous(InstantEdtfDate startDate, InstantEdtfDate endDate, + NumericRangeQualifier numericRangeQualifier) { + boolean isAmbiguous = false; + if (numericRangeQualifier == NumericRangeQualifier.DASH_RANGE) { + final boolean isStartDeclared = startDate.getDateBoundaryType() == DateBoundaryType.DECLARED; + final boolean isStartThreeDigit = + isStartDeclared && Integer.toString(startDate.getYear().getValue()).matches("\\d{3}"); + if (isStartThreeDigit && endDate.getDateBoundaryType() == DateBoundaryType.OPEN) { + isAmbiguous = true; + } + } + return isAmbiguous; + } + + @Override + public List getRangeDateQualifiers() { + return List.of(NumericRangeQualifier.values()); + } + + @Override + public DateNormalizationResultRangePair extractDateNormalizationResult(String startString, + String endString, NumericRangeQualifier numericRangeQualifier, + boolean allowDayMonthSwap) + throws DateExtractionException { + return new DateNormalizationResultRangePair( + 
extractDate(startString, numericRangeQualifier, allowDayMonthSwap), + extractDate(endString, numericRangeQualifier, allowDayMonthSwap)); + } + + @Override + public DateNormalizationExtractorMatchId getDateNormalizationExtractorId(DateNormalizationResult startDateResult, + DateNormalizationResult endDateResult) { + final boolean isStartXX = startDateResult.getDateNormalizationExtractorMatchId() == NUMERIC_ALL_VARIANTS_XX; + final boolean isEndXX = endDateResult.getDateNormalizationExtractorMatchId() == NUMERIC_ALL_VARIANTS_XX; + return isStartXX || isEndXX ? NUMERIC_RANGE_ALL_VARIANTS_XX : NUMERIC_RANGE_ALL_VARIANTS; + } + + private static DateNormalizationResult extractDate(String dateString, + NumericRangeQualifier numericRangeQualifier, boolean allowDayMonthSwap) + throws DateExtractionException { + final DateNormalizationResult dateNormalizationResult; + if (numericRangeQualifier.getUnspecifiedCharacters() != null && dateString.matches( + numericRangeQualifier.getUnspecifiedCharacters())) { + dateNormalizationResult = new DateNormalizationResult(NUMERIC_ALL_VARIANTS, dateString, InstantEdtfDate.getOpenInstance()); + } else { + dateNormalizationResult = NUMERIC_WITH_MISSING_PARTS_DATE_EXTRACTOR.extract(dateString, + NumericPartsPattern.NUMERIC_RANGE_SET, allowDayMonthSwap); + } + return dateNormalizationResult; + } +} diff --git a/metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/extractors/RangeDateExtractor.java b/metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/extractors/RangeDateExtractor.java new file mode 100644 index 0000000000..4f1a3345c3 --- /dev/null +++ b/metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/extractors/RangeDateExtractor.java @@ -0,0 +1,76 @@ +package eu.europeana.normalization.dates.extraction.extractors; + +import eu.europeana.normalization.dates.DateNormalizationExtractorMatchId; +import eu.europeana.normalization.dates.DateNormalizationResult; +import 
eu.europeana.normalization.dates.extraction.DateExtractionException; +import eu.europeana.normalization.dates.extraction.DatesSeparator; +import java.util.List; + +/** + * The interface for range date extractors. + * + * @param the object containing delimiters/separators for dates + */ +public interface RangeDateExtractor { + + List getRangeDateQualifiers(); + + /** + * Extract the start and end date normalization result pair. + *

    At this stage we only perform the extraction; the range is not verified yet.

    + * + * @param startString the start date string + * @param endString the end date string + * @param rangeDateDelimiters the range date delimiters + * @param allowDayMonthSwap the boolean opting flexible date build + * @return the start and end date result pair + * @throws DateExtractionException if the date extraction failed + */ + DateNormalizationResultRangePair extractDateNormalizationResult( + String startString, String endString, T rangeDateDelimiters, + boolean allowDayMonthSwap) throws DateExtractionException; + + /** + * Checks if a provided date range was successfully extracted + * + * @param rangeDateDelimiters the range date delimiters + * @param startDateResult the extracted start date boundary + * @param endDateResult the extracted end date boundary + * @return the boolean representing a successful date range extraction + */ + boolean isRangeMatchSuccess(T rangeDateDelimiters, DateNormalizationResult startDateResult, + DateNormalizationResult endDateResult); + + /** + * Get the date normalization extractor match identifier from the two date boundaries. + * + * @param startDateResult the start date boundary + * @param endDateResult the end date boundary + * @return the date normalization extractor match identifier + */ + DateNormalizationExtractorMatchId getDateNormalizationExtractorId(DateNormalizationResult startDateResult, + DateNormalizationResult endDateResult); + + /** + * Class wrapping a pair of start and end dates. 
+ */ + class DateNormalizationResultRangePair { + + final DateNormalizationResult startDateNormalizationResult; + final DateNormalizationResult endDateNormalizationResult; + + public DateNormalizationResultRangePair(DateNormalizationResult startDateNormalizationResult, + DateNormalizationResult endDateNormalizationResult) { + this.startDateNormalizationResult = startDateNormalizationResult; + this.endDateNormalizationResult = endDateNormalizationResult; + } + + public DateNormalizationResult getStartDateNormalizationResult() { + return startDateNormalizationResult; + } + + public DateNormalizationResult getEndDateNormalizationResult() { + return endDateNormalizationResult; + } + } +} diff --git a/metis-normalization/src/main/java/eu/europeana/normalization/normalizers/DatesNormalizer.java b/metis-normalization/src/main/java/eu/europeana/normalization/normalizers/DatesNormalizer.java index 0848263726..3c83973772 100644 --- a/metis-normalization/src/main/java/eu/europeana/normalization/normalizers/DatesNormalizer.java +++ b/metis-normalization/src/main/java/eu/europeana/normalization/normalizers/DatesNormalizer.java @@ -8,18 +8,23 @@ import eu.europeana.normalization.dates.edtf.AbstractEdtfDate; import eu.europeana.normalization.dates.edtf.DateQualification; import eu.europeana.normalization.dates.edtf.InstantEdtfDate; -import eu.europeana.normalization.dates.extraction.dateextractors.BriefRangeDateExtractor; -import eu.europeana.normalization.dates.extraction.dateextractors.CenturyDateExtractor; -import eu.europeana.normalization.dates.extraction.dateextractors.DateExtractor; -import eu.europeana.normalization.dates.extraction.dateextractors.DcmiPeriodDateExtractor; -import eu.europeana.normalization.dates.extraction.dateextractors.DecadeDateExtractor; -import eu.europeana.normalization.dates.extraction.dateextractors.EdtfDateExtractor; -import eu.europeana.normalization.dates.extraction.dateextractors.NumericPartsDateExtractor; -import 
eu.europeana.normalization.dates.extraction.dateextractors.NumericPartsRangeDateExtractor; -import eu.europeana.normalization.dates.extraction.dateextractors.PatternBcAdDateExtractor; -import eu.europeana.normalization.dates.extraction.dateextractors.PatternFormatedFullDateDateExtractor; -import eu.europeana.normalization.dates.extraction.dateextractors.PatternLongNegativeYearDateExtractor; -import eu.europeana.normalization.dates.extraction.dateextractors.PatternMonthNameDateExtractor; +import eu.europeana.normalization.dates.extraction.extractors.BcAdDateExtractor; +import eu.europeana.normalization.dates.extraction.extractors.BcAdRangeDateExtractor; +import eu.europeana.normalization.dates.extraction.extractors.BriefRangeDateExtractor; +import eu.europeana.normalization.dates.extraction.extractors.CenturyNumericDateExtractor; +import eu.europeana.normalization.dates.extraction.extractors.CenturyRomanDateExtractor; +import eu.europeana.normalization.dates.extraction.extractors.CenturyRomanRangeDateExtractor; +import eu.europeana.normalization.dates.extraction.extractors.DateExtractor; +import eu.europeana.normalization.dates.extraction.extractors.DcmiPeriodDateExtractor; +import eu.europeana.normalization.dates.extraction.extractors.DecadeDateExtractor; +import eu.europeana.normalization.dates.extraction.extractors.EdtfDateExtractor; +import eu.europeana.normalization.dates.extraction.extractors.EdtfRangeDateExtractor; +import eu.europeana.normalization.dates.extraction.extractors.FullDateDateExtractor; +import eu.europeana.normalization.dates.extraction.extractors.LongNegativeYearDateExtractor; +import eu.europeana.normalization.dates.extraction.extractors.LongNegativeYearRangeDateExtractor; +import eu.europeana.normalization.dates.extraction.extractors.MonthNameDateExtractor; +import eu.europeana.normalization.dates.extraction.extractors.NumericPartsDateExtractor; +import eu.europeana.normalization.dates.extraction.extractors.NumericPartsRangeDateExtractor; 
import eu.europeana.normalization.dates.sanitize.DateFieldSanitizer; import eu.europeana.normalization.dates.sanitize.SanitizeOperation; import eu.europeana.normalization.dates.sanitize.SanitizedDate; @@ -29,10 +34,12 @@ import eu.europeana.normalization.util.NormalizationException; import eu.europeana.normalization.util.XmlUtil; import eu.europeana.normalization.util.XpathQuery; +import java.lang.invoke.MethodHandles; import java.net.URLEncoder; import java.nio.charset.StandardCharsets; import java.util.List; import java.util.Optional; +import java.util.function.BiFunction; import java.util.function.Function; import java.util.function.Predicate; import java.util.stream.Collectors; @@ -40,6 +47,8 @@ import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.tuple.ImmutablePair; import org.apache.commons.lang3.tuple.Pair; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import org.w3c.dom.Attr; import org.w3c.dom.Document; import org.w3c.dom.Element; @@ -56,13 +65,15 @@ */ public class DatesNormalizer implements RecordNormalizeAction { + private static final Logger LOGGER = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); + private static final Namespace.Element EDM_PROVIDED_CHO = Namespace.EDM.getElement("ProvidedCHO"); private static final Namespace.Element EDM_WEB_RESOURCE = Namespace.EDM.getElement("WebResource"); private static final Namespace.Element EDM_AGENT = Namespace.EDM.getElement("Agent"); private static final Namespace.Element EDM_PLACE = Namespace.EDM.getElement("Place"); private static final Namespace.Element EDM_TIMESPAN = Namespace.EDM.getElement("TimeSpan"); private static final Namespace.Element RDF_ABOUT = Namespace.RDF.getElement("about"); - private static final Namespace.Element SKOS_PREFLABEL = Namespace.SKOS.getElement("prefLabel"); + private static final Namespace.Element SKOS_PREF_LABEL = Namespace.SKOS.getElement("prefLabel"); private static final Namespace.Element XML_LANG = 
Namespace.XML.getElement("lang"); private static final Namespace.Element SKOS_NOTATION = Namespace.SKOS.getElement("notation"); private static final Namespace.Element SKOS_NOTE = Namespace.SKOS.getElement("note"); @@ -70,7 +81,7 @@ public class DatesNormalizer implements RecordNormalizeAction { private static final Namespace.Element RDF_RESOURCE = Namespace.RDF.getElement("resource"); private static final Namespace.Element EDM_BEGIN = Namespace.EDM.getElement("begin"); private static final Namespace.Element EDM_END = Namespace.EDM.getElement("end"); - private static final Namespace.Element DCTERMS_ISPARTOF = Namespace.DCTERMS.getElement("isPartOf"); + private static final Namespace.Element DC_TERMS_IS_PART_OF = Namespace.DCTERMS.getElement("isPartOf"); private static final Namespace.Element ORE_PROXY = Namespace.ORE.getElement("Proxy"); private static final Namespace.Element EDM_EUROPEANA_PROXY = Namespace.EDM.getElement("europeanaProxy"); @@ -118,33 +129,42 @@ public DatesNormalizer() { extractorsInOrderForDateProperties = List.of( new BriefRangeDateExtractor(), new EdtfDateExtractor(), - new CenturyDateExtractor(), + new EdtfRangeDateExtractor(), + new CenturyNumericDateExtractor(), + new CenturyRomanDateExtractor(), + new CenturyRomanRangeDateExtractor(), new DecadeDateExtractor(), new NumericPartsRangeDateExtractor(), new NumericPartsDateExtractor(), new DcmiPeriodDateExtractor(), - new PatternMonthNameDateExtractor(), - new PatternFormatedFullDateDateExtractor(), - new PatternBcAdDateExtractor(), - new PatternLongNegativeYearDateExtractor()); + new MonthNameDateExtractor(), + new FullDateDateExtractor(), + new BcAdDateExtractor(), + new BcAdRangeDateExtractor(), + new LongNegativeYearDateExtractor(), + new LongNegativeYearRangeDateExtractor()); extractorsInOrderForGenericProperties = extractorsInOrderForDateProperties.stream() - .filter( - not(BriefRangeDateExtractor.class::isInstance)) - .collect(Collectors.toList()); + 
.filter(not(BriefRangeDateExtractor.class::isInstance)).collect(Collectors.toList()); normalizationOperationsInOrderDateProperty = List.of( - input -> normalizeInput(extractorsInOrderForDateProperties, input, DateQualification.NO_QUALIFICATION), - input -> normalizeInput(extractorsInOrderForDateProperties, input, dateFieldSanitizer::sanitize1stTimeDateProperty, - SanitizeOperation::isApproximateSanitizeOperationForDateProperty), - input -> normalizeInput(extractorsInOrderForDateProperties, input, dateFieldSanitizer::sanitize2ndTimeDateProperty, - SanitizeOperation::isApproximateSanitizeOperationForDateProperty)); + input -> normalizeInput(extractorsInOrderForDateProperties, input), + input -> normalizeInputSanitized(extractorsInOrderForDateProperties, input, + dateFieldSanitizer::sanitize1stTimeDateProperty, + SanitizeOperation::isApproximateSanitizeOperationForDateProperty, + (dateExtractors, sanitizedDate) -> normalizeInput(dateExtractors, sanitizedDate.getSanitizedDateString())), + input -> normalizeInputSanitized(extractorsInOrderForDateProperties, input, + dateFieldSanitizer::sanitize2ndTimeDateProperty, + SanitizeOperation::isApproximateSanitizeOperationForDateProperty, + (dateExtractors, sanitizedDate) -> normalizeInput(dateExtractors, sanitizedDate.getSanitizedDateString()))); normalizationOperationsInOrderGenericProperty = List.of( - input -> normalizeInputGeneric(extractorsInOrderForGenericProperties, input, DateQualification.NO_QUALIFICATION), - input -> normalizeInputGeneric(extractorsInOrderForGenericProperties, input, - dateFieldSanitizer::sanitizeGenericProperty, SanitizeOperation::isApproximateSanitizeOperationForGenericProperty)); + input -> normalizeInputGeneric(extractorsInOrderForGenericProperties, input), + input -> normalizeInputSanitized(extractorsInOrderForGenericProperties, input, + dateFieldSanitizer::sanitizeGenericProperty, + SanitizeOperation::isApproximateSanitizeOperationForGenericProperty, + (dateExtractors, sanitizedDate) -> 
normalizeInputGeneric(dateExtractors, sanitizedDate.getSanitizedDateString()))); } private static Pair getProxySubtagQuery(Namespace.Element subtag) { @@ -160,10 +180,8 @@ public NormalizationReport normalize(Document document) throws NormalizationExce // Perform the two different kinds of normalizations final InternalNormalizationReport report = new InternalNormalizationReport(); - report.mergeWith(normalizeElements(document, europeanaProxy, DATE_PROPERTY_FIELDS, - this::normalizeDateProperty)); - report.mergeWith(normalizeElements(document, europeanaProxy, GENERIC_PROPERTY_FIELDS, - this::normalizeGenericProperty)); + report.mergeWith(normalizeElements(document, europeanaProxy, DATE_PROPERTY_FIELDS, this::normalizeDateProperty)); + report.mergeWith(normalizeElements(document, europeanaProxy, GENERIC_PROPERTY_FIELDS, this::normalizeGenericProperty)); return report; } @@ -194,6 +212,7 @@ private void normalizeElement(Document document, Element element, Namespace.Elem final String elementText = XmlUtil.getElementText(element); final DateNormalizationResult dateNormalizationResult = normalizationFunction.apply(elementText); if (dateNormalizationResult.getDateNormalizationResultStatus() == NO_MATCH) { + LOGGER.debug("Normalization did not find a match"); return; } @@ -209,10 +228,10 @@ private void normalizeElement(Document document, Element element, Namespace.Elem final Element reference = XmlUtil.createElement(elementType, europeanaProxy, List.of()); final String fullResourceName = XmlUtil.getPrefixedElementName(RDF_RESOURCE, reference.lookupPrefix(RDF_RESOURCE.getNamespace().getUri())); - final Attr dctermsIsPartOfResource = document.createAttributeNS( + final Attr dcTermsIsPartOfResource = document.createAttributeNS( RDF_RESOURCE.getNamespace().getUri(), fullResourceName); - dctermsIsPartOfResource.setValue(timespanId); - reference.setAttributeNode(dctermsIsPartOfResource); + dcTermsIsPartOfResource.setValue(timespanId); + 
reference.setAttributeNode(dcTermsIsPartOfResource); // Update the report. report.increment(this.getClass().getSimpleName(), ConfidenceLevel.CERTAIN); @@ -256,58 +275,31 @@ private DateNormalizationResult normalizeProperty( return dateNormalizationResult; } - private DateNormalizationResult normalizeInput(List dateExtractors, String inputDate, - DateQualification dateQualification) { - return dateExtractors.stream().map( - dateExtractor -> dateExtractor.extractDateProperty(inputDate, dateQualification)) + private DateNormalizationResult normalizeInput(List dateExtractors, String inputDate) { + return dateExtractors.stream().map(dateExtractor -> dateExtractor.extractDateProperty(inputDate)) .filter(dateNormalizationResult -> dateNormalizationResult.getDateNormalizationResultStatus() == MATCHED).findFirst() .orElse(DateNormalizationResult.getNoMatchResult(inputDate)); } - private DateNormalizationResult normalizeInputGeneric(List dateExtractors, String input, - DateQualification dateQualification) { - return dateExtractors.stream().map(dateExtractor -> dateExtractor.extractGenericProperty(input, dateQualification)) + private DateNormalizationResult normalizeInputGeneric(List dateExtractors, String input) { + return dateExtractors.stream().map(dateExtractor -> dateExtractor.extractGenericProperty(input)) .filter(dateNormalizationResult -> dateNormalizationResult.getDateNormalizationResultStatus() == MATCHED).findFirst() .orElse(DateNormalizationResult.getNoMatchResult(input)); } - private DateNormalizationResult normalizeInput(List dateExtractors, String input, - Function sanitizeFunction, Predicate checkIfApproximateCleanOperationId) { + private DateNormalizationResult normalizeInputSanitized(List dateExtractors, String input, + Function sanitizeFunction, Predicate checkIfApproximateCleanOperationId, + BiFunction, SanitizedDate, DateNormalizationResult> normalizeFunction) { final SanitizedDate sanitizedDate = sanitizeFunction.apply(input); DateNormalizationResult 
dateNormalizationResult = DateNormalizationResult.getNoMatchResult(input); if (sanitizedDate != null && StringUtils.isNotEmpty(sanitizedDate.getSanitizedDateString())) { - final DateQualification dateQualification; - if (checkIfApproximateCleanOperationId.test(sanitizedDate.getSanitizeOperation())) { - dateQualification = DateQualification.APPROXIMATE; - } else { - dateQualification = DateQualification.NO_QUALIFICATION; - } - dateNormalizationResult = normalizeInput(dateExtractors, sanitizedDate.getSanitizedDateString(), dateQualification); - - if (dateNormalizationResult.getDateNormalizationResultStatus() == MATCHED) { - //Re-create result containing sanitization operation. - dateNormalizationResult = new DateNormalizationResult(dateNormalizationResult, sanitizedDate.getSanitizeOperation()); - } - } - return dateNormalizationResult; - } - - private DateNormalizationResult normalizeInputGeneric(List dateExtractors, String input, - Function sanitizeFunction, Predicate checkIfApproximateCleanOperationId) { - final SanitizedDate sanitizedDate = sanitizeFunction.apply(input); - DateNormalizationResult dateNormalizationResult = DateNormalizationResult.getNoMatchResult(input); - if (sanitizedDate != null && StringUtils.isNotEmpty(sanitizedDate.getSanitizedDateString())) { - if (checkIfApproximateCleanOperationId.test(sanitizedDate.getSanitizeOperation())) { - dateNormalizationResult = normalizeInputGeneric(dateExtractors, sanitizedDate.getSanitizedDateString(), - DateQualification.APPROXIMATE); - } else { - dateNormalizationResult = normalizeInputGeneric(dateExtractors, sanitizedDate.getSanitizedDateString(), - DateQualification.NO_QUALIFICATION); - } - + dateNormalizationResult = normalizeFunction.apply(dateExtractors, sanitizedDate); if (dateNormalizationResult.getDateNormalizationResultStatus() == MATCHED) { + if (checkIfApproximateCleanOperationId.test(sanitizedDate.getSanitizeOperation())) { + 
dateNormalizationResult.getEdtfDate().addQualification(DateQualification.APPROXIMATE); + } //Re-create result containing sanitization operation. dateNormalizationResult = new DateNormalizationResult(dateNormalizationResult, sanitizedDate.getSanitizeOperation()); } @@ -362,7 +354,7 @@ private void appendTimespanEntity(Document document, AbstractEdtfDate edtfDate, timeSpan.setAttributeNode(rdfAbout); // Create and add skosPrefLabel to timespan - final Element skosPrefLabel = XmlUtil.createElement(SKOS_PREFLABEL, timeSpan, null); + final Element skosPrefLabel = XmlUtil.createElement(SKOS_PREF_LABEL, timeSpan, null); if (StringUtils.isNotBlank(edtfDate.getLabel())) { skosPrefLabel.setNodeValue(edtfDate.getLabel()); skosPrefLabel.appendChild(document.createTextNode(edtfDate.getLabel())); @@ -376,11 +368,11 @@ private void appendTimespanEntity(Document document, AbstractEdtfDate edtfDate, } // Create and add skosNote elements to timespan in case of approximate or uncertain dates. - if (edtfDate.getDateQualification() == DateQualification.APPROXIMATE) { + if (edtfDate.getDateQualifications().contains(DateQualification.APPROXIMATE)) { final Element skosNote = XmlUtil.createElement(SKOS_NOTE, timeSpan, null); skosNote.appendChild(document.createTextNode("approximate")); } - if (edtfDate.getDateQualification() == DateQualification.UNCERTAIN) { + if (edtfDate.getDateQualifications().contains(DateQualification.UNCERTAIN)) { final Element skosNote = XmlUtil.createElement(SKOS_NOTE, timeSpan, null); skosNote.appendChild(document.createTextNode("uncertain")); } @@ -407,7 +399,7 @@ private void appendTimespanEntity(Document document, AbstractEdtfDate edtfDate, final String fullResourceName = XmlUtil.getPrefixedElementName(RDF_RESOURCE, timeSpan.lookupPrefix(RDF_RESOURCE.getNamespace().getUri())); for (int century = Math.max(1, startCentury); century <= Math.max(0, endCentury); century++) { - final Element dctermsIsPartOf = XmlUtil.createElement(DCTERMS_ISPARTOF, timeSpan, null); + 
final Element dctermsIsPartOf = XmlUtil.createElement(DC_TERMS_IS_PART_OF, timeSpan, null); final Attr dctermsIsPartOfResource = document.createAttributeNS(RDF_RESOURCE.getNamespace().getUri(), fullResourceName); dctermsIsPartOfResource.setValue("http://data.europeana.eu/timespan/" + century); dctermsIsPartOf.setAttributeNode(dctermsIsPartOfResource); diff --git a/metis-normalization/src/test/java/eu/europeana/normalization/dates/extraction/dateextractors/BriefRangeDateExtractorTest.java b/metis-normalization/src/test/java/eu/europeana/normalization/dates/extraction/dateextractors/BriefRangeDateExtractorTest.java deleted file mode 100644 index 4d0334b4be..0000000000 --- a/metis-normalization/src/test/java/eu/europeana/normalization/dates/extraction/dateextractors/BriefRangeDateExtractorTest.java +++ /dev/null @@ -1,88 +0,0 @@ -package eu.europeana.normalization.dates.extraction.dateextractors; - -import static eu.europeana.normalization.dates.edtf.DateBoundaryType.OPEN; -import static eu.europeana.normalization.dates.edtf.DateBoundaryType.UNKNOWN; -import static eu.europeana.normalization.dates.edtf.DateQualification.APPROXIMATE; -import static eu.europeana.normalization.dates.edtf.DateQualification.NO_QUALIFICATION; -import static eu.europeana.normalization.dates.edtf.DateQualification.UNCERTAIN; -import static eu.europeana.normalization.dates.edtf.DateQualification.UNCERTAIN_APPROXIMATE; -import static eu.europeana.normalization.dates.edtf.IntervalEdtfDate.DATE_INTERVAL_SEPARATOR; -import static org.junit.jupiter.api.Assertions.assertEquals; -import static org.junit.jupiter.params.provider.Arguments.of; - -import eu.europeana.normalization.dates.DateNormalizationResult; -import eu.europeana.normalization.dates.DateNormalizationResultStatus; -import eu.europeana.normalization.dates.edtf.AbstractEdtfDate; -import eu.europeana.normalization.dates.edtf.InstantEdtfDate; -import eu.europeana.normalization.dates.edtf.IntervalEdtfDate; -import java.util.stream.Stream; 
-import org.junit.jupiter.params.ParameterizedTest; -import org.junit.jupiter.params.provider.Arguments; -import org.junit.jupiter.params.provider.MethodSource; - -class BriefRangeDateExtractorTest { - - private final BriefRangeDateExtractor briefRangeDateExtractor = new BriefRangeDateExtractor(); - - private void assertExtract(String input, String expected) { - final DateNormalizationResult dateNormalizationResult = briefRangeDateExtractor.extractDateProperty(input, NO_QUALIFICATION); - if (expected == null) { - assertEquals(DateNormalizationResultStatus.NO_MATCH, dateNormalizationResult.getDateNormalizationResultStatus()); - } else { - AbstractEdtfDate edtfDate = dateNormalizationResult.getEdtfDate(); - if (edtfDate instanceof IntervalEdtfDate) { - String startPart = expected.substring(0, expected.indexOf(DATE_INTERVAL_SEPARATOR)); - String endPart = expected.substring(expected.indexOf(DATE_INTERVAL_SEPARATOR) + 1); - InstantEdtfDate start = ((IntervalEdtfDate) edtfDate).getStart(); - InstantEdtfDate end = ((IntervalEdtfDate) edtfDate).getEnd(); - assertEdtfDate(startPart, start); - assertEdtfDate(endPart, end); - } else { - assertEdtfDate(expected, (InstantEdtfDate) dateNormalizationResult.getEdtfDate()); - } - assertEquals(expected, edtfDate.toString()); - } - } - - private static void assertEdtfDate(String expected, InstantEdtfDate instantEdtfDate) { - assertEquals(expected.contains("?"), instantEdtfDate.getDateQualification() == UNCERTAIN); - assertEquals(expected.contains("~"), instantEdtfDate.getDateQualification() == APPROXIMATE); - assertEquals(expected.contains("%"), instantEdtfDate.getDateQualification() == UNCERTAIN_APPROXIMATE); - assertEquals(expected.equals(OPEN.getSerializedRepresentation()), - instantEdtfDate.getDateBoundaryType() == OPEN || instantEdtfDate.getDateBoundaryType() == UNKNOWN); - } - - @ParameterizedTest - @MethodSource - void extractBrief(String input, String expected) { - assertExtract(input, expected); - } - - private static 
Stream extractBrief() { - return Stream.of( - of("1989/90", "1989/1990"), - of("1989/90?", "1989?/1990?"), - of("1989-90", "1989/1990"), - of("1989-90?", "1989?/1990?"), - of("1900-13", "1900/1913"), - - //End date lower rightmost two digits than start year - of("1989/89", null), - of("1989/88", null), - of("1989-89", null), - of("1989-88", null), - - //More than two digits on end year not allowed - of("1989/990", null), - of("1989-990", null), - - //End year cannot be lower or equal than 12 - of("1900/01", null), - of("1900-12", null), - - //Less than three digits on start year - of("89-90", null) - ); - } - -} \ No newline at end of file diff --git a/metis-normalization/src/test/java/eu/europeana/normalization/dates/extraction/dateextractors/CenturyDateExtractorTest.java b/metis-normalization/src/test/java/eu/europeana/normalization/dates/extraction/dateextractors/CenturyDateExtractorTest.java deleted file mode 100644 index 1a8df29669..0000000000 --- a/metis-normalization/src/test/java/eu/europeana/normalization/dates/extraction/dateextractors/CenturyDateExtractorTest.java +++ /dev/null @@ -1,216 +0,0 @@ -package eu.europeana.normalization.dates.extraction.dateextractors; - -import static eu.europeana.normalization.dates.DateNormalizationExtractorMatchId.CENTURY_NUMERIC; -import static eu.europeana.normalization.dates.DateNormalizationExtractorMatchId.CENTURY_RANGE_ROMAN; -import static eu.europeana.normalization.dates.DateNormalizationExtractorMatchId.CENTURY_ROMAN; -import static eu.europeana.normalization.dates.edtf.DateQualification.NO_QUALIFICATION; -import static org.junit.jupiter.api.Assertions.assertEquals; -import static org.junit.jupiter.params.provider.Arguments.of; - -import eu.europeana.normalization.dates.DateNormalizationExtractorMatchId; -import eu.europeana.normalization.dates.DateNormalizationResult; -import eu.europeana.normalization.dates.DateNormalizationResultStatus; -import eu.europeana.normalization.dates.edtf.DateQualification; -import 
java.util.stream.Stream; -import org.junit.jupiter.params.ParameterizedTest; -import org.junit.jupiter.params.provider.Arguments; -import org.junit.jupiter.params.provider.MethodSource; - -class CenturyDateExtractorTest { - private static final CenturyDateExtractor CENTURY_DATE_EXTRACTOR = new CenturyDateExtractor(); - - void assertExtract(String input, String expected, DateNormalizationExtractorMatchId dateNormalizationExtractorMatchId) { - final DateNormalizationResult dateNormalizationResult = CENTURY_DATE_EXTRACTOR.extractDateProperty(input, NO_QUALIFICATION); - if (expected == null) { - assertEquals(DateNormalizationResultStatus.NO_MATCH, dateNormalizationResult.getDateNormalizationResultStatus()); - } else { - final String actual = dateNormalizationResult.getEdtfDate().toString(); - assertEquals(expected, actual); - assertEquals(actual.contains("?"), - dateNormalizationResult.getEdtfDate().getDateQualification() == DateQualification.UNCERTAIN); - assertEquals(dateNormalizationExtractorMatchId, dateNormalizationResult.getDateNormalizationExtractorMatchId()); - } - } - - @ParameterizedTest - @MethodSource("extractNumericData") - void extractNumeric(String input, String expected, DateNormalizationExtractorMatchId dateNormalizationExtractorMatchId) { - assertExtract(input, expected, dateNormalizationExtractorMatchId); - } - - @ParameterizedTest - @MethodSource("extractRomanData") - void extractRoman(String input, String expected, DateNormalizationExtractorMatchId dateNormalizationExtractorMatchId) { - assertExtract(input, expected, dateNormalizationExtractorMatchId); - } - - private static Stream extractNumericData() { - return Stream.of( - //PATTERN_YYYY - of("18..", "18XX", CENTURY_NUMERIC), - of(" 18.. 
", "18XX", CENTURY_NUMERIC), - of("?18..", "18XX?", CENTURY_NUMERIC), - of("18..?", "18XX?", CENTURY_NUMERIC), - of("?18..?", "18XX?", CENTURY_NUMERIC), - of("192?", null, null, null), //Too many digits - of("1..", null, null, null), //Too few digits - - //PATTERN_ENGLISH - of("1st century", "00XX", CENTURY_NUMERIC), - of("2nd century", "01XX", CENTURY_NUMERIC), - of("3rd century", "02XX", CENTURY_NUMERIC), - of("11th century", "10XX", CENTURY_NUMERIC), - of(" 11th century ", "10XX", CENTURY_NUMERIC), - of("?11th century", "10XX?", CENTURY_NUMERIC), - of("11th century?", "10XX?", CENTURY_NUMERIC), - of("?11th century?", "10XX?", CENTURY_NUMERIC), - of("12th century BC", null, null, null), // not supported - of("[10th century]", null, null, null), // not supported - of("11thcentury", null, null, null), //Incorrect spacing numeric - of("11st century", null, null, null), //Incorrect suffix - of("12rd century", null, null, null), //Incorrect suffix - of("13st century", null, null, null), //Incorrect suffix - of("21th century", null, null, null), //Incorrect suffix - of("0st century", null, null, null), //Out of range - of("22nd century", null, null, null) //Out of range - ); - } - - private static Stream extractRomanData() { - return Stream.of( - //PATTERN_ROMAN - //Uppercase - of("I", "00XX", CENTURY_ROMAN), - of("IV", "03XX", CENTURY_ROMAN), - of("V", "04XX", CENTURY_ROMAN), - of("VI", "05XX", CENTURY_ROMAN), - of("IX", "08XX", CENTURY_ROMAN), - of("X", "09XX", CENTURY_ROMAN), - of("XI", "10XX", CENTURY_ROMAN), - of("XIV", "13XX", CENTURY_ROMAN), - of("XV", "14XX", CENTURY_ROMAN), - of("XVI", "15XX", CENTURY_ROMAN), - of("XIX", "18XX", CENTURY_ROMAN), - of("XX", "19XX", CENTURY_ROMAN), - of("XXI", "20XX", CENTURY_ROMAN), - - //Lower case - of("i", "00XX", CENTURY_ROMAN), - of("iv", "03XX", CENTURY_ROMAN), - of("v", "04XX", CENTURY_ROMAN), - of("vi", "05XX", CENTURY_ROMAN), - of("ix", "08XX", CENTURY_ROMAN), - of("x", "09XX", CENTURY_ROMAN), - of("xi", "10XX", 
CENTURY_ROMAN), - of("xiv", "13XX", CENTURY_ROMAN), - of("xv", "14XX", CENTURY_ROMAN), - of("xvi", "15XX", CENTURY_ROMAN), - of("xix", "18XX", CENTURY_ROMAN), - of("xx", "19XX", CENTURY_ROMAN), - of("xxi", "20XX", CENTURY_ROMAN), - - //Prefixes - of("s I", "00XX", CENTURY_ROMAN), - of("s. I", "00XX", CENTURY_ROMAN), - of("S I", "00XX", CENTURY_ROMAN), - of("S.I", "00XX", CENTURY_ROMAN), - of("sec.I", "00XX", CENTURY_ROMAN), - of("SEC.I", "00XX", CENTURY_ROMAN), - of("sec. I", "00XX", CENTURY_ROMAN), - of("SEC. I", "00XX", CENTURY_ROMAN), - of("saec.I", "00XX", CENTURY_ROMAN), - of("SAEC.I", "00XX", CENTURY_ROMAN), - of("saec. I", "00XX", CENTURY_ROMAN), - of("SAEC. I", "00XX", CENTURY_ROMAN), - //Other possibilities and uncertain - of("Ii", "01XX", CENTURY_ROMAN), - of(" s I ", "00XX", CENTURY_ROMAN), - of("?s. I", "00XX?", CENTURY_ROMAN), - of("sec. I?", "00XX?", CENTURY_ROMAN), - of("?saec. I?", "00XX?", CENTURY_ROMAN), - of(" I ", "00XX", CENTURY_ROMAN), - of("?I", "00XX?", CENTURY_ROMAN), - of("I?", "00XX?", CENTURY_ROMAN), - of("?I?", "00XX?", CENTURY_ROMAN), - //Non matches - of("saecI", null, null), //Without a dot a space is required - of("secI", null, null), //Without a dot a space is required - of("MDCLXX", null, null, null), // Not supported range - of("IXX", null, null, null), // Invalid roman - - //PATTERN_ROMAN_RANGE - //Uppercase - of("I-II", "00XX/01XX", CENTURY_RANGE_ROMAN), - of("II-III", "01XX/02XX", CENTURY_RANGE_ROMAN), - of("III-IV", "02XX/03XX", CENTURY_RANGE_ROMAN), - of("IV-V", "03XX/04XX", CENTURY_RANGE_ROMAN), - of("V-VI", "04XX/05XX", CENTURY_RANGE_ROMAN), - of("VI-VII", "05XX/06XX", CENTURY_RANGE_ROMAN), - of("VII-VIII", "06XX/07XX", CENTURY_RANGE_ROMAN), - of("VIII-IX", "07XX/08XX", CENTURY_RANGE_ROMAN), - of("IX-X", "08XX/09XX", CENTURY_RANGE_ROMAN), - of("X-XI", "09XX/10XX", CENTURY_RANGE_ROMAN), - of("XI-XII", "10XX/11XX", CENTURY_RANGE_ROMAN), - of("XII-XIII", "11XX/12XX", CENTURY_RANGE_ROMAN), - of("XIII-XIV", "12XX/13XX", 
CENTURY_RANGE_ROMAN), - of("XIV-XV", "13XX/14XX", CENTURY_RANGE_ROMAN), - of("XV-XVI", "14XX/15XX", CENTURY_RANGE_ROMAN), - of("XVI-XVII", "15XX/16XX", CENTURY_RANGE_ROMAN), - of("XVII-XVIII", "16XX/17XX", CENTURY_RANGE_ROMAN), - of("XVIII-XIX", "17XX/18XX", CENTURY_RANGE_ROMAN), - of("XIX-XX", "18XX/19XX", CENTURY_RANGE_ROMAN), - of("XX-XXI", "19XX/20XX", CENTURY_RANGE_ROMAN), - - //Lowercase - of("i-ii", "00XX/01XX", CENTURY_RANGE_ROMAN), - of("ii-iii", "01XX/02XX", CENTURY_RANGE_ROMAN), - of("iii-iv", "02XX/03XX", CENTURY_RANGE_ROMAN), - of("iv-v", "03XX/04XX", CENTURY_RANGE_ROMAN), - of("v-vi", "04XX/05XX", CENTURY_RANGE_ROMAN), - of("vi-vii", "05XX/06XX", CENTURY_RANGE_ROMAN), - of("vii-viii", "06XX/07XX", CENTURY_RANGE_ROMAN), - of("viii-ix", "07XX/08XX", CENTURY_RANGE_ROMAN), - of("ix-x", "08XX/09XX", CENTURY_RANGE_ROMAN), - of("x-xi", "09XX/10XX", CENTURY_RANGE_ROMAN), - of("xi-xii", "10XX/11XX", CENTURY_RANGE_ROMAN), - of("xii-xiii", "11XX/12XX", CENTURY_RANGE_ROMAN), - of("xiii-xiv", "12XX/13XX", CENTURY_RANGE_ROMAN), - of("xiv-xv", "13XX/14XX", CENTURY_RANGE_ROMAN), - of("xv-xvi", "14XX/15XX", CENTURY_RANGE_ROMAN), - of("xvi-xvii", "15XX/16XX", CENTURY_RANGE_ROMAN), - of("xvii-xviii", "16XX/17XX", CENTURY_RANGE_ROMAN), - of("xviii-xix", "17XX/18XX", CENTURY_RANGE_ROMAN), - of("xix-xx", "18XX/19XX", CENTURY_RANGE_ROMAN), - of("xx-xxi", "19XX/20XX", CENTURY_RANGE_ROMAN), - - //Prefixes - of("s I-II", "00XX/01XX", CENTURY_RANGE_ROMAN), - of("S I-II", "00XX/01XX", CENTURY_RANGE_ROMAN), - of("s. I-II", "00XX/01XX", CENTURY_RANGE_ROMAN), - of("S. I-II", "00XX/01XX", CENTURY_RANGE_ROMAN), - of("sec.IV-VII", "03XX/06XX", CENTURY_RANGE_ROMAN), - of("SEC.IV-VII", "03XX/06XX", CENTURY_RANGE_ROMAN), - of("sec. IV-VII", "03XX/06XX", CENTURY_RANGE_ROMAN), - of("SEC. IV-VII", "03XX/06XX", CENTURY_RANGE_ROMAN), - of("saec.VII-XVIII", "06XX/17XX", CENTURY_RANGE_ROMAN), - of("SAEC.VII-XVIII", "06XX/17XX", CENTURY_RANGE_ROMAN), - of("saec. 
XVI-XVIII", "15XX/17XX", CENTURY_RANGE_ROMAN), - of("SAEC. XVI-XVIII", "15XX/17XX", CENTURY_RANGE_ROMAN), - - //Other possibilities and uncertain - of("s I-iI", "00XX/01XX", CENTURY_RANGE_ROMAN), - of(" s I-II ", "00XX/01XX", CENTURY_RANGE_ROMAN), - of("?saec.X-XVIII", "09XX?/17XX?", CENTURY_RANGE_ROMAN), - of("X-XVIII?", "09XX?/17XX?", CENTURY_RANGE_ROMAN), - of("?saec.X-XVIII?", "09XX?/17XX?", CENTURY_RANGE_ROMAN), - - //Non matches - of("S. XIIII-XIIIV", null, null), //Invalid roman - of("S. XVIII-", null, null, null), //Open-ended incorrect - of("sII-V", null, null), //Without a dot a space is required - of("secVI-XVII", null, null), //Without a dot a space is required - of("saecX-XVIII?", null, null) //Without a dot a space is required - ); - } - -} \ No newline at end of file diff --git a/metis-normalization/src/test/java/eu/europeana/normalization/dates/extraction/dateextractors/DcmiPeriodDateExtractorTest.java b/metis-normalization/src/test/java/eu/europeana/normalization/dates/extraction/dateextractors/DcmiPeriodDateExtractorTest.java deleted file mode 100644 index 74670ea253..0000000000 --- a/metis-normalization/src/test/java/eu/europeana/normalization/dates/extraction/dateextractors/DcmiPeriodDateExtractorTest.java +++ /dev/null @@ -1,119 +0,0 @@ -package eu.europeana.normalization.dates.extraction.dateextractors; - -import static eu.europeana.normalization.dates.DateNormalizationExtractorMatchId.DCMI_PERIOD; -import static eu.europeana.normalization.dates.edtf.DateQualification.NO_QUALIFICATION; -import static org.junit.jupiter.api.Assertions.assertEquals; -import static org.junit.jupiter.params.provider.Arguments.of; - -import eu.europeana.normalization.dates.DateNormalizationResult; -import eu.europeana.normalization.dates.DateNormalizationResultStatus; -import eu.europeana.normalization.dates.edtf.IntervalEdtfDate; -import java.util.stream.Stream; -import org.junit.jupiter.api.DisplayName; -import org.junit.jupiter.params.ParameterizedTest; -import 
org.junit.jupiter.params.provider.Arguments; -import org.junit.jupiter.params.provider.MethodSource; - -/** - * Unit tests for {@link DcmiPeriodDateExtractor} class - */ -class DcmiPeriodDateExtractorTest { - - @ParameterizedTest - @MethodSource("extractData") - @DisplayName("Extract DCMI Period") - void extract(String actualDcmiPeriod, String expectedLabel, String expectedStartDate, String expectedEndDate) { - DcmiPeriodDateExtractor periodDateExtractor = new DcmiPeriodDateExtractor(); - DateNormalizationResult dateNormalizationResult = periodDateExtractor.extractDateProperty(actualDcmiPeriod, NO_QUALIFICATION); - if (expectedStartDate == null || expectedEndDate == null) { - assertEquals(DateNormalizationResultStatus.NO_MATCH, dateNormalizationResult.getDateNormalizationResultStatus()); - } else { - IntervalEdtfDate interval = (IntervalEdtfDate) dateNormalizationResult.getEdtfDate(); - assertEquals(expectedLabel, interval.getLabel()); - assertEquals(expectedStartDate, interval.getStart() != null ? interval.getStart().toString() : null); - assertEquals(expectedEndDate, interval.getEnd() != null ? 
interval.getEnd().toString() : null); - assertEquals(DCMI_PERIOD, dateNormalizationResult.getDateNormalizationExtractorMatchId()); - } - } - - private static Stream extractData() { - return Stream.of( - of("name=The Great Depression; start=1929; end=1939;", - "The Great Depression", "1929", "1939"), - of("name=Haagse International Arts Festival, 2000; start=2000-01-26; end=2000-02-20;", - "Haagse International Arts Festival, 2000", "2000-01-26", "2000-02-20"), - of("start=1998-09-25; end=1998-09-25; scheme=W3C-DTF;", - null, "1998-09-25", "1998-09-25"), - of("start=1998-09-25T14:20:00+10:00; scheme=W3C-DTF;", - null, "1998-09-25", ".."), - of("end=1998-09-25T16:40:00+10:00; scheme=W3C-DTF;", - null, "..", "1998-09-25"), - of("end=1998-09-25T16:40+10:00; start=1998/01/01 scheme=W3C-DTF;", null, "..", "1998-09-25"), - - //Scheme checks - of("name=The Great Depression; start=1929; end=1939; scheme=W3CDTF;", - "The Great Depression", "1929", "1939"), - of("name=The Great Depression; start=1929; end=1939; scheme=W3C-DTF;", - "The Great Depression", "1929", "1939"), - of("scheme=W3C-DTF; name=The Great Depression; start=1929; end=1939;", - "The Great Depression", "1929", "1939"), - of("name=The Great Depression; start=1929; end=1939; scheme=W3C-DTF", - "The Great Depression", "1929", "1939"), - of("name=The Great Depression; start=1929; end=1939; scheme=W3C-", null, null, null), - - //double fields should be false - of("name=The Great Depression; start=1929; end=1939; name=The Great Depression;", null, null, null), - of("name=The Great Depression; start=1929; end=1939; start=1929;", null, null, null), - of("name=The Great Depression; end=1939; start=1929; end=1939;", null, null, null), - - //Both start and end null then false - of("name=The Great Depression; start=; end=;", null, null, null), - of("name=The Great Depression;", null, null, null), - - //One end bounded - of("name=The Great Depression; start=; end=1939;", - "The Great Depression", "..", "1939"), - 
of("name=The Great Depression; start=1929; end=;", - "The Great Depression", "1929", ".."), - - //Full date - of("name=Haagse International Arts Festival, 2000; start=2000-01-26; end=2000-02-20;", - "Haagse International Arts Festival, 2000", "2000-01-26", "2000-02-20"), - - //Full date and time - of("start=1999-09-25T14:20:00+10:00; end=1999-09-25T16:40:00+10:00; scheme=W3C-DTF;", - null, "1999-09-25", "1999-09-25"), - of("start=1999-09-25T14:20:00+10:00; scheme=W3C-DTF;", - null, "1999-09-25", ".."), - of("end=1999-09-25T16:40:00+10:00; scheme=W3C-DTF;", - null, "..", "1999-09-25"), - - //Missing semicolon - of("end=1998-09-25T16:40:00+10:00; start=1998 scheme=W3C-DTF;", - null, "..", "1998-09-25"), - - //Invalid date - of("end=1998-09-25T16:40+10:00; start=1998-1986; scheme=W3C-DTF;", null, null, null), - // - //Spaces at the end of the name are cleaned up - of("name=The Great Depression ; start=1929; end=1939;", - "The Great Depression", "1929", "1939"), - - //Spaces at the beginning of the name are cleaned up - of("name= The Great Depression; start=1929; end=1939;", - "The Great Depression", "1929", "1939"), - - //Name at the beginning without field name - of("The Great Depression; start=1929; end=1939;", - null, "1929", "1939"), - - //Name at the beginning without field name and spaces at wrapped - of(" The Great Depression ; start=1929; end=1939;", - null, "1929", "1939"), - - //Normal case - of("name=The Great Depression; start=1929; end=1939;", - "The Great Depression", "1929", "1939") - ); - } -} diff --git a/metis-normalization/src/test/java/eu/europeana/normalization/dates/extraction/dateextractors/DecadeDateExtractorTest.java b/metis-normalization/src/test/java/eu/europeana/normalization/dates/extraction/dateextractors/DecadeDateExtractorTest.java deleted file mode 100644 index fbdd350963..0000000000 --- a/metis-normalization/src/test/java/eu/europeana/normalization/dates/extraction/dateextractors/DecadeDateExtractorTest.java +++ /dev/null @@ -1,74 
+0,0 @@ -package eu.europeana.normalization.dates.extraction.dateextractors; - -import static eu.europeana.normalization.dates.DateNormalizationExtractorMatchId.DECADE; -import static eu.europeana.normalization.dates.edtf.DateQualification.NO_QUALIFICATION; -import static org.junit.jupiter.api.Assertions.assertEquals; -import static org.junit.jupiter.params.provider.Arguments.of; - -import eu.europeana.normalization.dates.DateNormalizationExtractorMatchId; -import eu.europeana.normalization.dates.DateNormalizationResult; -import eu.europeana.normalization.dates.DateNormalizationResultStatus; -import eu.europeana.normalization.dates.edtf.DateQualification; -import java.util.stream.Stream; -import org.junit.jupiter.params.ParameterizedTest; -import org.junit.jupiter.params.provider.Arguments; -import org.junit.jupiter.params.provider.MethodSource; - -class DecadeDateExtractorTest { - - private static final DecadeDateExtractor DECADE_DATE_EXTRACTOR = new DecadeDateExtractor(); - - @ParameterizedTest - @MethodSource - void extract(String input, String expected, DateNormalizationExtractorMatchId dateNormalizationExtractorMatchId) { - final DateNormalizationResult dateNormalizationResult = DECADE_DATE_EXTRACTOR.extractDateProperty(input, NO_QUALIFICATION); - if (expected == null) { - assertEquals(DateNormalizationResultStatus.NO_MATCH, dateNormalizationResult.getDateNormalizationResultStatus()); - } else { - final String actual = dateNormalizationResult.getEdtfDate().toString(); - assertEquals(expected, actual); - assertEquals(actual.contains("?"), - dateNormalizationResult.getEdtfDate().getDateQualification() == DateQualification.UNCERTAIN); - assertEquals(dateNormalizationExtractorMatchId, dateNormalizationResult.getDateNormalizationExtractorMatchId()); - } - } - - private static Stream extract() { - return Stream.of( - of("180x", "180X", DECADE), - of("180u", "180X", DECADE), - of("180X", "180X", DECADE), - of("180U", "180X", DECADE), - of(" 180u ", "180X", DECADE), - 
of("180x?", "180X?", DECADE), - of("180u?", "180X?", DECADE), - of("180??", "180X?", DECADE), - of("?180x", "180X?", DECADE), - of("?180u", "180X?", DECADE), - of("?180x?", "180X?", DECADE), - of("?180u?", "180X?", DECADE), - of("?180??", "180X?", DECADE), - - //Future dates not allowed - of("222u", null, null), - //This is an ambiguous case because hyphen can be used as a separator - of("180-?", null, null), - //Ambiguous, possible open end - of("180-", null, null), - of("180s", null, null),//Non u, x or ? - of("180?", null, null), //Only one question mark not supported - //Too many digits - of("1800", null, null), - of("?1280x", null, null), - of("?1280u?", null, null), - of("?1280??", null, null), - of("1280??", null, null), - - of("18??", null, null), //Too few digits - of("18--", null, null), //Too few digits - of("18..", null, null), //Too few digits - of("1...", null, null) //Too few digits - ); - } - -} \ No newline at end of file diff --git a/metis-normalization/src/test/java/eu/europeana/normalization/dates/extraction/dateextractors/EdtfDateExtractorTest.java b/metis-normalization/src/test/java/eu/europeana/normalization/dates/extraction/dateextractors/EdtfDateExtractorTest.java deleted file mode 100644 index b0fedbf806..0000000000 --- a/metis-normalization/src/test/java/eu/europeana/normalization/dates/extraction/dateextractors/EdtfDateExtractorTest.java +++ /dev/null @@ -1,319 +0,0 @@ -package eu.europeana.normalization.dates.extraction.dateextractors; - -import static eu.europeana.normalization.dates.edtf.DateBoundaryType.OPEN; -import static eu.europeana.normalization.dates.edtf.DateBoundaryType.UNKNOWN; -import static eu.europeana.normalization.dates.edtf.DateQualification.APPROXIMATE; -import static eu.europeana.normalization.dates.edtf.DateQualification.NO_QUALIFICATION; -import static eu.europeana.normalization.dates.edtf.DateQualification.UNCERTAIN; -import static eu.europeana.normalization.dates.edtf.DateQualification.UNCERTAIN_APPROXIMATE; 
-import static eu.europeana.normalization.dates.edtf.IntervalEdtfDate.DATE_INTERVAL_SEPARATOR; -import static org.junit.jupiter.api.Assertions.assertEquals; -import static org.junit.jupiter.params.provider.Arguments.of; - -import eu.europeana.normalization.dates.DateNormalizationResult; -import eu.europeana.normalization.dates.DateNormalizationResultStatus; -import eu.europeana.normalization.dates.edtf.AbstractEdtfDate; -import eu.europeana.normalization.dates.edtf.InstantEdtfDate; -import eu.europeana.normalization.dates.edtf.IntervalEdtfDate; -import java.util.stream.Stream; -import org.junit.jupiter.api.DisplayName; -import org.junit.jupiter.params.ParameterizedTest; -import org.junit.jupiter.params.provider.Arguments; -import org.junit.jupiter.params.provider.MethodSource; - -class EdtfDateExtractorTest { - - private final EdtfDateExtractor edtfDateExtractor = new EdtfDateExtractor(); - - // TODO: 01/03/2023 Possible reuse of the test code here for all extractors - private void assertExtract(String input, String expected) { - final DateNormalizationResult dateNormalizationResult = edtfDateExtractor.extractDateProperty(input, NO_QUALIFICATION); - if (expected == null) { - assertEquals(DateNormalizationResultStatus.NO_MATCH, dateNormalizationResult.getDateNormalizationResultStatus()); - } else { - AbstractEdtfDate edtfDate = dateNormalizationResult.getEdtfDate(); - if (edtfDate instanceof IntervalEdtfDate) { - String startPart = expected.substring(0, expected.indexOf(DATE_INTERVAL_SEPARATOR)); - String endPart = expected.substring(expected.indexOf(DATE_INTERVAL_SEPARATOR) + 1); - InstantEdtfDate start = ((IntervalEdtfDate) edtfDate).getStart(); - InstantEdtfDate end = ((IntervalEdtfDate) edtfDate).getEnd(); - assertEdtfDate(startPart, start); - assertEdtfDate(endPart, end); - } else { - assertEdtfDate(expected, (InstantEdtfDate) dateNormalizationResult.getEdtfDate()); - } - assertEquals(expected, edtfDate.toString()); - } - } - - private static void 
assertEdtfDate(String expected, InstantEdtfDate instantEdtfDate) { - assertEquals(expected.contains("?"), instantEdtfDate.getDateQualification() == UNCERTAIN); - assertEquals(expected.contains("~"), instantEdtfDate.getDateQualification() == APPROXIMATE); - assertEquals(expected.contains("%"), instantEdtfDate.getDateQualification() == UNCERTAIN_APPROXIMATE); - assertEquals(expected.equals(OPEN.getSerializedRepresentation()), - instantEdtfDate.getDateBoundaryType() == OPEN || instantEdtfDate.getDateBoundaryType() == UNKNOWN); - } - - @ParameterizedTest - @MethodSource - @DisplayName("[year][“-”][month][“-”][day] Complete representation") - void completeDateRepresentationLevel0(String input, String expected) { - assertExtract(input, expected); - } - - @ParameterizedTest - @MethodSource - @DisplayName("[year][“-”][month] Reduced precision for year and month") - void reducedPrecisionForYearAndMonthLevel0(String input, String expected) { - assertExtract(input, expected); - } - - @ParameterizedTest - @MethodSource - @DisplayName("[year] Reduced precision for year") - void reducedPrecisionForYearLevel0(String input, String expected) { - assertExtract(input, expected); - } - - @ParameterizedTest - @MethodSource - void dateIntervalRepresentationLevel0(String input, String expected) { - assertExtract(input, expected); - } - - @ParameterizedTest - @MethodSource - @DisplayName("Letter-prefixed calendar year") - void letterPrefixedCalendarYearLevel1(String input, String expected) { - assertExtract(input, expected); - } - - @ParameterizedTest - @MethodSource - @DisplayName("The characters '?', '~' and '%' are used to mean \"uncertain\", \"approximate\", and \"uncertain\" as well as \"approximate\", respectively") - void dateQualificationLevel1(String input, String expected) { - assertExtract(input, expected); - } - - @ParameterizedTest - @MethodSource - @DisplayName("Negative Calendar Year") - void negativeCalendarYearLevel1(String input, String expected) { - assertExtract(input, 
expected); - } - - - @ParameterizedTest - @MethodSource - @DisplayName("Open time interval") - void openTimeIntervalLevel1(String input, String expected) { - assertExtract(input, expected); - } - - - @ParameterizedTest - @MethodSource - @DisplayName("Unknown time interval") - void unknownTimeIntervalLevel1(String input, String expected) { - assertExtract(input, expected); - } - - private static Stream completeDateRepresentationLevel0() { - return Stream.of( - of("1989-11-01", "1989-11-01"), - of("0989-11-01", "0989-11-01"), - of("0989-11-01", "0989-11-01"), - //Digits missing on year - of("198-11-01", null), - //Digits missing on month or day - of("1989-11-1", null), - of("1989-1-01", null), - //Anything other than hyphen "-" is not valid - of("1989/11/01", null), - - //Complete representations for calendar date and (local) time of day - of("1989-11-01T23:59:59", "1989-11-01"), - of("1989-11-01T23:59", "1989-11-01"), - of("1989-11-01T23", "1989-11-01"), - of("1989-11-01T", "1989-11-01"), - of("1989-11-01T23:59:5", "1989-11-01"), - of("1989-11-01T23:5:59", "1989-11-01"), - of("1989-11-01t23:59:59", null), - of("1989-11-01 23:59:59", null), - - //Complete representations for calendar date and UTC time of day - of("1989-11-01T23:59:59Z", "1989-11-01"), - of("1989-11-01t23:59:59Z", null), - of("1989-11-01 23:59:59Z", null), - - //Date and time with time shift in hours (only) - of("1989-11-01T23:59:59-04", "1989-11-01"), - of("1989-11-01T23:59:59+04", "1989-11-01"), - of("1989-11-01t23:59:59-04", null), - of("1989-11-01 23:59:59-04", null), - - //Date and time with time shift in hours and minutes - of("1989-11-01T23:59:59-04:44", "1989-11-01"), - of("1989-11-01T23:59:59+04:44", "1989-11-01"), - of("1989-11-01t23:59:59-04:44", null), - of("1989-11-01 23:59:59-04:44", null) - ); - } - - private static Stream reducedPrecisionForYearAndMonthLevel0() { - return Stream.of( - of("1989-11", "1989-11"), - of("0989-11", "0989-11"), - //Digits missing on year - of("198-11", null), 
- //Digits missing on month - of("1989-1", null), - //Anything other than hyphen "-" is not valid - of("1989/11", null) - ); - } - - private static Stream reducedPrecisionForYearLevel0() { - return Stream.of( - of("1989", "1989"), - of("0989", "0989"), - //Digits missing on year - of("198", null) - ); - } - - private static Stream dateIntervalRepresentationLevel0() { - return Stream.of( - of("1989/1990", "1989/1990"), - of("1989-11/1990-11", "1989-11/1990-11"), - of("1989-11-01/1990-11-01", "1989-11-01/1990-11-01"), - of("1989-11-01/1990-11", "1989-11-01/1990-11"), - of("1989-11-01/1990", "1989-11-01/1990"), - of("1989/1990-11", "1989/1990-11"), - of("1989/1990-11-01", "1989/1990-11-01"), - of("1989-00/1990-00", null), - of("1989-00-00/1990-00-00", null), - //Spaces not valid - of("1989 / 1990", null), - //Dash not valid - of("1989-1990", null), - //Missing digits - of("989-1990", null), - of("1989-990", null) - ); - } - - private static Stream letterPrefixedCalendarYearLevel1() { - return Stream.of( - //Future dates are not valid - of("Y170000002", null), - of("Y-170000002", "Y-170000002"), - //Overflow, max is +-999999999 - of("Y1700000002", null), - of("Y-1700000002", null), - //Too low values - of("Y0", null), - of("Y1", null), - of("Y-1", null), - of("Y", null) - ); - } - - private static Stream dateQualificationLevel1() { - return Stream.of( - of("1989?", "1989?"), - of("1989~", "1989~"), - of("1989-11?", "1989-11?"), - of("1989-11~", "1989-11~"), - of("1989-11-01%", "1989-11-01%") - ); - } - - private static Stream negativeCalendarYearLevel1() { - return Stream.of( - of("-1989", "-1989"), - of("-9999", "-9999"), - of("-0989", "-0989"), - of("-11989", null) - ); - } - - private static Stream openTimeIntervalLevel1() { - return Stream.of( - //Open start - of("../1989-11-01", "../1989-11-01"), - of("../1989-11", "../1989-11"), - of("../1989", "../1989"), - of("../1989-11-01~", "../1989-11-01~"), - of("../1989-11~", "../1989-11~"), - of("../1989~", "../1989~"), 
- of("../1989-11-01?", "../1989-11-01?"), - of("../1989-11?", "../1989-11?"), - of("../1989?", "../1989?"), - of("../1989-11-01%", "../1989-11-01%"), - of("../1989-11%", "../1989-11%"), - of("../1989%", "../1989%"), - of(".. / 1989-11-01", null), - of("../ 1989-11-01", null), - of(".. /1989-11-01", null), - - //Open end - of("1989-11-01/..", "1989-11-01/.."), - of("1989-11/..", "1989-11/.."), - of("1989/..", "1989/.."), - of("1989-11-01~/..", "1989-11-01~/.."), - of("1989-11~/..", "1989-11~/.."), - of("1989~/..", "1989~/.."), - of("1989-11-01?/..", "1989-11-01?/.."), - of("1989-11?/..", "1989-11?/.."), - of("1989?/..", "1989?/.."), - of("1989-11-01%/..", "1989-11-01%/.."), - of("1989-11%/..", "1989-11%/.."), - of("1989%/..", "1989%/.."), - of("1989-11-01 / ..", null), - of("1989-11-01 /..", null), - of("1989-11-01/ ..", null), - of("../..", null) - ); - } - - - private static Stream unknownTimeIntervalLevel1() { - return Stream.of( - //Unknown start - of("/1989-11-01", "../1989-11-01"), - of("/1989-11", "../1989-11"), - of("/1989", "../1989"), - of("/1989-11-01~", "../1989-11-01~"), - of("/1989-11~", "../1989-11~"), - of("/1989~", "../1989~"), - of("/1989-11-01?", "../1989-11-01?"), - of("/1989-11?", "../1989-11?"), - of("/1989?", "../1989?"), - of("/1989-11-01%", "../1989-11-01%"), - of("/1989-11%", "../1989-11%"), - of("/1989%", "../1989%"), - of(" / 1989-11-01", null), - of("/ 1989-11-01", null), - of(" /1989-11-01", null), - - //Unknown end - of("1989-11-01/", "1989-11-01/.."), - of("1989-11/", "1989-11/.."), - of("1989/", "1989/.."), - of("1989-11-01~/", "1989-11-01~/.."), - of("1989-11~/", "1989-11~/.."), - of("1989~/", "1989~/.."), - of("1989-11-01?/", "1989-11-01?/.."), - of("1989-11?/", "1989-11?/.."), - of("1989?/", "1989?/.."), - of("1989-11-01%/", "1989-11-01%/.."), - of("1989-11%/", "1989-11%/.."), - of("1989%/", "1989%/.."), - of("1989-11-01 / ", null), - of("1989-11-01 /", null), - of("1989-11-01/ ", null), - of("/", null) - ); - } -} \ No newline at 
end of file diff --git a/metis-normalization/src/test/java/eu/europeana/normalization/dates/extraction/extractors/BcAdDateExtractorTest.java b/metis-normalization/src/test/java/eu/europeana/normalization/dates/extraction/extractors/BcAdDateExtractorTest.java new file mode 100644 index 0000000000..bb1c60d1ac --- /dev/null +++ b/metis-normalization/src/test/java/eu/europeana/normalization/dates/extraction/extractors/BcAdDateExtractorTest.java @@ -0,0 +1,117 @@ +package eu.europeana.normalization.dates.extraction.extractors; + +import static eu.europeana.normalization.dates.DateNormalizationExtractorMatchId.BC_AD; +import static org.junit.jupiter.params.provider.Arguments.of; + +import eu.europeana.normalization.dates.DateNormalizationResult; +import java.util.stream.Stream; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.Arguments; +import org.junit.jupiter.params.provider.MethodSource; + +class BcAdDateExtractorTest implements DateExtractorTest { + + private static final BcAdDateExtractor PATTERN_BC_AD_DATE_EXTRACTOR = new BcAdDateExtractor(); + + @ParameterizedTest + @MethodSource + void extract(String input, String expected) { + assertExtract(input, expected); + } + + void assertExtract(String input, String expected) { + final DateNormalizationResult dateNormalizationResult = PATTERN_BC_AD_DATE_EXTRACTOR.extractDateProperty(input); + assertDateNormalizationResult(dateNormalizationResult, expected, BC_AD); + } + + private static Stream extract() { + return Stream.of( + //Bulgarian + of("1989 пр.Хр.", "-1988"), + of("1989 сл.Хр.", "1989"), + //Croatian + of("1989 pr. Kr.", "-1988"), + of("1989 po. Kr.", "1989"), + //Czech + of("1989 př. n. l.", "-1988"), + of("1989 n. 
l.", "1989"), + //Danish + of("1989 f.Kr.", "-1988"), + of("1989 e.Kr.", "1989"), + //Dutch + of("1989 v.Chr.", "-1988"), + of("1989 n.Chr.", "1989"), + //English + of("1989 BC", "-1988"), + of("1989 AD", "1989"), + //Estonian + of("1989 eKr", "-1988"), + of("1989 pKr", "1989"), + //Finnish + of("1989 eKr.", "-1988"), + of("1989 jKr.", "1989"), + //French + of("1989 av. J.-C.", "-1988"), + of("1989 ap. J.-C.", "1989"), + //German + of("1989 v. Chr.", "-1988"), + of("1989 n. Chr.", "1989"), + //Greek + of("1989 π.Χ.", "-1988"), + of("1989 μ.Χ.", "1989"), + //Hungarian + of("1989 i. e.", "-1988"), + of("1989 i. sz.", "1989"), + //Irish + of("1989 RC", "-1988"), + of("1989 AD", "1989"), + //Italian + of("1989 a.C.", "-1988"), + of("1989 d.C.", "1989"), + //Latvian + of("1989 p.m.ē.", "-1988"), + of("1989 m.ē.", "1989"), + //Lithuanian + of("1989 pr. Kr.", "-1988"), + of("1989 po Kr.", "1989"), + //Maltese + of("1989 QK", "-1988"), + of("1989 WK", "1989"), + //Polish + of("1989 p.n.e.", "-1988"), + of("1989 n.e.", "1989"), + //Portuguese + of("1989 a.C.", "-1988"), + of("1989 d.C.", "1989"), + //Romanian + of("1989 î.Hr.", "-1988"), + of("1989 d.Hr.", "1989"), + //Slovak + of("1989 pred Kr.", "-1988"), + of("1989 po Kr.", "1989"), + //Slovenian + of("1989 pr. Kr.", "-1988"), + of("1989 po Kr.", "1989"), + //Spanish + of("1989 a. C.", "-1988"), + of("1989 d. 
C.", "1989"), + //Swedish + of("1989 f.Kr.", "-1988"), + of("1989 e.Kr.", "1989"), + + //Less digits + of("198 AD", "0198"), + of("19 AD", "0019"), + + //First years + of("1 AD", "0001"), + of("1 BC", "0000"), + of("2 BC", "-0001"), + + //Invalids + of("0 BC", null), + of("-1989 BC", null), + of("-1989 AD", null) + ); + } +} \ No newline at end of file diff --git a/metis-normalization/src/test/java/eu/europeana/normalization/dates/extraction/extractors/BcAdRangeDateExtractorTest.java b/metis-normalization/src/test/java/eu/europeana/normalization/dates/extraction/extractors/BcAdRangeDateExtractorTest.java new file mode 100644 index 0000000000..22c50ef142 --- /dev/null +++ b/metis-normalization/src/test/java/eu/europeana/normalization/dates/extraction/extractors/BcAdRangeDateExtractorTest.java @@ -0,0 +1,68 @@ +package eu.europeana.normalization.dates.extraction.extractors; + +import static eu.europeana.normalization.dates.DateNormalizationExtractorMatchId.BC_AD; +import static org.junit.jupiter.params.provider.Arguments.of; + +import eu.europeana.normalization.dates.DateNormalizationResult; +import java.util.stream.Stream; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.Arguments; +import org.junit.jupiter.params.provider.MethodSource; + +class BcAdRangeDateExtractorTest implements DateExtractorTest { + + private static final BcAdRangeDateExtractor BC_AD_RANGE_DATE_EXTRACTOR = new BcAdRangeDateExtractor(); + + @ParameterizedTest + @MethodSource + void extract(String input, String expected) { + assertExtract(input, expected); + } + + void assertExtract(String input, String expected) { + final DateNormalizationResult dateNormalizationResult = BC_AD_RANGE_DATE_EXTRACTOR.extractDateProperty(input); + assertDateNormalizationResult(dateNormalizationResult, expected, BC_AD); + } + + private static Stream extract() { + return Stream.of( + //BC-BC + of("1990 BC-1989 BC", "-1989/-1988"), + of("1990 BC/1989 BC", "-1989/-1988"), + 
of("1990 BC - 1989 BC", "-1989/-1988"), + of("1990 BC / 1989 BC", "-1989/-1988"), + of("1990 BC-1 BC", "-1989/0000"), + + //BC-BC(Greek) + of("1990 π.Χ.-1989 π.Χ.", "-1989/-1988"), + of("1990 π.Χ./1989 π.Χ.", "-1989/-1988"), + of("1990 π.Χ. - 1989 π.Χ.", "-1989/-1988"), + of("1990 π.Χ. / 1989 π.Χ.", "-1989/-1988"), + of("1990 π.Χ.-1 π.Χ.", "-1989/0000"), + + //AD-AD + of("1989 AD-1990 AD", "1989/1990"), + of("1989 AD/1990 AD", "1989/1990"), + of("1989 AD - 1990 AD", "1989/1990"), + of("1989 AD / 1990 AD", "1989/1990"), + + //AD-AD(Greek) + of("1989 μ.Χ.-1990 μ.Χ.", "1989/1990"), + of("1989 μ.Χ./1990 μ.Χ.", "1989/1990"), + of("1989 μ.Χ. - 1990 μ.Χ.", "1989/1990"), + of("1989 μ.Χ. / 1990 μ.Χ.", "1989/1990"), + + //BC-AD + of("1989 π.Χ.-1989 μ.Χ.", "-1988/1989"), + of("1989 π.Χ.-1 μ.Χ.", "-1988/0001"), + + //Invalids + of("1990 BC//1989 BC", null), + of("-1990 BC-1989 BC", null), + of("-1990 BC--1989 BC", null), + of("1990 BC , 1989 BC", null), + of("1989 BC-0 BC", null), + of("1989 BC-0 AD", null) + ); + } +} \ No newline at end of file diff --git a/metis-normalization/src/test/java/eu/europeana/normalization/dates/extraction/extractors/BriefRangeDateExtractorTest.java b/metis-normalization/src/test/java/eu/europeana/normalization/dates/extraction/extractors/BriefRangeDateExtractorTest.java new file mode 100644 index 0000000000..1995294e75 --- /dev/null +++ b/metis-normalization/src/test/java/eu/europeana/normalization/dates/extraction/extractors/BriefRangeDateExtractorTest.java @@ -0,0 +1,61 @@ +package eu.europeana.normalization.dates.extraction.extractors; + +import static eu.europeana.normalization.dates.DateNormalizationExtractorMatchId.BRIEF_DATE_RANGE; +import static org.junit.jupiter.params.provider.Arguments.of; + +import eu.europeana.normalization.dates.DateNormalizationResult; +import java.util.stream.Stream; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.Arguments; +import 
org.junit.jupiter.params.provider.MethodSource; + +class BriefRangeDateExtractorTest implements DateExtractorTest { + + private static final BriefRangeDateExtractor BRIEF_RANGE_DATE_EXTRACTOR = new BriefRangeDateExtractor(); + + private void assertExtract(String input, String expected) { + final DateNormalizationResult dateNormalizationResult = BRIEF_RANGE_DATE_EXTRACTOR.extractDateProperty(input); + assertDateNormalizationResult(dateNormalizationResult, expected, BRIEF_DATE_RANGE); + } + + @ParameterizedTest + @MethodSource + void extractBrief(String input, String expected) { + assertExtract(input, expected); + } + + private static Stream extractBrief() { + return Stream.of( + //Slash + of("1989/90", "1989/1990"), + of("1989/90?", "1989/1990?"), + of("?1989/90", "1989?/1990"), + of("?1989/90?", "1989?/1990?"), + of("-1989/-88", null), + + //Dash + of("1989-90", "1989/1990"), + of("1989-90?", "1989/1990?"), + of("?1989-90", "1989?/1990"), + of("?1989-90?", "1989?/1990?"), + of("989-90", "0989/0990"), + + //End year two digits lower than or equal to the rightmost two digits of the start year + of("1989/89", null), + of("1989/88", null), + of("1989-89", null), + of("1989-88", null), + + //More than two digits on end year not allowed + of("1989/990", null), + of("1989-990", null), + + //End year cannot be lower than or equal to 12 + of("1900/01", null), + of("1900/12", null), + + //Fewer than three digits on start year + of("89-90", null) + ); + } +} diff --git a/metis-normalization/src/test/java/eu/europeana/normalization/dates/extraction/extractors/CenturyNumericDateExtractorTest.java b/metis-normalization/src/test/java/eu/europeana/normalization/dates/extraction/extractors/CenturyNumericDateExtractorTest.java new file mode 100644 index 0000000000..fa6a3b98a4 --- /dev/null +++ b/metis-normalization/src/test/java/eu/europeana/normalization/dates/extraction/extractors/CenturyNumericDateExtractorTest.java @@ -0,0 +1,60 @@ +package eu.europeana.normalization.dates.extraction.extractors; + +import static
eu.europeana.normalization.dates.DateNormalizationExtractorMatchId.CENTURY_NUMERIC; +import static org.junit.jupiter.params.provider.Arguments.of; + +import eu.europeana.normalization.dates.DateNormalizationExtractorMatchId; +import eu.europeana.normalization.dates.DateNormalizationResult; +import java.util.stream.Stream; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.Arguments; +import org.junit.jupiter.params.provider.MethodSource; + +class CenturyNumericDateExtractorTest implements DateExtractorTest { + + private static final CenturyNumericDateExtractor CENTURY_DATE_EXTRACTOR = new CenturyNumericDateExtractor(); + + void assertExtract(String input, String expected, DateNormalizationExtractorMatchId dateNormalizationExtractorMatchId) { + final DateNormalizationResult dateNormalizationResult = CENTURY_DATE_EXTRACTOR.extractDateProperty(input); + assertDateNormalizationResult(dateNormalizationResult, expected, dateNormalizationExtractorMatchId); + } + + @ParameterizedTest + @MethodSource + void extractNumeric(String input, String expected, DateNormalizationExtractorMatchId dateNormalizationExtractorMatchId) { + assertExtract(input, expected, dateNormalizationExtractorMatchId); + } + + private static Stream extractNumeric() { + return Stream.of( + //PATTERN_YYYY + of("18..", "18XX", CENTURY_NUMERIC), + of(" 18.. 
", "18XX", CENTURY_NUMERIC), + of("?18..", "18XX?", CENTURY_NUMERIC), + of("18..?", "18XX?", CENTURY_NUMERIC), + of("?18..?", "18XX?", CENTURY_NUMERIC), + of("192?", null, null, null), //Too many digits + of("1..", null, null, null), //Too few digits + + //PATTERN_ENGLISH + of("1st century", "00XX", CENTURY_NUMERIC), + of("2nd century", "01XX", CENTURY_NUMERIC), + of("3rd century", "02XX", CENTURY_NUMERIC), + of("11th century", "10XX", CENTURY_NUMERIC), + of(" 11th century ", "10XX", CENTURY_NUMERIC), + of("?11th century", "10XX?", CENTURY_NUMERIC), + of("11th century?", "10XX?", CENTURY_NUMERIC), + of("?11th century?", "10XX?", CENTURY_NUMERIC), + of("12th century BC", null, null, null), // not supported + of("[10th century]", null, null, null), // not supported + of("11thcentury", null, null, null), //Incorrect spacing numeric + of("11st century", null, null, null), //Incorrect suffix + of("12rd century", null, null, null), //Incorrect suffix + of("13st century", null, null, null), //Incorrect suffix + of("21th century", null, null, null), //Incorrect suffix + of("0st century", null, null, null), //Out of range + of("22nd century", null, null, null) //Out of range + ); + } + +} \ No newline at end of file diff --git a/metis-normalization/src/test/java/eu/europeana/normalization/dates/extraction/extractors/CenturyRomaDateExtractorTest.java b/metis-normalization/src/test/java/eu/europeana/normalization/dates/extraction/extractors/CenturyRomaDateExtractorTest.java new file mode 100644 index 0000000000..84fba4cbfb --- /dev/null +++ b/metis-normalization/src/test/java/eu/europeana/normalization/dates/extraction/extractors/CenturyRomaDateExtractorTest.java @@ -0,0 +1,93 @@ +package eu.europeana.normalization.dates.extraction.extractors; + +import static eu.europeana.normalization.dates.DateNormalizationExtractorMatchId.CENTURY_ROMAN; +import static org.junit.jupiter.params.provider.Arguments.of; + +import eu.europeana.normalization.dates.DateNormalizationResult; 
+import java.util.stream.Stream; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.Arguments; +import org.junit.jupiter.params.provider.MethodSource; + +class CenturyRomaDateExtractorTest implements DateExtractorTest { + + private static final CenturyRomanDateExtractor ROMAN_CENTURY_DATE_EXTRACTOR = new CenturyRomanDateExtractor(); + + void assertExtract(String input, String expected) { + final DateNormalizationResult dateNormalizationResult = ROMAN_CENTURY_DATE_EXTRACTOR.extractDateProperty(input); + assertDateNormalizationResult(dateNormalizationResult, expected, CENTURY_ROMAN); + } + + @ParameterizedTest + @MethodSource + void extractRoman(String input, String expected) { + assertExtract(input, expected); + } + + private static Stream extractRoman() { + return Stream.of( + //Uppercase + of("I", "00XX"), + of("IV", "03XX"), + of("V", "04XX"), + of("VI", "05XX"), + of("IX", "08XX"), + of("X", "09XX"), + of("XI", "10XX"), + of("XIV", "13XX"), + of("XV", "14XX"), + of("XVI", "15XX"), + of("XIX", "18XX"), + of("XX", "19XX"), + of("XXI", "20XX"), + + //Lower case + of("i", "00XX"), + of("iv", "03XX"), + of("v", "04XX"), + of("vi", "05XX"), + of("ix", "08XX"), + of("x", "09XX"), + of("xi", "10XX"), + of("xiv", "13XX"), + of("xv", "14XX"), + of("xvi", "15XX"), + of("xix", "18XX"), + of("xx", "19XX"), + of("xxi", "20XX"), + + //Prefixes + of("s I", "00XX"), + of("s. I", "00XX"), + of("S I", "00XX"), + of("S.I", "00XX"), + of("sec.I", "00XX"), + of("SEC.I", "00XX"), + of("sec. I", "00XX"), + of("SEC. I", "00XX"), + of("saec.I", "00XX"), + of("SAEC.I", "00XX"), + of("saec. I", "00XX"), + of("SAEC. I", "00XX"), + //Other possibilities and uncertain + of("Ii", "01XX"), + of(" s I ", "00XX"), + of("?s. I", "00XX?"), + of("sec. I?", "00XX?"), + of("?saec. 
I?", "00XX?"), + of(" I ", "00XX"), + of("?I", "00XX?"), + of("I?", "00XX?"), + of("?I?", "00XX?"), + //Non matches + //Without a dot a space is required + of("saecI", null), + of("secI", null), + // Not supported range + of("MDCLXX", null), + // Invalid roman + of("IXX", null) + ); + } + +} \ No newline at end of file diff --git a/metis-normalization/src/test/java/eu/europeana/normalization/dates/extraction/extractors/CenturyRomanRangeDateExtractorTest.java b/metis-normalization/src/test/java/eu/europeana/normalization/dates/extraction/extractors/CenturyRomanRangeDateExtractorTest.java new file mode 100644 index 0000000000..1a8474a613 --- /dev/null +++ b/metis-normalization/src/test/java/eu/europeana/normalization/dates/extraction/extractors/CenturyRomanRangeDateExtractorTest.java @@ -0,0 +1,103 @@ +package eu.europeana.normalization.dates.extraction.extractors; + +import static org.junit.jupiter.params.provider.Arguments.of; + +import eu.europeana.normalization.dates.DateNormalizationExtractorMatchId; +import eu.europeana.normalization.dates.DateNormalizationResult; +import java.util.stream.Stream; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.Arguments; +import org.junit.jupiter.params.provider.MethodSource; + +class CenturyRomanRangeDateExtractorTest implements DateExtractorTest { + + private static final CenturyRomanRangeDateExtractor ROMAN_CENTURY_RANGE_DATE_EXTRACTOR = new CenturyRomanRangeDateExtractor(); + + void assertExtract(String input, String expected) { + final DateNormalizationResult dateNormalizationResult = ROMAN_CENTURY_RANGE_DATE_EXTRACTOR.extractDateProperty(input); + assertDateNormalizationResult(dateNormalizationResult, expected, DateNormalizationExtractorMatchId.CENTURY_RANGE_ROMAN); + } + + @ParameterizedTest + @MethodSource + void extractRoman(String input, String expected) { + assertExtract(input, expected); + } + + private static Stream extractRoman() { + return Stream.of( + //Uppercase + 
of("I-II", "00XX/01XX"), + of("II-III", "01XX/02XX"), + of("III-IV", "02XX/03XX"), + of("IV-V", "03XX/04XX"), + of("V-VI", "04XX/05XX"), + of("VI-VII", "05XX/06XX"), + of("VII-VIII", "06XX/07XX"), + of("VIII-IX", "07XX/08XX"), + of("IX-X", "08XX/09XX"), + of("X-XI", "09XX/10XX"), + of("XI-XII", "10XX/11XX"), + of("XII-XIII", "11XX/12XX"), + of("XIII-XIV", "12XX/13XX"), + of("XIV-XV", "13XX/14XX"), + of("XV-XVI", "14XX/15XX"), + of("XVI-XVII", "15XX/16XX"), + of("XVII-XVIII", "16XX/17XX"), + of("XVIII-XIX", "17XX/18XX"), + of("XIX-XX", "18XX/19XX"), + of("XX-XXI", "19XX/20XX"), + + //Lowercase + of("i-ii", "00XX/01XX"), + of("ii-iii", "01XX/02XX"), + of("iii-iv", "02XX/03XX"), + of("iv-v", "03XX/04XX"), + of("v-vi", "04XX/05XX"), + of("vi-vii", "05XX/06XX"), + of("vii-viii", "06XX/07XX"), + of("viii-ix", "07XX/08XX"), + of("ix-x", "08XX/09XX"), + of("x-xi", "09XX/10XX"), + of("xi-xii", "10XX/11XX"), + of("xii-xiii", "11XX/12XX"), + of("xiii-xiv", "12XX/13XX"), + of("xiv-xv", "13XX/14XX"), + of("xv-xvi", "14XX/15XX"), + of("xvi-xvii", "15XX/16XX"), + of("xvii-xviii", "16XX/17XX"), + of("xviii-xix", "17XX/18XX"), + of("xix-xx", "18XX/19XX"), + of("xx-xxi", "19XX/20XX"), + + //Prefixes + of("s I-II", "00XX/01XX"), + of("S I-II", "00XX/01XX"), + of("s. I-II", "00XX/01XX"), + of("S. I-II", "00XX/01XX"), + of("sec.IV-VII", "03XX/06XX"), + of("SEC.IV-VII", "03XX/06XX"), + of("sec. IV-VII", "03XX/06XX"), + of("SEC. IV-VII", "03XX/06XX"), + of("saec.VII-XVIII", "06XX/17XX"), + of("SAEC.VII-XVIII", "06XX/17XX"), + of("saec. XVI-XVIII", "15XX/17XX"), + of("SAEC. XVI-XVIII", "15XX/17XX"), + + //Other possibilities and uncertain + of("s I-iI", "00XX/01XX"), + of(" s I-II ", "00XX/01XX"), + of("?saec.X-XVIII", "09XX?/17XX"), + of("X-XVIII?", "09XX/17XX?"), + of("?saec.X-XVIII?", "09XX?/17XX?"), + + //Non matches + of("S. XIIII-XIIIV", null, null), //Invalid roman + of("S. 
XVIII-", null, null, null), //Open-ended incorrect + of("sII-V", null, null), //Without a dot a space is required + of("secVI-XVII", null, null), //Without a dot a space is required + of("saecX-XVIII?", null, null) //Without a dot a space is required + ); + } + +} \ No newline at end of file diff --git a/metis-normalization/src/test/java/eu/europeana/normalization/dates/extraction/extractors/DateExtractorTest.java b/metis-normalization/src/test/java/eu/europeana/normalization/dates/extraction/extractors/DateExtractorTest.java new file mode 100644 index 0000000000..e6c0ee4930 --- /dev/null +++ b/metis-normalization/src/test/java/eu/europeana/normalization/dates/extraction/extractors/DateExtractorTest.java @@ -0,0 +1,66 @@ +package eu.europeana.normalization.dates.extraction.extractors; + +import static eu.europeana.normalization.dates.edtf.DateBoundaryType.OPEN; +import static eu.europeana.normalization.dates.edtf.DateBoundaryType.UNKNOWN; +import static eu.europeana.normalization.dates.edtf.DateQualification.APPROXIMATE; +import static eu.europeana.normalization.dates.edtf.DateQualification.UNCERTAIN; +import static eu.europeana.normalization.dates.extraction.DefaultDatesSeparator.SLASH_DELIMITER; +import static org.junit.jupiter.api.Assertions.assertEquals; + +import eu.europeana.normalization.dates.DateNormalizationExtractorMatchId; +import eu.europeana.normalization.dates.DateNormalizationResult; +import eu.europeana.normalization.dates.DateNormalizationResultStatus; +import eu.europeana.normalization.dates.edtf.AbstractEdtfDate; +import eu.europeana.normalization.dates.edtf.InstantEdtfDate; +import eu.europeana.normalization.dates.edtf.IntervalEdtfDate; + +public interface DateExtractorTest { + + default void assertQualification(String expected, InstantEdtfDate instantEdtfDate) { + assertEquals(expected.contains("?"), + instantEdtfDate.getDateQualifications().contains(UNCERTAIN) && + !instantEdtfDate.getDateQualifications().contains(APPROXIMATE)); + 
assertEquals(expected.contains("~"), + instantEdtfDate.getDateQualifications().contains(APPROXIMATE) && + !instantEdtfDate.getDateQualifications().contains(UNCERTAIN)); + assertEquals(expected.contains("%"), + instantEdtfDate.getDateQualifications().contains(UNCERTAIN) && + instantEdtfDate.getDateQualifications().contains(APPROXIMATE)); + } + + default void assertBoundaryType(String expected, InstantEdtfDate instantEdtfDate) { + assertEquals(expected.equals(OPEN.getSerializedRepresentation()), + instantEdtfDate.getDateBoundaryType() == OPEN || instantEdtfDate.getDateBoundaryType() == UNKNOWN); + } + + default void assertDateNormalizationResult(DateNormalizationResult dateNormalizationResult, String expected, + DateNormalizationExtractorMatchId dateNormalizationExtractorMatchId, String expectedLabel) { + assertDateNormalizationResult(dateNormalizationResult, expected, dateNormalizationExtractorMatchId); + if (expected != null) { + assertEquals(expectedLabel, dateNormalizationResult.getEdtfDate().getLabel()); + } + } + + default void assertDateNormalizationResult(DateNormalizationResult dateNormalizationResult, String expected, + DateNormalizationExtractorMatchId dateNormalizationExtractorMatchId) { + if (expected == null) { + assertEquals(DateNormalizationResultStatus.NO_MATCH, dateNormalizationResult.getDateNormalizationResultStatus()); + } else { + assertEquals(dateNormalizationExtractorMatchId, dateNormalizationResult.getDateNormalizationExtractorMatchId()); + AbstractEdtfDate edtfDate = dateNormalizationResult.getEdtfDate(); + if (edtfDate instanceof IntervalEdtfDate) { + String expectedStart = expected.substring(0, expected.indexOf(SLASH_DELIMITER.getStringRepresentation())); + String expectedEnd = expected.substring(expected.indexOf(SLASH_DELIMITER.getStringRepresentation()) + 1); + InstantEdtfDate startInstantEdtfDate = ((IntervalEdtfDate) edtfDate).getStart(); + InstantEdtfDate endInstantEdtfDate = ((IntervalEdtfDate) edtfDate).getEnd(); + 
assertQualification(expectedStart, startInstantEdtfDate); + assertQualification(expectedEnd, endInstantEdtfDate); + assertBoundaryType(expectedStart, startInstantEdtfDate); + assertBoundaryType(expectedEnd, endInstantEdtfDate); + } else { + assertQualification(expected, (InstantEdtfDate) edtfDate); + } + assertEquals(expected, edtfDate.toString()); + } + } +} diff --git a/metis-normalization/src/test/java/eu/europeana/normalization/dates/extraction/extractors/DcmiPeriodDateExtractorTest.java b/metis-normalization/src/test/java/eu/europeana/normalization/dates/extraction/extractors/DcmiPeriodDateExtractorTest.java new file mode 100644 index 0000000000..0254045f14 --- /dev/null +++ b/metis-normalization/src/test/java/eu/europeana/normalization/dates/extraction/extractors/DcmiPeriodDateExtractorTest.java @@ -0,0 +1,90 @@ +package eu.europeana.normalization.dates.extraction.extractors; + +import static org.junit.jupiter.params.provider.Arguments.of; + +import eu.europeana.normalization.dates.DateNormalizationExtractorMatchId; +import eu.europeana.normalization.dates.DateNormalizationResult; +import java.util.stream.Stream; +import org.junit.jupiter.api.DisplayName; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.Arguments; +import org.junit.jupiter.params.provider.MethodSource; + +/** + * Unit tests for {@link DcmiPeriodDateExtractor} class + */ +class DcmiPeriodDateExtractorTest implements DateExtractorTest { + + private static final DcmiPeriodDateExtractor DCMI_PERIOD_DATE_EXTRACTOR = new DcmiPeriodDateExtractor(); + + @ParameterizedTest + @MethodSource("extractData") + @DisplayName("Extract DCMI Period") + void extract(String input, String expected, String expectedLabel) { + DateNormalizationResult dateNormalizationResult = DCMI_PERIOD_DATE_EXTRACTOR.extractDateProperty(input); + assertDateNormalizationResult(dateNormalizationResult, expected, DateNormalizationExtractorMatchId.DCMI_PERIOD, + expectedLabel); + } + + 
private static Stream extractData() { + return Stream.of( + of("name=The Great Depression; start=1929; end=1939;", "1929/1939", "The Great Depression"), + of("name=Haagse International Arts Festival, 2000; start=2000-01-26; end=2000-02-20;", + "2000-01-26/2000-02-20", "Haagse International Arts Festival, 2000"), + of("start=1998-09-25; end=1998-09-25; scheme=W3C-DTF;", "1998-09-25/1998-09-25", null), + of("start=1998-09-25T14:20:00+10:00; scheme=W3C-DTF;", "1998-09-25/..", null), + of("end=1998-09-25T16:40:00+10:00; scheme=W3C-DTF;", "../1998-09-25", null), + of("end=1998-09-25T16:40+10:00; start=1998/01/01 scheme=W3C-DTF;", "../1998-09-25", null), + + //Scheme checks + of("name=The Great Depression; start=1929; end=1939; scheme=W3CDTF;", "1929/1939", "The Great Depression"), + of("name=The Great Depression; start=1929; end=1939; scheme=W3C-DTF;", "1929/1939", "The Great Depression"), + of("scheme=W3C-DTF; name=The Great Depression; start=1929; end=1939;", "1929/1939", "The Great Depression"), + of("name=The Great Depression; start=1929; end=1939; scheme=W3C-DTF", "1929/1939", "The Great Depression"), + of("name=The Great Depression; start=1929; end=1939; scheme=W3C-", null, null, null), + + //Duplicate fields are not valid + of("name=The Great Depression; start=1929; end=1939; name=The Great Depression;", null, null, null), + of("name=The Great Depression; start=1929; end=1939; start=1929;", null, null, null), + of("name=The Great Depression; end=1939; start=1929; end=1939;", null, null, null), + + //Both start and end empty or missing is not valid + of("name=The Great Depression; start=; end=;", null, null, null), + of("name=The Great Depression;", null, null, null), + + //One end bounded + of("name=The Great Depression; start=; end=1939;", "../1939", "The Great Depression"), + of("name=The Great Depression; start=1929; end=;", "1929/..", "The Great Depression"), + + //Full date + of("name=Haagse International Arts Festival, 2000; start=2000-01-26; end=2000-02-20;", +
"2000-01-26/2000-02-20", "Haagse International Arts Festival, 2000"), + + //Full date and time + of("start=1999-09-25T14:20:00+10:00; end=1999-09-25T16:40:00+10:00; scheme=W3C-DTF;", "1999-09-25/1999-09-25", null), + of("start=1999-09-25T14:20:00+10:00; scheme=W3C-DTF;", "1999-09-25/..", null), + of("end=1999-09-25T16:40:00+10:00; scheme=W3C-DTF;", "../1999-09-25", null), + + //Missing semicolon + of("end=1998-09-25T16:40:00+10:00; start=1998 scheme=W3C-DTF;", "../1998-09-25", null), + + //Invalid date + of("end=1998-09-25T16:40+10:00; start=1998-1986; scheme=W3C-DTF;", null, null, null), + // + //Spaces at the end of the name are cleaned up + of("name=The Great Depression ; start=1929; end=1939;", "1929/1939", "The Great Depression"), + + //Spaces at the beginning of the name are cleaned up + of("name= The Great Depression; start=1929; end=1939;", "1929/1939", "The Great Depression"), + + //Name at the beginning without field name + of("The Great Depression; start=1929; end=1939;", "1929/1939", null), + + //Name at the beginning without field name and wrapped in spaces + of(" The Great Depression ; start=1929; end=1939;", "1929/1939", null), + + //Normal case + of("name=The Great Depression; start=1929; end=1939;", "1929/1939", "The Great Depression") + ); + } +} diff --git a/metis-normalization/src/test/java/eu/europeana/normalization/dates/extraction/extractors/DecadeDateExtractorTest.java b/metis-normalization/src/test/java/eu/europeana/normalization/dates/extraction/extractors/DecadeDateExtractorTest.java new file mode 100644 index 0000000000..3201cad090 --- /dev/null +++ b/metis-normalization/src/test/java/eu/europeana/normalization/dates/extraction/extractors/DecadeDateExtractorTest.java @@ -0,0 +1,63 @@ +package eu.europeana.normalization.dates.extraction.extractors; + +import static org.junit.jupiter.params.provider.Arguments.of; + +import eu.europeana.normalization.dates.DateNormalizationExtractorMatchId; +import
eu.europeana.normalization.dates.DateNormalizationResult; +import java.util.stream.Stream; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.Arguments; +import org.junit.jupiter.params.provider.MethodSource; + +class DecadeDateExtractorTest implements DateExtractorTest { + + private static final DecadeDateExtractor DECADE_DATE_EXTRACTOR = new DecadeDateExtractor(); + + @ParameterizedTest + @MethodSource + void extract(String input, String expected) { + final DateNormalizationResult dateNormalizationResult = DECADE_DATE_EXTRACTOR.extractDateProperty(input); + assertDateNormalizationResult(dateNormalizationResult, expected, DateNormalizationExtractorMatchId.DECADE); + } + + private static Stream extract() { + return Stream.of( + of("180x", "180X"), + of("180u", "180X"), + of("180X", "180X"), + of("180U", "180X"), + of(" 180u ", "180X"), + of("180x?", "180X?"), + of("180u?", "180X?"), + of("180??", "180X?"), + of("?180x", "180X?"), + of("?180u", "180X?"), + of("?180x?", "180X?"), + of("?180u?", "180X?"), + of("?180??", "180X?"), + + //Future dates not allowed + of("222u", null), + //This is an ambiguous case because hyphen can be used as a separator + of("180-?", null), + //Ambiguous, possible open end + of("180-", null), + //Non u, x or ? 
+ of("180s", null), + //Only one question mark not supported + of("180?", null), + //Too many digits + of("1800", null), + of("?1280x", null), + of("?1280u?", null), + of("?1280??", null), + of("1280??", null), + + of("18??", null), //Too few digits + of("18--", null), //Too few digits + of("18..", null), //Too few digits + of("1...", null) //Too few digits + ); + } + +} \ No newline at end of file diff --git a/metis-normalization/src/test/java/eu/europeana/normalization/dates/extraction/extractors/EdtfDateExtractorTest.java b/metis-normalization/src/test/java/eu/europeana/normalization/dates/extraction/extractors/EdtfDateExtractorTest.java new file mode 100644 index 0000000000..e9f60f3453 --- /dev/null +++ b/metis-normalization/src/test/java/eu/europeana/normalization/dates/extraction/extractors/EdtfDateExtractorTest.java @@ -0,0 +1,167 @@ +package eu.europeana.normalization.dates.extraction.extractors; + +import static org.junit.jupiter.params.provider.Arguments.of; + +import eu.europeana.normalization.dates.DateNormalizationExtractorMatchId; +import eu.europeana.normalization.dates.DateNormalizationResult; +import java.util.stream.Stream; +import org.junit.jupiter.api.DisplayName; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.Arguments; +import org.junit.jupiter.params.provider.MethodSource; + +class EdtfDateExtractorTest implements DateExtractorTest { + + private static final EdtfDateExtractor EDTF_DATE_EXTRACTOR = new EdtfDateExtractor(); + + private void assertExtract(String input, String expected) { + final DateNormalizationResult dateNormalizationResult = EDTF_DATE_EXTRACTOR.extractDateProperty(input); + assertDateNormalizationResult(dateNormalizationResult, expected, DateNormalizationExtractorMatchId.EDTF); + } + + @ParameterizedTest + @MethodSource + @DisplayName("[year][“-”][month][“-”][day] Complete representation") + void completeDateRepresentationLevel0(String input, String expected) { + 
assertExtract(input, expected); + } + + @ParameterizedTest + @MethodSource + @DisplayName("[year][“-”][month] Reduced precision for year and month") + void reducedPrecisionForYearAndMonthLevel0(String input, String expected) { + assertExtract(input, expected); + } + + @ParameterizedTest + @MethodSource + @DisplayName("[year] Reduced precision for year") + void reducedPrecisionForYearLevel0(String input, String expected) { + assertExtract(input, expected); + } + + @ParameterizedTest + @MethodSource + @DisplayName("Letter-prefixed calendar year") + void letterPrefixedCalendarYearLevel1(String input, String expected) { + assertExtract(input, expected); + } + + @ParameterizedTest + @MethodSource + @DisplayName("The characters '?', '~' and '%' are used to mean \"uncertain\", \"approximate\", and \"uncertain\" as well as \"approximate\", respectively") + void dateQualificationLevel1(String input, String expected) { + assertExtract(input, expected); + } + + @ParameterizedTest + @MethodSource + @DisplayName("Negative Calendar Year") + void negativeCalendarYearLevel1(String input, String expected) { + assertExtract(input, expected); + } + + private static Stream completeDateRepresentationLevel0() { + return Stream.of( + of("1989-11-01", "1989-11-01"), + of("0989-11-01", "0989-11-01"), + of("0989-11-01", "0989-11-01"), + //Digits missing on year + of("198-11-01", null), + //Digits missing on month or day + of("1989-11-1", null), + of("1989-1-01", null), + //Anything other than hyphen "-" is not valid + of("1989/11/01", null), + + //Complete representations for calendar date and (local) time of day + of("1989-11-01T23:59:59", "1989-11-01"), + of("1989-11-01T23:59", "1989-11-01"), + of("1989-11-01T23", "1989-11-01"), + of("1989-11-01T", "1989-11-01"), + of("1989-11-01T23:59:5", "1989-11-01"), + of("1989-11-01T23:5:59", "1989-11-01"), + of("1989-11-01t23:59:59", null), + of("1989-11-01 23:59:59", null), + + //Complete representations for calendar date and UTC time of day + 
of("1989-11-01T23:59:59Z", "1989-11-01"), + of("1989-11-01t23:59:59Z", null), + of("1989-11-01 23:59:59Z", null), + + //Date and time with time shift in hours (only) + of("1989-11-01T23:59:59-04", "1989-11-01"), + of("1989-11-01T23:59:59+04", "1989-11-01"), + of("1989-11-01t23:59:59-04", null), + of("1989-11-01 23:59:59-04", null), + + //Date and time with time shift in hours and minutes + of("1989-11-01T23:59:59-04:44", "1989-11-01"), + of("1989-11-01T23:59:59+04:44", "1989-11-01"), + of("1989-11-01t23:59:59-04:44", null), + of("1989-11-01 23:59:59-04:44", null) + ); + } + + private static Stream reducedPrecisionForYearAndMonthLevel0() { + return Stream.of( + of("1989-11", "1989-11"), + of("0989-11", "0989-11"), + //Digits missing on year + of("198-11", null), + //Digits missing on month + of("1989-1", null), + //Anything other than hyphen "-" is not valid + of("1989/11", null) + ); + } + + private static Stream reducedPrecisionForYearLevel0() { + return Stream.of( + of("1989", "1989"), + of("0989", "0989"), + //Digits missing on year + of("198", null) + ); + } + + private static Stream letterPrefixedCalendarYearLevel1() { + return Stream.of( + of("Y-123456789", "Y-123456789"), + //Non prefixed + of("-123456789", null), + //Future dates are not valid + of("Y123456789", null), + //Month and day not valid + of("Y123456789/11/01", null), + //Overflow, max is +-999999999 + of("Y1234567890", null), + of("Y-1234567890", null), + //Too low values + of("Y0", null), + of("Y1", null), + of("Y-1", null), + of("Y", null), + of("YnonValidNumber", null) + ); + } + + private static Stream dateQualificationLevel1() { + return Stream.of( + of("1989?", "1989?"), + of("1989~", "1989~"), + of("1989-11?", "1989-11?"), + of("1989-11~", "1989-11~"), + of("1989-11-01%", "1989-11-01%") + ); + } + + private static Stream negativeCalendarYearLevel1() { + return Stream.of( + of("-1989", "-1989"), + of("-9999", "-9999"), + of("-0989", "-0989"), + of("-11989", null) + ); + } +} \ No newline at 
end of file diff --git a/metis-normalization/src/test/java/eu/europeana/normalization/dates/extraction/extractors/EdtfRangeDateExtractorTest.java b/metis-normalization/src/test/java/eu/europeana/normalization/dates/extraction/extractors/EdtfRangeDateExtractorTest.java new file mode 100644 index 0000000000..f7b7cb45fb --- /dev/null +++ b/metis-normalization/src/test/java/eu/europeana/normalization/dates/extraction/extractors/EdtfRangeDateExtractorTest.java @@ -0,0 +1,155 @@ +package eu.europeana.normalization.dates.extraction.extractors; + +import static org.junit.jupiter.params.provider.Arguments.of; + +import eu.europeana.normalization.dates.DateNormalizationExtractorMatchId; +import eu.europeana.normalization.dates.DateNormalizationResult; +import java.util.stream.Stream; +import org.junit.jupiter.api.DisplayName; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.Arguments; +import org.junit.jupiter.params.provider.MethodSource; + +class EdtfRangeDateExtractorTest implements DateExtractorTest { + + private static final EdtfRangeDateExtractor EDTF_RANGE_DATE_EXTRACTOR = new EdtfRangeDateExtractor(); + + private void assertExtract(String input, String expected) { + final DateNormalizationResult dateNormalizationResult = EDTF_RANGE_DATE_EXTRACTOR.extractDateProperty(input); + assertDateNormalizationResult(dateNormalizationResult, expected, DateNormalizationExtractorMatchId.EDTF); + } + + @ParameterizedTest + @MethodSource + void dateIntervalRepresentationLevel0(String input, String expected) { + assertExtract(input, expected); + } + + @ParameterizedTest + @MethodSource + @DisplayName("Letter-prefixed calendar year interval") + void letterPrefixedCalendarYearIntervalLevel1(String input, String expected) { + assertExtract(input, expected); + } + + @ParameterizedTest + @MethodSource + @DisplayName("Open time interval") + void openTimeIntervalLevel1(String input, String expected) { + assertExtract(input, expected); + } + + 
@ParameterizedTest + @MethodSource + @DisplayName("Unknown time interval") + void unknownTimeIntervalLevel1(String input, String expected) { + assertExtract(input, expected); + } + + private static Stream dateIntervalRepresentationLevel0() { + return Stream.of( + of("1989/1990", "1989/1990"), + of("1989-11/1990-11", "1989-11/1990-11"), + of("1989-11-01/1990-11-01", "1989-11-01/1990-11-01"), + of("1989-11-01/1990-11", "1989-11-01/1990-11"), + of("1989-11-01/1990", "1989-11-01/1990"), + of("1989/1990-11", "1989/1990-11"), + of("1989/1990-11-01", "1989/1990-11-01"), + of("1989-00/1990-00", null), + of("1989-00-00/1990-00-00", null), + of("1989 / 1990", "1989/1990"), + //Dash not valid + of("1989-1990", null), + //Missing digits + of("989-1990", null), + of("1989-990", null) + ); + } + + private static Stream letterPrefixedCalendarYearIntervalLevel1() { + return Stream.of( + of("Y-123456789/Y-123456788", "Y-123456789/Y-123456788"), + //Non prefixed + of("-123456789/-123456788", null) + ); + } + + private static Stream openTimeIntervalLevel1() { + return Stream.of( + //Open start + of("../1989-11-01", "../1989-11-01"), + of("../1989-11", "../1989-11"), + of("../1989", "../1989"), + of("../1989-11-01~", "../1989-11-01~"), + of("../1989-11~", "../1989-11~"), + of("../1989~", "../1989~"), + of("../1989-11-01?", "../1989-11-01?"), + of("../1989-11?", "../1989-11?"), + of("../1989?", "../1989?"), + of("../1989-11-01%", "../1989-11-01%"), + of("../1989-11%", "../1989-11%"), + of("../1989%", "../1989%"), + of(".. / 1989-11-01", "../1989-11-01"), + of("../ 1989-11-01", "../1989-11-01"), + of(".. 
/1989-11-01", "../1989-11-01"), + + //Open end + of("1989-11-01/..", "1989-11-01/.."), + of("1989-11/..", "1989-11/.."), + of("1989/..", "1989/.."), + of("1989-11-01~/..", "1989-11-01~/.."), + of("1989-11~/..", "1989-11~/.."), + of("1989~/..", "1989~/.."), + of("1989-11-01?/..", "1989-11-01?/.."), + of("1989-11?/..", "1989-11?/.."), + of("1989?/..", "1989?/.."), + of("1989-11-01%/..", "1989-11-01%/.."), + of("1989-11%/..", "1989-11%/.."), + of("1989%/..", "1989%/.."), + of("1989-11-01 / ..", "1989-11-01/.."), + of("1989-11-01 /..", "1989-11-01/.."), + of("1989-11-01/ ..", "1989-11-01/.."), + of("../..", null) + ); + } + + + private static Stream unknownTimeIntervalLevel1() { + return Stream.of( + //Unknown start + of("/1989-11-01", "../1989-11-01"), + of("/1989-11", "../1989-11"), + of("/1989", "../1989"), + of("/1989-11-01~", "../1989-11-01~"), + of("/1989-11~", "../1989-11~"), + of("/1989~", "../1989~"), + of("/1989-11-01?", "../1989-11-01?"), + of("/1989-11?", "../1989-11?"), + of("/1989?", "../1989?"), + of("/1989-11-01%", "../1989-11-01%"), + of("/1989-11%", "../1989-11%"), + of("/1989%", "../1989%"), + of(" / 1989-11-01", "../1989-11-01"), + of("/ 1989-11-01", "../1989-11-01"), + of(" /1989-11-01", "../1989-11-01"), + + //Unknown end + of("1989-11-01/", "1989-11-01/.."), + of("1989-11/", "1989-11/.."), + of("1989/", "1989/.."), + of("1989-11-01~/", "1989-11-01~/.."), + of("1989-11~/", "1989-11~/.."), + of("1989~/", "1989~/.."), + of("1989-11-01?/", "1989-11-01?/.."), + of("1989-11?/", "1989-11?/.."), + of("1989?/", "1989?/.."), + of("1989-11-01%/", "1989-11-01%/.."), + of("1989-11%/", "1989-11%/.."), + of("1989%/", "1989%/.."), + of("1989-11-01 / ", "1989-11-01/.."), + of("1989-11-01 /", "1989-11-01/.."), + of("1989-11-01/ ", "1989-11-01/.."), + of("/", null) + ); + } +} \ No newline at end of file diff --git a/metis-normalization/src/test/java/eu/europeana/normalization/dates/extraction/extractors/FullDateDateExtractorTest.java 
b/metis-normalization/src/test/java/eu/europeana/normalization/dates/extraction/extractors/FullDateDateExtractorTest.java new file mode 100644 index 0000000000..3c65e22d47 --- /dev/null +++ b/metis-normalization/src/test/java/eu/europeana/normalization/dates/extraction/extractors/FullDateDateExtractorTest.java @@ -0,0 +1,51 @@ +package eu.europeana.normalization.dates.extraction.extractors; + +import static org.junit.jupiter.params.provider.Arguments.of; + +import eu.europeana.normalization.dates.DateNormalizationExtractorMatchId; +import eu.europeana.normalization.dates.DateNormalizationResult; +import java.util.stream.Stream; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.Arguments; +import org.junit.jupiter.params.provider.MethodSource; + +class FullDateDateExtractorTest implements DateExtractorTest { + + private static final FullDateDateExtractor PATTERN_FORMATTED_FULL_DATE_DATE_EXTRACTOR = new FullDateDateExtractor(); + + @ParameterizedTest + @MethodSource + void extract(String input, String expected) { + assertExtract(input, expected); + } + + void assertExtract(String input, String expected) { + final DateNormalizationResult dateNormalizationResult = PATTERN_FORMATTED_FULL_DATE_DATE_EXTRACTOR.extractDateProperty(input); + assertDateNormalizationResult(dateNormalizationResult, expected, DateNormalizationExtractorMatchId.FORMATTED_FULL_DATE); + } + + private static Stream extract() { + return Stream.of( + of("Wed Nov 01 01:00:00 CEST 1989", "1989-11-01"), + of("Τετ Νοε 01 01:00:00 CEST 1989", "1989-11-01"), + of("1989-11-01 04:05:06 UTC", "1989-11-01"), + of("1989-11-01 04:05:06 UTC+01", "1989-11-01"), + of("1989-11-01 04:05:06 UTC-01", "1989-11-01"), + of("1989-11-01 01:02:03 UTC", "1989-11-01"), + of("1989-11-01 01:02:03", "1989-11-01"), + of("1989-11-01 01:02:03.1", "1989-11-01"), + of("1989-11-01 01:02:03.12", "1989-11-01"), + of("1989-11-01 01:02:03.123", "1989-11-01"), + + //Invalids + of("Wed Nov 01 
01:00:00 CEST", null), + of("Wed Nov 01 01:00:00", null), + of("1989-11-01 01:02:03.1234", null), + of("1989-11-01 01:02:03+01", null), + of("1989-11-01 01:02", null), + of("1989-11-01 01", null), + of("1989-11-01", null) + ); + } + +} \ No newline at end of file diff --git a/metis-normalization/src/test/java/eu/europeana/normalization/dates/extraction/extractors/LongNegativeYearDateExtractorTest.java b/metis-normalization/src/test/java/eu/europeana/normalization/dates/extraction/extractors/LongNegativeYearDateExtractorTest.java new file mode 100644 index 0000000000..54461b6e83 --- /dev/null +++ b/metis-normalization/src/test/java/eu/europeana/normalization/dates/extraction/extractors/LongNegativeYearDateExtractorTest.java @@ -0,0 +1,43 @@ +package eu.europeana.normalization.dates.extraction.extractors; + +import static org.junit.jupiter.params.provider.Arguments.of; + +import eu.europeana.normalization.dates.DateNormalizationExtractorMatchId; +import eu.europeana.normalization.dates.DateNormalizationResult; +import java.util.stream.Stream; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.Arguments; +import org.junit.jupiter.params.provider.MethodSource; + +class LongNegativeYearDateExtractorTest implements DateExtractorTest { + + private static final LongNegativeYearDateExtractor LONG_NEGATIVE_YEAR_DATE_EXTRACTOR = new LongNegativeYearDateExtractor(); + + @ParameterizedTest + @MethodSource + void extract(String input, String expected) { + assertExtract(input, expected); + } + + void assertExtract(String input, String expected) { + final DateNormalizationResult dateNormalizationResult = LONG_NEGATIVE_YEAR_DATE_EXTRACTOR.extractDateProperty(input); + assertDateNormalizationResult(dateNormalizationResult, expected, DateNormalizationExtractorMatchId.LONG_NEGATIVE_YEAR); + } + + private static Stream extract() { + return Stream.of( + of("-12345", "Y-12345"), + of("-123456", "Y-123456"), + of("-1234567", "Y-1234567"), + 
of("-12345678", "Y-12345678"), + of("-123456789", "Y-123456789"), + + //Future dates are not valid + of("123456789", null), + //Less digits + of("-1234", null), + //Greater digits + of("-1234567890", null) + ); + } +} \ No newline at end of file diff --git a/metis-normalization/src/test/java/eu/europeana/normalization/dates/extraction/extractors/LongNegativeYearRangeDateExtractorTest.java b/metis-normalization/src/test/java/eu/europeana/normalization/dates/extraction/extractors/LongNegativeYearRangeDateExtractorTest.java new file mode 100644 index 0000000000..df28e26cc7 --- /dev/null +++ b/metis-normalization/src/test/java/eu/europeana/normalization/dates/extraction/extractors/LongNegativeYearRangeDateExtractorTest.java @@ -0,0 +1,50 @@ +package eu.europeana.normalization.dates.extraction.extractors; + +import static org.junit.jupiter.params.provider.Arguments.of; + +import eu.europeana.normalization.dates.DateNormalizationExtractorMatchId; +import eu.europeana.normalization.dates.DateNormalizationResult; +import java.util.stream.Stream; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.Arguments; +import org.junit.jupiter.params.provider.MethodSource; + +class LongNegativeYearRangeDateExtractorTest implements DateExtractorTest { + + private static final LongNegativeYearRangeDateExtractor LONG_NEGATIVE_YEAR_RANGE_DATE_EXTRACTOR = new LongNegativeYearRangeDateExtractor(); + + @ParameterizedTest + @MethodSource + void extract(String input, String expected) { + assertExtract(input, expected); + } + + void assertExtract(String input, String expected) { + final DateNormalizationResult dateNormalizationResult = LONG_NEGATIVE_YEAR_RANGE_DATE_EXTRACTOR.extractDateProperty(input); + assertDateNormalizationResult(dateNormalizationResult, expected, DateNormalizationExtractorMatchId.LONG_NEGATIVE_YEAR); + } + + private static Stream extract() { + return Stream.of( + of("-12345/-12344", "Y-12345/Y-12344"), + of("-123456/-123455", 
"Y-123456/Y-123455"), + of("-1234567/-1234566", "Y-1234567/Y-1234566"), + of("-12345678/-12345677", "Y-12345678/Y-12345677"), + of("-123456789/-123456788", "Y-123456789/Y-123456788"), + + //Dash + of("-12345--12344", null), + of("-123456--123455", null), + of("-1234567--1234566", null), + of("-12345678--12345677", null), + of("-123456789--123456788", null), + + //Future dates are not valid + of("123456788/123456789", null), + //Less digits + of("-1234/-1233", null), + //Greater digits + of("-1234567890/-1234567889", null) + ); + } +} \ No newline at end of file diff --git a/metis-normalization/src/test/java/eu/europeana/normalization/dates/extraction/extractors/MonthNameDateExtractorTest.java b/metis-normalization/src/test/java/eu/europeana/normalization/dates/extraction/extractors/MonthNameDateExtractorTest.java new file mode 100644 index 0000000000..441cab3bb6 --- /dev/null +++ b/metis-normalization/src/test/java/eu/europeana/normalization/dates/extraction/extractors/MonthNameDateExtractorTest.java @@ -0,0 +1,173 @@ +package eu.europeana.normalization.dates.extraction.extractors; + +import static org.junit.jupiter.params.provider.Arguments.of; + +import eu.europeana.normalization.dates.DateNormalizationExtractorMatchId; +import eu.europeana.normalization.dates.DateNormalizationResult; +import java.util.stream.Stream; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.Arguments; +import org.junit.jupiter.params.provider.MethodSource; + +class MonthNameDateExtractorTest implements DateExtractorTest { + + private static final MonthNameDateExtractor PATTERN_MONTH_NAME_DATE_EXTRACTOR = new MonthNameDateExtractor(); + + @ParameterizedTest + @MethodSource + void extractDayMonthYear(String input, String expected) { + assertExtract(input, expected); + } + + @ParameterizedTest + @MethodSource + void extractMonthDayYear(String input, String expected) { + assertExtract(input, expected); + } + + @ParameterizedTest + @MethodSource + 
void extractMonthYear(String input, String expected) { + assertExtract(input, expected); + } + + void assertExtract(String input, String expected) { + final DateNormalizationResult dateNormalizationResult = PATTERN_MONTH_NAME_DATE_EXTRACTOR.extractDateProperty(input); + assertDateNormalizationResult(dateNormalizationResult, expected, DateNormalizationExtractorMatchId.MONTH_NAME); + } + + private static Stream extractDayMonthYear() { + return Stream.of( + of("01 November 1989", "1989-11-01"), + of("32 November 1989", null), + of("01.November.1989", "1989-11-01"), + of("01,November,1989", "1989-11-01"), + //Combination of separators + of("01 November.1989", "1989-11-01"), + of("01 November,1989", "1989-11-01"), + of("01.November 1989", "1989-11-01"), + of("01.November,1989", "1989-11-01"), + of("01,November 1989", "1989-11-01"), + of("01,November.1989", "1989-11-01"), + + //Some other languages or name formats + of("01 nov. 1989", "1989-11-01"), + of("01 ное 1989", "1989-11-01"), + of("01 Νοεμβρίου 1989", "1989-11-01"), + of("01 January 1989", "1989-01-01"), + of("01 Νοεμβρίου 1989", "1989-11-01"), + of("01 νοεμβρίου 1989", "1989-11-01"), + of("01 ΝΟΕΜΒΡΊΟΥ 1989", "1989-11-01"), + //Italian + of("01 Novembre 1989", "1989-11-01"), + + //Incorrect month + of("99 November 9989", null), + of("99 November 9989", null), + + //Too few digits on year + of("1 January 989", null), + of("1.January.989", null), + of("1,January,989", null), + //Too many digits on year + of("01 January 12345", null), + //Too many digits on day + of("123 January 1234", null), + + //Other invalids + //Double spaces should not match + of("1989 November 01", null), + //Double dots should not match + of("1989..November..01", null), + //Double commas should not match + of("1989,,November,,01", null) + ); + } + + private static Stream extractMonthDayYear() { + + return Stream.of( + of("November 01 1989", "1989-11-01"), + of("November 32 1989", null), + of("November.01.1989", "1989-11-01"), + 
of("November,01,1989", "1989-11-01"), + //Combination of separators + of("November 01.1989", "1989-11-01"), + of("November 01,1989", "1989-11-01"), + of("November.01 1989", "1989-11-01"), + of("November.01,1989", "1989-11-01"), + of("November,01 1989", "1989-11-01"), + of("November,01.1989", "1989-11-01"), + + //Some other languages or name formats + of("nov. 01 1989", "1989-11-01"), + of("ное 01 1989", "1989-11-01"), + of("January 01 1989", "1989-01-01"), + of("Νοεμβρίου 01 1989", "1989-11-01"), + of("νοεμβρίου 01 1989", "1989-11-01"), + of("ΝΟΕΜΒΡΊΟΥ 01 1989", "1989-11-01"), + //Italian + of("Novembre 01 1989", "1989-11-01"), + + //Incorrect month + of("November 99 9989", null), + of("November 99 9989", null), + + //Too few digits on year + of("January 1 989", null), + of("January.1.989", null), + of("January,1,989", null), + //Too many digits on year + of("January 01 12345", null), + //Too many digits on day + of("January 123 1234", null), + + //Other invalids + //Double spaces should not match + of("November 01 1989", null), + //Double dots should not match + of("November..01..1989", null), + //Double commas should not match + of("November,,01,,1989", null) + ); + } + + private static Stream extractMonthYear() { + + return Stream.of( + //MONTH-YEAR + of("November 1989", "1989-11"), + of("November.1989", "1989-11"), + of("November,1989", "1989-11"), + + //Some other languages or name formats + of("nov. 
1989", "1989-11"), + of("ное 1989", "1989-11"), + of("January 1989", "1989-01"), + of("Νοεμβρίου 1989", "1989-11"), + of("νοεμβρίου 1989", "1989-11"), + of("ΝΟΕΜΒΡΊΟΥ 1989", "1989-11"), + //Italian + of("Novembre 1989", "1989-11"), + + //Incorrect month year + of("November 9989", null), + of("November 9989", null), + //Too few digits on year + of("January 989", null), + of("January.989", null), + of("January,989", null), + //Too many digits on year + of("January 12345", null), + + //Other invalids + //Double spaces should not match + of("November 1989", null), + //Double dots should not match + of("November..1989", null), + //Double commas should not match + of("November,,1989", null) + ); + } + +} diff --git a/metis-normalization/src/test/java/eu/europeana/normalization/dates/extraction/dateextractors/NumericPartsDateExtractorTest.java b/metis-normalization/src/test/java/eu/europeana/normalization/dates/extraction/extractors/NumericPartsDateExtractorTest.java similarity index 92% rename from metis-normalization/src/test/java/eu/europeana/normalization/dates/extraction/dateextractors/NumericPartsDateExtractorTest.java rename to metis-normalization/src/test/java/eu/europeana/normalization/dates/extraction/extractors/NumericPartsDateExtractorTest.java index 0aaa161ea8..09eb56d96f 100644 --- a/metis-normalization/src/test/java/eu/europeana/normalization/dates/extraction/dateextractors/NumericPartsDateExtractorTest.java +++ b/metis-normalization/src/test/java/eu/europeana/normalization/dates/extraction/extractors/NumericPartsDateExtractorTest.java @@ -1,22 +1,18 @@ -package eu.europeana.normalization.dates.extraction.dateextractors; +package eu.europeana.normalization.dates.extraction.extractors; import static eu.europeana.normalization.dates.DateNormalizationExtractorMatchId.NUMERIC_ALL_VARIANTS; import static eu.europeana.normalization.dates.DateNormalizationExtractorMatchId.NUMERIC_ALL_VARIANTS_XX; -import static 
eu.europeana.normalization.dates.DateNormalizationExtractorMatchId.YYYY_MM_DD_SPACES; -import static eu.europeana.normalization.dates.edtf.DateQualification.NO_QUALIFICATION; -import static org.junit.jupiter.api.Assertions.assertEquals; +import static eu.europeana.normalization.dates.DateNormalizationExtractorMatchId.NUMERIC_SPACES_VARIANT; import static org.junit.jupiter.params.provider.Arguments.of; import eu.europeana.normalization.dates.DateNormalizationExtractorMatchId; import eu.europeana.normalization.dates.DateNormalizationResult; -import eu.europeana.normalization.dates.DateNormalizationResultStatus; -import eu.europeana.normalization.dates.edtf.DateQualification; import java.util.stream.Stream; import org.junit.jupiter.params.ParameterizedTest; import org.junit.jupiter.params.provider.Arguments; import org.junit.jupiter.params.provider.MethodSource; -class NumericPartsDateExtractorTest { +class NumericPartsDateExtractorTest implements DateExtractorTest { private static final NumericPartsDateExtractor NUMERIC_PARTS_DATE_EXTRACTOR = new NumericPartsDateExtractor(); @@ -47,25 +43,15 @@ void extractDMY_XX(String input, String expected) { @ParameterizedTest @MethodSource void extractDateSpaces(String input, String expected) { - assertExtract(input, expected, YYYY_MM_DD_SPACES); + assertExtract(input, expected, NUMERIC_SPACES_VARIANT); } void assertExtract(String input, String expected, DateNormalizationExtractorMatchId dateNormalizationExtractorMatchId) { - final DateNormalizationResult dateNormalizationResult = NUMERIC_PARTS_DATE_EXTRACTOR.extractDateProperty(input, - NO_QUALIFICATION); - if (expected == null) { - assertEquals(DateNormalizationResultStatus.NO_MATCH, dateNormalizationResult.getDateNormalizationResultStatus()); - } else { - final String actual = dateNormalizationResult.getEdtfDate().toString(); - assertEquals(expected, actual); - assertEquals(actual.contains("?"), - dateNormalizationResult.getEdtfDate().getDateQualification() == 
DateQualification.UNCERTAIN); - assertEquals(dateNormalizationExtractorMatchId, dateNormalizationResult.getDateNormalizationExtractorMatchId()); - } + final DateNormalizationResult dateNormalizationResult = NUMERIC_PARTS_DATE_EXTRACTOR.extractDateProperty(input); + assertDateNormalizationResult(dateNormalizationResult, expected, dateNormalizationExtractorMatchId); } private static Stream extractDateSpaces() { - return Stream.of( of("1989 11 01", "1989-11-01"), of("1989 11 01?", "1989-11-01?"), @@ -85,7 +71,6 @@ private static Stream extractDateSpaces() { } private static Stream extractYMD() { - return Stream.of( //YEAR //A month and day can be missing @@ -151,7 +136,6 @@ private static Stream extractYMD() { } private static Stream extractDMY() { - return Stream.of( //MONTH-YEAR of("11-1989", "1989-11"), @@ -220,7 +204,6 @@ private static Stream extractDMY() { } private static Stream extractYMD_XX() { - return Stream.of( //YEAR of("198X", "198X"), @@ -336,7 +319,6 @@ private static Stream extractYMD_XX() { } private static Stream extractDMY_XX() { - return Stream.of( //YEAR-MONTH of("XX.1989", "1989"), diff --git a/metis-normalization/src/test/java/eu/europeana/normalization/dates/extraction/dateextractors/NumericPartsRangeDateExtractorTest.java b/metis-normalization/src/test/java/eu/europeana/normalization/dates/extraction/extractors/NumericPartsRangeDateExtractorTest.java similarity index 58% rename from metis-normalization/src/test/java/eu/europeana/normalization/dates/extraction/dateextractors/NumericPartsRangeDateExtractorTest.java rename to metis-normalization/src/test/java/eu/europeana/normalization/dates/extraction/extractors/NumericPartsRangeDateExtractorTest.java index 65fb376f27..fe61ef9fba 100644 --- a/metis-normalization/src/test/java/eu/europeana/normalization/dates/extraction/dateextractors/NumericPartsRangeDateExtractorTest.java +++ 
b/metis-normalization/src/test/java/eu/europeana/normalization/dates/extraction/extractors/NumericPartsRangeDateExtractorTest.java @@ -1,19 +1,15 @@ -package eu.europeana.normalization.dates.extraction.dateextractors; +package eu.europeana.normalization.dates.extraction.extractors; import static eu.europeana.normalization.dates.DateNormalizationExtractorMatchId.NUMERIC_RANGE_ALL_VARIANTS; import static eu.europeana.normalization.dates.DateNormalizationExtractorMatchId.NUMERIC_RANGE_ALL_VARIANTS_XX; -import static eu.europeana.normalization.dates.edtf.DateQualification.NO_QUALIFICATION; -import static org.junit.jupiter.api.Assertions.assertEquals; import eu.europeana.normalization.dates.DateNormalizationExtractorMatchId; import eu.europeana.normalization.dates.DateNormalizationResult; -import eu.europeana.normalization.dates.DateNormalizationResultStatus; -import eu.europeana.normalization.dates.edtf.DateQualification; import org.junit.jupiter.params.ParameterizedTest; import org.junit.jupiter.params.provider.ArgumentsSource; -class NumericPartsRangeDateExtractorTest { +class NumericPartsRangeDateExtractorTest implements DateExtractorTest { private static final NumericPartsRangeDateExtractor NUMERIC_PARTS_RANGE_DATE_EXTRACTOR = new NumericPartsRangeDateExtractor(); @@ -42,17 +38,7 @@ void extractDMY_XX(String input, String expected) { } void extract(String input, String expected, DateNormalizationExtractorMatchId dateNormalizationExtractorMatchId) { - final DateNormalizationResult dateNormalizationResult = NUMERIC_PARTS_RANGE_DATE_EXTRACTOR.extractDateProperty(input, - NO_QUALIFICATION); - if (expected == null) { - assertEquals(DateNormalizationResultStatus.NO_MATCH, dateNormalizationResult.getDateNormalizationResultStatus()); - } else { - final String actual = dateNormalizationResult.getEdtfDate().toString(); - assertEquals(expected, actual); - assertEquals(actual.contains("?"), - dateNormalizationResult.getEdtfDate().getDateQualification() == 
DateQualification.UNCERTAIN); - assertEquals(actual.contains(".."), dateNormalizationResult.getEdtfDate().isOpen()); - assertEquals(dateNormalizationExtractorMatchId, dateNormalizationResult.getDateNormalizationExtractorMatchId()); - } + final DateNormalizationResult dateNormalizationResult = NUMERIC_PARTS_RANGE_DATE_EXTRACTOR.extractDateProperty(input); + assertDateNormalizationResult(dateNormalizationResult, expected, dateNormalizationExtractorMatchId); } } \ No newline at end of file diff --git a/metis-normalization/src/test/java/eu/europeana/normalization/dates/extraction/dateextractors/NumericRangeDMYArgumentsProvider.java b/metis-normalization/src/test/java/eu/europeana/normalization/dates/extraction/extractors/NumericRangeDMYArgumentsProvider.java similarity index 99% rename from metis-normalization/src/test/java/eu/europeana/normalization/dates/extraction/dateextractors/NumericRangeDMYArgumentsProvider.java rename to metis-normalization/src/test/java/eu/europeana/normalization/dates/extraction/extractors/NumericRangeDMYArgumentsProvider.java index cca3239501..3b494422d9 100644 --- a/metis-normalization/src/test/java/eu/europeana/normalization/dates/extraction/dateextractors/NumericRangeDMYArgumentsProvider.java +++ b/metis-normalization/src/test/java/eu/europeana/normalization/dates/extraction/extractors/NumericRangeDMYArgumentsProvider.java @@ -1,4 +1,4 @@ -package eu.europeana.normalization.dates.extraction.dateextractors; +package eu.europeana.normalization.dates.extraction.extractors; import static org.junit.jupiter.params.provider.Arguments.of; diff --git a/metis-normalization/src/test/java/eu/europeana/normalization/dates/extraction/dateextractors/NumericRangeDMYXXArgumentsProvider.java b/metis-normalization/src/test/java/eu/europeana/normalization/dates/extraction/extractors/NumericRangeDMYXXArgumentsProvider.java similarity index 99% rename from 
metis-normalization/src/test/java/eu/europeana/normalization/dates/extraction/dateextractors/NumericRangeDMYXXArgumentsProvider.java rename to metis-normalization/src/test/java/eu/europeana/normalization/dates/extraction/extractors/NumericRangeDMYXXArgumentsProvider.java index b534776338..86cfc609fc 100644 --- a/metis-normalization/src/test/java/eu/europeana/normalization/dates/extraction/dateextractors/NumericRangeDMYXXArgumentsProvider.java +++ b/metis-normalization/src/test/java/eu/europeana/normalization/dates/extraction/extractors/NumericRangeDMYXXArgumentsProvider.java @@ -1,4 +1,4 @@ -package eu.europeana.normalization.dates.extraction.dateextractors; +package eu.europeana.normalization.dates.extraction.extractors; import static org.junit.jupiter.params.provider.Arguments.of; diff --git a/metis-normalization/src/test/java/eu/europeana/normalization/dates/extraction/dateextractors/NumericRangeYMDArgumentsProvider.java b/metis-normalization/src/test/java/eu/europeana/normalization/dates/extraction/extractors/NumericRangeYMDArgumentsProvider.java similarity index 99% rename from metis-normalization/src/test/java/eu/europeana/normalization/dates/extraction/dateextractors/NumericRangeYMDArgumentsProvider.java rename to metis-normalization/src/test/java/eu/europeana/normalization/dates/extraction/extractors/NumericRangeYMDArgumentsProvider.java index 8e759ed439..8e8e660adc 100644 --- a/metis-normalization/src/test/java/eu/europeana/normalization/dates/extraction/dateextractors/NumericRangeYMDArgumentsProvider.java +++ b/metis-normalization/src/test/java/eu/europeana/normalization/dates/extraction/extractors/NumericRangeYMDArgumentsProvider.java @@ -1,4 +1,4 @@ -package eu.europeana.normalization.dates.extraction.dateextractors; +package eu.europeana.normalization.dates.extraction.extractors; import static org.junit.jupiter.params.provider.Arguments.of; diff --git 
a/metis-normalization/src/test/java/eu/europeana/normalization/dates/extraction/dateextractors/NumericRangeYMDXXArgumentsProvider.java b/metis-normalization/src/test/java/eu/europeana/normalization/dates/extraction/extractors/NumericRangeYMDXXArgumentsProvider.java similarity index 99% rename from metis-normalization/src/test/java/eu/europeana/normalization/dates/extraction/dateextractors/NumericRangeYMDXXArgumentsProvider.java rename to metis-normalization/src/test/java/eu/europeana/normalization/dates/extraction/extractors/NumericRangeYMDXXArgumentsProvider.java index 71a0f51f8a..f582df913e 100644 --- a/metis-normalization/src/test/java/eu/europeana/normalization/dates/extraction/dateextractors/NumericRangeYMDXXArgumentsProvider.java +++ b/metis-normalization/src/test/java/eu/europeana/normalization/dates/extraction/extractors/NumericRangeYMDXXArgumentsProvider.java @@ -1,4 +1,4 @@ -package eu.europeana.normalization.dates.extraction.dateextractors; +package eu.europeana.normalization.dates.extraction.extractors; import static org.junit.jupiter.params.provider.Arguments.of; diff --git a/metis-normalization/src/test/java/eu/europeana/normalization/normalizers/DatesNormalizerTest.java b/metis-normalization/src/test/java/eu/europeana/normalization/normalizers/DatesNormalizerTest.java index 70e2f679d4..e3a2d2bd8f 100644 --- a/metis-normalization/src/test/java/eu/europeana/normalization/normalizers/DatesNormalizerTest.java +++ b/metis-normalization/src/test/java/eu/europeana/normalization/normalizers/DatesNormalizerTest.java @@ -12,80 +12,43 @@ import static eu.europeana.normalization.dates.DateNormalizationExtractorMatchId.NUMERIC_ALL_VARIANTS; import static eu.europeana.normalization.dates.DateNormalizationExtractorMatchId.NUMERIC_ALL_VARIANTS_XX; import static eu.europeana.normalization.dates.DateNormalizationExtractorMatchId.NUMERIC_RANGE_ALL_VARIANTS; -import static eu.europeana.normalization.dates.DateNormalizationExtractorMatchId.YYYY_MM_DD_SPACES; -import 
static eu.europeana.normalization.dates.edtf.IntervalEdtfDate.DATE_INTERVAL_SEPARATOR; -import static org.junit.jupiter.api.Assertions.assertEquals; +import static eu.europeana.normalization.dates.DateNormalizationExtractorMatchId.NUMERIC_SPACES_VARIANT; import static org.junit.jupiter.params.provider.Arguments.of; import eu.europeana.normalization.dates.DateNormalizationExtractorMatchId; import eu.europeana.normalization.dates.DateNormalizationResult; -import eu.europeana.normalization.dates.DateNormalizationResultStatus; -import eu.europeana.normalization.dates.edtf.AbstractEdtfDate; -import eu.europeana.normalization.dates.edtf.DateBoundaryType; -import eu.europeana.normalization.dates.edtf.DateQualification; -import eu.europeana.normalization.dates.edtf.InstantEdtfDate; -import eu.europeana.normalization.dates.edtf.IntervalEdtfDate; -import java.util.Arrays; -import java.util.function.Function; +import eu.europeana.normalization.dates.extraction.extractors.DateExtractorTest; import java.util.stream.Stream; import org.junit.jupiter.params.ParameterizedTest; import org.junit.jupiter.params.provider.Arguments; import org.junit.jupiter.params.provider.MethodSource; -class DatesNormalizerTest { +class DatesNormalizerTest implements DateExtractorTest { private final static DatesNormalizer NORMALIZER = new DatesNormalizer(); - void assertExtract(String input, String expected, DateNormalizationExtractorMatchId dateNormalizationExtractorMatchId, - String label) { + @ParameterizedTest + @MethodSource + void extractDatePropertiesWithLabel(String input, String expected, + DateNormalizationExtractorMatchId dateNormalizationExtractorMatchId, String expectedLabel) { final DateNormalizationResult dateNormalizationResult = NORMALIZER.normalizeDateProperty(input); - if (expected != null) { - assertEquals(dateNormalizationExtractorMatchId, dateNormalizationResult.getDateNormalizationExtractorMatchId()); - assertEquals(label, dateNormalizationResult.getEdtfDate().getLabel()); - 
AbstractEdtfDate edtfDate = dateNormalizationResult.getEdtfDate(); - if (edtfDate instanceof IntervalEdtfDate) { - String startPart = expected.substring(0, expected.indexOf(DATE_INTERVAL_SEPARATOR)); - String endPart = expected.substring(expected.indexOf(DATE_INTERVAL_SEPARATOR) + 1); - InstantEdtfDate start = ((IntervalEdtfDate) edtfDate).getStart(); - InstantEdtfDate end = ((IntervalEdtfDate) edtfDate).getEnd(); - assertEdtfDate(startPart, start); - assertEdtfDate(endPart, end); - } else { - assertEdtfDate(expected, (InstantEdtfDate) dateNormalizationResult.getEdtfDate()); - } - assertEquals(expected, edtfDate.toString()); - } else { - assertEquals(DateNormalizationResultStatus.NO_MATCH, dateNormalizationResult.getDateNormalizationResultStatus()); - } - - } - - private static void assertEdtfDate(String expected, InstantEdtfDate instantEdtfDate) { - assertEquals(expected.contains("?"), instantEdtfDate.getDateQualification() == DateQualification.UNCERTAIN); - assertEquals(expected.contains("~"), instantEdtfDate.getDateQualification() == DateQualification.APPROXIMATE); - assertEquals(expected.contains("%"), instantEdtfDate.getDateQualification() == DateQualification.UNCERTAIN_APPROXIMATE); - assertEquals(expected.equals(DateBoundaryType.OPEN.getSerializedRepresentation()), - instantEdtfDate.getDateBoundaryType() == DateBoundaryType.OPEN - || instantEdtfDate.getDateBoundaryType() == DateBoundaryType.UNKNOWN); + assertDateNormalizationResult(dateNormalizationResult, expected, dateNormalizationExtractorMatchId, expectedLabel); } @ParameterizedTest @MethodSource - void extractDateProperties(String input, String expected, DateNormalizationExtractorMatchId dateNormalizationExtractorMatchId, - String label) { - assertExtract(input, expected, dateNormalizationExtractorMatchId, label); + void extractDatePropertiesWithoutLabel(String input, String expected, + DateNormalizationExtractorMatchId dateNormalizationExtractorMatchId) { + final DateNormalizationResult 
dateNormalizationResult = NORMALIZER.normalizeDateProperty(input); + assertDateNormalizationResult(dateNormalizationResult, expected, dateNormalizationExtractorMatchId); } - private static Stream extractDateProperties() { - Stream argumentsWithoutLabel = Stream.of( - extractDatePropertiesWithoutLabel() - ).flatMap(Function.identity()).map(arguments -> - { - Object[] argumentsWithLabel = Arrays.copyOf(arguments.get(), arguments.get().length + 1); - argumentsWithLabel[argumentsWithLabel.length - 1] = null; - return of(argumentsWithLabel); - }); - return Stream.concat(extractDatePropertiesWithLabel(), argumentsWithoutLabel); + @ParameterizedTest + @MethodSource + void extractGenericPropertiesWithoutLabel(String input, String expected, + DateNormalizationExtractorMatchId dateNormalizationExtractorMatchId) { + final DateNormalizationResult dateNormalizationResult = NORMALIZER.normalizeGenericProperty(input); + assertDateNormalizationResult(dateNormalizationResult, expected, dateNormalizationExtractorMatchId); } private static Stream extractDatePropertiesWithLabel() { @@ -99,19 +62,17 @@ private static Stream extractDatePropertiesWithLabel() { private static Stream extractDatePropertiesWithoutLabel() { return Stream.of( - //Brief dates. Those are similar to EDFT but should match first. + //Brief dates. Those are similar to EDTF but should match first. of("2014/15", "2014/2015", BRIEF_DATE_RANGE), - of("1889/98? (Herstellung)", "1889?/1898?", BRIEF_DATE_RANGE), - of("1918-20", "1918/1920", BRIEF_DATE_RANGE), + of("1889/98? 
(text in parentheses)", "1889/1898?", BRIEF_DATE_RANGE), //Centuries numeric of("18..", "18XX", CENTURY_NUMERIC), - of("19??", "19XX", NUMERIC_ALL_VARIANTS_XX), of("192?", null, null),// ambiguous of("[171-]", null, null), // ambiguous of("19th century", "18XX", CENTURY_NUMERIC), of("2nd century", "01XX", CENTURY_NUMERIC), - of("[10th century]", "09XX", CENTURY_NUMERIC), // not supported + of("[10th century]", "09XX", CENTURY_NUMERIC), of("12th century BC", null, null), // not supported //Centuries roman @@ -121,7 +82,7 @@ private static Stream extractDatePropertiesWithoutLabel() { of("S. XVI-XX", "15XX/19XX", CENTURY_RANGE_ROMAN), of("S.VIII-XV", "07XX/14XX", CENTURY_RANGE_ROMAN), of("S. XVI-XVIII", "15XX/17XX", CENTURY_RANGE_ROMAN), - of("S. XVIII-", null, null), // open-ended period + of("S. XVIII-", null, null), of("[XVI-XIX]", "15XX/18XX", CENTURY_RANGE_ROMAN), of("SVV", null, null), @@ -133,25 +94,24 @@ private static Stream extractDatePropertiesWithoutLabel() { //Numeric range '/' of("1872-06-01/1872-06-30", "1872-06-01/1872-06-30", EDTF), - of(" 1820/1820", "1820/1820", NUMERIC_RANGE_ALL_VARIANTS), - of("1918 / 1919", "1918/1919", NUMERIC_RANGE_ALL_VARIANTS), - of("1205/1215 [Herstellung]", "1205/1215", EDTF), - of(" 1757/1757", "1757/1757", NUMERIC_RANGE_ALL_VARIANTS), + of(" 1820/1820", "1820/1820", EDTF), + of("1918 / 1919", "1918/1919", EDTF), + of("1205/1215 [text in brackets]", "1205/1215", EDTF), + of(" 1757/1757", "1757/1757", EDTF), of("ca 1757/1757", "1757~/1757~", EDTF), - of("2000 vC - 2002 nC", "-2000/2002", BC_AD), - of("0114 aC - 0113 aC", "-0114/-0113", BC_AD), - of("0390 AD - 0425 AD", "0390/0425", BC_AD), - of("337 BC - 283 BC", "-0337/-0283", BC_AD), - of("100 vC - 150 nC", "-0100/0150", BC_AD), - of("400 BC - 400 AD", "-0400/0400", BC_AD), - of("235 AD – 236 AD", "0235/0236", BC_AD), - of("168 B.C.-135 A.D.", "-0168/0135", BC_AD), + of("1990 BC-1989 BC", "-1989/-1988", BC_AD), + of("1990 π.Χ.-1989 π.Χ.", "-1989/-1988", BC_AD), + of("1989 
AD/1990 AD", "1989/1990", BC_AD), + of("1989 μ.Χ./1990 μ.Χ.", "1989/1990", BC_AD), + of("1989 π.Χ.-1 μ.Χ.", "-1988/0001", BC_AD), of("20/09/18XX", "18XX-09-20", NUMERIC_ALL_VARIANTS_XX), of("?/1807", "../1807", NUMERIC_RANGE_ALL_VARIANTS), //Incorrect day values of("1947-19-50/1950-19-53", null, null), of("15/21-8-1918", null, null), of("1.1848/49[?]", null, null), + of("1990 BC//1989 BC", null, null), + of("-1990 BC-1989 BC", null, null), //Numeric range ' - '(spaces around hyphen) of("1851-01-01 - 1851-12-31", "1851-01-01/1851-12-31", NUMERIC_RANGE_ALL_VARIANTS), @@ -164,8 +124,10 @@ private static Stream extractDatePropertiesWithoutLabel() { of("192?-1958", null, null), of("[ca. 1920-1930]", "1920~/1930~", NUMERIC_RANGE_ALL_VARIANTS), of("1937--1938", null, null), - of("[ca. 193-]", null, null),// ambiguous - of("1990-", null, null), // open-ended period not supported + // ambiguous + of("[ca. 193-]", null, null), + // open-ended period not supported + of("1990-", null, null), //Numeric range '|' of("1910/05/31 | 1910/05/01", "1910-05-01/1910-05-31", NUMERIC_RANGE_ALL_VARIANTS), @@ -178,9 +140,15 @@ private static Stream extractDatePropertiesWithoutLabel() { // this may not be a 100% correct normalisation, maybe it is not a range but two dates of("1651 [ca. 1656]", "1651~/1656~", NUMERIC_RANGE_ALL_VARIANTS), - //Numeric year + //Numeric year all variants of("(17--?)", "17XX?", NUMERIC_ALL_VARIANTS_XX), of("[19--?]", "19XX?", NUMERIC_ALL_VARIANTS_XX), + of("19--?]", "19XX?", NUMERIC_ALL_VARIANTS_XX), + of("19--]", "19XX", NUMERIC_ALL_VARIANTS_XX), + of("19xx", "19XX", NUMERIC_ALL_VARIANTS_XX), + of("19??", "19XX", NUMERIC_ALL_VARIANTS_XX), + of("[ca. 16??]", "16XX~", NUMERIC_ALL_VARIANTS_XX), + of("[ca. 16??]", "16XX~", NUMERIC_ALL_VARIANTS_XX), //Numeric date with dot "." 
of("21.1.1921", "1921-01-21", NUMERIC_ALL_VARIANTS), @@ -189,81 +157,83 @@ private static Stream extractDatePropertiesWithoutLabel() { of("28.05.1969", "1969-05-28", NUMERIC_ALL_VARIANTS), of("11.11.1947", "1947-11-11", NUMERIC_ALL_VARIANTS), of("23.02.[18--]", "18XX-02-23", NUMERIC_ALL_VARIANTS_XX), + of("15.02.1985 (text in parentheses)", "1985-02-15", NUMERIC_ALL_VARIANTS), + of("09.1972 (text in parentheses)", "1972-09", NUMERIC_ALL_VARIANTS), of("28. 1. 1240", null, null), //Numeric date with dash "-" of("1941-22-06", "1941-06-22", NUMERIC_ALL_VARIANTS), of("1937-10-??", "1937-10", NUMERIC_ALL_VARIANTS_XX), + of("1985-10-xx", "1985-10", NUMERIC_ALL_VARIANTS_XX), of("199--09-28", null, null), of("01?-1905", null, null), of("02?-1915", null, null), //Numeric date with space " " - of("1905 09 01", "1905-09-01", YYYY_MM_DD_SPACES), - of("0 2 1980", "1980-02", YYYY_MM_DD_SPACES), + of("1905 09 01", "1905-09-01", NUMERIC_SPACES_VARIANT), + of("0 2 1980", "1980-02", NUMERIC_SPACES_VARIANT), //More than 4 digits year of("18720601/18720630", null, null), of("19471950/19501953", null, null), - of("-2100/-1550", "-2100/-1550", EDTF), - // TODO: 21/12/2022 Check the below, expected null but returns 1952-02-25 instead - // of("1952-02-25T00:00:00Z-1952-02-25T23:59:59Z", null), + //Month alphabetical name + of("18 September 1914", "1914-09-18", MONTH_NAME), + of("c.6 Nov 1902", "1902-11-06~", MONTH_NAME), + + //Non-standard date format + of("Sat Jan 01 01:00:00 CET 1701", "1701-01-01", FORMATTED_FULL_DATE), + of("2013-03-21 18:45:36 UTC", "2013-03-21", FORMATTED_FULL_DATE), of("2013-09-07 09:31:51 UTC", "2013-09-07", FORMATTED_FULL_DATE), - of("1997-07-18T00:00:00 [Create]", "1997-07-18", EDTF), + + of("-2100/-1550", "-2100/-1550", EDTF), + of("1997-07-18T00:00:00 [text in brackets]", "1997-07-18", EDTF), of("1924 ca.", null, null), of("[1712?]", "1712?", EDTF), of("circa 1712", "1712~", EDTF), of("[ca. 
1946]", "1946~", EDTF), of("1651?]", "1651?", EDTF), - of("19--?]", "19XX?", NUMERIC_ALL_VARIANTS_XX), of(". 1885", null, null), of("- 1885", null, null), - of("1749 (Herstellung (Werk))", "1749", EDTF), - of("1939; 1954; 1955; 1978; 1939-1945", null, null), // multiple dates no suported - of("[17__]", null, null),// this pattern is not supported (this pattern was never tested - of("19--]", "19XX", NUMERIC_ALL_VARIANTS_XX), - of("19xx", "19XX", NUMERIC_ALL_VARIANTS_XX), - of("Sat Jan 01 01:00:00 CET 1701", "1701-01-01", FORMATTED_FULL_DATE), - of("2013-03-21 18:45:36 UTC", "2013-03-21", FORMATTED_FULL_DATE), - of("15.02.1985 (identification)", "1985-02-15", NUMERIC_ALL_VARIANTS), + of("1749 (text in parentheses (text in parentheses))", "1749", EDTF), + // multiple dates not supported + of("1939; 1954; 1955; 1978; 1939-1945", null, null), + of("[17__]", null, null), of("091090", null, null), of("-0043-12-07", "-0043-12-07", EDTF), of("imp. 1901", null, null), - of("u.1707-1739", null, null),// what does 'u.' mean? - of("22.07.1971 (identification)", "1971-07-22", NUMERIC_ALL_VARIANTS), + of("u.1707-1739", null, null), //Ambiguous pattern of("187-?]", null, null), - of("18. September 1914", "1914-09-18", MONTH_NAME), of("19960216-19960619", null, null), of("-0549-01-01T00:00:00Z", "-0549-01-01", EDTF), of("1942-1943 c.", null, null), of("(1942)", "1942", EDTF), of("-3.6982", null, null), - of("[ca.
16??]", "16XX~", NUMERIC_ALL_VARIANTS_XX), of("ISO9126", null, null), - of("1985-10-xx", "1985-10", NUMERIC_ALL_VARIANTS_XX), of("14:27", null, null), - of("c.6 Nov 1902", "1902-11-06~", MONTH_NAME), - of("-1234", "-1234", EDTF), - of("09.1972 (gathering)", "1972-09", NUMERIC_ALL_VARIANTS) + of("-1234", "-1234", EDTF) ); } - // TODO: 10/03/2023 Don't forget to add specific to generic properties normalization - // //GENERIC PROPERTY - // genericPropertyTestCases.put("XIV", null); - // genericPropertyTestCases.put("1905 09 01", "1905-09-01"); - // genericPropertyTestCases.put("1851-01-01 - 1851-12-31", "1851-01-01/1851-12-31"); - // genericPropertyTestCases.put("18..", null); - // genericPropertyTestCases.put("2013-09-07 09:31:51 UTC", "2013-09-07"); - // genericPropertyTestCases.put("1918 / 1919", "1918/1919"); - // genericPropertyTestCases.put("1205/1215 [Herstellung]", null); - // genericPropertyTestCases.put("1997-07", null); - // genericPropertyTestCases.put("19??", null); - // genericPropertyTestCases.put("1871 - 191-", null); - + private static Stream extractGenericPropertiesWithoutLabel() { + return Stream.of( + of("XIV", "13XX", CENTURY_ROMAN), + of("1989 11 01", "1989-11-01", NUMERIC_SPACES_VARIANT), + of("1851-01-01 - 1851-12-31", "1851-01-01/1851-12-31", NUMERIC_RANGE_ALL_VARIANTS), + of("[1989-11-01 - 1989-12-31]", "1989-11-01/1989-12-31", NUMERIC_RANGE_ALL_VARIANTS), + of("1989-11-01 - 1989-12-31 (text in parentheses)", "1989-11-01/1989-12-31", NUMERIC_RANGE_ALL_VARIANTS), + of("2013-09-07 09:31:51 UTC", "2013-09-07", FORMATTED_FULL_DATE), + //Non precise/full dates + of("18..", null, null), + of("1918/1919", null, null), + of("1205/1215 [text in brackets]", null, null), + of("1997-07", null, null), + of("19??", null, null), + of("1871 - 191-", null, null) + ); + } } \ No newline at end of file