From 5fd53ff2f41df9f829e838d4c5d8135c8968251f Mon Sep 17 00:00:00 2001
From: Simon Tzanakis
Date: Thu, 19 Oct 2023 14:18:57 +0200
Subject: [PATCH] Debt/met 5132 dates normalization cleanup part 4 (#622)
* MET-5132: Update naming of numeric spaces variant
* MET-5132: Prepare first sample of tests
* MET-5132: More tests
* MET-5132: Restructure with enum MonthNameDateExtractor
* MET-5132: Reusable DatePartsIndices
* MET-5132: Cleanup
* MET-5132: Add spaces clean and trim
* MET-5132: Cleanup
* MET-5132: First split the range part
* MET-5132: Refactor PatternBcAdDateExtractor with tests
* MET-5132: PatternBcAdRangeDateExtractor cleanup
* MET-5132: Reuse range code for dates
* MET-5132: Reuse range code for BriefRangeDateExtractor
* MET-5132: Simplify hierarchy for ranges
* MET-5132: Add sample tests for PatternLongNegativeYearDateExtractor and adapt edtf builder
* MET-5132: Refactor LongNegativeYearDateExtractor with a separate range class reusing already existent code
* MET-5132: Split century extraction to numeric, roman, roman range reusing code
* MET-5132: Centralize sanitization operation for all extractors.
* MET-5132: Add tests for generic properties
* MET-5132: Reuse test code
* MET-5132: Repackage
* MET-5132: Split EdtfDateExtractor to handle ranges separately with code reuse
* MET-5132: Centralize date qualification overwriting
* MET-5132: Simplify date qualification overwriting
* MET-5132: Update code after answers from rnd
* MET-5132: Cleanup
* MET-5132: PatternFormattedFullDateDateExtractor cleanup
* MET-5132: Add millisecond support
* MET-5132: Process review
* MET-5132: Process review 2
---
.../DateNormalizationExtractorMatchId.java | 2 +-
.../dates/edtf/AbstractEdtfDate.java | 11 +-
.../dates/edtf/DateQualification.java | 56 ++-
.../dates/edtf/InstantEdtfDate.java | 26 +-
.../dates/edtf/InstantEdtfDateBuilder.java | 97 +++---
.../dates/edtf/IntervalEdtfDate.java | 33 +-
.../dates/edtf/IntervalEdtfDateBuilder.java | 19 +-
.../dates/extraction/DatePartsIndices.java | 33 ++
.../dates/extraction/DatesSeparator.java | 9 +
.../extraction/DefaultDatesSeparator.java | 20 ++
.../dates/extraction/MonthMultilingual.java | 42 +--
.../dates/extraction/NumericPartsPattern.java | 61 ++--
.../dateextractors/AbstractDateExtractor.java | 82 -----
.../BriefRangeDateExtractor.java | 70 ----
.../dateextractors/CenturyDateExtractor.java | 150 --------
.../dateextractors/EdtfDateExtractor.java | 105 ------
.../NumericPartsRangeDateExtractor.java | 118 -------
.../PatternBcAdDateExtractor.java | 124 -------
.../PatternFormatedFullDateDateExtractor.java | 66 ----
.../PatternLongNegativeYearDateExtractor.java | 60 ----
.../PatternMonthNameDateExtractor.java | 92 -----
.../extractors/AbstractDateExtractor.java | 75 ++++
.../AbstractRangeDateExtractor.java | 69 ++++
.../extractors/BcAdDateExtractor.java | 70 ++++
.../extractors/BcAdRangeDateExtractor.java | 44 +++
.../extractors/BriefRangeDateExtractor.java | 125 +++++++
.../CenturyNumericDateExtractor.java | 87 +++++
.../extractors/CenturyRomanDateExtractor.java | 48 +++
.../CenturyRomanRangeDateExtractor.java | 46 +++
.../DateExtractor.java | 14 +-
.../DcmiPeriodDateExtractor.java | 31 +-
.../DecadeDateExtractor.java | 22 +-
.../extractors/EdtfDateExtractor.java | 95 ++++++
.../extractors/EdtfRangeDateExtractor.java | 75 ++++
.../extractors/FullDateDateExtractor.java | 92 +++++
.../LongNegativeYearDateExtractor.java | 35 ++
.../LongNegativeYearRangeDateExtractor.java | 46 +++
.../extractors/MonthNameDateExtractor.java | 117 +++++++
.../NumericPartsDateExtractor.java | 44 +--
.../NumericPartsRangeDateExtractor.java | 93 +++++
.../extractors/RangeDateExtractor.java | 76 +++++
.../normalizers/DatesNormalizer.java | 150 ++++----
.../BriefRangeDateExtractorTest.java | 88 -----
.../CenturyDateExtractorTest.java | 216 ------------
.../DcmiPeriodDateExtractorTest.java | 119 -------
.../DecadeDateExtractorTest.java | 74 ----
.../dateextractors/EdtfDateExtractorTest.java | 319 ------------------
.../extractors/BcAdDateExtractorTest.java | 117 +++++++
.../BcAdRangeDateExtractorTest.java | 68 ++++
.../BriefRangeDateExtractorTest.java | 61 ++++
.../CenturyNumericDateExtractorTest.java | 60 ++++
.../CenturyRomaDateExtractorTest.java | 93 +++++
.../CenturyRomanRangeDateExtractorTest.java | 103 ++++++
.../extractors/DateExtractorTest.java | 66 ++++
.../DcmiPeriodDateExtractorTest.java | 90 +++++
.../extractors/DecadeDateExtractorTest.java | 63 ++++
.../extractors/EdtfDateExtractorTest.java | 167 +++++++++
.../EdtfRangeDateExtractorTest.java | 155 +++++++++
.../extractors/FullDateDateExtractorTest.java | 51 +++
.../LongNegativeYearDateExtractorTest.java | 43 +++
...ongNegativeYearRangeDateExtractorTest.java | 50 +++
.../MonthNameDateExtractorTest.java | 173 ++++++++++
.../NumericPartsDateExtractorTest.java | 30 +-
.../NumericPartsRangeDateExtractorTest.java | 22 +-
.../NumericRangeDMYArgumentsProvider.java | 2 +-
.../NumericRangeDMYXXArgumentsProvider.java | 2 +-
.../NumericRangeYMDArgumentsProvider.java | 2 +-
.../NumericRangeYMDXXArgumentsProvider.java | 2 +-
.../normalizers/DatesNormalizerTest.java | 194 +++++------
69 files changed, 3017 insertions(+), 2143 deletions(-)
create mode 100644 metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/DatePartsIndices.java
create mode 100644 metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/DatesSeparator.java
create mode 100644 metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/DefaultDatesSeparator.java
delete mode 100644 metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/dateextractors/AbstractDateExtractor.java
delete mode 100644 metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/dateextractors/BriefRangeDateExtractor.java
delete mode 100644 metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/dateextractors/CenturyDateExtractor.java
delete mode 100644 metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/dateextractors/EdtfDateExtractor.java
delete mode 100644 metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/dateextractors/NumericPartsRangeDateExtractor.java
delete mode 100644 metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/dateextractors/PatternBcAdDateExtractor.java
delete mode 100644 metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/dateextractors/PatternFormatedFullDateDateExtractor.java
delete mode 100644 metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/dateextractors/PatternLongNegativeYearDateExtractor.java
delete mode 100644 metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/dateextractors/PatternMonthNameDateExtractor.java
create mode 100644 metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/extractors/AbstractDateExtractor.java
create mode 100644 metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/extractors/AbstractRangeDateExtractor.java
create mode 100644 metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/extractors/BcAdDateExtractor.java
create mode 100644 metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/extractors/BcAdRangeDateExtractor.java
create mode 100644 metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/extractors/BriefRangeDateExtractor.java
create mode 100644 metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/extractors/CenturyNumericDateExtractor.java
create mode 100644 metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/extractors/CenturyRomanDateExtractor.java
create mode 100644 metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/extractors/CenturyRomanRangeDateExtractor.java
rename metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/{dateextractors => extractors}/DateExtractor.java (58%)
rename metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/{dateextractors => extractors}/DcmiPeriodDateExtractor.java (75%)
rename metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/{dateextractors => extractors}/DecadeDateExtractor.java (59%)
create mode 100644 metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/extractors/EdtfDateExtractor.java
create mode 100644 metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/extractors/EdtfRangeDateExtractor.java
create mode 100644 metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/extractors/FullDateDateExtractor.java
create mode 100644 metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/extractors/LongNegativeYearDateExtractor.java
create mode 100644 metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/extractors/LongNegativeYearRangeDateExtractor.java
create mode 100644 metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/extractors/MonthNameDateExtractor.java
rename metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/{dateextractors => extractors}/NumericPartsDateExtractor.java (76%)
create mode 100644 metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/extractors/NumericPartsRangeDateExtractor.java
create mode 100644 metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/extractors/RangeDateExtractor.java
delete mode 100644 metis-normalization/src/test/java/eu/europeana/normalization/dates/extraction/dateextractors/BriefRangeDateExtractorTest.java
delete mode 100644 metis-normalization/src/test/java/eu/europeana/normalization/dates/extraction/dateextractors/CenturyDateExtractorTest.java
delete mode 100644 metis-normalization/src/test/java/eu/europeana/normalization/dates/extraction/dateextractors/DcmiPeriodDateExtractorTest.java
delete mode 100644 metis-normalization/src/test/java/eu/europeana/normalization/dates/extraction/dateextractors/DecadeDateExtractorTest.java
delete mode 100644 metis-normalization/src/test/java/eu/europeana/normalization/dates/extraction/dateextractors/EdtfDateExtractorTest.java
create mode 100644 metis-normalization/src/test/java/eu/europeana/normalization/dates/extraction/extractors/BcAdDateExtractorTest.java
create mode 100644 metis-normalization/src/test/java/eu/europeana/normalization/dates/extraction/extractors/BcAdRangeDateExtractorTest.java
create mode 100644 metis-normalization/src/test/java/eu/europeana/normalization/dates/extraction/extractors/BriefRangeDateExtractorTest.java
create mode 100644 metis-normalization/src/test/java/eu/europeana/normalization/dates/extraction/extractors/CenturyNumericDateExtractorTest.java
create mode 100644 metis-normalization/src/test/java/eu/europeana/normalization/dates/extraction/extractors/CenturyRomaDateExtractorTest.java
create mode 100644 metis-normalization/src/test/java/eu/europeana/normalization/dates/extraction/extractors/CenturyRomanRangeDateExtractorTest.java
create mode 100644 metis-normalization/src/test/java/eu/europeana/normalization/dates/extraction/extractors/DateExtractorTest.java
create mode 100644 metis-normalization/src/test/java/eu/europeana/normalization/dates/extraction/extractors/DcmiPeriodDateExtractorTest.java
create mode 100644 metis-normalization/src/test/java/eu/europeana/normalization/dates/extraction/extractors/DecadeDateExtractorTest.java
create mode 100644 metis-normalization/src/test/java/eu/europeana/normalization/dates/extraction/extractors/EdtfDateExtractorTest.java
create mode 100644 metis-normalization/src/test/java/eu/europeana/normalization/dates/extraction/extractors/EdtfRangeDateExtractorTest.java
create mode 100644 metis-normalization/src/test/java/eu/europeana/normalization/dates/extraction/extractors/FullDateDateExtractorTest.java
create mode 100644 metis-normalization/src/test/java/eu/europeana/normalization/dates/extraction/extractors/LongNegativeYearDateExtractorTest.java
create mode 100644 metis-normalization/src/test/java/eu/europeana/normalization/dates/extraction/extractors/LongNegativeYearRangeDateExtractorTest.java
create mode 100644 metis-normalization/src/test/java/eu/europeana/normalization/dates/extraction/extractors/MonthNameDateExtractorTest.java
rename metis-normalization/src/test/java/eu/europeana/normalization/dates/extraction/{dateextractors => extractors}/NumericPartsDateExtractorTest.java (92%)
rename metis-normalization/src/test/java/eu/europeana/normalization/dates/extraction/{dateextractors => extractors}/NumericPartsRangeDateExtractorTest.java (58%)
rename metis-normalization/src/test/java/eu/europeana/normalization/dates/extraction/{dateextractors => extractors}/NumericRangeDMYArgumentsProvider.java (99%)
rename metis-normalization/src/test/java/eu/europeana/normalization/dates/extraction/{dateextractors => extractors}/NumericRangeDMYXXArgumentsProvider.java (99%)
rename metis-normalization/src/test/java/eu/europeana/normalization/dates/extraction/{dateextractors => extractors}/NumericRangeYMDArgumentsProvider.java (99%)
rename metis-normalization/src/test/java/eu/europeana/normalization/dates/extraction/{dateextractors => extractors}/NumericRangeYMDXXArgumentsProvider.java (99%)
diff --git a/metis-normalization/src/main/java/eu/europeana/normalization/dates/DateNormalizationExtractorMatchId.java b/metis-normalization/src/main/java/eu/europeana/normalization/dates/DateNormalizationExtractorMatchId.java
index c9c714799b..2a6378e9b1 100644
--- a/metis-normalization/src/main/java/eu/europeana/normalization/dates/DateNormalizationExtractorMatchId.java
+++ b/metis-normalization/src/main/java/eu/europeana/normalization/dates/DateNormalizationExtractorMatchId.java
@@ -19,7 +19,7 @@ public enum DateNormalizationExtractorMatchId {
NUMERIC_ALL_VARIANTS_XX("numeric date (various separators and unknown parts)"),
NUMERIC_RANGE_ALL_VARIANTS("numeric date interval (various separators)"),
NUMERIC_RANGE_ALL_VARIANTS_XX("numeric date interval (various separators and unknown parts)"),
- YYYY_MM_DD_SPACES("numeric date (whitespace separators)");
+ NUMERIC_SPACES_VARIANT("numeric date (whitespace separators)");
final String label;
diff --git a/metis-normalization/src/main/java/eu/europeana/normalization/dates/edtf/AbstractEdtfDate.java b/metis-normalization/src/main/java/eu/europeana/normalization/dates/edtf/AbstractEdtfDate.java
index d840ce71bf..9c79e906e7 100644
--- a/metis-normalization/src/main/java/eu/europeana/normalization/dates/edtf/AbstractEdtfDate.java
+++ b/metis-normalization/src/main/java/eu/europeana/normalization/dates/edtf/AbstractEdtfDate.java
@@ -1,5 +1,7 @@
package eu.europeana.normalization.dates.edtf;
+import java.util.Set;
+
/**
* An abstract class that contains the template that an EDTF date with compliance level 1 should implement.
* See more in the specification of EDTF
@@ -17,11 +19,18 @@ protected AbstractEdtfDate(String label) {
this.label = label;
}
+ /**
+ * Add the date qualification, mainly used for pre-sanitized values.
+ *
+ * @param dateQualification the date qualification
+ */
+ public abstract void addQualification(DateQualification dateQualification);
+
public String getLabel() {
return label;
}
- public abstract DateQualification getDateQualification();
+ public abstract Set getDateQualifications();
public abstract boolean isOpen();
diff --git a/metis-normalization/src/main/java/eu/europeana/normalization/dates/edtf/DateQualification.java b/metis-normalization/src/main/java/eu/europeana/normalization/dates/edtf/DateQualification.java
index 1c9c06c6ac..aef6749e42 100644
--- a/metis-normalization/src/main/java/eu/europeana/normalization/dates/edtf/DateQualification.java
+++ b/metis-normalization/src/main/java/eu/europeana/normalization/dates/edtf/DateQualification.java
@@ -1,6 +1,7 @@
package eu.europeana.normalization.dates.edtf;
-import java.util.Arrays;
+import java.util.EnumSet;
+import java.util.Set;
import java.util.regex.Pattern;
/**
@@ -8,32 +9,51 @@
* Specification
*/
public enum DateQualification {
+ UNCERTAIN, APPROXIMATE;
- NO_QUALIFICATION(""),
- UNCERTAIN("?"),
- APPROXIMATE("~"),
- UNCERTAIN_APPROXIMATE("%");
-
- public static final Pattern CHECK_QUALIFICATION_PATTERN = Pattern.compile("^[^\\?~%]*([\\?~%]?)$");
- private final String character;
-
- DateQualification(String character) {
- this.character = character;
- }
+ private static final String UNCERTAIN_CHARACTER = "?";
+ private static final String APPROXIMATE_CHARACTER = "~";
+ private static final String UNCERTAIN_APPROXIMATE_CHARACTER = "%";
+ private static final String CHARACTERS_REGEX = UNCERTAIN_CHARACTER + APPROXIMATE_CHARACTER + UNCERTAIN_APPROXIMATE_CHARACTER;
+ public static final Pattern PATTERN = Pattern.compile("^[^" + CHARACTERS_REGEX + "]*([" + CHARACTERS_REGEX + "])$");
/**
- * Get the enum value based on the character provided.
- * It will return a matched enum value or {@link #NO_QUALIFICATION}.
+ * Get the enum values based on the character provided.
+ * It will return an empty set or the set with the applicable qualifications.
*
* @param character the provided character
* @return the enum value
*/
- public static DateQualification fromCharacter(String character) {
- return Arrays.stream(DateQualification.values()).filter(value -> value.character.equals(character)).findFirst().orElse(
- NO_QUALIFICATION);
+ public static Set fromCharacter(String character) {
+ final Set dateQualifications = EnumSet.noneOf(DateQualification.class);
+ if (UNCERTAIN_APPROXIMATE_CHARACTER.equals(character)) {
+ dateQualifications.add(DateQualification.UNCERTAIN);
+ dateQualifications.add(DateQualification.APPROXIMATE);
+ } else if (UNCERTAIN_CHARACTER.equals(character)) {
+ dateQualifications.add(DateQualification.UNCERTAIN);
+ } else if (APPROXIMATE_CHARACTER.equals(character)) {
+ dateQualifications.add(DateQualification.APPROXIMATE);
+ }
+ return dateQualifications;
}
- public String getCharacter() {
+ /**
+ * Get the string representation based on the provided date qualifications.
+ *
+ * @param dateQualifications the date qualifications
+ * @return the string representation
+ */
+ public static String getCharacterFromQualifications(Set dateQualifications) {
+ final String character;
+ if (dateQualifications.contains(UNCERTAIN) && dateQualifications.contains(APPROXIMATE)) {
+ character = UNCERTAIN_APPROXIMATE_CHARACTER;
+ } else if (dateQualifications.contains(UNCERTAIN)) {
+ character = UNCERTAIN_CHARACTER;
+ } else if (dateQualifications.contains(APPROXIMATE)) {
+ character = APPROXIMATE_CHARACTER;
+ } else {
+ character = "";
+ }
return character;
}
}
diff --git a/metis-normalization/src/main/java/eu/europeana/normalization/dates/edtf/InstantEdtfDate.java b/metis-normalization/src/main/java/eu/europeana/normalization/dates/edtf/InstantEdtfDate.java
index bb8fb4bc95..883515d19c 100644
--- a/metis-normalization/src/main/java/eu/europeana/normalization/dates/edtf/InstantEdtfDate.java
+++ b/metis-normalization/src/main/java/eu/europeana/normalization/dates/edtf/InstantEdtfDate.java
@@ -3,7 +3,6 @@
import static eu.europeana.normalization.dates.edtf.DateBoundaryType.DECLARED;
import static eu.europeana.normalization.dates.edtf.DateBoundaryType.OPEN;
import static eu.europeana.normalization.dates.edtf.DateBoundaryType.UNKNOWN;
-import static eu.europeana.normalization.dates.edtf.DateQualification.NO_QUALIFICATION;
import static eu.europeana.normalization.dates.edtf.InstantEdtfDateBuilder.THRESHOLD_4_DIGITS_YEAR;
import static eu.europeana.normalization.dates.edtf.Iso8601Parser.ISO_8601_MINIMUM_YEAR_DIGITS;
import static java.lang.Math.abs;
@@ -19,7 +18,9 @@
import java.time.Year;
import java.time.YearMonth;
import java.time.temporal.TemporalAccessor;
+import java.util.EnumSet;
import java.util.Objects;
+import java.util.Set;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -38,11 +39,13 @@ public final class InstantEdtfDate extends AbstractEdtfDate implements Comparabl
private Month month;
private LocalDate yearMonthDay;
private YearPrecision yearPrecision;
- private DateQualification dateQualification = NO_QUALIFICATION;
+ private Set dateQualifications = EnumSet.noneOf(DateQualification.class);
private DateBoundaryType dateBoundaryType = DECLARED;
/**
* Restricted constructor by provided {@link InstantEdtfDateBuilder}.
+ * All fields apart from {@link #dateQualifications} are strictly contained in the constructor. The date qualifications can
+ * be further extended to, for example, add an approximate qualification for a date that was sanitized.
*
* @param instantEdtfDateBuilder the builder with all content verified
*/
@@ -51,13 +54,18 @@ public final class InstantEdtfDate extends AbstractEdtfDate implements Comparabl
year = instantEdtfDateBuilder.getYearObj();
month = instantEdtfDateBuilder.getMonthObj();
yearMonthDay = instantEdtfDateBuilder.getYearMonthDayObj();
- dateQualification = instantEdtfDateBuilder.getDateQualification();
+ dateQualifications = instantEdtfDateBuilder.getDateQualifications();
}
private InstantEdtfDate(DateBoundaryType dateBoundaryType) {
this.dateBoundaryType = dateBoundaryType;
}
+ @Override
+ public void addQualification(DateQualification dateQualification) {
+ this.dateQualifications.add(dateQualification);
+ }
+
/**
* Create an {@link DateBoundaryType#UNKNOWN} instant.
*
@@ -188,7 +196,7 @@ public Integer getCentury() {
int centuryDivision = year.getValue() / YearPrecision.CENTURY.getDuration();
int centuryModulo = year.getValue() % YearPrecision.CENTURY.getDuration();
//For case 1900 it is 19th. For case 1901 it is 20th century
- return centuryModulo == 0 ? centuryDivision : centuryDivision + 1;
+ return (centuryModulo == 0) ? centuryDivision : (centuryDivision + 1);
}
/**
@@ -230,7 +238,7 @@ public String toString() {
stringBuilder.append(
ofNullable(yearMonthDay).map(LocalDate::getDayOfMonth).map(decimalFormat::format).map(d -> "-" + d).orElse(""));
}
- stringBuilder.append(dateQualification.getCharacter());
+ stringBuilder.append(DateQualification.getCharacterFromQualifications(dateQualifications));
return stringBuilder.toString();
}
@@ -256,13 +264,13 @@ public boolean equals(Object o) {
}
InstantEdtfDate that = (InstantEdtfDate) o;
return yearPrecision == that.yearPrecision && Objects.equals(year, that.year) && Objects.equals(month,
- that.month) && Objects.equals(yearMonthDay, that.yearMonthDay) && dateQualification == that.dateQualification
+ that.month) && Objects.equals(yearMonthDay, that.yearMonthDay) && dateQualifications == that.dateQualifications
&& dateBoundaryType == that.dateBoundaryType;
}
@Override
public int hashCode() {
- return Objects.hash(yearPrecision, year, month, yearMonthDay, dateQualification, dateBoundaryType);
+ return Objects.hash(yearPrecision, year, month, yearMonthDay, dateQualifications, dateBoundaryType);
}
public Year getYear() {
@@ -281,8 +289,8 @@ public YearPrecision getYearPrecision() {
return yearPrecision;
}
- public DateQualification getDateQualification() {
- return dateQualification;
+ public Set getDateQualifications() {
+ return EnumSet.copyOf(dateQualifications);
}
public DateBoundaryType getDateBoundaryType() {
diff --git a/metis-normalization/src/main/java/eu/europeana/normalization/dates/edtf/InstantEdtfDateBuilder.java b/metis-normalization/src/main/java/eu/europeana/normalization/dates/edtf/InstantEdtfDateBuilder.java
index ba3a45bfca..e47ec5e69c 100644
--- a/metis-normalization/src/main/java/eu/europeana/normalization/dates/edtf/InstantEdtfDateBuilder.java
+++ b/metis-normalization/src/main/java/eu/europeana/normalization/dates/edtf/InstantEdtfDateBuilder.java
@@ -1,8 +1,6 @@
package eu.europeana.normalization.dates.edtf;
-import static eu.europeana.normalization.dates.edtf.DateQualification.NO_QUALIFICATION;
import static java.lang.String.format;
-import static java.util.Optional.ofNullable;
import eu.europeana.normalization.dates.YearPrecision;
import eu.europeana.normalization.dates.extraction.DateExtractionException;
@@ -14,14 +12,16 @@
import java.time.YearMonth;
import java.time.temporal.ChronoField;
import java.time.temporal.TemporalAccessor;
+import java.util.EnumSet;
import java.util.Objects;
+import java.util.Set;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* Builder class for {@link InstantEdtfDate}.
* During {@link #build()} it will verify all the parameters that have been requested.
- * The {@link #build()}, if {@link #withFlexibleDateBuild(boolean)} was called with {@code true}, will also attempt a second time
+ * The {@link #build()}, if {@link #withAllowDayMonthSwap(boolean)} was called with {@code true}, will also attempt a second time
* by switching month and day values if the original values were invalid. Furthermore, there are a set of constructors that can
* start the builder and will perform a build with specific characteristics:
*
@@ -38,14 +38,13 @@ public class InstantEdtfDateBuilder {
private Year yearObj;
private Month monthObj;
private LocalDate yearMonthDayObj;
- private Integer year;
+ private final Integer year;
private Integer month;
private Integer day;
- private TemporalAccessor temporalAccessor;
- private YearPrecision yearPrecision;
- private DateQualification dateQualification;
- private boolean flexibleDateBuild = true;
- private boolean longDatePrefixedWithY = false;
+ private YearPrecision yearPrecision = YearPrecision.YEAR;
+ private final Set dateQualifications = EnumSet.noneOf(DateQualification.class);
+ private boolean allowDayMonthSwap = true;
+ private boolean isMoreThanFourDigitsYear = false;
/**
* Constructor which initializes the builder with the minimum requirement of year value.
@@ -65,7 +64,12 @@ public InstantEdtfDateBuilder(final Integer year) {
* @param temporalAccessor the temporal accessor
*/
public InstantEdtfDateBuilder(TemporalAccessor temporalAccessor) {
- this.temporalAccessor = temporalAccessor;
+ day = temporalAccessor.isSupported(ChronoField.DAY_OF_MONTH) ?
+ temporalAccessor.get(ChronoField.DAY_OF_MONTH) : null;
+ month = temporalAccessor.isSupported(ChronoField.MONTH_OF_YEAR) ?
+ temporalAccessor.get(ChronoField.MONTH_OF_YEAR) : null;
+ year = temporalAccessor.isSupported(ChronoField.YEAR) ?
+ temporalAccessor.get(ChronoField.YEAR) : null;
}
/**
@@ -75,10 +79,9 @@ public InstantEdtfDateBuilder(TemporalAccessor temporalAccessor) {
* @throws DateExtractionException if something went wrong during date validation
*/
public InstantEdtfDate build() throws DateExtractionException {
- InstantEdtfDate instantEdtfDate;
- instantEdtfDate = buildInternal();
+ InstantEdtfDate instantEdtfDate = buildInternal();
//Try once more if flexible date
- if (instantEdtfDate == null && flexibleDateBuild) {
+ if (instantEdtfDate == null && isPositive(month) && isPositive(day) && allowDayMonthSwap) {
swapMonthDay();
instantEdtfDate = buildInternal();
}
@@ -92,14 +95,7 @@ public InstantEdtfDate build() throws DateExtractionException {
private InstantEdtfDate buildInternal() {
InstantEdtfDate instantEdtfDate = null;
- //Setup defaults
- yearPrecision = ofNullable(yearPrecision).orElse(YearPrecision.YEAR);
- dateQualification = ofNullable(dateQualification).orElse(NO_QUALIFICATION);
-
try {
- if (temporalAccessor != null) {
- parseTemporalAccessor();
- }
parseYear();
parseMonthDay();
validateDateNotInFuture();
@@ -111,34 +107,24 @@ private InstantEdtfDate buildInternal() {
return instantEdtfDate;
}
- private void parseTemporalAccessor() {
- LOGGER.debug("TemporalAccessor present. Overwriting values.");
- day = temporalAccessor.isSupported(ChronoField.DAY_OF_MONTH) ?
- temporalAccessor.get(ChronoField.DAY_OF_MONTH) : null;
- month = temporalAccessor.isSupported(ChronoField.MONTH_OF_YEAR) ?
- temporalAccessor.get(ChronoField.MONTH_OF_YEAR) : null;
- year = temporalAccessor.isSupported(ChronoField.YEAR) ?
- temporalAccessor.get(ChronoField.YEAR) : null;
- }
-
private void parseYear() throws DateExtractionException {
Objects.requireNonNull(year, "Year value can never be null");
- if (longDatePrefixedWithY && Math.abs(year) <= THRESHOLD_4_DIGITS_YEAR) {
+ if (isMoreThanFourDigitsYear && Math.abs(year) <= THRESHOLD_4_DIGITS_YEAR) {
throw new DateExtractionException(
- format("Prefixed year with 'Y' is enabled indicating that year should have absolute value greater than %s",
- THRESHOLD_4_DIGITS_YEAR));
- } else if (!longDatePrefixedWithY && Math.abs(year) > THRESHOLD_4_DIGITS_YEAR) {
+ format("isLongerThanFourDigitsYear is %s indicating that year should have absolute value greater than %s",
+ true, THRESHOLD_4_DIGITS_YEAR));
+ } else if (!isMoreThanFourDigitsYear && Math.abs(year) > THRESHOLD_4_DIGITS_YEAR) {
throw new DateExtractionException(
- format("Year absolute value greater than %s, should be prefixed with 'Y'", THRESHOLD_4_DIGITS_YEAR));
+ format("Year absolute value is greater than %s, and isLongerThanFourDigitsYear is %s", THRESHOLD_4_DIGITS_YEAR, false));
}
yearObj = Year.of(year * yearPrecision.getDuration());
}
private void parseMonthDay() throws DateExtractionException {
try {
- if (month != null && month >= 1) {
+ if (isPositive(month)) {
monthObj = Month.of(month);
- if (day != null && day >= 1) {
+ if (isPositive(day)) {
yearMonthDayObj = LocalDate.of(yearObj.getValue(), monthObj.getValue(), day);
}
}
@@ -147,6 +133,10 @@ private void parseMonthDay() throws DateExtractionException {
}
}
+ private boolean isPositive(Integer value) {
+ return value != null && value > 0;
+ }
+
private void validateDateNotInFuture() throws DateExtractionException {
try {
final boolean isYearMonthDayInTheFuture = yearMonthDayObj != null && yearMonthDayObj.isAfter(LocalDate.now());
@@ -164,13 +154,12 @@ private void validateDateNotInFuture() throws DateExtractionException {
private void validateStrict() throws DateExtractionException {
//If it is not a long year, and we want to be strict we further validate
- boolean notLongYearAndStrictBuild = !longDatePrefixedWithY && !flexibleDateBuild;
- // TODO: 15/02/2023 Check this instruction. It used to be like that
- // return edtfDatePart.isUnknown() || edtfDatePart.isUncertain() || edtfDatePart.getYearPrecision() != null;
- // but do we actually need the check on unknown?
- boolean isDateNonPrecise = dateQualification == DateQualification.UNCERTAIN || yearPrecision != null;
+ boolean isNotMoreThanFourDigitsYearAndStrictBuild = !isMoreThanFourDigitsYear && !allowDayMonthSwap;
+ boolean isDateNonPrecise =
+ dateQualifications.contains(DateQualification.UNCERTAIN) || (yearPrecision != null
+ && yearPrecision != YearPrecision.YEAR);
boolean notCompleteDate = monthObj == null || yearMonthDayObj == null;
- if (notLongYearAndStrictBuild && (isDateNonPrecise || notCompleteDate)) {
+ if (isNotMoreThanFourDigitsYearAndStrictBuild && (isDateNonPrecise || notCompleteDate)) {
throw new DateExtractionException("Date is invalid according to our strict profile!");
}
}
@@ -217,22 +206,22 @@ public InstantEdtfDateBuilder withYearPrecision(YearPrecision yearPrecision) {
/**
* Add date qualification.
*
- * @param dateQualification the date qualification
+ * @param dateQualifications the date qualifications
* @return the extended builder
*/
- public InstantEdtfDateBuilder withDateQualification(DateQualification dateQualification) {
- this.dateQualification = dateQualification;
+ public InstantEdtfDateBuilder withDateQualification(Set dateQualifications) {
+ this.dateQualifications.addAll(dateQualifications);
return this;
}
/**
- * Opt in/out for flexible date building.
+ * Opt in/out for day month swap if original values failed validation.
*
- * @param flexibleDateBuild the boolean (dis|en)abling the flexibility
+ * @param allowDayMonthSwap the boolean (dis|en)abling the day and month swap
* @return the extended builder
*/
- public InstantEdtfDateBuilder withFlexibleDateBuild(boolean flexibleDateBuild) {
- this.flexibleDateBuild = flexibleDateBuild;
+ public InstantEdtfDateBuilder withAllowDayMonthSwap(boolean allowDayMonthSwap) {
+ this.allowDayMonthSwap = allowDayMonthSwap;
return this;
}
@@ -241,8 +230,8 @@ public InstantEdtfDateBuilder withFlexibleDateBuild(boolean flexibleDateBuild) {
*
* @return the extended builder
*/
- public InstantEdtfDateBuilder withLongYearPrefixedWithY() {
- this.longDatePrefixedWithY = true;
+ public InstantEdtfDateBuilder withMoreThanFourDigitsYear() {
+ this.isMoreThanFourDigitsYear = true;
return this;
}
@@ -262,7 +251,7 @@ public YearPrecision getYearPrecision() {
return yearPrecision;
}
- public DateQualification getDateQualification() {
- return dateQualification;
+ public Set getDateQualifications() {
+ return EnumSet.copyOf(dateQualifications);
}
}
diff --git a/metis-normalization/src/main/java/eu/europeana/normalization/dates/edtf/IntervalEdtfDate.java b/metis-normalization/src/main/java/eu/europeana/normalization/dates/edtf/IntervalEdtfDate.java
index 28100b047d..f5c6659526 100644
--- a/metis-normalization/src/main/java/eu/europeana/normalization/dates/edtf/IntervalEdtfDate.java
+++ b/metis-normalization/src/main/java/eu/europeana/normalization/dates/edtf/IntervalEdtfDate.java
@@ -1,16 +1,28 @@
package eu.europeana.normalization.dates.edtf;
+import static eu.europeana.normalization.dates.extraction.DefaultDatesSeparator.SLASH_DELIMITER;
import static java.lang.String.format;
+import java.util.EnumSet;
+import java.util.Set;
+
/**
* An EDTF date that represents a period of time specified by a start and end date with various degrees of precision
*/
public class IntervalEdtfDate extends AbstractEdtfDate {
- public static final String DATE_INTERVAL_SEPARATOR = "/";
private InstantEdtfDate start;
private InstantEdtfDate end;
+
+ /**
+ * Restricted constructor by provided {@link IntervalEdtfDateBuilder}.
+ * All fields apart from the internal {@link IntervalEdtfDate#addQualification(DateQualification)} (for each boundary) are
+ * strictly contained in the constructor. The date qualifications can be further extended to, for example, add an approximate
+ * qualification for a date that was sanitized.
+ *
+ * @param intervalEdtfDateBuilder the builder with all content verified
+ */
IntervalEdtfDate(IntervalEdtfDateBuilder intervalEdtfDateBuilder) {
super(intervalEdtfDateBuilder.getLabel());
this.start = intervalEdtfDateBuilder.getStart();
@@ -18,13 +30,16 @@ public class IntervalEdtfDate extends AbstractEdtfDate {
}
@Override
- public DateQualification getDateQualification() {
- // TODO: 24/02/2023 To verify what this should return.
- if (start.getDateQualification() == DateQualification.NO_QUALIFICATION) {
- return end.getDateQualification();
- } else {
- return start.getDateQualification();
- }
+ public void addQualification(DateQualification dateQualification) {
+ start.addQualification(dateQualification);
+ end.addQualification(dateQualification);
+ }
+
+ @Override
+ public Set getDateQualifications() {
+ Set dateQualifications = EnumSet.copyOf(start.getDateQualifications());
+ dateQualifications.addAll(end.getDateQualifications());
+ return dateQualifications;
}
@Override
@@ -60,6 +75,6 @@ public void setEnd(InstantEdtfDate end) {
@Override
public String toString() {
- return format("%s%s%s", start.toString(), DATE_INTERVAL_SEPARATOR, end.toString());
+ return format("%s%s%s", start.toString(), SLASH_DELIMITER.getStringRepresentation(), end.toString());
}
}
diff --git a/metis-normalization/src/main/java/eu/europeana/normalization/dates/edtf/IntervalEdtfDateBuilder.java b/metis-normalization/src/main/java/eu/europeana/normalization/dates/edtf/IntervalEdtfDateBuilder.java
index 428a673eda..053a64fdc2 100644
--- a/metis-normalization/src/main/java/eu/europeana/normalization/dates/edtf/IntervalEdtfDateBuilder.java
+++ b/metis-normalization/src/main/java/eu/europeana/normalization/dates/edtf/IntervalEdtfDateBuilder.java
@@ -10,8 +10,8 @@
* Builder class for {@link IntervalEdtfDate}.
*
* During {@link #build()} it will verify all the parameters that have been requested. The {@link #build()}, if
- * {@link #withFlexibleDateBuild(boolean)} was called with {@code true}, will also attempt a second time by switching start and
- * end values if the original values were invalid.
+ * {@link #withAllowStartEndSwap(boolean)} was not called with {@code false} (swapping is enabled by default), will also
+ * attempt a second time by switching start and end values if the original values were invalid.
*
*/
public class IntervalEdtfDateBuilder {
@@ -20,8 +20,7 @@ public class IntervalEdtfDateBuilder {
private InstantEdtfDate start;
private InstantEdtfDate end;
private String label;
-
- private boolean flexibleDateBuild = false;
+ private boolean allowStartEndSwap = true;
/**
* Constructor which initializes the builder with the start and end date boundaries.
@@ -47,13 +46,13 @@ public IntervalEdtfDateBuilder withLabel(String label) {
}
/**
- * Opt in/out for flexible date building.
+ * Opt in/out for start end swap if original values failed validation.
*
- * @param flexibleDateBuild the boolean (dis|en)abling the flexibility
+ * @param allowStartEndSwap the boolean (dis|en)abling the start and end swap
* @return the extended builder
*/
- public IntervalEdtfDateBuilder withFlexibleDateBuild(boolean flexibleDateBuild) {
- this.flexibleDateBuild = flexibleDateBuild;
+ public IntervalEdtfDateBuilder withAllowStartEndSwap(boolean allowStartEndSwap) {
+ this.allowStartEndSwap = allowStartEndSwap;
return this;
}
@@ -67,13 +66,11 @@ public IntervalEdtfDate build() throws DateExtractionException {
IntervalEdtfDate intervalEdtfDate;
intervalEdtfDate = buildInternal();
//Try once more if switching allowed
- if (intervalEdtfDate == null && flexibleDateBuild) {
- //Retry with swapping month and day
+ if (intervalEdtfDate == null && allowStartEndSwap) {
switchStartWithEnd();
intervalEdtfDate = buildInternal();
}
- //Still nothing, we are done.
if (intervalEdtfDate == null) {
throw new DateExtractionException("Could not instantiate date");
}
diff --git a/metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/DatePartsIndices.java b/metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/DatePartsIndices.java
new file mode 100644
index 0000000000..f2e639e383
--- /dev/null
+++ b/metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/DatePartsIndices.java
@@ -0,0 +1,33 @@
+package eu.europeana.normalization.dates.extraction;
+
+import org.apache.commons.lang3.tuple.ImmutableTriple;
+import org.apache.commons.lang3.tuple.Triple;
+
+/**
+ * Enum containing triples of the group indices.
+ * The positions are Left = Year, Middle = Month, Right = Day ({@code null} when the part is absent, e.g. day in MY_INDICES).
+ */
+public enum DatePartsIndices {
+ DMY_INDICES(ImmutableTriple.of(3, 2, 1)),
+ YMD_INDICES(ImmutableTriple.of(1, 2, 3)),
+ MDY_INDICES(ImmutableTriple.of(3, 1, 2)),
+ MY_INDICES(ImmutableTriple.of(2, 1, null));
+
+ private final Triple indicesTriple;
+
+ DatePartsIndices(Triple indicesTriple) {
+ this.indicesTriple = indicesTriple;
+ }
+
+ public Integer getYearIndex() {
+ return indicesTriple.getLeft();
+ }
+
+ public Integer getMonthIndex() {
+ return indicesTriple.getMiddle();
+ }
+
+ public Integer getDayIndex() {
+ return indicesTriple.getRight();
+ }
+}
diff --git a/metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/DatesSeparator.java b/metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/DatesSeparator.java
new file mode 100644
index 0000000000..584db54b6c
--- /dev/null
+++ b/metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/DatesSeparator.java
@@ -0,0 +1,9 @@
+package eu.europeana.normalization.dates.extraction;
+
+/**
+ * Interface to get the separator between two dates
+ */
+public interface DatesSeparator {
+
+ String getStringRepresentation();
+}
diff --git a/metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/DefaultDatesSeparator.java b/metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/DefaultDatesSeparator.java
new file mode 100644
index 0000000000..1d693adb28
--- /dev/null
+++ b/metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/DefaultDatesSeparator.java
@@ -0,0 +1,20 @@
+package eu.europeana.normalization.dates.extraction;
+
+/**
+ * Basic default enum for date separators
+ */
+public enum DefaultDatesSeparator implements DatesSeparator {
+ DASH_DELIMITER("-"),
+ SLASH_DELIMITER("/");
+
+ private final String stringRepresentation;
+
+ DefaultDatesSeparator(String stringRepresentation) {
+ this.stringRepresentation = stringRepresentation;
+ }
+
+ @Override
+ public String getStringRepresentation() {
+ return stringRepresentation;
+ }
+}
diff --git a/metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/MonthMultilingual.java b/metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/MonthMultilingual.java
index ce0b26cde0..bbd7ec53f0 100644
--- a/metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/MonthMultilingual.java
+++ b/metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/MonthMultilingual.java
@@ -1,10 +1,15 @@
package eu.europeana.normalization.dates.extraction;
+import static java.util.Collections.unmodifiableSet;
+
import java.time.Month;
import java.time.format.TextStyle;
+import java.util.Collections;
import java.util.EnumMap;
import java.util.HashSet;
+import java.util.Locale;
import java.util.Map;
+import java.util.Map.Entry;
import java.util.Set;
/**
@@ -13,7 +18,7 @@
*/
public class MonthMultilingual {
- private final Map> monthToAllLanguagesStringsMap;
+ private final EnumMap> monthToAllLanguagesStringsMap;
/**
* Default constructor.
@@ -27,34 +32,33 @@ public MonthMultilingual() {
for (Month month : Month.values()) {
final HashSet languageValues = new HashSet<>();
for (EuropeanLanguage europeanLanguage : EuropeanLanguage.values()) {
- languageValues.add(month.getDisplayName(TextStyle.SHORT, europeanLanguage.getLocale()));
- languageValues.add(month.getDisplayName(TextStyle.SHORT_STANDALONE, europeanLanguage.getLocale()));
- languageValues.add(month.getDisplayName(TextStyle.FULL, europeanLanguage.getLocale()));
- languageValues.add(month.getDisplayName(TextStyle.FULL_STANDALONE, europeanLanguage.getLocale()));
+ languageValues.add(month.getDisplayName(TextStyle.SHORT, europeanLanguage.getLocale())
+ .toLowerCase(europeanLanguage.getLocale()));
+ languageValues.add(month.getDisplayName(TextStyle.SHORT_STANDALONE, europeanLanguage.getLocale())
+ .toLowerCase(europeanLanguage.getLocale()));
+ languageValues.add(month.getDisplayName(TextStyle.FULL, europeanLanguage.getLocale())
+ .toLowerCase(europeanLanguage.getLocale()));
+ languageValues.add(month.getDisplayName(TextStyle.FULL_STANDALONE, europeanLanguage.getLocale())
+ .toLowerCase(europeanLanguage.getLocale()));
}
- monthToAllLanguagesStringsMap.put(month, languageValues);
+ monthToAllLanguagesStringsMap.put(month, unmodifiableSet(languageValues));
}
}
- /**
- * Get all languages string values for a month.
- *
- * @param month the month
- * @return the set of all string representations
- */
- public Set getMonthStrings(Month month) {
- return monthToAllLanguagesStringsMap.get(month);
+ public Map> getMonthToAllLanguagesStringsMap() {
+ return Collections.unmodifiableMap(monthToAllLanguagesStringsMap);
}
/**
- * Get the month index based on a month name in any supported language, full or short, standard or stand-alone.
+ * Get {@link Month} by name in any supported language, full or short, standard or stand-alone (case-insensitive).
*
* @param monthName the month name
- * @return the month index
+ * @return the month, or {@code null} if no month matches the name
*/
- public Integer getMonthIndexValue(String monthName) {
- return monthToAllLanguagesStringsMap.entrySet().stream().filter(entry -> entry.getValue().contains(monthName))
- .findFirst().map(entry -> entry.getKey().getValue()).orElse(null);
+ public Month getMonth(String monthName) {
+ return monthToAllLanguagesStringsMap.entrySet().stream()
+ .filter(entry -> entry.getValue().contains(monthName.toLowerCase(Locale.ROOT)))
+ .findFirst().map(Entry::getKey).orElse(null);
}
}
diff --git a/metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/NumericPartsPattern.java b/metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/NumericPartsPattern.java
index c5defa979e..27beb5eebb 100644
--- a/metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/NumericPartsPattern.java
+++ b/metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/NumericPartsPattern.java
@@ -2,17 +2,17 @@
import static eu.europeana.normalization.dates.DateNormalizationExtractorMatchId.NUMERIC_ALL_VARIANTS;
import static eu.europeana.normalization.dates.DateNormalizationExtractorMatchId.NUMERIC_ALL_VARIANTS_XX;
-import static eu.europeana.normalization.dates.DateNormalizationExtractorMatchId.YYYY_MM_DD_SPACES;
-import static eu.europeana.normalization.dates.extraction.NumericPartsPattern.DatePartsIndices.DMY_INDICES;
-import static eu.europeana.normalization.dates.extraction.NumericPartsPattern.DatePartsIndices.YMD_INDICES;
+import static eu.europeana.normalization.dates.DateNormalizationExtractorMatchId.NUMERIC_SPACES_VARIANT;
+import static eu.europeana.normalization.dates.extraction.DatePartsIndices.DMY_INDICES;
+import static eu.europeana.normalization.dates.extraction.DatePartsIndices.YMD_INDICES;
import static eu.europeana.normalization.dates.extraction.NumericPartsPattern.NumericDateDelimiters.DASH_DOT_DELIMITERS;
import static eu.europeana.normalization.dates.extraction.NumericPartsPattern.NumericDateDelimiters.DASH_DOT_SLASH_DELIMITERS;
import static eu.europeana.normalization.dates.extraction.NumericPartsPattern.NumericDateDelimiters.DOT_SLASH_DELIMITERS;
import static eu.europeana.normalization.dates.extraction.NumericPartsPattern.NumericDateDelimiters.SPACE_DELIMITER;
-import static eu.europeana.normalization.dates.extraction.NumericPartsPattern.NumericRangeDateDelimiters.DASH_RANGE;
-import static eu.europeana.normalization.dates.extraction.NumericPartsPattern.NumericRangeDateDelimiters.SLASH_RANGE;
-import static eu.europeana.normalization.dates.extraction.NumericPartsPattern.NumericRangeDateDelimiters.SPACED_DASH_RANGE;
-import static eu.europeana.normalization.dates.extraction.NumericPartsPattern.NumericRangeDateDelimiters.SPACE_RANGE;
+import static eu.europeana.normalization.dates.extraction.NumericPartsPattern.NumericRangeQualifier.DASH_RANGE;
+import static eu.europeana.normalization.dates.extraction.NumericPartsPattern.NumericRangeQualifier.SLASH_RANGE;
+import static eu.europeana.normalization.dates.extraction.NumericPartsPattern.NumericRangeQualifier.SPACED_DASH_RANGE;
+import static eu.europeana.normalization.dates.extraction.NumericPartsPattern.NumericRangeQualifier.SPACE_RANGE;
import static java.util.Collections.unmodifiableSet;
import static java.util.regex.Pattern.CASE_INSENSITIVE;
import static java.util.regex.Pattern.compile;
@@ -21,8 +21,6 @@
import java.util.EnumSet;
import java.util.Set;
import java.util.regex.Pattern;
-import org.apache.commons.lang3.tuple.ImmutableTriple;
-import org.apache.commons.lang3.tuple.Triple;
/**
* Enum with all the acceptable date patterns used for numeric dates.
@@ -37,8 +35,8 @@ public enum NumericPartsPattern {
YMD_XX(DASH_DOT_SLASH_DELIMITERS, YMD_INDICES, NUMERIC_ALL_VARIANTS_XX),
DMY_XX(DASH_DOT_SLASH_DELIMITERS, DMY_INDICES, NUMERIC_ALL_VARIANTS_XX),
- YMD_SPACES(SPACE_DELIMITER, YMD_INDICES, YYYY_MM_DD_SPACES),
- DMY_SPACES(SPACE_DELIMITER, DMY_INDICES, YYYY_MM_DD_SPACES),
+ YMD_SPACES(SPACE_DELIMITER, YMD_INDICES, NUMERIC_SPACES_VARIANT),
+ DMY_SPACES(SPACE_DELIMITER, DMY_INDICES, NUMERIC_SPACES_VARIANT),
YMD_SPACED_DASH_RANGE(SPACED_DASH_RANGE, YMD_INDICES, NUMERIC_ALL_VARIANTS),
DMY_SPACED_DASH_RANGE(SPACED_DASH_RANGE, DMY_INDICES, NUMERIC_ALL_VARIANTS),
@@ -88,9 +86,9 @@ public enum NumericPartsPattern {
NumericPartsPattern(DateDelimiters dateDelimiters, DatePartsIndices dateFormatIndices,
DateNormalizationExtractorMatchId dateNormalizationExtractorMatchId) {
this.dateNormalizationExtractorMatchId = dateNormalizationExtractorMatchId;
- this.yearIndex = dateFormatIndices.tripleIndices.getLeft();
- this.monthIndex = dateFormatIndices.tripleIndices.getMiddle();
- this.dayIndex = dateFormatIndices.tripleIndices.getRight();
+ this.yearIndex = dateFormatIndices.getYearIndex();
+ this.monthIndex = dateFormatIndices.getMonthIndex();
+ this.dayIndex = dateFormatIndices.getDayIndex();
this.pattern = NumericPartsPattern.generatePattern(dateDelimiters.getDatesDelimiters(), dateNormalizationExtractorMatchId,
dateFormatIndices);
@@ -125,7 +123,7 @@ private static Pattern generatePattern(String dateDelimiters,
year = "(\\d{2}(?:XX|UU|--|\\?\\?)|\\d{3}(?!\\?)[XU]|\\d{4})";
delimiterDigits = "(?:" + dateDelimiters + "(\\d{2}|XX|UU|(?
*/
- public enum NumericRangeDateDelimiters implements DateDelimiters {
+ public enum NumericRangeQualifier implements DateDelimiters, DatesSeparator {
//"[XU]" with "-" delimiter, "[\\-XU]" with "./" delimiters
- SPACED_DASH_RANGE(" - ", DASH_DOT_SLASH_DELIMITERS, NumericRangeDateDelimiters.DEFAULT_UNSPECIFIED_CHARACTERS),
+ SPACED_DASH_RANGE(" - ", DASH_DOT_SLASH_DELIMITERS, NumericRangeQualifier.DEFAULT_UNSPECIFIED_CHARACTERS),
//"[XU]" with "-" delimiter, "[\\-XU]" with "./" delimiters
- PIPE_RANGE("\\|", DASH_DOT_SLASH_DELIMITERS, NumericRangeDateDelimiters.DEFAULT_UNSPECIFIED_CHARACTERS),
+ PIPE_RANGE("\\|", DASH_DOT_SLASH_DELIMITERS, NumericRangeQualifier.DEFAULT_UNSPECIFIED_CHARACTERS),
//For space separator we don't accept unspecified boundaries
//Does not exist in XX
SPACE_RANGE(" ", DASH_DOT_SLASH_DELIMITERS, null),
//"[XU]"
DASH_RANGE("-", DOT_SLASH_DELIMITERS, "\\?|\\.\\."),
//"[XU]" with "-" delimiter, "[\\-XU]" with "." delimiter
- SLASH_RANGE("/", DASH_DOT_DELIMITERS, NumericRangeDateDelimiters.DEFAULT_UNSPECIFIED_CHARACTERS);
+ SLASH_RANGE("/", DASH_DOT_DELIMITERS, NumericRangeQualifier.DEFAULT_UNSPECIFIED_CHARACTERS);
public static final String DEFAULT_UNSPECIFIED_CHARACTERS = "\\?|-|\\.\\.";
@@ -215,13 +213,14 @@ public enum NumericRangeDateDelimiters implements DateDelimiters {
private final String datesDelimiters;
private final String unspecifiedCharacters;
- NumericRangeDateDelimiters(String datesSeparator, NumericDateDelimiters datesDelimiters, String unspecifiedCharacters) {
+ NumericRangeQualifier(String datesSeparator, NumericDateDelimiters datesDelimiters, String unspecifiedCharacters) {
this.datesSeparator = datesSeparator;
this.datesDelimiters = datesDelimiters.getDatesDelimiters();
this.unspecifiedCharacters = unspecifiedCharacters;
}
- public String getDatesSeparator() {
+ @Override
+ public String getStringRepresentation() {
return datesSeparator;
}
@@ -234,18 +233,4 @@ public String getUnspecifiedCharacters() {
return unspecifiedCharacters;
}
}
-
- /**
- * Simple internal enum that contains the indices order of a DMY and YMD date formatting.
- */
- enum DatePartsIndices {
- DMY_INDICES(ImmutableTriple.of(3, 2, 1)),
- YMD_INDICES(ImmutableTriple.of(1, 2, 3));
-
- private final Triple tripleIndices;
-
- DatePartsIndices(Triple tripleIndices) {
- this.tripleIndices = tripleIndices;
- }
- }
}
diff --git a/metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/dateextractors/AbstractDateExtractor.java b/metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/dateextractors/AbstractDateExtractor.java
deleted file mode 100644
index 0538114121..0000000000
--- a/metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/dateextractors/AbstractDateExtractor.java
+++ /dev/null
@@ -1,82 +0,0 @@
-package eu.europeana.normalization.dates.extraction.dateextractors;
-
-import static eu.europeana.normalization.dates.DateNormalizationResult.getNoMatchResult;
-import static java.lang.String.format;
-
-import eu.europeana.normalization.dates.DateNormalizationResult;
-import eu.europeana.normalization.dates.edtf.DateQualification;
-import eu.europeana.normalization.dates.extraction.DateExtractionException;
-import java.lang.invoke.MethodHandles;
-import java.util.function.Supplier;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-/**
- * Abstract class implementing interface {@link DateExtractor} with default functionality for all extractors
- */
-public abstract class AbstractDateExtractor implements DateExtractor {
-
- private static final Logger LOGGER = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
-
- /**
- * Utility method for calling {@link #extract(String, DateQualification, boolean)} with allowSwitchesDuringValidation as true.
- * It also captures relevant exceptions so that return is performed
- *
- * @param inputValue the input value
- * @param dateQualification the date qualification requested
- * @return the date normalization result
- */
- @Override
- public DateNormalizationResult extractDateProperty(String inputValue, DateQualification dateQualification) {
- return getDateNormalizationResult(inputValue, dateQualification, true);
- }
-
- /**
- * Utility method for calling {@link #extract(String, DateQualification, boolean)} with allowSwitchesDuringValidation as false.
- * It also captures relevant exceptions so that return is performed
- *
- * @param inputValue the input value
- * @param dateQualification the date qualification requested
- * @return the date normalization result
- */
- @Override
- public DateNormalizationResult extractGenericProperty(String inputValue, DateQualification dateQualification) {
- return getDateNormalizationResult(inputValue, dateQualification, false);
- }
-
- private DateNormalizationResult getDateNormalizationResult(String inputValue, DateQualification dateQualification,
- boolean flexibleDateBuild) {
- DateNormalizationResult dateNormalizationResult = getNoMatchResult(inputValue);
- try {
- dateNormalizationResult = extract(inputValue, dateQualification, flexibleDateBuild);
- } catch (DateExtractionException e) {
- LOGGER.debug(format("Date extraction failed %s: ", inputValue), e);
- }
- //Sanity check to avoid null return.
- if (dateNormalizationResult == null) {
- dateNormalizationResult = getNoMatchResult(inputValue);
- }
- return dateNormalizationResult;
- }
-
- /**
- * Default method to get the correct date qualification.
- * If a requested date qualification is requested we then set that, overwriting any other that would otherwise be computed.
- * The date qualification will be overwritten if the requested date qualification in non-null and
- * non-{@link DateQualification#NO_QUALIFICATION}. Otherwise we compute it with the supplier provided.
- *
- * @param requestedDateQualification the requested date qualification
- * @param dateQualificationSupplier the date qualification supplier
- * @return the computed date qualification
- */
- DateQualification computeDateQualification(DateQualification requestedDateQualification,
- Supplier dateQualificationSupplier) {
- final DateQualification dateQualification;
- if (requestedDateQualification != null && requestedDateQualification != DateQualification.NO_QUALIFICATION) {
- dateQualification = requestedDateQualification;
- } else {
- dateQualification = dateQualificationSupplier.get();
- }
- return dateQualification;
- }
-}
diff --git a/metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/dateextractors/BriefRangeDateExtractor.java b/metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/dateextractors/BriefRangeDateExtractor.java
deleted file mode 100644
index 5c353611b8..0000000000
--- a/metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/dateextractors/BriefRangeDateExtractor.java
+++ /dev/null
@@ -1,70 +0,0 @@
-package eu.europeana.normalization.dates.extraction.dateextractors;
-
-import static eu.europeana.normalization.dates.YearPrecision.CENTURY;
-import static eu.europeana.normalization.dates.edtf.DateQualification.NO_QUALIFICATION;
-import static eu.europeana.normalization.dates.edtf.DateQualification.UNCERTAIN;
-
-import eu.europeana.normalization.dates.DateNormalizationExtractorMatchId;
-import eu.europeana.normalization.dates.DateNormalizationResult;
-import eu.europeana.normalization.dates.edtf.DateQualification;
-import eu.europeana.normalization.dates.edtf.InstantEdtfDate;
-import eu.europeana.normalization.dates.edtf.InstantEdtfDateBuilder;
-import eu.europeana.normalization.dates.edtf.IntervalEdtfDateBuilder;
-import eu.europeana.normalization.dates.extraction.DateExtractionException;
-import eu.europeana.normalization.dates.sanitize.DateFieldSanitizer;
-import java.time.Month;
-import java.util.regex.Matcher;
-import java.util.regex.Pattern;
-
-/**
- * Extractor that matches a date range where the end year includes only the rightmost two digits.
- *
- * The end year in this extractor has to:
- *
- * - Be higher than 12 to avoid matching a month value from other extractors.
- * - Be higher than the two rightmost digits of the start year.
- *
- *
- *
- * This pattern needs to be executed before the Edtf extractor because EDTF could potentially match yyyy/MM and yyyy-MM.
- * Therefore in this extractor we check only the values that are higher than 12 to avoid a mismatch.
- *
- */
-public class BriefRangeDateExtractor extends AbstractDateExtractor {
-
- private final Pattern briefRangePattern = Pattern.compile("\\??(\\d{3,4})[\\-/](\\d{2})\\??");
-
- public DateNormalizationResult extract(String inputValue, DateQualification requestedDateQualification,
- boolean flexibleDateBuild) throws DateExtractionException {
- final String sanitizedValue = DateFieldSanitizer.cleanSpacesAndTrim(inputValue);
- final DateQualification dateQualification = computeDateQualification(requestedDateQualification, () ->
- (sanitizedValue.startsWith("?") || sanitizedValue.endsWith("?")) ? UNCERTAIN : NO_QUALIFICATION);
-
- DateNormalizationResult dateNormalizationResult = DateNormalizationResult.getNoMatchResult(inputValue);
- final Matcher matcher = briefRangePattern.matcher(sanitizedValue);
- if (matcher.matches()) {
- final int startYearFourDigits = Integer.parseInt(matcher.group(1));
- final int startYearLastTwoDigits = startYearFourDigits % CENTURY.getDuration();
- final int endYearTwoDigits = Integer.parseInt(matcher.group(2));
- final int endYearFourDigits = (startYearFourDigits / CENTURY.getDuration()) * CENTURY.getDuration() + endYearTwoDigits;
-
- if (endYearTwoDigits > Month.DECEMBER.getValue() && startYearLastTwoDigits < endYearTwoDigits) {
- final InstantEdtfDate startDate = new InstantEdtfDateBuilder(startYearFourDigits)
- .withDateQualification(dateQualification)
- .withFlexibleDateBuild(flexibleDateBuild)
- .build();
-
- final InstantEdtfDate endDate = new InstantEdtfDateBuilder(endYearFourDigits)
- .withDateQualification(dateQualification)
- .withFlexibleDateBuild(flexibleDateBuild)
- .build();
-
- dateNormalizationResult = new DateNormalizationResult(DateNormalizationExtractorMatchId.BRIEF_DATE_RANGE, inputValue,
- new IntervalEdtfDateBuilder(startDate, endDate).withFlexibleDateBuild(flexibleDateBuild).build());
- }
- }
-
- return dateNormalizationResult;
- }
-}
-
diff --git a/metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/dateextractors/CenturyDateExtractor.java b/metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/dateextractors/CenturyDateExtractor.java
deleted file mode 100644
index f8c8c46bda..0000000000
--- a/metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/dateextractors/CenturyDateExtractor.java
+++ /dev/null
@@ -1,150 +0,0 @@
-package eu.europeana.normalization.dates.extraction.dateextractors;
-
-import static eu.europeana.normalization.dates.YearPrecision.CENTURY;
-import static eu.europeana.normalization.dates.edtf.DateQualification.NO_QUALIFICATION;
-import static eu.europeana.normalization.dates.edtf.DateQualification.UNCERTAIN;
-import static java.util.regex.Pattern.CASE_INSENSITIVE;
-import static java.util.regex.Pattern.compile;
-
-import eu.europeana.normalization.dates.DateNormalizationExtractorMatchId;
-import eu.europeana.normalization.dates.DateNormalizationResult;
-import eu.europeana.normalization.dates.DateNormalizationResultStatus;
-import eu.europeana.normalization.dates.edtf.AbstractEdtfDate;
-import eu.europeana.normalization.dates.edtf.DateQualification;
-import eu.europeana.normalization.dates.edtf.InstantEdtfDate;
-import eu.europeana.normalization.dates.edtf.InstantEdtfDateBuilder;
-import eu.europeana.normalization.dates.edtf.IntervalEdtfDateBuilder;
-import eu.europeana.normalization.dates.extraction.DateExtractionException;
-import eu.europeana.normalization.dates.extraction.RomanToNumber;
-import eu.europeana.normalization.dates.sanitize.DateFieldSanitizer;
-import java.lang.invoke.MethodHandles;
-import java.util.Arrays;
-import java.util.function.ToIntFunction;
-import java.util.regex.Matcher;
-import java.util.regex.Pattern;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-/**
- * Extractor that matches a century with a decimal or Roman numerals
- * The range of values this accepts are from 1-21 including.
- * Examples of some cases:
- *
- * -
- * Value = 18.. | Outcome = 18XX
- * Value = 1st century | Outcome = 00XX
- * Value = s. XXI | Outcome = 20XX
- * Value = s.II-III | Outcome = 01XX/02XX
- *
- *
- *
- * The Roman numerals may also be preceded by an abbreviation of century, for example ‘s. XIX’.
- * Also supports ranges.
- */
-public class CenturyDateExtractor extends AbstractDateExtractor {
-
- private static final Logger LOGGER = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
- private static final String NUMERIC_10_TO_21_ENDING_DOTS_REGEX = "(1\\d|2[0-1])\\.{2}";
- private static final String NUMERIC_1_TO_21_SUFFIXED_REGEX = "(2?1st|2nd|3rd|(?:1\\d|[4-9]|20)th)\\scentury";
- private static final String ROMAN_1_TO_21_REGEX = "(X?(?:IX|IV|VI{0,3}|I{1,3})|X|XXI?)";
- private static final String CENTURY_PREFIX = "(?:(?:s|sec|saec)\\s|(?:s|sec|saec)\\.\\s?)?";
- private static final String QUESTION_MARK = "\\??";
-
- enum PatternCenturyDateOperation {
- PATTERN_YYYY(
- compile(QUESTION_MARK + NUMERIC_10_TO_21_ENDING_DOTS_REGEX + QUESTION_MARK, CASE_INSENSITIVE),
- Integer::parseInt, DateNormalizationExtractorMatchId.CENTURY_NUMERIC),
- PATTERN_ENGLISH(
- compile(QUESTION_MARK + NUMERIC_1_TO_21_SUFFIXED_REGEX + QUESTION_MARK, CASE_INSENSITIVE),
- century -> (Integer.parseInt(century.substring(0, century.length() - 2)) - 1),
- DateNormalizationExtractorMatchId.CENTURY_NUMERIC),
- PATTERN_ROMAN(
- compile(QUESTION_MARK + CENTURY_PREFIX + ROMAN_1_TO_21_REGEX + QUESTION_MARK, CASE_INSENSITIVE),
- century -> (RomanToNumber.romanToDecimal(century) - 1),
- DateNormalizationExtractorMatchId.CENTURY_ROMAN),
- PATTERN_ROMAN_RANGE(
- compile(QUESTION_MARK + CENTURY_PREFIX + ROMAN_1_TO_21_REGEX + "\\s?-\\s?" + ROMAN_1_TO_21_REGEX + QUESTION_MARK,
- CASE_INSENSITIVE), century -> (RomanToNumber.romanToDecimal(century) - 1),
- DateNormalizationExtractorMatchId.CENTURY_RANGE_ROMAN);
-
- private final Pattern pattern;
- private final ToIntFunction centuryExtractorFunction;
- private final DateNormalizationExtractorMatchId dateNormalizationExtractorMatchId;
-
- PatternCenturyDateOperation(Pattern pattern, ToIntFunction centuryExtractorFunction,
- DateNormalizationExtractorMatchId dateNormalizationExtractorMatchId) {
- this.pattern = pattern;
- this.centuryExtractorFunction = centuryExtractorFunction;
- this.dateNormalizationExtractorMatchId = dateNormalizationExtractorMatchId;
- }
-
- public Pattern getPattern() {
- return pattern;
- }
-
- public ToIntFunction getCenturyExtractorFunction() {
- return centuryExtractorFunction;
- }
-
- public DateNormalizationExtractorMatchId getDateNormalizationExtractorMatchId() {
- return dateNormalizationExtractorMatchId;
- }
- }
-
- @Override
- public DateNormalizationResult extract(String inputValue, DateQualification requestedDateQualification,
- boolean flexibleDateBuild) {
- return Arrays.stream(PatternCenturyDateOperation.values())
- .map(operation -> {
- try {
- return extractInstance(inputValue, requestedDateQualification, operation,
- flexibleDateBuild);
- } catch (DateExtractionException e) {
- LOGGER.warn("Failed instance extraction!", e);
- }
- return DateNormalizationResult.getNoMatchResult(inputValue);
- })
- .filter(dateNormalizationResult -> dateNormalizationResult.getDateNormalizationResultStatus()
- == DateNormalizationResultStatus.MATCHED).findFirst()
- .orElse(DateNormalizationResult.getNoMatchResult(inputValue));
- }
-
- private DateNormalizationResult extractInstance(String inputValue, DateQualification requestedDateQualification,
- PatternCenturyDateOperation patternCenturyDateOperation,
- boolean allowSwitchMonthDay) throws DateExtractionException {
- final String sanitizedValue = DateFieldSanitizer.cleanSpacesAndTrim(inputValue);
- final DateQualification dateQualification = computeDateQualification(requestedDateQualification, () ->
- (sanitizedValue.startsWith("?") || sanitizedValue.endsWith("?")) ? UNCERTAIN : NO_QUALIFICATION);
-
- final Matcher matcher = patternCenturyDateOperation.getPattern().matcher(sanitizedValue);
- DateNormalizationResult dateNormalizationResult = DateNormalizationResult.getNoMatchResult(inputValue);
- if (matcher.matches()) {
- AbstractEdtfDate abstractEdtfDate;
- InstantEdtfDateBuilder startDatePartBuilder = extractEdtfDatePart(patternCenturyDateOperation, matcher, 1);
- InstantEdtfDate startEdtfDate = startDatePartBuilder.withDateQualification(dateQualification)
- .withFlexibleDateBuild(allowSwitchMonthDay).build();
-
- boolean isInterval = matcher.groupCount() == 2;
- if (isInterval) {
- InstantEdtfDateBuilder endDatePartBuilder = extractEdtfDatePart(patternCenturyDateOperation, matcher, 2);
- InstantEdtfDate endEdtfDate = endDatePartBuilder.withDateQualification(dateQualification)
- .withFlexibleDateBuild(allowSwitchMonthDay).build();
- abstractEdtfDate = new IntervalEdtfDateBuilder(startEdtfDate, endEdtfDate).withFlexibleDateBuild(allowSwitchMonthDay)
- .build();
- } else {
- abstractEdtfDate = startEdtfDate;
- }
-
- dateNormalizationResult = new DateNormalizationResult(patternCenturyDateOperation.getDateNormalizationExtractorMatchId(),
- inputValue, abstractEdtfDate);
- }
- return dateNormalizationResult;
- }
-
- private InstantEdtfDateBuilder extractEdtfDatePart(PatternCenturyDateOperation patternCenturyDateOperation,
- Matcher matcher, int group) {
- final String century = matcher.group(group);
- return new InstantEdtfDateBuilder(patternCenturyDateOperation.getCenturyExtractorFunction().applyAsInt(century))
- .withYearPrecision(CENTURY);
- }
-}
diff --git a/metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/dateextractors/EdtfDateExtractor.java b/metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/dateextractors/EdtfDateExtractor.java
deleted file mode 100644
index 84c617d165..0000000000
--- a/metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/dateextractors/EdtfDateExtractor.java
+++ /dev/null
@@ -1,105 +0,0 @@
-package eu.europeana.normalization.dates.extraction.dateextractors;
-
-import static eu.europeana.normalization.dates.edtf.DateBoundaryType.OPEN;
-import static eu.europeana.normalization.dates.edtf.DateBoundaryType.UNKNOWN;
-import static eu.europeana.normalization.dates.edtf.DateQualification.CHECK_QUALIFICATION_PATTERN;
-import static eu.europeana.normalization.dates.edtf.DateQualification.NO_QUALIFICATION;
-import static eu.europeana.normalization.dates.edtf.InstantEdtfDateBuilder.OVER_4_DIGITS_YEAR_PREFIX;
-import static eu.europeana.normalization.dates.edtf.IntervalEdtfDate.DATE_INTERVAL_SEPARATOR;
-
-import eu.europeana.normalization.dates.DateNormalizationExtractorMatchId;
-import eu.europeana.normalization.dates.DateNormalizationResult;
-import eu.europeana.normalization.dates.edtf.AbstractEdtfDate;
-import eu.europeana.normalization.dates.edtf.DateQualification;
-import eu.europeana.normalization.dates.edtf.InstantEdtfDate;
-import eu.europeana.normalization.dates.edtf.InstantEdtfDateBuilder;
-import eu.europeana.normalization.dates.edtf.IntervalEdtfDate;
-import eu.europeana.normalization.dates.edtf.IntervalEdtfDateBuilder;
-import eu.europeana.normalization.dates.edtf.Iso8601Parser;
-import eu.europeana.normalization.dates.extraction.DateExtractionException;
-import java.time.temporal.TemporalAccessor;
-import java.util.regex.Matcher;
-import org.apache.commons.lang3.StringUtils;
-import org.apache.commons.lang3.math.NumberUtils;
-
-/**
- * The pattern for EDTF dates and compatible with ISO 8601 dates.
- * This parser supports partial Level0 and Level1 from the Extended
- * Date/Time Format (EDTF) Specification. It only validates the date part of a date and the time if existent it is discarded.
- * Specifically from Level1, seasons and Unspecified digit(s) from the right are not supported
- *
- */
-public class EdtfDateExtractor extends AbstractDateExtractor {
-
- private static final Iso8601Parser ISO_8601_PARSER = new Iso8601Parser();
-
- @Override
- public DateNormalizationResult extract(String inputValue, DateQualification requestedDateQualification,
- boolean flexibleDateBuild) throws DateExtractionException {
- if (StringUtils.isEmpty(inputValue)) {
- throw new DateExtractionException("Empty argument");
- }
- final AbstractEdtfDate edtfDate;
- if (inputValue.contains(DATE_INTERVAL_SEPARATOR)) {
- edtfDate = extractInterval(inputValue, requestedDateQualification, flexibleDateBuild);
- } else {
- edtfDate = extractInstant(inputValue, requestedDateQualification, flexibleDateBuild);
- }
- return new DateNormalizationResult(DateNormalizationExtractorMatchId.EDTF, inputValue, edtfDate);
- }
-
- protected IntervalEdtfDate extractInterval(String dateInput, DateQualification requestedDateQualification,
- boolean allowSwitchMonthDay) throws DateExtractionException {
- String startPart = dateInput.substring(0, dateInput.indexOf(DATE_INTERVAL_SEPARATOR));
- String endPart = dateInput.substring(dateInput.indexOf(DATE_INTERVAL_SEPARATOR) + 1);
- final InstantEdtfDate start = extractInstant(startPart, requestedDateQualification, allowSwitchMonthDay);
- final InstantEdtfDate end = extractInstant(endPart, requestedDateQualification, allowSwitchMonthDay);
-
- //Are both ends unknown or open, then it is not a date
- if ((end.getDateBoundaryType() == UNKNOWN || end.getDateBoundaryType() == OPEN) &&
- (start.getDateBoundaryType() == UNKNOWN || start.getDateBoundaryType() == OPEN)) {
- throw new DateExtractionException(dateInput);
- }
- return new IntervalEdtfDateBuilder(start, end).withFlexibleDateBuild(allowSwitchMonthDay).build();
- }
-
- protected InstantEdtfDate extractInstant(String dateInput, DateQualification requestedDateQualification,
- boolean allowSwitchMonthDay) throws DateExtractionException {
- final InstantEdtfDate instantEdtfDate;
- if (UNKNOWN.getDeserializedRepresentation().equals(dateInput)) {
- instantEdtfDate = InstantEdtfDate.getUnknownInstance();
- } else if (OPEN.getDeserializedRepresentation().equals(dateInput)) {
- instantEdtfDate = InstantEdtfDate.getOpenInstance();
- } else if (dateInput.startsWith(String.valueOf(OVER_4_DIGITS_YEAR_PREFIX))) {
- int year = NumberUtils.toInt(dateInput.substring(1));
- instantEdtfDate = new InstantEdtfDateBuilder(year).withLongYearPrefixedWithY()
- .withDateQualification(requestedDateQualification).build();
- } else {
- instantEdtfDate = extractInstantEdtfDate(dateInput, requestedDateQualification, allowSwitchMonthDay);
- }
- return instantEdtfDate;
- }
-
- private static InstantEdtfDate extractInstantEdtfDate(String dateInput, DateQualification requestedDateQualification,
- boolean allowSwitchMonthDay) throws DateExtractionException {
- Matcher matcher = CHECK_QUALIFICATION_PATTERN.matcher(dateInput);
- String dateInputStrippedModifier = dateInput;
- DateQualification dateQualification = requestedDateQualification;
-
- boolean containsQualification = matcher.matches();
- if (containsQualification && (requestedDateQualification == null || requestedDateQualification == NO_QUALIFICATION)) {
- final String modifier = matcher.group(1);
- if (StringUtils.isNotEmpty(modifier)) {
- dateQualification = DateQualification.fromCharacter(String.valueOf(modifier.charAt(0)));
- dateInputStrippedModifier = dateInput.substring(0, dateInput.length() - 1);
- }
- }
-
- final TemporalAccessor temporalAccessor = ISO_8601_PARSER.parseDatePart(dateInputStrippedModifier);
- return new InstantEdtfDateBuilder(temporalAccessor)
- .withDateQualification(dateQualification)
- .withFlexibleDateBuild(allowSwitchMonthDay)
- .build();
- }
-
-}
diff --git a/metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/dateextractors/NumericPartsRangeDateExtractor.java b/metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/dateextractors/NumericPartsRangeDateExtractor.java
deleted file mode 100644
index ca9dd573c9..0000000000
--- a/metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/dateextractors/NumericPartsRangeDateExtractor.java
+++ /dev/null
@@ -1,118 +0,0 @@
-package eu.europeana.normalization.dates.extraction.dateextractors;
-
-import static eu.europeana.normalization.dates.DateNormalizationExtractorMatchId.NUMERIC_ALL_VARIANTS;
-import static eu.europeana.normalization.dates.DateNormalizationExtractorMatchId.NUMERIC_ALL_VARIANTS_XX;
-import static eu.europeana.normalization.dates.DateNormalizationExtractorMatchId.NUMERIC_RANGE_ALL_VARIANTS;
-import static eu.europeana.normalization.dates.DateNormalizationExtractorMatchId.NUMERIC_RANGE_ALL_VARIANTS_XX;
-
-import eu.europeana.normalization.dates.DateNormalizationExtractorMatchId;
-import eu.europeana.normalization.dates.DateNormalizationResult;
-import eu.europeana.normalization.dates.DateNormalizationResultStatus;
-import eu.europeana.normalization.dates.edtf.DateBoundaryType;
-import eu.europeana.normalization.dates.edtf.DateQualification;
-import eu.europeana.normalization.dates.edtf.InstantEdtfDate;
-import eu.europeana.normalization.dates.edtf.IntervalEdtfDate;
-import eu.europeana.normalization.dates.edtf.IntervalEdtfDateBuilder;
-import eu.europeana.normalization.dates.extraction.DateExtractionException;
-import eu.europeana.normalization.dates.extraction.NumericPartsPattern;
-import eu.europeana.normalization.dates.extraction.NumericPartsPattern.NumericRangeDateDelimiters;
-import eu.europeana.normalization.dates.sanitize.DateFieldSanitizer;
-
-/**
- * Patterns for numeric date ranges with variations in the separators of date components.
- * We reuse the already existent {@link NumericPartsDateExtractor} code for the boundaries.
- */
-public class NumericPartsRangeDateExtractor extends AbstractDateExtractor {
-
- private static final NumericPartsDateExtractor NUMERIC_WITH_MISSING_PARTS_DATE_EXTRACTOR = new NumericPartsDateExtractor();
-
- /**
- * Extract the date normalization result for a range.
- *
- * The date is split in two boundaries using the {@link NumericRangeDateDelimiters#values()} as a separator. The result will
- * contain the first split that is exactly splitting the original value in two parts(boundaries) and those two boundaries are
- * valid parsable boundaries or null if none found.
- *
- *
- * @param inputValue the range value to attempt parsing
- * @return the date normalization result
- */
- @Override
- public DateNormalizationResult extract(String inputValue, DateQualification requestedDateQualification,
- boolean flexibleDateBuild) throws DateExtractionException {
- final String sanitizedValue = DateFieldSanitizer.cleanSpacesAndTrim(inputValue);
- DateNormalizationResult startDateResult;
- DateNormalizationResult endDateResult;
- DateNormalizationResult rangeDate = DateNormalizationResult.getNoMatchResult(inputValue);
- for (NumericRangeDateDelimiters numericRangeSpecialCharacters : NumericRangeDateDelimiters.values()) {
- // Split with -1 limit does not discard empty splits
- final String[] sanitizedDateSplitArray = sanitizedValue.split(numericRangeSpecialCharacters.getDatesSeparator(), -1);
- // The sanitizedDateSplitArray has to be exactly in two, and then we can verify.
- // This also guarantees that the separator used is not used for unknown characters.
- if (sanitizedDateSplitArray.length == 2) {
- // Try extraction and verify
- startDateResult = extractDateNormalizationResult(sanitizedDateSplitArray[0], numericRangeSpecialCharacters,
- requestedDateQualification,
- flexibleDateBuild);
- endDateResult = extractDateNormalizationResult(sanitizedDateSplitArray[1], numericRangeSpecialCharacters,
- requestedDateQualification, flexibleDateBuild);
- if (startDateResult.getDateNormalizationResultStatus() == DateNormalizationResultStatus.MATCHED
- && endDateResult.getDateNormalizationResultStatus() == DateNormalizationResultStatus.MATCHED
- && !areYearsAmbiguous((InstantEdtfDate) startDateResult.getEdtfDate(), (InstantEdtfDate) endDateResult.getEdtfDate(),
- numericRangeSpecialCharacters)) {
-
- final DateNormalizationExtractorMatchId dateNormalizationExtractorMatchId =
- getDateNormalizationExtractorId(startDateResult, endDateResult);
- final IntervalEdtfDate intervalEdtfDate = new IntervalEdtfDateBuilder((InstantEdtfDate) startDateResult.getEdtfDate(),
- (InstantEdtfDate) endDateResult.getEdtfDate()).withFlexibleDateBuild(flexibleDateBuild).build();
- rangeDate = new DateNormalizationResult(dateNormalizationExtractorMatchId, inputValue, intervalEdtfDate);
- break;
- }
- }
- }
- return rangeDate;
- }
-
- /**
- * Captures the ambiguous case of "198-?".
- *
- * @param startDate the start date of a range
- * @param endDate the end date of the range
- * @param numericRangeSpecialCharacters the date separator of the range
- * @return true if the range is ambiguous
- */
- private boolean areYearsAmbiguous(InstantEdtfDate startDate, InstantEdtfDate endDate,
- NumericRangeDateDelimiters numericRangeSpecialCharacters) {
- boolean isAmbiguous = false;
- if (numericRangeSpecialCharacters == NumericRangeDateDelimiters.DASH_RANGE) {
- final boolean isStartDeclared = startDate.getDateBoundaryType() == DateBoundaryType.DECLARED;
- final boolean isStartThreeDigit =
- isStartDeclared && Integer.toString(startDate.getYear().getValue()).matches("\\d{3}");
- if (isStartThreeDigit && endDate.getDateBoundaryType() == DateBoundaryType.OPEN) {
- isAmbiguous = true;
- }
- }
- return isAmbiguous;
- }
-
- private DateNormalizationResult extractDateNormalizationResult(String dateString,
- NumericRangeDateDelimiters numericRangeSpecialCharacters, DateQualification requestedDateQualification,
- boolean allowSwitchMonthDay) throws DateExtractionException {
- final DateNormalizationResult dateNormalizationResult;
- if (numericRangeSpecialCharacters.getUnspecifiedCharacters() != null && dateString.matches(
- numericRangeSpecialCharacters.getUnspecifiedCharacters())) {
- dateNormalizationResult = new DateNormalizationResult(NUMERIC_ALL_VARIANTS, dateString, InstantEdtfDate.getOpenInstance());
- } else {
- dateNormalizationResult = NUMERIC_WITH_MISSING_PARTS_DATE_EXTRACTOR.extract(dateString, requestedDateQualification,
- NumericPartsPattern.NUMERIC_RANGE_SET, allowSwitchMonthDay);
- }
- return dateNormalizationResult;
- }
-
- private static DateNormalizationExtractorMatchId getDateNormalizationExtractorId(DateNormalizationResult startDate,
- DateNormalizationResult endDate) {
- final boolean isStartXX = startDate.getDateNormalizationExtractorMatchId() == NUMERIC_ALL_VARIANTS_XX;
- final boolean isEndXX = endDate.getDateNormalizationExtractorMatchId() == NUMERIC_ALL_VARIANTS_XX;
- return isStartXX || isEndXX ? NUMERIC_RANGE_ALL_VARIANTS_XX : NUMERIC_RANGE_ALL_VARIANTS;
- }
-}
diff --git a/metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/dateextractors/PatternBcAdDateExtractor.java b/metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/dateextractors/PatternBcAdDateExtractor.java
deleted file mode 100644
index 803d25e00a..0000000000
--- a/metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/dateextractors/PatternBcAdDateExtractor.java
+++ /dev/null
@@ -1,124 +0,0 @@
-package eu.europeana.normalization.dates.extraction.dateextractors;
-
-import eu.europeana.normalization.dates.DateNormalizationExtractorMatchId;
-import eu.europeana.normalization.dates.DateNormalizationResult;
-import eu.europeana.normalization.dates.edtf.DateQualification;
-import eu.europeana.normalization.dates.edtf.InstantEdtfDate;
-import eu.europeana.normalization.dates.edtf.InstantEdtfDateBuilder;
-import eu.europeana.normalization.dates.edtf.IntervalEdtfDateBuilder;
-import eu.europeana.normalization.dates.extraction.DateExtractionException;
-import java.util.HashSet;
-import java.util.regex.Matcher;
-import java.util.regex.Pattern;
-import java.util.stream.Collectors;
-
-/**
- * A year with an indication of the era, for example ‘3000 BC’. Currently, the normalisation process recognizes ‘BC/AD’ and
- * ‘AC/DC’, but the abbreviations used in other languages will be supported in the future. Or a date range where the start/end
- * years contain an indication of the era.
- */
-public class PatternBcAdDateExtractor extends AbstractDateExtractor {
-
- static final HashSet bcAbbreviations = new HashSet<>();
-
- static {
- bcAbbreviations.add("B\\.?C".toLowerCase());
- bcAbbreviations.add("A\\.?C".toLowerCase());
- bcAbbreviations.add("v\\.?Chr".toLowerCase());
- bcAbbreviations.add("vC".toLowerCase());
- bcAbbreviations.add("avant J\\.?-C".toLowerCase());
- bcAbbreviations.add("av[\\. ]J\\.?-C".toLowerCase());
- //bcAbbreviations.add("eKr"); removed due to ambiguity
- bcAbbreviations.add("f\\.?Kr".toLowerCase());
- bcAbbreviations.add("π\\.*Χ".toLowerCase());
- }
-
- static final HashSet adAbbreviations = new HashSet<>();
-
- static {
- adAbbreviations.add("A\\.?D".toLowerCase());
- adAbbreviations.add("D\\.?C".toLowerCase());
- adAbbreviations.add("n\\.?Chr".toLowerCase());
- adAbbreviations.add("nC".toLowerCase());
- adAbbreviations.add("après J-C".toLowerCase());
- adAbbreviations.add("apres J-C".toLowerCase());
- adAbbreviations.add("ap[\\. ]J-C".toLowerCase());
- //adAbbreviations.add("eKr"); removed due to ambiguity
- adAbbreviations.add("j\\.?Kr".toLowerCase());
- adAbbreviations.add("μ\\.?Χ".toLowerCase());
- }
-
- static final HashSet bcAbbreviationsPatterns = new HashSet<>();
-
- static {
- for (String abbrev : bcAbbreviations) {
- bcAbbreviationsPatterns.add(Pattern.compile(abbrev, Pattern.CASE_INSENSITIVE));
- }
- }
-
- Pattern patYyyy;
- Pattern patRange;
-
- public PatternBcAdDateExtractor() {
- String patYearBcAd = "(?\\d{2,4})\\s*(?";
- patYearBcAd += bcAbbreviations.stream().collect(Collectors.joining("|"));
- patYearBcAd += adAbbreviations.stream().collect(Collectors.joining("|"));
- patYearBcAd = patYearBcAd.substring(0, patYearBcAd.length() - 1) + ")\\.?";
-
- patYyyy = Pattern.compile(patYearBcAd, Pattern.CASE_INSENSITIVE);
- patRange = Pattern.compile(
- patYearBcAd + "\\s*[\\-\\/]\\s*" + patYearBcAd.replace("", "").replace("", ""),
- Pattern.CASE_INSENSITIVE);
- }
-
- @Override
- public DateNormalizationResult extract(String inputValue, DateQualification requestedDateQualification,
- boolean flexibleDateBuild) throws DateExtractionException {
- Matcher m = patYyyy.matcher(inputValue);
- if (m.matches()) {
- final InstantEdtfDateBuilder instantEdtfDateBuilder;
- if (bcAbbreviations.contains(m.group("era").toLowerCase())) {
- instantEdtfDateBuilder = new InstantEdtfDateBuilder(-Integer.parseInt(m.group("year")));
- } else {
- instantEdtfDateBuilder = new InstantEdtfDateBuilder(Integer.parseInt(m.group("year")));
- }
- return new DateNormalizationResult(DateNormalizationExtractorMatchId.BC_AD, inputValue,
- instantEdtfDateBuilder.withDateQualification(requestedDateQualification).withFlexibleDateBuild(
- flexibleDateBuild)
- .build());
- }
- m = patRange.matcher(inputValue);
- if (m.matches()) {
- final InstantEdtfDateBuilder startDatePartBuilder;
- if (isBc(m.group("era"))) {
- startDatePartBuilder = new InstantEdtfDateBuilder(-Integer.parseInt(m.group("year")));
- } else {
- startDatePartBuilder = new InstantEdtfDateBuilder(Integer.parseInt(m.group("year")));
- }
- InstantEdtfDate start = startDatePartBuilder.withDateQualification(requestedDateQualification)
- .withFlexibleDateBuild(flexibleDateBuild).build();
-
- final InstantEdtfDateBuilder endDatePartBuilder;
- if (isBc(m.group("era2"))) {
- endDatePartBuilder = new InstantEdtfDateBuilder(-Integer.parseInt(m.group("year2")));
- } else {
- endDatePartBuilder = new InstantEdtfDateBuilder(Integer.parseInt(m.group("year2")));
- }
- InstantEdtfDate end = endDatePartBuilder.withDateQualification(requestedDateQualification)
- .withFlexibleDateBuild(flexibleDateBuild).build();
-
- return new DateNormalizationResult(DateNormalizationExtractorMatchId.BC_AD, inputValue,
- new IntervalEdtfDateBuilder(start, end).withFlexibleDateBuild(flexibleDateBuild).build());
- }
- return DateNormalizationResult.getNoMatchResult(inputValue);
- }
-
- private boolean isBc(String abbreviation) {
- for (Pattern pat : bcAbbreviationsPatterns) {
- if (pat.matcher(abbreviation).matches()) {
- return true;
- }
- }
- return false;
- }
-}
diff --git a/metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/dateextractors/PatternFormatedFullDateDateExtractor.java b/metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/dateextractors/PatternFormatedFullDateDateExtractor.java
deleted file mode 100644
index 3754efe377..0000000000
--- a/metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/dateextractors/PatternFormatedFullDateDateExtractor.java
+++ /dev/null
@@ -1,66 +0,0 @@
-package eu.europeana.normalization.dates.extraction.dateextractors;
-
-import eu.europeana.normalization.dates.DateNormalizationExtractorMatchId;
-import eu.europeana.normalization.dates.DateNormalizationResult;
-import eu.europeana.normalization.dates.edtf.DateQualification;
-import eu.europeana.normalization.dates.edtf.InstantEdtfDate;
-import eu.europeana.normalization.dates.edtf.InstantEdtfDateBuilder;
-import eu.europeana.normalization.dates.extraction.DateExtractionException;
-import eu.europeana.normalization.dates.extraction.MonthMultilingual;
-import java.util.regex.Matcher;
-import java.util.regex.Pattern;
-
-/**
- * Patterns for date formats that are well-structured but do not follow a particular standard
- */
-public class PatternFormatedFullDateDateExtractor extends AbstractDateExtractor {
-
- MonthMultilingual monthNames = new MonthMultilingual();
-
- // "Thu Dec 31 01:00:00 CET 1863","31 Dec 1863"
- // month day hour minute second year
- Pattern patFormatedDate = Pattern
- .compile("\\w{3} (\\w{3}) (\\d{2}) (\\d{2}):(\\d{2}):(\\d{2}) \\w{3,4} (\\d{1,4})");
-
- // 2020-06-21 13:43:26 UTC
- // year month day hour minute second
- Pattern patFormatedDate2 = Pattern
- .compile("(\\d{4})-(\\d{2})-(\\d{2}) (\\d{2}):(\\d{2}):(\\d{2}) \\w{3,4}\\s?(\\d{0,4})");
-
- // 2018-03-27 09:08:34
- // year month day hour minute second
- Pattern patFormatedDate3 = Pattern.compile("(\\d{4})-(\\d{2})-(\\d{2}) (\\d{2}):(\\d{2}):(\\d{2})(\\.\\d{1,3})?");
-
- @Override
- public DateNormalizationResult extract(String inputValue, DateQualification requestedDateQualification,
- boolean flexibleDateBuild) throws DateExtractionException {
- final DateQualification dateQualification = computeDateQualification(requestedDateQualification,
- () -> DateQualification.NO_QUALIFICATION);
-
- Matcher m = patFormatedDate2.matcher(inputValue);
- if (m.matches()) {
- final InstantEdtfDate datePart = new InstantEdtfDateBuilder(Integer.parseInt(m.group(1)))
- .withMonth(Integer.parseInt(m.group(2)))
- .withDay(Integer.parseInt(m.group(3)))
- .withDateQualification(dateQualification).withFlexibleDateBuild(flexibleDateBuild).build();
- return new DateNormalizationResult(DateNormalizationExtractorMatchId.FORMATTED_FULL_DATE, inputValue, datePart);
- }
- m = patFormatedDate.matcher(inputValue);
- if (m.matches()) {
- final InstantEdtfDate datePart = new InstantEdtfDateBuilder(Integer.parseInt(m.group(6)))
- .withMonth(monthNames.getMonthIndexValue(m.group(1)))
- .withDay(Integer.parseInt(m.group(2)))
- .withDateQualification(dateQualification).withFlexibleDateBuild(flexibleDateBuild).build();
- return new DateNormalizationResult(DateNormalizationExtractorMatchId.FORMATTED_FULL_DATE, inputValue, datePart);
- }
- m = patFormatedDate3.matcher(inputValue);
- if (m.matches()) {
- final InstantEdtfDate datePart = new InstantEdtfDateBuilder(Integer.parseInt(m.group(1)))
- .withMonth(Integer.parseInt(m.group(2)))
- .withDay(Integer.parseInt(m.group(3)))
- .withDateQualification(dateQualification).withFlexibleDateBuild(flexibleDateBuild).build();
- return new DateNormalizationResult(DateNormalizationExtractorMatchId.FORMATTED_FULL_DATE, inputValue, datePart);
- }
- return DateNormalizationResult.getNoMatchResult(inputValue);
- }
-}
diff --git a/metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/dateextractors/PatternLongNegativeYearDateExtractor.java b/metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/dateextractors/PatternLongNegativeYearDateExtractor.java
deleted file mode 100644
index 7e8b9dc942..0000000000
--- a/metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/dateextractors/PatternLongNegativeYearDateExtractor.java
+++ /dev/null
@@ -1,60 +0,0 @@
-package eu.europeana.normalization.dates.extraction.dateextractors;
-
-import static eu.europeana.normalization.dates.edtf.DateQualification.NO_QUALIFICATION;
-import static eu.europeana.normalization.dates.edtf.DateQualification.UNCERTAIN;
-
-import eu.europeana.normalization.dates.DateNormalizationExtractorMatchId;
-import eu.europeana.normalization.dates.DateNormalizationResult;
-import eu.europeana.normalization.dates.edtf.DateQualification;
-import eu.europeana.normalization.dates.edtf.InstantEdtfDate;
-import eu.europeana.normalization.dates.edtf.InstantEdtfDateBuilder;
-import eu.europeana.normalization.dates.edtf.IntervalEdtfDate;
-import eu.europeana.normalization.dates.edtf.IntervalEdtfDateBuilder;
-import eu.europeana.normalization.dates.extraction.DateExtractionException;
-import java.util.regex.Matcher;
-import java.util.regex.Pattern;
-
-/**
- * A year before 1 AD with more than 4 digits. This pattern is typically used in archaeological contexts. The year may contain
- * between 5 and 9 digits. Aso includes the pattern for ranges of this kind of years.
- */
-public class PatternLongNegativeYearDateExtractor extends AbstractDateExtractor {
-
- Pattern patYyyyyy = Pattern.compile("\\s*(?\\?)?(?-\\d{5,9})(?\\?)?\\s*",
- Pattern.CASE_INSENSITIVE);
- Pattern patYyyyyyRange = Pattern.compile(
- "\\s*(?\\?)?(?-\\d{5,9})\\s*/\\s*(?-\\d{5,9})(?\\?)?\\s*",
- Pattern.CASE_INSENSITIVE);
-
- @Override
- public DateNormalizationResult extract(String inputValue, DateQualification requestedDateQualification,
- boolean flexibleDateBuild) throws DateExtractionException {
- final DateQualification dateQualification;
-
- final Matcher m = patYyyyyy.matcher(inputValue);
- if (m.matches()) {
- dateQualification =
- computeDateQualification(requestedDateQualification,
- () -> (m.group("uncertain") != null || m.group("uncertain2") != null) ? UNCERTAIN : NO_QUALIFICATION);
-
- final InstantEdtfDate datePart = new InstantEdtfDateBuilder(Integer.parseInt(m.group("year"))).withDateQualification(
- dateQualification).withFlexibleDateBuild(flexibleDateBuild).build();
- return new DateNormalizationResult(DateNormalizationExtractorMatchId.LONG_NEGATIVE_YEAR, inputValue, datePart);
- }
- final Matcher m2 = patYyyyyyRange.matcher(inputValue);
- if (m2.matches()) {
- dateQualification =
- computeDateQualification(requestedDateQualification,
- () -> (m2.group("uncertain") != null || m2.group("uncertain2") != null) ? UNCERTAIN : NO_QUALIFICATION);
-
- final InstantEdtfDate startDatePart = new InstantEdtfDateBuilder(Integer.parseInt(m2.group("year"))).withDateQualification(
- dateQualification).withFlexibleDateBuild(flexibleDateBuild).build();
- final InstantEdtfDate endDatePart = new InstantEdtfDateBuilder(Integer.parseInt(m2.group("year2"))).withDateQualification(
- dateQualification).withFlexibleDateBuild(flexibleDateBuild).build();
- IntervalEdtfDate intervalEdtfDate = new IntervalEdtfDateBuilder(startDatePart, endDatePart).withFlexibleDateBuild(
- flexibleDateBuild).build();
- return new DateNormalizationResult(DateNormalizationExtractorMatchId.LONG_NEGATIVE_YEAR, inputValue, intervalEdtfDate);
- }
- return DateNormalizationResult.getNoMatchResult(inputValue);
- }
-}
diff --git a/metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/dateextractors/PatternMonthNameDateExtractor.java b/metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/dateextractors/PatternMonthNameDateExtractor.java
deleted file mode 100644
index d5e594cc6e..0000000000
--- a/metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/dateextractors/PatternMonthNameDateExtractor.java
+++ /dev/null
@@ -1,92 +0,0 @@
-package eu.europeana.normalization.dates.extraction.dateextractors;
-
-import eu.europeana.normalization.dates.DateNormalizationExtractorMatchId;
-import eu.europeana.normalization.dates.DateNormalizationResult;
-import eu.europeana.normalization.dates.edtf.DateQualification;
-import eu.europeana.normalization.dates.edtf.InstantEdtfDate;
-import eu.europeana.normalization.dates.edtf.InstantEdtfDateBuilder;
-import eu.europeana.normalization.dates.extraction.DateExtractionException;
-import eu.europeana.normalization.dates.extraction.MonthMultilingual;
-import java.time.Month;
-import java.util.HashMap;
-import java.util.regex.Matcher;
-import java.util.regex.Pattern;
-
-/**
- * A date where the month is specified by its name or an abbreviation. Supports all the official languages of the European Union
- */
-public class PatternMonthNameDateExtractor extends AbstractDateExtractor {
-
- HashMap patternDayMonthYear = new HashMap<>(12);
- HashMap patternMonthDayYear = new HashMap<>(12);
- HashMap patternMonthYear = new HashMap<>(12);
-
- public PatternMonthNameDateExtractor() {
- MonthMultilingual months = new MonthMultilingual();
- for (Month month : Month.values()) {
- String monthNamesPattern = null;
- for (String m : months.getMonthStrings(month)) {
- if (monthNamesPattern == null) {
- monthNamesPattern = "(?";
- } else {
- monthNamesPattern += "|";
- }
- monthNamesPattern += m.replaceAll("\\.", "\\.");
- }
- monthNamesPattern += ")";
-
- patternDayMonthYear
- .put(month,
- Pattern.compile(
- "\\s*(?\\d\\d?)[ .,]([a-zA-Z]{0,2}[ .,])?" + monthNamesPattern
- + "[ .,]([a-zA-Z]{0,2}[ .,])?(?\\d{4})\\s*",
- Pattern.CASE_INSENSITIVE));
- patternMonthDayYear.put(month, Pattern.compile("\\s*" + monthNamesPattern
- + "[ .,]([a-zA-Z]{0,2}[ .,])?(?\\d\\d?)[ .,][a-zA-Z]{0,2}[ .,](?\\d{4})\\s*",
- Pattern.CASE_INSENSITIVE));
- patternMonthYear.put(month,
- Pattern.compile("\\s*" + monthNamesPattern + "[ .,]([a-zA-Z]{0,2}[ .,])?(?\\d{4})\\s*",
- Pattern.CASE_INSENSITIVE));
- }
- }
-
- @Override
- public DateNormalizationResult extract(String inputValue, DateQualification requestedDateQualification,
- boolean flexibleDateBuild) throws DateExtractionException {
- final DateQualification dateQualification = computeDateQualification(requestedDateQualification,
- () -> DateQualification.NO_QUALIFICATION);
-
- for (Month month : Month.values()) {
- Matcher m = patternDayMonthYear.get(month).matcher(inputValue);
- if (m.matches()) {
- final InstantEdtfDate datePart = new InstantEdtfDateBuilder(Integer.parseInt(m.group("year")))
- .withMonth(month.getValue())
- .withDay(Integer.parseInt(m.group("day")))
- .withDateQualification(dateQualification)
- .withFlexibleDateBuild(flexibleDateBuild)
- .build();
- return new DateNormalizationResult(DateNormalizationExtractorMatchId.MONTH_NAME, inputValue, datePart);
- }
- m = patternMonthDayYear.get(month).matcher(inputValue);
- if (m.matches()) {
- final InstantEdtfDate datePart = new InstantEdtfDateBuilder(Integer.parseInt(m.group("year")))
- .withMonth(month.getValue())
- .withDay(Integer.parseInt(m.group("day")))
- .withDateQualification(dateQualification)
- .withFlexibleDateBuild(flexibleDateBuild)
- .build();
- return new DateNormalizationResult(DateNormalizationExtractorMatchId.MONTH_NAME, inputValue, datePart);
- }
- m = patternMonthYear.get(month).matcher(inputValue);
- if (m.matches()) {
- final InstantEdtfDate datePart = new InstantEdtfDateBuilder(Integer.parseInt(m.group("year")))
- .withMonth(month.getValue())
- .withDateQualification(dateQualification)
- .withFlexibleDateBuild(flexibleDateBuild)
- .build();
- return new DateNormalizationResult(DateNormalizationExtractorMatchId.MONTH_NAME, inputValue, datePart);
- }
- }
- return DateNormalizationResult.getNoMatchResult(inputValue);
- }
-}
diff --git a/metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/extractors/AbstractDateExtractor.java b/metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/extractors/AbstractDateExtractor.java
new file mode 100644
index 0000000000..ab6a047b70
--- /dev/null
+++ b/metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/extractors/AbstractDateExtractor.java
@@ -0,0 +1,75 @@
+package eu.europeana.normalization.dates.extraction.extractors;
+
+import static eu.europeana.normalization.dates.DateNormalizationResult.getNoMatchResult;
+import static eu.europeana.normalization.dates.edtf.DateQualification.UNCERTAIN;
+import static java.lang.String.format;
+
+import eu.europeana.normalization.dates.DateNormalizationResult;
+import eu.europeana.normalization.dates.edtf.DateQualification;
+import eu.europeana.normalization.dates.extraction.DateExtractionException;
+import eu.europeana.normalization.dates.sanitize.DateFieldSanitizer;
+import java.lang.invoke.MethodHandles;
+import java.util.EnumSet;
+import java.util.Set;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Abstract class implementing interface {@link DateExtractor} with default functionality for all extractors
+ */
+public abstract class AbstractDateExtractor implements DateExtractor {
+
+  private static final Logger LOGGER = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
+  static final String OPTIONAL_QUESTION_MARK_REGEX = "\\??";
+
+  /**
+   * Reusable default check of the date qualification of an input: a leading or trailing '?' means uncertain.
+   *
+   * @param inputValue the input value, must not be null
+   * @return a mutable set that contains UNCERTAIN when the input starts or ends with '?', otherwise empty
+   */
+  public Set<DateQualification> getQualification(String inputValue) {
+    final Set<DateQualification> dateQualifications = EnumSet.noneOf(DateQualification.class);
+    if (inputValue.startsWith("?") || inputValue.endsWith("?")) {
+      dateQualifications.add(UNCERTAIN);
+    }
+    return dateQualifications;
+  }
+
+  /**
+   * Utility method for calling {@link DateExtractor#extract(String, boolean)} with flexibleDateBuild as true.
+   * A {@link DateExtractionException} thrown by the extraction is caught and reported as a no-match result.
+   *
+   * @param inputValue the input value
+   * @return the date normalization result
+   */
+  @Override
+  public DateNormalizationResult extractDateProperty(String inputValue) {
+    return getDateNormalizationResult(inputValue, true);
+  }
+
+  /**
+   * Utility method for calling {@link DateExtractor#extract(String, boolean)} with flexibleDateBuild as false.
+   * A {@link DateExtractionException} thrown by the extraction is caught and reported as a no-match result.
+   *
+   * @param inputValue the input value
+   * @return the date normalization result
+   */
+  @Override
+  public DateNormalizationResult extractGenericProperty(String inputValue) {
+    return getDateNormalizationResult(inputValue, false);
+  }
+
+  /** Sanitizes the input and runs the extraction; a {@link DateExtractionException} yields a no-match result. */
+  private DateNormalizationResult getDateNormalizationResult(String inputValue, boolean flexibleDateBuild) {
+    final String sanitizedValue = DateFieldSanitizer.cleanSpacesAndTrim(inputValue);
+    try {
+      return extract(sanitizedValue, flexibleDateBuild);
+    } catch (DateExtractionException e) {
+      if (LOGGER.isDebugEnabled()) {
+        LOGGER.debug(format("Date extraction failed %s: ", sanitizedValue), e);
+      }
+      return getNoMatchResult(inputValue);
+    }
+  }
+}
diff --git a/metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/extractors/AbstractRangeDateExtractor.java b/metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/extractors/AbstractRangeDateExtractor.java
new file mode 100644
index 0000000000..92a354ef5b
--- /dev/null
+++ b/metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/extractors/AbstractRangeDateExtractor.java
@@ -0,0 +1,69 @@
+package eu.europeana.normalization.dates.extraction.extractors;
+
+import eu.europeana.normalization.dates.DateNormalizationExtractorMatchId;
+import eu.europeana.normalization.dates.DateNormalizationResult;
+import eu.europeana.normalization.dates.edtf.InstantEdtfDate;
+import eu.europeana.normalization.dates.edtf.IntervalEdtfDate;
+import eu.europeana.normalization.dates.edtf.IntervalEdtfDateBuilder;
+import eu.europeana.normalization.dates.extraction.DateExtractionException;
+import eu.europeana.normalization.dates.extraction.DatesSeparator;
+import eu.europeana.normalization.dates.sanitize.DateFieldSanitizer;
+import java.util.Arrays;
+import java.util.List;
+import java.util.stream.Collectors;
+
+/**
+ * The abstract class adding the option to a reusable range extractor functionality.
+ * It is a generic way to capture ranges for all implementations. It is based on required methods being implemented from
+ * {@link RangeDateExtractor} interface.
+ *
+ * @param the object containing delimiters/separators for dates
+ */
+public abstract class AbstractRangeDateExtractor extends AbstractDateExtractor implements
+ RangeDateExtractor {
+
+ public static final int KEEP_EMPTY_SPLITS_LIMIT_VALUE = -1;
+ /**
+ * The date split has to be exactly two. This also guarantees that the separator used is not used for unknown characters.
+ */
+ public static final int VALID_SPLIT_SIZE = 2;
+
+ /**
+ * Extract the date normalization result for a range.
+ *
+ * The date is split in two boundaries using the {@link T} to provide the separators. The result will contain the first split
+ * that is exactly splitting the original value in two parts(boundaries) and those two boundaries are valid parsable boundaries
+ * or null if none found.
+ *
+ *
+ * @param inputValue the range value to attempt parsing
+ * @param flexibleDateBuild the flag indicating if during creating of the dates we are flexible with validation
+ * @return the date normalization result
+ * @throws DateExtractionException if anything happened during the extraction of the date
+ */
+ @Override
+ public DateNormalizationResult extract(String inputValue, boolean flexibleDateBuild) throws DateExtractionException {
+ DateNormalizationResult rangeDate = DateNormalizationResult.getNoMatchResult(inputValue);
+ for (T rangeDateQualifier : getRangeDateQualifiers()) {
+ final List sanitizedDateList =
+ Arrays.stream(inputValue.split(rangeDateQualifier.getStringRepresentation(), KEEP_EMPTY_SPLITS_LIMIT_VALUE))
+ .map(DateFieldSanitizer::cleanSpacesAndTrim).collect(
+ Collectors.toList());
+ if (sanitizedDateList.size() == VALID_SPLIT_SIZE) {
+ final DateNormalizationResultRangePair dateNormalizationResultRangePair = extractDateNormalizationResult(
+ sanitizedDateList.get(0), sanitizedDateList.get(1), rangeDateQualifier, flexibleDateBuild);
+ final DateNormalizationResult startResult = dateNormalizationResultRangePair.getStartDateNormalizationResult();
+ final DateNormalizationResult endResult = dateNormalizationResultRangePair.getEndDateNormalizationResult();
+ if (isRangeMatchSuccess(rangeDateQualifier, startResult, endResult)) {
+ final DateNormalizationExtractorMatchId dateNormalizationExtractorMatchId =
+ getDateNormalizationExtractorId(startResult, endResult);
+ final IntervalEdtfDate intervalEdtfDate = new IntervalEdtfDateBuilder((InstantEdtfDate) startResult.getEdtfDate(),
+ (InstantEdtfDate) endResult.getEdtfDate()).withAllowStartEndSwap(flexibleDateBuild).build();
+ rangeDate = new DateNormalizationResult(dateNormalizationExtractorMatchId, inputValue, intervalEdtfDate);
+ break;
+ }
+ }
+ }
+ return rangeDate;
+ }
+}
diff --git a/metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/extractors/BcAdDateExtractor.java b/metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/extractors/BcAdDateExtractor.java
new file mode 100644
index 0000000000..81e51a137e
--- /dev/null
+++ b/metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/extractors/BcAdDateExtractor.java
@@ -0,0 +1,70 @@
+package eu.europeana.normalization.dates.extraction.extractors;
+
+import static eu.europeana.normalization.dates.DateNormalizationResult.getNoMatchResult;
+import static java.util.regex.Pattern.compile;
+
+import eu.europeana.normalization.dates.DateNormalizationExtractorMatchId;
+import eu.europeana.normalization.dates.DateNormalizationResult;
+import eu.europeana.normalization.dates.edtf.InstantEdtfDateBuilder;
+import eu.europeana.normalization.dates.extraction.DateExtractionException;
+import eu.europeana.normalization.dates.extraction.EuropeanLanguage;
+import java.text.DateFormatSymbols;
+import java.util.HashSet;
+import java.util.Set;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+import java.util.stream.Collectors;
+import java.util.stream.Stream;
+
+/**
+ * A year with an indication of the era in european languages
+ *
+ * Some examples:
+ *
+ * - 1989 BC
+ * - 1989 AD
+ * - 1989 π.Χ.
+ * - 1989 μ.Χ.
+ *
+ *
+ */
+public class BcAdDateExtractor extends AbstractDateExtractor {
+
+ private static final String YEAR_REGEX = "(\\d{1,4})";
+ private static final String DELIMITERS_REGEX = " ";
+ private static final Set adAbbreviations = new HashSet<>();
+ private static final Pattern pattern;
+
+ static {
+ final Set bcAbbreviations = new HashSet<>();
+ for (EuropeanLanguage europeanLanguage : EuropeanLanguage.values()) {
+ final DateFormatSymbols symbols = DateFormatSymbols.getInstance(europeanLanguage.getLocale());
+ bcAbbreviations.add(symbols.getEras()[0]);
+ adAbbreviations.add(symbols.getEras()[1]);
+ }
+ final String abbreviationsJoinedValues = Stream.concat(bcAbbreviations.stream(), adAbbreviations.stream())
+ .map(Pattern::quote)
+ .collect(Collectors.joining("|", "(", ")"));
+ pattern = compile(String.join(DELIMITERS_REGEX, YEAR_REGEX, abbreviationsJoinedValues), Pattern.CASE_INSENSITIVE);
+ }
+
+ @Override
+ public DateNormalizationResult extract(String inputValue, boolean allowDayMonthSwap) throws DateExtractionException {
+ DateNormalizationResult dateNormalizationResult = getNoMatchResult(inputValue);
+
+ Matcher matcher = pattern.matcher(inputValue);
+ if (matcher.matches()) {
+ final int year = Integer.parseInt(matcher.group(1));
+ //Year should not be 0 on an era
+ if (year != 0) {
+ final boolean isAd = adAbbreviations.contains(matcher.group(2));
+ int yearSign = isAd ? 1 : -1;
+ int yearAdjusted = (isAd ? year : (year - 1)) * yearSign;
+ final InstantEdtfDateBuilder instantEdtfDateBuilder = new InstantEdtfDateBuilder(yearAdjusted);
+ dateNormalizationResult = new DateNormalizationResult(DateNormalizationExtractorMatchId.BC_AD, inputValue,
+ instantEdtfDateBuilder.build());
+ }
+ }
+ return dateNormalizationResult;
+ }
+}
diff --git a/metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/extractors/BcAdRangeDateExtractor.java b/metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/extractors/BcAdRangeDateExtractor.java
new file mode 100644
index 0000000000..171e8b92fe
--- /dev/null
+++ b/metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/extractors/BcAdRangeDateExtractor.java
@@ -0,0 +1,44 @@
+package eu.europeana.normalization.dates.extraction.extractors;
+
+import eu.europeana.normalization.dates.DateNormalizationExtractorMatchId;
+import eu.europeana.normalization.dates.DateNormalizationResult;
+import eu.europeana.normalization.dates.DateNormalizationResultStatus;
+import eu.europeana.normalization.dates.extraction.DateExtractionException;
+import eu.europeana.normalization.dates.extraction.DefaultDatesSeparator;
+import java.util.List;
+
+/**
+ * Extractor for BC and AD date ranges with variations in the separators of date components.
+ * We reuse the already existent {@link BcAdDateExtractor} code for the boundaries.
+ */
+public class BcAdRangeDateExtractor extends AbstractRangeDateExtractor {
+
+ private static final BcAdDateExtractor BC_AD_DATE_EXTRACTOR = new BcAdDateExtractor();
+
+ @Override
+ public DateNormalizationResultRangePair extractDateNormalizationResult(String startString, String endString,
+ DefaultDatesSeparator rangeDateDelimiters,
+ boolean allowDayMonthSwap) throws DateExtractionException {
+ return new DateNormalizationResultRangePair(
+ BC_AD_DATE_EXTRACTOR.extract(startString, allowDayMonthSwap),
+ BC_AD_DATE_EXTRACTOR.extract(endString, allowDayMonthSwap));
+ }
+
+ @Override
+ public List getRangeDateQualifiers() {
+ return List.of(DefaultDatesSeparator.values());
+ }
+
+ @Override
+ public boolean isRangeMatchSuccess(DefaultDatesSeparator rangeDateDelimiters, DateNormalizationResult startDateResult,
+ DateNormalizationResult endDateResult) {
+ return startDateResult.getDateNormalizationResultStatus() == DateNormalizationResultStatus.MATCHED
+ && endDateResult.getDateNormalizationResultStatus() == DateNormalizationResultStatus.MATCHED;
+ }
+
+ @Override
+ public DateNormalizationExtractorMatchId getDateNormalizationExtractorId(DateNormalizationResult startDateResult,
+ DateNormalizationResult endDateResult) {
+ return DateNormalizationExtractorMatchId.BC_AD;
+ }
+}
diff --git a/metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/extractors/BriefRangeDateExtractor.java b/metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/extractors/BriefRangeDateExtractor.java
new file mode 100644
index 0000000000..69353a8b7f
--- /dev/null
+++ b/metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/extractors/BriefRangeDateExtractor.java
@@ -0,0 +1,125 @@
+package eu.europeana.normalization.dates.extraction.extractors;
+
+import static eu.europeana.normalization.dates.DateNormalizationResult.getNoMatchResult;
+import static eu.europeana.normalization.dates.YearPrecision.CENTURY;
+
+import eu.europeana.normalization.dates.DateNormalizationExtractorMatchId;
+import eu.europeana.normalization.dates.DateNormalizationResult;
+import eu.europeana.normalization.dates.DateNormalizationResultStatus;
+import eu.europeana.normalization.dates.edtf.DateQualification;
+import eu.europeana.normalization.dates.edtf.InstantEdtfDate;
+import eu.europeana.normalization.dates.edtf.InstantEdtfDateBuilder;
+import eu.europeana.normalization.dates.extraction.DateExtractionException;
+import eu.europeana.normalization.dates.extraction.DefaultDatesSeparator;
+import java.time.Month;
+import java.util.List;
+import java.util.Set;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+/**
+ * Extractor that matches a date range where the end year includes only the rightmost two digits.
+ *
+ * The end year in this extractor has to:
+ *
+ * - Be higher than 12(or lower than -12) to avoid matching a month value from other extractors.
+ * - Be higher than the two rightmost digits of the start year.
+ *
+ *
+ *
+ * This pattern needs to be executed before the Edtf extractor because EDTF could potentially match yyyy/MM and yyyy-MM.
+ * Therefore in this extractor we check only the values that are higher than 12 to avoid a mismatch.
+ *
+ */
+public class BriefRangeDateExtractor extends AbstractRangeDateExtractor {
+
+ private static final Pattern YEAR_PATTERN = Pattern.compile(
+ OPTIONAL_QUESTION_MARK_REGEX + "(\\d{2,4})" + OPTIONAL_QUESTION_MARK_REGEX);
+
+ @Override
+ public DateNormalizationResultRangePair extractDateNormalizationResult(String startString,
+ String endString, DefaultDatesSeparator rangeDateDelimiters,
+ boolean allowDayMonthSwap) throws DateExtractionException {
+ final DateNormalizationResult startDateNormalizationResult =
+ extractStartDateNormalizationResult(startString, allowDayMonthSwap);
+ final DateNormalizationResult endDateNormalizationResult =
+ extractEndDateNormalizationResult(startDateNormalizationResult, endString, allowDayMonthSwap);
+ return new DateNormalizationResultRangePair(startDateNormalizationResult, endDateNormalizationResult);
+ }
+
+ private DateNormalizationResult extractStartDateNormalizationResult(String dateString, boolean allowDayMonthSwap)
+ throws DateExtractionException {
+ DateNormalizationResult dateNormalizationResult = getNoMatchResult(dateString);
+ final DateNormalizationResult startYearDateDateNormalizationResult = extractYear(dateString, allowDayMonthSwap);
+
+ if (startYearDateDateNormalizationResult.getDateNormalizationResultStatus() == DateNormalizationResultStatus.MATCHED) {
+ int absoluteYear = Math.abs(((InstantEdtfDate) startYearDateDateNormalizationResult.getEdtfDate()).getYear().getValue());
+ int startYearDigitsLength = (int) (Math.log10(absoluteYear) + 1);
+ if (startYearDigitsLength > 2) {
+ dateNormalizationResult = startYearDateDateNormalizationResult;
+ }
+ }
+
+ return dateNormalizationResult;
+ }
+
+ private DateNormalizationResult extractEndDateNormalizationResult(DateNormalizationResult startDateNormalizationResult,
+ String dateString, boolean allowDayMonthSwap) throws DateExtractionException {
+ DateNormalizationResult dateNormalizationResult = getNoMatchResult(dateString);
+ if (startDateNormalizationResult.getDateNormalizationResultStatus() == DateNormalizationResultStatus.MATCHED) {
+ final DateNormalizationResult endDateNormalizationResult = extractYear(dateString, allowDayMonthSwap);
+
+ if (endDateNormalizationResult.getDateNormalizationResultStatus() == DateNormalizationResultStatus.MATCHED) {
+ final Set endDateQualifications = endDateNormalizationResult.getEdtfDate().getDateQualifications();
+
+ final int startYearFourDigits = ((InstantEdtfDate) startDateNormalizationResult.getEdtfDate()).getYear().getValue();
+ final int startYearLastTwoDigits = startYearFourDigits % CENTURY.getDuration();
+ final int endYear = ((InstantEdtfDate) endDateNormalizationResult.getEdtfDate()).getYear().getValue();
+
+ int absoluteEndYear = Math.abs(endYear);
+ int endYearDigitsLength = (int) (Math.log10(absoluteEndYear) + 1);
+ if (endYearDigitsLength == 2 && Math.abs(endYear) > Month.DECEMBER.getValue() && startYearLastTwoDigits < endYear) {
+ final int endYearFourDigits = (startYearFourDigits / CENTURY.getDuration()) * CENTURY.getDuration() + endYear;
+ final InstantEdtfDate endInstantEdtfDate = new InstantEdtfDateBuilder(endYearFourDigits).withDateQualification(
+ endDateQualifications).withAllowDayMonthSwap(allowDayMonthSwap).build();
+ dateNormalizationResult = new DateNormalizationResult(DateNormalizationExtractorMatchId.BRIEF_DATE_RANGE, dateString,
+ endInstantEdtfDate);
+ }
+ }
+ }
+
+ return dateNormalizationResult;
+ }
+
+ private DateNormalizationResult extractYear(String inputValue, boolean allowDayMonthSwap) throws DateExtractionException {
+ DateNormalizationResult dateNormalizationResult = DateNormalizationResult.getNoMatchResult(inputValue);
+ final Matcher matcher = YEAR_PATTERN.matcher(inputValue);
+ if (matcher.matches()) {
+ final int year = Integer.parseInt(matcher.group(1));
+ final InstantEdtfDate instantEdtfDate = new InstantEdtfDateBuilder(year).withDateQualification(getQualification(inputValue))
+ .withAllowDayMonthSwap(allowDayMonthSwap).build();
+ dateNormalizationResult = new DateNormalizationResult(DateNormalizationExtractorMatchId.BRIEF_DATE_RANGE, inputValue,
+ instantEdtfDate);
+ }
+ return dateNormalizationResult;
+ }
+
+ @Override
+ public List getRangeDateQualifiers() {
+ return List.of(DefaultDatesSeparator.values());
+ }
+
+ @Override
+ public boolean isRangeMatchSuccess(DefaultDatesSeparator rangeDateDelimiters, DateNormalizationResult startDateResult,
+ DateNormalizationResult endDateResult) {
+ return startDateResult.getDateNormalizationResultStatus() == DateNormalizationResultStatus.MATCHED
+ && endDateResult.getDateNormalizationResultStatus() == DateNormalizationResultStatus.MATCHED;
+ }
+
+ @Override
+ public DateNormalizationExtractorMatchId getDateNormalizationExtractorId(DateNormalizationResult startDateResult,
+ DateNormalizationResult endDateResult) {
+ return DateNormalizationExtractorMatchId.BRIEF_DATE_RANGE;
+ }
+}
+
diff --git a/metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/extractors/CenturyNumericDateExtractor.java b/metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/extractors/CenturyNumericDateExtractor.java
new file mode 100644
index 0000000000..9668dc7ffd
--- /dev/null
+++ b/metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/extractors/CenturyNumericDateExtractor.java
@@ -0,0 +1,87 @@
+package eu.europeana.normalization.dates.extraction.extractors;
+
+import static eu.europeana.normalization.dates.YearPrecision.CENTURY;
+import static java.util.regex.Pattern.CASE_INSENSITIVE;
+import static java.util.regex.Pattern.compile;
+
+import eu.europeana.normalization.dates.DateNormalizationExtractorMatchId;
+import eu.europeana.normalization.dates.DateNormalizationResult;
+import eu.europeana.normalization.dates.edtf.InstantEdtfDate;
+import eu.europeana.normalization.dates.edtf.InstantEdtfDateBuilder;
+import eu.europeana.normalization.dates.extraction.DateExtractionException;
+import java.util.function.ToIntFunction;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+/**
+ * Extractor that matches a century with a decimal numerals.
+ * The range of values this accepts are from 1-21 including.
+ * Examples of some cases:
+ *
+ * -
+ * Value = 18.. | Outcome = 18XX
+ * Value = 1st century | Outcome = 00XX
+ *
+ *
+ *
+ */
+public class CenturyNumericDateExtractor extends AbstractDateExtractor {
+
+ private static final String NUMERIC_10_TO_21_ENDING_DOTS_REGEX = "(1\\d|2[0-1])\\.{2}";
+ private static final String NUMERIC_1_TO_21_SUFFIXED_REGEX = "(2?1st|2nd|3rd|(?:1\\d|[4-9]|20)th)\\scentury";
+
+ private enum CenturyNumericDatePattern {
+ PATTERN_YYYY(
+ compile(OPTIONAL_QUESTION_MARK_REGEX + NUMERIC_10_TO_21_ENDING_DOTS_REGEX + OPTIONAL_QUESTION_MARK_REGEX,
+ CASE_INSENSITIVE),
+ Integer::parseInt, DateNormalizationExtractorMatchId.CENTURY_NUMERIC),
+ PATTERN_ENGLISH(
+ compile(OPTIONAL_QUESTION_MARK_REGEX + NUMERIC_1_TO_21_SUFFIXED_REGEX + OPTIONAL_QUESTION_MARK_REGEX, CASE_INSENSITIVE),
+ century -> (Integer.parseInt(century.substring(0, century.length() - 2)) - 1),
+ DateNormalizationExtractorMatchId.CENTURY_NUMERIC);
+
+ private final Pattern pattern;
+ private final ToIntFunction centuryExtractorFunction;
+ private final DateNormalizationExtractorMatchId dateNormalizationExtractorMatchId;
+
+ CenturyNumericDatePattern(Pattern pattern, ToIntFunction centuryExtractorFunction,
+ DateNormalizationExtractorMatchId dateNormalizationExtractorMatchId) {
+ this.pattern = pattern;
+ this.centuryExtractorFunction = centuryExtractorFunction;
+ this.dateNormalizationExtractorMatchId = dateNormalizationExtractorMatchId;
+ }
+
+ public Pattern getPattern() {
+ return pattern;
+ }
+
+ public ToIntFunction getCenturyExtractorFunction() {
+ return centuryExtractorFunction;
+ }
+
+ public DateNormalizationExtractorMatchId getDateNormalizationExtractorMatchId() {
+ return dateNormalizationExtractorMatchId;
+ }
+ }
+
+ @Override
+ public DateNormalizationResult extract(String inputValue, boolean allowDayMonthSwap) throws DateExtractionException {
+ DateNormalizationResult dateNormalizationResult = DateNormalizationResult.getNoMatchResult(inputValue);
+ for (CenturyNumericDatePattern centerNumericDatePattern : CenturyNumericDatePattern.values()) {
+ final Matcher matcher = centerNumericDatePattern.getPattern().matcher(inputValue);
+ if (matcher.matches()) {
+ final String century = matcher.group(1);
+ InstantEdtfDateBuilder instantEdtfDateBuilder = new InstantEdtfDateBuilder(
+ centerNumericDatePattern.getCenturyExtractorFunction().applyAsInt(century))
+ .withYearPrecision(CENTURY);
+ InstantEdtfDate instantEdtfDate = instantEdtfDateBuilder.withDateQualification(getQualification(inputValue))
+ .withAllowDayMonthSwap(allowDayMonthSwap).build();
+ dateNormalizationResult =
+ new DateNormalizationResult(centerNumericDatePattern.getDateNormalizationExtractorMatchId(), inputValue,
+ instantEdtfDate);
+ break;
+ }
+ }
+ return dateNormalizationResult;
+ }
+}
diff --git a/metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/extractors/CenturyRomanDateExtractor.java b/metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/extractors/CenturyRomanDateExtractor.java
new file mode 100644
index 0000000000..6cc2ad3ae1
--- /dev/null
+++ b/metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/extractors/CenturyRomanDateExtractor.java
@@ -0,0 +1,48 @@
+package eu.europeana.normalization.dates.extraction.extractors;
+
+import static eu.europeana.normalization.dates.YearPrecision.CENTURY;
+import static java.util.regex.Pattern.CASE_INSENSITIVE;
+import static java.util.regex.Pattern.compile;
+
+import eu.europeana.normalization.dates.DateNormalizationExtractorMatchId;
+import eu.europeana.normalization.dates.DateNormalizationResult;
+import eu.europeana.normalization.dates.edtf.InstantEdtfDateBuilder;
+import eu.europeana.normalization.dates.extraction.DateExtractionException;
+import eu.europeana.normalization.dates.extraction.RomanToNumber;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+/**
+ * Extractor that matches a century with Roman numerals
+ * The range of values this accepts are from 1-21 including.
+ * The Roman numerals may also be preceded by an abbreviation of century, for example ‘s. XIX’.
+ * Examples of some cases:
+ *
+ * -
+ * Value = s. XX | Outcome = 19XX
+ * Value = s. XXI | Outcome = 20XX
+ *
+ *
+ *
+ */
+public class CenturyRomanDateExtractor extends AbstractDateExtractor {
+
+ private static final String CENTURY_PREFIX = "(?:(?:s|sec|saec)\\s|(?:s|sec|saec)\\.\\s?)?";
+ private static final String ROMAN_1_TO_21_REGEX = "(X?(?:IX|IV|VI{0,3}|I{1,3})|X|XXI?)";
+ private static final Pattern ROMAN_2_TO_21_PATTERN = compile(
+ OPTIONAL_QUESTION_MARK_REGEX + CENTURY_PREFIX + ROMAN_1_TO_21_REGEX + OPTIONAL_QUESTION_MARK_REGEX, CASE_INSENSITIVE);
+
+ @Override
+ public DateNormalizationResult extract(String inputValue, boolean allowDayMonthSwap) throws DateExtractionException {
+ DateNormalizationResult dateNormalizationResult = DateNormalizationResult.getNoMatchResult(inputValue);
+ final Matcher matcher = ROMAN_2_TO_21_PATTERN.matcher(inputValue);
+ if (matcher.matches()) {
+ final int century = RomanToNumber.romanToDecimal(matcher.group(1)) - 1;
+ final InstantEdtfDateBuilder instantEdtfDateBuilder =
+ new InstantEdtfDateBuilder(century).withYearPrecision(CENTURY).withDateQualification(getQualification(inputValue));
+ dateNormalizationResult = new DateNormalizationResult(DateNormalizationExtractorMatchId.CENTURY_ROMAN,
+ inputValue, instantEdtfDateBuilder.build());
+ }
+ return dateNormalizationResult;
+ }
+}
diff --git a/metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/extractors/CenturyRomanRangeDateExtractor.java b/metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/extractors/CenturyRomanRangeDateExtractor.java
new file mode 100644
index 0000000000..168d1a2a52
--- /dev/null
+++ b/metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/extractors/CenturyRomanRangeDateExtractor.java
@@ -0,0 +1,46 @@
+package eu.europeana.normalization.dates.extraction.extractors;
+
+import eu.europeana.normalization.dates.DateNormalizationExtractorMatchId;
+import eu.europeana.normalization.dates.DateNormalizationResult;
+import eu.europeana.normalization.dates.DateNormalizationResultStatus;
+import eu.europeana.normalization.dates.extraction.DateExtractionException;
+import eu.europeana.normalization.dates.extraction.DefaultDatesSeparator;
+import java.util.ArrayList;
+import java.util.EnumSet;
+import java.util.List;
+
+/**
+ * Extractor for Roman century ranges.
+ * We reuse the already existent {@link CenturyRomanDateExtractor} code for the boundaries.
+ */
+public class CenturyRomanRangeDateExtractor extends AbstractRangeDateExtractor {
+
+ private static final CenturyRomanDateExtractor ROMAN_CENTURY_DATE_EXTRACTOR = new CenturyRomanDateExtractor();
+
+ @Override
+ public DateNormalizationResultRangePair extractDateNormalizationResult(String startString, String endString,
+ DefaultDatesSeparator rangeDateDelimiters,
+ boolean allowDayMonthSwap) throws DateExtractionException {
+ return new DateNormalizationResultRangePair(
+ ROMAN_CENTURY_DATE_EXTRACTOR.extract(startString, allowDayMonthSwap),
+ ROMAN_CENTURY_DATE_EXTRACTOR.extract(endString, allowDayMonthSwap));
+ }
+
+ @Override
+ public List getRangeDateQualifiers() {
+ return new ArrayList<>(EnumSet.of(DefaultDatesSeparator.DASH_DELIMITER));
+ }
+
+ @Override
+ public boolean isRangeMatchSuccess(DefaultDatesSeparator rangeDateDelimiters, DateNormalizationResult startDateResult,
+ DateNormalizationResult endDateResult) {
+ return startDateResult.getDateNormalizationResultStatus() == DateNormalizationResultStatus.MATCHED
+ && endDateResult.getDateNormalizationResultStatus() == DateNormalizationResultStatus.MATCHED;
+ }
+
+ @Override
+ public DateNormalizationExtractorMatchId getDateNormalizationExtractorId(DateNormalizationResult startDateResult,
+ DateNormalizationResult endDateResult) {
+ return DateNormalizationExtractorMatchId.CENTURY_RANGE_ROMAN;
+ }
+}
diff --git a/metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/dateextractors/DateExtractor.java b/metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/extractors/DateExtractor.java
similarity index 58%
rename from metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/dateextractors/DateExtractor.java
rename to metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/extractors/DateExtractor.java
index 08ab31305e..0cbedfcfc5 100644
--- a/metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/dateextractors/DateExtractor.java
+++ b/metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/extractors/DateExtractor.java
@@ -1,11 +1,10 @@
-package eu.europeana.normalization.dates.extraction.dateextractors;
+package eu.europeana.normalization.dates.extraction.extractors;
import eu.europeana.normalization.dates.DateNormalizationResult;
-import eu.europeana.normalization.dates.edtf.DateQualification;
import eu.europeana.normalization.dates.extraction.DateExtractionException;
/**
- * The interface for all the implementation of date patterns
+ * The interface for date extractors.
*/
public interface DateExtractor {
@@ -13,30 +12,27 @@ public interface DateExtractor {
* Extractor of a date normalization operation.
*
* @param inputValue the value containing the date
- * @param requestedDateQualification the overwriting value of date qualification, if any
* @param flexibleDateBuild the flag indicating if during creating of the dates we are flexible with validation
* @return the date normalization result
* @throws DateExtractionException if anything happened during the extraction of the date
*/
- DateNormalizationResult extract(String inputValue, DateQualification requestedDateQualification, boolean flexibleDateBuild)
+ DateNormalizationResult extract(String inputValue, boolean flexibleDateBuild)
throws DateExtractionException;
/**
* Extractor of a date normalization operation for date properties
*
* @param inputValue the value containing the date
- * @param requestedDateQualification the overwriting value of date qualification, if any
* @return the date normalization result
*/
- DateNormalizationResult extractDateProperty(String inputValue, DateQualification requestedDateQualification);
+ DateNormalizationResult extractDateProperty(String inputValue);
/**
* Extractor of a date normalization operation for generic properties
*
* @param inputValue the value containing the date
- * @param requestedDateQualification the overwriting value of date qualification, if any
* @return the date normalization result
*/
- DateNormalizationResult extractGenericProperty(String inputValue, DateQualification requestedDateQualification);
+ DateNormalizationResult extractGenericProperty(String inputValue);
}
diff --git a/metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/dateextractors/DcmiPeriodDateExtractor.java b/metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/extractors/DcmiPeriodDateExtractor.java
similarity index 75%
rename from metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/dateextractors/DcmiPeriodDateExtractor.java
rename to metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/extractors/DcmiPeriodDateExtractor.java
index c2d23bae6e..a3ddde3604 100644
--- a/metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/dateextractors/DcmiPeriodDateExtractor.java
+++ b/metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/extractors/DcmiPeriodDateExtractor.java
@@ -1,9 +1,8 @@
-package eu.europeana.normalization.dates.extraction.dateextractors;
+package eu.europeana.normalization.dates.extraction.extractors;
import eu.europeana.normalization.dates.DateNormalizationExtractorMatchId;
import eu.europeana.normalization.dates.DateNormalizationResult;
import eu.europeana.normalization.dates.edtf.DateBoundaryType;
-import eu.europeana.normalization.dates.edtf.DateQualification;
import eu.europeana.normalization.dates.edtf.InstantEdtfDate;
import eu.europeana.normalization.dates.edtf.InstantEdtfDateBuilder;
import eu.europeana.normalization.dates.edtf.IntervalEdtfDate;
@@ -27,8 +26,8 @@ public class DcmiPeriodDateExtractor extends AbstractDateExtractor {
private static final String NON_SPACE_NON_SEMICOLON = "[^\\s;]*";
private static final String NON_SPACE_NON_LINE_END = "[^\\s$]*";
private static final String VALUE_ENDING = "(?:;|$)";
- private static final String SPACE_VALUE_ENDING = "\\s*" + VALUE_ENDING;
- private static final String EQUALS_SPACES_WRAPPED = "\\s*=\\s*";
+ private static final String SPACE_VALUE_ENDING = "\\s?" + VALUE_ENDING;
+ private static final String EQUALS_SPACES_WRAPPED = "\\s?=\\s?";
private static final String DCMI_FIELD_REGEX =
EQUALS_SPACES_WRAPPED + "(" + NON_SPACE_NON_SEMICOLON + "|" + NON_SPACE_NON_LINE_END + ")" + SPACE_VALUE_ENDING;
private static final Pattern DCMI_PERIOD_SCHEME_PATTERN = Pattern.compile("scheme" + DCMI_FIELD_REGEX);
@@ -48,22 +47,22 @@ public class DcmiPeriodDateExtractor extends AbstractDateExtractor {
private static final Set W3C_DTF_SCHEME_VALUES = Set.of("W3C-DTF", "W3CDTF");
@Override
- public DateNormalizationResult extract(String value, DateQualification requestedDateQualification,
- boolean flexibleDateBuild) throws DateExtractionException {
+ public DateNormalizationResult extract(String value, boolean flexibleDateBuild) throws DateExtractionException {
DateNormalizationResult dateNormalizationResult = DateNormalizationResult.getNoMatchResult(value);
if (isValidScheme(value)) {
Matcher matcher = DCMI_PERIOD_START_PATTERN.matcher(value);
- InstantEdtfDate start = extractDate(matcher, requestedDateQualification, flexibleDateBuild);
+ final InstantEdtfDate start = extractDate(matcher, flexibleDateBuild);
matcher = DCMI_PERIOD_END_PATTERN.matcher(value);
- InstantEdtfDate end = extractDate(matcher, requestedDateQualification, flexibleDateBuild);
+ final InstantEdtfDate end = extractDate(matcher, flexibleDateBuild);
String name = extractName(value);
//At least one end has to be specified
if (start.getDateBoundaryType() == DateBoundaryType.DECLARED || end.getDateBoundaryType() == DateBoundaryType.DECLARED) {
- IntervalEdtfDate intervalEdtfDate = new IntervalEdtfDateBuilder(start, end).withLabel(name)
- .withFlexibleDateBuild(
- flexibleDateBuild)
- .build();
+ final IntervalEdtfDate intervalEdtfDate =
+ new IntervalEdtfDateBuilder(start, end)
+ .withLabel(name)
+ .withAllowStartEndSwap(flexibleDateBuild)
+ .build();
dateNormalizationResult = new DateNormalizationResult(DateNormalizationExtractorMatchId.DCMI_PERIOD, value,
intervalEdtfDate);
}
@@ -97,17 +96,13 @@ private static boolean isValidScheme(String dcmiPeriod) {
return isValidScheme;
}
- private InstantEdtfDate extractDate(Matcher matcher, DateQualification requestedDateQualification,
- boolean allowSwitchMonthDay) throws DateExtractionException {
+ private InstantEdtfDate extractDate(Matcher matcher, boolean allowDayMonthSwap) throws DateExtractionException {
InstantEdtfDate instantEdtfDate = null;
if (matcher.find()) {
final String fieldValue = matcher.group(1);
if (StringUtils.isNotBlank(fieldValue)) {
TemporalAccessor temporalAccessor = ISO_8601_PARSER.parseDatePart(fieldValue);
- DateQualification dateQualification = computeDateQualification(requestedDateQualification,
- () -> DateQualification.NO_QUALIFICATION);
- instantEdtfDate = new InstantEdtfDateBuilder(temporalAccessor).withDateQualification(dateQualification)
- .withFlexibleDateBuild(allowSwitchMonthDay).build();
+ instantEdtfDate = new InstantEdtfDateBuilder(temporalAccessor).withAllowDayMonthSwap(allowDayMonthSwap).build();
}
//if we find it again we declare invalid
if (matcher.find()) {
diff --git a/metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/dateextractors/DecadeDateExtractor.java b/metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/extractors/DecadeDateExtractor.java
similarity index 59%
rename from metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/dateextractors/DecadeDateExtractor.java
rename to metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/extractors/DecadeDateExtractor.java
index 94be631ddc..abd14233e1 100644
--- a/metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/dateextractors/DecadeDateExtractor.java
+++ b/metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/extractors/DecadeDateExtractor.java
@@ -1,16 +1,12 @@
-package eu.europeana.normalization.dates.extraction.dateextractors;
+package eu.europeana.normalization.dates.extraction.extractors;
import static eu.europeana.normalization.dates.YearPrecision.DECADE;
-import static eu.europeana.normalization.dates.edtf.DateQualification.NO_QUALIFICATION;
-import static eu.europeana.normalization.dates.edtf.DateQualification.UNCERTAIN;
import eu.europeana.normalization.dates.DateNormalizationExtractorMatchId;
import eu.europeana.normalization.dates.DateNormalizationResult;
-import eu.europeana.normalization.dates.edtf.DateQualification;
import eu.europeana.normalization.dates.edtf.InstantEdtfDate;
import eu.europeana.normalization.dates.edtf.InstantEdtfDateBuilder;
import eu.europeana.normalization.dates.extraction.DateExtractionException;
-import eu.europeana.normalization.dates.sanitize.DateFieldSanitizer;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
@@ -35,22 +31,18 @@
*/
public class DecadeDateExtractor extends AbstractDateExtractor {
- private static final Pattern decadePattern = Pattern.compile("\\??(\\d{3})(?:[XU]\\??|\\?\\?)", Pattern.CASE_INSENSITIVE);
+ private static final Pattern decadePattern = Pattern.compile(
+ OPTIONAL_QUESTION_MARK_REGEX + "(\\d{3})(?:[XU]" + OPTIONAL_QUESTION_MARK_REGEX + "|\\?\\?)", Pattern.CASE_INSENSITIVE);
@Override
- public DateNormalizationResult extract(String inputValue, DateQualification requestedDateQualification,
- boolean flexibleDateBuild) throws DateExtractionException {
- final String sanitizedValue = DateFieldSanitizer.cleanSpacesAndTrim(inputValue);
- final DateQualification dateQualification = computeDateQualification(requestedDateQualification, () ->
- (sanitizedValue.startsWith("?") || sanitizedValue.endsWith("?")) ? UNCERTAIN : NO_QUALIFICATION);
-
+ public DateNormalizationResult extract(String inputValue, boolean allowDayMonthSwap) throws DateExtractionException {
DateNormalizationResult dateNormalizationResult = DateNormalizationResult.getNoMatchResult(inputValue);
- final Matcher matcher = decadePattern.matcher(sanitizedValue);
+ final Matcher matcher = decadePattern.matcher(inputValue);
if (matcher.matches()) {
final InstantEdtfDate datePart = new InstantEdtfDateBuilder(Integer.parseInt(matcher.group(1)))
.withYearPrecision(DECADE)
- .withDateQualification(dateQualification)
- .withFlexibleDateBuild(flexibleDateBuild)
+ .withDateQualification(getQualification(inputValue))
+ .withAllowDayMonthSwap(allowDayMonthSwap)
.build();
dateNormalizationResult = new DateNormalizationResult(DateNormalizationExtractorMatchId.DECADE, inputValue, datePart);
}
diff --git a/metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/extractors/EdtfDateExtractor.java b/metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/extractors/EdtfDateExtractor.java
new file mode 100644
index 0000000000..cf0f67477d
--- /dev/null
+++ b/metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/extractors/EdtfDateExtractor.java
@@ -0,0 +1,95 @@
+package eu.europeana.normalization.dates.extraction.extractors;
+
+import static eu.europeana.normalization.dates.edtf.InstantEdtfDateBuilder.OVER_4_DIGITS_YEAR_PREFIX;
+import static eu.europeana.normalization.dates.edtf.InstantEdtfDateBuilder.THRESHOLD_4_DIGITS_YEAR;
+
+import eu.europeana.normalization.dates.DateNormalizationExtractorMatchId;
+import eu.europeana.normalization.dates.DateNormalizationResult;
+import eu.europeana.normalization.dates.edtf.DateQualification;
+import eu.europeana.normalization.dates.edtf.InstantEdtfDate;
+import eu.europeana.normalization.dates.edtf.InstantEdtfDateBuilder;
+import eu.europeana.normalization.dates.edtf.Iso8601Parser;
+import eu.europeana.normalization.dates.extraction.DateExtractionException;
+import java.lang.invoke.MethodHandles;
+import java.time.temporal.TemporalAccessor;
+import java.util.EnumSet;
+import java.util.Set;
+import java.util.regex.Matcher;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Extractor for EDTF dates that is also compatible with ISO 8601 dates.
+ * This parser partially supports Level 0 and Level 1 of the Extended
+ * Date/Time Format (EDTF) Specification. It only validates the date part of a date; the time, if present, is discarded.
+ * Specifically, from Level 1, seasons and unspecified digit(s) from the right are not supported.
+ *
+ */
+public class EdtfDateExtractor extends AbstractDateExtractor {
+
+ private static final Logger LOGGER = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
+ private static final Iso8601Parser ISO_8601_PARSER = new Iso8601Parser();
+
+ @Override
+ public DateNormalizationResult extract(String inputValue, boolean allowDayMonthSwap) throws DateExtractionException {
+ final InstantEdtfDate instantEdtfDate = extractInstant(inputValue, allowDayMonthSwap);
+ return new DateNormalizationResult(DateNormalizationExtractorMatchId.EDTF, inputValue, instantEdtfDate);
+ }
+
+ private InstantEdtfDate extractInstant(String dateInput, boolean allowDayMonthSwap) throws DateExtractionException {
+ final InstantEdtfDate instantEdtfDate;
+ final Integer moreThanFourDigitsYear = getMoreThanFourDigitsYear(dateInput);
+ if (moreThanFourDigitsYear != null) {
+ instantEdtfDate = new InstantEdtfDateBuilder(moreThanFourDigitsYear).withMoreThanFourDigitsYear().build();
+ } else {
+ instantEdtfDate = extractInstantEdtfDate(dateInput, allowDayMonthSwap);
+ }
+ return instantEdtfDate;
+ }
+
+ private static Integer getMoreThanFourDigitsYear(String dateInput) {
+ final boolean startsWithY = dateInput.startsWith(String.valueOf(OVER_4_DIGITS_YEAR_PREFIX));
+ Integer longYear = null;
+ if (startsWithY) {
+ final String yearSubstring = dateInput.substring(1);
+ try {
+ //Try parsing year
+ longYear = Integer.parseInt(yearSubstring);
+ } catch (NumberFormatException er) {
+ LOGGER.debug("Not a valid integer at this stage");
+ }
+ //If prefixed we have to be strict on the length
+ if (longYear != null && Math.abs(longYear) <= THRESHOLD_4_DIGITS_YEAR) {
+ longYear = null;
+ }
+ }
+ return longYear;
+ }
+
+ @Override
+ public Set getQualification(String inputValue) {
+ final Matcher qualificationMatcher = DateQualification.PATTERN.matcher(inputValue);
+ Set dateQualifications = EnumSet.noneOf(DateQualification.class);
+ if (qualificationMatcher.matches()) {
+ final String modifier = qualificationMatcher.group(1);
+ dateQualifications = DateQualification.fromCharacter(String.valueOf(modifier.charAt(0)));
+ }
+ return dateQualifications;
+ }
+
+ private InstantEdtfDate extractInstantEdtfDate(String inputValue, boolean allowDayMonthSwap)
+ throws DateExtractionException {
+ final Set dateQualifications = getQualification(inputValue);
+ String dateInputStrippedModifier = inputValue;
+ if (!dateQualifications.isEmpty()) {
+ dateInputStrippedModifier = inputValue.substring(0, inputValue.length() - 1);
+ }
+
+ final TemporalAccessor temporalAccessor = ISO_8601_PARSER.parseDatePart(dateInputStrippedModifier);
+ return new InstantEdtfDateBuilder(temporalAccessor)
+ .withDateQualification(dateQualifications)
+ .withAllowDayMonthSwap(allowDayMonthSwap)
+ .build();
+ }
+
+}
diff --git a/metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/extractors/EdtfRangeDateExtractor.java b/metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/extractors/EdtfRangeDateExtractor.java
new file mode 100644
index 0000000000..959ee3d04f
--- /dev/null
+++ b/metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/extractors/EdtfRangeDateExtractor.java
@@ -0,0 +1,75 @@
+package eu.europeana.normalization.dates.extraction.extractors;
+
+import static eu.europeana.normalization.dates.DateNormalizationResult.getNoMatchResult;
+import static eu.europeana.normalization.dates.edtf.DateBoundaryType.OPEN;
+import static eu.europeana.normalization.dates.edtf.DateBoundaryType.UNKNOWN;
+
+import eu.europeana.normalization.dates.DateNormalizationExtractorMatchId;
+import eu.europeana.normalization.dates.DateNormalizationResult;
+import eu.europeana.normalization.dates.DateNormalizationResultStatus;
+import eu.europeana.normalization.dates.edtf.InstantEdtfDate;
+import eu.europeana.normalization.dates.extraction.DateExtractionException;
+import eu.europeana.normalization.dates.extraction.DefaultDatesSeparator;
+import java.util.ArrayList;
+import java.util.EnumSet;
+import java.util.List;
+
+/**
+ * Extractor for EDTF date ranges.
+ * We reuse the existing {@link EdtfDateExtractor} code for the boundaries.
+ */
+public class EdtfRangeDateExtractor extends AbstractRangeDateExtractor {
+
+ private static final EdtfDateExtractor EDTF_DATE_EXTRACTOR = new EdtfDateExtractor();
+
+ @Override
+ public List getRangeDateQualifiers() {
+ return new ArrayList<>(EnumSet.of(DefaultDatesSeparator.SLASH_DELIMITER));
+ }
+
+ @Override
+ public DateNormalizationResultRangePair extractDateNormalizationResult(String startString, String endString,
+ DefaultDatesSeparator rangeDateDelimiters, boolean allowDayMonthSwap)
+ throws DateExtractionException {
+ DateNormalizationResult startDateNormalizationResult = extractInstant(startString, allowDayMonthSwap);
+ DateNormalizationResult endDateNormalizationResult = extractInstant(endString, allowDayMonthSwap);
+ final InstantEdtfDate startInstantEdtfDate = (InstantEdtfDate) startDateNormalizationResult.getEdtfDate();
+ final InstantEdtfDate endInstantEdtfDate = (InstantEdtfDate) endDateNormalizationResult.getEdtfDate();
+
+ //If both ends are unknown or open, then it is not a date
+ if ((startInstantEdtfDate.getDateBoundaryType() == UNKNOWN || startInstantEdtfDate.getDateBoundaryType() == OPEN) &&
+ (endInstantEdtfDate.getDateBoundaryType() == UNKNOWN || endInstantEdtfDate.getDateBoundaryType() == OPEN)) {
+ startDateNormalizationResult = getNoMatchResult(startString);
+ endDateNormalizationResult = getNoMatchResult(endString);
+ }
+
+ return new DateNormalizationResultRangePair(startDateNormalizationResult, endDateNormalizationResult);
+ }
+
+ private DateNormalizationResult extractInstant(String dateInput, boolean allowDayMonthSwap) throws DateExtractionException {
+ final DateNormalizationResult dateNormalizationResult;
+ if (UNKNOWN.getDeserializedRepresentation().equals(dateInput)) {
+ dateNormalizationResult = new DateNormalizationResult(DateNormalizationExtractorMatchId.EDTF, dateInput,
+ InstantEdtfDate.getUnknownInstance());
+ } else if (OPEN.getDeserializedRepresentation().equals(dateInput)) {
+ dateNormalizationResult = new DateNormalizationResult(DateNormalizationExtractorMatchId.EDTF, dateInput,
+ InstantEdtfDate.getOpenInstance());
+ } else {
+ dateNormalizationResult = EDTF_DATE_EXTRACTOR.extract(dateInput, allowDayMonthSwap);
+ }
+ return dateNormalizationResult;
+ }
+
+ @Override
+ public boolean isRangeMatchSuccess(DefaultDatesSeparator rangeDateDelimiters, DateNormalizationResult startDateResult,
+ DateNormalizationResult endDateResult) {
+ return startDateResult.getDateNormalizationResultStatus() == DateNormalizationResultStatus.MATCHED
+ && endDateResult.getDateNormalizationResultStatus() == DateNormalizationResultStatus.MATCHED;
+ }
+
+ @Override
+ public DateNormalizationExtractorMatchId getDateNormalizationExtractorId(DateNormalizationResult startDateResult,
+ DateNormalizationResult endDateResult) {
+ return DateNormalizationExtractorMatchId.EDTF;
+ }
+}
diff --git a/metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/extractors/FullDateDateExtractor.java b/metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/extractors/FullDateDateExtractor.java
new file mode 100644
index 0000000000..40df096398
--- /dev/null
+++ b/metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/extractors/FullDateDateExtractor.java
@@ -0,0 +1,92 @@
+package eu.europeana.normalization.dates.extraction.extractors;
+
+import static java.lang.String.format;
+import static java.time.format.DateTimeFormatter.ofPattern;
+
+import eu.europeana.normalization.dates.DateNormalizationExtractorMatchId;
+import eu.europeana.normalization.dates.DateNormalizationResult;
+import eu.europeana.normalization.dates.edtf.InstantEdtfDate;
+import eu.europeana.normalization.dates.edtf.InstantEdtfDateBuilder;
+import eu.europeana.normalization.dates.extraction.DateExtractionException;
+import eu.europeana.normalization.dates.extraction.EuropeanLanguage;
+import java.lang.invoke.MethodHandles;
+import java.time.LocalDateTime;
+import java.time.format.DateTimeFormatter;
+import java.time.format.DateTimeFormatterBuilder;
+import java.time.format.DateTimeParseException;
+import java.time.temporal.ChronoField;
+import java.util.LinkedList;
+import java.util.List;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * A full date pattern that does not follow a particular standard.
+ * If a timezone, with or without an offset, is present, it is discarded and the date part is taken as is, without any
+ * adjustment. For example, the date "Wed Nov 01 01:00:00 CEST 1989" will be parsed as "1989-11-01" and not as "1989-10-31".
+ *
+ * Examples:
+ *
+ * - Wed Nov 01 01:00:00 CEST 1989
+ * - 1989-11-01 04:05:06 UTC+01
+ * - 1989-11-01 01:02:03
+ *
+ *
+ */
+public class FullDateDateExtractor extends AbstractDateExtractor {
+
+ private static final Logger LOGGER = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
+ private static final List DATE_TIME_FORMATTERS = new LinkedList<>();
+
+ public static final int MIN_MILLISECONDS_WIDTH = 0;
+ public static final int MAX_MILLISECONDS_WIDTH = 3;
+
+ static {
+ DATE_TIME_FORMATTERS.add(
+ new DateTimeFormatterBuilder()
+ .append(ofPattern("EEE MMM dd HH:mm:ss zzz"))
+ .appendOptional(ofPattern("x"))
+ .append(ofPattern(" yyyy"))
+ .toFormatter()
+ );
+ DATE_TIME_FORMATTERS.add(
+ new DateTimeFormatterBuilder()
+ .append(ofPattern("yyyy-MM-dd HH:mm:ss"))
+ .appendFraction(ChronoField.MILLI_OF_SECOND, MIN_MILLISECONDS_WIDTH, MAX_MILLISECONDS_WIDTH, true)
+ .optionalStart()
+ .append(ofPattern(" zzz"))
+ .appendOptional(ofPattern("x"))
+ .optionalEnd()
+ .toFormatter()
+ );
+ }
+
+ @Override
+ public DateNormalizationResult extract(String inputValue, boolean allowDayMonthSwap) throws DateExtractionException {
+ for (DateTimeFormatter dateTimeFormatter : DATE_TIME_FORMATTERS) {
+ final LocalDateTime localDateTime = parseDateWithLocales(inputValue, dateTimeFormatter);
+ if (localDateTime != null) {
+ final InstantEdtfDate instantEdtfDate = new InstantEdtfDateBuilder(localDateTime)
+ .withAllowDayMonthSwap(allowDayMonthSwap)
+ .build();
+ return new DateNormalizationResult(DateNormalizationExtractorMatchId.FORMATTED_FULL_DATE, inputValue, instantEdtfDate);
+ }
+ }
+ return DateNormalizationResult.getNoMatchResult(inputValue);
+ }
+
+ private static LocalDateTime parseDateWithLocales(String inputValue, DateTimeFormatter dateTimeFormatter) {
+ LocalDateTime localDateTime = null;
+ for (EuropeanLanguage europeanLanguage : EuropeanLanguage.values()) {
+ final DateTimeFormatter dateTimeFormatterWithLocale = dateTimeFormatter.withLocale(europeanLanguage.getLocale());
+ try {
+ localDateTime = LocalDateTime.parse(inputValue, dateTimeFormatterWithLocale);
+ break;
+ } catch (DateTimeParseException e) {
+ LOGGER.debug(format("Parsing date failed with date time formatter: %s, and locale: %s", dateTimeFormatterWithLocale,
+ dateTimeFormatterWithLocale.getLocale()), e);
+ }
+ }
+ return localDateTime;
+ }
+}
diff --git a/metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/extractors/LongNegativeYearDateExtractor.java b/metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/extractors/LongNegativeYearDateExtractor.java
new file mode 100644
index 0000000000..6cabab5821
--- /dev/null
+++ b/metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/extractors/LongNegativeYearDateExtractor.java
@@ -0,0 +1,35 @@
+package eu.europeana.normalization.dates.extraction.extractors;
+
+import eu.europeana.normalization.dates.DateNormalizationExtractorMatchId;
+import eu.europeana.normalization.dates.DateNormalizationResult;
+import eu.europeana.normalization.dates.edtf.InstantEdtfDate;
+import eu.europeana.normalization.dates.edtf.InstantEdtfDateBuilder;
+import eu.europeana.normalization.dates.extraction.DateExtractionException;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+/**
+ * A year before 1 AD with more than 4 digits. This pattern is typically used in archaeological contexts. The year may contain
+ * between 5 and 9 digits. Ranges of this kind of year are handled by {@link LongNegativeYearRangeDateExtractor}.
+ */
+public class LongNegativeYearDateExtractor extends AbstractDateExtractor {
+
+ private static final Pattern YEAR_PATTERN = Pattern.compile("(-?\\d{5,9})");
+
+ @Override
+ public DateNormalizationResult extract(String inputValue, boolean allowDayMonthSwap) throws DateExtractionException {
+ DateNormalizationResult dateNormalizationResult = DateNormalizationResult.getNoMatchResult(inputValue);
+ final Matcher matcher = YEAR_PATTERN.matcher(inputValue);
+ if (matcher.matches()) {
+ final int year = Integer.parseInt(matcher.group(1));
+ final InstantEdtfDate instantEdtfDate =
+ new InstantEdtfDateBuilder(year).withDateQualification(getQualification(inputValue))
+ .withMoreThanFourDigitsYear()
+ .withAllowDayMonthSwap(allowDayMonthSwap).build();
+ dateNormalizationResult = new DateNormalizationResult(DateNormalizationExtractorMatchId.LONG_NEGATIVE_YEAR, inputValue,
+ instantEdtfDate);
+ }
+ return dateNormalizationResult;
+ }
+
+}
diff --git a/metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/extractors/LongNegativeYearRangeDateExtractor.java b/metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/extractors/LongNegativeYearRangeDateExtractor.java
new file mode 100644
index 0000000000..b965a8cbdf
--- /dev/null
+++ b/metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/extractors/LongNegativeYearRangeDateExtractor.java
@@ -0,0 +1,46 @@
+package eu.europeana.normalization.dates.extraction.extractors;
+
+import eu.europeana.normalization.dates.DateNormalizationExtractorMatchId;
+import eu.europeana.normalization.dates.DateNormalizationResult;
+import eu.europeana.normalization.dates.DateNormalizationResultStatus;
+import eu.europeana.normalization.dates.extraction.DateExtractionException;
+import eu.europeana.normalization.dates.extraction.DefaultDatesSeparator;
+import java.util.ArrayList;
+import java.util.EnumSet;
+import java.util.List;
+
+/**
+ * A range of years before 1 AD with more than 4 digits. This pattern is typically used in archaeological contexts. Each year
+ * may contain between 5 and 9 digits. We reuse the {@link LongNegativeYearDateExtractor} code for the boundaries.
+ */
+public class LongNegativeYearRangeDateExtractor extends AbstractRangeDateExtractor {
+
+ private static final LongNegativeYearDateExtractor LONG_NEGATIVE_YEAR_DATE_EXTRACTOR = new LongNegativeYearDateExtractor();
+
+ @Override
+ public List getRangeDateQualifiers() {
+ return new ArrayList<>(EnumSet.of(DefaultDatesSeparator.SLASH_DELIMITER));
+ }
+
+ @Override
+ public DateNormalizationResultRangePair extractDateNormalizationResult(String startString, String endString,
+ DefaultDatesSeparator rangeDateDelimiters, boolean allowDayMonthSwap)
+ throws DateExtractionException {
+ return new DateNormalizationResultRangePair(
+ LONG_NEGATIVE_YEAR_DATE_EXTRACTOR.extract(startString, allowDayMonthSwap),
+ LONG_NEGATIVE_YEAR_DATE_EXTRACTOR.extract(endString, allowDayMonthSwap));
+ }
+
+ @Override
+ public boolean isRangeMatchSuccess(DefaultDatesSeparator rangeDateDelimiters, DateNormalizationResult startDateResult,
+ DateNormalizationResult endDateResult) {
+ return startDateResult.getDateNormalizationResultStatus() == DateNormalizationResultStatus.MATCHED
+ && endDateResult.getDateNormalizationResultStatus() == DateNormalizationResultStatus.MATCHED;
+ }
+
+ @Override
+ public DateNormalizationExtractorMatchId getDateNormalizationExtractorId(DateNormalizationResult startDateResult,
+ DateNormalizationResult endDateResult) {
+ return DateNormalizationExtractorMatchId.LONG_NEGATIVE_YEAR;
+ }
+}
diff --git a/metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/extractors/MonthNameDateExtractor.java b/metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/extractors/MonthNameDateExtractor.java
new file mode 100644
index 0000000000..3df77433af
--- /dev/null
+++ b/metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/extractors/MonthNameDateExtractor.java
@@ -0,0 +1,117 @@
+package eu.europeana.normalization.dates.extraction.extractors;
+
+import static eu.europeana.normalization.dates.DateNormalizationResult.getNoMatchResult;
+import static eu.europeana.normalization.dates.extraction.DatePartsIndices.DMY_INDICES;
+import static eu.europeana.normalization.dates.extraction.DatePartsIndices.MDY_INDICES;
+import static eu.europeana.normalization.dates.extraction.DatePartsIndices.MY_INDICES;
+import static java.util.regex.Pattern.compile;
+
+import eu.europeana.normalization.dates.DateNormalizationExtractorMatchId;
+import eu.europeana.normalization.dates.DateNormalizationResult;
+import eu.europeana.normalization.dates.DateNormalizationResultStatus;
+import eu.europeana.normalization.dates.edtf.InstantEdtfDate;
+import eu.europeana.normalization.dates.edtf.InstantEdtfDateBuilder;
+import eu.europeana.normalization.dates.extraction.DateExtractionException;
+import eu.europeana.normalization.dates.extraction.DatePartsIndices;
+import eu.europeana.normalization.dates.extraction.MonthMultilingual;
+import java.lang.invoke.MethodHandles;
+import java.time.Month;
+import java.util.Arrays;
+import java.util.Optional;
+import java.util.Set;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+import java.util.stream.Collectors;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Extractor that matches dates which contain months represented by their name, in all 24 European languages.
+ *
+ * Examples of some cases:
+ *
+ * - 01 November 1989
+ * - 01.November.1989
+ * - 01,November,1989
+ * - November 01 1989
+ * - November 1989
+ *
+ *
+ */
+public class MonthNameDateExtractor extends AbstractDateExtractor {
+
+ private static final Logger LOGGER = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
+ private static final String DELIMITERS_REGEX = "[ .,]";
+ private static final String YEAR_REGEX = "(\\d{4})";
+ private static final String DAY_REGEX = "(\\d{1,2})";
+
+ private static final MonthMultilingual monthMultilingual = new MonthMultilingual();
+ private static final String MONTH_JOINED_VALUES =
+ monthMultilingual.getMonthToAllLanguagesStringsMap().values().stream().flatMap(Set::stream)
+ .map(Pattern::quote)
+ .collect(Collectors.joining("|", "(", ")"));
+
+ private enum MonthNameDatePattern {
+ DAY_MONTH_YEAR_PATTERN(compilePattern(new String[]{DAY_REGEX, MONTH_JOINED_VALUES, YEAR_REGEX}), DMY_INDICES),
+ MONTH_DAY_YEAR_PATTERN(compilePattern(new String[]{MONTH_JOINED_VALUES, DAY_REGEX, YEAR_REGEX}), MDY_INDICES),
+ MONTH_YEAR_PATTERN(compilePattern(new String[]{MONTH_JOINED_VALUES, YEAR_REGEX}), MY_INDICES);
+
+ private final Pattern pattern;
+ private final DatePartsIndices datePartsIndices;
+
+ MonthNameDatePattern(Pattern pattern, DatePartsIndices datePartsIndices) {
+ this.pattern = pattern;
+ this.datePartsIndices = datePartsIndices;
+ }
+
+ public Pattern getPattern() {
+ return pattern;
+ }
+
+ public DatePartsIndices getDatePartsIndices() {
+ return datePartsIndices;
+ }
+ }
+
+ private static Pattern compilePattern(String[] parts) {
+ return compile(String.join(DELIMITERS_REGEX, parts), Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CASE);
+ }
+
+ @Override
+ public DateNormalizationResult extract(String inputValue, boolean allowDayMonthSwap) throws DateExtractionException {
+ return Arrays.stream(MonthNameDatePattern.values())
+ .map(operation -> extract(operation, inputValue))
+ .filter(dateNormalizationResult -> dateNormalizationResult.getDateNormalizationResultStatus()
+ == DateNormalizationResultStatus.MATCHED).findFirst()
+ .orElse(getNoMatchResult(inputValue));
+ }
+
+ private DateNormalizationResult extract(MonthNameDatePattern monthNameDatePattern, String inputValue) {
+ DateNormalizationResult dateNormalizationResult = getNoMatchResult(inputValue);
+ try {
+ final Matcher matcher = monthNameDatePattern.getPattern().matcher(inputValue);
+ if (matcher.matches()) {
+ final Month month = monthMultilingual.getMonth(
+ matcher.group(monthNameDatePattern.getDatePartsIndices().getMonthIndex()));
+ final InstantEdtfDateBuilder instantEdtfDateBuilder = new InstantEdtfDateBuilder(
+ Integer.parseInt(matcher.group(monthNameDatePattern.getDatePartsIndices().getYearIndex())))
+ .withMonth(month.getValue());
+ getDayIfPresent(monthNameDatePattern, matcher).ifPresent(instantEdtfDateBuilder::withDay);
+ final InstantEdtfDate instantEdtfDate = instantEdtfDateBuilder.build();
+ dateNormalizationResult = new DateNormalizationResult(DateNormalizationExtractorMatchId.MONTH_NAME, inputValue,
+ instantEdtfDate);
+ }
+ } catch (DateExtractionException e) {
+ LOGGER.warn("Failed instance extraction!", e);
+ }
+ return dateNormalizationResult;
+ }
+
+ private Optional getDayIfPresent(MonthNameDatePattern monthNameDatePattern, Matcher matcher) {
+ if (monthNameDatePattern.getDatePartsIndices().getDayIndex() != null) {
+ return Optional.of(Integer.parseInt(matcher.group(monthNameDatePattern.getDatePartsIndices().getDayIndex())));
+ }
+ return Optional.empty();
+ }
+}
+
diff --git a/metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/dateextractors/NumericPartsDateExtractor.java b/metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/extractors/NumericPartsDateExtractor.java
similarity index 76%
rename from metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/dateextractors/NumericPartsDateExtractor.java
rename to metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/extractors/NumericPartsDateExtractor.java
index 92b972f8ed..f04d123039 100644
--- a/metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/dateextractors/NumericPartsDateExtractor.java
+++ b/metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/extractors/NumericPartsDateExtractor.java
@@ -1,6 +1,5 @@
-package eu.europeana.normalization.dates.extraction.dateextractors;
+package eu.europeana.normalization.dates.extraction.extractors;
-import static eu.europeana.normalization.dates.edtf.DateQualification.NO_QUALIFICATION;
import static eu.europeana.normalization.dates.edtf.DateQualification.UNCERTAIN;
import static java.util.Optional.ofNullable;
import static java.util.regex.Pattern.compile;
@@ -8,10 +7,11 @@
import eu.europeana.normalization.dates.DateNormalizationResult;
import eu.europeana.normalization.dates.YearPrecision;
import eu.europeana.normalization.dates.edtf.DateQualification;
+import eu.europeana.normalization.dates.edtf.InstantEdtfDate;
import eu.europeana.normalization.dates.edtf.InstantEdtfDateBuilder;
import eu.europeana.normalization.dates.extraction.DateExtractionException;
import eu.europeana.normalization.dates.extraction.NumericPartsPattern;
-import eu.europeana.normalization.dates.sanitize.DateFieldSanitizer;
+import java.util.EnumSet;
import java.util.Locale;
import java.util.Set;
import java.util.regex.Matcher;
@@ -39,9 +39,17 @@ public class NumericPartsDateExtractor extends AbstractDateExtractor {
private static final String UNKNOWN_CHARACTERS_REGEX = "[XU?-]";
@Override
- public DateNormalizationResult extract(String inputValue, DateQualification requestedDateQualification,
- boolean flexibleDateBuild) throws DateExtractionException {
- return extract(inputValue, requestedDateQualification, NumericPartsPattern.NUMERIC_SET, flexibleDateBuild);
+ public DateNormalizationResult extract(String inputValue, boolean allowDayMonthSwap) throws DateExtractionException {
+ return extract(inputValue, NumericPartsPattern.NUMERIC_SET, allowDayMonthSwap);
+ }
+
+ @Override
+ public Set getQualification(String inputValue) {
+ final Set dateQualifications = EnumSet.noneOf(DateQualification.class);
+ if (STARTING_UNCERTAIN_PATTERN.matcher(inputValue).find() || ENDING_UNCERTAIN_PATTERN.matcher(inputValue).find()) {
+ dateQualifications.add(UNCERTAIN);
+ }
+ return dateQualifications;
}
/**
@@ -49,27 +57,21 @@ public DateNormalizationResult extract(String inputValue, DateQualification requ
*
* @param inputValue the input value
* @param numericPatternValues the patterns to check against
- * @param allowSwitchMonthDay allow switching month and day values if month and day original values are not valid
+ * @param flexibleDateBuild allow switching month and day values if month and day original values are not valid
* @return the date normalization result
*/
- protected DateNormalizationResult extract(String inputValue, DateQualification requestedDateQualification,
- Set numericPatternValues,
- boolean allowSwitchMonthDay) throws DateExtractionException {
- final String sanitizedValue = DateFieldSanitizer.cleanSpacesAndTrim(inputValue);
- final DateQualification dateQualification = computeDateQualification(requestedDateQualification, () ->
- (STARTING_UNCERTAIN_PATTERN.matcher(sanitizedValue).find() || ENDING_UNCERTAIN_PATTERN.matcher(sanitizedValue).find())
- ? UNCERTAIN : NO_QUALIFICATION);
-
+ protected DateNormalizationResult extract(String inputValue, Set numericPatternValues,
+ boolean flexibleDateBuild) throws DateExtractionException {
DateNormalizationResult dateNormalizationResult = DateNormalizationResult.getNoMatchResult(inputValue);
for (NumericPartsPattern numericWithMissingPartsPattern : numericPatternValues) {
- final Matcher matcher = numericWithMissingPartsPattern.getPattern().matcher(sanitizedValue);
+ final Matcher matcher = numericWithMissingPartsPattern.getPattern().matcher(inputValue);
if (matcher.matches()) {
InstantEdtfDateBuilder instantEdtfDateBuilder = extractDateProperty(numericWithMissingPartsPattern, matcher);
- dateNormalizationResult = new DateNormalizationResult(
- numericWithMissingPartsPattern.getDateNormalizationExtractorMatchId(), inputValue,
- instantEdtfDateBuilder.withDateQualification(dateQualification).withFlexibleDateBuild(allowSwitchMonthDay)
- .build());
- break;
+ final InstantEdtfDate instantEdtfDate = instantEdtfDateBuilder.withDateQualification(getQualification(inputValue))
+ .withAllowDayMonthSwap(flexibleDateBuild).build();
+ dateNormalizationResult = new DateNormalizationResult(
+ numericWithMissingPartsPattern.getDateNormalizationExtractorMatchId(), inputValue, instantEdtfDate);
+ break;
}
}
diff --git a/metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/extractors/NumericPartsRangeDateExtractor.java b/metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/extractors/NumericPartsRangeDateExtractor.java
new file mode 100644
index 0000000000..bf3077a08c
--- /dev/null
+++ b/metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/extractors/NumericPartsRangeDateExtractor.java
@@ -0,0 +1,93 @@
+package eu.europeana.normalization.dates.extraction.extractors;
+
+import static eu.europeana.normalization.dates.DateNormalizationExtractorMatchId.NUMERIC_ALL_VARIANTS;
+import static eu.europeana.normalization.dates.DateNormalizationExtractorMatchId.NUMERIC_ALL_VARIANTS_XX;
+import static eu.europeana.normalization.dates.DateNormalizationExtractorMatchId.NUMERIC_RANGE_ALL_VARIANTS;
+import static eu.europeana.normalization.dates.DateNormalizationExtractorMatchId.NUMERIC_RANGE_ALL_VARIANTS_XX;
+
+import eu.europeana.normalization.dates.DateNormalizationExtractorMatchId;
+import eu.europeana.normalization.dates.DateNormalizationResult;
+import eu.europeana.normalization.dates.DateNormalizationResultStatus;
+import eu.europeana.normalization.dates.edtf.DateBoundaryType;
+import eu.europeana.normalization.dates.edtf.InstantEdtfDate;
+import eu.europeana.normalization.dates.extraction.DateExtractionException;
+import eu.europeana.normalization.dates.extraction.NumericPartsPattern;
+import eu.europeana.normalization.dates.extraction.NumericPartsPattern.NumericRangeQualifier;
+import java.util.List;
+
+/**
+ * Extractor for numeric date ranges with variations in the separators of date components.
+ * We reuse the already existent {@link NumericPartsDateExtractor} code for the boundaries.
+ */
+public class NumericPartsRangeDateExtractor extends AbstractRangeDateExtractor<NumericRangeQualifier> {
+
+  private static final NumericPartsDateExtractor NUMERIC_WITH_MISSING_PARTS_DATE_EXTRACTOR = new NumericPartsDateExtractor();
+
+  @Override
+  public boolean isRangeMatchSuccess(NumericRangeQualifier numericRangeQualifier, DateNormalizationResult startDateResult,
+      DateNormalizationResult endDateResult) {
+    return startDateResult.getDateNormalizationResultStatus() == DateNormalizationResultStatus.MATCHED
+        && endDateResult.getDateNormalizationResultStatus() == DateNormalizationResultStatus.MATCHED
+        && !areYearsAmbiguous((InstantEdtfDate) startDateResult.getEdtfDate(), (InstantEdtfDate) endDateResult.getEdtfDate(),
+        numericRangeQualifier);
+  }
+
+  /**
+   * Captures the ambiguous case of "198-?": a dash range with a declared three-digit start year and an open end.
+   *
+   * @param startDate the start date of a range
+   * @param endDate the end date of the range
+   * @param numericRangeQualifier the numeric range qualifier
+   * @return true if the range is ambiguous
+   */
+  private boolean areYearsAmbiguous(InstantEdtfDate startDate, InstantEdtfDate endDate,
+      NumericRangeQualifier numericRangeQualifier) {
+    if (numericRangeQualifier != NumericRangeQualifier.DASH_RANGE
+        || startDate.getDateBoundaryType() != DateBoundaryType.DECLARED) {
+      return false;
+    }
+    // Equivalent to the year's decimal text matching \d{3}: only 100..999 qualifies, negative years never do.
+    final int startYear = startDate.getYear().getValue();
+    return startYear >= 100 && startYear <= 999 && endDate.getDateBoundaryType() == DateBoundaryType.OPEN;
+  }
+
+  @Override
+  public List<NumericRangeQualifier> getRangeDateQualifiers() {
+    return List.of(NumericRangeQualifier.values());
+  }
+
+  @Override
+  public DateNormalizationResultRangePair extractDateNormalizationResult(String startString, String endString,
+      NumericRangeQualifier numericRangeQualifier, boolean allowDayMonthSwap) throws DateExtractionException {
+    return new DateNormalizationResultRangePair(
+        extractDate(startString, numericRangeQualifier, allowDayMonthSwap),
+        extractDate(endString, numericRangeQualifier, allowDayMonthSwap));
+  }
+
+  @Override
+  public DateNormalizationExtractorMatchId getDateNormalizationExtractorId(DateNormalizationResult startDateResult,
+      DateNormalizationResult endDateResult) {
+    final boolean isStartXX = startDateResult.getDateNormalizationExtractorMatchId() == NUMERIC_ALL_VARIANTS_XX;
+    final boolean isEndXX = endDateResult.getDateNormalizationExtractorMatchId() == NUMERIC_ALL_VARIANTS_XX;
+    return isStartXX || isEndXX ? NUMERIC_RANGE_ALL_VARIANTS_XX : NUMERIC_RANGE_ALL_VARIANTS;
+  }
+
+  /**
+   * Extracts one boundary: text matching the qualifier's unspecified characters is open, the rest is parsed numerically.
+   *
+   * @param dateString the text of the boundary
+   * @param numericRangeQualifier the qualifier whose unspecified characters (possibly null) denote an open boundary
+   * @param allowDayMonthSwap passed on to the numeric extractor as its flexible date build flag
+   * @return the normalization result of the boundary
+   * @throws DateExtractionException if thrown by the delegated numeric extraction
+   */
+  private static DateNormalizationResult extractDate(String dateString, NumericRangeQualifier numericRangeQualifier,
+      boolean allowDayMonthSwap) throws DateExtractionException {
+    final String unspecifiedCharacters = numericRangeQualifier.getUnspecifiedCharacters();
+    if (unspecifiedCharacters != null && dateString.matches(unspecifiedCharacters)) {
+      return new DateNormalizationResult(NUMERIC_ALL_VARIANTS, dateString, InstantEdtfDate.getOpenInstance());
+    }
+    return NUMERIC_WITH_MISSING_PARTS_DATE_EXTRACTOR.extract(dateString, NumericPartsPattern.NUMERIC_RANGE_SET,
+        allowDayMonthSwap);
+  }
+}
diff --git a/metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/extractors/RangeDateExtractor.java b/metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/extractors/RangeDateExtractor.java
new file mode 100644
index 0000000000..4f1a3345c3
--- /dev/null
+++ b/metis-normalization/src/main/java/eu/europeana/normalization/dates/extraction/extractors/RangeDateExtractor.java
@@ -0,0 +1,76 @@
+package eu.europeana.normalization.dates.extraction.extractors;
+
+import eu.europeana.normalization.dates.DateNormalizationExtractorMatchId;
+import eu.europeana.normalization.dates.DateNormalizationResult;
+import eu.europeana.normalization.dates.extraction.DateExtractionException;
+import eu.europeana.normalization.dates.extraction.DatesSeparator;
+import java.util.List;
+
+/**
+ * The interface for range date extractors.
+ *
+ * @param <T> the object containing delimiters/separators for dates
+ */
+public interface RangeDateExtractor<T extends DatesSeparator> {
+
+  /** Gets the range date qualifiers, i.e. the delimiter/separator objects, of this extractor. */
+  List<T> getRangeDateQualifiers();
+
+  /**
+   * Extract the start and end date normalization result pair.
+   * At this stage we just perform an extraction, the range is not verified yet.
+   *
+   * @param startString the start date string
+   * @param endString the end date string
+   * @param rangeDateDelimiters the range date delimiters
+   * @param allowDayMonthSwap whether day and month may be swapped (flexible date build)
+   * @return the start and end date result pair
+   * @throws DateExtractionException if the date extraction failed
+   */
+  DateNormalizationResultRangePair extractDateNormalizationResult(String startString, String endString,
+      T rangeDateDelimiters, boolean allowDayMonthSwap) throws DateExtractionException;
+
+  /**
+   * Checks if a provided date range was successfully extracted.
+   *
+   * @param rangeDateDelimiters the range date delimiters
+   * @param startDateResult the extracted start date boundary
+   * @param endDateResult the extracted end date boundary
+   * @return the boolean representing a successful date range extraction
+   */
+  boolean isRangeMatchSuccess(T rangeDateDelimiters, DateNormalizationResult startDateResult,
+      DateNormalizationResult endDateResult);
+
+  /**
+   * Get the date normalization extractor match identifier from the two date boundaries.
+   *
+   * @param startDateResult the start date boundary
+   * @param endDateResult the end date boundary
+   * @return the date normalization extractor match identifier
+   */
+  DateNormalizationExtractorMatchId getDateNormalizationExtractorId(DateNormalizationResult startDateResult,
+      DateNormalizationResult endDateResult);
+
+  /**
+   * Class wrapping a pair of start and end date normalization results.
+   */
+  class DateNormalizationResultRangePair {
+
+    final DateNormalizationResult startDateNormalizationResult;
+    final DateNormalizationResult endDateNormalizationResult;
+
+    public DateNormalizationResultRangePair(DateNormalizationResult startDateNormalizationResult,
+        DateNormalizationResult endDateNormalizationResult) {
+      this.startDateNormalizationResult = startDateNormalizationResult;
+      this.endDateNormalizationResult = endDateNormalizationResult;
+    }
+
+    public DateNormalizationResult getStartDateNormalizationResult() {
+      return startDateNormalizationResult;
+    }
+
+    public DateNormalizationResult getEndDateNormalizationResult() {
+      return endDateNormalizationResult;
+    }
+  }
+}
diff --git a/metis-normalization/src/main/java/eu/europeana/normalization/normalizers/DatesNormalizer.java b/metis-normalization/src/main/java/eu/europeana/normalization/normalizers/DatesNormalizer.java
index 0848263726..3c83973772 100644
--- a/metis-normalization/src/main/java/eu/europeana/normalization/normalizers/DatesNormalizer.java
+++ b/metis-normalization/src/main/java/eu/europeana/normalization/normalizers/DatesNormalizer.java
@@ -8,18 +8,23 @@
import eu.europeana.normalization.dates.edtf.AbstractEdtfDate;
import eu.europeana.normalization.dates.edtf.DateQualification;
import eu.europeana.normalization.dates.edtf.InstantEdtfDate;
-import eu.europeana.normalization.dates.extraction.dateextractors.BriefRangeDateExtractor;
-import eu.europeana.normalization.dates.extraction.dateextractors.CenturyDateExtractor;
-import eu.europeana.normalization.dates.extraction.dateextractors.DateExtractor;
-import eu.europeana.normalization.dates.extraction.dateextractors.DcmiPeriodDateExtractor;
-import eu.europeana.normalization.dates.extraction.dateextractors.DecadeDateExtractor;
-import eu.europeana.normalization.dates.extraction.dateextractors.EdtfDateExtractor;
-import eu.europeana.normalization.dates.extraction.dateextractors.NumericPartsDateExtractor;
-import eu.europeana.normalization.dates.extraction.dateextractors.NumericPartsRangeDateExtractor;
-import eu.europeana.normalization.dates.extraction.dateextractors.PatternBcAdDateExtractor;
-import eu.europeana.normalization.dates.extraction.dateextractors.PatternFormatedFullDateDateExtractor;
-import eu.europeana.normalization.dates.extraction.dateextractors.PatternLongNegativeYearDateExtractor;
-import eu.europeana.normalization.dates.extraction.dateextractors.PatternMonthNameDateExtractor;
+import eu.europeana.normalization.dates.extraction.extractors.BcAdDateExtractor;
+import eu.europeana.normalization.dates.extraction.extractors.BcAdRangeDateExtractor;
+import eu.europeana.normalization.dates.extraction.extractors.BriefRangeDateExtractor;
+import eu.europeana.normalization.dates.extraction.extractors.CenturyNumericDateExtractor;
+import eu.europeana.normalization.dates.extraction.extractors.CenturyRomanDateExtractor;
+import eu.europeana.normalization.dates.extraction.extractors.CenturyRomanRangeDateExtractor;
+import eu.europeana.normalization.dates.extraction.extractors.DateExtractor;
+import eu.europeana.normalization.dates.extraction.extractors.DcmiPeriodDateExtractor;
+import eu.europeana.normalization.dates.extraction.extractors.DecadeDateExtractor;
+import eu.europeana.normalization.dates.extraction.extractors.EdtfDateExtractor;
+import eu.europeana.normalization.dates.extraction.extractors.EdtfRangeDateExtractor;
+import eu.europeana.normalization.dates.extraction.extractors.FullDateDateExtractor;
+import eu.europeana.normalization.dates.extraction.extractors.LongNegativeYearDateExtractor;
+import eu.europeana.normalization.dates.extraction.extractors.LongNegativeYearRangeDateExtractor;
+import eu.europeana.normalization.dates.extraction.extractors.MonthNameDateExtractor;
+import eu.europeana.normalization.dates.extraction.extractors.NumericPartsDateExtractor;
+import eu.europeana.normalization.dates.extraction.extractors.NumericPartsRangeDateExtractor;
import eu.europeana.normalization.dates.sanitize.DateFieldSanitizer;
import eu.europeana.normalization.dates.sanitize.SanitizeOperation;
import eu.europeana.normalization.dates.sanitize.SanitizedDate;
@@ -29,10 +34,12 @@
import eu.europeana.normalization.util.NormalizationException;
import eu.europeana.normalization.util.XmlUtil;
import eu.europeana.normalization.util.XpathQuery;
+import java.lang.invoke.MethodHandles;
import java.net.URLEncoder;
import java.nio.charset.StandardCharsets;
import java.util.List;
import java.util.Optional;
+import java.util.function.BiFunction;
import java.util.function.Function;
import java.util.function.Predicate;
import java.util.stream.Collectors;
@@ -40,6 +47,8 @@
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.lang3.tuple.ImmutablePair;
import org.apache.commons.lang3.tuple.Pair;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
import org.w3c.dom.Attr;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
@@ -56,13 +65,15 @@
*/
public class DatesNormalizer implements RecordNormalizeAction {
+ private static final Logger LOGGER = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
+
private static final Namespace.Element EDM_PROVIDED_CHO = Namespace.EDM.getElement("ProvidedCHO");
private static final Namespace.Element EDM_WEB_RESOURCE = Namespace.EDM.getElement("WebResource");
private static final Namespace.Element EDM_AGENT = Namespace.EDM.getElement("Agent");
private static final Namespace.Element EDM_PLACE = Namespace.EDM.getElement("Place");
private static final Namespace.Element EDM_TIMESPAN = Namespace.EDM.getElement("TimeSpan");
private static final Namespace.Element RDF_ABOUT = Namespace.RDF.getElement("about");
- private static final Namespace.Element SKOS_PREFLABEL = Namespace.SKOS.getElement("prefLabel");
+ private static final Namespace.Element SKOS_PREF_LABEL = Namespace.SKOS.getElement("prefLabel");
private static final Namespace.Element XML_LANG = Namespace.XML.getElement("lang");
private static final Namespace.Element SKOS_NOTATION = Namespace.SKOS.getElement("notation");
private static final Namespace.Element SKOS_NOTE = Namespace.SKOS.getElement("note");
@@ -70,7 +81,7 @@ public class DatesNormalizer implements RecordNormalizeAction {
private static final Namespace.Element RDF_RESOURCE = Namespace.RDF.getElement("resource");
private static final Namespace.Element EDM_BEGIN = Namespace.EDM.getElement("begin");
private static final Namespace.Element EDM_END = Namespace.EDM.getElement("end");
- private static final Namespace.Element DCTERMS_ISPARTOF = Namespace.DCTERMS.getElement("isPartOf");
+ private static final Namespace.Element DC_TERMS_IS_PART_OF = Namespace.DCTERMS.getElement("isPartOf");
private static final Namespace.Element ORE_PROXY = Namespace.ORE.getElement("Proxy");
private static final Namespace.Element EDM_EUROPEANA_PROXY = Namespace.EDM.getElement("europeanaProxy");
@@ -118,33 +129,42 @@ public DatesNormalizer() {
extractorsInOrderForDateProperties = List.of(
new BriefRangeDateExtractor(),
new EdtfDateExtractor(),
- new CenturyDateExtractor(),
+ new EdtfRangeDateExtractor(),
+ new CenturyNumericDateExtractor(),
+ new CenturyRomanDateExtractor(),
+ new CenturyRomanRangeDateExtractor(),
new DecadeDateExtractor(),
new NumericPartsRangeDateExtractor(),
new NumericPartsDateExtractor(),
new DcmiPeriodDateExtractor(),
- new PatternMonthNameDateExtractor(),
- new PatternFormatedFullDateDateExtractor(),
- new PatternBcAdDateExtractor(),
- new PatternLongNegativeYearDateExtractor());
+ new MonthNameDateExtractor(),
+ new FullDateDateExtractor(),
+ new BcAdDateExtractor(),
+ new BcAdRangeDateExtractor(),
+ new LongNegativeYearDateExtractor(),
+ new LongNegativeYearRangeDateExtractor());
extractorsInOrderForGenericProperties =
extractorsInOrderForDateProperties.stream()
- .filter(
- not(BriefRangeDateExtractor.class::isInstance))
- .collect(Collectors.toList());
+ .filter(not(BriefRangeDateExtractor.class::isInstance)).collect(Collectors.toList());
normalizationOperationsInOrderDateProperty = List.of(
- input -> normalizeInput(extractorsInOrderForDateProperties, input, DateQualification.NO_QUALIFICATION),
- input -> normalizeInput(extractorsInOrderForDateProperties, input, dateFieldSanitizer::sanitize1stTimeDateProperty,
- SanitizeOperation::isApproximateSanitizeOperationForDateProperty),
- input -> normalizeInput(extractorsInOrderForDateProperties, input, dateFieldSanitizer::sanitize2ndTimeDateProperty,
- SanitizeOperation::isApproximateSanitizeOperationForDateProperty));
+ input -> normalizeInput(extractorsInOrderForDateProperties, input),
+ input -> normalizeInputSanitized(extractorsInOrderForDateProperties, input,
+ dateFieldSanitizer::sanitize1stTimeDateProperty,
+ SanitizeOperation::isApproximateSanitizeOperationForDateProperty,
+ (dateExtractors, sanitizedDate) -> normalizeInput(dateExtractors, sanitizedDate.getSanitizedDateString())),
+ input -> normalizeInputSanitized(extractorsInOrderForDateProperties, input,
+ dateFieldSanitizer::sanitize2ndTimeDateProperty,
+ SanitizeOperation::isApproximateSanitizeOperationForDateProperty,
+ (dateExtractors, sanitizedDate) -> normalizeInput(dateExtractors, sanitizedDate.getSanitizedDateString())));
normalizationOperationsInOrderGenericProperty = List.of(
- input -> normalizeInputGeneric(extractorsInOrderForGenericProperties, input, DateQualification.NO_QUALIFICATION),
- input -> normalizeInputGeneric(extractorsInOrderForGenericProperties, input,
- dateFieldSanitizer::sanitizeGenericProperty, SanitizeOperation::isApproximateSanitizeOperationForGenericProperty));
+ input -> normalizeInputGeneric(extractorsInOrderForGenericProperties, input),
+ input -> normalizeInputSanitized(extractorsInOrderForGenericProperties, input,
+ dateFieldSanitizer::sanitizeGenericProperty,
+ SanitizeOperation::isApproximateSanitizeOperationForGenericProperty,
+ (dateExtractors, sanitizedDate) -> normalizeInputGeneric(dateExtractors, sanitizedDate.getSanitizedDateString())));
}
private static Pair getProxySubtagQuery(Namespace.Element subtag) {
@@ -160,10 +180,8 @@ public NormalizationReport normalize(Document document) throws NormalizationExce
// Perform the two different kinds of normalizations
final InternalNormalizationReport report = new InternalNormalizationReport();
- report.mergeWith(normalizeElements(document, europeanaProxy, DATE_PROPERTY_FIELDS,
- this::normalizeDateProperty));
- report.mergeWith(normalizeElements(document, europeanaProxy, GENERIC_PROPERTY_FIELDS,
- this::normalizeGenericProperty));
+ report.mergeWith(normalizeElements(document, europeanaProxy, DATE_PROPERTY_FIELDS, this::normalizeDateProperty));
+ report.mergeWith(normalizeElements(document, europeanaProxy, GENERIC_PROPERTY_FIELDS, this::normalizeGenericProperty));
return report;
}
@@ -194,6 +212,7 @@ private void normalizeElement(Document document, Element element, Namespace.Elem
final String elementText = XmlUtil.getElementText(element);
final DateNormalizationResult dateNormalizationResult = normalizationFunction.apply(elementText);
if (dateNormalizationResult.getDateNormalizationResultStatus() == NO_MATCH) {
+ LOGGER.debug("Normalization did not find a match");
return;
}
@@ -209,10 +228,10 @@ private void normalizeElement(Document document, Element element, Namespace.Elem
final Element reference = XmlUtil.createElement(elementType, europeanaProxy, List.of());
final String fullResourceName = XmlUtil.getPrefixedElementName(RDF_RESOURCE,
reference.lookupPrefix(RDF_RESOURCE.getNamespace().getUri()));
- final Attr dctermsIsPartOfResource = document.createAttributeNS(
+ final Attr dcTermsIsPartOfResource = document.createAttributeNS(
RDF_RESOURCE.getNamespace().getUri(), fullResourceName);
- dctermsIsPartOfResource.setValue(timespanId);
- reference.setAttributeNode(dctermsIsPartOfResource);
+ dcTermsIsPartOfResource.setValue(timespanId);
+ reference.setAttributeNode(dcTermsIsPartOfResource);
// Update the report.
report.increment(this.getClass().getSimpleName(), ConfidenceLevel.CERTAIN);
@@ -256,58 +275,31 @@ private DateNormalizationResult normalizeProperty(
return dateNormalizationResult;
}
- private DateNormalizationResult normalizeInput(List dateExtractors, String inputDate,
- DateQualification dateQualification) {
- return dateExtractors.stream().map(
- dateExtractor -> dateExtractor.extractDateProperty(inputDate, dateQualification))
+ private DateNormalizationResult normalizeInput(List dateExtractors, String inputDate) {
+ return dateExtractors.stream().map(dateExtractor -> dateExtractor.extractDateProperty(inputDate))
.filter(dateNormalizationResult -> dateNormalizationResult.getDateNormalizationResultStatus()
== MATCHED).findFirst()
.orElse(DateNormalizationResult.getNoMatchResult(inputDate));
}
- private DateNormalizationResult normalizeInputGeneric(List dateExtractors, String input,
- DateQualification dateQualification) {
- return dateExtractors.stream().map(dateExtractor -> dateExtractor.extractGenericProperty(input, dateQualification))
+ private DateNormalizationResult normalizeInputGeneric(List dateExtractors, String input) {
+ return dateExtractors.stream().map(dateExtractor -> dateExtractor.extractGenericProperty(input))
.filter(dateNormalizationResult -> dateNormalizationResult.getDateNormalizationResultStatus()
== MATCHED).findFirst()
.orElse(DateNormalizationResult.getNoMatchResult(input));
}
- private DateNormalizationResult normalizeInput(List dateExtractors, String input,
- Function sanitizeFunction, Predicate checkIfApproximateCleanOperationId) {
+ private DateNormalizationResult normalizeInputSanitized(List dateExtractors, String input,
+ Function sanitizeFunction, Predicate checkIfApproximateCleanOperationId,
+ BiFunction, SanitizedDate, DateNormalizationResult> normalizeFunction) {
final SanitizedDate sanitizedDate = sanitizeFunction.apply(input);
DateNormalizationResult dateNormalizationResult = DateNormalizationResult.getNoMatchResult(input);
if (sanitizedDate != null && StringUtils.isNotEmpty(sanitizedDate.getSanitizedDateString())) {
- final DateQualification dateQualification;
- if (checkIfApproximateCleanOperationId.test(sanitizedDate.getSanitizeOperation())) {
- dateQualification = DateQualification.APPROXIMATE;
- } else {
- dateQualification = DateQualification.NO_QUALIFICATION;
- }
- dateNormalizationResult = normalizeInput(dateExtractors, sanitizedDate.getSanitizedDateString(), dateQualification);
-
- if (dateNormalizationResult.getDateNormalizationResultStatus() == MATCHED) {
- //Re-create result containing sanitization operation.
- dateNormalizationResult = new DateNormalizationResult(dateNormalizationResult, sanitizedDate.getSanitizeOperation());
- }
- }
- return dateNormalizationResult;
- }
-
- private DateNormalizationResult normalizeInputGeneric(List dateExtractors, String input,
- Function sanitizeFunction, Predicate checkIfApproximateCleanOperationId) {
- final SanitizedDate sanitizedDate = sanitizeFunction.apply(input);
- DateNormalizationResult dateNormalizationResult = DateNormalizationResult.getNoMatchResult(input);
- if (sanitizedDate != null && StringUtils.isNotEmpty(sanitizedDate.getSanitizedDateString())) {
- if (checkIfApproximateCleanOperationId.test(sanitizedDate.getSanitizeOperation())) {
- dateNormalizationResult = normalizeInputGeneric(dateExtractors, sanitizedDate.getSanitizedDateString(),
- DateQualification.APPROXIMATE);
- } else {
- dateNormalizationResult = normalizeInputGeneric(dateExtractors, sanitizedDate.getSanitizedDateString(),
- DateQualification.NO_QUALIFICATION);
- }
-
+ dateNormalizationResult = normalizeFunction.apply(dateExtractors, sanitizedDate);
if (dateNormalizationResult.getDateNormalizationResultStatus() == MATCHED) {
+ if (checkIfApproximateCleanOperationId.test(sanitizedDate.getSanitizeOperation())) {
+ dateNormalizationResult.getEdtfDate().addQualification(DateQualification.APPROXIMATE);
+ }
//Re-create result containing sanitization operation.
dateNormalizationResult = new DateNormalizationResult(dateNormalizationResult, sanitizedDate.getSanitizeOperation());
}
@@ -362,7 +354,7 @@ private void appendTimespanEntity(Document document, AbstractEdtfDate edtfDate,
timeSpan.setAttributeNode(rdfAbout);
// Create and add skosPrefLabel to timespan
- final Element skosPrefLabel = XmlUtil.createElement(SKOS_PREFLABEL, timeSpan, null);
+ final Element skosPrefLabel = XmlUtil.createElement(SKOS_PREF_LABEL, timeSpan, null);
if (StringUtils.isNotBlank(edtfDate.getLabel())) {
skosPrefLabel.setNodeValue(edtfDate.getLabel());
skosPrefLabel.appendChild(document.createTextNode(edtfDate.getLabel()));
@@ -376,11 +368,11 @@ private void appendTimespanEntity(Document document, AbstractEdtfDate edtfDate,
}
// Create and add skosNote elements to timespan in case of approximate or uncertain dates.
- if (edtfDate.getDateQualification() == DateQualification.APPROXIMATE) {
+ if (edtfDate.getDateQualifications().contains(DateQualification.APPROXIMATE)) {
final Element skosNote = XmlUtil.createElement(SKOS_NOTE, timeSpan, null);
skosNote.appendChild(document.createTextNode("approximate"));
}
- if (edtfDate.getDateQualification() == DateQualification.UNCERTAIN) {
+ if (edtfDate.getDateQualifications().contains(DateQualification.UNCERTAIN)) {
final Element skosNote = XmlUtil.createElement(SKOS_NOTE, timeSpan, null);
skosNote.appendChild(document.createTextNode("uncertain"));
}
@@ -407,7 +399,7 @@ private void appendTimespanEntity(Document document, AbstractEdtfDate edtfDate,
final String fullResourceName = XmlUtil.getPrefixedElementName(RDF_RESOURCE,
timeSpan.lookupPrefix(RDF_RESOURCE.getNamespace().getUri()));
for (int century = Math.max(1, startCentury); century <= Math.max(0, endCentury); century++) {
- final Element dctermsIsPartOf = XmlUtil.createElement(DCTERMS_ISPARTOF, timeSpan, null);
+ final Element dctermsIsPartOf = XmlUtil.createElement(DC_TERMS_IS_PART_OF, timeSpan, null);
final Attr dctermsIsPartOfResource = document.createAttributeNS(RDF_RESOURCE.getNamespace().getUri(), fullResourceName);
dctermsIsPartOfResource.setValue("http://data.europeana.eu/timespan/" + century);
dctermsIsPartOf.setAttributeNode(dctermsIsPartOfResource);
diff --git a/metis-normalization/src/test/java/eu/europeana/normalization/dates/extraction/dateextractors/BriefRangeDateExtractorTest.java b/metis-normalization/src/test/java/eu/europeana/normalization/dates/extraction/dateextractors/BriefRangeDateExtractorTest.java
deleted file mode 100644
index 4d0334b4be..0000000000
--- a/metis-normalization/src/test/java/eu/europeana/normalization/dates/extraction/dateextractors/BriefRangeDateExtractorTest.java
+++ /dev/null
@@ -1,88 +0,0 @@
-package eu.europeana.normalization.dates.extraction.dateextractors;
-
-import static eu.europeana.normalization.dates.edtf.DateBoundaryType.OPEN;
-import static eu.europeana.normalization.dates.edtf.DateBoundaryType.UNKNOWN;
-import static eu.europeana.normalization.dates.edtf.DateQualification.APPROXIMATE;
-import static eu.europeana.normalization.dates.edtf.DateQualification.NO_QUALIFICATION;
-import static eu.europeana.normalization.dates.edtf.DateQualification.UNCERTAIN;
-import static eu.europeana.normalization.dates.edtf.DateQualification.UNCERTAIN_APPROXIMATE;
-import static eu.europeana.normalization.dates.edtf.IntervalEdtfDate.DATE_INTERVAL_SEPARATOR;
-import static org.junit.jupiter.api.Assertions.assertEquals;
-import static org.junit.jupiter.params.provider.Arguments.of;
-
-import eu.europeana.normalization.dates.DateNormalizationResult;
-import eu.europeana.normalization.dates.DateNormalizationResultStatus;
-import eu.europeana.normalization.dates.edtf.AbstractEdtfDate;
-import eu.europeana.normalization.dates.edtf.InstantEdtfDate;
-import eu.europeana.normalization.dates.edtf.IntervalEdtfDate;
-import java.util.stream.Stream;
-import org.junit.jupiter.params.ParameterizedTest;
-import org.junit.jupiter.params.provider.Arguments;
-import org.junit.jupiter.params.provider.MethodSource;
-
-class BriefRangeDateExtractorTest {
-
- private final BriefRangeDateExtractor briefRangeDateExtractor = new BriefRangeDateExtractor();
-
- private void assertExtract(String input, String expected) {
- final DateNormalizationResult dateNormalizationResult = briefRangeDateExtractor.extractDateProperty(input, NO_QUALIFICATION);
- if (expected == null) {
- assertEquals(DateNormalizationResultStatus.NO_MATCH, dateNormalizationResult.getDateNormalizationResultStatus());
- } else {
- AbstractEdtfDate edtfDate = dateNormalizationResult.getEdtfDate();
- if (edtfDate instanceof IntervalEdtfDate) {
- String startPart = expected.substring(0, expected.indexOf(DATE_INTERVAL_SEPARATOR));
- String endPart = expected.substring(expected.indexOf(DATE_INTERVAL_SEPARATOR) + 1);
- InstantEdtfDate start = ((IntervalEdtfDate) edtfDate).getStart();
- InstantEdtfDate end = ((IntervalEdtfDate) edtfDate).getEnd();
- assertEdtfDate(startPart, start);
- assertEdtfDate(endPart, end);
- } else {
- assertEdtfDate(expected, (InstantEdtfDate) dateNormalizationResult.getEdtfDate());
- }
- assertEquals(expected, edtfDate.toString());
- }
- }
-
- private static void assertEdtfDate(String expected, InstantEdtfDate instantEdtfDate) {
- assertEquals(expected.contains("?"), instantEdtfDate.getDateQualification() == UNCERTAIN);
- assertEquals(expected.contains("~"), instantEdtfDate.getDateQualification() == APPROXIMATE);
- assertEquals(expected.contains("%"), instantEdtfDate.getDateQualification() == UNCERTAIN_APPROXIMATE);
- assertEquals(expected.equals(OPEN.getSerializedRepresentation()),
- instantEdtfDate.getDateBoundaryType() == OPEN || instantEdtfDate.getDateBoundaryType() == UNKNOWN);
- }
-
- @ParameterizedTest
- @MethodSource
- void extractBrief(String input, String expected) {
- assertExtract(input, expected);
- }
-
- private static Stream extractBrief() {
- return Stream.of(
- of("1989/90", "1989/1990"),
- of("1989/90?", "1989?/1990?"),
- of("1989-90", "1989/1990"),
- of("1989-90?", "1989?/1990?"),
- of("1900-13", "1900/1913"),
-
- //End date lower rightmost two digits than start year
- of("1989/89", null),
- of("1989/88", null),
- of("1989-89", null),
- of("1989-88", null),
-
- //More than two digits on end year not allowed
- of("1989/990", null),
- of("1989-990", null),
-
- //End year cannot be lower or equal than 12
- of("1900/01", null),
- of("1900-12", null),
-
- //Less than three digits on start year
- of("89-90", null)
- );
- }
-
-}
\ No newline at end of file
diff --git a/metis-normalization/src/test/java/eu/europeana/normalization/dates/extraction/dateextractors/CenturyDateExtractorTest.java b/metis-normalization/src/test/java/eu/europeana/normalization/dates/extraction/dateextractors/CenturyDateExtractorTest.java
deleted file mode 100644
index 1a8df29669..0000000000
--- a/metis-normalization/src/test/java/eu/europeana/normalization/dates/extraction/dateextractors/CenturyDateExtractorTest.java
+++ /dev/null
@@ -1,216 +0,0 @@
-package eu.europeana.normalization.dates.extraction.dateextractors;
-
-import static eu.europeana.normalization.dates.DateNormalizationExtractorMatchId.CENTURY_NUMERIC;
-import static eu.europeana.normalization.dates.DateNormalizationExtractorMatchId.CENTURY_RANGE_ROMAN;
-import static eu.europeana.normalization.dates.DateNormalizationExtractorMatchId.CENTURY_ROMAN;
-import static eu.europeana.normalization.dates.edtf.DateQualification.NO_QUALIFICATION;
-import static org.junit.jupiter.api.Assertions.assertEquals;
-import static org.junit.jupiter.params.provider.Arguments.of;
-
-import eu.europeana.normalization.dates.DateNormalizationExtractorMatchId;
-import eu.europeana.normalization.dates.DateNormalizationResult;
-import eu.europeana.normalization.dates.DateNormalizationResultStatus;
-import eu.europeana.normalization.dates.edtf.DateQualification;
-import java.util.stream.Stream;
-import org.junit.jupiter.params.ParameterizedTest;
-import org.junit.jupiter.params.provider.Arguments;
-import org.junit.jupiter.params.provider.MethodSource;
-
-class CenturyDateExtractorTest {
- private static final CenturyDateExtractor CENTURY_DATE_EXTRACTOR = new CenturyDateExtractor();
-
- void assertExtract(String input, String expected, DateNormalizationExtractorMatchId dateNormalizationExtractorMatchId) {
- final DateNormalizationResult dateNormalizationResult = CENTURY_DATE_EXTRACTOR.extractDateProperty(input, NO_QUALIFICATION);
- if (expected == null) {
- assertEquals(DateNormalizationResultStatus.NO_MATCH, dateNormalizationResult.getDateNormalizationResultStatus());
- } else {
- final String actual = dateNormalizationResult.getEdtfDate().toString();
- assertEquals(expected, actual);
- assertEquals(actual.contains("?"),
- dateNormalizationResult.getEdtfDate().getDateQualification() == DateQualification.UNCERTAIN);
- assertEquals(dateNormalizationExtractorMatchId, dateNormalizationResult.getDateNormalizationExtractorMatchId());
- }
- }
-
- @ParameterizedTest
- @MethodSource("extractNumericData")
- void extractNumeric(String input, String expected, DateNormalizationExtractorMatchId dateNormalizationExtractorMatchId) {
- assertExtract(input, expected, dateNormalizationExtractorMatchId);
- }
-
- @ParameterizedTest
- @MethodSource("extractRomanData")
- void extractRoman(String input, String expected, DateNormalizationExtractorMatchId dateNormalizationExtractorMatchId) {
- assertExtract(input, expected, dateNormalizationExtractorMatchId);
- }
-
- private static Stream extractNumericData() {
- return Stream.of(
- //PATTERN_YYYY
- of("18..", "18XX", CENTURY_NUMERIC),
- of(" 18.. ", "18XX", CENTURY_NUMERIC),
- of("?18..", "18XX?", CENTURY_NUMERIC),
- of("18..?", "18XX?", CENTURY_NUMERIC),
- of("?18..?", "18XX?", CENTURY_NUMERIC),
- of("192?", null, null, null), //Too many digits
- of("1..", null, null, null), //Too few digits
-
- //PATTERN_ENGLISH
- of("1st century", "00XX", CENTURY_NUMERIC),
- of("2nd century", "01XX", CENTURY_NUMERIC),
- of("3rd century", "02XX", CENTURY_NUMERIC),
- of("11th century", "10XX", CENTURY_NUMERIC),
- of(" 11th century ", "10XX", CENTURY_NUMERIC),
- of("?11th century", "10XX?", CENTURY_NUMERIC),
- of("11th century?", "10XX?", CENTURY_NUMERIC),
- of("?11th century?", "10XX?", CENTURY_NUMERIC),
- of("12th century BC", null, null, null), // not supported
- of("[10th century]", null, null, null), // not supported
- of("11thcentury", null, null, null), //Incorrect spacing numeric
- of("11st century", null, null, null), //Incorrect suffix
- of("12rd century", null, null, null), //Incorrect suffix
- of("13st century", null, null, null), //Incorrect suffix
- of("21th century", null, null, null), //Incorrect suffix
- of("0st century", null, null, null), //Out of range
- of("22nd century", null, null, null) //Out of range
- );
- }
-
- private static Stream extractRomanData() {
- return Stream.of(
- //PATTERN_ROMAN
- //Uppercase
- of("I", "00XX", CENTURY_ROMAN),
- of("IV", "03XX", CENTURY_ROMAN),
- of("V", "04XX", CENTURY_ROMAN),
- of("VI", "05XX", CENTURY_ROMAN),
- of("IX", "08XX", CENTURY_ROMAN),
- of("X", "09XX", CENTURY_ROMAN),
- of("XI", "10XX", CENTURY_ROMAN),
- of("XIV", "13XX", CENTURY_ROMAN),
- of("XV", "14XX", CENTURY_ROMAN),
- of("XVI", "15XX", CENTURY_ROMAN),
- of("XIX", "18XX", CENTURY_ROMAN),
- of("XX", "19XX", CENTURY_ROMAN),
- of("XXI", "20XX", CENTURY_ROMAN),
-
- //Lower case
- of("i", "00XX", CENTURY_ROMAN),
- of("iv", "03XX", CENTURY_ROMAN),
- of("v", "04XX", CENTURY_ROMAN),
- of("vi", "05XX", CENTURY_ROMAN),
- of("ix", "08XX", CENTURY_ROMAN),
- of("x", "09XX", CENTURY_ROMAN),
- of("xi", "10XX", CENTURY_ROMAN),
- of("xiv", "13XX", CENTURY_ROMAN),
- of("xv", "14XX", CENTURY_ROMAN),
- of("xvi", "15XX", CENTURY_ROMAN),
- of("xix", "18XX", CENTURY_ROMAN),
- of("xx", "19XX", CENTURY_ROMAN),
- of("xxi", "20XX", CENTURY_ROMAN),
-
- //Prefixes
- of("s I", "00XX", CENTURY_ROMAN),
- of("s. I", "00XX", CENTURY_ROMAN),
- of("S I", "00XX", CENTURY_ROMAN),
- of("S.I", "00XX", CENTURY_ROMAN),
- of("sec.I", "00XX", CENTURY_ROMAN),
- of("SEC.I", "00XX", CENTURY_ROMAN),
- of("sec. I", "00XX", CENTURY_ROMAN),
- of("SEC. I", "00XX", CENTURY_ROMAN),
- of("saec.I", "00XX", CENTURY_ROMAN),
- of("SAEC.I", "00XX", CENTURY_ROMAN),
- of("saec. I", "00XX", CENTURY_ROMAN),
- of("SAEC. I", "00XX", CENTURY_ROMAN),
- //Other possibilities and uncertain
- of("Ii", "01XX", CENTURY_ROMAN),
- of(" s I ", "00XX", CENTURY_ROMAN),
- of("?s. I", "00XX?", CENTURY_ROMAN),
- of("sec. I?", "00XX?", CENTURY_ROMAN),
- of("?saec. I?", "00XX?", CENTURY_ROMAN),
- of(" I ", "00XX", CENTURY_ROMAN),
- of("?I", "00XX?", CENTURY_ROMAN),
- of("I?", "00XX?", CENTURY_ROMAN),
- of("?I?", "00XX?", CENTURY_ROMAN),
- //Non matches
- of("saecI", null, null), //Without a dot a space is required
- of("secI", null, null), //Without a dot a space is required
- of("MDCLXX", null, null, null), // Not supported range
- of("IXX", null, null, null), // Invalid roman
-
- //PATTERN_ROMAN_RANGE
- //Uppercase
- of("I-II", "00XX/01XX", CENTURY_RANGE_ROMAN),
- of("II-III", "01XX/02XX", CENTURY_RANGE_ROMAN),
- of("III-IV", "02XX/03XX", CENTURY_RANGE_ROMAN),
- of("IV-V", "03XX/04XX", CENTURY_RANGE_ROMAN),
- of("V-VI", "04XX/05XX", CENTURY_RANGE_ROMAN),
- of("VI-VII", "05XX/06XX", CENTURY_RANGE_ROMAN),
- of("VII-VIII", "06XX/07XX", CENTURY_RANGE_ROMAN),
- of("VIII-IX", "07XX/08XX", CENTURY_RANGE_ROMAN),
- of("IX-X", "08XX/09XX", CENTURY_RANGE_ROMAN),
- of("X-XI", "09XX/10XX", CENTURY_RANGE_ROMAN),
- of("XI-XII", "10XX/11XX", CENTURY_RANGE_ROMAN),
- of("XII-XIII", "11XX/12XX", CENTURY_RANGE_ROMAN),
- of("XIII-XIV", "12XX/13XX", CENTURY_RANGE_ROMAN),
- of("XIV-XV", "13XX/14XX", CENTURY_RANGE_ROMAN),
- of("XV-XVI", "14XX/15XX", CENTURY_RANGE_ROMAN),
- of("XVI-XVII", "15XX/16XX", CENTURY_RANGE_ROMAN),
- of("XVII-XVIII", "16XX/17XX", CENTURY_RANGE_ROMAN),
- of("XVIII-XIX", "17XX/18XX", CENTURY_RANGE_ROMAN),
- of("XIX-XX", "18XX/19XX", CENTURY_RANGE_ROMAN),
- of("XX-XXI", "19XX/20XX", CENTURY_RANGE_ROMAN),
-
- //Lowercase
- of("i-ii", "00XX/01XX", CENTURY_RANGE_ROMAN),
- of("ii-iii", "01XX/02XX", CENTURY_RANGE_ROMAN),
- of("iii-iv", "02XX/03XX", CENTURY_RANGE_ROMAN),
- of("iv-v", "03XX/04XX", CENTURY_RANGE_ROMAN),
- of("v-vi", "04XX/05XX", CENTURY_RANGE_ROMAN),
- of("vi-vii", "05XX/06XX", CENTURY_RANGE_ROMAN),
- of("vii-viii", "06XX/07XX", CENTURY_RANGE_ROMAN),
- of("viii-ix", "07XX/08XX", CENTURY_RANGE_ROMAN),
- of("ix-x", "08XX/09XX", CENTURY_RANGE_ROMAN),
- of("x-xi", "09XX/10XX", CENTURY_RANGE_ROMAN),
- of("xi-xii", "10XX/11XX", CENTURY_RANGE_ROMAN),
- of("xii-xiii", "11XX/12XX", CENTURY_RANGE_ROMAN),
- of("xiii-xiv", "12XX/13XX", CENTURY_RANGE_ROMAN),
- of("xiv-xv", "13XX/14XX", CENTURY_RANGE_ROMAN),
- of("xv-xvi", "14XX/15XX", CENTURY_RANGE_ROMAN),
- of("xvi-xvii", "15XX/16XX", CENTURY_RANGE_ROMAN),
- of("xvii-xviii", "16XX/17XX", CENTURY_RANGE_ROMAN),
- of("xviii-xix", "17XX/18XX", CENTURY_RANGE_ROMAN),
- of("xix-xx", "18XX/19XX", CENTURY_RANGE_ROMAN),
- of("xx-xxi", "19XX/20XX", CENTURY_RANGE_ROMAN),
-
- //Prefixes
- of("s I-II", "00XX/01XX", CENTURY_RANGE_ROMAN),
- of("S I-II", "00XX/01XX", CENTURY_RANGE_ROMAN),
- of("s. I-II", "00XX/01XX", CENTURY_RANGE_ROMAN),
- of("S. I-II", "00XX/01XX", CENTURY_RANGE_ROMAN),
- of("sec.IV-VII", "03XX/06XX", CENTURY_RANGE_ROMAN),
- of("SEC.IV-VII", "03XX/06XX", CENTURY_RANGE_ROMAN),
- of("sec. IV-VII", "03XX/06XX", CENTURY_RANGE_ROMAN),
- of("SEC. IV-VII", "03XX/06XX", CENTURY_RANGE_ROMAN),
- of("saec.VII-XVIII", "06XX/17XX", CENTURY_RANGE_ROMAN),
- of("SAEC.VII-XVIII", "06XX/17XX", CENTURY_RANGE_ROMAN),
- of("saec. XVI-XVIII", "15XX/17XX", CENTURY_RANGE_ROMAN),
- of("SAEC. XVI-XVIII", "15XX/17XX", CENTURY_RANGE_ROMAN),
-
- //Other possibilities and uncertain
- of("s I-iI", "00XX/01XX", CENTURY_RANGE_ROMAN),
- of(" s I-II ", "00XX/01XX", CENTURY_RANGE_ROMAN),
- of("?saec.X-XVIII", "09XX?/17XX?", CENTURY_RANGE_ROMAN),
- of("X-XVIII?", "09XX?/17XX?", CENTURY_RANGE_ROMAN),
- of("?saec.X-XVIII?", "09XX?/17XX?", CENTURY_RANGE_ROMAN),
-
- //Non matches
- of("S. XIIII-XIIIV", null, null), //Invalid roman
- of("S. XVIII-", null, null, null), //Open-ended incorrect
- of("sII-V", null, null), //Without a dot a space is required
- of("secVI-XVII", null, null), //Without a dot a space is required
- of("saecX-XVIII?", null, null) //Without a dot a space is required
- );
- }
-
-}
\ No newline at end of file
diff --git a/metis-normalization/src/test/java/eu/europeana/normalization/dates/extraction/dateextractors/DcmiPeriodDateExtractorTest.java b/metis-normalization/src/test/java/eu/europeana/normalization/dates/extraction/dateextractors/DcmiPeriodDateExtractorTest.java
deleted file mode 100644
index 74670ea253..0000000000
--- a/metis-normalization/src/test/java/eu/europeana/normalization/dates/extraction/dateextractors/DcmiPeriodDateExtractorTest.java
+++ /dev/null
@@ -1,119 +0,0 @@
-package eu.europeana.normalization.dates.extraction.dateextractors;
-
-import static eu.europeana.normalization.dates.DateNormalizationExtractorMatchId.DCMI_PERIOD;
-import static eu.europeana.normalization.dates.edtf.DateQualification.NO_QUALIFICATION;
-import static org.junit.jupiter.api.Assertions.assertEquals;
-import static org.junit.jupiter.params.provider.Arguments.of;
-
-import eu.europeana.normalization.dates.DateNormalizationResult;
-import eu.europeana.normalization.dates.DateNormalizationResultStatus;
-import eu.europeana.normalization.dates.edtf.IntervalEdtfDate;
-import java.util.stream.Stream;
-import org.junit.jupiter.api.DisplayName;
-import org.junit.jupiter.params.ParameterizedTest;
-import org.junit.jupiter.params.provider.Arguments;
-import org.junit.jupiter.params.provider.MethodSource;
-
-/**
- * Unit tests for {@link DcmiPeriodDateExtractor} class
- */
-class DcmiPeriodDateExtractorTest {
-
- @ParameterizedTest
- @MethodSource("extractData")
- @DisplayName("Extract DCMI Period")
- void extract(String actualDcmiPeriod, String expectedLabel, String expectedStartDate, String expectedEndDate) {
- DcmiPeriodDateExtractor periodDateExtractor = new DcmiPeriodDateExtractor();
- DateNormalizationResult dateNormalizationResult = periodDateExtractor.extractDateProperty(actualDcmiPeriod, NO_QUALIFICATION);
- if (expectedStartDate == null || expectedEndDate == null) {
- assertEquals(DateNormalizationResultStatus.NO_MATCH, dateNormalizationResult.getDateNormalizationResultStatus());
- } else {
- IntervalEdtfDate interval = (IntervalEdtfDate) dateNormalizationResult.getEdtfDate();
- assertEquals(expectedLabel, interval.getLabel());
- assertEquals(expectedStartDate, interval.getStart() != null ? interval.getStart().toString() : null);
- assertEquals(expectedEndDate, interval.getEnd() != null ? interval.getEnd().toString() : null);
- assertEquals(DCMI_PERIOD, dateNormalizationResult.getDateNormalizationExtractorMatchId());
- }
- }
-
- private static Stream extractData() {
- return Stream.of(
- of("name=The Great Depression; start=1929; end=1939;",
- "The Great Depression", "1929", "1939"),
- of("name=Haagse International Arts Festival, 2000; start=2000-01-26; end=2000-02-20;",
- "Haagse International Arts Festival, 2000", "2000-01-26", "2000-02-20"),
- of("start=1998-09-25; end=1998-09-25; scheme=W3C-DTF;",
- null, "1998-09-25", "1998-09-25"),
- of("start=1998-09-25T14:20:00+10:00; scheme=W3C-DTF;",
- null, "1998-09-25", ".."),
- of("end=1998-09-25T16:40:00+10:00; scheme=W3C-DTF;",
- null, "..", "1998-09-25"),
- of("end=1998-09-25T16:40+10:00; start=1998/01/01 scheme=W3C-DTF;", null, "..", "1998-09-25"),
-
- //Scheme checks
- of("name=The Great Depression; start=1929; end=1939; scheme=W3CDTF;",
- "The Great Depression", "1929", "1939"),
- of("name=The Great Depression; start=1929; end=1939; scheme=W3C-DTF;",
- "The Great Depression", "1929", "1939"),
- of("scheme=W3C-DTF; name=The Great Depression; start=1929; end=1939;",
- "The Great Depression", "1929", "1939"),
- of("name=The Great Depression; start=1929; end=1939; scheme=W3C-DTF",
- "The Great Depression", "1929", "1939"),
- of("name=The Great Depression; start=1929; end=1939; scheme=W3C-", null, null, null),
-
- //double fields should be false
- of("name=The Great Depression; start=1929; end=1939; name=The Great Depression;", null, null, null),
- of("name=The Great Depression; start=1929; end=1939; start=1929;", null, null, null),
- of("name=The Great Depression; end=1939; start=1929; end=1939;", null, null, null),
-
- //Both start and end null then false
- of("name=The Great Depression; start=; end=;", null, null, null),
- of("name=The Great Depression;", null, null, null),
-
- //One end bounded
- of("name=The Great Depression; start=; end=1939;",
- "The Great Depression", "..", "1939"),
- of("name=The Great Depression; start=1929; end=;",
- "The Great Depression", "1929", ".."),
-
- //Full date
- of("name=Haagse International Arts Festival, 2000; start=2000-01-26; end=2000-02-20;",
- "Haagse International Arts Festival, 2000", "2000-01-26", "2000-02-20"),
-
- //Full date and time
- of("start=1999-09-25T14:20:00+10:00; end=1999-09-25T16:40:00+10:00; scheme=W3C-DTF;",
- null, "1999-09-25", "1999-09-25"),
- of("start=1999-09-25T14:20:00+10:00; scheme=W3C-DTF;",
- null, "1999-09-25", ".."),
- of("end=1999-09-25T16:40:00+10:00; scheme=W3C-DTF;",
- null, "..", "1999-09-25"),
-
- //Missing semicolon
- of("end=1998-09-25T16:40:00+10:00; start=1998 scheme=W3C-DTF;",
- null, "..", "1998-09-25"),
-
- //Invalid date
- of("end=1998-09-25T16:40+10:00; start=1998-1986; scheme=W3C-DTF;", null, null, null),
- //
- //Spaces at the end of the name are cleaned up
- of("name=The Great Depression ; start=1929; end=1939;",
- "The Great Depression", "1929", "1939"),
-
- //Spaces at the beginning of the name are cleaned up
- of("name= The Great Depression; start=1929; end=1939;",
- "The Great Depression", "1929", "1939"),
-
- //Name at the beginning without field name
- of("The Great Depression; start=1929; end=1939;",
- null, "1929", "1939"),
-
- //Name at the beginning without field name and spaces at wrapped
- of(" The Great Depression ; start=1929; end=1939;",
- null, "1929", "1939"),
-
- //Normal case
- of("name=The Great Depression; start=1929; end=1939;",
- "The Great Depression", "1929", "1939")
- );
- }
-}
diff --git a/metis-normalization/src/test/java/eu/europeana/normalization/dates/extraction/dateextractors/DecadeDateExtractorTest.java b/metis-normalization/src/test/java/eu/europeana/normalization/dates/extraction/dateextractors/DecadeDateExtractorTest.java
deleted file mode 100644
index fbdd350963..0000000000
--- a/metis-normalization/src/test/java/eu/europeana/normalization/dates/extraction/dateextractors/DecadeDateExtractorTest.java
+++ /dev/null
@@ -1,74 +0,0 @@
-package eu.europeana.normalization.dates.extraction.dateextractors;
-
-import static eu.europeana.normalization.dates.DateNormalizationExtractorMatchId.DECADE;
-import static eu.europeana.normalization.dates.edtf.DateQualification.NO_QUALIFICATION;
-import static org.junit.jupiter.api.Assertions.assertEquals;
-import static org.junit.jupiter.params.provider.Arguments.of;
-
-import eu.europeana.normalization.dates.DateNormalizationExtractorMatchId;
-import eu.europeana.normalization.dates.DateNormalizationResult;
-import eu.europeana.normalization.dates.DateNormalizationResultStatus;
-import eu.europeana.normalization.dates.edtf.DateQualification;
-import java.util.stream.Stream;
-import org.junit.jupiter.params.ParameterizedTest;
-import org.junit.jupiter.params.provider.Arguments;
-import org.junit.jupiter.params.provider.MethodSource;
-
-class DecadeDateExtractorTest {
-
- private static final DecadeDateExtractor DECADE_DATE_EXTRACTOR = new DecadeDateExtractor();
-
- @ParameterizedTest
- @MethodSource
- void extract(String input, String expected, DateNormalizationExtractorMatchId dateNormalizationExtractorMatchId) {
- final DateNormalizationResult dateNormalizationResult = DECADE_DATE_EXTRACTOR.extractDateProperty(input, NO_QUALIFICATION);
- if (expected == null) {
- assertEquals(DateNormalizationResultStatus.NO_MATCH, dateNormalizationResult.getDateNormalizationResultStatus());
- } else {
- final String actual = dateNormalizationResult.getEdtfDate().toString();
- assertEquals(expected, actual);
- assertEquals(actual.contains("?"),
- dateNormalizationResult.getEdtfDate().getDateQualification() == DateQualification.UNCERTAIN);
- assertEquals(dateNormalizationExtractorMatchId, dateNormalizationResult.getDateNormalizationExtractorMatchId());
- }
- }
-
- private static Stream extract() {
- return Stream.of(
- of("180x", "180X", DECADE),
- of("180u", "180X", DECADE),
- of("180X", "180X", DECADE),
- of("180U", "180X", DECADE),
- of(" 180u ", "180X", DECADE),
- of("180x?", "180X?", DECADE),
- of("180u?", "180X?", DECADE),
- of("180??", "180X?", DECADE),
- of("?180x", "180X?", DECADE),
- of("?180u", "180X?", DECADE),
- of("?180x?", "180X?", DECADE),
- of("?180u?", "180X?", DECADE),
- of("?180??", "180X?", DECADE),
-
- //Future dates not allowed
- of("222u", null, null),
- //This is an ambiguous case because hyphen can be used as a separator
- of("180-?", null, null),
- //Ambiguous, possible open end
- of("180-", null, null),
- of("180s", null, null),//Non u, x or ?
- of("180?", null, null), //Only one question mark not supported
- //Too many digits
- of("1800", null, null),
- of("?1280x", null, null),
- of("?1280u?", null, null),
- of("?1280??", null, null),
- of("1280??", null, null),
-
- of("18??", null, null), //Too few digits
- of("18--", null, null), //Too few digits
- of("18..", null, null), //Too few digits
- of("1...", null, null) //Too few digits
- );
- }
-
-}
\ No newline at end of file
diff --git a/metis-normalization/src/test/java/eu/europeana/normalization/dates/extraction/dateextractors/EdtfDateExtractorTest.java b/metis-normalization/src/test/java/eu/europeana/normalization/dates/extraction/dateextractors/EdtfDateExtractorTest.java
deleted file mode 100644
index b0fedbf806..0000000000
--- a/metis-normalization/src/test/java/eu/europeana/normalization/dates/extraction/dateextractors/EdtfDateExtractorTest.java
+++ /dev/null
@@ -1,319 +0,0 @@
-package eu.europeana.normalization.dates.extraction.dateextractors;
-
-import static eu.europeana.normalization.dates.edtf.DateBoundaryType.OPEN;
-import static eu.europeana.normalization.dates.edtf.DateBoundaryType.UNKNOWN;
-import static eu.europeana.normalization.dates.edtf.DateQualification.APPROXIMATE;
-import static eu.europeana.normalization.dates.edtf.DateQualification.NO_QUALIFICATION;
-import static eu.europeana.normalization.dates.edtf.DateQualification.UNCERTAIN;
-import static eu.europeana.normalization.dates.edtf.DateQualification.UNCERTAIN_APPROXIMATE;
-import static eu.europeana.normalization.dates.edtf.IntervalEdtfDate.DATE_INTERVAL_SEPARATOR;
-import static org.junit.jupiter.api.Assertions.assertEquals;
-import static org.junit.jupiter.params.provider.Arguments.of;
-
-import eu.europeana.normalization.dates.DateNormalizationResult;
-import eu.europeana.normalization.dates.DateNormalizationResultStatus;
-import eu.europeana.normalization.dates.edtf.AbstractEdtfDate;
-import eu.europeana.normalization.dates.edtf.InstantEdtfDate;
-import eu.europeana.normalization.dates.edtf.IntervalEdtfDate;
-import java.util.stream.Stream;
-import org.junit.jupiter.api.DisplayName;
-import org.junit.jupiter.params.ParameterizedTest;
-import org.junit.jupiter.params.provider.Arguments;
-import org.junit.jupiter.params.provider.MethodSource;
-
-class EdtfDateExtractorTest {
-
- private final EdtfDateExtractor edtfDateExtractor = new EdtfDateExtractor();
-
- // TODO: 01/03/2023 Possible reuse of the test code here for all extractors
- private void assertExtract(String input, String expected) {
- final DateNormalizationResult dateNormalizationResult = edtfDateExtractor.extractDateProperty(input, NO_QUALIFICATION);
- if (expected == null) {
- assertEquals(DateNormalizationResultStatus.NO_MATCH, dateNormalizationResult.getDateNormalizationResultStatus());
- } else {
- AbstractEdtfDate edtfDate = dateNormalizationResult.getEdtfDate();
- if (edtfDate instanceof IntervalEdtfDate) {
- String startPart = expected.substring(0, expected.indexOf(DATE_INTERVAL_SEPARATOR));
- String endPart = expected.substring(expected.indexOf(DATE_INTERVAL_SEPARATOR) + 1);
- InstantEdtfDate start = ((IntervalEdtfDate) edtfDate).getStart();
- InstantEdtfDate end = ((IntervalEdtfDate) edtfDate).getEnd();
- assertEdtfDate(startPart, start);
- assertEdtfDate(endPart, end);
- } else {
- assertEdtfDate(expected, (InstantEdtfDate) dateNormalizationResult.getEdtfDate());
- }
- assertEquals(expected, edtfDate.toString());
- }
- }
-
- private static void assertEdtfDate(String expected, InstantEdtfDate instantEdtfDate) {
- assertEquals(expected.contains("?"), instantEdtfDate.getDateQualification() == UNCERTAIN);
- assertEquals(expected.contains("~"), instantEdtfDate.getDateQualification() == APPROXIMATE);
- assertEquals(expected.contains("%"), instantEdtfDate.getDateQualification() == UNCERTAIN_APPROXIMATE);
- assertEquals(expected.equals(OPEN.getSerializedRepresentation()),
- instantEdtfDate.getDateBoundaryType() == OPEN || instantEdtfDate.getDateBoundaryType() == UNKNOWN);
- }
-
- @ParameterizedTest
- @MethodSource
- @DisplayName("[year][“-”][month][“-”][day] Complete representation")
- void completeDateRepresentationLevel0(String input, String expected) {
- assertExtract(input, expected);
- }
-
- @ParameterizedTest
- @MethodSource
- @DisplayName("[year][“-”][month] Reduced precision for year and month")
- void reducedPrecisionForYearAndMonthLevel0(String input, String expected) {
- assertExtract(input, expected);
- }
-
- @ParameterizedTest
- @MethodSource
- @DisplayName("[year] Reduced precision for year")
- void reducedPrecisionForYearLevel0(String input, String expected) {
- assertExtract(input, expected);
- }
-
- @ParameterizedTest
- @MethodSource
- void dateIntervalRepresentationLevel0(String input, String expected) {
- assertExtract(input, expected);
- }
-
- @ParameterizedTest
- @MethodSource
- @DisplayName("Letter-prefixed calendar year")
- void letterPrefixedCalendarYearLevel1(String input, String expected) {
- assertExtract(input, expected);
- }
-
- @ParameterizedTest
- @MethodSource
- @DisplayName("The characters '?', '~' and '%' are used to mean \"uncertain\", \"approximate\", and \"uncertain\" as well as \"approximate\", respectively")
- void dateQualificationLevel1(String input, String expected) {
- assertExtract(input, expected);
- }
-
- @ParameterizedTest
- @MethodSource
- @DisplayName("Negative Calendar Year")
- void negativeCalendarYearLevel1(String input, String expected) {
- assertExtract(input, expected);
- }
-
-
- @ParameterizedTest
- @MethodSource
- @DisplayName("Open time interval")
- void openTimeIntervalLevel1(String input, String expected) {
- assertExtract(input, expected);
- }
-
-
- @ParameterizedTest
- @MethodSource
- @DisplayName("Unknown time interval")
- void unknownTimeIntervalLevel1(String input, String expected) {
- assertExtract(input, expected);
- }
-
- private static Stream completeDateRepresentationLevel0() {
- return Stream.of(
- of("1989-11-01", "1989-11-01"),
- of("0989-11-01", "0989-11-01"),
- of("0989-11-01", "0989-11-01"),
- //Digits missing on year
- of("198-11-01", null),
- //Digits missing on month or day
- of("1989-11-1", null),
- of("1989-1-01", null),
- //Anything other than hyphen "-" is not valid
- of("1989/11/01", null),
-
- //Complete representations for calendar date and (local) time of day
- of("1989-11-01T23:59:59", "1989-11-01"),
- of("1989-11-01T23:59", "1989-11-01"),
- of("1989-11-01T23", "1989-11-01"),
- of("1989-11-01T", "1989-11-01"),
- of("1989-11-01T23:59:5", "1989-11-01"),
- of("1989-11-01T23:5:59", "1989-11-01"),
- of("1989-11-01t23:59:59", null),
- of("1989-11-01 23:59:59", null),
-
- //Complete representations for calendar date and UTC time of day
- of("1989-11-01T23:59:59Z", "1989-11-01"),
- of("1989-11-01t23:59:59Z", null),
- of("1989-11-01 23:59:59Z", null),
-
- //Date and time with time shift in hours (only)
- of("1989-11-01T23:59:59-04", "1989-11-01"),
- of("1989-11-01T23:59:59+04", "1989-11-01"),
- of("1989-11-01t23:59:59-04", null),
- of("1989-11-01 23:59:59-04", null),
-
- //Date and time with time shift in hours and minutes
- of("1989-11-01T23:59:59-04:44", "1989-11-01"),
- of("1989-11-01T23:59:59+04:44", "1989-11-01"),
- of("1989-11-01t23:59:59-04:44", null),
- of("1989-11-01 23:59:59-04:44", null)
- );
- }
-
- private static Stream reducedPrecisionForYearAndMonthLevel0() {
- return Stream.of(
- of("1989-11", "1989-11"),
- of("0989-11", "0989-11"),
- //Digits missing on year
- of("198-11", null),
- //Digits missing on month
- of("1989-1", null),
- //Anything other than hyphen "-" is not valid
- of("1989/11", null)
- );
- }
-
- private static Stream reducedPrecisionForYearLevel0() {
- return Stream.of(
- of("1989", "1989"),
- of("0989", "0989"),
- //Digits missing on year
- of("198", null)
- );
- }
-
- private static Stream dateIntervalRepresentationLevel0() {
- return Stream.of(
- of("1989/1990", "1989/1990"),
- of("1989-11/1990-11", "1989-11/1990-11"),
- of("1989-11-01/1990-11-01", "1989-11-01/1990-11-01"),
- of("1989-11-01/1990-11", "1989-11-01/1990-11"),
- of("1989-11-01/1990", "1989-11-01/1990"),
- of("1989/1990-11", "1989/1990-11"),
- of("1989/1990-11-01", "1989/1990-11-01"),
- of("1989-00/1990-00", null),
- of("1989-00-00/1990-00-00", null),
- //Spaces not valid
- of("1989 / 1990", null),
- //Dash not valid
- of("1989-1990", null),
- //Missing digits
- of("989-1990", null),
- of("1989-990", null)
- );
- }
-
- private static Stream letterPrefixedCalendarYearLevel1() {
- return Stream.of(
- //Future dates are not valid
- of("Y170000002", null),
- of("Y-170000002", "Y-170000002"),
- //Overflow, max is +-999999999
- of("Y1700000002", null),
- of("Y-1700000002", null),
- //Too low values
- of("Y0", null),
- of("Y1", null),
- of("Y-1", null),
- of("Y", null)
- );
- }
-
- private static Stream dateQualificationLevel1() {
- return Stream.of(
- of("1989?", "1989?"),
- of("1989~", "1989~"),
- of("1989-11?", "1989-11?"),
- of("1989-11~", "1989-11~"),
- of("1989-11-01%", "1989-11-01%")
- );
- }
-
- private static Stream negativeCalendarYearLevel1() {
- return Stream.of(
- of("-1989", "-1989"),
- of("-9999", "-9999"),
- of("-0989", "-0989"),
- of("-11989", null)
- );
- }
-
- private static Stream openTimeIntervalLevel1() {
- return Stream.of(
- //Open start
- of("../1989-11-01", "../1989-11-01"),
- of("../1989-11", "../1989-11"),
- of("../1989", "../1989"),
- of("../1989-11-01~", "../1989-11-01~"),
- of("../1989-11~", "../1989-11~"),
- of("../1989~", "../1989~"),
- of("../1989-11-01?", "../1989-11-01?"),
- of("../1989-11?", "../1989-11?"),
- of("../1989?", "../1989?"),
- of("../1989-11-01%", "../1989-11-01%"),
- of("../1989-11%", "../1989-11%"),
- of("../1989%", "../1989%"),
- of(".. / 1989-11-01", null),
- of("../ 1989-11-01", null),
- of(".. /1989-11-01", null),
-
- //Open end
- of("1989-11-01/..", "1989-11-01/.."),
- of("1989-11/..", "1989-11/.."),
- of("1989/..", "1989/.."),
- of("1989-11-01~/..", "1989-11-01~/.."),
- of("1989-11~/..", "1989-11~/.."),
- of("1989~/..", "1989~/.."),
- of("1989-11-01?/..", "1989-11-01?/.."),
- of("1989-11?/..", "1989-11?/.."),
- of("1989?/..", "1989?/.."),
- of("1989-11-01%/..", "1989-11-01%/.."),
- of("1989-11%/..", "1989-11%/.."),
- of("1989%/..", "1989%/.."),
- of("1989-11-01 / ..", null),
- of("1989-11-01 /..", null),
- of("1989-11-01/ ..", null),
- of("../..", null)
- );
- }
-
-
- private static Stream unknownTimeIntervalLevel1() {
- return Stream.of(
- //Unknown start
- of("/1989-11-01", "../1989-11-01"),
- of("/1989-11", "../1989-11"),
- of("/1989", "../1989"),
- of("/1989-11-01~", "../1989-11-01~"),
- of("/1989-11~", "../1989-11~"),
- of("/1989~", "../1989~"),
- of("/1989-11-01?", "../1989-11-01?"),
- of("/1989-11?", "../1989-11?"),
- of("/1989?", "../1989?"),
- of("/1989-11-01%", "../1989-11-01%"),
- of("/1989-11%", "../1989-11%"),
- of("/1989%", "../1989%"),
- of(" / 1989-11-01", null),
- of("/ 1989-11-01", null),
- of(" /1989-11-01", null),
-
- //Unknown end
- of("1989-11-01/", "1989-11-01/.."),
- of("1989-11/", "1989-11/.."),
- of("1989/", "1989/.."),
- of("1989-11-01~/", "1989-11-01~/.."),
- of("1989-11~/", "1989-11~/.."),
- of("1989~/", "1989~/.."),
- of("1989-11-01?/", "1989-11-01?/.."),
- of("1989-11?/", "1989-11?/.."),
- of("1989?/", "1989?/.."),
- of("1989-11-01%/", "1989-11-01%/.."),
- of("1989-11%/", "1989-11%/.."),
- of("1989%/", "1989%/.."),
- of("1989-11-01 / ", null),
- of("1989-11-01 /", null),
- of("1989-11-01/ ", null),
- of("/", null)
- );
- }
-}
\ No newline at end of file
diff --git a/metis-normalization/src/test/java/eu/europeana/normalization/dates/extraction/extractors/BcAdDateExtractorTest.java b/metis-normalization/src/test/java/eu/europeana/normalization/dates/extraction/extractors/BcAdDateExtractorTest.java
new file mode 100644
index 0000000000..bb1c60d1ac
--- /dev/null
+++ b/metis-normalization/src/test/java/eu/europeana/normalization/dates/extraction/extractors/BcAdDateExtractorTest.java
@@ -0,0 +1,134 @@
+package eu.europeana.normalization.dates.extraction.extractors;
+
+import static eu.europeana.normalization.dates.DateNormalizationExtractorMatchId.BC_AD;
+import static org.junit.jupiter.params.provider.Arguments.of;
+
+import eu.europeana.normalization.dates.DateNormalizationResult;
+import java.util.stream.Stream;
+import org.junit.jupiter.params.ParameterizedTest;
+import org.junit.jupiter.params.provider.Arguments;
+import org.junit.jupiter.params.provider.MethodSource;
+
+/**
+ * Unit tests for {@link BcAdDateExtractor}.
+ */
+class BcAdDateExtractorTest implements DateExtractorTest {
+
+  private static final BcAdDateExtractor BC_AD_DATE_EXTRACTOR = new BcAdDateExtractor();
+
+  // No factory method name is given, so JUnit looks up the static factory named like the test method: extract().
+  @ParameterizedTest
+  @MethodSource
+  void extract(String input, String expected) {
+    assertExtract(input, expected);
+  }
+
+  /**
+   * Runs the extractor on the given input and checks the result against the expectation, expecting the BC_AD match id.
+   *
+   * @param input the raw date value handed to the extractor
+   * @param expected the expected serialized date; the invalid cases of the data set pass {@code null}
+   */
+  void assertExtract(String input, String expected) {
+    final DateNormalizationResult dateNormalizationResult = BC_AD_DATE_EXTRACTOR.extractDateProperty(input);
+    assertDateNormalizationResult(dateNormalizationResult, expected, BC_AD);
+  }
+
+  /**
+   * Provides the input/expected pairs for {@link #extract(String, String)}.
+   * <p>In the BC expectations below the year magnitude is one less than the input year: 1 BC is expected as
+   * {@code 0000} and 2 BC as {@code -0001}.</p>
+   *
+   * @return the stream of test arguments
+   */
+  private static Stream<Arguments> extract() {
+    return Stream.of(
+        //Bulgarian
+        of("1989 пр.Хр.", "-1988"),
+        of("1989 сл.Хр.", "1989"),
+        //Croatian
+        of("1989 pr. Kr.", "-1988"),
+        of("1989 po. Kr.", "1989"),
+        //Czech
+        of("1989 př. n. l.", "-1988"),
+        of("1989 n. l.", "1989"),
+        //Danish
+        of("1989 f.Kr.", "-1988"),
+        of("1989 e.Kr.", "1989"),
+        //Dutch
+        of("1989 v.Chr.", "-1988"),
+        of("1989 n.Chr.", "1989"),
+        //English
+        of("1989 BC", "-1988"),
+        of("1989 AD", "1989"),
+        //Estonian
+        of("1989 eKr", "-1988"),
+        of("1989 pKr", "1989"),
+        //Finnish
+        of("1989 eKr.", "-1988"),
+        of("1989 jKr.", "1989"),
+        //French
+        of("1989 av. J.-C.", "-1988"),
+        of("1989 ap. J.-C.", "1989"),
+        //German
+        of("1989 v. Chr.", "-1988"),
+        of("1989 n. Chr.", "1989"),
+        //Greek
+        of("1989 π.Χ.", "-1988"),
+        of("1989 μ.Χ.", "1989"),
+        //Hungarian
+        of("1989 i. e.", "-1988"),
+        of("1989 i. sz.", "1989"),
+        //Irish
+        of("1989 RC", "-1988"),
+        of("1989 AD", "1989"),
+        //Italian
+        of("1989 a.C.", "-1988"),
+        of("1989 d.C.", "1989"),
+        //Latvian
+        of("1989 p.m.ē.", "-1988"),
+        of("1989 m.ē.", "1989"),
+        //Lithuanian
+        of("1989 pr. Kr.", "-1988"),
+        of("1989 po Kr.", "1989"),
+        //Maltese
+        of("1989 QK", "-1988"),
+        of("1989 WK", "1989"),
+        //Polish
+        of("1989 p.n.e.", "-1988"),
+        of("1989 n.e.", "1989"),
+        //Portuguese
+        of("1989 a.C.", "-1988"),
+        of("1989 d.C.", "1989"),
+        //Romanian
+        of("1989 î.Hr.", "-1988"),
+        of("1989 d.Hr.", "1989"),
+        //Slovak
+        of("1989 pred Kr.", "-1988"),
+        of("1989 po Kr.", "1989"),
+        //Slovenian
+        of("1989 pr. Kr.", "-1988"),
+        of("1989 po Kr.", "1989"),
+        //Spanish
+        of("1989 a. C.", "-1988"),
+        of("1989 d. C.", "1989"),
+        //Swedish
+        of("1989 f.Kr.", "-1988"),
+        of("1989 e.Kr.", "1989"),
+
+        //Fewer digits
+        of("198 AD", "0198"),
+        of("19 AD", "0019"),
+
+        //First years
+        of("1 AD", "0001"),
+        of("1 BC", "0000"),
+        of("2 BC", "-0001"),
+
+        //Invalids
+        of("0 BC", null),
+        of("-1989 BC", null),
+        of("-1989 AD", null)
+    );
+  }
+}
\ No newline at end of file
diff --git a/metis-normalization/src/test/java/eu/europeana/normalization/dates/extraction/extractors/BcAdRangeDateExtractorTest.java b/metis-normalization/src/test/java/eu/europeana/normalization/dates/extraction/extractors/BcAdRangeDateExtractorTest.java
new file mode 100644
index 0000000000..22c50ef142
--- /dev/null
+++ b/metis-normalization/src/test/java/eu/europeana/normalization/dates/extraction/extractors/BcAdRangeDateExtractorTest.java
@@ -0,0 +1,77 @@
+package eu.europeana.normalization.dates.extraction.extractors;
+
+import static eu.europeana.normalization.dates.DateNormalizationExtractorMatchId.BC_AD;
+import static org.junit.jupiter.params.provider.Arguments.of;
+
+import eu.europeana.normalization.dates.DateNormalizationResult;
+import java.util.stream.Stream;
+import org.junit.jupiter.params.ParameterizedTest;
+import org.junit.jupiter.params.provider.Arguments;
+import org.junit.jupiter.params.provider.MethodSource;
+
+/**
+ * Unit tests for {@link BcAdRangeDateExtractor} class
+ */
+class BcAdRangeDateExtractorTest implements DateExtractorTest {
+
+  private static final BcAdRangeDateExtractor BC_AD_RANGE_DATE_EXTRACTOR = new BcAdRangeDateExtractor();
+
+  @ParameterizedTest
+  @MethodSource
+  void extract(String input, String expected) {
+    assertExtract(input, expected);
+  }
+
+  /**
+   * Runs the extractor on the input and checks the result against the expected value and the BC_AD match id.
+   */
+  void assertExtract(String input, String expected) {
+    final DateNormalizationResult dateNormalizationResult = BC_AD_RANGE_DATE_EXTRACTOR.extractDateProperty(input);
+    assertDateNormalizationResult(dateNormalizationResult, expected, BC_AD);
+  }
+
+  /**
+   * Supplies the {@code (input, expected)} pairs; a {@code null} expected value stands for an input that must not match.
+   */
+  private static Stream<Arguments> extract() {
+    return Stream.of(
+        //BC-BC
+        of("1990 BC-1989 BC", "-1989/-1988"),
+        of("1990 BC/1989 BC", "-1989/-1988"),
+        of("1990 BC - 1989 BC", "-1989/-1988"),
+        of("1990 BC / 1989 BC", "-1989/-1988"),
+        of("1990 BC-1 BC", "-1989/0000"),
+
+        //BC-BC(Greek)
+        of("1990 π.Χ.-1989 π.Χ.", "-1989/-1988"),
+        of("1990 π.Χ./1989 π.Χ.", "-1989/-1988"),
+        of("1990 π.Χ. - 1989 π.Χ.", "-1989/-1988"),
+        of("1990 π.Χ. / 1989 π.Χ.", "-1989/-1988"),
+        of("1990 π.Χ.-1 π.Χ.", "-1989/0000"),
+
+        //AD-AD
+        of("1989 AD-1990 AD", "1989/1990"),
+        of("1989 AD/1990 AD", "1989/1990"),
+        of("1989 AD - 1990 AD", "1989/1990"),
+        of("1989 AD / 1990 AD", "1989/1990"),
+
+        //AD-AD(Greek)
+        of("1989 μ.Χ.-1990 μ.Χ.", "1989/1990"),
+        of("1989 μ.Χ./1990 μ.Χ.", "1989/1990"),
+        of("1989 μ.Χ. - 1990 μ.Χ.", "1989/1990"),
+        of("1989 μ.Χ. / 1990 μ.Χ.", "1989/1990"),
+
+        //BC-AD
+        of("1989 π.Χ.-1989 μ.Χ.", "-1988/1989"),
+        of("1989 π.Χ.-1 μ.Χ.", "-1988/0001"),
+
+        //Invalids
+        of("1990 BC//1989 BC", null),
+        of("-1990 BC-1989 BC", null),
+        of("-1990 BC--1989 BC", null),
+        of("1990 BC , 1989 BC", null),
+        of("1989 BC-0 BC", null),
+        of("1989 BC-0 AD", null)
+    );
+  }
+}
\ No newline at end of file
diff --git a/metis-normalization/src/test/java/eu/europeana/normalization/dates/extraction/extractors/BriefRangeDateExtractorTest.java b/metis-normalization/src/test/java/eu/europeana/normalization/dates/extraction/extractors/BriefRangeDateExtractorTest.java
new file mode 100644
index 0000000000..1995294e75
--- /dev/null
+++ b/metis-normalization/src/test/java/eu/europeana/normalization/dates/extraction/extractors/BriefRangeDateExtractorTest.java
@@ -0,0 +1,70 @@
+package eu.europeana.normalization.dates.extraction.extractors;
+
+import static eu.europeana.normalization.dates.DateNormalizationExtractorMatchId.BRIEF_DATE_RANGE;
+import static org.junit.jupiter.params.provider.Arguments.of;
+
+import eu.europeana.normalization.dates.DateNormalizationResult;
+import java.util.stream.Stream;
+import org.junit.jupiter.params.ParameterizedTest;
+import org.junit.jupiter.params.provider.Arguments;
+import org.junit.jupiter.params.provider.MethodSource;
+
+/**
+ * Unit tests for {@link BriefRangeDateExtractor} class
+ */
+class BriefRangeDateExtractorTest implements DateExtractorTest {
+
+  private static final BriefRangeDateExtractor BRIEF_RANGE_DATE_EXTRACTOR = new BriefRangeDateExtractor();
+
+  /**
+   * Runs the extractor on the input and checks the result against the expected value and the BRIEF_DATE_RANGE match id.
+   */
+  private void assertExtract(String input, String expected) {
+    final DateNormalizationResult dateNormalizationResult = BRIEF_RANGE_DATE_EXTRACTOR.extractDateProperty(input);
+    assertDateNormalizationResult(dateNormalizationResult, expected, BRIEF_DATE_RANGE);
+  }
+
+  @ParameterizedTest
+  @MethodSource
+  void extractBrief(String input, String expected) {
+    assertExtract(input, expected);
+  }
+
+  /**
+   * Supplies the {@code (input, expected)} pairs; a {@code null} expected value stands for an input that must not match.
+   */
+  private static Stream<Arguments> extractBrief() {
+    return Stream.of(
+        //Slash
+        of("1989/90", "1989/1990"),
+        of("1989/90?", "1989/1990?"),
+        of("?1989/90", "1989?/1990"),
+        of("?1989/90?", "1989?/1990?"),
+        of("-1989/-88", null),
+
+        //Dash
+        of("1989-90", "1989/1990"),
+        of("1989-90?", "1989/1990?"),
+        of("?1989-90", "1989?/1990"),
+        of("?1989-90?", "1989?/1990?"),
+        of("989-90", "0989/0990"),
+
+        //End year digits lower than or equal to the rightmost two digits of the start year
+        of("1989/89", null),
+        of("1989/88", null),
+        of("1989-89", null),
+        of("1989-88", null),
+
+        //More than two digits on end year not allowed
+        of("1989/990", null),
+        of("1989-990", null),
+
+        //End year digits lower than or equal to 12 are not allowed
+        of("1900/01", null),
+        of("1900/12", null),
+
+        //Less than three digits on start year
+        of("89-90", null)
+    );
+  }
+}
diff --git a/metis-normalization/src/test/java/eu/europeana/normalization/dates/extraction/extractors/CenturyNumericDateExtractorTest.java b/metis-normalization/src/test/java/eu/europeana/normalization/dates/extraction/extractors/CenturyNumericDateExtractorTest.java
new file mode 100644
index 0000000000..fa6a3b98a4
--- /dev/null
+++ b/metis-normalization/src/test/java/eu/europeana/normalization/dates/extraction/extractors/CenturyNumericDateExtractorTest.java
@@ -0,0 +1,69 @@
+package eu.europeana.normalization.dates.extraction.extractors;
+
+import static eu.europeana.normalization.dates.DateNormalizationExtractorMatchId.CENTURY_NUMERIC;
+import static org.junit.jupiter.params.provider.Arguments.of;
+
+import eu.europeana.normalization.dates.DateNormalizationExtractorMatchId;
+import eu.europeana.normalization.dates.DateNormalizationResult;
+import java.util.stream.Stream;
+import org.junit.jupiter.params.ParameterizedTest;
+import org.junit.jupiter.params.provider.Arguments;
+import org.junit.jupiter.params.provider.MethodSource;
+
+/**
+ * Unit tests for {@link CenturyNumericDateExtractor} class
+ */
+class CenturyNumericDateExtractorTest implements DateExtractorTest {
+
+  private static final CenturyNumericDateExtractor CENTURY_DATE_EXTRACTOR = new CenturyNumericDateExtractor();
+
+  /**
+   * Runs the extractor on the input and checks the result against the expected value and the supplied match id.
+   */
+  void assertExtract(String input, String expected, DateNormalizationExtractorMatchId dateNormalizationExtractorMatchId) {
+    final DateNormalizationResult dateNormalizationResult = CENTURY_DATE_EXTRACTOR.extractDateProperty(input);
+    assertDateNormalizationResult(dateNormalizationResult, expected, dateNormalizationExtractorMatchId);
+  }
+
+  @ParameterizedTest
+  @MethodSource
+  void extractNumeric(String input, String expected, DateNormalizationExtractorMatchId dateNormalizationExtractorMatchId) {
+    assertExtract(input, expected, dateNormalizationExtractorMatchId);
+  }
+
+  /**
+   * Supplies {@code (input, expected, match id)} triples, one value per test parameter; {@code null} expected means no match.
+   */
+  private static Stream<Arguments> extractNumeric() {
+    return Stream.of(
+        //PATTERN_YYYY
+        of("18..", "18XX", CENTURY_NUMERIC),
+        of(" 18.. ", "18XX", CENTURY_NUMERIC),
+        of("?18..", "18XX?", CENTURY_NUMERIC),
+        of("18..?", "18XX?", CENTURY_NUMERIC),
+        of("?18..?", "18XX?", CENTURY_NUMERIC),
+        of("192?", null, null), //Too many digits
+        of("1..", null, null), //Too few digits
+
+        //PATTERN_ENGLISH
+        of("1st century", "00XX", CENTURY_NUMERIC),
+        of("2nd century", "01XX", CENTURY_NUMERIC),
+        of("3rd century", "02XX", CENTURY_NUMERIC),
+        of("11th century", "10XX", CENTURY_NUMERIC),
+        of(" 11th century ", "10XX", CENTURY_NUMERIC),
+        of("?11th century", "10XX?", CENTURY_NUMERIC),
+        of("11th century?", "10XX?", CENTURY_NUMERIC),
+        of("?11th century?", "10XX?", CENTURY_NUMERIC),
+        of("12th century BC", null, null), // not supported
+        of("[10th century]", null, null), // not supported
+        of("11thcentury", null, null), //Incorrect spacing numeric
+        of("11st century", null, null), //Incorrect suffix
+        of("12rd century", null, null), //Incorrect suffix
+        of("13st century", null, null), //Incorrect suffix
+        of("21th century", null, null), //Incorrect suffix
+        of("0st century", null, null), //Out of range
+        of("22nd century", null, null) //Out of range
+    );
+  }
+
+}
\ No newline at end of file
diff --git a/metis-normalization/src/test/java/eu/europeana/normalization/dates/extraction/extractors/CenturyRomaDateExtractorTest.java b/metis-normalization/src/test/java/eu/europeana/normalization/dates/extraction/extractors/CenturyRomaDateExtractorTest.java
new file mode 100644
index 0000000000..84fba4cbfb
--- /dev/null
+++ b/metis-normalization/src/test/java/eu/europeana/normalization/dates/extraction/extractors/CenturyRomaDateExtractorTest.java
@@ -0,0 +1,102 @@
+package eu.europeana.normalization.dates.extraction.extractors;
+
+import static eu.europeana.normalization.dates.DateNormalizationExtractorMatchId.CENTURY_ROMAN;
+import static org.junit.jupiter.params.provider.Arguments.of;
+
+import eu.europeana.normalization.dates.DateNormalizationResult;
+import java.util.stream.Stream;
+import org.junit.jupiter.params.ParameterizedTest;
+import org.junit.jupiter.params.provider.Arguments;
+import org.junit.jupiter.params.provider.MethodSource;
+
+/**
+ * Unit tests for {@link CenturyRomanDateExtractor} class
+ */
+class CenturyRomaDateExtractorTest implements DateExtractorTest {
+
+  private static final CenturyRomanDateExtractor ROMAN_CENTURY_DATE_EXTRACTOR = new CenturyRomanDateExtractor();
+
+  /**
+   * Runs the extractor on the input and checks the result against the expected value and the CENTURY_ROMAN match id.
+   */
+  void assertExtract(String input, String expected) {
+    final DateNormalizationResult dateNormalizationResult = ROMAN_CENTURY_DATE_EXTRACTOR.extractDateProperty(input);
+    assertDateNormalizationResult(dateNormalizationResult, expected, CENTURY_ROMAN);
+  }
+
+  @ParameterizedTest
+  @MethodSource
+  void extractRoman(String input, String expected) {
+    assertExtract(input, expected);
+  }
+
+  /**
+   * Supplies the {@code (input, expected)} pairs; a {@code null} expected value stands for an input that must not match.
+   */
+  private static Stream<Arguments> extractRoman() {
+    return Stream.of(
+        //Uppercase
+        of("I", "00XX"),
+        of("IV", "03XX"),
+        of("V", "04XX"),
+        of("VI", "05XX"),
+        of("IX", "08XX"),
+        of("X", "09XX"),
+        of("XI", "10XX"),
+        of("XIV", "13XX"),
+        of("XV", "14XX"),
+        of("XVI", "15XX"),
+        of("XIX", "18XX"),
+        of("XX", "19XX"),
+        of("XXI", "20XX"),
+
+        //Lower case
+        of("i", "00XX"),
+        of("iv", "03XX"),
+        of("v", "04XX"),
+        of("vi", "05XX"),
+        of("ix", "08XX"),
+        of("x", "09XX"),
+        of("xi", "10XX"),
+        of("xiv", "13XX"),
+        of("xv", "14XX"),
+        of("xvi", "15XX"),
+        of("xix", "18XX"),
+        of("xx", "19XX"),
+        of("xxi", "20XX"),
+
+        //Prefixes
+        of("s I", "00XX"),
+        of("s. I", "00XX"),
+        of("S I", "00XX"),
+        of("S.I", "00XX"),
+        of("sec.I", "00XX"),
+        of("SEC.I", "00XX"),
+        of("sec. I", "00XX"),
+        of("SEC. I", "00XX"),
+        of("saec.I", "00XX"),
+        of("SAEC.I", "00XX"),
+        of("saec. I", "00XX"),
+        of("SAEC. I", "00XX"),
+        //Other possibilities and uncertain
+        of("Ii", "01XX"),
+        of(" s I ", "00XX"),
+        of("?s. I", "00XX?"),
+        of("sec. I?", "00XX?"),
+        of("?saec. I?", "00XX?"),
+        of(" I ", "00XX"),
+        of("?I", "00XX?"),
+        of("I?", "00XX?"),
+        of("?I?", "00XX?"),
+        //Non matches
+        //Without a dot a space is required
+        of("saecI", null),
+        of("secI", null),
+        // Not supported range
+        of("MDCLXX", null),
+        // Invalid roman
+        of("IXX", null)
+    );
+  }
+
+}
\ No newline at end of file
diff --git a/metis-normalization/src/test/java/eu/europeana/normalization/dates/extraction/extractors/CenturyRomanRangeDateExtractorTest.java b/metis-normalization/src/test/java/eu/europeana/normalization/dates/extraction/extractors/CenturyRomanRangeDateExtractorTest.java
new file mode 100644
index 0000000000..1a8474a613
--- /dev/null
+++ b/metis-normalization/src/test/java/eu/europeana/normalization/dates/extraction/extractors/CenturyRomanRangeDateExtractorTest.java
@@ -0,0 +1,112 @@
+package eu.europeana.normalization.dates.extraction.extractors;
+
+import static org.junit.jupiter.params.provider.Arguments.of;
+
+import eu.europeana.normalization.dates.DateNormalizationExtractorMatchId;
+import eu.europeana.normalization.dates.DateNormalizationResult;
+import java.util.stream.Stream;
+import org.junit.jupiter.params.ParameterizedTest;
+import org.junit.jupiter.params.provider.Arguments;
+import org.junit.jupiter.params.provider.MethodSource;
+
+/**
+ * Unit tests for {@link CenturyRomanRangeDateExtractor} class
+ */
+class CenturyRomanRangeDateExtractorTest implements DateExtractorTest {
+
+  private static final CenturyRomanRangeDateExtractor ROMAN_CENTURY_RANGE_DATE_EXTRACTOR = new CenturyRomanRangeDateExtractor();
+
+  /**
+   * Runs the extractor on the input and checks the result against the expected value and the CENTURY_RANGE_ROMAN match id.
+   */
+  void assertExtract(String input, String expected) {
+    final DateNormalizationResult dateNormalizationResult = ROMAN_CENTURY_RANGE_DATE_EXTRACTOR.extractDateProperty(input);
+    assertDateNormalizationResult(dateNormalizationResult, expected, DateNormalizationExtractorMatchId.CENTURY_RANGE_ROMAN);
+  }
+
+  @ParameterizedTest
+  @MethodSource
+  void extractRoman(String input, String expected) {
+    assertExtract(input, expected);
+  }
+
+  /**
+   * Supplies {@code (input, expected)} pairs, one value per test parameter; a {@code null} expected value means no match.
+   */
+  private static Stream<Arguments> extractRoman() {
+    return Stream.of(
+        //Uppercase
+        of("I-II", "00XX/01XX"),
+        of("II-III", "01XX/02XX"),
+        of("III-IV", "02XX/03XX"),
+        of("IV-V", "03XX/04XX"),
+        of("V-VI", "04XX/05XX"),
+        of("VI-VII", "05XX/06XX"),
+        of("VII-VIII", "06XX/07XX"),
+        of("VIII-IX", "07XX/08XX"),
+        of("IX-X", "08XX/09XX"),
+        of("X-XI", "09XX/10XX"),
+        of("XI-XII", "10XX/11XX"),
+        of("XII-XIII", "11XX/12XX"),
+        of("XIII-XIV", "12XX/13XX"),
+        of("XIV-XV", "13XX/14XX"),
+        of("XV-XVI", "14XX/15XX"),
+        of("XVI-XVII", "15XX/16XX"),
+        of("XVII-XVIII", "16XX/17XX"),
+        of("XVIII-XIX", "17XX/18XX"),
+        of("XIX-XX", "18XX/19XX"),
+        of("XX-XXI", "19XX/20XX"),
+
+        //Lowercase
+        of("i-ii", "00XX/01XX"),
+        of("ii-iii", "01XX/02XX"),
+        of("iii-iv", "02XX/03XX"),
+        of("iv-v", "03XX/04XX"),
+        of("v-vi", "04XX/05XX"),
+        of("vi-vii", "05XX/06XX"),
+        of("vii-viii", "06XX/07XX"),
+        of("viii-ix", "07XX/08XX"),
+        of("ix-x", "08XX/09XX"),
+        of("x-xi", "09XX/10XX"),
+        of("xi-xii", "10XX/11XX"),
+        of("xii-xiii", "11XX/12XX"),
+        of("xiii-xiv", "12XX/13XX"),
+        of("xiv-xv", "13XX/14XX"),
+        of("xv-xvi", "14XX/15XX"),
+        of("xvi-xvii", "15XX/16XX"),
+        of("xvii-xviii", "16XX/17XX"),
+        of("xviii-xix", "17XX/18XX"),
+        of("xix-xx", "18XX/19XX"),
+        of("xx-xxi", "19XX/20XX"),
+
+        //Prefixes
+        of("s I-II", "00XX/01XX"),
+        of("S I-II", "00XX/01XX"),
+        of("s. I-II", "00XX/01XX"),
+        of("S. I-II", "00XX/01XX"),
+        of("sec.IV-VII", "03XX/06XX"),
+        of("SEC.IV-VII", "03XX/06XX"),
+        of("sec. IV-VII", "03XX/06XX"),
+        of("SEC. IV-VII", "03XX/06XX"),
+        of("saec.VII-XVIII", "06XX/17XX"),
+        of("SAEC.VII-XVIII", "06XX/17XX"),
+        of("saec. XVI-XVIII", "15XX/17XX"),
+        of("SAEC. XVI-XVIII", "15XX/17XX"),
+
+        //Other possibilities and uncertain
+        of("s I-iI", "00XX/01XX"),
+        of(" s I-II ", "00XX/01XX"),
+        of("?saec.X-XVIII", "09XX?/17XX"),
+        of("X-XVIII?", "09XX/17XX?"),
+        of("?saec.X-XVIII?", "09XX?/17XX?"),
+
+        //Non matches
+        of("S. XIIII-XIIIV", null), //Invalid roman
+        of("S. XVIII-", null), //Open-ended incorrect
+        of("sII-V", null), //Without a dot a space is required
+        of("secVI-XVII", null), //Without a dot a space is required
+        of("saecX-XVIII?", null) //Without a dot a space is required
+    );
+  }
+
+}
\ No newline at end of file
diff --git a/metis-normalization/src/test/java/eu/europeana/normalization/dates/extraction/extractors/DateExtractorTest.java b/metis-normalization/src/test/java/eu/europeana/normalization/dates/extraction/extractors/DateExtractorTest.java
new file mode 100644
index 0000000000..e6c0ee4930
--- /dev/null
+++ b/metis-normalization/src/test/java/eu/europeana/normalization/dates/extraction/extractors/DateExtractorTest.java
@@ -0,0 +1,98 @@
+package eu.europeana.normalization.dates.extraction.extractors;
+
+import static eu.europeana.normalization.dates.edtf.DateBoundaryType.OPEN;
+import static eu.europeana.normalization.dates.edtf.DateBoundaryType.UNKNOWN;
+import static eu.europeana.normalization.dates.edtf.DateQualification.APPROXIMATE;
+import static eu.europeana.normalization.dates.edtf.DateQualification.UNCERTAIN;
+import static eu.europeana.normalization.dates.extraction.DefaultDatesSeparator.SLASH_DELIMITER;
+import static org.junit.jupiter.api.Assertions.assertEquals;
+
+import eu.europeana.normalization.dates.DateNormalizationExtractorMatchId;
+import eu.europeana.normalization.dates.DateNormalizationResult;
+import eu.europeana.normalization.dates.DateNormalizationResultStatus;
+import eu.europeana.normalization.dates.edtf.AbstractEdtfDate;
+import eu.europeana.normalization.dates.edtf.InstantEdtfDate;
+import eu.europeana.normalization.dates.edtf.IntervalEdtfDate;
+
+/**
+ * Assertions shared by the date extractor unit tests.
+ */
+public interface DateExtractorTest {
+
+  /**
+   * Checks that the qualification character in the expected value ('?', '~' or '%') agrees with the date qualifications.
+   *
+   * @param expected the expected serialized date
+   * @param instantEdtfDate the date to inspect
+   */
+  default void assertQualification(String expected, InstantEdtfDate instantEdtfDate) {
+    final boolean uncertain = instantEdtfDate.getDateQualifications().contains(UNCERTAIN);
+    final boolean approximate = instantEdtfDate.getDateQualifications().contains(APPROXIMATE);
+    assertEquals(expected.contains("?"), uncertain && !approximate);
+    assertEquals(expected.contains("~"), approximate && !uncertain);
+    assertEquals(expected.contains("%"), uncertain && approximate);
+  }
+
+  /**
+   * Checks that the expected value is the serialized open boundary exactly when the date boundary is open or unknown.
+   *
+   * @param expected the expected serialized date
+   * @param instantEdtfDate the date to inspect
+   */
+  default void assertBoundaryType(String expected, InstantEdtfDate instantEdtfDate) {
+    final boolean openExpected = expected.equals(OPEN.getSerializedRepresentation());
+    final boolean openOrUnknown =
+        instantEdtfDate.getDateBoundaryType() == OPEN || instantEdtfDate.getDateBoundaryType() == UNKNOWN;
+    assertEquals(openExpected, openOrUnknown);
+  }
+
+  /**
+   * Same as the three argument variant and, when a match is expected, also verifies the label of the date.
+   *
+   * @param dateNormalizationResult the result to verify
+   * @param expected the expected serialized date, or null when no match is expected
+   * @param dateNormalizationExtractorMatchId the expected match id
+   * @param expectedLabel the expected label
+   */
+  default void assertDateNormalizationResult(DateNormalizationResult dateNormalizationResult, String expected,
+      DateNormalizationExtractorMatchId dateNormalizationExtractorMatchId, String expectedLabel) {
+    assertDateNormalizationResult(dateNormalizationResult, expected, dateNormalizationExtractorMatchId);
+    if (expected == null) {
+      return;
+    }
+    assertEquals(expectedLabel, dateNormalizationResult.getEdtfDate().getLabel());
+  }
+
+  /**
+   * Verifies a result: a null expected value requires the NO_MATCH status, otherwise the match id, the qualifications,
+   * the boundary types (intervals only) and the serialized date are compared.
+   *
+   * @param dateNormalizationResult the result to verify
+   * @param expected the expected serialized date, or null when no match is expected
+   * @param dateNormalizationExtractorMatchId the expected match id
+   */
+  default void assertDateNormalizationResult(DateNormalizationResult dateNormalizationResult, String expected,
+      DateNormalizationExtractorMatchId dateNormalizationExtractorMatchId) {
+    if (expected == null) {
+      assertEquals(DateNormalizationResultStatus.NO_MATCH, dateNormalizationResult.getDateNormalizationResultStatus());
+      return;
+    }
+    assertEquals(dateNormalizationExtractorMatchId, dateNormalizationResult.getDateNormalizationExtractorMatchId());
+    final AbstractEdtfDate edtfDate = dateNormalizationResult.getEdtfDate();
+    if (edtfDate instanceof IntervalEdtfDate) {
+      final IntervalEdtfDate interval = (IntervalEdtfDate) edtfDate;
+      final int delimiterIndex = expected.indexOf(SLASH_DELIMITER.getStringRepresentation());
+      final String startPart = expected.substring(0, delimiterIndex);
+      final String endPart = expected.substring(delimiterIndex + 1);
+      final InstantEdtfDate start = interval.getStart();
+      final InstantEdtfDate end = interval.getEnd();
+      assertQualification(startPart, start);
+      assertQualification(endPart, end);
+      assertBoundaryType(startPart, start);
+      assertBoundaryType(endPart, end);
+    } else {
+      assertQualification(expected, (InstantEdtfDate) edtfDate);
+    }
+    assertEquals(expected, edtfDate.toString());
+  }
+}
diff --git a/metis-normalization/src/test/java/eu/europeana/normalization/dates/extraction/extractors/DcmiPeriodDateExtractorTest.java b/metis-normalization/src/test/java/eu/europeana/normalization/dates/extraction/extractors/DcmiPeriodDateExtractorTest.java
new file mode 100644
index 0000000000..0254045f14
--- /dev/null
+++ b/metis-normalization/src/test/java/eu/europeana/normalization/dates/extraction/extractors/DcmiPeriodDateExtractorTest.java
@@ -0,0 +1,93 @@
+package eu.europeana.normalization.dates.extraction.extractors;
+
+import static org.junit.jupiter.params.provider.Arguments.of;
+
+import eu.europeana.normalization.dates.DateNormalizationExtractorMatchId;
+import eu.europeana.normalization.dates.DateNormalizationResult;
+import java.util.stream.Stream;
+import org.junit.jupiter.api.DisplayName;
+import org.junit.jupiter.params.ParameterizedTest;
+import org.junit.jupiter.params.provider.Arguments;
+import org.junit.jupiter.params.provider.MethodSource;
+
+/**
+ * Unit tests for {@link DcmiPeriodDateExtractor} class
+ */
+class DcmiPeriodDateExtractorTest implements DateExtractorTest {
+
+  private static final DcmiPeriodDateExtractor DCMI_PERIOD_DATE_EXTRACTOR = new DcmiPeriodDateExtractor();
+
+  @ParameterizedTest
+  @MethodSource("extractData")
+  @DisplayName("Extract DCMI Period")
+  void extract(String input, String expected, String expectedLabel) {
+    DateNormalizationResult dateNormalizationResult = DCMI_PERIOD_DATE_EXTRACTOR.extractDateProperty(input);
+    assertDateNormalizationResult(dateNormalizationResult, expected, DateNormalizationExtractorMatchId.DCMI_PERIOD,
+        expectedLabel);
+  }
+
+  /**
+   * Supplies {@code (input, expected, expected label)} triples, one value per test parameter; {@code null} expected means no match.
+   */
+  private static Stream<Arguments> extractData() {
+    return Stream.of(
+        of("name=The Great Depression; start=1929; end=1939;", "1929/1939", "The Great Depression"),
+        of("name=Haagse International Arts Festival, 2000; start=2000-01-26; end=2000-02-20;",
+            "2000-01-26/2000-02-20", "Haagse International Arts Festival, 2000"),
+        of("start=1998-09-25; end=1998-09-25; scheme=W3C-DTF;", "1998-09-25/1998-09-25", null),
+        of("start=1998-09-25T14:20:00+10:00; scheme=W3C-DTF;", "1998-09-25/..", null),
+        of("end=1998-09-25T16:40:00+10:00; scheme=W3C-DTF;", "../1998-09-25", null),
+        of("end=1998-09-25T16:40+10:00; start=1998/01/01 scheme=W3C-DTF;", "../1998-09-25", null),
+
+        //Scheme checks
+        of("name=The Great Depression; start=1929; end=1939; scheme=W3CDTF;", "1929/1939", "The Great Depression"),
+        of("name=The Great Depression; start=1929; end=1939; scheme=W3C-DTF;", "1929/1939", "The Great Depression"),
+        of("scheme=W3C-DTF; name=The Great Depression; start=1929; end=1939;", "1929/1939", "The Great Depression"),
+        of("name=The Great Depression; start=1929; end=1939; scheme=W3C-DTF", "1929/1939", "The Great Depression"),
+        of("name=The Great Depression; start=1929; end=1939; scheme=W3C-", null, null),
+
+        //Duplicated fields must not match
+        of("name=The Great Depression; start=1929; end=1939; name=The Great Depression;", null, null),
+        of("name=The Great Depression; start=1929; end=1939; start=1929;", null, null),
+        of("name=The Great Depression; end=1939; start=1929; end=1939;", null, null),
+
+        //Both start and end missing must not match
+        of("name=The Great Depression; start=; end=;", null, null),
+        of("name=The Great Depression;", null, null),
+
+        //One end bounded
+        of("name=The Great Depression; start=; end=1939;", "../1939", "The Great Depression"),
+        of("name=The Great Depression; start=1929; end=;", "1929/..", "The Great Depression"),
+
+        //Full date
+        of("name=Haagse International Arts Festival, 2000; start=2000-01-26; end=2000-02-20;",
+            "2000-01-26/2000-02-20", "Haagse International Arts Festival, 2000"),
+
+        //Full date and time
+        of("start=1999-09-25T14:20:00+10:00; end=1999-09-25T16:40:00+10:00; scheme=W3C-DTF;", "1999-09-25/1999-09-25", null),
+        of("start=1999-09-25T14:20:00+10:00; scheme=W3C-DTF;", "1999-09-25/..", null),
+        of("end=1999-09-25T16:40:00+10:00; scheme=W3C-DTF;", "../1999-09-25", null),
+
+        //Missing semicolon
+        of("end=1998-09-25T16:40:00+10:00; start=1998 scheme=W3C-DTF;", "../1998-09-25", null),
+
+        //Invalid date
+        of("end=1998-09-25T16:40+10:00; start=1998-1986; scheme=W3C-DTF;", null, null),
+
+        //Spaces at the end of the name are cleaned up
+        of("name=The Great Depression ; start=1929; end=1939;", "1929/1939", "The Great Depression"),
+
+        //Spaces at the beginning of the name are cleaned up
+        of("name= The Great Depression; start=1929; end=1939;", "1929/1939", "The Great Depression"),
+
+        //Name at the beginning without field name
+        of("The Great Depression; start=1929; end=1939;", "1929/1939", null),
+
+        //Name at the beginning without field name and wrapped in spaces
+        of(" The Great Depression ; start=1929; end=1939;", "1929/1939", null),
+
+        //Normal case
+        of("name=The Great Depression; start=1929; end=1939;", "1929/1939", "The Great Depression")
+    );
+  }
+}
diff --git a/metis-normalization/src/test/java/eu/europeana/normalization/dates/extraction/extractors/DecadeDateExtractorTest.java b/metis-normalization/src/test/java/eu/europeana/normalization/dates/extraction/extractors/DecadeDateExtractorTest.java
new file mode 100644
index 0000000000..3201cad090
--- /dev/null
+++ b/metis-normalization/src/test/java/eu/europeana/normalization/dates/extraction/extractors/DecadeDateExtractorTest.java
@@ -0,0 +1,69 @@
+package eu.europeana.normalization.dates.extraction.extractors;
+
+import static org.junit.jupiter.params.provider.Arguments.of;
+
+import eu.europeana.normalization.dates.DateNormalizationExtractorMatchId;
+import eu.europeana.normalization.dates.DateNormalizationResult;
+import java.util.stream.Stream;
+import org.junit.jupiter.params.ParameterizedTest;
+import org.junit.jupiter.params.provider.Arguments;
+import org.junit.jupiter.params.provider.MethodSource;
+
+/**
+ * Unit tests for {@link DecadeDateExtractor} class
+ */
+class DecadeDateExtractorTest implements DateExtractorTest {
+
+  private static final DecadeDateExtractor DECADE_DATE_EXTRACTOR = new DecadeDateExtractor();
+
+  @ParameterizedTest
+  @MethodSource
+  void extract(String input, String expected) {
+    final DateNormalizationResult dateNormalizationResult = DECADE_DATE_EXTRACTOR.extractDateProperty(input);
+    assertDateNormalizationResult(dateNormalizationResult, expected, DateNormalizationExtractorMatchId.DECADE);
+  }
+
+  /**
+   * Supplies the {@code (input, expected)} pairs; a {@code null} expected value stands for an input that must not match.
+   */
+  private static Stream<Arguments> extract() {
+    return Stream.of(
+        of("180x", "180X"),
+        of("180u", "180X"),
+        of("180X", "180X"),
+        of("180U", "180X"),
+        of(" 180u ", "180X"),
+        of("180x?", "180X?"),
+        of("180u?", "180X?"),
+        of("180??", "180X?"),
+        of("?180x", "180X?"),
+        of("?180u", "180X?"),
+        of("?180x?", "180X?"),
+        of("?180u?", "180X?"),
+        of("?180??", "180X?"),
+
+        //Future dates not allowed
+        of("222u", null),
+        //This is an ambiguous case because hyphen can be used as a separator
+        of("180-?", null),
+        //Ambiguous, possible open end
+        of("180-", null),
+        //Non u, x or ?
+        of("180s", null),
+        //Only one question mark not supported
+        of("180?", null),
+        //Too many digits
+        of("1800", null),
+        of("?1280x", null),
+        of("?1280u?", null),
+        of("?1280??", null),
+        of("1280??", null),
+
+        of("18??", null), //Too few digits
+        of("18--", null), //Too few digits
+        of("18..", null), //Too few digits
+        of("1...", null) //Too few digits
+    );
+  }
+
+}
\ No newline at end of file
diff --git a/metis-normalization/src/test/java/eu/europeana/normalization/dates/extraction/extractors/EdtfDateExtractorTest.java b/metis-normalization/src/test/java/eu/europeana/normalization/dates/extraction/extractors/EdtfDateExtractorTest.java
new file mode 100644
index 0000000000..e9f60f3453
--- /dev/null
+++ b/metis-normalization/src/test/java/eu/europeana/normalization/dates/extraction/extractors/EdtfDateExtractorTest.java
@@ -0,0 +1,173 @@
+package eu.europeana.normalization.dates.extraction.extractors;
+
+import static org.junit.jupiter.params.provider.Arguments.of;
+
+import eu.europeana.normalization.dates.DateNormalizationExtractorMatchId;
+import eu.europeana.normalization.dates.DateNormalizationResult;
+import java.util.stream.Stream;
+import org.junit.jupiter.api.DisplayName;
+import org.junit.jupiter.params.ParameterizedTest;
+import org.junit.jupiter.params.provider.Arguments;
+import org.junit.jupiter.params.provider.MethodSource;
+
+/**
+ * Unit tests for {@link EdtfDateExtractor} class
+ */
+class EdtfDateExtractorTest implements DateExtractorTest {
+
+  private static final EdtfDateExtractor EDTF_DATE_EXTRACTOR = new EdtfDateExtractor();
+
+  /**
+   * Runs the extractor on the input and checks the result against the expected value and the EDTF match id.
+   */
+  private void assertExtract(String input, String expected) {
+    final DateNormalizationResult dateNormalizationResult = EDTF_DATE_EXTRACTOR.extractDateProperty(input);
+    assertDateNormalizationResult(dateNormalizationResult, expected, DateNormalizationExtractorMatchId.EDTF);
+  }
+
+  @ParameterizedTest
+  @MethodSource
+  @DisplayName("[year][“-”][month][“-”][day] Complete representation")
+  void completeDateRepresentationLevel0(String input, String expected) {
+    assertExtract(input, expected);
+  }
+
+  @ParameterizedTest
+  @MethodSource
+  @DisplayName("[year][“-”][month] Reduced precision for year and month")
+  void reducedPrecisionForYearAndMonthLevel0(String input, String expected) {
+    assertExtract(input, expected);
+  }
+
+  @ParameterizedTest
+  @MethodSource
+  @DisplayName("[year] Reduced precision for year")
+  void reducedPrecisionForYearLevel0(String input, String expected) {
+    assertExtract(input, expected);
+  }
+
+  @ParameterizedTest
+  @MethodSource
+  @DisplayName("Letter-prefixed calendar year")
+  void letterPrefixedCalendarYearLevel1(String input, String expected) {
+    assertExtract(input, expected);
+  }
+
+  @ParameterizedTest
+  @MethodSource
+  @DisplayName("The characters '?', '~' and '%' are used to mean \"uncertain\", \"approximate\", and \"uncertain\" as well as \"approximate\", respectively")
+  void dateQualificationLevel1(String input, String expected) {
+    assertExtract(input, expected);
+  }
+
+  @ParameterizedTest
+  @MethodSource
+  @DisplayName("Negative Calendar Year")
+  void negativeCalendarYearLevel1(String input, String expected) {
+    assertExtract(input, expected);
+  }
+
+  private static Stream<Arguments> completeDateRepresentationLevel0() {
+    return Stream.of(
+        of("1989-11-01", "1989-11-01"),
+        of("0989-11-01", "0989-11-01"),
+        of("0989-11-01", "0989-11-01"), // NOTE(review): exact duplicate of the previous case - confirm whether another value was intended
+        //Digits missing on year
+        of("198-11-01", null),
+        //Digits missing on month or day
+        of("1989-11-1", null),
+        of("1989-1-01", null),
+        //Anything other than hyphen "-" is not valid
+        of("1989/11/01", null),
+
+        //Complete representations for calendar date and (local) time of day
+        of("1989-11-01T23:59:59", "1989-11-01"),
+        of("1989-11-01T23:59", "1989-11-01"),
+        of("1989-11-01T23", "1989-11-01"),
+        of("1989-11-01T", "1989-11-01"),
+        of("1989-11-01T23:59:5", "1989-11-01"),
+        of("1989-11-01T23:5:59", "1989-11-01"),
+        of("1989-11-01t23:59:59", null),
+        of("1989-11-01 23:59:59", null),
+
+        //Complete representations for calendar date and UTC time of day
+        of("1989-11-01T23:59:59Z", "1989-11-01"),
+        of("1989-11-01t23:59:59Z", null),
+        of("1989-11-01 23:59:59Z", null),
+
+        //Date and time with time shift in hours (only)
+        of("1989-11-01T23:59:59-04", "1989-11-01"),
+        of("1989-11-01T23:59:59+04", "1989-11-01"),
+        of("1989-11-01t23:59:59-04", null),
+        of("1989-11-01 23:59:59-04", null),
+
+        //Date and time with time shift in hours and minutes
+        of("1989-11-01T23:59:59-04:44", "1989-11-01"),
+        of("1989-11-01T23:59:59+04:44", "1989-11-01"),
+        of("1989-11-01t23:59:59-04:44", null),
+        of("1989-11-01 23:59:59-04:44", null)
+    );
+  }
+
+  private static Stream<Arguments> reducedPrecisionForYearAndMonthLevel0() {
+    return Stream.of(
+        of("1989-11", "1989-11"),
+        of("0989-11", "0989-11"),
+        //Digits missing on year
+        of("198-11", null),
+        //Digits missing on month
+        of("1989-1", null),
+        //Anything other than hyphen "-" is not valid
+        of("1989/11", null)
+    );
+  }
+
+  private static Stream<Arguments> reducedPrecisionForYearLevel0() {
+    return Stream.of(
+        of("1989", "1989"),
+        of("0989", "0989"),
+        //Digits missing on year
+        of("198", null)
+    );
+  }
+
+  private static Stream<Arguments> letterPrefixedCalendarYearLevel1() {
+    return Stream.of(
+        of("Y-123456789", "Y-123456789"),
+        //Non prefixed
+        of("-123456789", null),
+        //Future dates are not valid
+        of("Y123456789", null),
+        //Month and day not valid
+        of("Y123456789/11/01", null),
+        //Overflow, max is +-999999999
+        of("Y1234567890", null),
+        of("Y-1234567890", null),
+        //Too low values
+        of("Y0", null),
+        of("Y1", null),
+        of("Y-1", null),
+        of("Y", null),
+        of("YnonValidNumber", null)
+    );
+  }
+
+  private static Stream<Arguments> dateQualificationLevel1() {
+    return Stream.of(
+        of("1989?", "1989?"),
+        of("1989~", "1989~"),
+        of("1989-11?", "1989-11?"),
+        of("1989-11~", "1989-11~"),
+        of("1989-11-01%", "1989-11-01%")
+    );
+  }
+
+  private static Stream<Arguments> negativeCalendarYearLevel1() {
+    return Stream.of(
+        of("-1989", "-1989"),
+        of("-9999", "-9999"),
+        of("-0989", "-0989"),
+        of("-11989", null)
+    );
+  }
+}
\ No newline at end of file
diff --git a/metis-normalization/src/test/java/eu/europeana/normalization/dates/extraction/extractors/EdtfRangeDateExtractorTest.java b/metis-normalization/src/test/java/eu/europeana/normalization/dates/extraction/extractors/EdtfRangeDateExtractorTest.java
new file mode 100644
index 0000000000..f7b7cb45fb
--- /dev/null
+++ b/metis-normalization/src/test/java/eu/europeana/normalization/dates/extraction/extractors/EdtfRangeDateExtractorTest.java
@@ -0,0 +1,155 @@
+package eu.europeana.normalization.dates.extraction.extractors;
+
+import static org.junit.jupiter.params.provider.Arguments.of;
+
+import eu.europeana.normalization.dates.DateNormalizationExtractorMatchId;
+import eu.europeana.normalization.dates.DateNormalizationResult;
+import java.util.stream.Stream;
+import org.junit.jupiter.api.DisplayName;
+import org.junit.jupiter.params.ParameterizedTest;
+import org.junit.jupiter.params.provider.Arguments;
+import org.junit.jupiter.params.provider.MethodSource;
+
+class EdtfRangeDateExtractorTest implements DateExtractorTest {
+  //Rows are (input, expected EDTF string); a null expectation presumably means "no match" - confirm in DateExtractorTest
+  private static final EdtfRangeDateExtractor EDTF_RANGE_DATE_EXTRACTOR = new EdtfRangeDateExtractor();
+
+  private void assertExtract(String input, String expected) {
+    final DateNormalizationResult result = EDTF_RANGE_DATE_EXTRACTOR.extractDateProperty(input);
+    assertDateNormalizationResult(result, expected, DateNormalizationExtractorMatchId.EDTF);
+  }
+
+  @ParameterizedTest
+  @MethodSource
+  void dateIntervalRepresentationLevel0(String input, String expected) {
+    assertExtract(input, expected);
+  }
+
+  @ParameterizedTest
+  @MethodSource
+  @DisplayName("Letter-prefixed calendar year interval")
+  void letterPrefixedCalendarYearIntervalLevel1(String input, String expected) {
+    assertExtract(input, expected);
+  }
+
+  @ParameterizedTest
+  @MethodSource
+  @DisplayName("Open time interval")
+  void openTimeIntervalLevel1(String input, String expected) {
+    assertExtract(input, expected);
+  }
+
+  @ParameterizedTest
+  @MethodSource
+  @DisplayName("Unknown time interval")
+  void unknownTimeIntervalLevel1(String input, String expected) {
+    assertExtract(input, expected);
+  }
+
+  private static Stream<Arguments> dateIntervalRepresentationLevel0() {
+    return Stream.of(
+        of("1989/1990", "1989/1990"),
+        of("1989-11/1990-11", "1989-11/1990-11"),
+        of("1989-11-01/1990-11-01", "1989-11-01/1990-11-01"),
+        of("1989-11-01/1990-11", "1989-11-01/1990-11"),
+        of("1989-11-01/1990", "1989-11-01/1990"),
+        of("1989/1990-11", "1989/1990-11"),
+        of("1989/1990-11-01", "1989/1990-11-01"),
+        of("1989-00/1990-00", null),
+        of("1989-00-00/1990-00-00", null),
+        of("1989 / 1990", "1989/1990"),
+        //A dash is not valid as interval separator
+        of("1989-1990", null),
+        //Missing digits on one of the years
+        of("989-1990", null),
+        of("1989-990", null)
+    );
+  }
+
+  private static Stream<Arguments> letterPrefixedCalendarYearIntervalLevel1() {
+    return Stream.of(
+        of("Y-123456789/Y-123456788", "Y-123456789/Y-123456788"),
+        //Missing the "Y" prefix
+        of("-123456789/-123456788", null)
+    );
+  }
+
+  private static Stream<Arguments> openTimeIntervalLevel1() {
+    return Stream.of(
+        //Open start
+        of("../1989-11-01", "../1989-11-01"),
+        of("../1989-11", "../1989-11"),
+        of("../1989", "../1989"),
+        of("../1989-11-01~", "../1989-11-01~"),
+        of("../1989-11~", "../1989-11~"),
+        of("../1989~", "../1989~"),
+        of("../1989-11-01?", "../1989-11-01?"),
+        of("../1989-11?", "../1989-11?"),
+        of("../1989?", "../1989?"),
+        of("../1989-11-01%", "../1989-11-01%"),
+        of("../1989-11%", "../1989-11%"),
+        of("../1989%", "../1989%"),
+        of(".. / 1989-11-01", "../1989-11-01"),
+        of("../ 1989-11-01", "../1989-11-01"),
+        of(".. /1989-11-01", "../1989-11-01"),
+
+        //Open end
+        of("1989-11-01/..", "1989-11-01/.."),
+        of("1989-11/..", "1989-11/.."),
+        of("1989/..", "1989/.."),
+        of("1989-11-01~/..", "1989-11-01~/.."),
+        of("1989-11~/..", "1989-11~/.."),
+        of("1989~/..", "1989~/.."),
+        of("1989-11-01?/..", "1989-11-01?/.."),
+        of("1989-11?/..", "1989-11?/.."),
+        of("1989?/..", "1989?/.."),
+        of("1989-11-01%/..", "1989-11-01%/.."),
+        of("1989-11%/..", "1989-11%/.."),
+        of("1989%/..", "1989%/.."),
+        of("1989-11-01 / ..", "1989-11-01/.."),
+        of("1989-11-01 /..", "1989-11-01/.."),
+        of("1989-11-01/ ..", "1989-11-01/.."),
+        of("../..", null)
+    );
+  }
+
+  //An empty boundary ("/x" or "x/") is expected to be rendered with the open marker ".." on that side
+  private static Stream<Arguments> unknownTimeIntervalLevel1() {
+    return Stream.of(
+        //Unknown start
+        of("/1989-11-01", "../1989-11-01"),
+        of("/1989-11", "../1989-11"),
+        of("/1989", "../1989"),
+        of("/1989-11-01~", "../1989-11-01~"),
+        of("/1989-11~", "../1989-11~"),
+        of("/1989~", "../1989~"),
+        of("/1989-11-01?", "../1989-11-01?"),
+        of("/1989-11?", "../1989-11?"),
+        of("/1989?", "../1989?"),
+        of("/1989-11-01%", "../1989-11-01%"),
+        of("/1989-11%", "../1989-11%"),
+        of("/1989%", "../1989%"),
+        of(" / 1989-11-01", "../1989-11-01"),
+        of("/ 1989-11-01", "../1989-11-01"),
+        of(" /1989-11-01", "../1989-11-01"),
+
+        //Unknown end
+        of("1989-11-01/", "1989-11-01/.."),
+        of("1989-11/", "1989-11/.."),
+        of("1989/", "1989/.."),
+        of("1989-11-01~/", "1989-11-01~/.."),
+        of("1989-11~/", "1989-11~/.."),
+        of("1989~/", "1989~/.."),
+        of("1989-11-01?/", "1989-11-01?/.."),
+        of("1989-11?/", "1989-11?/.."),
+        of("1989?/", "1989?/.."),
+        of("1989-11-01%/", "1989-11-01%/.."),
+        of("1989-11%/", "1989-11%/.."),
+        of("1989%/", "1989%/.."),
+        of("1989-11-01 / ", "1989-11-01/.."),
+        of("1989-11-01 /", "1989-11-01/.."),
+        of("1989-11-01/ ", "1989-11-01/.."),
+        of("/", null)
+    );
+  }
+}
\ No newline at end of file
diff --git a/metis-normalization/src/test/java/eu/europeana/normalization/dates/extraction/extractors/FullDateDateExtractorTest.java b/metis-normalization/src/test/java/eu/europeana/normalization/dates/extraction/extractors/FullDateDateExtractorTest.java
new file mode 100644
index 0000000000..3c65e22d47
--- /dev/null
+++ b/metis-normalization/src/test/java/eu/europeana/normalization/dates/extraction/extractors/FullDateDateExtractorTest.java
@@ -0,0 +1,51 @@
+package eu.europeana.normalization.dates.extraction.extractors;
+
+import static org.junit.jupiter.params.provider.Arguments.of;
+
+import eu.europeana.normalization.dates.DateNormalizationExtractorMatchId;
+import eu.europeana.normalization.dates.DateNormalizationResult;
+import java.util.stream.Stream;
+import org.junit.jupiter.params.ParameterizedTest;
+import org.junit.jupiter.params.provider.Arguments;
+import org.junit.jupiter.params.provider.MethodSource;
+
+class FullDateDateExtractorTest implements DateExtractorTest {
+  //Rows are (input, expected EDTF string); a null expectation presumably means "no match" - confirm in DateExtractorTest
+  private static final FullDateDateExtractor FULL_DATE_DATE_EXTRACTOR = new FullDateDateExtractor();
+
+  @ParameterizedTest
+  @MethodSource
+  void extract(String input, String expected) {
+    assertExtract(input, expected);
+  }
+
+  void assertExtract(String input, String expected) {
+    final DateNormalizationResult result = FULL_DATE_DATE_EXTRACTOR.extractDateProperty(input);
+    assertDateNormalizationResult(result, expected, DateNormalizationExtractorMatchId.FORMATTED_FULL_DATE);
+  }
+
+  private static Stream<Arguments> extract() {
+    return Stream.of(
+        of("Wed Nov 01 01:00:00 CEST 1989", "1989-11-01"),
+        of("Τετ Νοε 01 01:00:00 CEST 1989", "1989-11-01"),
+        of("1989-11-01 04:05:06 UTC", "1989-11-01"),
+        of("1989-11-01 04:05:06 UTC+01", "1989-11-01"),
+        of("1989-11-01 04:05:06 UTC-01", "1989-11-01"),
+        of("1989-11-01 01:02:03 UTC", "1989-11-01"),
+        of("1989-11-01 01:02:03", "1989-11-01"),
+        of("1989-11-01 01:02:03.1", "1989-11-01"),
+        of("1989-11-01 01:02:03.12", "1989-11-01"),
+        of("1989-11-01 01:02:03.123", "1989-11-01"),
+
+        //Invalids
+        of("Wed Nov 01 01:00:00 CEST", null), //year missing
+        of("Wed Nov 01 01:00:00", null), //zone and year missing
+        of("1989-11-01 01:02:03.1234", null), //more than three fractional-second digits
+        of("1989-11-01 01:02:03+01", null),
+        of("1989-11-01 01:02", null), //seconds missing
+        of("1989-11-01 01", null), //minutes and seconds missing
+        of("1989-11-01", null) //no time part at all
+    );
+  }
+
+}
\ No newline at end of file
diff --git a/metis-normalization/src/test/java/eu/europeana/normalization/dates/extraction/extractors/LongNegativeYearDateExtractorTest.java b/metis-normalization/src/test/java/eu/europeana/normalization/dates/extraction/extractors/LongNegativeYearDateExtractorTest.java
new file mode 100644
index 0000000000..54461b6e83
--- /dev/null
+++ b/metis-normalization/src/test/java/eu/europeana/normalization/dates/extraction/extractors/LongNegativeYearDateExtractorTest.java
@@ -0,0 +1,43 @@
+package eu.europeana.normalization.dates.extraction.extractors;
+
+import static org.junit.jupiter.params.provider.Arguments.of;
+
+import eu.europeana.normalization.dates.DateNormalizationExtractorMatchId;
+import eu.europeana.normalization.dates.DateNormalizationResult;
+import java.util.stream.Stream;
+import org.junit.jupiter.params.ParameterizedTest;
+import org.junit.jupiter.params.provider.Arguments;
+import org.junit.jupiter.params.provider.MethodSource;
+
+class LongNegativeYearDateExtractorTest implements DateExtractorTest {
+  //Rows are (input, expected EDTF string); a null expectation presumably means "no match" - confirm in DateExtractorTest
+  private static final LongNegativeYearDateExtractor LONG_NEGATIVE_YEAR_DATE_EXTRACTOR = new LongNegativeYearDateExtractor();
+
+  @ParameterizedTest
+  @MethodSource
+  void extract(String input, String expected) {
+    assertExtract(input, expected);
+  }
+
+  void assertExtract(String input, String expected) {
+    final DateNormalizationResult result = LONG_NEGATIVE_YEAR_DATE_EXTRACTOR.extractDateProperty(input);
+    assertDateNormalizationResult(result, expected, DateNormalizationExtractorMatchId.LONG_NEGATIVE_YEAR);
+  }
+
+  private static Stream<Arguments> extract() {
+    return Stream.of(
+        of("-12345", "Y-12345"),
+        of("-123456", "Y-123456"),
+        of("-1234567", "Y-1234567"),
+        of("-12345678", "Y-12345678"),
+        of("-123456789", "Y-123456789"),
+
+        //Future dates are not valid
+        of("123456789", null),
+        //Fewer than five digits
+        of("-1234", null),
+        //More than nine digits
+        of("-1234567890", null)
+    );
+  }
+}
\ No newline at end of file
diff --git a/metis-normalization/src/test/java/eu/europeana/normalization/dates/extraction/extractors/LongNegativeYearRangeDateExtractorTest.java b/metis-normalization/src/test/java/eu/europeana/normalization/dates/extraction/extractors/LongNegativeYearRangeDateExtractorTest.java
new file mode 100644
index 0000000000..df28e26cc7
--- /dev/null
+++ b/metis-normalization/src/test/java/eu/europeana/normalization/dates/extraction/extractors/LongNegativeYearRangeDateExtractorTest.java
@@ -0,0 +1,50 @@
+package eu.europeana.normalization.dates.extraction.extractors;
+
+import static org.junit.jupiter.params.provider.Arguments.of;
+
+import eu.europeana.normalization.dates.DateNormalizationExtractorMatchId;
+import eu.europeana.normalization.dates.DateNormalizationResult;
+import java.util.stream.Stream;
+import org.junit.jupiter.params.ParameterizedTest;
+import org.junit.jupiter.params.provider.Arguments;
+import org.junit.jupiter.params.provider.MethodSource;
+
+class LongNegativeYearRangeDateExtractorTest implements DateExtractorTest {
+  //Rows are (input, expected EDTF string); a null expectation presumably means "no match" - confirm in DateExtractorTest
+  private static final LongNegativeYearRangeDateExtractor LONG_NEGATIVE_YEAR_RANGE_DATE_EXTRACTOR = new LongNegativeYearRangeDateExtractor();
+
+  @ParameterizedTest
+  @MethodSource
+  void extract(String input, String expected) {
+    assertExtract(input, expected);
+  }
+
+  void assertExtract(String input, String expected) {
+    final DateNormalizationResult result = LONG_NEGATIVE_YEAR_RANGE_DATE_EXTRACTOR.extractDateProperty(input);
+    assertDateNormalizationResult(result, expected, DateNormalizationExtractorMatchId.LONG_NEGATIVE_YEAR);
+  }
+
+  private static Stream<Arguments> extract() {
+    return Stream.of(
+        of("-12345/-12344", "Y-12345/Y-12344"),
+        of("-123456/-123455", "Y-123456/Y-123455"),
+        of("-1234567/-1234566", "Y-1234567/Y-1234566"),
+        of("-12345678/-12345677", "Y-12345678/Y-12345677"),
+        of("-123456789/-123456788", "Y-123456789/Y-123456788"),
+
+        //A dash is not accepted as range separator
+        of("-12345--12344", null),
+        of("-123456--123455", null),
+        of("-1234567--1234566", null),
+        of("-12345678--12345677", null),
+        of("-123456789--123456788", null),
+
+        //Future dates are not valid
+        of("123456788/123456789", null),
+        //Fewer than five digits
+        of("-1234/-1233", null),
+        //More than nine digits
+        of("-1234567890/-1234567889", null)
+    );
+  }
+}
\ No newline at end of file
diff --git a/metis-normalization/src/test/java/eu/europeana/normalization/dates/extraction/extractors/MonthNameDateExtractorTest.java b/metis-normalization/src/test/java/eu/europeana/normalization/dates/extraction/extractors/MonthNameDateExtractorTest.java
new file mode 100644
index 0000000000..441cab3bb6
--- /dev/null
+++ b/metis-normalization/src/test/java/eu/europeana/normalization/dates/extraction/extractors/MonthNameDateExtractorTest.java
@@ -0,0 +1,173 @@
+package eu.europeana.normalization.dates.extraction.extractors;
+
+import static org.junit.jupiter.params.provider.Arguments.of;
+
+import eu.europeana.normalization.dates.DateNormalizationExtractorMatchId;
+import eu.europeana.normalization.dates.DateNormalizationResult;
+import java.util.stream.Stream;
+import org.junit.jupiter.params.ParameterizedTest;
+import org.junit.jupiter.params.provider.Arguments;
+import org.junit.jupiter.params.provider.MethodSource;
+
+class MonthNameDateExtractorTest implements DateExtractorTest {
+  //Rows are (input, expected EDTF string); a null expectation presumably means "no match" - confirm in DateExtractorTest
+  private static final MonthNameDateExtractor MONTH_NAME_DATE_EXTRACTOR = new MonthNameDateExtractor();
+
+  @ParameterizedTest
+  @MethodSource
+  void extractDayMonthYear(String input, String expected) {
+    assertExtract(input, expected);
+  }
+
+  @ParameterizedTest
+  @MethodSource
+  void extractMonthDayYear(String input, String expected) {
+    assertExtract(input, expected);
+  }
+
+  @ParameterizedTest
+  @MethodSource
+  void extractMonthYear(String input, String expected) {
+    assertExtract(input, expected);
+  }
+
+  void assertExtract(String input, String expected) {
+    final DateNormalizationResult result = MONTH_NAME_DATE_EXTRACTOR.extractDateProperty(input);
+    assertDateNormalizationResult(result, expected, DateNormalizationExtractorMatchId.MONTH_NAME);
+  }
+
+  private static Stream<Arguments> extractDayMonthYear() {
+    return Stream.of(
+        of("01 November 1989", "1989-11-01"),
+        of("32 November 1989", null),
+        of("01.November.1989", "1989-11-01"),
+        of("01,November,1989", "1989-11-01"),
+        //Combination of separators
+        of("01 November.1989", "1989-11-01"),
+        of("01 November,1989", "1989-11-01"),
+        of("01.November 1989", "1989-11-01"),
+        of("01.November,1989", "1989-11-01"),
+        of("01,November 1989", "1989-11-01"),
+        of("01,November.1989", "1989-11-01"),
+
+        //Some other languages or name formats
+        of("01 nov. 1989", "1989-11-01"),
+        of("01 ное 1989", "1989-11-01"),
+        of("01 Νοεμβρίου 1989", "1989-11-01"),
+        of("01 January 1989", "1989-01-01"),
+        of("01 Νοεμβρίου 1989", "1989-11-01"),
+        of("01 νοεμβρίου 1989", "1989-11-01"),
+        of("01 ΝΟΕΜΒΡΊΟΥ 1989", "1989-11-01"),
+        //Italian
+        of("01 Novembre 1989", "1989-11-01"),
+
+        //Invalid day and year; the month itself is fine
+        of("99 November 9989", null),
+        of("99 November 9989", null),
+
+        //Too few digits on year
+        of("1 January 989", null),
+        of("1.January.989", null),
+        of("1,January,989", null),
+        //Too many digits on year
+        of("01 January 12345", null),
+        //Too many digits on day
+        of("123 January 1234", null),
+
+        //Other invalids
+        //Double spaces should not match
+        of("1989  November  01", null),
+        //Double dots should not match
+        of("1989..November..01", null),
+        //Double commas should not match
+        of("1989,,November,,01", null)
+    );
+  }
+
+  private static Stream<Arguments> extractMonthDayYear() {
+    //Month name first, then day, then year
+    return Stream.of(
+        of("November 01 1989", "1989-11-01"),
+        of("November 32 1989", null),
+        of("November.01.1989", "1989-11-01"),
+        of("November,01,1989", "1989-11-01"),
+        //Combination of separators
+        of("November 01.1989", "1989-11-01"),
+        of("November 01,1989", "1989-11-01"),
+        of("November.01 1989", "1989-11-01"),
+        of("November.01,1989", "1989-11-01"),
+        of("November,01 1989", "1989-11-01"),
+        of("November,01.1989", "1989-11-01"),
+
+        //Some other languages or name formats
+        of("nov. 01 1989", "1989-11-01"),
+        of("ное 01 1989", "1989-11-01"),
+        of("January 01 1989", "1989-01-01"),
+        of("Νοεμβρίου 01 1989", "1989-11-01"),
+        of("νοεμβρίου 01 1989", "1989-11-01"),
+        of("ΝΟΕΜΒΡΊΟΥ 01 1989", "1989-11-01"),
+        //Italian
+        of("Novembre 01 1989", "1989-11-01"),
+
+        //Invalid day and year; the month itself is fine
+        of("November 99 9989", null),
+        of("November 99 9989", null),
+
+        //Too few digits on year
+        of("January 1 989", null),
+        of("January.1.989", null),
+        of("January,1,989", null),
+        //Too many digits on year
+        of("January 01 12345", null),
+        //Too many digits on day
+        of("January 123 1234", null),
+
+        //Other invalids
+        //Double spaces should not match
+        of("November  01  1989", null),
+        //Double dots should not match
+        of("November..01..1989", null),
+        //Double commas should not match
+        of("November,,01,,1989", null)
+    );
+  }
+
+  private static Stream<Arguments> extractMonthYear() {
+    //Month name followed by the year, no day part
+    return Stream.of(
+        //MONTH-YEAR
+        of("November 1989", "1989-11"),
+        of("November.1989", "1989-11"),
+        of("November,1989", "1989-11"),
+
+        //Some other languages or name formats
+        of("nov. 1989", "1989-11"),
+        of("ное 1989", "1989-11"),
+        of("January 1989", "1989-01"),
+        of("Νοεμβρίου 1989", "1989-11"),
+        of("νοεμβρίου 1989", "1989-11"),
+        of("ΝΟΕΜΒΡΊΟΥ 1989", "1989-11"),
+        //Italian
+        of("Novembre 1989", "1989-11"),
+
+        //Incorrect month year
+        of("November 9989", null),
+        of("November 9989", null),
+        //Too few digits on year
+        of("January 989", null),
+        of("January.989", null),
+        of("January,989", null),
+        //Too many digits on year
+        of("January 12345", null),
+
+        //Other invalids
+        //Double spaces should not match
+        of("November  1989", null),
+        //Double dots should not match
+        of("November..1989", null),
+        //Double commas should not match
+        of("November,,1989", null)
+    );
+  }
+
+}
diff --git a/metis-normalization/src/test/java/eu/europeana/normalization/dates/extraction/dateextractors/NumericPartsDateExtractorTest.java b/metis-normalization/src/test/java/eu/europeana/normalization/dates/extraction/extractors/NumericPartsDateExtractorTest.java
similarity index 92%
rename from metis-normalization/src/test/java/eu/europeana/normalization/dates/extraction/dateextractors/NumericPartsDateExtractorTest.java
rename to metis-normalization/src/test/java/eu/europeana/normalization/dates/extraction/extractors/NumericPartsDateExtractorTest.java
index 0aaa161ea8..09eb56d96f 100644
--- a/metis-normalization/src/test/java/eu/europeana/normalization/dates/extraction/dateextractors/NumericPartsDateExtractorTest.java
+++ b/metis-normalization/src/test/java/eu/europeana/normalization/dates/extraction/extractors/NumericPartsDateExtractorTest.java
@@ -1,22 +1,18 @@
-package eu.europeana.normalization.dates.extraction.dateextractors;
+package eu.europeana.normalization.dates.extraction.extractors;
import static eu.europeana.normalization.dates.DateNormalizationExtractorMatchId.NUMERIC_ALL_VARIANTS;
import static eu.europeana.normalization.dates.DateNormalizationExtractorMatchId.NUMERIC_ALL_VARIANTS_XX;
-import static eu.europeana.normalization.dates.DateNormalizationExtractorMatchId.YYYY_MM_DD_SPACES;
-import static eu.europeana.normalization.dates.edtf.DateQualification.NO_QUALIFICATION;
-import static org.junit.jupiter.api.Assertions.assertEquals;
+import static eu.europeana.normalization.dates.DateNormalizationExtractorMatchId.NUMERIC_SPACES_VARIANT;
import static org.junit.jupiter.params.provider.Arguments.of;
import eu.europeana.normalization.dates.DateNormalizationExtractorMatchId;
import eu.europeana.normalization.dates.DateNormalizationResult;
-import eu.europeana.normalization.dates.DateNormalizationResultStatus;
-import eu.europeana.normalization.dates.edtf.DateQualification;
import java.util.stream.Stream;
import org.junit.jupiter.params.ParameterizedTest;
import org.junit.jupiter.params.provider.Arguments;
import org.junit.jupiter.params.provider.MethodSource;
-class NumericPartsDateExtractorTest {
+class NumericPartsDateExtractorTest implements DateExtractorTest {
private static final NumericPartsDateExtractor NUMERIC_PARTS_DATE_EXTRACTOR = new NumericPartsDateExtractor();
@@ -47,25 +43,15 @@ void extractDMY_XX(String input, String expected) {
@ParameterizedTest
@MethodSource
void extractDateSpaces(String input, String expected) {
- assertExtract(input, expected, YYYY_MM_DD_SPACES);
+ assertExtract(input, expected, NUMERIC_SPACES_VARIANT);
}
void assertExtract(String input, String expected, DateNormalizationExtractorMatchId dateNormalizationExtractorMatchId) {
- final DateNormalizationResult dateNormalizationResult = NUMERIC_PARTS_DATE_EXTRACTOR.extractDateProperty(input,
- NO_QUALIFICATION);
- if (expected == null) {
- assertEquals(DateNormalizationResultStatus.NO_MATCH, dateNormalizationResult.getDateNormalizationResultStatus());
- } else {
- final String actual = dateNormalizationResult.getEdtfDate().toString();
- assertEquals(expected, actual);
- assertEquals(actual.contains("?"),
- dateNormalizationResult.getEdtfDate().getDateQualification() == DateQualification.UNCERTAIN);
- assertEquals(dateNormalizationExtractorMatchId, dateNormalizationResult.getDateNormalizationExtractorMatchId());
- }
+ final DateNormalizationResult dateNormalizationResult = NUMERIC_PARTS_DATE_EXTRACTOR.extractDateProperty(input);
+ assertDateNormalizationResult(dateNormalizationResult, expected, dateNormalizationExtractorMatchId);
}
private static Stream extractDateSpaces() {
-
return Stream.of(
of("1989 11 01", "1989-11-01"),
of("1989 11 01?", "1989-11-01?"),
@@ -85,7 +71,6 @@ private static Stream extractDateSpaces() {
}
private static Stream extractYMD() {
-
return Stream.of(
//YEAR
//A month and day can be missing
@@ -151,7 +136,6 @@ private static Stream extractYMD() {
}
private static Stream extractDMY() {
-
return Stream.of(
//MONTH-YEAR
of("11-1989", "1989-11"),
@@ -220,7 +204,6 @@ private static Stream extractDMY() {
}
private static Stream extractYMD_XX() {
-
return Stream.of(
//YEAR
of("198X", "198X"),
@@ -336,7 +319,6 @@ private static Stream extractYMD_XX() {
}
private static Stream extractDMY_XX() {
-
return Stream.of(
//YEAR-MONTH
of("XX.1989", "1989"),
diff --git a/metis-normalization/src/test/java/eu/europeana/normalization/dates/extraction/dateextractors/NumericPartsRangeDateExtractorTest.java b/metis-normalization/src/test/java/eu/europeana/normalization/dates/extraction/extractors/NumericPartsRangeDateExtractorTest.java
similarity index 58%
rename from metis-normalization/src/test/java/eu/europeana/normalization/dates/extraction/dateextractors/NumericPartsRangeDateExtractorTest.java
rename to metis-normalization/src/test/java/eu/europeana/normalization/dates/extraction/extractors/NumericPartsRangeDateExtractorTest.java
index 65fb376f27..fe61ef9fba 100644
--- a/metis-normalization/src/test/java/eu/europeana/normalization/dates/extraction/dateextractors/NumericPartsRangeDateExtractorTest.java
+++ b/metis-normalization/src/test/java/eu/europeana/normalization/dates/extraction/extractors/NumericPartsRangeDateExtractorTest.java
@@ -1,19 +1,15 @@
-package eu.europeana.normalization.dates.extraction.dateextractors;
+package eu.europeana.normalization.dates.extraction.extractors;
import static eu.europeana.normalization.dates.DateNormalizationExtractorMatchId.NUMERIC_RANGE_ALL_VARIANTS;
import static eu.europeana.normalization.dates.DateNormalizationExtractorMatchId.NUMERIC_RANGE_ALL_VARIANTS_XX;
-import static eu.europeana.normalization.dates.edtf.DateQualification.NO_QUALIFICATION;
-import static org.junit.jupiter.api.Assertions.assertEquals;
import eu.europeana.normalization.dates.DateNormalizationExtractorMatchId;
import eu.europeana.normalization.dates.DateNormalizationResult;
-import eu.europeana.normalization.dates.DateNormalizationResultStatus;
-import eu.europeana.normalization.dates.edtf.DateQualification;
import org.junit.jupiter.params.ParameterizedTest;
import org.junit.jupiter.params.provider.ArgumentsSource;
-class NumericPartsRangeDateExtractorTest {
+class NumericPartsRangeDateExtractorTest implements DateExtractorTest {
private static final NumericPartsRangeDateExtractor NUMERIC_PARTS_RANGE_DATE_EXTRACTOR = new NumericPartsRangeDateExtractor();
@@ -42,17 +38,7 @@ void extractDMY_XX(String input, String expected) {
}
void extract(String input, String expected, DateNormalizationExtractorMatchId dateNormalizationExtractorMatchId) {
- final DateNormalizationResult dateNormalizationResult = NUMERIC_PARTS_RANGE_DATE_EXTRACTOR.extractDateProperty(input,
- NO_QUALIFICATION);
- if (expected == null) {
- assertEquals(DateNormalizationResultStatus.NO_MATCH, dateNormalizationResult.getDateNormalizationResultStatus());
- } else {
- final String actual = dateNormalizationResult.getEdtfDate().toString();
- assertEquals(expected, actual);
- assertEquals(actual.contains("?"),
- dateNormalizationResult.getEdtfDate().getDateQualification() == DateQualification.UNCERTAIN);
- assertEquals(actual.contains(".."), dateNormalizationResult.getEdtfDate().isOpen());
- assertEquals(dateNormalizationExtractorMatchId, dateNormalizationResult.getDateNormalizationExtractorMatchId());
- }
+ final DateNormalizationResult dateNormalizationResult = NUMERIC_PARTS_RANGE_DATE_EXTRACTOR.extractDateProperty(input);
+ assertDateNormalizationResult(dateNormalizationResult, expected, dateNormalizationExtractorMatchId);
}
}
\ No newline at end of file
diff --git a/metis-normalization/src/test/java/eu/europeana/normalization/dates/extraction/dateextractors/NumericRangeDMYArgumentsProvider.java b/metis-normalization/src/test/java/eu/europeana/normalization/dates/extraction/extractors/NumericRangeDMYArgumentsProvider.java
similarity index 99%
rename from metis-normalization/src/test/java/eu/europeana/normalization/dates/extraction/dateextractors/NumericRangeDMYArgumentsProvider.java
rename to metis-normalization/src/test/java/eu/europeana/normalization/dates/extraction/extractors/NumericRangeDMYArgumentsProvider.java
index cca3239501..3b494422d9 100644
--- a/metis-normalization/src/test/java/eu/europeana/normalization/dates/extraction/dateextractors/NumericRangeDMYArgumentsProvider.java
+++ b/metis-normalization/src/test/java/eu/europeana/normalization/dates/extraction/extractors/NumericRangeDMYArgumentsProvider.java
@@ -1,4 +1,4 @@
-package eu.europeana.normalization.dates.extraction.dateextractors;
+package eu.europeana.normalization.dates.extraction.extractors;
import static org.junit.jupiter.params.provider.Arguments.of;
diff --git a/metis-normalization/src/test/java/eu/europeana/normalization/dates/extraction/dateextractors/NumericRangeDMYXXArgumentsProvider.java b/metis-normalization/src/test/java/eu/europeana/normalization/dates/extraction/extractors/NumericRangeDMYXXArgumentsProvider.java
similarity index 99%
rename from metis-normalization/src/test/java/eu/europeana/normalization/dates/extraction/dateextractors/NumericRangeDMYXXArgumentsProvider.java
rename to metis-normalization/src/test/java/eu/europeana/normalization/dates/extraction/extractors/NumericRangeDMYXXArgumentsProvider.java
index b534776338..86cfc609fc 100644
--- a/metis-normalization/src/test/java/eu/europeana/normalization/dates/extraction/dateextractors/NumericRangeDMYXXArgumentsProvider.java
+++ b/metis-normalization/src/test/java/eu/europeana/normalization/dates/extraction/extractors/NumericRangeDMYXXArgumentsProvider.java
@@ -1,4 +1,4 @@
-package eu.europeana.normalization.dates.extraction.dateextractors;
+package eu.europeana.normalization.dates.extraction.extractors;
import static org.junit.jupiter.params.provider.Arguments.of;
diff --git a/metis-normalization/src/test/java/eu/europeana/normalization/dates/extraction/dateextractors/NumericRangeYMDArgumentsProvider.java b/metis-normalization/src/test/java/eu/europeana/normalization/dates/extraction/extractors/NumericRangeYMDArgumentsProvider.java
similarity index 99%
rename from metis-normalization/src/test/java/eu/europeana/normalization/dates/extraction/dateextractors/NumericRangeYMDArgumentsProvider.java
rename to metis-normalization/src/test/java/eu/europeana/normalization/dates/extraction/extractors/NumericRangeYMDArgumentsProvider.java
index 8e759ed439..8e8e660adc 100644
--- a/metis-normalization/src/test/java/eu/europeana/normalization/dates/extraction/dateextractors/NumericRangeYMDArgumentsProvider.java
+++ b/metis-normalization/src/test/java/eu/europeana/normalization/dates/extraction/extractors/NumericRangeYMDArgumentsProvider.java
@@ -1,4 +1,4 @@
-package eu.europeana.normalization.dates.extraction.dateextractors;
+package eu.europeana.normalization.dates.extraction.extractors;
import static org.junit.jupiter.params.provider.Arguments.of;
diff --git a/metis-normalization/src/test/java/eu/europeana/normalization/dates/extraction/dateextractors/NumericRangeYMDXXArgumentsProvider.java b/metis-normalization/src/test/java/eu/europeana/normalization/dates/extraction/extractors/NumericRangeYMDXXArgumentsProvider.java
similarity index 99%
rename from metis-normalization/src/test/java/eu/europeana/normalization/dates/extraction/dateextractors/NumericRangeYMDXXArgumentsProvider.java
rename to metis-normalization/src/test/java/eu/europeana/normalization/dates/extraction/extractors/NumericRangeYMDXXArgumentsProvider.java
index 71a0f51f8a..f582df913e 100644
--- a/metis-normalization/src/test/java/eu/europeana/normalization/dates/extraction/dateextractors/NumericRangeYMDXXArgumentsProvider.java
+++ b/metis-normalization/src/test/java/eu/europeana/normalization/dates/extraction/extractors/NumericRangeYMDXXArgumentsProvider.java
@@ -1,4 +1,4 @@
-package eu.europeana.normalization.dates.extraction.dateextractors;
+package eu.europeana.normalization.dates.extraction.extractors;
import static org.junit.jupiter.params.provider.Arguments.of;
diff --git a/metis-normalization/src/test/java/eu/europeana/normalization/normalizers/DatesNormalizerTest.java b/metis-normalization/src/test/java/eu/europeana/normalization/normalizers/DatesNormalizerTest.java
index 70e2f679d4..e3a2d2bd8f 100644
--- a/metis-normalization/src/test/java/eu/europeana/normalization/normalizers/DatesNormalizerTest.java
+++ b/metis-normalization/src/test/java/eu/europeana/normalization/normalizers/DatesNormalizerTest.java
@@ -12,80 +12,43 @@
import static eu.europeana.normalization.dates.DateNormalizationExtractorMatchId.NUMERIC_ALL_VARIANTS;
import static eu.europeana.normalization.dates.DateNormalizationExtractorMatchId.NUMERIC_ALL_VARIANTS_XX;
import static eu.europeana.normalization.dates.DateNormalizationExtractorMatchId.NUMERIC_RANGE_ALL_VARIANTS;
-import static eu.europeana.normalization.dates.DateNormalizationExtractorMatchId.YYYY_MM_DD_SPACES;
-import static eu.europeana.normalization.dates.edtf.IntervalEdtfDate.DATE_INTERVAL_SEPARATOR;
-import static org.junit.jupiter.api.Assertions.assertEquals;
+import static eu.europeana.normalization.dates.DateNormalizationExtractorMatchId.NUMERIC_SPACES_VARIANT;
import static org.junit.jupiter.params.provider.Arguments.of;
import eu.europeana.normalization.dates.DateNormalizationExtractorMatchId;
import eu.europeana.normalization.dates.DateNormalizationResult;
-import eu.europeana.normalization.dates.DateNormalizationResultStatus;
-import eu.europeana.normalization.dates.edtf.AbstractEdtfDate;
-import eu.europeana.normalization.dates.edtf.DateBoundaryType;
-import eu.europeana.normalization.dates.edtf.DateQualification;
-import eu.europeana.normalization.dates.edtf.InstantEdtfDate;
-import eu.europeana.normalization.dates.edtf.IntervalEdtfDate;
-import java.util.Arrays;
-import java.util.function.Function;
+import eu.europeana.normalization.dates.extraction.extractors.DateExtractorTest;
import java.util.stream.Stream;
import org.junit.jupiter.params.ParameterizedTest;
import org.junit.jupiter.params.provider.Arguments;
import org.junit.jupiter.params.provider.MethodSource;
-class DatesNormalizerTest {
+class DatesNormalizerTest implements DateExtractorTest {
private final static DatesNormalizer NORMALIZER = new DatesNormalizer();
- void assertExtract(String input, String expected, DateNormalizationExtractorMatchId dateNormalizationExtractorMatchId,
- String label) {
+ @ParameterizedTest
+ @MethodSource
+ void extractDatePropertiesWithLabel(String input, String expected,
+ DateNormalizationExtractorMatchId dateNormalizationExtractorMatchId, String expectedLabel) {
final DateNormalizationResult dateNormalizationResult = NORMALIZER.normalizeDateProperty(input);
- if (expected != null) {
- assertEquals(dateNormalizationExtractorMatchId, dateNormalizationResult.getDateNormalizationExtractorMatchId());
- assertEquals(label, dateNormalizationResult.getEdtfDate().getLabel());
- AbstractEdtfDate edtfDate = dateNormalizationResult.getEdtfDate();
- if (edtfDate instanceof IntervalEdtfDate) {
- String startPart = expected.substring(0, expected.indexOf(DATE_INTERVAL_SEPARATOR));
- String endPart = expected.substring(expected.indexOf(DATE_INTERVAL_SEPARATOR) + 1);
- InstantEdtfDate start = ((IntervalEdtfDate) edtfDate).getStart();
- InstantEdtfDate end = ((IntervalEdtfDate) edtfDate).getEnd();
- assertEdtfDate(startPart, start);
- assertEdtfDate(endPart, end);
- } else {
- assertEdtfDate(expected, (InstantEdtfDate) dateNormalizationResult.getEdtfDate());
- }
- assertEquals(expected, edtfDate.toString());
- } else {
- assertEquals(DateNormalizationResultStatus.NO_MATCH, dateNormalizationResult.getDateNormalizationResultStatus());
- }
-
- }
-
- private static void assertEdtfDate(String expected, InstantEdtfDate instantEdtfDate) {
- assertEquals(expected.contains("?"), instantEdtfDate.getDateQualification() == DateQualification.UNCERTAIN);
- assertEquals(expected.contains("~"), instantEdtfDate.getDateQualification() == DateQualification.APPROXIMATE);
- assertEquals(expected.contains("%"), instantEdtfDate.getDateQualification() == DateQualification.UNCERTAIN_APPROXIMATE);
- assertEquals(expected.equals(DateBoundaryType.OPEN.getSerializedRepresentation()),
- instantEdtfDate.getDateBoundaryType() == DateBoundaryType.OPEN
- || instantEdtfDate.getDateBoundaryType() == DateBoundaryType.UNKNOWN);
+ assertDateNormalizationResult(dateNormalizationResult, expected, dateNormalizationExtractorMatchId, expectedLabel);
}
@ParameterizedTest
@MethodSource
- void extractDateProperties(String input, String expected, DateNormalizationExtractorMatchId dateNormalizationExtractorMatchId,
- String label) {
- assertExtract(input, expected, dateNormalizationExtractorMatchId, label);
+ void extractDatePropertiesWithoutLabel(String input, String expected,
+ DateNormalizationExtractorMatchId dateNormalizationExtractorMatchId) {
+ final DateNormalizationResult dateNormalizationResult = NORMALIZER.normalizeDateProperty(input);
+ assertDateNormalizationResult(dateNormalizationResult, expected, dateNormalizationExtractorMatchId);
}
- private static Stream extractDateProperties() {
- Stream argumentsWithoutLabel = Stream.of(
- extractDatePropertiesWithoutLabel()
- ).flatMap(Function.identity()).map(arguments ->
- {
- Object[] argumentsWithLabel = Arrays.copyOf(arguments.get(), arguments.get().length + 1);
- argumentsWithLabel[argumentsWithLabel.length - 1] = null;
- return of(argumentsWithLabel);
- });
- return Stream.concat(extractDatePropertiesWithLabel(), argumentsWithoutLabel);
+ @ParameterizedTest
+ @MethodSource
+ void extractGenericPropertiesWithoutLabel(String input, String expected,
+ DateNormalizationExtractorMatchId dateNormalizationExtractorMatchId) {
+ final DateNormalizationResult dateNormalizationResult = NORMALIZER.normalizeGenericProperty(input);
+ assertDateNormalizationResult(dateNormalizationResult, expected, dateNormalizationExtractorMatchId);
}
private static Stream extractDatePropertiesWithLabel() {
@@ -99,19 +62,17 @@ private static Stream extractDatePropertiesWithLabel() {
private static Stream extractDatePropertiesWithoutLabel() {
return Stream.of(
- //Brief dates. Those are similar to EDFT but should match first.
+ //Brief dates. Those are similar to EDTF but should match first.
of("2014/15", "2014/2015", BRIEF_DATE_RANGE),
- of("1889/98? (Herstellung)", "1889?/1898?", BRIEF_DATE_RANGE),
- of("1918-20", "1918/1920", BRIEF_DATE_RANGE),
+ of("1889/98? (text in parentheses)", "1889/1898?", BRIEF_DATE_RANGE),
//Centuries numeric
of("18..", "18XX", CENTURY_NUMERIC),
- of("19??", "19XX", NUMERIC_ALL_VARIANTS_XX),
of("192?", null, null),// ambiguous
of("[171-]", null, null), // ambiguous
of("19th century", "18XX", CENTURY_NUMERIC),
of("2nd century", "01XX", CENTURY_NUMERIC),
- of("[10th century]", "09XX", CENTURY_NUMERIC), // not supported
+ of("[10th century]", "09XX", CENTURY_NUMERIC),
of("12th century BC", null, null), // not supported
//Centuries roman
@@ -121,7 +82,7 @@ private static Stream extractDatePropertiesWithoutLabel() {
of("S. XVI-XX", "15XX/19XX", CENTURY_RANGE_ROMAN),
of("S.VIII-XV", "07XX/14XX", CENTURY_RANGE_ROMAN),
of("S. XVI-XVIII", "15XX/17XX", CENTURY_RANGE_ROMAN),
- of("S. XVIII-", null, null), // open-ended period
+ of("S. XVIII-", null, null),
of("[XVI-XIX]", "15XX/18XX", CENTURY_RANGE_ROMAN),
of("SVV", null, null),
@@ -133,25 +94,24 @@ private static Stream extractDatePropertiesWithoutLabel() {
//Numeric range '/'
of("1872-06-01/1872-06-30", "1872-06-01/1872-06-30", EDTF),
- of(" 1820/1820", "1820/1820", NUMERIC_RANGE_ALL_VARIANTS),
- of("1918 / 1919", "1918/1919", NUMERIC_RANGE_ALL_VARIANTS),
- of("1205/1215 [Herstellung]", "1205/1215", EDTF),
- of(" 1757/1757", "1757/1757", NUMERIC_RANGE_ALL_VARIANTS),
+ of(" 1820/1820", "1820/1820", EDTF),
+ of("1918 / 1919", "1918/1919", EDTF),
+ of("1205/1215 [text in brackets]", "1205/1215", EDTF),
+ of(" 1757/1757", "1757/1757", EDTF),
of("ca 1757/1757", "1757~/1757~", EDTF),
- of("2000 vC - 2002 nC", "-2000/2002", BC_AD),
- of("0114 aC - 0113 aC", "-0114/-0113", BC_AD),
- of("0390 AD - 0425 AD", "0390/0425", BC_AD),
- of("337 BC - 283 BC", "-0337/-0283", BC_AD),
- of("100 vC - 150 nC", "-0100/0150", BC_AD),
- of("400 BC - 400 AD", "-0400/0400", BC_AD),
- of("235 AD – 236 AD", "0235/0236", BC_AD),
- of("168 B.C.-135 A.D.", "-0168/0135", BC_AD),
+ of("1990 BC-1989 BC", "-1989/-1988", BC_AD),
+ of("1990 π.Χ.-1989 π.Χ.", "-1989/-1988", BC_AD),
+ of("1989 AD/1990 AD", "1989/1990", BC_AD),
+ of("1989 μ.Χ./1990 μ.Χ.", "1989/1990", BC_AD),
+ of("1989 π.Χ.-1 μ.Χ.", "-1988/0001", BC_AD),
of("20/09/18XX", "18XX-09-20", NUMERIC_ALL_VARIANTS_XX),
of("?/1807", "../1807", NUMERIC_RANGE_ALL_VARIANTS),
//Incorrect day values
of("1947-19-50/1950-19-53", null, null),
of("15/21-8-1918", null, null),
of("1.1848/49[?]", null, null),
+ of("1990 BC//1989 BC", null, null),
+ of("-1990 BC-1989 BC", null, null),
//Numeric range ' - '(spaces around hyphen)
of("1851-01-01 - 1851-12-31", "1851-01-01/1851-12-31", NUMERIC_RANGE_ALL_VARIANTS),
@@ -164,8 +124,10 @@ private static Stream extractDatePropertiesWithoutLabel() {
of("192?-1958", null, null),
of("[ca. 1920-1930]", "1920~/1930~", NUMERIC_RANGE_ALL_VARIANTS),
of("1937--1938", null, null),
- of("[ca. 193-]", null, null),// ambiguous
- of("1990-", null, null), // open-ended period not supported
+ // ambiguous
+ of("[ca. 193-]", null, null),
+ // open-ended period not supported
+ of("1990-", null, null),
//Numeric range '|'
of("1910/05/31 | 1910/05/01", "1910-05-01/1910-05-31", NUMERIC_RANGE_ALL_VARIANTS),
@@ -178,9 +140,15 @@ private static Stream extractDatePropertiesWithoutLabel() {
// this may not be a 100% correct normalisation, maybe it is not a range but two dates
of("1651 [ca. 1656]", "1651~/1656~", NUMERIC_RANGE_ALL_VARIANTS),
- //Numeric year
+ //Numeric year all variants
of("(17--?)", "17XX?", NUMERIC_ALL_VARIANTS_XX),
of("[19--?]", "19XX?", NUMERIC_ALL_VARIANTS_XX),
+ of("19--?]", "19XX?", NUMERIC_ALL_VARIANTS_XX),
+ of("19--]", "19XX", NUMERIC_ALL_VARIANTS_XX),
+ of("19xx", "19XX", NUMERIC_ALL_VARIANTS_XX),
+ of("19??", "19XX", NUMERIC_ALL_VARIANTS_XX),
+ of("[ca. 16??]", "16XX~", NUMERIC_ALL_VARIANTS_XX),
+ of("[ca. 16??]", "16XX~", NUMERIC_ALL_VARIANTS_XX),
//Numeric date with dot "."
of("21.1.1921", "1921-01-21", NUMERIC_ALL_VARIANTS),
@@ -189,81 +157,83 @@ private static Stream extractDatePropertiesWithoutLabel() {
of("28.05.1969", "1969-05-28", NUMERIC_ALL_VARIANTS),
of("11.11.1947", "1947-11-11", NUMERIC_ALL_VARIANTS),
of("23.02.[18--]", "18XX-02-23", NUMERIC_ALL_VARIANTS_XX),
+ of("15.02.1985 (text in parentheses)", "1985-02-15", NUMERIC_ALL_VARIANTS),
+ of("09.1972 (text in parentheses)", "1972-09", NUMERIC_ALL_VARIANTS),
of("28. 1. 1240", null, null),
//Numeric date with dash "-"
of("1941-22-06", "1941-06-22", NUMERIC_ALL_VARIANTS),
of("1937-10-??", "1937-10", NUMERIC_ALL_VARIANTS_XX),
+ of("1985-10-xx", "1985-10", NUMERIC_ALL_VARIANTS_XX),
of("199--09-28", null, null),
of("01?-1905", null, null),
of("02?-1915", null, null),
//Numeric date with space " "
- of("1905 09 01", "1905-09-01", YYYY_MM_DD_SPACES),
- of("0 2 1980", "1980-02", YYYY_MM_DD_SPACES),
+ of("1905 09 01", "1905-09-01", NUMERIC_SPACES_VARIANT),
+ of("0 2 1980", "1980-02", NUMERIC_SPACES_VARIANT),
//More than 4 digits year
of("18720601/18720630", null, null),
of("19471950/19501953", null, null),
- of("-2100/-1550", "-2100/-1550", EDTF),
- // TODO: 21/12/2022 Check the below, expected null but returns 1952-02-25 instead
- // of("1952-02-25T00:00:00Z-1952-02-25T23:59:59Z", null),
+ //Month alphabetical name
+ of("18 September 1914", "1914-09-18", MONTH_NAME),
+ of("c.6 Nov 1902", "1902-11-06~", MONTH_NAME),
+
+ //Non-standard date format
+ of("Sat Jan 01 01:00:00 CET 1701", "1701-01-01", FORMATTED_FULL_DATE),
+ of("2013-03-21 18:45:36 UTC", "2013-03-21", FORMATTED_FULL_DATE),
of("2013-09-07 09:31:51 UTC", "2013-09-07", FORMATTED_FULL_DATE),
- of("1997-07-18T00:00:00 [Create]", "1997-07-18", EDTF),
+
+ of("-2100/-1550", "-2100/-1550", EDTF),
+ of("1997-07-18T00:00:00 [text in brackets]", "1997-07-18", EDTF),
of("1924 ca.", null, null),
of("[1712?]", "1712?", EDTF),
of("circa 1712", "1712~", EDTF),
of("[ca. 1946]", "1946~", EDTF),
of("1651?]", "1651?", EDTF),
- of("19--?]", "19XX?", NUMERIC_ALL_VARIANTS_XX),
of(". 1885", null, null),
of("- 1885", null, null),
- of("1749 (Herstellung (Werk))", "1749", EDTF),
- of("1939; 1954; 1955; 1978; 1939-1945", null, null), // multiple dates no suported
- of("[17__]", null, null),// this pattern is not supported (this pattern was never tested
- of("19--]", "19XX", NUMERIC_ALL_VARIANTS_XX),
- of("19xx", "19XX", NUMERIC_ALL_VARIANTS_XX),
- of("Sat Jan 01 01:00:00 CET 1701", "1701-01-01", FORMATTED_FULL_DATE),
- of("2013-03-21 18:45:36 UTC", "2013-03-21", FORMATTED_FULL_DATE),
- of("15.02.1985 (identification)", "1985-02-15", NUMERIC_ALL_VARIANTS),
+ of("1749 (text in parentheses (text in parentheses))", "1749", EDTF),
+ // multiple dates not supported
+ of("1939; 1954; 1955; 1978; 1939-1945", null, null),
+ of("[17__]", null, null),
of("091090", null, null),
of("-0043-12-07", "-0043-12-07", EDTF),
of("imp. 1901", null, null),
- of("u.1707-1739", null, null),// what does 'u.' mean?
- of("22.07.1971 (identification)", "1971-07-22", NUMERIC_ALL_VARIANTS),
+ of("u.1707-1739", null, null),
//Ambiguous pattern
of("187-?]", null, null),
- of("18. September 1914", "1914-09-18", MONTH_NAME),
of("19960216-19960619", null, null),
of("-0549-01-01T00:00:00Z", "-0549-01-01", EDTF),
of("1942-1943 c.", null, null),
of("(1942)", "1942", EDTF),
of("-3.6982", null, null),
- of("[ca. 16??]", "16XX~", NUMERIC_ALL_VARIANTS_XX),
of("ISO9126", null, null),
- of("1985-10-xx", "1985-10", NUMERIC_ALL_VARIANTS_XX),
of("14:27", null, null),
- of("c.6 Nov 1902", "1902-11-06~", MONTH_NAME),
- of("-1234", "-1234", EDTF),
- of("09.1972 (gathering)", "1972-09", NUMERIC_ALL_VARIANTS)
+ of("-1234", "-1234", EDTF)
);
}
- // TODO: 10/03/2023 Don't forget to add specific to generic properties normalization
- // //GENERIC PROPERTY
- // genericPropertyTestCases.put("XIV", null);
- // genericPropertyTestCases.put("1905 09 01", "1905-09-01");
- // genericPropertyTestCases.put("1851-01-01 - 1851-12-31", "1851-01-01/1851-12-31");
- // genericPropertyTestCases.put("18..", null);
- // genericPropertyTestCases.put("2013-09-07 09:31:51 UTC", "2013-09-07");
- // genericPropertyTestCases.put("1918 / 1919", "1918/1919");
- // genericPropertyTestCases.put("1205/1215 [Herstellung]", null);
- // genericPropertyTestCases.put("1997-07", null);
- // genericPropertyTestCases.put("19??", null);
- // genericPropertyTestCases.put("1871 - 191-", null);
-
+ private static Stream extractGenericPropertiesWithoutLabel() {
+ return Stream.of(
+ of("XIV", "13XX", CENTURY_ROMAN),
+ of("1989 11 01", "1989-11-01", NUMERIC_SPACES_VARIANT),
+ of("1851-01-01 - 1851-12-31", "1851-01-01/1851-12-31", NUMERIC_RANGE_ALL_VARIANTS),
+ of("[1989-11-01 - 1989-12-31]", "1989-11-01/1989-12-31", NUMERIC_RANGE_ALL_VARIANTS),
+ of("1989-11-01 - 1989-12-31 (text in parentheses)", "1989-11-01/1989-12-31", NUMERIC_RANGE_ALL_VARIANTS),
+ of("2013-09-07 09:31:51 UTC", "2013-09-07", FORMATTED_FULL_DATE),
+ //Non-precise or partial dates (expected result is null)
+ of("18..", null, null),
+ of("1918/1919", null, null),
+ of("1205/1215 [text in brackets]", null, null),
+ of("1997-07", null, null),
+ of("19??", null, null),
+ of("1871 - 191-", null, null)
+ );
+ }
}
\ No newline at end of file