Skip to content

Commit

Permalink
Debt/met 5132 dates normalization cleanup part 4 (#622)
Browse files Browse the repository at this point in the history
* MET-5132: Update naming of numeric spaces variant

* MET-5132: Prepare first sample of tests

* MET-5132: More tests

* MET-5132: Restructure with enum MonthNameDateExtractor

* MET-5132: Reusable DatePartsIndices

* MET-5132: Cleanup

* MET-5132: Add spaces clean and trim

* MET-5132: Cleanup

* MET-5132: First split the range part

* MET-5132: Refactor PatternBcAdDateExtractor with tests

* MET-5132: PatternBcAdRangeDateExtractor cleanup

* MET-5132: Reuse range code for dates

* MET-5132: Reuse range code for BriefRangeDateExtractor

* MET-5132: Simplify hierarchy for ranges

* MET-5132: Add sample tests for PatternLongNegativeYearDateExtractor and adapt edtf builder

* MET-5132: Refactor LongNegativeYearDateExtractor with a separate range class reusing already existent code

* MET-5132: Split century extraction to numeric, roman, roman range reusing code

* MET-5132: Centralize sanitization operation for all extractors.

* MET-5132: Add tests for generic properties

* MET-5132: Reuse test code

* MET-5132: Repackage

* MET-5132: Split EdtfDateExtractor to handle ranges separately with code reuse

* MET-5132: Centralize date qualification overwriting

* MET-5132: Simplify date qualification overwriting

* MET-5132: Update code after answers from rnd

* MET-5132: Cleanup

* MET-5132: PatternFormattedFullDateDateExtractor cleanup

* MET-5132: Add millisecond support

* MET-5132: Process review

* MET-5132: Process review 2
  • Loading branch information
stzanakis authored Oct 19, 2023
1 parent 42519f7 commit 85b07a3
Show file tree
Hide file tree
Showing 69 changed files with 3,017 additions and 2,143 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ public enum DateNormalizationExtractorMatchId {
NUMERIC_ALL_VARIANTS_XX("numeric date (various separators and unknown parts)"),
NUMERIC_RANGE_ALL_VARIANTS("numeric date interval (various separators)"),
NUMERIC_RANGE_ALL_VARIANTS_XX("numeric date interval (various separators and unknown parts)"),
YYYY_MM_DD_SPACES("numeric date (whitespace separators)");
NUMERIC_SPACES_VARIANT("numeric date (whitespace separators)");

final String label;

Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
package eu.europeana.normalization.dates.edtf;

import java.util.Set;

/**
* An abstract class that contains the template that an EDTF date with compliance level 1 should implement.
* <p>See more in the specification of <a href="https://www.loc.gov/standards/datetime/">EDTF</a></p>
Expand All @@ -17,11 +19,18 @@ protected AbstractEdtfDate(String label) {
this.label = label;
}

/**
* Add a date qualification to this date.
* <p>Mainly used for pre-sanitized values, for example to mark a date as approximate after its original value was
* sanitized.</p>
*
* @param dateQualification the date qualification to add
*/
public abstract void addQualification(DateQualification dateQualification);

/**
 * Get the label of this date.
 *
 * @return the label that was provided at construction
 */
public String getLabel() {
  return this.label;
}

public abstract DateQualification getDateQualification();
/**
* Get the date qualifications that apply to this date.
*
* @return the set of date qualifications
*/
public abstract Set<DateQualification> getDateQualifications();

public abstract boolean isOpen();

Expand Down
Original file line number Diff line number Diff line change
@@ -1,39 +1,59 @@
package eu.europeana.normalization.dates.edtf;

import java.util.Arrays;
import java.util.EnumSet;
import java.util.Set;
import java.util.regex.Pattern;

/**
* Date qualification characters according to <a href="https://www.loc.gov/standards/datetime/">Extended Date/Time Format (EDTF)
* Specification</a>
*/
public enum DateQualification {
UNCERTAIN, APPROXIMATE;

NO_QUALIFICATION(""),
UNCERTAIN("?"),
APPROXIMATE("~"),
UNCERTAIN_APPROXIMATE("%");

public static final Pattern CHECK_QUALIFICATION_PATTERN = Pattern.compile("^[^\\?~%]*([\\?~%]?)$");
private final String character;

DateQualification(String character) {
this.character = character;
}
private static final String UNCERTAIN_CHARACTER = "?";
private static final String APPROXIMATE_CHARACTER = "~";
private static final String UNCERTAIN_APPROXIMATE_CHARACTER = "%";
private static final String CHARACTERS_REGEX = UNCERTAIN_CHARACTER + APPROXIMATE_CHARACTER + UNCERTAIN_APPROXIMATE_CHARACTER;
public static final Pattern PATTERN = Pattern.compile("^[^" + CHARACTERS_REGEX + "]*([" + CHARACTERS_REGEX + "])$");

/**
* Get the enum value based on the character provided.
* <p>It will return a matched enum value or {@link #NO_QUALIFICATION}.</p>
* Get the enum values based on the character provided.
* <p>It will return an empty set or the set with the applicable qualifications.</p>
*
* @param character the provided character
* @return the enum value
*/
public static DateQualification fromCharacter(String character) {
return Arrays.stream(DateQualification.values()).filter(value -> value.character.equals(character)).findFirst().orElse(
NO_QUALIFICATION);
public static Set<DateQualification> fromCharacter(String character) {
  // Constant-first equals keeps a null character safe: it simply matches nothing.
  if (UNCERTAIN_APPROXIMATE_CHARACTER.equals(character)) {
    // The combined character stands for both qualifications at once.
    return EnumSet.of(DateQualification.UNCERTAIN, DateQualification.APPROXIMATE);
  }
  if (UNCERTAIN_CHARACTER.equals(character)) {
    return EnumSet.of(DateQualification.UNCERTAIN);
  }
  if (APPROXIMATE_CHARACTER.equals(character)) {
    return EnumSet.of(DateQualification.APPROXIMATE);
  }
  // Anything else carries no qualification.
  return EnumSet.noneOf(DateQualification.class);
}

public String getCharacter() {
/**
 * Convert a set of date qualifications to its qualification character.
 * <p>When both {@link #UNCERTAIN} and {@link #APPROXIMATE} are present the combined character is returned, when only one
 * of them is present its own character is returned, and when neither is present the result is an empty string.</p>
 *
 * @param dateQualifications the date qualifications
 * @return the string representation
 */
public static String getCharacterFromQualifications(Set<DateQualification> dateQualifications) {
  final boolean uncertain = dateQualifications.contains(UNCERTAIN);
  final boolean approximate = dateQualifications.contains(APPROXIMATE);
  if (uncertain && approximate) {
    return UNCERTAIN_APPROXIMATE_CHARACTER;
  }
  if (uncertain) {
    return UNCERTAIN_CHARACTER;
  }
  return approximate ? APPROXIMATE_CHARACTER : "";
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
import static eu.europeana.normalization.dates.edtf.DateBoundaryType.DECLARED;
import static eu.europeana.normalization.dates.edtf.DateBoundaryType.OPEN;
import static eu.europeana.normalization.dates.edtf.DateBoundaryType.UNKNOWN;
import static eu.europeana.normalization.dates.edtf.DateQualification.NO_QUALIFICATION;
import static eu.europeana.normalization.dates.edtf.InstantEdtfDateBuilder.THRESHOLD_4_DIGITS_YEAR;
import static eu.europeana.normalization.dates.edtf.Iso8601Parser.ISO_8601_MINIMUM_YEAR_DIGITS;
import static java.lang.Math.abs;
Expand All @@ -19,7 +18,9 @@
import java.time.Year;
import java.time.YearMonth;
import java.time.temporal.TemporalAccessor;
import java.util.EnumSet;
import java.util.Objects;
import java.util.Set;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

Expand All @@ -38,11 +39,13 @@ public final class InstantEdtfDate extends AbstractEdtfDate implements Comparabl
private Month month;
private LocalDate yearMonthDay;
private YearPrecision yearPrecision;
private DateQualification dateQualification = NO_QUALIFICATION;
private Set<DateQualification> dateQualifications = EnumSet.noneOf(DateQualification.class);
private DateBoundaryType dateBoundaryType = DECLARED;

/**
* Restricted constructor by provided {@link InstantEdtfDateBuilder}.
* <p>All fields apart from {@link #dateQualifications} are strictly contained in the constructor. The date qualifications can
* be further extended to, for example, add an approximate qualification for a date that was sanitized.</p>
*
* @param instantEdtfDateBuilder the builder with all content verified
*/
Expand All @@ -51,13 +54,18 @@ public final class InstantEdtfDate extends AbstractEdtfDate implements Comparabl
year = instantEdtfDateBuilder.getYearObj();
month = instantEdtfDateBuilder.getMonthObj();
yearMonthDay = instantEdtfDateBuilder.getYearMonthDayObj();
dateQualification = instantEdtfDateBuilder.getDateQualification();
dateQualifications = instantEdtfDateBuilder.getDateQualifications();
}

/**
* Restricted constructor that only sets the date boundary type.
* <p>No other field is assigned here, so all remaining fields keep their declared defaults.</p>
*
* @param dateBoundaryType the date boundary type of the instant
*/
private InstantEdtfDate(DateBoundaryType dateBoundaryType) {
this.dateBoundaryType = dateBoundaryType;
}

/**
 * Add the given qualification to the qualifications of this instant.
 *
 * @param dateQualification the date qualification to add
 */
@Override
public void addQualification(DateQualification dateQualification) {
  dateQualifications.add(dateQualification);
}

/**
* Create an {@link DateBoundaryType#UNKNOWN} instant.
*
Expand Down Expand Up @@ -188,7 +196,7 @@ public Integer getCentury() {
int centuryDivision = year.getValue() / YearPrecision.CENTURY.getDuration();
int centuryModulo = year.getValue() % YearPrecision.CENTURY.getDuration();
//For case 1900 it is 19th. For case 1901 it is 20th century
return centuryModulo == 0 ? centuryDivision : centuryDivision + 1;
return (centuryModulo == 0) ? centuryDivision : (centuryDivision + 1);
}

/**
Expand Down Expand Up @@ -230,7 +238,7 @@ public String toString() {
stringBuilder.append(
ofNullable(yearMonthDay).map(LocalDate::getDayOfMonth).map(decimalFormat::format).map(d -> "-" + d).orElse(""));
}
stringBuilder.append(dateQualification.getCharacter());
stringBuilder.append(DateQualification.getCharacterFromQualifications(dateQualifications));
return stringBuilder.toString();
}

Expand All @@ -256,13 +264,13 @@ public boolean equals(Object o) {
}
InstantEdtfDate that = (InstantEdtfDate) o;
return yearPrecision == that.yearPrecision && Objects.equals(year, that.year) && Objects.equals(month,
that.month) && Objects.equals(yearMonthDay, that.yearMonthDay) && dateQualification == that.dateQualification
// Compare the qualification sets by content: '==' on two Set references is only true for the very same instance.
that.month) && Objects.equals(yearMonthDay, that.yearMonthDay) && Objects.equals(dateQualifications, that.dateQualifications)
&& dateBoundaryType == that.dateBoundaryType;
}

/**
* Compute the hash code from the same fields that {@link #equals(Object)} compares.
*
* @return the hash code
*/
@Override
public int hashCode() {
return Objects.hash(yearPrecision, year, month, yearMonthDay, dateQualification, dateBoundaryType);
return Objects.hash(yearPrecision, year, month, yearMonthDay, dateQualifications, dateBoundaryType);
}

public Year getYear() {
Expand All @@ -281,8 +289,8 @@ public YearPrecision getYearPrecision() {
return yearPrecision;
}

public DateQualification getDateQualification() {
return dateQualification;
/**
 * Get a copy of the date qualifications of this instant.
 * <p>The returned set is a fresh copy, so modifying it does not affect this instant.</p>
 *
 * @return a new set containing the date qualifications
 */
@Override
public Set<DateQualification> getDateQualifications() {
  // Copy through noneOf/addAll instead of EnumSet.copyOf(Collection): the latter throws IllegalArgumentException
  // for an empty collection that is not itself an EnumSet, which the Set-typed field does not rule out.
  final Set<DateQualification> copy = EnumSet.noneOf(DateQualification.class);
  copy.addAll(dateQualifications);
  return copy;
}

public DateBoundaryType getDateBoundaryType() {
Expand Down
Loading

0 comments on commit 85b07a3

Please sign in to comment.