Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Debt/met 5132 dates normalization cleanup part 4 #622

Merged
merged 30 commits into from
Oct 19, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
30 commits
Select commit Hold shift + click to select a range
5243fd8
MET-5132: Update naming of numeric spaces variant
stzanakis Aug 29, 2023
ea87db3
MET-5132: Prepare first sample of tests
stzanakis Aug 29, 2023
721e4d6
MET-5132: More tests
stzanakis Aug 30, 2023
9cfdcf1
MET-5132: Restructure with enum MontNameDateExtractor
stzanakis Aug 31, 2023
0d70956
MET-5132: Reusable DatePartsIndices
stzanakis Aug 31, 2023
1905582
MET-5132: Cleanup
stzanakis Aug 31, 2023
25f222c
MET-5132: Add spaces clean and trim
stzanakis Aug 31, 2023
c20a3cd
MET-5132: Cleanup
stzanakis Aug 31, 2023
c981371
MET-5132: First split the range part
stzanakis Sep 1, 2023
9ad9340
MET-5132: Refactor PatternBcAdDateExtractor with tests
stzanakis Sep 1, 2023
c7e5c7d
MET-5132: PatternBcAdRangeDateExtractor cleanup
stzanakis Sep 4, 2023
ba5a05d
MET-5132: Reuse range code for dates
stzanakis Sep 4, 2023
c953057
MET-5132: Reuse range code for BriefRangeDateExtractor
stzanakis Sep 6, 2023
0382940
MET-5132: Simplify hierarchy for ranges
stzanakis Sep 6, 2023
9667339
MET-5132: Add sample tests for PatternLongNegativeYearDateExtractor a…
stzanakis Sep 6, 2023
bccf45f
MET-5132: Refactor LongNegativeYearDateExtractor with a separate rang…
stzanakis Sep 7, 2023
717f0f4
MET-5132: Split century extraction to numeric, roman, roman range reu…
stzanakis Sep 7, 2023
874ad5c
MET-5132: Centralize sanitization operation for all extractors.
stzanakis Sep 7, 2023
05dd5f9
MET-5132: Add tests for generic properties
stzanakis Sep 11, 2023
0d38dcf
MET-5132: Reuse test code
stzanakis Sep 11, 2023
0899566
MET-5132: Repackage
stzanakis Sep 11, 2023
1a57415
MET-5132: Split EdtfDateExtractor to handle ranges separately with co…
stzanakis Sep 12, 2023
683c118
MET-5132: Centralize date qualification overwriting
stzanakis Sep 12, 2023
2ab80b0
MET-5132: Simplify date qualification overwriting
stzanakis Sep 13, 2023
d613725
MET-5132: Update code after answers from rnd
stzanakis Oct 3, 2023
e405a94
MET-5132: Cleanup
stzanakis Oct 3, 2023
6682482
MET-5132: PatternFormattedFullDateDateExtractor cleanup
stzanakis Oct 4, 2023
451fffb
MET-5132: Add millisecond support
stzanakis Oct 5, 2023
82ee0e4
MET-5132: Process review
stzanakis Oct 17, 2023
097707d
MET-5132: Process review 2
stzanakis Oct 19, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ public enum DateNormalizationExtractorMatchId {
NUMERIC_ALL_VARIANTS_XX("numeric date (various separators and unknown parts)"),
NUMERIC_RANGE_ALL_VARIANTS("numeric date interval (various separators)"),
NUMERIC_RANGE_ALL_VARIANTS_XX("numeric date interval (various separators and unknown parts)"),
YYYY_MM_DD_SPACES("numeric date (whitespace separators)");
NUMERIC_SPACES_VARIANT("numeric date (whitespace separators)");

final String label;

Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
package eu.europeana.normalization.dates.edtf;

import java.util.Set;

/**
* An abstract class that contains the template that an EDTF date with compliance level 1 should implement.
* <p>See more in the specification of <a href="https://www.loc.gov/standards/datetime/">EDTF</a></p>
Expand All @@ -17,11 +19,18 @@ protected AbstractEdtfDate(String label) {
this.label = label;
}

/**
* Add the date qualification, mainly used for pre-sanitized values.
*
* @param dateQualification the date qualification
*/
public abstract void addQualification(DateQualification dateQualification);

/**
* Get the label of this date.
*
* @return the label that was provided at construction
*/
public String getLabel() {
return label;
}

public abstract DateQualification getDateQualification();
public abstract Set<DateQualification> getDateQualifications();

public abstract boolean isOpen();

Expand Down
Original file line number Diff line number Diff line change
@@ -1,39 +1,59 @@
package eu.europeana.normalization.dates.edtf;

import java.util.Arrays;
import java.util.EnumSet;
import java.util.Set;
import java.util.regex.Pattern;

/**
* Date qualification characters according to <a href="https://www.loc.gov/standards/datetime/">Extended Date/Time Format (EDTF)
* Specification</a>
*/
public enum DateQualification {
UNCERTAIN, APPROXIMATE;

NO_QUALIFICATION(""),
UNCERTAIN("?"),
APPROXIMATE("~"),
UNCERTAIN_APPROXIMATE("%");

public static final Pattern CHECK_QUALIFICATION_PATTERN = Pattern.compile("^[^\\?~%]*([\\?~%]?)$");
private final String character;

DateQualification(String character) {
this.character = character;
}
private static final String UNCERTAIN_CHARACTER = "?";
private static final String APPROXIMATE_CHARACTER = "~";
private static final String UNCERTAIN_APPROXIMATE_CHARACTER = "%";
private static final String CHARACTERS_REGEX = UNCERTAIN_CHARACTER + APPROXIMATE_CHARACTER + UNCERTAIN_APPROXIMATE_CHARACTER;
public static final Pattern PATTERN = Pattern.compile("^[^" + CHARACTERS_REGEX + "]*([" + CHARACTERS_REGEX + "])$");

/**
* Get the enum value based on the character provided.
* <p>It will return a matched enum value or {@link #NO_QUALIFICATION}.</p>
* Get the enum values based on the character provided.
* <p>It will return an empty set or the set with the applicable qualifications.</p>
*
* @param character the provided character
* @return the set of applicable enum values, empty if the character is not a qualification character
*/
public static DateQualification fromCharacter(String character) {
return Arrays.stream(DateQualification.values()).filter(value -> value.character.equals(character)).findFirst().orElse(
NO_QUALIFICATION);
public static Set<DateQualification> fromCharacter(String character) {
  // The constants are the equals() receivers, so a null character simply matches nothing.
  if (UNCERTAIN_APPROXIMATE_CHARACTER.equals(character)) {
    // The combined character stands for both qualifications at once.
    return EnumSet.of(UNCERTAIN, APPROXIMATE);
  }
  if (UNCERTAIN_CHARACTER.equals(character)) {
    return EnumSet.of(UNCERTAIN);
  }
  if (APPROXIMATE_CHARACTER.equals(character)) {
    return EnumSet.of(APPROXIMATE);
  }
  // Anything else carries no qualification; the result is still a fresh, mutable set.
  return EnumSet.noneOf(DateQualification.class);
}

public String getCharacter() {
/**
* Get the string representation based on the provided date qualifications.
*
* @param dateQualifications the date qualifications
* @return the string representation
*/
public static String getCharacterFromQualifications(Set<DateQualification> dateQualifications) {
  final boolean uncertain = dateQualifications.contains(UNCERTAIN);
  final boolean approximate = dateQualifications.contains(APPROXIMATE);

  // Both qualifications together collapse into the single combined character.
  if (uncertain && approximate) {
    return UNCERTAIN_APPROXIMATE_CHARACTER;
  }
  if (uncertain) {
    return UNCERTAIN_CHARACTER;
  }
  // No qualification at all is rendered as the empty string.
  return approximate ? APPROXIMATE_CHARACTER : "";
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
import static eu.europeana.normalization.dates.edtf.DateBoundaryType.DECLARED;
import static eu.europeana.normalization.dates.edtf.DateBoundaryType.OPEN;
import static eu.europeana.normalization.dates.edtf.DateBoundaryType.UNKNOWN;
import static eu.europeana.normalization.dates.edtf.DateQualification.NO_QUALIFICATION;
import static eu.europeana.normalization.dates.edtf.InstantEdtfDateBuilder.THRESHOLD_4_DIGITS_YEAR;
import static eu.europeana.normalization.dates.edtf.Iso8601Parser.ISO_8601_MINIMUM_YEAR_DIGITS;
import static java.lang.Math.abs;
Expand All @@ -19,7 +18,9 @@
import java.time.Year;
import java.time.YearMonth;
import java.time.temporal.TemporalAccessor;
import java.util.EnumSet;
import java.util.Objects;
import java.util.Set;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

Expand All @@ -38,11 +39,13 @@ public final class InstantEdtfDate extends AbstractEdtfDate implements Comparabl
private Month month;
private LocalDate yearMonthDay;
private YearPrecision yearPrecision;
private DateQualification dateQualification = NO_QUALIFICATION;
private Set<DateQualification> dateQualifications = EnumSet.noneOf(DateQualification.class);
private DateBoundaryType dateBoundaryType = DECLARED;

/**
* Restricted constructor by provided {@link InstantEdtfDateBuilder}.
* <p>All fields apart from {@link #dateQualifications} are strictly contained in the constructor. The date qualifications can
* be further extended to, for example, add an approximate qualification for a date that was sanitized.</p>
*
* @param instantEdtfDateBuilder the builder with all content verified
*/
Expand All @@ -51,13 +54,18 @@ public final class InstantEdtfDate extends AbstractEdtfDate implements Comparabl
year = instantEdtfDateBuilder.getYearObj();
month = instantEdtfDateBuilder.getMonthObj();
yearMonthDay = instantEdtfDateBuilder.getYearMonthDayObj();
dateQualification = instantEdtfDateBuilder.getDateQualification();
dateQualifications = instantEdtfDateBuilder.getDateQualifications();
}

// Creates an instant from a boundary type alone; no other field is assigned here.
private InstantEdtfDate(DateBoundaryType dateBoundaryType) {
this.dateBoundaryType = dateBoundaryType;
}

@Override
public void addQualification(DateQualification dateQualification) {
// Set semantics apply: adding a qualification that is already present changes nothing.
this.dateQualifications.add(dateQualification);
}

/**
* Create an {@link DateBoundaryType#UNKNOWN} instant.
*
Expand Down Expand Up @@ -188,7 +196,7 @@ public Integer getCentury() {
int centuryDivision = year.getValue() / YearPrecision.CENTURY.getDuration();
int centuryModulo = year.getValue() % YearPrecision.CENTURY.getDuration();
//For case 1900 it is 19th. For case 1901 it is 20th century
return centuryModulo == 0 ? centuryDivision : centuryDivision + 1;
return (centuryModulo == 0) ? centuryDivision : (centuryDivision + 1);
}

/**
Expand Down Expand Up @@ -230,7 +238,7 @@ public String toString() {
stringBuilder.append(
ofNullable(yearMonthDay).map(LocalDate::getDayOfMonth).map(decimalFormat::format).map(d -> "-" + d).orElse(""));
}
stringBuilder.append(dateQualification.getCharacter());
stringBuilder.append(DateQualification.getCharacterFromQualifications(dateQualifications));
return stringBuilder.toString();
}

Expand All @@ -256,13 +264,13 @@ public boolean equals(Object o) {
}
InstantEdtfDate that = (InstantEdtfDate) o;
return yearPrecision == that.yearPrecision && Objects.equals(year, that.year) && Objects.equals(month,
that.month) && Objects.equals(yearMonthDay, that.yearMonthDay) && dateQualification == that.dateQualification
that.month) && Objects.equals(yearMonthDay, that.yearMonthDay) && dateQualifications == that.dateQualifications
&& dateBoundaryType == that.dateBoundaryType;
}

@Override
public int hashCode() {
return Objects.hash(yearPrecision, year, month, yearMonthDay, dateQualification, dateBoundaryType);
return Objects.hash(yearPrecision, year, month, yearMonthDay, dateQualifications, dateBoundaryType);
}

public Year getYear() {
Expand All @@ -281,8 +289,8 @@ public YearPrecision getYearPrecision() {
return yearPrecision;
}

public DateQualification getDateQualification() {
return dateQualification;
/**
 * Get a copy of the date qualifications of this date.
 * <p>Modifying the returned set does not affect this instance.</p>
 *
 * @return a new set with the date qualifications, possibly empty
 */
@Override
public Set<DateQualification> getDateQualifications() {
  // EnumSet.copyOf(Collection) throws IllegalArgumentException for an empty collection that is not itself an EnumSet.
  // The field is declared as a plain Set, so fill a fresh EnumSet instead; that works for any Set implementation.
  final Set<DateQualification> copy = EnumSet.noneOf(DateQualification.class);
  copy.addAll(dateQualifications);
  return copy;
}

public DateBoundaryType getDateBoundaryType() {
Expand Down
Loading