Skip to content

Commit

Permalink
feat: use looser comparison for title validation (#217) (#226)
Browse files Browse the repository at this point in the history
  • Loading branch information
hunterckx authored May 23, 2024
1 parent 5a41254 commit b25f2c1
Show file tree
Hide file tree
Showing 3 changed files with 103 additions and 1 deletion.
42 changes: 42 additions & 0 deletions __tests__/source-dataset-validation-results.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ import {
ATLAS_WITH_SOURCE_DATASET_VALIDATIONS_B,
SOURCE_DATASET_PUBLISHED_WITH_HCA,
SOURCE_DATASET_PUBLISHED_WITH_HCA_TITLE_MISMATCH,
SOURCE_DATASET_PUBLISHED_WITH_HCA_TITLE_NEAR_MATCH,
SOURCE_DATASET_PUBLISHED_WITH_NO_HCA_OR_CELLXGENE,
SOURCE_DATASET_PUBLISHED_WITH_NO_HCA_PRIMARY_DATA,
SOURCE_DATASET_UNPUBLISHED_WITH_CELLXGENE,
Expand Down Expand Up @@ -117,6 +118,39 @@ const VALIDATIONS_PUBLISHED_WITH_HCA_TITLE_MISMATCH: ExpectedValidationPropertie
},
];

// Expected validation results for the "title near match" source dataset:
// the CELLxGENE ingest validation fails (task still to do), while all three
// HCA Data Repository validations -- including the title-match metadata
// validation -- pass with their tasks done.
const VALIDATIONS_PUBLISHED_WITH_HCA_TITLE_NEAR_MATCH: ExpectedValidationProperties[] =
[
{
system: SYSTEM.CELLXGENE,
taskStatus: TASK_STATUS.TODO,
validationId: VALIDATION_ID.SOURCE_DATASET_IN_CELLXGENE,
validationStatus: VALIDATION_STATUS.FAILED,
validationType: VALIDATION_TYPE.INGEST,
},
{
system: SYSTEM.HCA_DATA_REPOSITORY,
taskStatus: TASK_STATUS.DONE,
validationId: VALIDATION_ID.SOURCE_DATASET_IN_HCA_DATA_REPOSITORY,
validationStatus: VALIDATION_STATUS.PASSED,
validationType: VALIDATION_TYPE.INGEST,
},
{
// The title check is expected to pass even though the titles are not
// character-for-character identical.
system: SYSTEM.HCA_DATA_REPOSITORY,
taskStatus: TASK_STATUS.DONE,
validationId:
VALIDATION_ID.SOURCE_DATASET_TITLE_MATCHES_HCA_DATA_REPOSITORY,
validationStatus: VALIDATION_STATUS.PASSED,
validationType: VALIDATION_TYPE.METADATA,
},
{
system: SYSTEM.HCA_DATA_REPOSITORY,
taskStatus: TASK_STATUS.DONE,
validationId: VALIDATION_ID.SOURCE_DATASET_HCA_PROJECT_HAS_PRIMARY_DATA,
validationStatus: VALIDATION_STATUS.PASSED,
validationType: VALIDATION_TYPE.INGEST,
},
];

const VALIDATIONS_PUBLISHED_WITH_NO_HCA_PRIMARY_DATA: ExpectedValidationProperties[] =
[
{
Expand Down Expand Up @@ -207,6 +241,14 @@ describe("getSourceDatasetValidationResults", () => {
);
});

// The HCA project title differs from the publication title only
// superficially, so the title-match validation is expected to pass.
it("returns validations for source dataset with HCA project with approximately-matching title", async () => {
await testValidations(
SOURCE_DATASET_PUBLISHED_WITH_HCA_TITLE_NEAR_MATCH,
[ATLAS_WITH_SOURCE_DATASET_VALIDATIONS_A],
VALIDATIONS_PUBLISHED_WITH_HCA_TITLE_NEAR_MATCH
);
});

it("returns validations for source dataset with HCA project without primary data", async () => {
await testValidations(
SOURCE_DATASET_PUBLISHED_WITH_NO_HCA_PRIMARY_DATA,
Expand Down
23 changes: 22 additions & 1 deletion app/services/validations.ts
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import { dequal } from "dequal";
import DOMPurify from "isomorphic-dompurify";
import pg from "pg";
import {
DBEntityOfType,
Expand Down Expand Up @@ -93,7 +94,8 @@ export const SOURCE_DATASET_VALIDATIONS: ValidationDefinition<HCAAtlasTrackerDBS
(projectInfo, infoProperties, publication) => {
const expected = publication.title;
const actual = projectInfo?.title ?? null;
const valid = expected === actual;
const valid =
actual === null ? false : titlesMatch(expected, actual);
const info: ValidationStatusInfo = {
...infoProperties,
valid,
Expand Down Expand Up @@ -505,3 +507,22 @@ export async function updateTargetCompletions(
client.release();
}
}

/**
 * Determine whether two titles are similar enough to be considered the same.
 * Each title is reduced to a canonical form before comparison: HTML markup is
 * removed, the text is Unicode-normalized (NFKD) and lowercased, punctuation
 * and symbols are dropped, and whitespace runs are collapsed and trimmed.
 * @param a - First title.
 * @param b - Second title.
 * @returns true if the titles match.
 */
function titlesMatch(a: string, b: string): boolean {
  return simplifyString(a) === simplifyString(b);

  /**
   * Reduce a title to the canonical form used for comparison.
   * @param s - Title to simplify.
   * @returns simplified title.
   */
  function simplifyString(s: string): string {
    // Entities that HTML serialization introduces into plain text content.
    const serializedEntities = new Map([
      ["amp", "&"],
      ["gt", ">"],
      ["lt", "<"],
      ["nbsp", "\u00a0"],
    ]);
    return (
      DOMPurify.sanitize(s, { ALLOWED_TAGS: ["#text"] })
        // sanitize() returns serialized HTML, so "&", "<", ">" and
        // non-breaking spaces come back as entities. Decode them before
        // stripping punctuation; otherwise the entity names ("amp", "nbsp",
        // ...) would survive as letters and, for example, a non-breaking
        // space would no longer compare equal to a regular space.
        .replace(
          /&(amp|gt|lt|nbsp);/g,
          (entity: string, name: string) =>
            serializedEntities.get(name) ?? entity
        )
        .normalize("NFKD")
        .toLowerCase()
        .replace(/[\p{P}\p{S}]/gu, "")
        .replace(/\s+/g, " ")
        .trim()
    );
  }
}
39 changes: 39 additions & 0 deletions testing/constants.ts
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,9 @@ export const DOI_PUBLISHED_WITH_HCA = "10.123/sd-published-with-hca";
export const DOI_PUBLISHED_WITH_HCA_TITLE_MISMATCH =
"10.123/sd-published-with-hca-title-mismatch";

// DOI shared by the "title near match" test source dataset and its HCA
// projects response.
export const DOI_PUBLISHED_WITH_HCA_TITLE_NEAR_MATCH =
"10.123/sd-published-with-hca-title-near-match";

export const DOI_PUBLISHED_WITH_NO_HCA_PRIMARY_DATA =
"10.123/sd-published-with-no-hca-primary-data";

Expand Down Expand Up @@ -286,6 +289,9 @@ export const HCA_ID_PUBLISHED_WITH_HCA = "hca-id-published-with-hca";
export const HCA_ID_PUBLISHED_WITH_HCA_TITLE_MISMATCH =
"hca-id-published-with-hca-title-mismatch";

// HCA project ID used by the "title near match" test projects response.
export const HCA_ID_PUBLISHED_WITH_HCA_TITLE_NEAR_MATCH =
"hca-id-published-with-hca-title-near-match";

export const HCA_ID_PUBLISHED_WITH_NO_HCA_PRIMARY_DATA =
"hca-id-published-with-no-hca-primary-data";

Expand Down Expand Up @@ -329,6 +335,13 @@ export const HCA_PROJECTS_RESPONSE_PUBLISHED_WITH_HCA_TITLE_MISMATCH =
"Published With HCA Title Mismatch MISMATCHED"
);

// Test HCA projects response for the "title near match" case. The string
// argument (presumably the project title -- confirm against
// makeTestProjectsResponse) contains a dash, different letter case, inline
// <i> markup, a trailing period and trailing whitespace.
export const HCA_PROJECTS_RESPONSE_PUBLISHED_WITH_HCA_TITLE_NEAR_MATCH =
makeTestProjectsResponse(
HCA_ID_PUBLISHED_WITH_HCA_TITLE_NEAR_MATCH,
DOI_PUBLISHED_WITH_HCA_TITLE_NEAR_MATCH,
"Published – With Hca Title <i>Near</i> Match. "
);

export const HCA_PROJECTS_RESPONSE_PUBLISHED_WITH_NO_HCA_PRIMARY_DATA =
makeTestProjectsResponse(
HCA_ID_PUBLISHED_WITH_NO_HCA_PRIMARY_DATA,
Expand All @@ -346,6 +359,10 @@ export const TEST_HCA_PROJECTS_BY_DOI = new Map([
DOI_PUBLISHED_WITH_HCA_TITLE_MISMATCH,
HCA_PROJECTS_RESPONSE_PUBLISHED_WITH_HCA_TITLE_MISMATCH,
],
[
DOI_PUBLISHED_WITH_HCA_TITLE_NEAR_MATCH,
HCA_PROJECTS_RESPONSE_PUBLISHED_WITH_HCA_TITLE_NEAR_MATCH,
],
[
DOI_PUBLISHED_WITH_NO_HCA_PRIMARY_DATA,
HCA_PROJECTS_RESPONSE_PUBLISHED_WITH_NO_HCA_PRIMARY_DATA,
Expand Down Expand Up @@ -519,6 +536,26 @@ export const SOURCE_DATASET_PUBLISHED_WITH_HCA_TITLE_MISMATCH: TestPublishedSour
},
};

// Test source dataset with an OK DOI status whose publication title is the
// plain form, "Published With HCA Title Near Match", used by the
// approximately-matching-title validation test.
export const SOURCE_DATASET_PUBLISHED_WITH_HCA_TITLE_NEAR_MATCH: TestPublishedSourceDataset =
{
doi: DOI_PUBLISHED_WITH_HCA_TITLE_NEAR_MATCH,
doiStatus: DOI_STATUS.OK,
id: "351ab5d7-99e9-473d-bb07-397abd01a2f2",
publication: {
authors: [
{
name: "Bar Baz Foo",
personalName: null,
},
],
hasPreprintDoi: null,
journal: "Foo Bar Foo",
preprintOfDoi: null,
publicationDate: "2024-05-22",
title: "Published With HCA Title Near Match",
},
};

export const SOURCE_DATASET_PUBLISHED_WITH_NO_HCA_PRIMARY_DATA: TestPublishedSourceDataset =
{
doi: DOI_PUBLISHED_WITH_NO_HCA_PRIMARY_DATA,
Expand Down Expand Up @@ -570,6 +607,7 @@ export const INITIAL_TEST_SOURCE_DATASETS = [
SOURCE_DATASET_PUBLISHED_WITH_HCA,
SOURCE_DATASET_UNPUBLISHED_WITH_CELLXGENE,
SOURCE_DATASET_PUBLISHED_WITH_HCA_TITLE_MISMATCH,
SOURCE_DATASET_PUBLISHED_WITH_HCA_TITLE_NEAR_MATCH,
SOURCE_DATASET_PUBLISHED_WITH_NO_HCA_PRIMARY_DATA,
SOURCE_DATASET_PUBLISHED_WITH_NO_HCA_OR_CELLXGENE,
];
Expand Down Expand Up @@ -628,6 +666,7 @@ export const ATLAS_WITH_SOURCE_DATASET_VALIDATIONS_A: TestAtlas = {
SOURCE_DATASET_PUBLISHED_WITH_HCA.id,
SOURCE_DATASET_UNPUBLISHED_WITH_CELLXGENE.id,
SOURCE_DATASET_PUBLISHED_WITH_HCA_TITLE_MISMATCH.id,
SOURCE_DATASET_PUBLISHED_WITH_HCA_TITLE_NEAR_MATCH.id,
SOURCE_DATASET_PUBLISHED_WITH_NO_HCA_PRIMARY_DATA.id,
],
status: ATLAS_STATUS.DRAFT,
Expand Down

0 comments on commit b25f2c1

Please sign in to comment.