class CurationRelevance(str, enum.Enum):
    """An enumeration for curation relevance.

    Every member is a :class:`str` whose value is identical to its name, so
    the text stored in the ``relevancy_type`` column of the curated papers
    sheet can be converted directly, e.g. ``CurationRelevance("new_prefix")``,
    and members compare equal to that text.
    """

    def _generate_next_value_(name, start, count, last_values):  # noqa: N805
        """Return the member name as the value generated by :func:`enum.auto`.

        Without this hook, :func:`enum.auto` produces consecutive integers,
        which the ``str`` mix-in turns into ``"1"``, ``"2"``, ... instead of
        the vocabulary terms. It must be defined before the first member.
        """
        return name

    #: A resource for new primary identifiers
    new_prefix = enum.auto()
    #: A resolver for existing identifiers
    new_provider = enum.auto()
    #: A new publication for an existing prefix
    new_publication = enum.auto()
    #: A database, but not for identifier information
    not_identifiers_resource = enum.auto()
    #: Paper suggestive of a new database, but no link to website provided
    no_website = enum.auto()
    #: An existing entry in the bioregistry
    existing = enum.auto()
    #: Not clear how to curate in the bioregistry, follow up discussion required
    unclear = enum.auto()
    #: Completely unrelated information
    irrelevant_other = enum.auto()
class TestTSV(unittest.TestCase):
    """Tests for curated_papers tsv file."""

    def setUp(self):
        """Set up the test case."""
        # The sheet stores the *names* of the relevance vocabulary terms
        self.relevancy_types = {r.name for r in CurationRelevance}

    def validate_row(self, row):
        """Validate a single row from the TSV file.

        :param row: A dictionary from column name to cell text, as produced by
            :class:`csv.DictReader`. Cells missing from a short row are ``None``.
        """
        for field in COLUMNS:
            self.assertIn(field, row)

        # csv.DictReader pads short rows with None. Check the mandatory cells
        # up front so a truncated row fails with a readable message rather
        # than an AttributeError/TypeError in the checks below.
        for field in ("pmid", "relevant", "orcid", "date_curated", "relevancy_type"):
            self.assertIsNotNone(row[field], msg=f"{field} is missing from the row")

        self.assertTrue(row["pmid"].isdigit(), msg="PubMed identifier should be an integer")

        # Allow pr_added to be empty
        if row["pr_added"]:
            self.assertTrue(row["pr_added"].isdigit(), msg="Pull Request should be an integer")

        # Validate relevant is 0 or 1
        self.assertIn(row["relevant"], ["0", "1"])

        # Validate relevancy_type is in relevancy_vocab
        self.assertIn(row["relevancy_type"], self.relevancy_types)

        self.assertRegex(row["orcid"], ORCID_PATTERN)

        # Handle None values for notes
        # NOTE(review): the default csv dialect already strips quotes that
        # enclose a whole cell, so this presumably only catches stray quotes
        # that survive parsing - confirm this is the intended strictness.
        if row["notes"] is not None:
            self.assertFalse(row["notes"].startswith('"'))
            self.assertFalse(row["notes"].endswith('"'))

        # Validate date_curated format
        try:
            datetime.strptime(row["date_curated"], "%Y-%m-%d")
        except ValueError:
            self.fail("date_curated should follow format YYYY-MM-DD")

    def test_tsv_file(self):
        """Tests all rows in TSV file are valid."""
        # newline="" is what the csv module asks for when handing it a file
        # object; the explicit encoding avoids depending on the platform locale
        with CURATED_PAPERS_PATH.open(newline="", encoding="utf-8") as tsv_file:
            reader = csv.DictReader(tsv_file, delimiter="\t")
            for row, data in enumerate(reader, start=1):
                with self.subTest(row=row, data=data):
                    self.validate_row(data)