biopragmatics · cthoyt · Oct 19, 2024 · Oct 1, 2024 · Oct 7, 2024 · Oct 7, 2024
diff --git a/docs/source/curation.rst b/docs/source/curation.rst
@@ -5,3 +5,7 @@ There are several curation workflows implemented in :mod:`bioregistry.curation`.
 Bulk Import
 -----------
 .. automodapi:: bioregistry.curation.bulk_import
+
+Semi-automated Literature Curation
+----------------------------------
+.. automodapi:: bioregistry.curation.literature
diff --git a/src/bioregistry/constants.py b/src/bioregistry/constants.py
@@ -33,6 +33,7 @@
 COLLECTIONS_PATH = DATA_DIRECTORY / "collections.json"
 MISMATCH_PATH = DATA_DIRECTORY / "mismatch.json"
 CONTEXTS_PATH = DATA_DIRECTORY / "contexts.json"
+CURATED_PAPERS_PATH = DATA_DIRECTORY / "curated_papers.tsv"
 
 BIOREGISTRY_MODULE = pystow.module("bioregistry")
 

diff --git a/src/bioregistry/curation/curated_papers.csv b/src/bioregistry/curation/curated_papers.csv
diff --git a/src/bioregistry/curation/literature.py b/src/bioregistry/curation/literature.py
@@ -0,0 +1,39 @@
+"""Utilities for working with the data produced by the semi-automated curation workflow."""
+
+import enum
+
+__all__ = [
+    "CurationRelevance",
+    "COLUMNS",
+]
+
+COLUMNS = [  # column names of the curated papers TSV file
+    "pmid",  # PubMed identifier of the paper (digits only)
+    "relevant",  # flag, either 0 or 1
+    "orcid",  # ORCID identifier, presumably of the curator
+    "date_curated",  # date formatted as YYYY-MM-DD
+    "relevancy_type",  # name of a CurationRelevance member
+    "pr_added",  # links back to the PR where curations were done
+    "notes",  # free text, may be empty
+]
+
+
+class CurationRelevance(str, enum.Enum):
+    """An enumeration of the relevancy types that can be assigned to a curated paper."""
+
+    #: A resource for new primary identifiers (i.e., a new prefix)
+    new_prefix = enum.auto()
+    #: A resolver for existing identifiers (i.e., a new provider for an existing prefix)
+    new_provider = enum.auto()
+    #: A new publication for an existing prefix
+    new_publication = enum.auto()
+    #: A database, but not for identifier information
+    not_identifiers_resource = enum.auto()
+    #: Paper suggestive of a new database, but no link to a website is provided
+    no_website = enum.auto()
+    #: A resource that already has an entry in the Bioregistry
+    existing = enum.auto()
+    #: Not clear how to curate in the Bioregistry; follow-up discussion required
+    unclear = enum.auto()
+    #: Completely unrelated information
+    irrelevant_other = enum.auto()
diff --git a/src/bioregistry/data/bioregistry.json b/src/bioregistry/data/bioregistry.json
@@ -85844,6 +85844,24 @@
       "prefix": "pdbj",
       "uri_format": "http://service.pdbj.org/mine/Detail?PDBID=$1&PAGEID=Summary"
     },
+    "providers": [
+      {
+        "code": "furna",
+        "description": "FURNA (Functions of RNAs) is a database of ligand-RNA interactions and Gene Ontology annotations for RNAs in the Protein Data Bank (PDB).",
+        "homepage": "https://seq2fun.dcmb.med.umich.edu/furna/",
+        "name": "furna",
+        "publications": [
+          {
+            "doi": "10.1371/journal.pbio.3002476",
+            "pmc": "PMC11309384",
+            "pubmed": "39074139",
+            "title": "FURNA: A database for functional annotations of RNA structures",
+            "year": 2024
+          }
+        ],
+        "uri_format": "https://seq2fun.dcmb.med.umich.edu/furna/pdb.cgi?pdbid=$1"
+      }
+    ],
     "publications": [
       {
         "doi": "10.1002/pro.4211",

diff --git a/src/bioregistry/data/curated_papers.tsv b/src/bioregistry/data/curated_papers.tsv
@@ -0,0 +1,21 @@
+pmid	relevant	orcid	date_curated	relevancy_type	pr_added	notes
+39104285	1	0009-0009-5240-7463	2024-09-24	new_provider	1193	Provider for UniProt IDs, issue with curation due to multiple URI formats depending on plant species
+39074139	1	0009-0009-5240-7463	2024-09-24	new_provider	1193	Resolver for PDB IDs
+39014503	0	0009-0009-5240-7463	2024-09-25	no_website		
+39047988	0	0009-0009-5240-7463	2024-09-25	irrelevant_other		
+39115390	0	0009-0009-5240-7463	2024-09-26	irrelevant_other		
+39095357	0	0009-0009-5240-7463	2024-09-26	irrelevant_other		
+39084442	0	0009-0009-5240-7463	2024-09-27	not_identifiers_resource		
+38991851	1	0009-0009-5240-7463	2024-09-28	unclear		identifiers for sharing, retrieving, and validating sample metadata. Unclear if this should be curated as a prefix, provider or a separate registry
+38991828	0	0009-0009-5240-7463	2024-09-28	irrelevant_other		
+39049520	0	0009-0009-5240-7463	2024-09-30	not_identifiers_resource		
+39104826	1	0009-0009-5240-7463	2024-10-01	existing		Already present in the bioregistry as a provider for mesh prefix
+39050757	0	0009-0009-5240-7463	2024-10-01	irrelevant_other		
+39064021	0	0009-0009-5240-7463	2024-10-01	irrelevant_other		
+39028894	0	0009-0009-5240-7463	2024-10-04	not_identifiers_resource		
+39044201	0	0009-0009-5240-7463	2024-10-04	not_identifiers_resource		Potential resource for rare diseases identifiers, but not identifier information
+39088253	0	0009-0009-5240-7463	2024-10-05	irrelevant_other		
+39119155	0	0009-0009-5240-7463	2024-10-05	irrelevant_other		
+39005357	0	0009-0009-5240-7463	2024-10-05	irrelevant_other		
+39044130	0	0009-0009-5240-7463	2024-10-05	irrelevant_other		
+39010878	0	0009-0009-5240-7463	2024-10-05	irrelevant_other		
diff --git a/tests/test_curated_papers.py b/tests/test_curated_papers.py
@@ -0,0 +1,69 @@
+"""Test for checking the integrity of the curated_papers TSV file."""
+
+import csv
+import unittest
+from datetime import datetime
+
+from bioregistry.constants import CURATED_PAPERS_PATH, ORCID_PATTERN
+from bioregistry.curation.literature import COLUMNS, CurationRelevance
+
+
+class TestTSV(unittest.TestCase):
+    """Check that every row of the curated papers TSV file is well-formed."""
+
+    def setUp(self):
+        """Collect the names of the allowed relevancy types."""
+        # Rows are compared against the member names, not the enum values
+        self.relevancy_types = {r.name for r in CurationRelevance}
+
+    def validate_row(self, row):
+        """Validate a single row from the TSV file.
+
+        Checks that every expected column is present, that the PubMed and pull
+        request identifiers are numeric, and that the relevance flag, relevancy
+        type, ORCID, notes, and curation date are well-formed.
+
+        :param row: A dictionary from column name to cell text, as produced by
+            :class:`csv.DictReader`
+        """
+        for field in COLUMNS:
+            self.assertIn(field, row)
+
+        self.assertTrue(row["pmid"].isdigit(), msg="PubMed identifier should be an integer")
+
+        # Allow pr_added to be empty
+        if row["pr_added"]:
+            self.assertTrue(row["pr_added"].isdigit(), msg="Pull Request should be an integer")
+
+        # Validate relevant is 0 or 1
+        self.assertIn(row["relevant"], ["0", "1"])
+
+        # NOTE: relevant rows are intentionally not required to carry a standardized
+        # prefix, since the file has no "prefix" column. Such a check can be
+        # re-implemented if a need for it arises in the future.
+
+        # Validate relevancy_type is the name of a CurationRelevance member
+        self.assertIn(row["relevancy_type"], self.relevancy_types)
+
+        self.assertRegex(row["orcid"], ORCID_PATTERN)
+
+        # Handle None values for notes, otherwise make sure no stray quotes remain
+        if row["notes"] is not None:
+            self.assertFalse(row["notes"].startswith('"'))
+            self.assertFalse(row["notes"].endswith('"'))
+
+        # Validate date_curated format
+        try:
+            datetime.strptime(row["date_curated"], "%Y-%m-%d")
+        except ValueError:
+            self.fail("date_curated should follow format YYYY-MM-DD")
+
+    def test_tsv_file(self):
+        """Test that all rows in the TSV file are valid."""
+        # newline="" lets the csv module handle line endings itself, and the explicit
+        # encoding keeps the test independent of the platform's default locale
+        with CURATED_PAPERS_PATH.open(encoding="utf-8", newline="") as tsv_file:
+            reader = csv.DictReader(tsv_file, delimiter="\t")
+            for row, data in enumerate(reader, start=1):
+                with self.subTest(row=row, data=data):
+                    self.validate_row(data)