From 01b96851e4f80838f67ef5dc6c2b17c96da42087 Mon Sep 17 00:00:00 2001 From: Mufaddal Naguthanawala Date: Tue, 1 Oct 2024 18:48:52 -0400 Subject: [PATCH 01/22] update curated papers list --- src/bioregistry/curation/curated_papers.csv | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/src/bioregistry/curation/curated_papers.csv b/src/bioregistry/curation/curated_papers.csv index 66b3363c9..160d9ebbc 100644 --- a/src/bioregistry/curation/curated_papers.csv +++ b/src/bioregistry/curation/curated_papers.csv @@ -1 +1,8 @@ -pmid, relevant, relevancy_type, notes \ No newline at end of file +pmid,relevant,relevancy_type,notes +39104285,1,new_provider,“Provider for UniProt IDs, issue with multiple URI formats depending on plant species” +39074139,1,new_prefix,"Uses PDP and RNAcentral IDs to create RNA-ligand interaction page but unsure if creating “new” identifiers or rehashing existing ones" +39014503,1,no_website +39047988,0,irrelevant_other +39115390,0,irrelevant_other +39095357,0,irrelevant_other +39084442,0,not_identifiers_resource From 1824a51f441f8f1439c54c33b10d11332e5dad81 Mon Sep 17 00:00:00 2001 From: Mufaddal Naguthanawala Date: Mon, 7 Oct 2024 11:04:11 -0400 Subject: [PATCH 02/22] update curated papers list with papers identified on Aug 9 batch. Curate new prefix for PEPhub. 
Curate new provider for PDB --- src/bioregistry/curation/curated_papers.csv | 27 +++++++++++++++------ src/bioregistry/data/bioregistry.json | 22 +++++++++++++++++ 2 files changed, 42 insertions(+), 7 deletions(-) diff --git a/src/bioregistry/curation/curated_papers.csv b/src/bioregistry/curation/curated_papers.csv index 160d9ebbc..e76545c25 100644 --- a/src/bioregistry/curation/curated_papers.csv +++ b/src/bioregistry/curation/curated_papers.csv @@ -1,8 +1,21 @@ pmid,relevant,relevancy_type,notes -39104285,1,new_provider,“Provider for UniProt IDs, issue with multiple URI formats depending on plant species” -39074139,1,new_prefix,"Uses PDP and RNAcentral IDs to create RNA-ligand interaction page but unsure if creating “new” identifiers or rehashing existing ones" -39014503,1,no_website -39047988,0,irrelevant_other -39115390,0,irrelevant_other -39095357,0,irrelevant_other -39084442,0,not_identifiers_resource +39104285,1,new_provider,"Provider for UniProt IDs, issue with curation due to multiple URI formats depending on plant species" +39074139,1,new_provider,"Resolver for PDB IDs" +39014503,0,no_website, +39047988,0,irrelevant_other, +39115390,0,irrelevant_other, +39095357,0,irrelevant_other, +39084442,0,not_identifiers_resource, +38991851,0,new_prefix,"identifiers for sharing, retrieving, and validating sample metadata." 
+38991828,0,irrelevant_other, +39049520,0,not_identifiers_resource, +39104826,1,existing,"Already present in the bioregistry as a provider for mesh prefix" +39050757,0,irrelevant_other, +39064021,0,irrelevant_other, +39028894,0,not_identifiers_resource, +39044201,0,not_identifiers,"Potential resource for rare diseases identifiers, but not identifier information" +39088253,0,irrelevant_other, +39119155,0,irrelevant_other, +39005357,0,irrelevant_other, +39044130,0,irrelevant_other, +39010878,0,irrelevant_other, \ No newline at end of file diff --git a/src/bioregistry/data/bioregistry.json b/src/bioregistry/data/bioregistry.json index 4deb6abe7..8c260542e 100644 --- a/src/bioregistry/data/bioregistry.json +++ b/src/bioregistry/data/bioregistry.json @@ -85731,6 +85731,15 @@ "prefix": "pdbj", "uri_format": "http://service.pdbj.org/mine/Detail?PDBID=$1&PAGEID=Summary" }, + "providers": [ + { + "code": "furna", + "description": "FURNA (Functions of RNAs) is a database of ligand-RNA interactions and Gene Ontology annotations for RNAs in the Protein Data Bank (PDB).", + "homepage": "https://seq2fun.dcmb.med.umich.edu/furna/", + "name": "furna", + "uri_format": "https://seq2fun.dcmb.med.umich.edu/furna/pdb.cgi?pdbid=$1" + } + ], "publications": [ { "doi": "10.1002/pro.4211", @@ -86670,6 +86679,19 @@ "orcid": "0000-0003-4423-4370" } }, + "pephub": { + "contact": { + "email": "nsheffield@virginia.edu", + "name": "Nathan Sheffield", + "orcid": "0000-0001-5643-4068" + }, + "description": "PEPhub is a database, web interface, and API for sharing, retrieving, and validating sample metadata. 
PEPhub uses Portable Encapsulated Projects (PEP) biological metadata standard to store, edit, and access PEPs in one place.", + "example": "gse185244", + "homepage": "https://pephub.databio.org/", + "name": "PEPhub", + "pattern": "^gse[0-9]{6}$", + "uri_format": "https://pephub.databio.org/geo/$1" + }, "peptideatlas": { "biocontext": { "prefix": "PEPTIDEATLAS" From c3967ac558ffd4fc77ac2b8a635bba3e90cc7a40 Mon Sep 17 00:00:00 2001 From: Mufaddal Naguthanawala Date: Mon, 7 Oct 2024 15:34:17 -0400 Subject: [PATCH 03/22] Add contributor information, update regex pattern and examples for pephub --- src/bioregistry/data/bioregistry.json | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/src/bioregistry/data/bioregistry.json b/src/bioregistry/data/bioregistry.json index 8c260542e..450474eda 100644 --- a/src/bioregistry/data/bioregistry.json +++ b/src/bioregistry/data/bioregistry.json @@ -86685,12 +86685,21 @@ "name": "Nathan Sheffield", "orcid": "0000-0001-5643-4068" }, + "contributor": { + "email": "m.naguthana@hotmail.com", + "github": "nagutm", + "name": "Mufaddal Naguthanawala", + "orcid": "0009-0009-5240-7463" + }, "description": "PEPhub is a database, web interface, and API for sharing, retrieving, and validating sample metadata. 
PEPhub uses Portable Encapsulated Projects (PEP) biological metadata standard to store, edit, and access PEPs in one place.", - "example": "gse185244", + "example": [ + "geo", + "geo/gse185244" + ], "homepage": "https://pephub.databio.org/", "name": "PEPhub", - "pattern": "^gse[0-9]{6}$", - "uri_format": "https://pephub.databio.org/geo/$1" + "pattern": "^[A-Za-z0-9_\\-]+(/[A-Za-z0-9_\\-]+)?$", + "uri_format": "https://pephub.databio.org/$1" }, "peptideatlas": { "biocontext": { From 129f37040a4852d9f7e72416fbd9d978f949566e Mon Sep 17 00:00:00 2001 From: Mufaddal Naguthanawala Date: Mon, 7 Oct 2024 15:54:20 -0400 Subject: [PATCH 04/22] add example_extras to pephub --- src/bioregistry/data/bioregistry.json | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/bioregistry/data/bioregistry.json b/src/bioregistry/data/bioregistry.json index 450474eda..1eac9d3f4 100644 --- a/src/bioregistry/data/bioregistry.json +++ b/src/bioregistry/data/bioregistry.json @@ -86692,9 +86692,11 @@ "orcid": "0009-0009-5240-7463" }, "description": "PEPhub is a database, web interface, and API for sharing, retrieving, and validating sample metadata. 
PEPhub uses Portable Encapsulated Projects (PEP) biological metadata standard to store, edit, and access PEPs in one place.", - "example": [ - "geo", - "geo/gse185244" + "example": "geo", + "example_extras": [ + "geo/gse185244", + "bedbase/gse198944", + "databio/encode_batch_1" ], "homepage": "https://pephub.databio.org/", "name": "PEPhub", From 17a349084fdaf6b0b6144aa0c6534a8c93ba011e Mon Sep 17 00:00:00 2001 From: Mufaddal Naguthanawala Date: Mon, 7 Oct 2024 18:47:35 -0400 Subject: [PATCH 05/22] amend 'relevancy_type' and 'relevant' data for two entries in curated papers --- src/bioregistry/curation/curated_papers.csv | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/bioregistry/curation/curated_papers.csv b/src/bioregistry/curation/curated_papers.csv index e76545c25..4856a262c 100644 --- a/src/bioregistry/curation/curated_papers.csv +++ b/src/bioregistry/curation/curated_papers.csv @@ -6,14 +6,14 @@ pmid,relevant,relevancy_type,notes 39115390,0,irrelevant_other, 39095357,0,irrelevant_other, 39084442,0,not_identifiers_resource, -38991851,0,new_prefix,"identifiers for sharing, retrieving, and validating sample metadata." +38991851,1,new_prefix,"identifiers for sharing, retrieving, and validating sample metadata." 
38991828,0,irrelevant_other, 39049520,0,not_identifiers_resource, 39104826,1,existing,"Already present in the bioregistry as a provider for mesh prefix" 39050757,0,irrelevant_other, 39064021,0,irrelevant_other, 39028894,0,not_identifiers_resource, -39044201,0,not_identifiers,"Potential resource for rare diseases identifiers, but not identifier information" +39044201,0,not_identifiers_resource,"Potential resource for rare diseases identifiers, but not identifier information" 39088253,0,irrelevant_other, 39119155,0,irrelevant_other, 39005357,0,irrelevant_other, From 4ab43a2058dffe683879833bdeeab6c228afe91b Mon Sep 17 00:00:00 2001 From: Mufaddal Naguthanawala Date: Tue, 8 Oct 2024 12:30:06 -0400 Subject: [PATCH 06/22] remove PEPhub as a prefix --- src/bioregistry/curation/curated_papers.csv | 2 +- src/bioregistry/data/bioregistry.json | 24 --------------------- 2 files changed, 1 insertion(+), 25 deletions(-) diff --git a/src/bioregistry/curation/curated_papers.csv b/src/bioregistry/curation/curated_papers.csv index 4856a262c..6442a293f 100644 --- a/src/bioregistry/curation/curated_papers.csv +++ b/src/bioregistry/curation/curated_papers.csv @@ -6,7 +6,7 @@ pmid,relevant,relevancy_type,notes 39115390,0,irrelevant_other, 39095357,0,irrelevant_other, 39084442,0,not_identifiers_resource, -38991851,1,new_prefix,"identifiers for sharing, retrieving, and validating sample metadata." +38991851,1,unclear,"identifiers for sharing, retrieving, and validating sample metadata. 
Unclear if this should be curated as a prefix, provider or a separate registry" 38991828,0,irrelevant_other, 39049520,0,not_identifiers_resource, 39104826,1,existing,"Already present in the bioregistry as a provider for mesh prefix" diff --git a/src/bioregistry/data/bioregistry.json b/src/bioregistry/data/bioregistry.json index 1eac9d3f4..a45ce5dba 100644 --- a/src/bioregistry/data/bioregistry.json +++ b/src/bioregistry/data/bioregistry.json @@ -86679,30 +86679,6 @@ "orcid": "0000-0003-4423-4370" } }, - "pephub": { - "contact": { - "email": "nsheffield@virginia.edu", - "name": "Nathan Sheffield", - "orcid": "0000-0001-5643-4068" - }, - "contributor": { - "email": "m.naguthana@hotmail.com", - "github": "nagutm", - "name": "Mufaddal Naguthanawala", - "orcid": "0009-0009-5240-7463" - }, - "description": "PEPhub is a database, web interface, and API for sharing, retrieving, and validating sample metadata. PEPhub uses Portable Encapsulated Projects (PEP) biological metadata standard to store, edit, and access PEPs in one place.", - "example": "geo", - "example_extras": [ - "geo/gse185244", - "bedbase/gse198944", - "databio/encode_batch_1" - ], - "homepage": "https://pephub.databio.org/", - "name": "PEPhub", - "pattern": "^[A-Za-z0-9_\\-]+(/[A-Za-z0-9_\\-]+)?$", - "uri_format": "https://pephub.databio.org/$1" - }, "peptideatlas": { "biocontext": { "prefix": "PEPTIDEATLAS" From 65f1d8926f243a9aaeecdcaa0f1fa430669af983 Mon Sep 17 00:00:00 2001 From: Mufaddal Naguthanawala Date: Wed, 9 Oct 2024 09:26:00 -0400 Subject: [PATCH 07/22] update curated_papers with orcid and date --- src/bioregistry/curation/curated_papers.csv | 42 ++++++++++----------- 1 file changed, 21 insertions(+), 21 deletions(-) diff --git a/src/bioregistry/curation/curated_papers.csv b/src/bioregistry/curation/curated_papers.csv index 6442a293f..0fa0ca9a9 100644 --- a/src/bioregistry/curation/curated_papers.csv +++ b/src/bioregistry/curation/curated_papers.csv @@ -1,21 +1,21 @@ 
-pmid,relevant,relevancy_type,notes -39104285,1,new_provider,"Provider for UniProt IDs, issue with curation due to multiple URI formats depending on plant species" -39074139,1,new_provider,"Resolver for PDB IDs" -39014503,0,no_website, -39047988,0,irrelevant_other, -39115390,0,irrelevant_other, -39095357,0,irrelevant_other, -39084442,0,not_identifiers_resource, -38991851,1,unclear,"identifiers for sharing, retrieving, and validating sample metadata. Unclear if this should be curated as a prefix, provider or a separate registry" -38991828,0,irrelevant_other, -39049520,0,not_identifiers_resource, -39104826,1,existing,"Already present in the bioregistry as a provider for mesh prefix" -39050757,0,irrelevant_other, -39064021,0,irrelevant_other, -39028894,0,not_identifiers_resource, -39044201,0,not_identifiers_resource,"Potential resource for rare diseases identifiers, but not identifier information" -39088253,0,irrelevant_other, -39119155,0,irrelevant_other, -39005357,0,irrelevant_other, -39044130,0,irrelevant_other, -39010878,0,irrelevant_other, \ No newline at end of file +pmid,relevant,relevancy_type,notes,orcid,date +39104285,1,new_provider,"Provider for UniProt IDs, issue with curation due to multiple URI formats depending on plant species",0009-0009-5240-7463,09/24/2024 +39074139,1,new_provider,"Resolver for PDB IDs",0009-0009-5240-7463,09/24/2024 +39014503,0,no_website,,0009-0009-5240-7463,09/25/2024 +39047988,0,irrelevant_other,,0009-0009-5240-7463,09/25/2024 +39115390,0,irrelevant_other,,0009-0009-5240-7463,09/26/2024 +39095357,0,irrelevant_other,,0009-0009-5240-7463,09/26/2024 +39084442,0,not_identifiers_resource,,0009-0009-5240-7463,09/27/2024 +38991851,1,unclear,"identifiers for sharing, retrieving, and validating sample metadata. 
Unclear if this should be curated as a prefix, provider or a separate registry",0009-0009-5240-7463,09/28/2024 +38991828,0,irrelevant_other,,0009-0009-5240-7463,09/28/2024 +39049520,0,not_identifiers_resource,,0009-0009-5240-7463,09/30/2024 +39104826,1,existing,"Already present in the bioregistry as a provider for mesh prefix",0009-0009-5240-7463,10/1/2024 +39050757,0,irrelevant_other,,0009-0009-5240-7463,10/1/2024 +39064021,0,irrelevant_other,,0009-0009-5240-7463,10/1/2024 +39028894,0,not_identifiers_resource,,0009-0009-5240-7463,10/4/2024 +39044201,0,not_identifiers_resource,"Potential resource for rare diseases identifiers, but not identifier information",0009-0009-5240-7463,10/4/2024 +39088253,0,irrelevant_other,,0009-0009-5240-7463,10/5/2024 +39119155,0,irrelevant_other,,0009-0009-5240-7463,10/5/2024 +39005357,0,irrelevant_other,,0009-0009-5240-7463,10/5/2024 +39044130,0,irrelevant_other,,0009-0009-5240-7463,10/5/2024 +39010878,0,irrelevant_other,,0009-0009-5240-7463,10/5/2024 \ No newline at end of file From 31a6106f8215e38b2ec33ad9d87c62d39db2ad6f Mon Sep 17 00:00:00 2001 From: Mufaddal Naguthanawala Date: Fri, 11 Oct 2024 13:44:24 -0400 Subject: [PATCH 08/22] add unit test for validating input in curated_papers file. 
--- src/bioregistry/constants.py | 12 +++++ src/bioregistry/curation/curated_papers.csv | 21 -------- src/bioregistry/data/curated_papers.txt | 21 ++++++++ tests/test_curated_papers.py | 57 +++++++++++++++++++++ 4 files changed, 90 insertions(+), 21 deletions(-) delete mode 100644 src/bioregistry/curation/curated_papers.csv create mode 100644 src/bioregistry/data/curated_papers.txt create mode 100644 tests/test_curated_papers.py diff --git a/src/bioregistry/constants.py b/src/bioregistry/constants.py index 29b75926c..5f4b7f7d5 100644 --- a/src/bioregistry/constants.py +++ b/src/bioregistry/constants.py @@ -19,6 +19,7 @@ "MISMATCH_PATH", "BIOREGISTRY_MODULE", "RAW_DIRECTORY", + "CURATED_PAPERS_PATH" ] PYDANTIC_1 = importlib.metadata.version("pydantic").startswith("1.") @@ -33,6 +34,17 @@ COLLECTIONS_PATH = DATA_DIRECTORY / "collections.json" MISMATCH_PATH = DATA_DIRECTORY / "mismatch.json" CONTEXTS_PATH = DATA_DIRECTORY / "contexts.json" +CURATED_PAPERS_PATH = DATA_DIRECTORY / "curated_papers.txt" +CURATED_PAPERS_RELEVANCY_VOCAB = [ + "new_prefix", + "new_provider", + "new_publication", + "not_identifiers_resource", + "no_website", + "existing", + "unclear", + "irrelevant_other" +] BIOREGISTRY_MODULE = pystow.module("bioregistry") diff --git a/src/bioregistry/curation/curated_papers.csv b/src/bioregistry/curation/curated_papers.csv deleted file mode 100644 index 0fa0ca9a9..000000000 --- a/src/bioregistry/curation/curated_papers.csv +++ /dev/null @@ -1,21 +0,0 @@ -pmid,relevant,relevancy_type,notes,orcid,date -39104285,1,new_provider,"Provider for UniProt IDs, issue with curation due to multiple URI formats depending on plant species",0009-0009-5240-7463,09/24/2024 -39074139,1,new_provider,"Resolver for PDB IDs",0009-0009-5240-7463,09/24/2024 -39014503,0,no_website,,0009-0009-5240-7463,09/25/2024 -39047988,0,irrelevant_other,,0009-0009-5240-7463,09/25/2024 -39115390,0,irrelevant_other,,0009-0009-5240-7463,09/26/2024 
-39095357,0,irrelevant_other,,0009-0009-5240-7463,09/26/2024 -39084442,0,not_identifiers_resource,,0009-0009-5240-7463,09/27/2024 -38991851,1,unclear,"identifiers for sharing, retrieving, and validating sample metadata. Unclear if this should be curated as a prefix, provider or a separate registry",0009-0009-5240-7463,09/28/2024 -38991828,0,irrelevant_other,,0009-0009-5240-7463,09/28/2024 -39049520,0,not_identifiers_resource,,0009-0009-5240-7463,09/30/2024 -39104826,1,existing,"Already present in the bioregistry as a provider for mesh prefix",0009-0009-5240-7463,10/1/2024 -39050757,0,irrelevant_other,,0009-0009-5240-7463,10/1/2024 -39064021,0,irrelevant_other,,0009-0009-5240-7463,10/1/2024 -39028894,0,not_identifiers_resource,,0009-0009-5240-7463,10/4/2024 -39044201,0,not_identifiers_resource,"Potential resource for rare diseases identifiers, but not identifier information",0009-0009-5240-7463,10/4/2024 -39088253,0,irrelevant_other,,0009-0009-5240-7463,10/5/2024 -39119155,0,irrelevant_other,,0009-0009-5240-7463,10/5/2024 -39005357,0,irrelevant_other,,0009-0009-5240-7463,10/5/2024 -39044130,0,irrelevant_other,,0009-0009-5240-7463,10/5/2024 -39010878,0,irrelevant_other,,0009-0009-5240-7463,10/5/2024 \ No newline at end of file diff --git a/src/bioregistry/data/curated_papers.txt b/src/bioregistry/data/curated_papers.txt new file mode 100644 index 000000000..fbba71d80 --- /dev/null +++ b/src/bioregistry/data/curated_papers.txt @@ -0,0 +1,21 @@ +pmid relevant relevancy_type notes orcid date_curated +39104285 1 new_provider "Provider for UniProt IDs, issue with curation due to multiple URI formats depending on plant species" 0009-0009-5240-7463 2024-09-24 +39074139 1 new_provider Resolver for PDB IDs 0009-0009-5240-7463 2024-09-24 +39014503 0 no_website 0009-0009-5240-7463 2024-09-25 +39047988 0 irrelevant_other 0009-0009-5240-7463 2024-09-25 +39115390 0 irrelevant_other 0009-0009-5240-7463 2024-09-26 +39095357 0 irrelevant_other 0009-0009-5240-7463 2024-09-26 +39084442 
0 not_identifiers_resource 0009-0009-5240-7463 2024-09-27 +38991851 1 unclear "identifiers for sharing, retrieving, and validating sample metadata. Unclear if this should be curated as a prefix, provider or a separate registry" 0009-0009-5240-7463 2024-09-28 +38991828 0 irrelevant_other 0009-0009-5240-7463 2024-09-28 +39049520 0 not_identifiers_resource 0009-0009-5240-7463 2024-09-30 +39104826 1 existing Already present in the bioregistry as a provider for mesh prefix 0009-0009-5240-7463 2024-10-01 +39050757 0 irrelevant_other 0009-0009-5240-7463 2024-10-01 +39064021 0 irrelevant_other 0009-0009-5240-7463 2024-10-01 +39028894 0 not_identifiers_resource 0009-0009-5240-7463 2024-10-04 +39044201 0 not_identifiers_resource Potential resource for rare diseases identifiers, but not identifier information 0009-0009-5240-7463 2024-10-04 +39088253 0 irrelevant_other 0009-0009-5240-7463 2024-10-05 +39119155 0 irrelevant_other 0009-0009-5240-7463 2024-10-05 +39005357 0 irrelevant_other 0009-0009-5240-7463 2024-10-05 +39044130 0 irrelevant_other 0009-0009-5240-7463 2024-10-05 +39010878 0 irrelevant_other 0009-0009-5240-7463 2024-10-05 diff --git a/tests/test_curated_papers.py b/tests/test_curated_papers.py new file mode 100644 index 000000000..da4fd3934 --- /dev/null +++ b/tests/test_curated_papers.py @@ -0,0 +1,57 @@ +# -*- coding: utf-8 -*- + +"""Test for checking the integrity of the curated_papers TSV file.""" + +import csv +from datetime import datetime +import re +import unittest + +from bioregistry.constants import CURATED_PAPERS_PATH, ORCID_PATTERN, CURATED_PAPERS_RELEVANCY_VOCAB + + +class TestTSV(unittest.TestCase): + """Tests for curated_papers tsv file""" + + def setUp(self): + """Set up the test case.""" + self.tsv_file_path = CURATED_PAPERS_PATH + self.relevancy_vocab = CURATED_PAPERS_RELEVANCY_VOCAB + self.orcid_pattern = re.compile(ORCID_PATTERN) + + def validate_row(self, row): + """Validates a single row from the TSV file""" + + # Validate required fields + 
required_fields = ["pmid", "relevant", "relevancy_type", "orcid", "date_curated"] + for field in required_fields: + self.assertIn(field, row) + + # Validate pmid is an integer + self.assertTrue(row["pmid"].isdigit()) + + # Validate relevant is 0 or 1 + self.assertIn(row["relevant"], ["0", "1"]) + + # Validate relevancy_type is in relevancy_vocab + self.assertIn(row["relevancy_type"], self.relevancy_vocab) + + # Validate orcid against oricd_pattern + self.assertTrue(self.orcid_pattern.match(row["orcid"])) + + # Validate date_curated format + try: + datetime.strptime(row["date_curated"], "%Y-%m-%d") + except ValueError: + self.fail(f"Date_curated should follow format YYYY-MM-DD") + + def test_tsv_file(self): + """Tests all rows in TSV file are valid""" + with open(self.tsv_file_path, mode='r') as tsv_file: + tsv_reader = csv.DictReader(tsv_file, delimiter='\t') + for row in tsv_reader: + with self.subTest(row=row): + self.validate_row(row) + +if __name__ == '__main__': + unittest.main() \ No newline at end of file From a2f731bcba445a8c118ca2bc05e35d3b7f5e966a Mon Sep 17 00:00:00 2001 From: Mufaddal Naguthanawala Date: Fri, 11 Oct 2024 14:39:22 -0400 Subject: [PATCH 09/22] fix style issues --- src/bioregistry/constants.py | 17 ++++++------ src/bioregistry/data/curated_papers.txt | 2 +- tests/test_curated_papers.py | 36 ++++++++++++++----------- 3 files changed, 29 insertions(+), 26 deletions(-) diff --git a/src/bioregistry/constants.py b/src/bioregistry/constants.py index 5f4b7f7d5..ab14b2957 100644 --- a/src/bioregistry/constants.py +++ b/src/bioregistry/constants.py @@ -19,7 +19,6 @@ "MISMATCH_PATH", "BIOREGISTRY_MODULE", "RAW_DIRECTORY", - "CURATED_PAPERS_PATH" ] PYDANTIC_1 = importlib.metadata.version("pydantic").startswith("1.") @@ -36,14 +35,14 @@ CONTEXTS_PATH = DATA_DIRECTORY / "contexts.json" CURATED_PAPERS_PATH = DATA_DIRECTORY / "curated_papers.txt" CURATED_PAPERS_RELEVANCY_VOCAB = [ - "new_prefix", - "new_provider", - "new_publication", - 
"not_identifiers_resource", - "no_website", - "existing", - "unclear", - "irrelevant_other" + "new_prefix", + "new_provider", + "new_publication", + "not_identifiers_resource", + "no_website", + "existing", + "unclear", + "irrelevant_other", ] BIOREGISTRY_MODULE = pystow.module("bioregistry") diff --git a/src/bioregistry/data/curated_papers.txt b/src/bioregistry/data/curated_papers.txt index fbba71d80..5455d7050 100644 --- a/src/bioregistry/data/curated_papers.txt +++ b/src/bioregistry/data/curated_papers.txt @@ -1,4 +1,4 @@ -pmid relevant relevancy_type notes orcid date_curated +pmid relevant relevancy_type notes orcid date_curated 39104285 1 new_provider "Provider for UniProt IDs, issue with curation due to multiple URI formats depending on plant species" 0009-0009-5240-7463 2024-09-24 39074139 1 new_provider Resolver for PDB IDs 0009-0009-5240-7463 2024-09-24 39014503 0 no_website 0009-0009-5240-7463 2024-09-25 diff --git a/tests/test_curated_papers.py b/tests/test_curated_papers.py index da4fd3934..434487909 100644 --- a/tests/test_curated_papers.py +++ b/tests/test_curated_papers.py @@ -3,25 +3,28 @@ """Test for checking the integrity of the curated_papers TSV file.""" import csv -from datetime import datetime import re import unittest +from datetime import datetime -from bioregistry.constants import CURATED_PAPERS_PATH, ORCID_PATTERN, CURATED_PAPERS_RELEVANCY_VOCAB +from bioregistry.constants import ( + CURATED_PAPERS_PATH, + CURATED_PAPERS_RELEVANCY_VOCAB, + ORCID_PATTERN, +) class TestTSV(unittest.TestCase): - """Tests for curated_papers tsv file""" + """Tests for curated_papers tsv file.""" def setUp(self): """Set up the test case.""" self.tsv_file_path = CURATED_PAPERS_PATH self.relevancy_vocab = CURATED_PAPERS_RELEVANCY_VOCAB self.orcid_pattern = re.compile(ORCID_PATTERN) - + def validate_row(self, row): - """Validates a single row from the TSV file""" - + """Validate a single row from the TSV file.""" # Validate required fields required_fields = 
["pmid", "relevant", "relevancy_type", "orcid", "date_curated"] for field in required_fields: @@ -32,26 +35,27 @@ def validate_row(self, row): # Validate relevant is 0 or 1 self.assertIn(row["relevant"], ["0", "1"]) - + # Validate relevancy_type is in relevancy_vocab self.assertIn(row["relevancy_type"], self.relevancy_vocab) - + # Validate orcid against oricd_pattern self.assertTrue(self.orcid_pattern.match(row["orcid"])) - + # Validate date_curated format try: datetime.strptime(row["date_curated"], "%Y-%m-%d") except ValueError: - self.fail(f"Date_curated should follow format YYYY-MM-DD") - + self.fail("Date_curated should follow format YYYY-MM-DD") + def test_tsv_file(self): - """Tests all rows in TSV file are valid""" - with open(self.tsv_file_path, mode='r') as tsv_file: - tsv_reader = csv.DictReader(tsv_file, delimiter='\t') + """Tests all rows in TSV file are valid.""" + with open(self.tsv_file_path, mode="r") as tsv_file: + tsv_reader = csv.DictReader(tsv_file, delimiter="\t") for row in tsv_reader: with self.subTest(row=row): self.validate_row(row) -if __name__ == '__main__': - unittest.main() \ No newline at end of file + +if __name__ == "__main__": + unittest.main() From c74c63afa5a363df7ade5fdc817c442de67fa118 Mon Sep 17 00:00:00 2001 From: Charles Tapley Hoyt Date: Sat, 12 Oct 2024 15:01:18 +0200 Subject: [PATCH 10/22] Refactor --- src/bioregistry/constants.py | 2 +- src/bioregistry/data/curated_papers.tsv | 21 ++++++++++ src/bioregistry/data/curated_papers.txt | 21 ---------- tests/test_curated_papers.py | 51 ++++++++++++++++--------- 4 files changed, 54 insertions(+), 41 deletions(-) create mode 100644 src/bioregistry/data/curated_papers.tsv delete mode 100644 src/bioregistry/data/curated_papers.txt diff --git a/src/bioregistry/constants.py b/src/bioregistry/constants.py index ab14b2957..22d8515e0 100644 --- a/src/bioregistry/constants.py +++ b/src/bioregistry/constants.py @@ -33,7 +33,7 @@ COLLECTIONS_PATH = DATA_DIRECTORY / "collections.json" 
MISMATCH_PATH = DATA_DIRECTORY / "mismatch.json" CONTEXTS_PATH = DATA_DIRECTORY / "contexts.json" -CURATED_PAPERS_PATH = DATA_DIRECTORY / "curated_papers.txt" +CURATED_PAPERS_PATH = DATA_DIRECTORY / "curated_papers.tsv" CURATED_PAPERS_RELEVANCY_VOCAB = [ "new_prefix", "new_provider", diff --git a/src/bioregistry/data/curated_papers.tsv b/src/bioregistry/data/curated_papers.tsv new file mode 100644 index 000000000..fae1446ac --- /dev/null +++ b/src/bioregistry/data/curated_papers.tsv @@ -0,0 +1,21 @@ +pmid relevant relevancy_type prefix notes pr_added orcid date_curated +39104285 1 new_provider Provider for UniProt IDs, issue with curation due to multiple URI formats depending on plant species 1193 0009-0009-5240-7463 2024-09-24 +39074139 1 new_provider Resolver for PDB IDs 0009-0009-5240-7463 2024-09-24 +39014503 0 no_website 0009-0009-5240-7463 2024-09-25 +39047988 0 irrelevant_other 0009-0009-5240-7463 2024-09-25 +39115390 0 irrelevant_other 0009-0009-5240-7463 2024-09-26 +39095357 0 irrelevant_other 0009-0009-5240-7463 2024-09-26 +39084442 0 not_identifiers_resource 0009-0009-5240-7463 2024-09-27 +38991851 1 unclear identifiers for sharing, retrieving, and validating sample metadata. 
Unclear if this should be curated as a prefix, provider or a separate registry 0009-0009-5240-7463 2024-09-28 +38991828 0 irrelevant_other 0009-0009-5240-7463 2024-09-28 +39049520 0 not_identifiers_resource 0009-0009-5240-7463 2024-09-30 +39104826 1 existing Already present in the bioregistry as a provider for mesh prefix 0009-0009-5240-7463 2024-10-01 +39050757 0 irrelevant_other 0009-0009-5240-7463 2024-10-01 +39064021 0 irrelevant_other 0009-0009-5240-7463 2024-10-01 +39028894 0 not_identifiers_resource 0009-0009-5240-7463 2024-10-04 +39044201 0 not_identifiers_resource Potential resource for rare diseases identifiers, but not identifier information 0009-0009-5240-7463 2024-10-04 +39088253 0 irrelevant_other 0009-0009-5240-7463 2024-10-05 +39119155 0 irrelevant_other 0009-0009-5240-7463 2024-10-05 +39005357 0 irrelevant_other 0009-0009-5240-7463 2024-10-05 +39044130 0 irrelevant_other 0009-0009-5240-7463 2024-10-05 +39010878 0 irrelevant_other 0009-0009-5240-7463 2024-10-05 diff --git a/src/bioregistry/data/curated_papers.txt b/src/bioregistry/data/curated_papers.txt deleted file mode 100644 index 5455d7050..000000000 --- a/src/bioregistry/data/curated_papers.txt +++ /dev/null @@ -1,21 +0,0 @@ -pmid relevant relevancy_type notes orcid date_curated -39104285 1 new_provider "Provider for UniProt IDs, issue with curation due to multiple URI formats depending on plant species" 0009-0009-5240-7463 2024-09-24 -39074139 1 new_provider Resolver for PDB IDs 0009-0009-5240-7463 2024-09-24 -39014503 0 no_website 0009-0009-5240-7463 2024-09-25 -39047988 0 irrelevant_other 0009-0009-5240-7463 2024-09-25 -39115390 0 irrelevant_other 0009-0009-5240-7463 2024-09-26 -39095357 0 irrelevant_other 0009-0009-5240-7463 2024-09-26 -39084442 0 not_identifiers_resource 0009-0009-5240-7463 2024-09-27 -38991851 1 unclear "identifiers for sharing, retrieving, and validating sample metadata. 
Unclear if this should be curated as a prefix, provider or a separate registry" 0009-0009-5240-7463 2024-09-28 -38991828 0 irrelevant_other 0009-0009-5240-7463 2024-09-28 -39049520 0 not_identifiers_resource 0009-0009-5240-7463 2024-09-30 -39104826 1 existing Already present in the bioregistry as a provider for mesh prefix 0009-0009-5240-7463 2024-10-01 -39050757 0 irrelevant_other 0009-0009-5240-7463 2024-10-01 -39064021 0 irrelevant_other 0009-0009-5240-7463 2024-10-01 -39028894 0 not_identifiers_resource 0009-0009-5240-7463 2024-10-04 -39044201 0 not_identifiers_resource Potential resource for rare diseases identifiers, but not identifier information 0009-0009-5240-7463 2024-10-04 -39088253 0 irrelevant_other 0009-0009-5240-7463 2024-10-05 -39119155 0 irrelevant_other 0009-0009-5240-7463 2024-10-05 -39005357 0 irrelevant_other 0009-0009-5240-7463 2024-10-05 -39044130 0 irrelevant_other 0009-0009-5240-7463 2024-10-05 -39010878 0 irrelevant_other 0009-0009-5240-7463 2024-10-05 diff --git a/tests/test_curated_papers.py b/tests/test_curated_papers.py index 434487909..11e8e0104 100644 --- a/tests/test_curated_papers.py +++ b/tests/test_curated_papers.py @@ -3,59 +3,72 @@ """Test for checking the integrity of the curated_papers TSV file.""" import csv -import re import unittest from datetime import datetime +import bioregistry from bioregistry.constants import ( CURATED_PAPERS_PATH, CURATED_PAPERS_RELEVANCY_VOCAB, ORCID_PATTERN, ) +required_fields = [ + "pmid", + "relevant", + "relevancy_type", + "orcid", + "date_curated", + "notes", + "pr_added", # links back to the PR where curations were done +] + class TestTSV(unittest.TestCase): """Tests for curated_papers tsv file.""" - def setUp(self): - """Set up the test case.""" - self.tsv_file_path = CURATED_PAPERS_PATH - self.relevancy_vocab = CURATED_PAPERS_RELEVANCY_VOCAB - self.orcid_pattern = re.compile(ORCID_PATTERN) - def validate_row(self, row): """Validate a single row from the TSV file.""" # Validate required 
fields - required_fields = ["pmid", "relevant", "relevancy_type", "orcid", "date_curated"] + for field in required_fields: self.assertIn(field, row) - # Validate pmid is an integer - self.assertTrue(row["pmid"].isdigit()) + self.assertTrue(row["pmid"].isdigit(), msg="PubMed identifier should be an integer") + self.assertTrue(row["pr_added"].isdigit(), msg="Pull Request should be an integer") # Validate relevant is 0 or 1 self.assertIn(row["relevant"], ["0", "1"]) + if row["relevant"] == "1": + prefix = row["prefix"] + self.assertIsNotNone(prefix, msg="prefix should be set for all relevant entries") + self.assertNotEqual("", prefix, msg="prefix should not be empty for relevant entries") + self.assertEqual( + bioregistry.normalize_prefix(prefix), + prefix, + msg="prefix should be standardized for relevant entries", + ) + # Validate relevancy_type is in relevancy_vocab - self.assertIn(row["relevancy_type"], self.relevancy_vocab) + self.assertIn(row["relevancy_type"], CURATED_PAPERS_RELEVANCY_VOCAB) - # Validate orcid against oricd_pattern - self.assertTrue(self.orcid_pattern.match(row["orcid"])) + self.assertRegex(row["orcid"], ORCID_PATTERN) + + self.assertFalse(row["notes"].startswith('"')) + self.assertFalse(row["notes"].endswith('"')) # Validate date_curated format try: datetime.strptime(row["date_curated"], "%Y-%m-%d") except ValueError: - self.fail("Date_curated should follow format YYYY-MM-DD") + self.fail("date_curated should follow format YYYY-MM-DD") def test_tsv_file(self): """Tests all rows in TSV file are valid.""" - with open(self.tsv_file_path, mode="r") as tsv_file: + with CURATED_PAPERS_PATH.open() as tsv_file: tsv_reader = csv.DictReader(tsv_file, delimiter="\t") for row in tsv_reader: + print(row) with self.subTest(row=row): self.validate_row(row) - - -if __name__ == "__main__": - unittest.main() From 0d7a8bf20add4bc6539ec24634b0459ad20e74a7 Mon Sep 17 00:00:00 2001 From: Charles Tapley Hoyt Date: Sat, 12 Oct 2024 15:17:05 +0200 Subject: [PATCH 
11/22] Add example full rows --- src/bioregistry/data/curated_papers.tsv | 6 +++--- tests/test_curated_papers.py | 9 ++++----- 2 files changed, 7 insertions(+), 8 deletions(-) diff --git a/src/bioregistry/data/curated_papers.tsv b/src/bioregistry/data/curated_papers.tsv index fae1446ac..5c5c4875f 100644 --- a/src/bioregistry/data/curated_papers.tsv +++ b/src/bioregistry/data/curated_papers.tsv @@ -1,7 +1,7 @@ pmid relevant relevancy_type prefix notes pr_added orcid date_curated -39104285 1 new_provider Provider for UniProt IDs, issue with curation due to multiple URI formats depending on plant species 1193 0009-0009-5240-7463 2024-09-24 -39074139 1 new_provider Resolver for PDB IDs 0009-0009-5240-7463 2024-09-24 -39014503 0 no_website 0009-0009-5240-7463 2024-09-25 +39104285 1 new_provider uniprot Provider for UniProt IDs, issue with curation due to multiple URI formats depending on plant species 1193 0009-0009-5240-7463 2024-09-24 +39074139 1 new_provider pdb Resolver for PDB IDs 1193 0009-0009-5240-7463 2024-09-24 +39014503 0 no_website 1193 0009-0009-5240-7463 2024-09-25 39047988 0 irrelevant_other 0009-0009-5240-7463 2024-09-25 39115390 0 irrelevant_other 0009-0009-5240-7463 2024-09-26 39095357 0 irrelevant_other 0009-0009-5240-7463 2024-09-26 diff --git a/tests/test_curated_papers.py b/tests/test_curated_papers.py index 11e8e0104..436203da3 100644 --- a/tests/test_curated_papers.py +++ b/tests/test_curated_papers.py @@ -67,8 +67,7 @@ def validate_row(self, row): def test_tsv_file(self): """Tests all rows in TSV file are valid.""" with CURATED_PAPERS_PATH.open() as tsv_file: - tsv_reader = csv.DictReader(tsv_file, delimiter="\t") - for row in tsv_reader: - print(row) - with self.subTest(row=row): - self.validate_row(row) + reader = csv.DictReader(tsv_file, delimiter="\t") + for row, data in enumerate(reader, start=1): + with self.subTest(row=row, data=data): + self.validate_row(data) From 3587530ed715f207dd9eea909758246cd5a0abbb Mon Sep 17 00:00:00 2001 From: 
Charles Tapley Hoyt Date: Sat, 12 Oct 2024 15:26:48 +0200 Subject: [PATCH 12/22] Centralize code into reusable module --- src/bioregistry/constants.py | 10 --------- src/bioregistry/curation/literature.py | 31 ++++++++++++++++++++++++++ tests/test_curated_papers.py | 27 +++++++--------------- 3 files changed, 39 insertions(+), 29 deletions(-) create mode 100644 src/bioregistry/curation/literature.py diff --git a/src/bioregistry/constants.py b/src/bioregistry/constants.py index 22d8515e0..6041ab787 100644 --- a/src/bioregistry/constants.py +++ b/src/bioregistry/constants.py @@ -34,16 +34,6 @@ MISMATCH_PATH = DATA_DIRECTORY / "mismatch.json" CONTEXTS_PATH = DATA_DIRECTORY / "contexts.json" CURATED_PAPERS_PATH = DATA_DIRECTORY / "curated_papers.tsv" -CURATED_PAPERS_RELEVANCY_VOCAB = [ - "new_prefix", - "new_provider", - "new_publication", - "not_identifiers_resource", - "no_website", - "existing", - "unclear", - "irrelevant_other", -] BIOREGISTRY_MODULE = pystow.module("bioregistry") diff --git a/src/bioregistry/curation/literature.py b/src/bioregistry/curation/literature.py new file mode 100644 index 000000000..858b68e40 --- /dev/null +++ b/src/bioregistry/curation/literature.py @@ -0,0 +1,31 @@ +"""Utilities for working with the data produced by the semi-automated curation workflow.""" + +import enum + +__all__ = [ + "CurationRelevance", + "COLUMNS", +] + +COLUMNS = [ + "pmid", + "relevant", + "relevancy_type", + "orcid", + "date_curated", + "notes", + "pr_added", # links back to the PR where curations were done +] + + +class CurationRelevance(str, enum.Enum): + """An enumeration for curation relevance.""" + + new_prefix = enum.auto() + new_provider = enum.auto() + new_publication = enum.auto() + not_identifiers_resource = enum.auto() + no_website = enum.auto() + existing = enum.auto() + unclear = enum.auto() + irrelevant_other = enum.auto() diff --git a/tests/test_curated_papers.py b/tests/test_curated_papers.py index 436203da3..72dec545f 100644 --- 
a/tests/test_curated_papers.py +++ b/tests/test_curated_papers.py @@ -7,31 +7,20 @@ from datetime import datetime import bioregistry -from bioregistry.constants import ( - CURATED_PAPERS_PATH, - CURATED_PAPERS_RELEVANCY_VOCAB, - ORCID_PATTERN, -) - -required_fields = [ - "pmid", - "relevant", - "relevancy_type", - "orcid", - "date_curated", - "notes", - "pr_added", # links back to the PR where curations were done -] +from bioregistry.constants import CURATED_PAPERS_PATH, ORCID_PATTERN +from bioregistry.curation.literature import CurationRelevance, COLUMNS class TestTSV(unittest.TestCase): """Tests for curated_papers tsv file.""" + def setUp(self): + """Set up the test case.""" + self.relevancy_types = {r.name for r in CurationRelevance} + def validate_row(self, row): """Validate a single row from the TSV file.""" - # Validate required fields - - for field in required_fields: + for field in COLUMNS: self.assertIn(field, row) self.assertTrue(row["pmid"].isdigit(), msg="PubMed identifier should be an integer") @@ -51,7 +40,7 @@ def validate_row(self, row): ) # Validate relevancy_type is in relevancy_vocab - self.assertIn(row["relevancy_type"], CURATED_PAPERS_RELEVANCY_VOCAB) + self.assertIn(row["relevancy_type"], self.relevancy_types) self.assertRegex(row["orcid"], ORCID_PATTERN) From b3e6c0ae7a2c2c102ee24040d53f497b4d09604c Mon Sep 17 00:00:00 2001 From: Charles Tapley Hoyt Date: Sat, 12 Oct 2024 15:31:24 +0200 Subject: [PATCH 13/22] Add TODO --- src/bioregistry/curation/literature.py | 12 ++++++++++++ tests/test_curated_papers.py | 2 +- 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/src/bioregistry/curation/literature.py b/src/bioregistry/curation/literature.py index 858b68e40..6ecf6ebaf 100644 --- a/src/bioregistry/curation/literature.py +++ b/src/bioregistry/curation/literature.py @@ -2,6 +2,8 @@ import enum +import click + __all__ = [ "CurationRelevance", "COLUMNS", @@ -29,3 +31,13 @@ class CurationRelevance(str, enum.Enum): existing = enum.auto() 
unclear = enum.auto() irrelevant_other = enum.auto() + + +@click.command() +def main(): + """Import data from the literature curation into the Bioregistry.""" + raise NotImplementedError + + +if __name__ == "__main__": + main() diff --git a/tests/test_curated_papers.py b/tests/test_curated_papers.py index 72dec545f..55a55707d 100644 --- a/tests/test_curated_papers.py +++ b/tests/test_curated_papers.py @@ -8,7 +8,7 @@ import bioregistry from bioregistry.constants import CURATED_PAPERS_PATH, ORCID_PATTERN -from bioregistry.curation.literature import CurationRelevance, COLUMNS +from bioregistry.curation.literature import COLUMNS, CurationRelevance class TestTSV(unittest.TestCase): From 7bcf7e93afdf49f2ef1c0c1136f1d92535ef96a3 Mon Sep 17 00:00:00 2001 From: Charles Tapley Hoyt Date: Sat, 12 Oct 2024 15:35:38 +0200 Subject: [PATCH 14/22] Add documentation --- docs/source/curation.rst | 3 +++ docs/source/index.rst | 1 + src/bioregistry/curation/literature.py | 1 + 3 files changed, 5 insertions(+) create mode 100644 docs/source/curation.rst diff --git a/docs/source/curation.rst b/docs/source/curation.rst new file mode 100644 index 000000000..0006caff3 --- /dev/null +++ b/docs/source/curation.rst @@ -0,0 +1,3 @@ +Curation +======== +.. 
automodapi:: bioregistry.curation.literature diff --git a/docs/source/index.rst b/docs/source/index.rst index 79b226c5a..f780a1d93 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -57,6 +57,7 @@ To install in development mode, use the following: cli pandas deployment + curation Indices and Tables ------------------ diff --git a/src/bioregistry/curation/literature.py b/src/bioregistry/curation/literature.py index 6ecf6ebaf..10fb0e227 100644 --- a/src/bioregistry/curation/literature.py +++ b/src/bioregistry/curation/literature.py @@ -23,6 +23,7 @@ class CurationRelevance(str, enum.Enum): """An enumeration for curation relevance.""" + #: A resource for new primary identifiers new_prefix = enum.auto() new_provider = enum.auto() new_publication = enum.auto() From 7e4cc4d23446af260b7c85196f60e5286be37e40 Mon Sep 17 00:00:00 2001 From: Charles Tapley Hoyt Date: Sat, 12 Oct 2024 15:39:23 +0200 Subject: [PATCH 15/22] Update curation.rst --- docs/source/curation.rst | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/docs/source/curation.rst b/docs/source/curation.rst index 0006caff3..0cb3940a1 100644 --- a/docs/source/curation.rst +++ b/docs/source/curation.rst @@ -1,3 +1,11 @@ Curation ======== +There are several curation workflows implemented in :mod:`bioregistry.curation`. + +Bulk Import +----------- +.. automodapi:: bioregistry.curation.bulk_import + +Semi-automated Literature Curation +---------------------------------- .. 
automodapi:: bioregistry.curation.literature From 50ec62b47c3ccb2f44ffbedea3d636ddbf6d2a39 Mon Sep 17 00:00:00 2001 From: Mufaddal Naguthanawala Date: Tue, 15 Oct 2024 15:04:40 -0400 Subject: [PATCH 16/22] remove prefix column from TSV file and simplify relevancy check --- src/bioregistry/data/curated_papers.tsv | 42 ++++++++++++------------- tests/test_curated_papers.py | 7 ++++- 2 files changed, 27 insertions(+), 22 deletions(-) diff --git a/src/bioregistry/data/curated_papers.tsv b/src/bioregistry/data/curated_papers.tsv index 5c5c4875f..6388d4ecd 100644 --- a/src/bioregistry/data/curated_papers.tsv +++ b/src/bioregistry/data/curated_papers.tsv @@ -1,21 +1,21 @@ -pmid relevant relevancy_type prefix notes pr_added orcid date_curated -39104285 1 new_provider uniprot Provider for UniProt IDs, issue with curation due to multiple URI formats depending on plant species 1193 0009-0009-5240-7463 2024-09-24 -39074139 1 new_provider pdb Resolver for PDB IDs 1193 0009-0009-5240-7463 2024-09-24 -39014503 0 no_website 1193 0009-0009-5240-7463 2024-09-25 -39047988 0 irrelevant_other 0009-0009-5240-7463 2024-09-25 -39115390 0 irrelevant_other 0009-0009-5240-7463 2024-09-26 -39095357 0 irrelevant_other 0009-0009-5240-7463 2024-09-26 -39084442 0 not_identifiers_resource 0009-0009-5240-7463 2024-09-27 -38991851 1 unclear identifiers for sharing, retrieving, and validating sample metadata. 
Unclear if this should be curated as a prefix, provider or a separate registry 0009-0009-5240-7463 2024-09-28 -38991828 0 irrelevant_other 0009-0009-5240-7463 2024-09-28 -39049520 0 not_identifiers_resource 0009-0009-5240-7463 2024-09-30 -39104826 1 existing Already present in the bioregistry as a provider for mesh prefix 0009-0009-5240-7463 2024-10-01 -39050757 0 irrelevant_other 0009-0009-5240-7463 2024-10-01 -39064021 0 irrelevant_other 0009-0009-5240-7463 2024-10-01 -39028894 0 not_identifiers_resource 0009-0009-5240-7463 2024-10-04 -39044201 0 not_identifiers_resource Potential resource for rare diseases identifiers, but not identifier information 0009-0009-5240-7463 2024-10-04 -39088253 0 irrelevant_other 0009-0009-5240-7463 2024-10-05 -39119155 0 irrelevant_other 0009-0009-5240-7463 2024-10-05 -39005357 0 irrelevant_other 0009-0009-5240-7463 2024-10-05 -39044130 0 irrelevant_other 0009-0009-5240-7463 2024-10-05 -39010878 0 irrelevant_other 0009-0009-5240-7463 2024-10-05 +pmid relevant relevancy_type notes pr_added orcid date_curated +39104285 1 new_provider Provider for UniProt IDs, issue with curation due to multiple URI formats depending on plant species 1193 0009-0009-5240-7463 2024-09-24 +39074139 1 new_provider Resolver for PDB IDs 1193 0009-0009-5240-7463 2024-09-24 +39014503 0 no_website 1193 0009-0009-5240-7463 2024-09-25 +39047988 0 irrelevant_other 0009-0009-5240-7463 2024-09-25 +39115390 0 irrelevant_other 0009-0009-5240-7463 2024-09-26 +39095357 0 irrelevant_other 0009-0009-5240-7463 2024-09-26 +39084442 0 not_identifiers_resource 0009-0009-5240-7463 2024-09-27 +38991851 1 unclear identifiers for sharing, retrieving, and validating sample metadata. 
Unclear if this should be curated as a prefix, provider or a separate registry 0009-0009-5240-7463 2024-09-28 +38991828 0 irrelevant_other 0009-0009-5240-7463 2024-09-28 +39049520 0 not_identifiers_resource 0009-0009-5240-7463 2024-09-30 +39104826 1 existing Already present in the bioregistry as a provider for mesh prefix 0009-0009-5240-7463 2024-10-01 +39050757 0 irrelevant_other 0009-0009-5240-7463 2024-10-01 +39064021 0 irrelevant_other 0009-0009-5240-7463 2024-10-01 +39028894 0 not_identifiers_resource 0009-0009-5240-7463 2024-10-04 +39044201 0 not_identifiers_resource Potential resource for rare diseases identifiers, but not identifier information 0009-0009-5240-7463 2024-10-04 +39088253 0 irrelevant_other 0009-0009-5240-7463 2024-10-05 +39119155 0 irrelevant_other 0009-0009-5240-7463 2024-10-05 +39005357 0 irrelevant_other 0009-0009-5240-7463 2024-10-05 +39044130 0 irrelevant_other 0009-0009-5240-7463 2024-10-05 +39010878 0 irrelevant_other 0009-0009-5240-7463 2024-10-05 diff --git a/tests/test_curated_papers.py b/tests/test_curated_papers.py index 55a55707d..7d095e22c 100644 --- a/tests/test_curated_papers.py +++ b/tests/test_curated_papers.py @@ -6,7 +6,7 @@ import unittest from datetime import datetime -import bioregistry +# import bioregistry from bioregistry.constants import CURATED_PAPERS_PATH, ORCID_PATTERN from bioregistry.curation.literature import COLUMNS, CurationRelevance @@ -29,6 +29,10 @@ def validate_row(self, row): # Validate relevant is 0 or 1 self.assertIn(row["relevant"], ["0", "1"]) + """ + Commenting out this check for now. 
This can be re-implemented if a need + for it arises in the future + if row["relevant"] == "1": prefix = row["prefix"] self.assertIsNotNone(prefix, msg="prefix should be set for all relevant entries") @@ -38,6 +42,7 @@ def validate_row(self, row): prefix, msg="prefix should be standardized for relevant entries", ) + """ # Validate relevancy_type is in relevancy_vocab self.assertIn(row["relevancy_type"], self.relevancy_types) From 7816e3b939759b567ec9929d50ebb0a85e6b8c59 Mon Sep 17 00:00:00 2001 From: Mufaddal Naguthanawala Date: Thu, 17 Oct 2024 00:00:01 -0400 Subject: [PATCH 17/22] Add docstrings for CurationRelevance and re-order TSV file --- src/bioregistry/curation/literature.py | 9 +++++- src/bioregistry/data/curated_papers.tsv | 42 ++++++++++++------------- 2 files changed, 29 insertions(+), 22 deletions(-) diff --git a/src/bioregistry/curation/literature.py b/src/bioregistry/curation/literature.py index 10fb0e227..17478b8b7 100644 --- a/src/bioregistry/curation/literature.py +++ b/src/bioregistry/curation/literature.py @@ -13,10 +13,10 @@ "pmid", "relevant", "relevancy_type", + "pr_added", # links back to the PR where curations were done "orcid", "date_curated", "notes", - "pr_added", # links back to the PR where curations were done ] @@ -25,12 +25,19 @@ class CurationRelevance(str, enum.Enum): #: A resource for new primary identifiers new_prefix = enum.auto() + #: A resolver for existing identifiers new_provider = enum.auto() + #: A new publication for an existing prefix new_publication = enum.auto() + #: A database, but not for identifier information not_identifiers_resource = enum.auto() + #: Paper suggestive of a new database, but no link to website provided no_website = enum.auto() + #: An existing entry in the bioregistry existing = enum.auto() + #: Not clear how to curate in the bioregistry, follow up discussion required unclear = enum.auto() + #: Completely unrelated information irrelevant_other = enum.auto() diff --git 
a/src/bioregistry/data/curated_papers.tsv b/src/bioregistry/data/curated_papers.tsv index 6388d4ecd..f956d1d87 100644 --- a/src/bioregistry/data/curated_papers.tsv +++ b/src/bioregistry/data/curated_papers.tsv @@ -1,21 +1,21 @@ -pmid relevant relevancy_type notes pr_added orcid date_curated -39104285 1 new_provider Provider for UniProt IDs, issue with curation due to multiple URI formats depending on plant species 1193 0009-0009-5240-7463 2024-09-24 -39074139 1 new_provider Resolver for PDB IDs 1193 0009-0009-5240-7463 2024-09-24 -39014503 0 no_website 1193 0009-0009-5240-7463 2024-09-25 -39047988 0 irrelevant_other 0009-0009-5240-7463 2024-09-25 -39115390 0 irrelevant_other 0009-0009-5240-7463 2024-09-26 -39095357 0 irrelevant_other 0009-0009-5240-7463 2024-09-26 -39084442 0 not_identifiers_resource 0009-0009-5240-7463 2024-09-27 -38991851 1 unclear identifiers for sharing, retrieving, and validating sample metadata. Unclear if this should be curated as a prefix, provider or a separate registry 0009-0009-5240-7463 2024-09-28 -38991828 0 irrelevant_other 0009-0009-5240-7463 2024-09-28 -39049520 0 not_identifiers_resource 0009-0009-5240-7463 2024-09-30 -39104826 1 existing Already present in the bioregistry as a provider for mesh prefix 0009-0009-5240-7463 2024-10-01 -39050757 0 irrelevant_other 0009-0009-5240-7463 2024-10-01 -39064021 0 irrelevant_other 0009-0009-5240-7463 2024-10-01 -39028894 0 not_identifiers_resource 0009-0009-5240-7463 2024-10-04 -39044201 0 not_identifiers_resource Potential resource for rare diseases identifiers, but not identifier information 0009-0009-5240-7463 2024-10-04 -39088253 0 irrelevant_other 0009-0009-5240-7463 2024-10-05 -39119155 0 irrelevant_other 0009-0009-5240-7463 2024-10-05 -39005357 0 irrelevant_other 0009-0009-5240-7463 2024-10-05 -39044130 0 irrelevant_other 0009-0009-5240-7463 2024-10-05 -39010878 0 irrelevant_other 0009-0009-5240-7463 2024-10-05 +pmid relevant relevancy_type pr_added orcid date_curated notes +39104285 1 
new_provider 1193 0009-0009-5240-7463 2024-09-24 Provider for UniProt IDs, issue with curation due to multiple URI formats depending on plant species +39074139 1 new_provider 1193 0009-0009-5240-7463 2024-09-24 Resolver for PDB IDs +39014503 0 no_website 0009-0009-5240-7463 2024-09-25 +39047988 0 irrelevant_other 0009-0009-5240-7463 2024-09-25 +39115390 0 irrelevant_other 0009-0009-5240-7463 2024-09-26 +39095357 0 irrelevant_other 0009-0009-5240-7463 2024-09-26 +39084442 0 not_identifiers_resource 0009-0009-5240-7463 2024-09-27 +38991851 1 unclear 1194 0009-0009-5240-7463 2024-09-28 identifiers for sharing, retrieving, and validating sample metadata. Unclear if this should be curated as a prefix, provider or a separate registry +38991828 0 irrelevant_other 0009-0009-5240-7463 2024-09-28 +39049520 0 not_identifiers_resource 0009-0009-5240-7463 2024-09-30 +39104826 1 existing 0009-0009-5240-7463 2024-10-01 Already present in the bioregistry as a provider for mesh prefix +39050757 0 irrelevant_other 0009-0009-5240-7463 2024-10-01 +39064021 0 irrelevant_other 0009-0009-5240-7463 2024-10-01 +39028894 0 not_identifiers_resource 0009-0009-5240-7463 2024-10-04 +39044201 0 not_identifiers_resource 0009-0009-5240-7463 2024-10-04 Potential resource for rare diseases identifiers, but not identifier information +39088253 0 irrelevant_other 0009-0009-5240-7463 2024-10-05 +39119155 0 irrelevant_other 0009-0009-5240-7463 2024-10-05 +39005357 0 irrelevant_other 0009-0009-5240-7463 2024-10-05 +39044130 0 irrelevant_other 0009-0009-5240-7463 2024-10-05 +39010878 0 irrelevant_other 0009-0009-5240-7463 2024-10-05 From 462f7f86feb8b7559dd16f933dffffee33d71564 Mon Sep 17 00:00:00 2001 From: Mufaddal Naguthanawala Date: Thu, 17 Oct 2024 13:03:54 -0400 Subject: [PATCH 18/22] Handle empty inputs for notes and pr_added fields --- src/bioregistry/curation/literature.py | 4 +-- src/bioregistry/data/curated_papers.tsv | 42 ++++++++++++------------- tests/test_curated_papers.py | 11 +++++-- 3 
files changed, 31 insertions(+), 26 deletions(-) diff --git a/src/bioregistry/curation/literature.py b/src/bioregistry/curation/literature.py index 17478b8b7..19a99a9b9 100644 --- a/src/bioregistry/curation/literature.py +++ b/src/bioregistry/curation/literature.py @@ -12,10 +12,10 @@ COLUMNS = [ "pmid", "relevant", - "relevancy_type", - "pr_added", # links back to the PR where curations were done "orcid", "date_curated", + "relevancy_type", + "pr_added", # links back to the PR where curations were done "notes", ] diff --git a/src/bioregistry/data/curated_papers.tsv b/src/bioregistry/data/curated_papers.tsv index f956d1d87..68b964311 100644 --- a/src/bioregistry/data/curated_papers.tsv +++ b/src/bioregistry/data/curated_papers.tsv @@ -1,21 +1,21 @@ -pmid relevant relevancy_type pr_added orcid date_curated notes -39104285 1 new_provider 1193 0009-0009-5240-7463 2024-09-24 Provider for UniProt IDs, issue with curation due to multiple URI formats depending on plant species -39074139 1 new_provider 1193 0009-0009-5240-7463 2024-09-24 Resolver for PDB IDs -39014503 0 no_website 0009-0009-5240-7463 2024-09-25 -39047988 0 irrelevant_other 0009-0009-5240-7463 2024-09-25 -39115390 0 irrelevant_other 0009-0009-5240-7463 2024-09-26 -39095357 0 irrelevant_other 0009-0009-5240-7463 2024-09-26 -39084442 0 not_identifiers_resource 0009-0009-5240-7463 2024-09-27 -38991851 1 unclear 1194 0009-0009-5240-7463 2024-09-28 identifiers for sharing, retrieving, and validating sample metadata. 
Unclear if this should be curated as a prefix, provider or a separate registry -38991828 0 irrelevant_other 0009-0009-5240-7463 2024-09-28 -39049520 0 not_identifiers_resource 0009-0009-5240-7463 2024-09-30 -39104826 1 existing 0009-0009-5240-7463 2024-10-01 Already present in the bioregistry as a provider for mesh prefix -39050757 0 irrelevant_other 0009-0009-5240-7463 2024-10-01 -39064021 0 irrelevant_other 0009-0009-5240-7463 2024-10-01 -39028894 0 not_identifiers_resource 0009-0009-5240-7463 2024-10-04 -39044201 0 not_identifiers_resource 0009-0009-5240-7463 2024-10-04 Potential resource for rare diseases identifiers, but not identifier information -39088253 0 irrelevant_other 0009-0009-5240-7463 2024-10-05 -39119155 0 irrelevant_other 0009-0009-5240-7463 2024-10-05 -39005357 0 irrelevant_other 0009-0009-5240-7463 2024-10-05 -39044130 0 irrelevant_other 0009-0009-5240-7463 2024-10-05 -39010878 0 irrelevant_other 0009-0009-5240-7463 2024-10-05 +pmid relevant orcid date_curated relevancy_type pr_added notes +39104285 1 0009-0009-5240-7463 2024-09-24 new_provider 1193 Provider for UniProt IDs, issue with curation due to multiple URI formats depending on plant species +39074139 1 0009-0009-5240-7463 2024-09-24 new_provider 1193 Resolver for PDB IDs +39014503 0 0009-0009-5240-7463 2024-09-25 no_website +39047988 0 0009-0009-5240-7463 2024-09-25 irrelevant_other +39115390 0 0009-0009-5240-7463 2024-09-26 irrelevant_other +39095357 0 0009-0009-5240-7463 2024-09-26 irrelevant_other +39084442 0 0009-0009-5240-7463 2024-09-27 not_identifiers_resource +38991851 1 0009-0009-5240-7463 2024-09-28 unclear identifiers for sharing, retrieving, and validating sample metadata. 
Unclear if this should be curated as a prefix, provider or a separate registry +38991828 0 0009-0009-5240-7463 2024-09-28 irrelevant_other +39049520 0 0009-0009-5240-7463 2024-09-30 not_identifiers_resource +39104826 1 0009-0009-5240-7463 2024-10-01 existing Already present in the bioregistry as a provider for mesh prefix +39050757 0 0009-0009-5240-7463 2024-10-01 irrelevant_other +39064021 0 0009-0009-5240-7463 2024-10-01 irrelevant_other +39028894 0 0009-0009-5240-7463 2024-10-04 not_identifiers_resource +39044201 0 0009-0009-5240-7463 2024-10-04 not_identifiers_resource Potential resource for rare diseases identifiers, but not identifier information +39088253 0 0009-0009-5240-7463 2024-10-05 irrelevant_other +39119155 0 0009-0009-5240-7463 2024-10-05 irrelevant_other +39005357 0 0009-0009-5240-7463 2024-10-05 irrelevant_other +39044130 0 0009-0009-5240-7463 2024-10-05 irrelevant_other +39010878 0 0009-0009-5240-7463 2024-10-05 irrelevant_other diff --git a/tests/test_curated_papers.py b/tests/test_curated_papers.py index 7d095e22c..93bf6a824 100644 --- a/tests/test_curated_papers.py +++ b/tests/test_curated_papers.py @@ -24,7 +24,10 @@ def validate_row(self, row): self.assertIn(field, row) self.assertTrue(row["pmid"].isdigit(), msg="PubMed identifier should be an integer") - self.assertTrue(row["pr_added"].isdigit(), msg="Pull Request should be an integer") + + # Allow pr_added to be empty + if row["pr_added"]: + self.assertTrue(row["pr_added"].isdigit(), msg="Pull Request should be an integer") # Validate relevant is 0 or 1 self.assertIn(row["relevant"], ["0", "1"]) @@ -49,8 +52,10 @@ def validate_row(self, row): self.assertRegex(row["orcid"], ORCID_PATTERN) - self.assertFalse(row["notes"].startswith('"')) - self.assertFalse(row["notes"].endswith('"')) + # Handle None values values for notes + if row["notes"] is not None: + self.assertFalse(row["notes"].startswith('"')) + self.assertFalse(row["notes"].endswith('"')) # Validate date_curated format try: From 
4948f4623c1fc4d1d64b65fb134acede371e0fea Mon Sep 17 00:00:00 2001 From: "Benjamin M. Gyori" Date: Thu, 17 Oct 2024 14:40:23 -0400 Subject: [PATCH 19/22] Fix typo --- tests/test_curated_papers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_curated_papers.py b/tests/test_curated_papers.py index 93bf6a824..ab515d532 100644 --- a/tests/test_curated_papers.py +++ b/tests/test_curated_papers.py @@ -52,7 +52,7 @@ def validate_row(self, row): self.assertRegex(row["orcid"], ORCID_PATTERN) - # Handle None values values for notes + # Handle None values for notes if row["notes"] is not None: self.assertFalse(row["notes"].startswith('"')) self.assertFalse(row["notes"].endswith('"')) From 0169e361d4a4abe5e048d3ab38a3e805e1a836a2 Mon Sep 17 00:00:00 2001 From: Mufaddal Naguthanawala Date: Sat, 19 Oct 2024 15:14:14 -0400 Subject: [PATCH 20/22] add publication info for FURNA --- src/bioregistry/data/bioregistry.json | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/src/bioregistry/data/bioregistry.json b/src/bioregistry/data/bioregistry.json index 1fa35f2ba..b47bf18a9 100644 --- a/src/bioregistry/data/bioregistry.json +++ b/src/bioregistry/data/bioregistry.json @@ -85850,6 +85850,15 @@ "description": "FURNA (Functions of RNAs) is a database of ligand-RNA interactions and Gene Ontology annotations for RNAs in the Protein Data Bank (PDB).", "homepage": "https://seq2fun.dcmb.med.umich.edu/furna/", "name": "furna", + "publications": [ + { + "doi": "10.1371/journal.pbio.3002476", + "pmc": "PMC11309384", + "pubmed": "39074139", + "title": "FURNA: A database for functional annotations of RNA structures", + "year": 2024 + } + ], "uri_format": "https://seq2fun.dcmb.med.umich.edu/furna/pdb.cgi?pdbid=$1" } ], From 98eb707ec35c36222fbcf6fe5d7e56b8658c78c5 Mon Sep 17 00:00:00 2001 From: Charles Tapley Hoyt Date: Sat, 19 Oct 2024 22:23:23 +0200 Subject: [PATCH 21/22] Update test_curated_papers.py --- tests/test_curated_papers.py | 3 --- 1 file changed, 3 
deletions(-) diff --git a/tests/test_curated_papers.py b/tests/test_curated_papers.py index ab515d532..f646df47e 100644 --- a/tests/test_curated_papers.py +++ b/tests/test_curated_papers.py @@ -1,12 +1,9 @@ -# -*- coding: utf-8 -*- - """Test for checking the integrity of the curated_papers TSV file.""" import csv import unittest from datetime import datetime -# import bioregistry from bioregistry.constants import CURATED_PAPERS_PATH, ORCID_PATTERN from bioregistry.curation.literature import COLUMNS, CurationRelevance From 269204ba2dadde2be0bc7a2e45445717117b771f Mon Sep 17 00:00:00 2001 From: Charles Tapley Hoyt Date: Sat, 19 Oct 2024 22:23:46 +0200 Subject: [PATCH 22/22] Update literature.py --- src/bioregistry/curation/literature.py | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/src/bioregistry/curation/literature.py b/src/bioregistry/curation/literature.py index 19a99a9b9..04d6c13a2 100644 --- a/src/bioregistry/curation/literature.py +++ b/src/bioregistry/curation/literature.py @@ -2,8 +2,6 @@ import enum -import click - __all__ = [ "CurationRelevance", "COLUMNS", @@ -39,13 +37,3 @@ class CurationRelevance(str, enum.Enum): unclear = enum.auto() #: Completely unrelated information irrelevant_other = enum.auto() - - -@click.command() -def main(): - """Import data from the literature curation into the Bioregistry.""" - raise NotImplementedError - - -if __name__ == "__main__": - main()