Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update curated papers, add new PDB provider (FURNA). #1193

Merged
merged 27 commits into from
Oct 19, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
27 commits
Select commit Hold shift + click to select a range
01b9685
update curated papers list
Oct 1, 2024
1824a51
update curated papers list with papers identified on Aug 9 batch. Cur…
Oct 7, 2024
c3967ac
Add contributor information, update regex pattern and examples for pe…
Oct 7, 2024
129f370
add example_extras to pephub
Oct 7, 2024
17a3490
amend 'relevancy_type' and 'relevant' data for two entries in curate…
Oct 7, 2024
4ab43a2
remove PEPhub as a prefix
Oct 8, 2024
65f1d89
update curated_papers with orcid and date
Oct 9, 2024
541b9d0
Merge branch 'main' into update_curated_papers
cthoyt Oct 10, 2024
31a6106
add unit test for validating input in curated_papers file.
Oct 11, 2024
a2f731b
fix style issues
Oct 11, 2024
c74c63a
Refactor
cthoyt Oct 12, 2024
0d7a8bf
Add example full rows
cthoyt Oct 12, 2024
3587530
Centralize code into reusable module
cthoyt Oct 12, 2024
b3e6c0a
Add TODO
cthoyt Oct 12, 2024
7bcf7e9
Add documentation
cthoyt Oct 12, 2024
7e4cc4d
Update curation.rst
cthoyt Oct 12, 2024
a9b5cac
Merge branch 'main' into pr/1193
cthoyt Oct 12, 2024
50ec62b
remove prefix column from TSV file and simplify relevancy check
Oct 15, 2024
7816e3b
Add docstrings for CurationRelevance and re-order TSV file
Oct 17, 2024
462f7f8
Handle empty inputs for notes and pr_added fields
Oct 17, 2024
4948f46
Fix typo
bgyori Oct 17, 2024
e29deef
Merge branch 'main' into update_curated_papers
bgyori Oct 17, 2024
387ce61
Merge branch 'main' into update_curated_papers
bgyori Oct 18, 2024
0169e36
add publication info for FURNA
Oct 19, 2024
98eb707
Update test_curated_papers.py
cthoyt Oct 19, 2024
269204b
Update literature.py
cthoyt Oct 19, 2024
ad69445
Merge branch 'main' into update_curated_papers
cthoyt Oct 19, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions docs/source/curation.rst
Original file line number Diff line number Diff line change
Expand Up @@ -5,3 +5,7 @@ There are several curation workflows implemented in :mod:`bioregistry.curation`.
Bulk Import
-----------
.. automodapi:: bioregistry.curation.bulk_import

Semi-automated Literature Curation
----------------------------------
.. automodapi:: bioregistry.curation.literature
1 change: 1 addition & 0 deletions src/bioregistry/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@
COLLECTIONS_PATH = DATA_DIRECTORY / "collections.json"
MISMATCH_PATH = DATA_DIRECTORY / "mismatch.json"
CONTEXTS_PATH = DATA_DIRECTORY / "contexts.json"
CURATED_PAPERS_PATH = DATA_DIRECTORY / "curated_papers.tsv"

BIOREGISTRY_MODULE = pystow.module("bioregistry")

Expand Down
1 change: 0 additions & 1 deletion src/bioregistry/curation/curated_papers.csv

This file was deleted.

39 changes: 39 additions & 0 deletions src/bioregistry/curation/literature.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
"""Utilities for working with the data produced by the semi-automated curation workflow."""

import enum

# Public names exported by this module.
__all__ = [
    "CurationRelevance",
    "COLUMNS",
]

# Column names of the curated papers TSV file, listed in the same order as
# the file's header row.
COLUMNS = [
    "pmid",  # PubMed identifier of the curated paper (digits only)
    "relevant",  # "1" or "0"
    "orcid",  # ORCID identifier (presumably of the curator who reviewed the paper)
    "date_curated",  # date of curation, formatted YYYY-MM-DD
    "relevancy_type",  # name of a CurationRelevance member
    "pr_added",  # links back to the PR where curations were done
    "notes",  # free-text remarks; may be empty
]


class CurationRelevance(str, enum.Enum):
    """An enumeration for curation relevance.

    The member *names* (e.g., ``new_prefix``) are the controlled vocabulary
    for the ``relevancy_type`` column of the curated papers file. The member
    values are generated by :func:`enum.auto`, so look members up by name
    (``CurationRelevance["new_prefix"]``) rather than by value.

    The declaration order is significant because :func:`enum.auto` assigns
    values sequentially; append new members at the end to keep existing
    values stable.
    """

    #: A resource for new primary identifiers
    new_prefix = enum.auto()
    #: A resolver for existing identifiers
    new_provider = enum.auto()
    #: A new publication for an existing prefix
    new_publication = enum.auto()
    #: A database, but not for identifier information
    not_identifiers_resource = enum.auto()
    #: Paper suggestive of a new database, but no link to website provided
    no_website = enum.auto()
    #: An existing entry in the bioregistry
    existing = enum.auto()
    #: Not clear how to curate in the bioregistry, follow up discussion required
    unclear = enum.auto()
    #: Completely unrelated information
    irrelevant_other = enum.auto()
18 changes: 18 additions & 0 deletions src/bioregistry/data/bioregistry.json
Original file line number Diff line number Diff line change
Expand Up @@ -85844,6 +85844,24 @@
"prefix": "pdbj",
"uri_format": "http://service.pdbj.org/mine/Detail?PDBID=$1&PAGEID=Summary"
},
"providers": [
{
"code": "furna",
"description": "FURNA (Functions of RNAs) is a database of ligand-RNA interactions and Gene Ontology annotations for RNAs in the Protein Data Bank (PDB).",
"homepage": "https://seq2fun.dcmb.med.umich.edu/furna/",
"name": "furna",
"publications": [
{
"doi": "10.1371/journal.pbio.3002476",
"pmc": "PMC11309384",
"pubmed": "39074139",
"title": "FURNA: A database for functional annotations of RNA structures",
"year": 2024
}
],
"uri_format": "https://seq2fun.dcmb.med.umich.edu/furna/pdb.cgi?pdbid=$1"
}
],
"publications": [
{
"doi": "10.1002/pro.4211",
Expand Down
21 changes: 21 additions & 0 deletions src/bioregistry/data/curated_papers.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
pmid relevant orcid date_curated relevancy_type pr_added notes
39104285 1 0009-0009-5240-7463 2024-09-24 new_provider 1193 Provider for UniProt IDs, issue with curation due to multiple URI formats depending on plant species
39074139 1 0009-0009-5240-7463 2024-09-24 new_provider 1193 Resolver for PDB IDs
39014503 0 0009-0009-5240-7463 2024-09-25 no_website
39047988 0 0009-0009-5240-7463 2024-09-25 irrelevant_other
39115390 0 0009-0009-5240-7463 2024-09-26 irrelevant_other
39095357 0 0009-0009-5240-7463 2024-09-26 irrelevant_other
39084442 0 0009-0009-5240-7463 2024-09-27 not_identifiers_resource
38991851 1 0009-0009-5240-7463 2024-09-28 unclear identifiers for sharing, retrieving, and validating sample metadata. Unclear if this should be curated as a prefix, provider or a separate registry
38991828 0 0009-0009-5240-7463 2024-09-28 irrelevant_other
39049520 0 0009-0009-5240-7463 2024-09-30 not_identifiers_resource
39104826 1 0009-0009-5240-7463 2024-10-01 existing Already present in the bioregistry as a provider for mesh prefix
39050757 0 0009-0009-5240-7463 2024-10-01 irrelevant_other
39064021 0 0009-0009-5240-7463 2024-10-01 irrelevant_other
39028894 0 0009-0009-5240-7463 2024-10-04 not_identifiers_resource
39044201 0 0009-0009-5240-7463 2024-10-04 not_identifiers_resource Potential resource for rare diseases identifiers, but not identifier information
39088253 0 0009-0009-5240-7463 2024-10-05 irrelevant_other
39119155 0 0009-0009-5240-7463 2024-10-05 irrelevant_other
39005357 0 0009-0009-5240-7463 2024-10-05 irrelevant_other
39044130 0 0009-0009-5240-7463 2024-10-05 irrelevant_other
39010878 0 0009-0009-5240-7463 2024-10-05 irrelevant_other
69 changes: 69 additions & 0 deletions tests/test_curated_papers.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
"""Test for checking the integrity of the curated_papers TSV file."""

import csv
import unittest
from datetime import datetime

from bioregistry.constants import CURATED_PAPERS_PATH, ORCID_PATTERN
from bioregistry.curation.literature import COLUMNS, CurationRelevance


class TestTSV(unittest.TestCase):
    """Tests for curated_papers tsv file."""

    def setUp(self):
        """Set up the test case by collecting the allowed relevancy type names."""
        self.relevancy_types = {r.name for r in CurationRelevance}

    def validate_row(self, row):
        """Validate a single row from the TSV file.

        :param row: A mapping from column name to cell text, as yielded by
            :class:`csv.DictReader`. Cells missing from a short row are ``None``.
        """
        for field in COLUMNS:
            self.assertIn(field, row)

        self.assertTrue(row["pmid"].isdigit(), msg="PubMed identifier should be an integer")

        # Allow pr_added to be empty
        if row["pr_added"]:
            self.assertTrue(row["pr_added"].isdigit(), msg="Pull Request should be an integer")

        # Validate relevant is 0 or 1
        self.assertIn(row["relevant"], ["0", "1"])

        # NOTE: a check that relevant rows carry a standardized prefix used to
        # live here. It was disabled when the prefix column was dropped from
        # the file and can be re-implemented if the need arises.

        # Validate relevancy_type is in relevancy_vocab
        self.assertIn(row["relevancy_type"], self.relevancy_types)

        self.assertRegex(row["orcid"], ORCID_PATTERN)

        # Handle None values for notes
        if row["notes"] is not None:
            self.assertFalse(row["notes"].startswith('"'))
            self.assertFalse(row["notes"].endswith('"'))

        # Validate date_curated format. A missing cell (None) raises TypeError,
        # which is reported as a regular failure rather than an error.
        date_curated = row["date_curated"]
        try:
            parsed = datetime.strptime(date_curated, "%Y-%m-%d")
        except (TypeError, ValueError):
            self.fail("date_curated should follow format YYYY-MM-DD")
        # strptime also accepts non-zero-padded values such as 2024-9-5, so
        # compare against the canonical rendering to enforce YYYY-MM-DD.
        self.assertEqual(
            parsed.date().isoformat(),
            date_curated,
            msg="date_curated should follow format YYYY-MM-DD",
        )

    def test_tsv_file(self):
        """Tests all rows in TSV file are valid."""
        # newline="" lets the csv module handle line endings itself, and the
        # explicit encoding keeps the test independent of the platform locale.
        with CURATED_PAPERS_PATH.open(encoding="utf-8", newline="") as tsv_file:
            reader = csv.DictReader(tsv_file, delimiter="\t")
            for row, data in enumerate(reader, start=1):
                with self.subTest(row=row, data=data):
                    self.validate_row(data)