Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update curated papers, add new PDB provider (FURNA). #1193

Merged
merged 27 commits into from
Oct 19, 2024
Merged
Show file tree
Hide file tree
Changes from 10 commits
Commits
Show all changes
27 commits
Select commit Hold shift + click to select a range
01b9685
update curated papers list
Oct 1, 2024
1824a51
update curated papers list with papers identified on Aug 9 batch. Cur…
Oct 7, 2024
c3967ac
Add contributor information, update regex pattern and examples for pe…
Oct 7, 2024
129f370
add example_extras to pephub
Oct 7, 2024
17a3490
amend 'relevancy_type' and 'relevant' data for two entries in curate…
Oct 7, 2024
4ab43a2
remove PEPhub as a prefix
Oct 8, 2024
65f1d89
update curated_papers with orcid and date
Oct 9, 2024
541b9d0
Merge branch 'main' into update_curated_papers
cthoyt Oct 10, 2024
31a6106
add unit test for validating input in curated_papers file.
Oct 11, 2024
a2f731b
fix style issues
Oct 11, 2024
c74c63a
Refactor
cthoyt Oct 12, 2024
0d7a8bf
Add example full rows
cthoyt Oct 12, 2024
3587530
Centralize code into reusable module
cthoyt Oct 12, 2024
b3e6c0a
Add TODO
cthoyt Oct 12, 2024
7bcf7e9
Add documentation
cthoyt Oct 12, 2024
7e4cc4d
Update curation.rst
cthoyt Oct 12, 2024
a9b5cac
Merge branch 'main' into pr/1193
cthoyt Oct 12, 2024
50ec62b
remove prefix column from TSV file and simplify relevancy check
Oct 15, 2024
7816e3b
Add docstrings for CurationRelevance and re-order TSV file
Oct 17, 2024
462f7f8
Handle empty inputs for notes and pr_added fields
Oct 17, 2024
4948f46
Fix typo
bgyori Oct 17, 2024
e29deef
Merge branch 'main' into update_curated_papers
bgyori Oct 17, 2024
387ce61
Merge branch 'main' into update_curated_papers
bgyori Oct 18, 2024
0169e36
add publication info for FURNA
Oct 19, 2024
98eb707
Update test_curated_papers.py
cthoyt Oct 19, 2024
269204b
Update literature.py
cthoyt Oct 19, 2024
ad69445
Merge branch 'main' into update_curated_papers
cthoyt Oct 19, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 11 additions & 0 deletions src/bioregistry/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,17 @@
COLLECTIONS_PATH = DATA_DIRECTORY / "collections.json"
MISMATCH_PATH = DATA_DIRECTORY / "mismatch.json"
CONTEXTS_PATH = DATA_DIRECTORY / "contexts.json"
# Tab-separated record of the papers that have been reviewed during curation.
# Its header row is: pmid, relevant, relevancy_type, notes, orcid, date_curated.
CURATED_PAPERS_PATH = DATA_DIRECTORY / "curated_papers.txt"
# Controlled vocabulary for the ``relevancy_type`` column of the curated papers
# file; tests/test_curated_papers.py asserts that every row uses one of these
# values.
# NOTE(review): the meaning of each term is not spelled out here - presumably it
# is described in the curation documentation; confirm and link it.
CURATED_PAPERS_RELEVANCY_VOCAB = [
"new_prefix",
"new_provider",
"new_publication",
"not_identifiers_resource",
"no_website",
"existing",
"unclear",
"irrelevant_other",
]

BIOREGISTRY_MODULE = pystow.module("bioregistry")

Expand Down
1 change: 0 additions & 1 deletion src/bioregistry/curation/curated_papers.csv

This file was deleted.

9 changes: 9 additions & 0 deletions src/bioregistry/data/bioregistry.json
Original file line number Diff line number Diff line change
Expand Up @@ -85749,6 +85749,15 @@
"prefix": "pdbj",
"uri_format": "http://service.pdbj.org/mine/Detail?PDBID=$1&PAGEID=Summary"
},
"providers": [
{
"code": "furna",
"description": "FURNA (Functions of RNAs) is a database of ligand-RNA interactions and Gene Ontology annotations for RNAs in the Protein Data Bank (PDB).",
"homepage": "https://seq2fun.dcmb.med.umich.edu/furna/",
"name": "furna",
"uri_format": "https://seq2fun.dcmb.med.umich.edu/furna/pdb.cgi?pdbid=$1"
}
],
"publications": [
{
"doi": "10.1002/pro.4211",
Expand Down
21 changes: 21 additions & 0 deletions src/bioregistry/data/curated_papers.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
pmid relevant relevancy_type notes orcid date_curated
39104285 1 new_provider "Provider for UniProt IDs, issue with curation due to multiple URI formats depending on plant species" 0009-0009-5240-7463 2024-09-24
39074139 1 new_provider Resolver for PDB IDs 0009-0009-5240-7463 2024-09-24
39014503 0 no_website 0009-0009-5240-7463 2024-09-25
39047988 0 irrelevant_other 0009-0009-5240-7463 2024-09-25
39115390 0 irrelevant_other 0009-0009-5240-7463 2024-09-26
39095357 0 irrelevant_other 0009-0009-5240-7463 2024-09-26
39084442 0 not_identifiers_resource 0009-0009-5240-7463 2024-09-27
bgyori marked this conversation as resolved.
Show resolved Hide resolved
38991851 1 unclear "identifiers for sharing, retrieving, and validating sample metadata. Unclear if this should be curated as a prefix, provider or a separate registry" 0009-0009-5240-7463 2024-09-28
38991828 0 irrelevant_other 0009-0009-5240-7463 2024-09-28
39049520 0 not_identifiers_resource 0009-0009-5240-7463 2024-09-30
39104826 1 existing Already present in the bioregistry as a provider for mesh prefix 0009-0009-5240-7463 2024-10-01
39050757 0 irrelevant_other 0009-0009-5240-7463 2024-10-01
39064021 0 irrelevant_other 0009-0009-5240-7463 2024-10-01
39028894 0 not_identifiers_resource 0009-0009-5240-7463 2024-10-04
39044201 0 not_identifiers_resource Potential resource for rare diseases identifiers, but not identifier information 0009-0009-5240-7463 2024-10-04
39088253 0 irrelevant_other 0009-0009-5240-7463 2024-10-05
39119155 0 irrelevant_other 0009-0009-5240-7463 2024-10-05
39005357 0 irrelevant_other 0009-0009-5240-7463 2024-10-05
39044130 0 irrelevant_other 0009-0009-5240-7463 2024-10-05
39010878 0 irrelevant_other 0009-0009-5240-7463 2024-10-05
61 changes: 61 additions & 0 deletions tests/test_curated_papers.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
# -*- coding: utf-8 -*-

"""Test for checking the integrity of the curated_papers TSV file."""

import csv
import re
import unittest
from datetime import datetime

from bioregistry.constants import (
CURATED_PAPERS_PATH,
CURATED_PAPERS_RELEVANCY_VOCAB,
ORCID_PATTERN,
)


class TestTSV(unittest.TestCase):
    """Tests for curated_papers tsv file.

    The file is read with :class:`csv.DictReader` using a tab delimiter and
    every data row is checked independently, so that one malformed row does
    not hide problems in the rows that follow it.
    """

    # Columns that must exist in the header and carry a value in every row.
    # ``notes`` is deliberately absent: it is allowed to be empty.
    REQUIRED_FIELDS = ("pmid", "relevant", "relevancy_type", "orcid", "date_curated")

    def setUp(self):
        """Set up the test case."""
        self.tsv_file_path = CURATED_PAPERS_PATH
        self.relevancy_vocab = CURATED_PAPERS_RELEVANCY_VOCAB
        self.orcid_pattern = re.compile(ORCID_PATTERN)

    def validate_row(self, row):
        """Validate a single row from the TSV file.

        :param row: A mapping from column name to cell value for one line of
            the file, as produced by :class:`csv.DictReader`.
        :raises AssertionError: If a required value is missing or any value
            does not have the expected form.
        """
        # Validate required fields. csv.DictReader pads rows that are shorter
        # than the header with ``None``, so the key being present is not
        # enough: check the value as well, otherwise the string operations
        # below would raise AttributeError/TypeError instead of failing cleanly.
        for field in self.REQUIRED_FIELDS:
            self.assertIn(field, row)
            self.assertIsNotNone(row[field], msg=f"Missing value for required field: {field}")

        # Validate pmid is an integer. ``str.isdigit`` alone also accepts
        # non-ASCII digits (e.g. superscripts), hence the ``isascii`` guard.
        pmid = row["pmid"]
        self.assertTrue(
            pmid.isascii() and pmid.isdigit(),
            msg=f"pmid should be a sequence of ASCII digits: {pmid!r}",
        )

        # Validate relevant is 0 or 1
        self.assertIn(row["relevant"], ["0", "1"])

        # Validate relevancy_type is in relevancy_vocab
        self.assertIn(row["relevancy_type"], self.relevancy_vocab)

        # Validate orcid against orcid_pattern. ``fullmatch`` is used so the
        # whole cell has to be an ORCID, not merely start with one.
        self.assertIsNotNone(
            self.orcid_pattern.fullmatch(row["orcid"]),
            msg=f"Invalid ORCID: {row['orcid']!r}",
        )

        # Validate date_curated format
        try:
            datetime.strptime(row["date_curated"], "%Y-%m-%d")
        except ValueError:
            self.fail("Date_curated should follow format YYYY-MM-DD")

    def test_tsv_file(self):
        """Tests all rows in TSV file are valid."""
        # An explicit encoding keeps the test independent of the platform's
        # locale, and ``newline=""`` is what the csv module asks for so that
        # it can handle line endings inside quoted cells itself.
        with open(self.tsv_file_path, mode="r", encoding="utf-8", newline="") as tsv_file:
            tsv_reader = csv.DictReader(tsv_file, delimiter="\t")

            # Check the header once up front; without this an empty file or a
            # mistyped column name would go unnoticed or fail once per row.
            missing = set(self.REQUIRED_FIELDS).difference(tsv_reader.fieldnames or ())
            self.assertFalse(missing, msg=f"Header is missing columns: {sorted(missing)}")

            for row in tsv_reader:
                # ``line_num`` is the physical line just consumed, which makes
                # a failing row easy to locate in the file.
                with self.subTest(line=tsv_reader.line_num, row=row):
                    self.validate_row(row)


# Allow running this test module directly as a script, in addition to
# discovery by a test runner.
if __name__ == "__main__":
    unittest.main()
Loading