feat: Endpoint for retrieving all transcripts for a given HGNC ID (#11)…

… (#12) Co-authored-by: Manuel Holtgrewe <[email protected]>
bihealth · Oct 15, 2023 · 2b425bf · 2b425bf
1 parent 5f82241
commit 2b425bf
Show file tree

Hide file tree

Showing 9 changed files with 76,258 additions and 136 deletions.
diff --git a/Pipfile b/Pipfile
@@ -18,6 +18,7 @@ mypy = "*"
 pytest = "*"
 pytest-coverage = "*"
 httpx = "*"
+pytest-snapshot = "*"
 
 [requires]
 python_version = "3.10"
diff --git a/Pipfile.lock b/Pipfile.lock
diff --git a/dotty/main.py b/dotty/main.py
@@ -1,4 +1,5 @@
 import logging
+import typing
 from contextlib import asynccontextmanager
 
 import bioutils.assemblies
@@ -15,6 +16,10 @@
 
 #: The global Driver instance.
 driver: Driver = None  # type: ignore[assignment]
+# NOTE(review): ``lifespan`` appends to this one dict while looping over every
+# assembly, so it accumulates the records of all assemblies, not of a single one.
+#: Map from HGNC ID to transcripts; keys are built as ``f"HGNC:{record['hgnc']}"``.
+hgnc_to_transcripts: dict[str, list[typing.Any]] = {}
+# NOTE(review): ``lifespan`` stores the very same ``hgnc_to_transcripts`` object under
+# each assembly key, so all assemblies share one mapping; ``find_transcripts`` then
+# filters the records by ``genome_builds`` -- confirm whether per-assembly dicts
+# were intended.
+#: Map from Assembly to map from hgnc_id to transcripts.
+assembly_to_hgnc_to_transcripts: dict[Assembly, dict[str, list[typing.Any]]] = {}
 
 #: Contig names per assembly.
 contig_names: dict[Assembly, set[str]] = {
@@ -32,6 +37,21 @@ async def lifespan(app: FastAPI):  # pragma: no cover
     driver = Driver(cdot_dir=settings.DATA_DIR)
     driver.load()
     _logger.info("driver loaded")
+    for assembly in Assembly:
+        for transcript in driver.data_providers[assembly].transcripts.keys():
+            if (
+                assembly.value
+                not in driver.data_providers[assembly]
+                ._get_transcript(transcript)["genome_builds"]
+                .keys()
+            ):
+                continue
+            hgnc_id = f"HGNC:{driver.data_providers[assembly]._get_transcript(transcript)['hgnc']}"
+            hgnc_to_transcripts.setdefault(hgnc_id, []).append(
+                driver.data_providers[assembly]._get_transcript(transcript)
+            )
+        assembly_to_hgnc_to_transcripts[assembly] = hgnc_to_transcripts
+    _logger.info("map built")
     yield
 
 
@@ -63,6 +83,93 @@ class Result(pydantic.BaseModel):
     spdi: Spdi
 
 
+class ExonAlignment(pydantic.BaseModel):
+    """Alignment of an exon to an assembly."""
+
+    #: Start of the exon in the reference.
+    ref_start: int
+    #: End of the exon in the reference.
+    ref_end: int
+    #: Number of the exon.
+    exon_no: int
+    #: Start of the exon in the transcript.
+    tx_start: int
+    #: End of the exon in the transcript.
+    tx_end: int
+    #: Description of the gapped alignment (may be ``None``).
+    alignment: str | None
+
+    @staticmethod
+    def _from_list(lst: list[typing.Any]) -> "ExonAlignment":
+        """Build an ``ExonAlignment`` from a positional exon record.
+
+        The first six entries of ``lst`` are read, in order, as ``ref_start``,
+        ``ref_end``, ``exon_no``, ``tx_start``, ``tx_end`` and ``alignment``;
+        any further entries are ignored.
+        """
+        field_order = ("ref_start", "ref_end", "exon_no", "tx_start", "tx_end", "alignment")
+        # Index explicitly so that a too-short record still raises ``IndexError``.
+        values = {name: lst[position] for position, name in enumerate(field_order)}
+        return ExonAlignment(**values)
+
+
+# NOTE(review): the class name is misspelled ("Tanscript"); it is kept as-is because
+# ``Transcript`` refers to the model by this exact name.
+class TanscriptAlignment(pydantic.BaseModel):
+    """Alignment of a `Transcript` to an assembly."""
+
+    #: Name of the assembly the alignment belongs to.
+    assembly: str
+    #: Contig the transcript is aligned to.
+    contig: str
+    #: Start of the CDS.
+    cds_start: int
+    #: End of the CDS.
+    cds_end: int
+    #: One ``ExonAlignment`` per entry of the record's ``"exons"`` list.
+    exons: list[ExonAlignment]
+
+    @staticmethod
+    def _from_dict(assembly, dct: dict[str, typing.Any]) -> "TanscriptAlignment":
+        """Build a ``TanscriptAlignment`` from one per-assembly alignment record.
+
+        ``dct`` must provide the keys ``"contig"``, ``"cds_start"``, ``"cds_end"``
+        and ``"exons"``; a missing key raises ``KeyError``.
+        """
+        fields: dict[str, typing.Any] = {"assembly": assembly}
+        # Copy the scalar entries first, then convert the exon records.
+        for key in ("contig", "cds_start", "cds_end"):
+            fields[key] = dct[key]
+        fields["exons"] = [ExonAlignment._from_list(entry) for entry in dct["exons"]]
+        return TanscriptAlignment(**fields)
+
+
+class Transcript(pydantic.BaseModel):
+    """Transcript model."""
+
+    #: ID of the transcript.
+    id: str
+    #: HGNC ID of the gene, formatted as ``HGNC:<hgnc>``.
+    hgnc_id: str
+    #: HGNC symbol of the gene.
+    hgnc_symbol: str
+    #: Alignments of the transcript; holds at most the one for the requested assembly.
+    alignments: list[TanscriptAlignment]
+
+    @staticmethod
+    def _from_dict(assembly: str, dct: dict[str, typing.Any]) -> "Transcript":
+        """Build a ``Transcript`` from a raw transcript record.
+
+        ``dct`` must provide ``"id"``, ``"hgnc"``, ``"gene_name"`` and
+        ``"genome_builds"``.  When ``assembly`` is not a key of
+        ``dct["genome_builds"]`` the resulting ``alignments`` list is empty.
+        """
+        transcript_id = dct["id"]
+        gene_id = f"HGNC:{dct['hgnc']}"
+        gene_symbol = dct["gene_name"]
+
+        builds = dct["genome_builds"]
+        aligned: list[TanscriptAlignment] = []
+        if assembly in builds:
+            aligned.append(TanscriptAlignment._from_dict(assembly, builds[assembly]))
+
+        return Transcript(
+            id=transcript_id,
+            hgnc_id=gene_id,
+            hgnc_symbol=gene_symbol,
+            alignments=aligned,
+        )
+
+
+class TranscriptResult(pydantic.BaseModel):
+    """The result of the query for searching for transcripts."""
+
+    #: The actual payload / list of transcripts.
+    #: ``find_transcripts`` fills this with one ``Transcript`` per record that
+    #: passes its ``genome_builds`` filter, so the list may be empty.
+    transcripts: list[Transcript]
+
+
 @app.get("/api/v1/to-spdi", response_model=Result)
 async def to_spdi(q: str, assembly: Assembly = Assembly.GRCH38) -> Result:
     """Resolve the given HGVS variant to SPDI representation."""
@@ -93,3 +200,24 @@ async def to_spdi(q: str, assembly: Assembly = Assembly.GRCH38) -> Result:
             alternate_inserted=alternative,
         )
     )
+
+
+@app.get("/api/v1/find-transcripts", response_model=TranscriptResult)
+async def find_transcripts(hgnc_id: str, assembly: Assembly = Assembly.GRCH38) -> TranscriptResult:
+    """Find transcripts for the given HGNC ID."""
+    # Keys an alignment record must carry before it can be converted.
+    required_keys = ("cds_start", "cds_end", "exons")
+    build_name = assembly.value
+
+    candidates = assembly_to_hgnc_to_transcripts[assembly].get(hgnc_id, [])
+    if not candidates:
+        # Unknown HGNC ID (or no records at all) is reported as 404.
+        raise HTTPException(status_code=404, detail="No transcripts found")
+
+    def _is_usable(record: dict[str, typing.Any]) -> bool:
+        """Return whether ``record`` has a complete alignment for ``build_name``."""
+        builds = record["genome_builds"]
+        return build_name in builds and all(key in builds[build_name] for key in required_keys)
+
+    # Records without a complete alignment for the requested assembly are skipped
+    # silently, so the returned list may be empty.
+    return TranscriptResult(
+        transcripts=[
+            Transcript._from_dict(build_name, record)
+            for record in candidates
+            if _is_usable(record)
+        ]
+    )
diff --git a/stubs/cdot/hgvs/dataproviders/json_data_provider.pyi b/stubs/cdot/hgvs/dataproviders/json_data_provider.pyi
@@ -1,3 +1,5 @@
+import typing
+
 from hgvs.dataproviders.interface import Interface
 
 class AbstractJSONDataProvider(Interface):
@@ -7,4 +9,5 @@ class LocalDataProvider(AbstractJSONDataProvider):
     pass
 
 class JSONDataProvider(LocalDataProvider):
-    pass
+    # Stub for the members dotty/main.py uses: it iterates ``transcripts.keys()``
+    # and passes each key to ``_get_transcript``.
+    # presumably keyed by transcript ID -- verify against the cdot implementation.
+    transcripts: dict[str, typing.Any]
+    # dotty/main.py reads the ``"genome_builds"`` and ``"hgnc"`` entries of the
+    # returned dict; other keys are not visible from here.
+    def _get_transcript(self, tx_id: str) -> dict[str, typing.Any]: ...
diff --git a/stubs/pytest_snapshot/__init__.pyi b/stubs/pytest_snapshot/__init__.pyi
diff --git a/stubs/pytest_snapshot/plugin.pyi b/stubs/pytest_snapshot/plugin.pyi
@@ -0,0 +1,6 @@
+from pathlib import Path
+from typing import Union
+
+class Snapshot:
+    # Typing stub: only the two method signatures declared here are visible to the
+    # type checker.
+    # presumably mirrors ``pytest_snapshot.plugin.Snapshot`` -- verify against the
+    # installed package version.
+    def assert_match(self, value: Union[str, bytes], snapshot_name: Union[str, Path]): ...
+    def assert_match_dir(self, dir_dict: dict, snapshot_dir_name: Union[str, Path]): ...