Merge pull request #56 from murphycj/bump-ensembl-version

Update code to support Ensembl 111
murphycj · Sep 30, 2024 · 65290b3 · 65290b3
2 parents fb68065 + 65ed7d5
commit 65290b3
Show file tree

Hide file tree

Showing 10 changed files with 121 additions and 44 deletions.
diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml
@@ -20,11 +20,11 @@ jobs:
           pip install .
           cd test
           ../bin/agfusion download -g hg19
-          ../bin/agfusion download -g hg38
+          ../bin/agfusion download -s homo_sapiens -r 111
           ../bin/agfusion download -s mus_musculus -r 84
           pyensembl install --release 84 --species mus_musculus
           pyensembl install --release 75 --species homo_sapiens
-          pyensembl install --release 95 --species homo_sapiens
+          pyensembl install --release 111 --species homo_sapiens
       - name: Test with pytest
         run: |
           cd test

diff --git a/.gitignore b/.gitignore
@@ -12,3 +12,6 @@ dist/
 *pyc
 build/
 *.db
+*.ipynb
+*clans.tsv
+*.db.gz
diff --git a/README.md b/README.md
@@ -215,6 +215,19 @@ agfusion annotate \
 ![alt tag](https://github.com/murphycj/AGFusion/blob/master/doc/ENSMUST00000064477-ENSMUST00000002487-rescale.png)
 ![alt tag](https://github.com/murphycj/AGFusion/blob/master/doc/ENSMUST00000122054-ENSMUST00000070330-rescale.png)
 
+# Building your own database
+AGFusion uses a pre-built SQLite database to annotation gene fusions; in addition to data from pyensembl. The SQLite databases are stored on AWS S3.
+
+Follow the steps below if you want to build your own SQLite database:
+
+(1) Install [mysqlclient](https://github.com/PyMySQL/mysqlclient).
+
+(2) Download and unzip the PFAM reference file: [https://ftp.ebi.ac.uk/pub/databases/Pfam/current_release/Pfam-A.clans.tsv.gz](https://ftp.ebi.ac.uk/pub/databases/Pfam/current_release/Pfam-A.clans.tsv.gz)
+
+(3) Install your desired pyensembl reference genome. For example: `pyensembl install --release 111`.
+
+(4) Build the AGFusion database: `agfusion build -d . -s homo_sapiens -r 111 --pfam Pfam-A.clans.tsv`
+
 # Troubleshooting
 
 **(1) Problem:** I get a warning message like the following:

diff --git a/agfusion/cli.py b/agfusion/cli.py
@@ -111,15 +111,17 @@ def annotate(
 
     if batch_out_dir is not None:
 
+        gene1_name = fusion.gene5prime.gene.name
+        if gene1_name == "":
+            gene1_name = fusion.gene5prime.gene.id
+
+        gene2_name = fusion.gene3prime.gene.name
+        if gene2_name == "":
+            gene2_name = fusion.gene3prime.gene.id
+
         outdir = join(
             batch_out_dir,
-            fusion.gene5prime.gene.name
-            + "-"
-            + str(junction5prime)
-            + "_"
-            + fusion.gene3prime.gene.name
-            + "-"
-            + str(junction3prime),
+            gene1_name + "-" + str(junction5prime) + "_" + gene2_name + "-" + str(junction3prime),
         )
 
     fusion.save_transcript_cdna(out_dir=outdir, middlestar=args.middlestar)
@@ -155,8 +157,7 @@ def batch_mode(args, agfusion_db, pyensembl_data, rename, colors):
         agfusion_db.logger.warn(f"Output directory {args.out} already exists! Overwriting...")
 
     if not Path(args.file).exists():
-        FileNotFoundError(f"File not found {args.file}")
-        sys.exit(1)
+        raise FileNotFoundError(f"File not found {args.file}")
 
     if args.algorithm in parsers.parsers:
         for fusion in parsers.parsers[args.algorithm](args.file, agfusion_db.logger):

diff --git a/agfusion/database.py b/agfusion/database.py
@@ -1,5 +1,6 @@
 """Classes for AGFusion database objects
 """
+
 import logging
 import sqlite3
 import sys
@@ -115,9 +116,14 @@ def __init__(self, db_dir, species, release, pfam, server):
         for line in open(pfam, "r"):
             line = line.rstrip().split("\t")
 
+            # old format
+            # pfam_id = line[0]
+            # pfam_name = line[1]
+            # pfam_desc = line[3]
+
             pfam_id = line[0]
-            pfam_name = line[1]
-            pfam_desc = line[3]
+            pfam_name = line[3]
+            pfam_desc = line[4]
 
             self.pfam_mapping[pfam_id] = {"name": pfam_name, "desc": pfam_desc}
 
@@ -257,15 +263,12 @@ def fetch_gene_names(self):
             sys.exit(1)
 
         mysql_command = (
-            """SELECT gene.gene_id, xref.display_label FROM " \
-            "gene, object_xref, xref,external_db WHERE " \
-            "gene.gene_id = object_xref.ensembl_id AND " \
-            "object_xref.ensembl_object_type = 'Gene' AND " \
-            "object_xref.xref_id = xref.xref_id AND " \
-            "xref.external_db_id = external_db.external_db_id AND " \
-            "external_db.db_name = '"""
-            + gene_name_db
-            + """';"""
+            "SELECT gene.gene_id, xref.display_label FROM gene, object_xref, xref, external_db "
+            "WHERE gene.gene_id = object_xref.ensembl_id "
+            "AND object_xref.ensembl_object_type = 'Gene' "
+            "AND object_xref.xref_id = xref.xref_id "
+            "AND xref.external_db_id = external_db.external_db_id "
+            f"AND external_db.db_name = '{gene_name_db}';"
         )
 
         self.logger.info("MySQL - %s", mysql_command)

diff --git a/agfusion/model.py b/agfusion/model.py
@@ -1,6 +1,7 @@
 """
 Holds classes for containing information for Gene and Fusion exon and protein information.
 """
+
 import itertools
 import os
 import re
@@ -342,7 +343,15 @@ def __init__(
             noncanonical=noncanonical,
         )
 
-        self.name = self.gene5prime.gene.name + "_" + self.gene3prime.gene.name
+        gene1_name = self.gene5prime.gene.name
+        if gene1_name == "":
+            gene1_name = self.gene5prime.gene.id
+
+        gene2_name = self.gene3prime.gene.name
+        if gene2_name == "":
+            gene2_name = self.gene3prime.gene.id
+
+        self.name = gene1_name + "_" + gene2_name
         self.name = self.name.replace("/", "-")
 
         # construct all the fusion transcript combinations

diff --git a/agfusion/utils.py b/agfusion/utils.py
@@ -13,7 +13,7 @@
 # this is mostly contigent on the maximum ensembl release supported
 # by pyensembl
 
-MAX_ENSEMBL_RELEASE = 95
+MAX_ENSEMBL_RELEASE = 111
 
 GENOME_SHORTCUTS = {
     "GRCm38": ["mus_musculus", MAX_ENSEMBL_RELEASE],

diff --git a/bin/build_dbs.py b/bin/build_dbs.py
@@ -0,0 +1,46 @@
+"""
+Download and build the agfusion database for a range of releases.
+"""
+
+import subprocess
+
+
+def run_command(command):
+    """Run a command and print it."""
+    print(f"Running: {command}")
+    subprocess.run(command, shell=True, check=True)
+
+
+# species = "homo_sapiens"
+species = "mus_musculus"
+
+# Loop over releases from 96 to 110
+for i in range(92, 112):
+    # Check if the file exists on S3
+    s3_check_command = f"aws s3 ls s3://agfusion/agfusion.{species}.{i}.db.gz"
+    result = subprocess.run(
+        s3_check_command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, check=True
+    )
+
+    if result.returncode == 0:
+        print(f"File for release {i} already exists in S3. Skipping...")
+        continue
+
+    # continue
+
+    print(f"Building release {i}")
+
+    # Install the release using pyensembl
+    run_command(f"pyensembl install --release {i} --species {species}")
+
+    # Build the agfusion database
+    run_command(f"agfusion build -d . -s {species} -r {i} --pfam Pfam-A.clans.tsv")
+
+    # Compress the database
+    run_command(f"gzip agfusion.{species}.{i}.db")
+
+    # Upload the compressed file to S3
+    run_command(f"aws s3 cp agfusion.{species}.{i}.db.gz s3://agfusion")
+
+    # Delete all files for the release from pyensembl
+    run_command(f"pyensembl delete-all-files --release {i} --species {species}")
diff --git a/requirements.txt b/requirements.txt
@@ -1,5 +1,6 @@
-matplotlib>=3.6.1
-pandas>=1.5.1
-biopython>=1.79
+matplotlib>=3.9.2
+pandas==2.2.3
+biopython>=1.84
 future>=0.16.0
-pyensembl>=1.1.0
+pyensembl>=2.3.13
+numpy<2
diff --git a/test/test_parsers.py b/test/test_parsers.py
@@ -1,5 +1,6 @@
 """Unit tests.
 """
+
 import unittest
 from os.path import abspath, curdir, join
 
@@ -15,9 +16,9 @@
 db_human = database.AGFusionDB(abspath(join(curdir, "agfusion.homo_sapiens.75.db")))
 db_human.build = "homo_sapiens_75"
 
-data_human95 = pyensembl.EnsemblRelease(95, "human")
-db_human95 = database.AGFusionDB(abspath(join(curdir, "agfusion.homo_sapiens.95.db")))
-db_human95.build = "homo_sapiens_95"
+data_human_hg38 = pyensembl.EnsemblRelease(111, "human")
+db_human_hg38 = database.AGFusionDB(abspath(join(curdir, "agfusion.homo_sapiens.111.db")))
+db_human_hg38.build = "homo_sapiens_111"
 
 
 BASEDIR = "./data/FusionsFindingAlgorithms"
@@ -111,15 +112,15 @@ def test_with_coding_effect(self):
         all_fusions = ["ARID3B_MYCNUT", "ARID3B_MYCN", "TVP23C_CDRT4"]
         for fusion in parsers.parsers["starfusion"](
             f"{BASEDIR}/STARFusion/" + "star-fusion.fusion_predictions.abridged.coding_effect.tsv",
-            db_human95.logger,
+            db_human_hg38.logger,
         ):
             fusion = model.Fusion(
                 gene5prime=fusion["gene5prime"],
                 gene5primejunction=fusion["gene5prime_junction"],
                 gene3prime=fusion["gene3prime"],
                 gene3primejunction=fusion["gene3prime_junction"],
-                db=db_human95,
-                pyensembl_data=data_human95,
+                db=db_human_hg38,
+                pyensembl_data=data_human_hg38,
                 protein_databases=["pfam"],
                 noncanonical=False,
             )
@@ -155,15 +156,15 @@ def test_parse_human(self):
         all_fusions = ["BCAS4_BCAS3", "HNRNPC_ACIN1"]
         for fusion in parsers.parsers["longgf"](
             f"{BASEDIR}/LongGF/fusions_hg38.log",
-            db_human95.logger,
+            db_human_hg38.logger,
         ):
             fusion = model.Fusion(
                 gene5prime=fusion["gene5prime"],
                 gene5primejunction=fusion["gene5prime_junction"],
                 gene3prime=fusion["gene3prime"],
                 gene3primejunction=fusion["gene3prime_junction"],
-                db=db_human95,
-                pyensembl_data=data_human95,
+                db=db_human_hg38,
+                pyensembl_data=data_human_hg38,
                 protein_databases=["pfam"],
                 noncanonical=False,
             )
@@ -176,35 +177,35 @@ class TestFusionInspector(unittest.TestCase):
     def test_parse_human(self):
         """Test basic parsing."""
 
-        all_fusions = ["AL627171.2_TPM3", "STAT3_AL627171.2"]
+        all_fusions = ["ENSG00000282885_TPM3", "STAT3_ENSG00000282885"]
 
         for fusion in parsers.parsers["fusioninspector"](
             f"{BASEDIR}/FusionInspector/test.FusionInspector.fusions.abridged.txt",
-            db_human95.logger,
+            db_human_hg38.logger,
         ):
             fusion = model.Fusion(
                 gene5prime=fusion["gene5prime"],
                 gene5primejunction=fusion["gene5prime_junction"],
                 gene3prime=fusion["gene3prime"],
                 gene3primejunction=fusion["gene3prime_junction"],
-                db=db_human95,
-                pyensembl_data=data_human95,
+                db=db_human_hg38,
+                pyensembl_data=data_human_hg38,
                 protein_databases=["pfam"],
                 noncanonical=False,
             )
             assert fusion.name in all_fusions, f"{fusion.name} not in list!"
 
         for fusion in parsers.parsers["fusioninspector"](
             f"{BASEDIR}/FusionInspector/test.FusionInspector.fusions.txt",
-            db_human95.logger,
+            db_human_hg38.logger,
         ):
             fusion = model.Fusion(
                 gene5prime=fusion["gene5prime"],
                 gene5primejunction=fusion["gene5prime_junction"],
                 gene3prime=fusion["gene3prime"],
                 gene3primejunction=fusion["gene3prime_junction"],
-                db=db_human95,
-                pyensembl_data=data_human95,
+                db=db_human_hg38,
+                pyensembl_data=data_human_hg38,
                 protein_databases=["pfam"],
                 noncanonical=False,
             )