Skip to content

Commit

Permalink
Merge pull request #56 from murphycj/bump-ensembl-version
Browse files Browse the repository at this point in the history
Update code to support Ensembl 111
  • Loading branch information
murphycj authored Sep 30, 2024
2 parents fb68065 + 65ed7d5 commit 65290b3
Show file tree
Hide file tree
Showing 10 changed files with 121 additions and 44 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/test.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -20,11 +20,11 @@ jobs:
pip install .
cd test
../bin/agfusion download -g hg19
../bin/agfusion download -g hg38
../bin/agfusion download -s homo_sapiens -r 111
../bin/agfusion download -s mus_musculus -r 84
pyensembl install --release 84 --species mus_musculus
pyensembl install --release 75 --species homo_sapiens
pyensembl install --release 95 --species homo_sapiens
pyensembl install --release 111 --species homo_sapiens
- name: Test with pytest
run: |
cd test
Expand Down
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -12,3 +12,6 @@ dist/
*pyc
build/
*.db
*.ipynb
*clans.tsv
*.db.gz
13 changes: 13 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -215,6 +215,19 @@ agfusion annotate \
![alt tag](https://github.com/murphycj/AGFusion/blob/master/doc/ENSMUST00000064477-ENSMUST00000002487-rescale.png)
![alt tag](https://github.com/murphycj/AGFusion/blob/master/doc/ENSMUST00000122054-ENSMUST00000070330-rescale.png)

# Building your own database
AGFusion uses a pre-built SQLite database to annotation gene fusions; in addition to data from pyensembl. The SQLite databases are stored on AWS S3.

Follow the steps below if you want to build your own SQLite database:

(1) Install [mysqlclient](https://github.com/PyMySQL/mysqlclient).

(2) Download and unzip the PFAM reference file: [https://ftp.ebi.ac.uk/pub/databases/Pfam/current_release/Pfam-A.clans.tsv.gz](https://ftp.ebi.ac.uk/pub/databases/Pfam/current_release/Pfam-A.clans.tsv.gz)

(3) Install your desired pyensembl reference genome. For example: `pyensembl install --release 111`.

(4) Build the AGFusion database: `agfusion build -d . -s homo_sapiens -r 111 --pfam Pfam-A.clans.tsv`

# Troubleshooting

**(1) Problem:** I get a warning message like the following:
Expand Down
19 changes: 10 additions & 9 deletions agfusion/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -111,15 +111,17 @@ def annotate(

if batch_out_dir is not None:

gene1_name = fusion.gene5prime.gene.name
if gene1_name == "":
gene1_name = fusion.gene5prime.gene.id

gene2_name = fusion.gene3prime.gene.name
if gene2_name == "":
gene2_name = fusion.gene3prime.gene.id

outdir = join(
batch_out_dir,
fusion.gene5prime.gene.name
+ "-"
+ str(junction5prime)
+ "_"
+ fusion.gene3prime.gene.name
+ "-"
+ str(junction3prime),
gene1_name + "-" + str(junction5prime) + "_" + gene2_name + "-" + str(junction3prime),
)

fusion.save_transcript_cdna(out_dir=outdir, middlestar=args.middlestar)
Expand Down Expand Up @@ -155,8 +157,7 @@ def batch_mode(args, agfusion_db, pyensembl_data, rename, colors):
agfusion_db.logger.warn(f"Output directory {args.out} already exists! Overwriting...")

if not Path(args.file).exists():
FileNotFoundError(f"File not found {args.file}")
sys.exit(1)
raise FileNotFoundError(f"File not found {args.file}")

if args.algorithm in parsers.parsers:
for fusion in parsers.parsers[args.algorithm](args.file, agfusion_db.logger):
Expand Down
25 changes: 14 additions & 11 deletions agfusion/database.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
"""Classes for AGFusion database objects
"""

import logging
import sqlite3
import sys
Expand Down Expand Up @@ -115,9 +116,14 @@ def __init__(self, db_dir, species, release, pfam, server):
for line in open(pfam, "r"):
line = line.rstrip().split("\t")

# old format
# pfam_id = line[0]
# pfam_name = line[1]
# pfam_desc = line[3]

pfam_id = line[0]
pfam_name = line[1]
pfam_desc = line[3]
pfam_name = line[3]
pfam_desc = line[4]

self.pfam_mapping[pfam_id] = {"name": pfam_name, "desc": pfam_desc}

Expand Down Expand Up @@ -257,15 +263,12 @@ def fetch_gene_names(self):
sys.exit(1)

mysql_command = (
"""SELECT gene.gene_id, xref.display_label FROM " \
"gene, object_xref, xref,external_db WHERE " \
"gene.gene_id = object_xref.ensembl_id AND " \
"object_xref.ensembl_object_type = 'Gene' AND " \
"object_xref.xref_id = xref.xref_id AND " \
"xref.external_db_id = external_db.external_db_id AND " \
"external_db.db_name = '"""
+ gene_name_db
+ """';"""
"SELECT gene.gene_id, xref.display_label FROM gene, object_xref, xref, external_db "
"WHERE gene.gene_id = object_xref.ensembl_id "
"AND object_xref.ensembl_object_type = 'Gene' "
"AND object_xref.xref_id = xref.xref_id "
"AND xref.external_db_id = external_db.external_db_id "
f"AND external_db.db_name = '{gene_name_db}';"
)

self.logger.info("MySQL - %s", mysql_command)
Expand Down
11 changes: 10 additions & 1 deletion agfusion/model.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
"""
Holds classes for containing information for Gene and Fusion exon and protein information.
"""

import itertools
import os
import re
Expand Down Expand Up @@ -342,7 +343,15 @@ def __init__(
noncanonical=noncanonical,
)

self.name = self.gene5prime.gene.name + "_" + self.gene3prime.gene.name
gene1_name = self.gene5prime.gene.name
if gene1_name == "":
gene1_name = self.gene5prime.gene.id

gene2_name = self.gene3prime.gene.name
if gene2_name == "":
gene2_name = self.gene3prime.gene.id

self.name = gene1_name + "_" + gene2_name
self.name = self.name.replace("/", "-")

# construct all the fusion transcript combinations
Expand Down
2 changes: 1 addition & 1 deletion agfusion/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
# this is mostly contigent on the maximum ensembl release supported
# by pyensembl

MAX_ENSEMBL_RELEASE = 95
MAX_ENSEMBL_RELEASE = 111

GENOME_SHORTCUTS = {
"GRCm38": ["mus_musculus", MAX_ENSEMBL_RELEASE],
Expand Down
46 changes: 46 additions & 0 deletions bin/build_dbs.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
"""
Download and build the agfusion database for a range of releases.
"""

import subprocess


def run_command(command):
"""Run a command and print it."""
print(f"Running: {command}")
subprocess.run(command, shell=True, check=True)


# species = "homo_sapiens"
species = "mus_musculus"

# Loop over releases from 96 to 110
for i in range(92, 112):
# Check if the file exists on S3
s3_check_command = f"aws s3 ls s3://agfusion/agfusion.{species}.{i}.db.gz"
result = subprocess.run(
s3_check_command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, check=True
)

if result.returncode == 0:
print(f"File for release {i} already exists in S3. Skipping...")
continue

# continue

print(f"Building release {i}")

# Install the release using pyensembl
run_command(f"pyensembl install --release {i} --species {species}")

# Build the agfusion database
run_command(f"agfusion build -d . -s {species} -r {i} --pfam Pfam-A.clans.tsv")

# Compress the database
run_command(f"gzip agfusion.{species}.{i}.db")

# Upload the compressed file to S3
run_command(f"aws s3 cp agfusion.{species}.{i}.db.gz s3://agfusion")

# Delete all files for the release from pyensembl
run_command(f"pyensembl delete-all-files --release {i} --species {species}")
9 changes: 5 additions & 4 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
matplotlib>=3.6.1
pandas>=1.5.1
biopython>=1.79
matplotlib>=3.9.2
pandas==2.2.3
biopython>=1.84
future>=0.16.0
pyensembl>=1.1.0
pyensembl>=2.3.13
numpy<2
33 changes: 17 additions & 16 deletions test/test_parsers.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
"""Unit tests.
"""

import unittest
from os.path import abspath, curdir, join

Expand All @@ -15,9 +16,9 @@
db_human = database.AGFusionDB(abspath(join(curdir, "agfusion.homo_sapiens.75.db")))
db_human.build = "homo_sapiens_75"

data_human95 = pyensembl.EnsemblRelease(95, "human")
db_human95 = database.AGFusionDB(abspath(join(curdir, "agfusion.homo_sapiens.95.db")))
db_human95.build = "homo_sapiens_95"
data_human_hg38 = pyensembl.EnsemblRelease(111, "human")
db_human_hg38 = database.AGFusionDB(abspath(join(curdir, "agfusion.homo_sapiens.111.db")))
db_human_hg38.build = "homo_sapiens_111"


BASEDIR = "./data/FusionsFindingAlgorithms"
Expand Down Expand Up @@ -111,15 +112,15 @@ def test_with_coding_effect(self):
all_fusions = ["ARID3B_MYCNUT", "ARID3B_MYCN", "TVP23C_CDRT4"]
for fusion in parsers.parsers["starfusion"](
f"{BASEDIR}/STARFusion/" + "star-fusion.fusion_predictions.abridged.coding_effect.tsv",
db_human95.logger,
db_human_hg38.logger,
):
fusion = model.Fusion(
gene5prime=fusion["gene5prime"],
gene5primejunction=fusion["gene5prime_junction"],
gene3prime=fusion["gene3prime"],
gene3primejunction=fusion["gene3prime_junction"],
db=db_human95,
pyensembl_data=data_human95,
db=db_human_hg38,
pyensembl_data=data_human_hg38,
protein_databases=["pfam"],
noncanonical=False,
)
Expand Down Expand Up @@ -155,15 +156,15 @@ def test_parse_human(self):
all_fusions = ["BCAS4_BCAS3", "HNRNPC_ACIN1"]
for fusion in parsers.parsers["longgf"](
f"{BASEDIR}/LongGF/fusions_hg38.log",
db_human95.logger,
db_human_hg38.logger,
):
fusion = model.Fusion(
gene5prime=fusion["gene5prime"],
gene5primejunction=fusion["gene5prime_junction"],
gene3prime=fusion["gene3prime"],
gene3primejunction=fusion["gene3prime_junction"],
db=db_human95,
pyensembl_data=data_human95,
db=db_human_hg38,
pyensembl_data=data_human_hg38,
protein_databases=["pfam"],
noncanonical=False,
)
Expand All @@ -176,35 +177,35 @@ class TestFusionInspector(unittest.TestCase):
def test_parse_human(self):
"""Test basic parsing."""

all_fusions = ["AL627171.2_TPM3", "STAT3_AL627171.2"]
all_fusions = ["ENSG00000282885_TPM3", "STAT3_ENSG00000282885"]

for fusion in parsers.parsers["fusioninspector"](
f"{BASEDIR}/FusionInspector/test.FusionInspector.fusions.abridged.txt",
db_human95.logger,
db_human_hg38.logger,
):
fusion = model.Fusion(
gene5prime=fusion["gene5prime"],
gene5primejunction=fusion["gene5prime_junction"],
gene3prime=fusion["gene3prime"],
gene3primejunction=fusion["gene3prime_junction"],
db=db_human95,
pyensembl_data=data_human95,
db=db_human_hg38,
pyensembl_data=data_human_hg38,
protein_databases=["pfam"],
noncanonical=False,
)
assert fusion.name in all_fusions, f"{fusion.name} not in list!"

for fusion in parsers.parsers["fusioninspector"](
f"{BASEDIR}/FusionInspector/test.FusionInspector.fusions.txt",
db_human95.logger,
db_human_hg38.logger,
):
fusion = model.Fusion(
gene5prime=fusion["gene5prime"],
gene5primejunction=fusion["gene5prime_junction"],
gene3prime=fusion["gene3prime"],
gene3primejunction=fusion["gene3prime_junction"],
db=db_human95,
pyensembl_data=data_human95,
db=db_human_hg38,
pyensembl_data=data_human_hg38,
protein_databases=["pfam"],
noncanonical=False,
)
Expand Down

0 comments on commit 65290b3

Please sign in to comment.