Skip to content

Commit

Permalink
add archaea
Browse files Browse the repository at this point in the history
  • Loading branch information
gbouras13 committed Jul 23, 2024
1 parent f31c59e commit 839b4ee
Show file tree
Hide file tree
Showing 15 changed files with 3,674 additions and 415 deletions.
4 changes: 4 additions & 0 deletions src/dnaapler/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -208,6 +208,7 @@ def chromosome(
archaea command
"""


@main_cli.command()
@click.help_option("--help", "-h")
@click.version_option(get_version(), "--version", "-V")
Expand Down Expand Up @@ -278,6 +279,7 @@ def archaea(
# end dnaapler
end_dnaapler(start_time)


"""
Plasmid command
"""
Expand Down Expand Up @@ -899,6 +901,8 @@ def all(
gene = "dnaA,terL"
elif db == "repa,terl":
gene = "repA,terL"
elif db == "cog1474":
gene = "cog1474"

# custom
if custom_db != "":
Expand Down
3,231 changes: 3,231 additions & 0 deletions src/dnaapler/db/all.faa

Large diffs are not rendered by default.

Binary file modified src/dnaapler/db/all_db.pdb
Binary file not shown.
Binary file modified src/dnaapler/db/all_db.phr
Binary file not shown.
Binary file modified src/dnaapler/db/all_db.pin
Binary file not shown.
10 changes: 5 additions & 5 deletions src/dnaapler/db/all_db.pjs
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,12 @@
"dbtype": "Protein",
"db-version": 5,
"description": "all.faa",
"number-of-letters": 4304128,
"number-of-sequences": 8739,
"last-updated": "2023-08-15T10:44:00",
"number-of-letters": 4463108,
"number-of-sequences": 9142,
"last-updated": "2024-07-23T22:32:00",
"number-of-volumes": 1,
"bytes-total": 5770444,
"bytes-to-cache": 4382868,
"bytes-total": 6022224,
"bytes-to-cache": 4545475,
"files": [
"all_db.pdb",
"all_db.phr",
Expand Down
Binary file modified src/dnaapler/db/all_db.psq
Binary file not shown.
Binary file modified src/dnaapler/db/all_db.pto
Binary file not shown.
806 changes: 403 additions & 403 deletions src/dnaapler/db/cog1474.fasta

Large diffs are not rendered by default.

Binary file modified src/dnaapler/db/cog1474_db.phr
Binary file not shown.
Binary file modified src/dnaapler/db/cog1474_db.pin
Binary file not shown.
6 changes: 3 additions & 3 deletions src/dnaapler/db/cog1474_db.pjs
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,10 @@
"description": "cog1474.fasta",
"number-of-letters": 158980,
"number-of-sequences": 403,
"last-updated": "2024-07-23T15:32:00",
"last-updated": "2024-07-23T22:30:00",
"number-of-volumes": 1,
"bytes-total": 272156,
"bytes-to-cache": 162704,
"bytes-total": 288633,
"bytes-to-cache": 162712,
"files": [
"cog1474_db.pdb",
"cog1474_db.phr",
Expand Down
19 changes: 15 additions & 4 deletions src/dnaapler/utils/all.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ def all_process_blast_output_and_reorient(
seed_value: int,
custom_db: str,
) -> None:
"""Processes the blast output,reorients and saves all contigs into os.path.join(output, f"{prefix}_reoriented.fasta")
"""Processes the blast output, reorients and saves all contigs into os.path.join(output, f"{prefix}_reoriented.fasta")
:param input: input file
:param blast_file: blast output file
Expand Down Expand Up @@ -174,19 +174,27 @@ def all_process_blast_output_and_reorient(
"repA": filtered_df[
filtered_df["sseqid"].str.contains("UniRef90", case=False)
].shape[0],
"cog1474": filtered_df[
filtered_df["sseqid"].str.contains("cog1474", case=False)
].shape[0],
}

# if there are hits to more than 1 of dnaA, cog1474, repA, terL, implement logic
# to prefer dnaA, then cog1474 (archaea), then repA, then terL (in that order)
if (counts["dnaA"] > 0) + (counts["terL"] > 0) + (counts["repA"] > 0) >= 2:
if (counts["dnaA"] > 0) + (counts["terL"] > 0) + (counts["repA"] > 0) + (counts["cog1474"] > 0) >= 2:
# prefer dnaA if it is greater than zero
if counts["dnaA"] > 0:
# keep only the hits where dnaA is found
filtered_df = filtered_df[
filtered_df["sseqid"].str.contains("DNAA")
]

else: # where there is repA and terL, keep repA
# else prefer cog1474 - archaea if it is greater than zero
elif counts["cog1474"] > 0:
filtered_df = filtered_df[
filtered_df["sseqid"].str.contains("cog1474")
]
# otherwise where there is repA and terL, keep repA
else:
filtered_df = filtered_df[
filtered_df["sseqid"].str.contains("UniRef90")
]
Expand Down Expand Up @@ -229,6 +237,9 @@ def all_process_blast_output_and_reorient(
# set as dnaA by default
gene = "dnaA"

# for archaea
if "cog1474" in filtered_df["sseqid"][0]:
gene = "cog1474"
# for plasmids
if "UniRef90" in filtered_df["sseqid"][0]:
gene = "repA"
Expand Down
2 changes: 2 additions & 0 deletions src/dnaapler/utils/bulk.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,8 @@ def run_bulk_blast(
db_name = "dnaA_terL_db"
elif gene == "repA,terL":
db_name = "repA_terL_db"
elif db == "cog1474":
gene = "cog1474_db"

# for chromosome, plasmid or phage or all
# runs blast
Expand Down
11 changes: 11 additions & 0 deletions tests/test_overall.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,17 @@ def test_chrom(tmp_dir):
cmd = f"dnaapler chromosome -i {input_fasta} -o {tmp_dir} -t 1 -f"
exec_command(cmd)

def test_all_archaea(tmp_dir):
    """Run ``dnaapler all`` on the CP001742.1 archaea test FASTA.

    :param tmp_dir: output directory passed to ``-o`` (supplied by the test
        fixture of the same name).
    """
    # The f-string yields a plain ``str`` (not a ``Path``), so annotate it as such.
    input_fasta: str = f"{overall_test_data}/CP001742.1_archaea.fasta"
    # -t 1: single thread; -f: force flag (presumably overwrites an existing
    # output directory -- confirm against the CLI help).
    cmd = f"dnaapler all -i {input_fasta} -o {tmp_dir} -t 1 -f"
    exec_command(cmd)

def test_archaea(tmp_dir):
    """Run the ``dnaapler archaea`` subcommand on the CP001742.1 archaea test FASTA.

    :param tmp_dir: output directory passed to ``-o`` (supplied by the test
        fixture of the same name).
    """
    # The f-string yields a plain ``str`` (not a ``Path``), so annotate it as such.
    input_fasta: str = f"{overall_test_data}/CP001742.1_archaea.fasta"
    # -t 1: single thread; -f: force flag (presumably overwrites an existing
    # output directory -- confirm against the CLI help).
    cmd = f"dnaapler archaea -i {input_fasta} -o {tmp_dir} -t 1 -f"
    exec_command(cmd)

def test_chrom_start_codon_not_found(tmp_dir):
"""test chrom"""
Expand Down

0 comments on commit 839b4ee

Please sign in to comment.