Skip to content

Commit

Permalink
feat: add org_id param
Browse files Browse the repository at this point in the history
  • Loading branch information
balajtimate committed Oct 31, 2023
1 parent 11f7e08 commit ab51124
Show file tree
Hide file tree
Showing 3 changed files with 58 additions and 21 deletions.
11 changes: 0 additions & 11 deletions htsinfer/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -273,17 +273,6 @@ def __call__(
"not be inferred by the application"
)
)
parser.add_argument(
'--org-name',
dest="org_name",
metavar="STR",
type=str,
default=None,
help=(
"source organism of the sequencing library; if provided, will not "
"not be inferred by the application"
)
)
parser.add_argument(
"--verbosity",
choices=[e.name for e in LogLevels],
Expand Down
64 changes: 57 additions & 7 deletions htsinfer/get_library_source.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,11 @@

import logging
from pathlib import Path
from typing import Optional
import subprocess as sp
import tempfile

from typing import Optional
from Bio import SeqIO # type: ignore
import pandas as pd # type: ignore
from pandas import DataFrame # type: ignore

Expand Down Expand Up @@ -51,6 +52,7 @@ class GetLibSource:
min_freq_ratio: Minimum frequency ratio between the first and second
most frequent source in order for the former to be considered the
library's source.
org_id: Taxonomy ID of the organism.
"""
def __init__( # pylint: disable=E1101
self,
Expand All @@ -64,7 +66,6 @@ def __init__( # pylint: disable=E1101
self.tmp_dir = config.args.tmp_dir
self.min_match_pct = config.args.lib_source_min_match_pct
self.min_freq_ratio = config.args.lib_source_min_freq_ratio
self.org_name = config.args.org_name
self.org_id = config.args.org_id

def evaluate(self) -> ResultsSource:
Expand All @@ -75,9 +76,12 @@ def evaluate(self) -> ResultsSource:
"""
source = ResultsSource()
# Check if library_source is provided, otherwise infer it
if self.org_name is not None:
source.file_1.short_name = self.org_name
if self.org_id is not None:
source.file_1.taxon_id = self.org_id
source.file_1.short_name = self.get_organism_name(
self.org_id,
self.transcripts_file
)
else:
# Infer library source here and set it to source.library_source
index = self.create_kallisto_index()
Expand All @@ -89,10 +93,11 @@ def evaluate(self) -> ResultsSource:
source.file_1.taxon_id = library_source.taxon_id

if self.paths[1] is not None:
# Check if library_source is provided for file_2, otherwise infer it
if self.org_name is not None:
source.file_2.short_name = self.org_name
# Check if library_source is provided for file_2,
# otherwise infer it
if self.org_id is not None:
source.file_2.taxon_id = self.org_id
source.file_2.short_name = source.file_1.short_name
else:
library_source = self.get_source(
fastq=self.paths[1],
Expand Down Expand Up @@ -301,3 +306,48 @@ def get_source_expression(

# return as dictionary
return dat_agg.sort_values(["tpm"], ascending=False)

@staticmethod
def get_organism_name(
taxon_id: int,
transcripts_file: Path,
) -> Optional[str]:
"""Return name of the organism, based on tax ID.
Args:
taxon_id: Taxonomy ID of a given organism (int).
transcripts_file: Path to fasta file containing transcripts.
Returns:
Short name of the organism belonging to the given tax ID.
Raises:
Could not process input FASTA file.
"""
org_dict = {}
# Construct dictionary of organism ID's and names
try:
for record in SeqIO.parse(
handle=transcripts_file,
format='fasta',
):
org_id = int(record.description.split("|")[4])
org_name = record.description.split("|")[3]

if org_id not in org_dict:
org_dict[org_id] = org_name
else:
org_dict[org_id] = org_name

except OSError as exc:
raise FileProblem(
f"Could not process file '{transcripts_file}'"
) from exc

if taxon_id in org_dict:
return org_dict[taxon_id]
LOGGER.warning(
f"Taxon ID '{taxon_id}' not found in organism dictionary, "
"using default 'None' for organism."
)
return None
4 changes: 1 addition & 3 deletions htsinfer/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
)
import logging
import re
from typing import Optional, Union
from typing import Optional
from pathlib import Path
import tempfile

Expand Down Expand Up @@ -356,7 +356,6 @@ class Args(BaseModel):
records: Number of input file records to process; set to `0` to
process all records.
threads: Number of threads to run STAR with.
org_name: Organism name.
org_id: Organism ID.
transcripts_file: File path to transcripts FASTA file.
read_layout_adapter_file: Path to text file containing 3' adapter
Expand Down Expand Up @@ -431,7 +430,6 @@ class Args(BaseModel):
CleanupRegimes.DEFAULT
records: int = 0
threads: int = 1
org_name: Optional[str] = None
org_id: Optional[int] = None
transcripts_file: Path = Path()
read_layout_adapter_file: Path = Path()
Expand Down

0 comments on commit ab51124

Please sign in to comment.