Skip to content

Commit

Permalink
Merge pull request #63 from databio/dev
Browse files Browse the repository at this point in the history
release 0.3.0
  • Loading branch information
khoroshevskyi authored Aug 21, 2024
2 parents 4298e4b + 4301c3d commit 813a61a
Show file tree
Hide file tree
Showing 24 changed files with 774 additions and 309 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/run-pytest.yml
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ jobs:
runs-on: ${{ matrix.os }}
strategy:
matrix:
python-version: ["3.8", "3.11"]
python-version: ["3.9", "3.11"]
os: [ubuntu-latest]

steps:
Expand Down
13 changes: 12 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -139,4 +139,15 @@ openSignalMatrix
out2023/*

# test data
test/test_data/*
test/test_data/*
/scripts/bedclassifier_tuning/results/
/scripts/bedclassifier_tuning/data/
genome_config.yaml
alias/hg19/fasta/default/hg19.chrom.sizes
alias/hg19/fasta/default/hg19.fa
alias/hg19/fasta/default/hg19.fa.fai
data/baa91c8f6e2780cfd8fd1040ff37f51c379947a2a4820d6c/baa91c8f6e2780cfd8fd1040ff37f51c379947a2a4820d6c__ASDs.json
data/baa91c8f6e2780cfd8fd1040ff37f51c379947a2a4820d6c/fasta/default/baa91c8f6e2780cfd8fd1040ff37f51c379947a2a4820d6c.chrom.sizes
data/baa91c8f6e2780cfd8fd1040ff37f51c379947a2a4820d6c/fasta/default/baa91c8f6e2780cfd8fd1040ff37f51c379947a2a4820d6c.fa
data/baa91c8f6e2780cfd8fd1040ff37f51c379947a2a4820d6c/fasta/default/baa91c8f6e2780cfd8fd1040ff37f51c379947a2a4820d6c.fa.fai
test/Untitled.ipynb
3 changes: 2 additions & 1 deletion MANIFEST.in
Original file line number Diff line number Diff line change
Expand Up @@ -8,4 +8,5 @@ include bedboss/bedqc/*
include bedboss/qdrant_index/*
include bedboss/bedbuncher/*
include bedboss/bedbuncher/tools/*
include bedboss/bedclassifier/*
include bedboss/bedclassifier/*
include bedboss/tokens/*
2 changes: 1 addition & 1 deletion bedboss/_version.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.2.1"
__version__ = "0.3.0"
33 changes: 31 additions & 2 deletions bedboss/bedboss.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
from pephubclient.helpers import is_registry_path, MessageHandler as m
from bbconf.bbagent import BedBaseAgent
from bbconf.models.base_models import FileModel
from bbconf.const import DEFAULT_LICENSE

from bedboss.bedstat.bedstat import bedstat
from bedboss.bedmaker.bedmaker import make_all
Expand Down Expand Up @@ -55,6 +56,7 @@ def run_all(
genome: str,
bedbase_config: Union[str, bbconf.BedBaseAgent],
name: str = None,
license_id: str = DEFAULT_LICENSE,
rfg_config: str = None,
narrowpeak: bool = False,
check_qc: bool = True,
Expand All @@ -67,6 +69,10 @@ def run_all(
upload_qdrant: bool = False,
upload_s3: bool = False,
upload_pephub: bool = False,
# Universes
universe: bool = False,
universe_method: str = None,
universe_bedset: str = None,
pm: pypiper.PipelineManager = None,
) -> str:
"""
Expand All @@ -78,6 +84,7 @@ def run_all(
:param str genome: genome_assembly of the sample. [required] options: (hg19, hg38, mm10) # TODO: add more
:param str name: name of the sample (human-readable name, e.g. "H3K27ac in liver") [optional]
:param Union[str, bbconf.BedBaseConf] bedbase_config: The path to the bedbase configuration file, or bbconf object.
:param str license_id: license identifier [optional] (default: "DUO:0000042"). All available licenses are listed on bedbase.org
:param str rfg_config: file path to the genome config file [optional]
:param bool narrowpeak: whether the regions are narrow. Used to create bed file from bedgraph or bigwig
(transcription factor implies narrow, histone mark implies broad peaks) [optional]
Expand All @@ -92,6 +99,10 @@ def run_all(
:param bool upload_qdrant: whether to upload the bed file to qdrant (index it) [Default: False]
:param bool upload_s3: whether to upload to s3
:param bool upload_pephub: whether to push bedfiles and metadata to pephub (default: False)
:param bool universe: whether to add the sample as the universe [Default: False]
:param str universe_method: method used to create the universe [Default: None]
:param str universe_bedset: bedset identifier for the universe [Default: None]
:param pypiper.PipelineManager pm: pypiper object
:return str bed_digest: bed digest
"""
Expand Down Expand Up @@ -189,6 +200,7 @@ def run_all(
plots=plots.model_dump(exclude_unset=True),
files=files.model_dump(exclude_unset=True),
classification=classification.model_dump(exclude_unset=True),
license_id=license_id,
upload_qdrant=upload_qdrant,
upload_pephub=upload_pephub,
upload_s3=upload_s3,
Expand All @@ -197,6 +209,13 @@ def run_all(
nofail=True,
)

if universe:
bbagent.bed.add_universe(
bedfile_id=bed_metadata.bed_digest,
bedset_id=universe_bedset,
construct_method=universe_method,
)

if stop_pipeline:
pm.stop_pipeline()

Expand All @@ -211,7 +230,9 @@ def insert_pep(
bedset_id: str = None,
bedset_name: str = None,
rfg_config: str = None,
create_bedset: bool = True,
license_id: str = DEFAULT_LICENSE,
create_bedset: bool = False,
bedset_heavy: bool = False,
check_qc: bool = True,
ensdb: str = None,
just_db_commit: bool = False,
Expand All @@ -232,7 +253,10 @@ def insert_pep(
:param str bedset_id: bedset identifier
:param str bedset_name: bedset name
:param str rfg_config: path to the genome config file (refgenie)
:param str license_id: license identifier [optional] (default: "DUO:0000042"). All available licenses are listed on bedbase.org
This license will be used for bed files whose license is not provided in the PEP file
:param bool create_bedset: whether to create bedset
:param bool bedset_heavy: whether to use heavy processing (add all columns to the database)
:param bool upload_qdrant: whether to upload bedfiles to qdrant
:param bool check_qc: whether to run quality control during bedmaking
:param str ensdb: a full path to the ensdb gtf file required for genomes not in GDdata
Expand Down Expand Up @@ -279,6 +303,7 @@ def insert_pep(
genome=pep_sample.genome,
name=pep_sample.sample_name,
bedbase_config=bbagent,
license_id=pep_sample.get("license_id") or license_id,
narrowpeak=is_narrow_peak,
chrom_sizes=pep_sample.get("chrom_sizes"),
open_signal_matrix=pep_sample.get("open_signal_matrix"),
Expand All @@ -292,8 +317,12 @@ def insert_pep(
upload_qdrant=upload_qdrant,
upload_s3=upload_s3,
upload_pephub=upload_pephub,
universe=pep_sample.get("universe"),
universe_method=pep_sample.get("universe_method"),
universe_bedset=pep_sample.get("universe_bedset"),
pm=pm,
)

processed_ids.append(bed_id)
except BedBossException as e:
_LOGGER.error(f"Failed to process {pep_sample.sample_name}. See {e}")
Expand All @@ -308,7 +337,7 @@ def insert_pep(
name=bedset_name or pep.name,
output_folder=output_folder,
description=pep.description,
heavy=True,
heavy=bedset_heavy,
upload_pephub=upload_pephub,
upload_s3=upload_s3,
no_fail=no_fail,
Expand Down
2 changes: 1 addition & 1 deletion bedboss/bedbuncher/bedbuncher.py
Original file line number Diff line number Diff line change
Expand Up @@ -158,7 +158,7 @@ def run_bedbuncher(
description=description,
upload_pephub=upload_pephub,
upload_s3=upload_s3,
plots=plots.model_dump(exclude_none=True, exclude_unset=True),
plots=plots.model_dump(exclude_none=True, exclude_unset=True) if plots else {},
local_path=output_folder,
no_fail=no_fail,
overwrite=force_overwrite,
Expand Down
27 changes: 27 additions & 0 deletions bedboss/bedclassifier/bedclassifier.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,12 +38,39 @@ def get_bed_type(bed: str, no_fail: Optional[bool] = True) -> Tuple[str, str]:

max_rows = 5
row_count = 0

while row_count <= max_rows:
try:
df = pd.read_csv(bed, sep="\t", header=None, nrows=4, skiprows=row_count)
if row_count > 0:
_LOGGER.info(f"Skipped {row_count} rows to parse bed file {bed}")
break
except UnicodeDecodeError as e:
try:
df = pd.read_csv(
bed,
sep="\t",
header=None,
nrows=4,
skiprows=row_count,
encoding="utf-16",
)
if row_count > 0:
_LOGGER.info(f"Skipped {row_count} rows to parse bed file {bed}")
break
except (pandas.errors.ParserError, pandas.errors.EmptyDataError) as e:
if row_count <= max_rows:
row_count += 1
else:
if no_fail:
_LOGGER.warning(
f"Unable to parse bed file {bed}, due to error {e}, setting bed_type = unknown_bedtype"
)
return "unknown_bedtype", "unknown_bedtype"
else:
raise BedTypeException(
reason=f"Bed type could not be determined due to CSV parse error {e}"
)
except (pandas.errors.ParserError, pandas.errors.EmptyDataError) as e:
if row_count <= max_rows:
row_count += 1
Expand Down
4 changes: 2 additions & 2 deletions bedboss/bedmaker/bedmaker.py
Original file line number Diff line number Diff line change
Expand Up @@ -169,7 +169,7 @@ def make_bed(
if input_type not in [member.value for member in InputTypes]:
raise BedBossException(
f"Invalid input type: {input_type}. "
f"Supported types: {', '.join(InputTypes.__members__.values())}"
f"Supported types: {', '.join([k.value for k in InputTypes])}"
)

if not pm:
Expand Down Expand Up @@ -405,7 +405,7 @@ def make_all(

return BedMakerOutput(
bed_file=output_bed,
bigbed_file=output_bigbed,
bigbed_file=os.path.abspath(output_bigbed) if output_bigbed else None,
bed_digest=RegionSet(output_bed).identifier,
bed_type=bed_type,
bed_format=bed_format,
Expand Down
3 changes: 2 additions & 1 deletion bedboss/bedmaker/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@

from bedboss.const import (
REFGENIE_ENV_VAR,
DEFAULT_REFGENIE_PATH,
)

_LOGGER = logging.getLogger("bedboss")
Expand Down Expand Up @@ -66,7 +67,7 @@ def get_rgc(rfg_config: Union[str, Path] = None) -> RGC:
"""
if not rfg_config:
_LOGGER.info("Creating refgenie genome config file...")
cwd = os.getenv(REFGENIE_ENV_VAR, os.getcwd())
cwd = os.getenv(REFGENIE_ENV_VAR, DEFAULT_REFGENIE_PATH)
rfg_config = os.path.join(cwd, "genome_config.yaml")

# get path to the genome config; from arg or env var if arg not provided
Expand Down
8 changes: 6 additions & 2 deletions bedboss/bedstat/bedstat.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
OPEN_SIGNAL_URL,
)
from bedboss.utils import download_file
from bedboss.exceptions import OpenSignalMatrixException
from bedboss.exceptions import OpenSignalMatrixException, BedBossException


_LOGGER = logging.getLogger("bedboss")
Expand Down Expand Up @@ -158,7 +158,11 @@ def bedstat(
f"--ensdb={ensdb} --digest={bed_digest}"
)

pm.run(cmd=command, target=json_file_path)
try:
pm.run(cmd=command, target=json_file_path)
except Exception as e:
_LOGGER.error(f"Pipeline failed: {e}")
raise BedBossException(f"Pipeline failed: {e}")

data = {}
if os.path.exists(json_file_path):
Expand Down
Loading

0 comments on commit 813a61a

Please sign in to comment.