diff --git a/.github/workflows/run-pytest.yml b/.github/workflows/run-pytest.yml index 8cc4048..58da7fe 100644 --- a/.github/workflows/run-pytest.yml +++ b/.github/workflows/run-pytest.yml @@ -12,7 +12,7 @@ jobs: runs-on: ${{ matrix.os }} strategy: matrix: - python-version: ["3.8", "3.11"] + python-version: ["3.9", "3.11"] os: [ubuntu-latest] steps: diff --git a/.gitignore b/.gitignore index 3a554a2..65e8d4b 100644 --- a/.gitignore +++ b/.gitignore @@ -139,4 +139,15 @@ openSignalMatrix out2023/* # test data -test/test_data/* \ No newline at end of file +test/test_data/* +/scripts/bedclassifier_tuning/results/ +/scripts/bedclassifier_tuning/data/ +genome_config.yaml +alias/hg19/fasta/default/hg19.chrom.sizes +alias/hg19/fasta/default/hg19.fa +alias/hg19/fasta/default/hg19.fa.fai +data/baa91c8f6e2780cfd8fd1040ff37f51c379947a2a4820d6c/baa91c8f6e2780cfd8fd1040ff37f51c379947a2a4820d6c__ASDs.json +data/baa91c8f6e2780cfd8fd1040ff37f51c379947a2a4820d6c/fasta/default/baa91c8f6e2780cfd8fd1040ff37f51c379947a2a4820d6c.chrom.sizes +data/baa91c8f6e2780cfd8fd1040ff37f51c379947a2a4820d6c/fasta/default/baa91c8f6e2780cfd8fd1040ff37f51c379947a2a4820d6c.fa +data/baa91c8f6e2780cfd8fd1040ff37f51c379947a2a4820d6c/fasta/default/baa91c8f6e2780cfd8fd1040ff37f51c379947a2a4820d6c.fa.fai +test/Untitled.ipynb diff --git a/MANIFEST.in b/MANIFEST.in index f709b94..f8d5555 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -8,4 +8,5 @@ include bedboss/bedqc/* include bedboss/qdrant_index/* include bedboss/bedbuncher/* include bedboss/bedbuncher/tools/* -include bedboss/bedclassifier/* \ No newline at end of file +include bedboss/bedclassifier/* +include bedboss/tokens/* \ No newline at end of file diff --git a/bedboss/_version.py b/bedboss/_version.py index 3ced358..493f741 100644 --- a/bedboss/_version.py +++ b/bedboss/_version.py @@ -1 +1 @@ -__version__ = "0.2.1" +__version__ = "0.3.0" diff --git a/bedboss/bedboss.py b/bedboss/bedboss.py index 193ed2f..298501e 100644 --- a/bedboss/bedboss.py +++ 
b/bedboss/bedboss.py @@ -12,6 +12,7 @@ from pephubclient.helpers import is_registry_path, MessageHandler as m from bbconf.bbagent import BedBaseAgent from bbconf.models.base_models import FileModel +from bbconf.const import DEFAULT_LICENSE from bedboss.bedstat.bedstat import bedstat from bedboss.bedmaker.bedmaker import make_all @@ -55,6 +56,7 @@ def run_all( genome: str, bedbase_config: Union[str, bbconf.BedBaseAgent], name: str = None, + license_id: str = DEFAULT_LICENSE, rfg_config: str = None, narrowpeak: bool = False, check_qc: bool = True, @@ -67,6 +69,10 @@ def run_all( upload_qdrant: bool = False, upload_s3: bool = False, upload_pephub: bool = False, + # Universes + universe: bool = False, + universe_method: str = None, + universe_bedset: str = None, pm: pypiper.PipelineManager = None, ) -> str: """ @@ -78,6 +84,7 @@ def run_all( :param str genome: genome_assembly of the sample. [required] options: (hg19, hg38, mm10) # TODO: add more :param str name: name of the sample (human-readable name, e.g. "H3K27ac in liver") [optional] :param Union[str, bbconf.BedBaseConf] bedbase_config: The path to the bedbase configuration file, or bbconf object. + :param str license_id: license identifier [optional] (default: "DUO:0000042").; Find All licenses in bedbase.org :param str rfg_config: file path to the genome config file [optional] :param bool narrowpeak: whether the regions are narrow. 
Used to create bed file from bedgraph or bigwig (transcription factor implies narrow, histone mark implies broad peaks) [optional] @@ -92,6 +99,10 @@ def run_all( :param bool upload_qdrant: whether to skip qdrant indexing :param bool upload_s3: whether to upload to s3 :param bool upload_pephub: whether to push bedfiles and metadata to pephub (default: False) + + :param bool universe: whether to add the sample as the universe [Default: False] + :param str universe_method: method used to create the universe [Default: None] + :param str universe_bedset: bedset identifier for the universe [Default: None] :param pypiper.PipelineManager pm: pypiper object :return str bed_digest: bed digest """ @@ -189,6 +200,7 @@ def run_all( plots=plots.model_dump(exclude_unset=True), files=files.model_dump(exclude_unset=True), classification=classification.model_dump(exclude_unset=True), + license_id=license_id, upload_qdrant=upload_qdrant, upload_pephub=upload_pephub, upload_s3=upload_s3, @@ -197,6 +209,13 @@ def run_all( nofail=True, ) + if universe: + bbagent.bed.add_universe( + bedfile_id=bed_metadata.bed_digest, + bedset_id=universe_bedset, + construct_method=universe_method, + ) + if stop_pipeline: pm.stop_pipeline() @@ -211,7 +230,9 @@ def insert_pep( bedset_id: str = None, bedset_name: str = None, rfg_config: str = None, - create_bedset: bool = True, + license_id: str = DEFAULT_LICENSE, + create_bedset: bool = False, + bedset_heavy: bool = False, check_qc: bool = True, ensdb: str = None, just_db_commit: bool = False, @@ -232,7 +253,10 @@ def insert_pep( :param str bedset_id: bedset identifier :param str bedset_name: bedset name :param str rfg_config: path to the genome config file (refgenie) + :param str license_id: license identifier [optional] (default: "DUO:0000042").; Find All licenses in bedbase.org + This license will be used for bedfiles where license is not provided in PEP file :param bool create_bedset: whether to create bedset + :param bool bedset_heavy: whether to 
use heavy processing (add all columns to the database) :param bool upload_qdrant: whether to upload bedfiles to qdrant :param bool check_qc: whether to run quality control during badmaking :param str ensdb: a full path to the ensdb gtf file required for genomes not in GDdata @@ -279,6 +303,7 @@ def insert_pep( genome=pep_sample.genome, name=pep_sample.sample_name, bedbase_config=bbagent, + license_id=pep_sample.get("license_id") or license_id, narrowpeak=is_narrow_peak, chrom_sizes=pep_sample.get("chrom_sizes"), open_signal_matrix=pep_sample.get("open_signal_matrix"), @@ -292,8 +317,12 @@ def insert_pep( upload_qdrant=upload_qdrant, upload_s3=upload_s3, upload_pephub=upload_pephub, + universe=pep_sample.get("universe"), + universe_method=pep_sample.get("universe_method"), + universe_bedset=pep_sample.get("universe_bedset"), pm=pm, ) + processed_ids.append(bed_id) except BedBossException as e: _LOGGER.error(f"Failed to process {pep_sample.sample_name}. See {e}") @@ -308,7 +337,7 @@ def insert_pep( name=bedset_name or pep.name, output_folder=output_folder, description=pep.description, - heavy=True, + heavy=bedset_heavy, upload_pephub=upload_pephub, upload_s3=upload_s3, no_fail=no_fail, diff --git a/bedboss/bedbuncher/bedbuncher.py b/bedboss/bedbuncher/bedbuncher.py index bcd5fcc..fa96e48 100644 --- a/bedboss/bedbuncher/bedbuncher.py +++ b/bedboss/bedbuncher/bedbuncher.py @@ -158,7 +158,7 @@ def run_bedbuncher( description=description, upload_pephub=upload_pephub, upload_s3=upload_s3, - plots=plots.model_dump(exclude_none=True, exclude_unset=True), + plots=plots.model_dump(exclude_none=True, exclude_unset=True) if plots else {}, local_path=output_folder, no_fail=no_fail, overwrite=force_overwrite, diff --git a/bedboss/bedclassifier/bedclassifier.py b/bedboss/bedclassifier/bedclassifier.py index 420cea0..d32c737 100644 --- a/bedboss/bedclassifier/bedclassifier.py +++ b/bedboss/bedclassifier/bedclassifier.py @@ -38,12 +38,39 @@ def get_bed_type(bed: str, no_fail: 
Optional[bool] = True) -> Tuple[str, str]: max_rows = 5 row_count = 0 + while row_count <= max_rows: try: df = pd.read_csv(bed, sep="\t", header=None, nrows=4, skiprows=row_count) if row_count > 0: _LOGGER.info(f"Skipped {row_count} rows to parse bed file {bed}") break + except UnicodeDecodeError as e: + try: + df = pd.read_csv( + bed, + sep="\t", + header=None, + nrows=4, + skiprows=row_count, + encoding="utf-16", + ) + if row_count > 0: + _LOGGER.info(f"Skipped {row_count} rows to parse bed file {bed}") + break + except (pandas.errors.ParserError, pandas.errors.EmptyDataError) as e: + if row_count <= max_rows: + row_count += 1 + else: + if no_fail: + _LOGGER.warning( + f"Unable to parse bed file {bed}, due to error {e}, setting bed_type = unknown_bedtype" + ) + return "unknown_bedtype", "unknown_bedtype" + else: + raise BedTypeException( + reason=f"Bed type could not be determined due to CSV parse error {e}" + ) except (pandas.errors.ParserError, pandas.errors.EmptyDataError) as e: if row_count <= max_rows: row_count += 1 diff --git a/bedboss/bedmaker/bedmaker.py b/bedboss/bedmaker/bedmaker.py index 5f33d05..ecd982b 100755 --- a/bedboss/bedmaker/bedmaker.py +++ b/bedboss/bedmaker/bedmaker.py @@ -169,7 +169,7 @@ def make_bed( if input_type not in [member.value for member in InputTypes]: raise BedBossException( f"Invalid input type: {input_type}. 
" - f"Supported types: {', '.join(InputTypes.__members__.values())}" + f"Supported types: {', '.join([k.value for k in InputTypes])}" ) if not pm: @@ -405,7 +405,7 @@ def make_all( return BedMakerOutput( bed_file=output_bed, - bigbed_file=output_bigbed, + bigbed_file=os.path.abspath(output_bigbed) if output_bigbed else None, bed_digest=RegionSet(output_bed).identifier, bed_type=bed_type, bed_format=bed_format, diff --git a/bedboss/bedmaker/utils.py b/bedboss/bedmaker/utils.py index 971451a..1701ae7 100644 --- a/bedboss/bedmaker/utils.py +++ b/bedboss/bedmaker/utils.py @@ -15,6 +15,7 @@ from bedboss.const import ( REFGENIE_ENV_VAR, + DEFAULT_REFGENIE_PATH, ) _LOGGER = logging.getLogger("bedboss") @@ -66,7 +67,7 @@ def get_rgc(rfg_config: Union[str, Path] = None) -> RGC: """ if not rfg_config: _LOGGER.info("Creating refgenie genome config file...") - cwd = os.getenv(REFGENIE_ENV_VAR, os.getcwd()) + cwd = os.getenv(REFGENIE_ENV_VAR, DEFAULT_REFGENIE_PATH) rfg_config = os.path.join(cwd, "genome_config.yaml") # get path to the genome config; from arg or env var if arg not provided diff --git a/bedboss/bedstat/bedstat.py b/bedboss/bedstat/bedstat.py index 67da509..9d00174 100755 --- a/bedboss/bedstat/bedstat.py +++ b/bedboss/bedstat/bedstat.py @@ -17,7 +17,7 @@ OPEN_SIGNAL_URL, ) from bedboss.utils import download_file -from bedboss.exceptions import OpenSignalMatrixException +from bedboss.exceptions import OpenSignalMatrixException, BedBossException _LOGGER = logging.getLogger("bedboss") @@ -158,7 +158,11 @@ def bedstat( f"--ensdb={ensdb} --digest={bed_digest}" ) - pm.run(cmd=command, target=json_file_path) + try: + pm.run(cmd=command, target=json_file_path) + except Exception as e: + _LOGGER.error(f"Pipeline failed: {e}") + raise BedBossException(f"Pipeline failed: {e}") data = {} if os.path.exists(json_file_path): diff --git a/bedboss/cli.py b/bedboss/cli.py index 06b0d94..8a4951d 100644 --- a/bedboss/cli.py +++ b/bedboss/cli.py @@ -1,23 +1,26 @@ import typer from 
typing import Union import os -import pypiper -from bedboss.bedqc.bedqc import bedqc +from bedboss import __version__ from bedboss.const import MAX_FILE_SIZE, MAX_REGION_NUMBER, MIN_REGION_WIDTH -from bedboss import __version__ +# commented and made new const here, because it speeds up help function, +# from bbconf.const import DEFAULT_LICENSE +DEFAULT_LICENSE = "DUO:0000042" app = typer.Typer(pretty_exceptions_short=False, pretty_exceptions_show_locals=False) def create_pm( outfolder: str, multi: bool = False, recover: bool = True, dirty: bool = False -) -> pypiper.PipelineManager: +): + import pypiper + pm_out_folder = outfolder pm_out_folder = os.path.join(pm_out_folder, "pipeline_manager") - pm = pypiper.PipelineManager( + pm: pypiper.PipelineManager = pypiper.PipelineManager( name="bedboss-pipeline", outfolder=pm_out_folder, version=__version__, @@ -63,6 +66,11 @@ def run_all( file_okay=True, readable=True, ), + license_id: str = typer.Option( + DEFAULT_LICENSE, + help="License ID. 
If not provided for in PEP" + "for each bed file, this license will be used", + ), rfg_config: str = typer.Option(None, help="Path to the rfg config file"), narrowpeak: bool = typer.Option(False, help="Is the input file a narrowpeak file?"), check_qc: bool = typer.Option(True, help="Check the quality of the input file?"), @@ -78,6 +86,14 @@ def run_all( upload_qdrant: bool = typer.Option(False, help="Upload to Qdrant"), upload_s3: bool = typer.Option(False, help="Upload to S3"), upload_pephub: bool = typer.Option(False, help="Upload to PEPHub"), + # Universes + universe: bool = typer.Option(False, help="Create a universe"), + universe_method: str = typer.Option( + None, help="Method used to create the universe" + ), + universe_bedset: str = typer.Option( + None, help="Bedset used used to create the universe" + ), # PipelineManager multi: bool = typer.Option(False, help="Run multiple samples"), recover: bool = typer.Option(True, help="Recover from previous run"), @@ -88,13 +104,17 @@ def run_all( Run the bedboss pipeline for a single bed file """ from bedboss.bedboss import run_all as run_all_bedboss + from bbconf.bbagent import BedBaseAgent + + agent = BedBaseAgent(bedbase_config) run_all_bedboss( input_file=input_file, input_type=input_type, outfolder=outfolder, genome=genome, - bedbase_config=bedbase_config, + bedbase_config=agent, + license_id=license_id, rfg_config=rfg_config, narrowpeak=narrowpeak, check_qc=check_qc, @@ -107,6 +127,9 @@ def run_all( upload_qdrant=upload_qdrant, upload_s3=upload_s3, upload_pephub=upload_pephub, + universe=universe, + universe_method=universe_method, + universe_bedset=universe_bedset, pm=create_pm(outfolder=outfolder, multi=multi, recover=recover, dirty=dirty), ) @@ -122,7 +145,10 @@ def run_pep( file_okay=True, readable=True, ), - create_bedset: bool = typer.Option(True, help="Create a new bedset"), + create_bedset: bool = typer.Option(False, help="Create a new bedset"), + bedset_heavy: bool = typer.Option( + False, help="Run 
the heavy version of the bedbuncher pipeline" + ), bedset_id: Union[str, None] = typer.Option(None, help="Bedset ID"), rfg_config: str = typer.Option(None, help="Path to the rfg config file"), check_qc: bool = typer.Option(True, help="Check the quality of the input file?"), @@ -135,6 +161,7 @@ def run_pep( upload_s3: bool = typer.Option(False, help="Upload to S3"), upload_pephub: bool = typer.Option(False, help="Upload to PEPHub"), no_fail: bool = typer.Option(False, help="Do not fail on error"), + license_id: str = typer.Option(DEFAULT_LICENSE, help="License ID"), # PipelineManager multi: bool = typer.Option(False, help="Run multiple samples"), recover: bool = typer.Option(True, help="Recover from previous run"), @@ -152,10 +179,12 @@ def run_pep( bedset_id=bedset_id, rfg_config=rfg_config, create_bedset=create_bedset, + bedset_heavy=bedset_heavy, check_qc=check_qc, ensdb=ensdb, just_db_commit=just_db_commit, force_overwrite=force_overwrite, + license_id=license_id, upload_s3=upload_s3, upload_pephub=upload_pephub, upload_qdrant=upload_qdrant, @@ -276,6 +305,8 @@ def run_qc( recover: bool = typer.Option(True, help="Recover from previous run"), dirty: bool = typer.Option(False, help="Run without removing existing files"), ): + from bedboss.bedqc.bedqc import bedqc + bedqc( bedfile=bed_file, outfolder=outfolder, @@ -378,7 +409,6 @@ def make_bedset( def init_config( outfolder: str = typer.Option(..., help="Path to the output folder"), ): - from bedboss.utils import save_example_bedbase_config save_example_bedbase_config(outfolder) @@ -414,7 +444,108 @@ def delete_bedset( print(f"BedSet {identifier} deleted from the bedbase database") -@app.command(help="check installed R packages") +@app.command(help="Tokenize a bedfile") +def tokenize_bed( + bed_id: str = typer.Option( + ..., + help="Path to the bed file", + ), + universe_id: str = typer.Option( + ..., + help="Universe ID", + ), + cache_folder: str = typer.Option( + None, + help="Path to the cache folder", + ), + 
add_to_db: bool = typer.Option( + False, + help="Add the tokenized bed file to the bedbase database", + ), + bedbase_config: str = typer.Option( + None, + help="Path to the bedbase config file", + exists=True, + file_okay=True, + readable=True, + ), + overwrite: bool = typer.Option( + False, + help="Overwrite the existing tokenized bed file", + ), +): + from bedboss.tokens.tokens import tokenize_bed_file + + tokenize_bed_file( + universe=universe_id, + bed=bed_id, + cache_folder=cache_folder, + add_to_db=add_to_db, + config=bedbase_config, + overwrite=overwrite, + ) + + +@app.command(help="Delete tokenized bed file") +def delete_tokenized( + universe_id: str = typer.Option( + ..., + help="Universe ID", + ), + bed_id: str = typer.Option( + ..., + help="Bed ID", + ), + config: str = typer.Option( + None, + help="Path to the bedbase config file", + exists=True, + file_okay=True, + readable=True, + ), +): + from bedboss.tokens.tokens import delete_tokenized + + delete_tokenized( + universe=universe_id, + bed=bed_id, + config=config, + ) + + +@app.command(help="Convert bed file to universe") +def convert_universe( + bed_id: str = typer.Option( + ..., + help="Path to the bed file", + ), + config: str = typer.Option( + ..., + help="Path to the bedbase config file", + exists=True, + file_okay=True, + readable=True, + ), + method: str = typer.Option( + None, + help="Method used to create the universe", + ), + bedset: str = typer.Option( + None, + help="Bedset used to create the universe", + ), +): + from bbconf.bbagent import BedBaseAgent + + bbagent = BedBaseAgent(config) + bbagent.bed.add_universe( + bedfile_id=bed_id, + bedset_id=bedset, + construct_method=method, + ) + + +@app.command(help="Check installed R packages") def check_requirements(): from bedboss.bedboss import requirements_check diff --git a/bedboss/const.py b/bedboss/const.py index a109d62..12ce79b 100644 --- a/bedboss/const.py +++ b/bedboss/const.py @@ -34,4 +34,6 @@ BEDBOSS_PEP_SCHEMA_PATH = 
"https://schema.databio.org/pipelines/bedboss.yaml" REFGENIE_ENV_VAR = "REFGENIE" +DEFAULT_REFGENIE_PATH = os.path.join(HOME_PATH, ".refgenie") + BED_PEP_REGISTRY = "databio/allbeds:bedbase" diff --git a/bedboss/exceptions.py b/bedboss/exceptions.py index 2aea22b..c764403 100644 --- a/bedboss/exceptions.py +++ b/bedboss/exceptions.py @@ -1,4 +1,4 @@ -class BedBossException(BaseException): +class BedBossException(Exception): """Exception, when bedboss fails.""" def __init__(self, reason: str = ""): diff --git a/bedboss/tokens/__init__.py b/bedboss/tokens/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/bedboss/tokens/tokens.py b/bedboss/tokens/tokens.py new file mode 100644 index 0000000..45c7b9f --- /dev/null +++ b/bedboss/tokens/tokens.py @@ -0,0 +1,78 @@ +# functions for tokenization of bed files +import logging +from typing import Union +import os +from bbconf.bbagent import BedBaseAgent +from geniml.bbclient import BBClient +from geniml.bbclient.const import DEFAULT_CACHE_FOLDER + +from genimtools.tokenizers import TreeTokenizer + +from bedboss.exceptions import BedBossException + +_LOGGER = logging.getLogger("bedboss") + + +def tokenize_bed_file( + universe: str, + bed: str, + cache_folder: Union[str, os.PathLike] = DEFAULT_CACHE_FOLDER, + add_to_db: bool = False, + config: str = None, + overwrite: bool = False, +) -> None: + """ + Tokenize all bed file and add to the local cache + + :param universe: universe name to which the bed file will be tokenized + :param bed: bed file to be tokenized + :param cache_folder: path to the cache folder + :param add_to_db: flag to add tokenized bed file to the bedbase database [config should be provided if True] + :param config: path to the bedbase config file + :param overwrite: flag to overwrite the existing tokenized bed file + + :return: None + """ + bbc = BBClient(cache_folder=cache_folder or DEFAULT_CACHE_FOLDER) + + tokenizer = TreeTokenizer(bbc.seek(universe)) + rs = bbc.load_bed(bed) + + tokens = 
tokenizer(rs).ids + + # b = tokens.to_regions() # [Region(chr1, 100, 200), ... ] + # f = tokens.to_bit_vector() # + + bbc.cache_tokens(universe, bed, tokens) + _LOGGER.info(f"Tokenized bed file '{bed}' added to the cache") + + if add_to_db: + if not config: # no config means no database to write to, so abort here + raise BedBossException( + "Config file is required to add tokenized bed file to the database" + ) + + bbagent = BedBaseAgent(config=config) + bbagent.bed.add_tokenized( + bed_id=bed, universe_id=universe, token_vector=tokens, overwrite=overwrite + ) + _LOGGER.info(f"Tokenized bed file '{bed}' added to the database") + + +def delete_tokenized( + universe: str, + bed: str, + config: str = None, +) -> None: + """ + Delete tokenized bed file from the database + + :param universe: identifier of the universe of the tokenized bed file (passed as universe_id) + :param bed: identifier of the bed file whose tokenized form is deleted (passed as bed_id) + :param config: path to the bedbase config file + + :return: None + """ + bba = BedBaseAgent(config=config) + + bba.bed.delete_tokenized(bed_id=bed, universe_id=universe) diff --git a/docs/changelog.md b/docs/changelog.md index cdd875e..38785f3 100644 --- a/docs/changelog.md +++ b/docs/changelog.md @@ -2,6 +2,16 @@ This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html) and [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) format. 
+# [0.3.0] - 2024-08-21 +## Added +- Added classifier +- Added create universe uploader +- Added tokenization and token uploader + +## Changed +- Made the CLI help faster to display + + # [0.2.1] - 2024-04-09 ## Changed - small naming tweaks diff --git a/docs/templates/usage.template b/docs/templates/usage.template index 5b0c7fd..90dd4d2 100644 --- a/docs/templates/usage.template +++ b/docs/templates/usage.template @@ -1,24 +1,9 @@ # Usage reference -BEDboss is command-line tool-warehouse of 3 pipelines for genomic interval files - -This pipeline can be run using next positional arguments: - -- `bedboss all`: Runs all pipelines one in order: bedmaker -> bedqc -> bedstat - -- `bedboss insert`: Runs all pipelines one in order by using PEP file and creates bedset: bedmaker -> bedqc -> bedstat -> bedbuncher - -- `bedboss make`: Creates Bed and BigBed files from other type of genomic interval files [bigwig|bedgraph|bed|bigbed|wig] - -- `bedboss qc`: Runs Quality control for bed file (Works only with bed files) - -- `bedboss stat`: Runs statistics for bed and bigbed files. - -- `bedboss bunch`: Creates bedset from PEP file - -- `bedboss index`: Creates bed file vectors and inserts to qdrant database - -- `bedboss requirements-check`: Check if all requirements are installed +BEDboss is a command-line tool manager and a set of tools for working with BED files and BEDbase. The main components of BEDboss are: +1) Pipeline for processing BED files: bedmaker, bedqc, and bedstat. 
+2) Indexing of the Bed files in bedbase +3) Managing bed and bedsets in the database Here you can see the command-line usage instructions for the main bedboss command and for each subcommand: diff --git a/docs/usage.md b/docs/usage.md index f1eeee6..216496b 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -1,307 +1,281 @@ # Usage reference -BEDboss is command-line tool-warehouse of 3 pipelines for genomic interval files +BEDboss is command-line tool-manager and a set of tools for working with BED files and BEDbase. Main components of BEDboss are: +1) Pipeline for processing BED files: bedmaker, bedqc, and bedstats. +2) Indexing of the Bed files in bedbase +3) Managing bed and bedsets in the database -This pipeline can be run using next positional arguments: - -- `bedboss all`: Runs all pipelines one in order: bedmaker -> bedqc -> bedstat +Here you can see the command-line usage instructions for the main bedboss command and for each subcommand: -- `bedboss insert`: Runs all pipelines one in order by using PEP file and creates bedset: bedmaker -> bedqc -> bedstat -> bedbuncher +## `bedboss --help` +```console + + Usage: bedboss [OPTIONS] COMMAND [ARGS]... + +╭─ Options ────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮ +│ --version -v App version │ +│ --install-completion [bash|zsh|fish|powershell|pwsh] Install completion for the specified shell. [default: None] │ +│ --show-completion [bash|zsh|fish|powershell|pwsh] Show completion for the specified shell, to copy it or customize the installation. [default: None] │ +│ --help Show this message and exit. 
│ +╰──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯ +╭─ Commands ───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮ +│ check-requirements check installed R packages │ +│ delete-bed Delete bed from the bedbase database │ +│ delete-bedset Delete BedSet from the bedbase database │ +│ init-config Initialize the new, sample configuration file │ +│ make-bed Create a bed files form a [bigwig, bedgraph, bed, bigbed, wig] file │ +│ make-bedset Create a bedset from a pep file, and insert it to the bedbase database. │ +│ make-bigbed Create a bigbed files form a bed file │ +│ reindex Reindex the bedbase database and insert all files to the qdrant database. │ +│ run-all Run all the bedboss pipeline for a single bed file │ +│ run-pep Run the all bedboss pipeline for a bed files in a PEP │ +│ run-qc Run the quality control for a bed file │ +│ run-stats Create the statistics for a single bed file. 
│ +╰──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯ -- `bedboss make`: Creates Bed and BigBed files from other type of genomic interval files [bigwig|bedgraph|bed|bigbed|wig] +``` -- `bedboss qc`: Runs Quality control for bed file (Works only with bed files) +## `bedboss check-requirements --help` +```console + + Usage: bedboss check-requirements [OPTIONS] + + check installed R packages + +╭─ Options ────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮ +│ --help Show this message and exit. │ +╰──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯ -- `bedboss stat`: Runs statistics for bed and bigbed files. +``` -- `bedboss bunch`: Creates bedset from PEP file +## `bedboss delete-bed --help` +```console + + Usage: bedboss delete-bed [OPTIONS] + + Delete bed from the bedbase database + +╭─ Options ────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮ +│ * --sample-id TEXT Sample ID [default: None] [required] │ +│ * --config TEXT Path to the bedbase config file [default: None] [required] │ +│ --help Show this message and exit. 
│ +╰──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯ -- `bedboss index`: Creates bed file vectors and inserts to qdrant database +``` -- `bedboss requirements-check`: Check if all requirements are installed +## `bedboss delete-bedset --help` +```console + + Usage: bedboss delete-bedset [OPTIONS] + + Delete BedSet from the bedbase database + +╭─ Options ────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮ +│ * --identifier TEXT BedSet ID [default: None] [required] │ +│ * --config TEXT Path to the bedbase config file [default: None] [required] │ +│ --help Show this message and exit. │ +╰──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯ -Here you can see the command-line usage instructions for the main bedboss command and for each subcommand: +``` -## `bedboss --help` +## `bedboss init-config --help` ```console -HNSWBackend requires hnswlib. Install hnswlib, or ignore this if you don't need HNSWBackend -version: 0.1.0 -usage: bedboss [-h] [--version] [--silent] [--verbosity V] [--logdev] - {all,insert,make,qc,stat,bunch,index,requirements-check} ... 
+ + Usage: bedboss init-config [OPTIONS] + + Initialize the new, sample configuration file + +╭─ Options ────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮ +│ * --outfolder TEXT Path to the output folder [default: None] [required] │ +│ --help Show this message and exit. │ +╰──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯ -Warehouse of pipelines for BED-like files: bedmaker, bedstat, and bedqc. +``` -positional arguments: - {all,insert,make,qc,stat,bunch,index,requirements-check} - all Run all bedboss pipelines and insert data into bedbase - insert Run all bedboss pipelines using one PEP and insert - data into bedbase - make A pipeline to convert bed, bigbed, bigwig or bedgraph - files into bed and bigbed formats - qc Run quality control on bed file (bedqc) - stat A pipeline to read a file in BED format and produce - metadata in JSON format. - bunch A pipeline to create bedsets (sets of BED files) that - will be retrieved from bedbase. - index Index not indexed bed files and add them to the qdrant - database - requirements-check Check if all requirements are installed +## `bedboss make-bed --help` +```console + + Usage: bedboss make-bed [OPTIONS] + + Create a bed files form a [bigwig, bedgraph, bed, bigbed, wig] file + +╭─ Options ────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮ +│ * --input-file TEXT Path to the input file [default: None] [required] │ +│ * --input-type TEXT Type of the input file. 
Options are: bigwig, bedgraph, bed, bigbed, wig [default: None] [required] │ +│ * --outfolder TEXT Path to the output folder [default: None] [required] │ +│ * --genome TEXT Genome name. Example: 'hg38' [default: None] [required] │ +│ --rfg-config TEXT Path to the rfg config file [default: None] │ +│ --narrowpeak --no-narrowpeak Is the input file a narrowpeak file? [default: no-narrowpeak] │ +│ --chrom-sizes TEXT Path to the chrom sizes file [default: None] │ +│ --multi --no-multi Run multiple samples [default: no-multi] │ +│ --recover --no-recover Recover from previous run [default: recover] │ +│ --dirty --no-dirty Run without removing existing files [default: no-dirty] │ +│ --help Show this message and exit. │ +╰──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯ -options: - -h, --help show this help message and exit - --version show program's version number and exit - --silent Silence logging. Overrides verbosity. - --verbosity V Set logging level (1-5 or logging module level name) - --logdev Expand content of logging message format. ``` -## `bedboss all --help` +## `bedboss make-bedset --help` ```console -HNSWBackend requires hnswlib. 
Install hnswlib, or ignore this if you don't need HNSWBackend -usage: bedboss all [-h] --outfolder OUTFOLDER -s SAMPLE_NAME -f INPUT_FILE -t - INPUT_TYPE -g GENOME [-r RFG_CONFIG] - [--chrom-sizes CHROM_SIZES] [-n] [--standardize] - [--check-qc] [--open-signal-matrix OPEN_SIGNAL_MATRIX] - [--ensdb ENSDB] --bedbase-config BEDBASE_CONFIG - [--treatment TREATMENT] [--cell-type CELL_TYPE] - [--description DESCRIPTION] [--no-db-commit] - [--just-db-commit] [--upload_qdrant] [--upload-pephub] [-R] - [-N] [-D] [-F] [-T] [--silent] [--verbosity V] [--logdev] + + Usage: bedboss make-bedset [OPTIONS] + + Create a bedset from a pep file, and insert it to the bedbase database. + +╭─ Options ────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮ +│ * --pep TEXT PEP file. Local or remote path [default: None] [required] │ +│ * --outfolder TEXT Path to the output folder [default: None] [required] │ +│ * --bedbase-config TEXT Path to the bedbase config file [default: None] [required] │ +│ * --bedset-name TEXT Name of the bedset [default: None] [required] │ +│ --heavy --no-heavy Run the heavy version of the pipeline [default: no-heavy] │ +│ --force-overwrite --no-force-overwrite Force overwrite the output files [default: no-force-overwrite] │ +│ --upload-s3 --no-upload-s3 Upload to S3 [default: no-upload-s3] │ +│ --upload-pephub --no-upload-pephub Upload to PEPHub [default: no-upload-pephub] │ +│ --no-fail --no-no-fail Do not fail on error [default: no-no-fail] │ +│ --help Show this message and exit. 
│ +╰──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯ -options: - -h, --help show this help message and exit - --outfolder OUTFOLDER - Pipeline output folder [Required] - -s SAMPLE_NAME, --sample-name SAMPLE_NAME - name of the sample used to systematically build the - output name [Required] - -f INPUT_FILE, --input-file INPUT_FILE - Input file [Required] - -t INPUT_TYPE, --input-type INPUT_TYPE - Input type [Required] options: - (bigwig|bedgraph|bed|bigbed|wig) - -g GENOME, --genome GENOME - reference genome (assembly) [Required] - -r RFG_CONFIG, --rfg-config RFG_CONFIG - file path to the genome config file(refgenie) - --chrom-sizes CHROM_SIZES - a full path to the chrom.sizes required for the - bedtobigbed conversion - -n, --narrowpeak whether it's a narrowpeak file - --standardize Standardize bed files: remove non-standard chromosomes - and headers if necessary Default: False - --check-qc Check quality control before processing data. 
Default: - True - --open-signal-matrix OPEN_SIGNAL_MATRIX - a full path to the openSignalMatrix required for the - tissue specificity plots - --ensdb ENSDB A full path to the ensdb gtf file required for genomes - not in GDdata - --bedbase-config BEDBASE_CONFIG - a path to the bedbase configuration file [Required] - --treatment TREATMENT - A treatment of the bed file - --cell-type CELL_TYPE - A cell type of the bed file - --description DESCRIPTION - A description of the bed file - --no-db-commit skip the JSON commit to the database [Default: False] - --just-db-commit Do not save the results locally - --upload_qdrant whether to execute qdrant indexing - --upload-pephub upload to pephub - -R, --recover Overwrite locks to recover from previous failed run - -N, --new-start Overwrite all results to start a fresh run - -D, --dirty Don't auto-delete intermediate files - -F, --force-follow Always run 'follow' commands - -T, --testmode Only print commands, don't run - --silent Silence logging. Overrides verbosity. - --verbosity V Set logging level (1-5 or logging module level name) - --logdev Expand content of logging message format. ``` -## `bedboss insert --help` +## `bedboss make-bigbed --help` ```console -HNSWBackend requires hnswlib. 
Install hnswlib, or ignore this if you don't need HNSWBackend -usage: bedboss insert [-h] --bedbase-config BEDBASE_CONFIG --pep PEP - --output-folder OUTPUT_FOLDER [-r RFG_CONFIG] - [--check-qc] [--standardize] [--create-bedset] - [--upload_qdrant] [--ensdb ENSDB] [--no-db-commit] - [--just-db-commit] [--force_overwrite] [--upload-s3] - [--upload-pephub] [-R] [-N] [-D] [-F] [-T] [--silent] - [--verbosity V] [--logdev] + + Usage: bedboss make-bigbed [OPTIONS] + + Create a bigbed files form a bed file + +╭─ Options ────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮ +│ * --bed-file TEXT Path to the input file [default: None] [required] │ +│ * --bed-type TEXT bed type to be used for bigBed file generation 'bed{bedtype}+{n}' [Default: None] (e.g bed3+1) [default: None] [required] │ +│ * --outfolder TEXT Path to the output folder [default: None] [required] │ +│ * --genome TEXT Genome name. Example: 'hg38' [default: None] [required] │ +│ --rfg-config TEXT Path to the rfg config file [default: None] │ +│ --chrom-sizes TEXT Path to the chrom sizes file [default: None] │ +│ --multi --no-multi Run multiple samples [default: no-multi] │ +│ --recover --no-recover Recover from previous run [default: recover] │ +│ --dirty --no-dirty Run without removing existing files [default: no-dirty] │ +│ --help Show this message and exit. 
│ +╰──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯ -options: - -h, --help show this help message and exit - --bedbase-config BEDBASE_CONFIG - a path to the bedbase configuration file [Required] - --pep PEP path to the pep file or pephub registry path - containing pep [Required] - --output-folder OUTPUT_FOLDER - Pipeline output folder [Required] - -r RFG_CONFIG, --rfg-config RFG_CONFIG - file path to the genome config file(refgenie) - --check-qc Check quality control before processing data. Default: - True - --standardize Standardize bed files: remove non-standard chromosomes - and headers if necessary Default: False - --create-bedset Create bedset using pep samples. Name of the bedset - will be based on pep name.Default: False - --upload_qdrant whether to execute qdrant indexing - --ensdb ENSDB A full path to the ensdb gtf file required for genomes - not in GDdata - --no-db-commit skip the JSON commit to the database [Default: False] - --just-db-commit just commit the JSON to the database - --force_overwrite Weather to overwrite existing records. [Default: - False] - --upload-s3 Weather to upload bed, bigbed, and statistics to s3. - Before uploading you have to set up all necessury env - vars: AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY, and - AWS_ENDPOINT_URL. [Default: False] - --upload-pephub upload to pephub - -R, --recover Overwrite locks to recover from previous failed run - -N, --new-start Overwrite all results to start a fresh run - -D, --dirty Don't auto-delete intermediate files - -F, --force-follow Always run 'follow' commands - -T, --testmode Only print commands, don't run - --silent Silence logging. Overrides verbosity. - --verbosity V Set logging level (1-5 or logging module level name) - --logdev Expand content of logging message format. 
``` -## `bedboss make --help` +## `bedboss reindex --help` ```console -HNSWBackend requires hnswlib. Install hnswlib, or ignore this if you don't need HNSWBackend -usage: bedboss make [-h] -f INPUT_FILE --outfolder OUTFOLDER [-n] -t - INPUT_TYPE -g GENOME [-r RFG_CONFIG] -o OUTPUT_BED - --output-bigbed OUTPUT_BIGBED -s SAMPLE_NAME - [--chrom-sizes CHROM_SIZES] [--standardize] [-R] [-N] [-D] - [-F] [-T] [--silent] [--verbosity V] [--logdev] + + Usage: bedboss reindex [OPTIONS] + + Reindex the bedbase database and insert all files to the qdrant database. + +╭─ Options ────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮ +│ * --bedbase-config TEXT Path to the bedbase config file [default: None] [required] │ +│ --help Show this message and exit. │ +╰──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯ -options: - -h, --help show this help message and exit - -f INPUT_FILE, --input-file INPUT_FILE - path to the input file [Required] - --outfolder OUTFOLDER - Pipeline output folder [Required] - -n, --narrowpeak whether it's a narrowpeak file - -t INPUT_TYPE, --input-type INPUT_TYPE - input file format (supported formats: bedGraph, - bigBed, bigWig, wig) [Required] - -g GENOME, --genome GENOME - reference genome [Required] - -r RFG_CONFIG, --rfg-config RFG_CONFIG - file path to the genome config file - -o OUTPUT_BED, --output-bed OUTPUT_BED - path to the output BED files [Required] - --output-bigbed OUTPUT_BIGBED - path to the folder of output bigBed files [Required] - -s SAMPLE_NAME, --sample-name SAMPLE_NAME - name of the sample used to systematically build the - output name [Required] - --chrom-sizes CHROM_SIZES - A full path to 
the chrom.sizes required for the - bedtobigbed conversion [optional] - --standardize Standardize bed files: remove non-standard chromosomes - and headers if necessary Default: False - -R, --recover Overwrite locks to recover from previous failed run - -N, --new-start Overwrite all results to start a fresh run - -D, --dirty Don't auto-delete intermediate files - -F, --force-follow Always run 'follow' commands - -T, --testmode Only print commands, don't run - --silent Silence logging. Overrides verbosity. - --verbosity V Set logging level (1-5 or logging module level name) - --logdev Expand content of logging message format. ``` -## `bedboss qc --help` +## `bedboss run-all --help` ```console -HNSWBackend requires hnswlib. Install hnswlib, or ignore this if you don't need HNSWBackend -usage: bedboss qc [-h] --bedfile BEDFILE --outfolder OUTFOLDER [-R] [-N] [-D] - [-F] [-T] [--silent] [--verbosity V] [--logdev] + + Usage: bedboss run-all [OPTIONS] + + Run all the bedboss pipeline for a single bed file + +╭─ Options ────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮ +│ * --input-file TEXT Path to the input file [default: None] [required] │ +│ * --input-type TEXT Type of the input file. Options are: bigwig, bedgraph, bed, bigbed, wig [default: None] [required] │ +│ * --outfolder TEXT Path to the output folder [default: None] [required] │ +│ * --genome TEXT Genome name. Example: 'hg38' [default: None] [required] │ +│ * --bedbase-config TEXT Path to the bedbase config file [default: None] [required] │ +│ --rfg-config TEXT Path to the rfg config file [default: None] │ +│ --narrowpeak --no-narrowpeak Is the input file a narrowpeak file? [default: no-narrowpeak] │ +│ --check-qc --no-check-qc Check the quality of the input file? 
[default: check-qc] │ +│ --chrom-sizes TEXT Path to the chrom sizes file [default: None] │ +│ --open-signal-matrix TEXT Path to the open signal matrix file [default: None] │ +│ --ensdb TEXT Path to the EnsDb database file [default: None] │ +│ --just-db-commit --no-just-db-commit Just commit to the database? [default: no-just-db-commit] │ +│ --force-overwrite --no-force-overwrite Force overwrite the output files [default: no-force-overwrite] │ +│ --upload-qdrant --no-upload-qdrant Upload to Qdrant [default: no-upload-qdrant] │ +│ --upload-s3 --no-upload-s3 Upload to S3 [default: no-upload-s3] │ +│ --upload-pephub --no-upload-pephub Upload to PEPHub [default: no-upload-pephub] │ +│ --multi --no-multi Run multiple samples [default: no-multi] │ +│ --recover --no-recover Recover from previous run [default: recover] │ +│ --dirty --no-dirty Run without removing existing files [default: no-dirty] │ +│ --help Show this message and exit. │ +╰──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯ -options: - -h, --help show this help message and exit - --bedfile BEDFILE a full path to bed file to process [Required] - --outfolder OUTFOLDER - a full path to output log folder. [Required] - -R, --recover Overwrite locks to recover from previous failed run - -N, --new-start Overwrite all results to start a fresh run - -D, --dirty Don't auto-delete intermediate files - -F, --force-follow Always run 'follow' commands - -T, --testmode Only print commands, don't run - --silent Silence logging. Overrides verbosity. - --verbosity V Set logging level (1-5 or logging module level name) - --logdev Expand content of logging message format. ``` -## `bedboss stat --help` +## `bedboss run-pep --help` ```console -HNSWBackend requires hnswlib. 
Install hnswlib, or ignore this if you don't need HNSWBackend -usage: bedboss stat [-h] --bedfile BEDFILE --genome GENOME --outfolder - OUTFOLDER [--bigbed BIGBED] - [--open-signal-matrix OPEN_SIGNAL_MATRIX] [--ensdb ENSDB] - [-R] [-N] [-D] [-F] [-T] [--silent] [--verbosity V] - [--logdev] + + Usage: bedboss run-pep [OPTIONS] + + Run the all bedboss pipeline for a bed files in a PEP + +╭─ Options ────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮ +│ * --pep TEXT PEP file. Local or remote path [default: None] [required] │ +│ * --outfolder TEXT Path to the output folder [default: None] [required] │ +│ * --bedbase-config TEXT Path to the bedbase config file [default: None] [required] │ +│ --create-bedset --no-create-bedset Create a new bedset [default: no-create-bedset] │ +│ --bedset-heavy --no-bedset-heavy Run the heavy version of the bedbuncher pipeline [default: no-bedset-heavy] │ +│ --bedset-id TEXT Bedset ID [default: None] │ +│ --rfg-config TEXT Path to the rfg config file [default: None] │ +│ --check-qc --no-check-qc Check the quality of the input file? [default: check-qc] │ +│ --ensdb TEXT Path to the EnsDb database file [default: None] │ +│ --just-db-commit --no-just-db-commit Just commit to the database? 
[default: no-just-db-commit] │ +│ --force-overwrite --no-force-overwrite Force overwrite the output files [default: no-force-overwrite] │ +│ --upload-qdrant --no-upload-qdrant Upload to Qdrant [default: no-upload-qdrant] │ +│ --upload-s3 --no-upload-s3 Upload to S3 [default: no-upload-s3] │ +│ --upload-pephub --no-upload-pephub Upload to PEPHub [default: no-upload-pephub] │ +│ --no-fail --no-no-fail Do not fail on error [default: no-no-fail] │ +│ --multi --no-multi Run multiple samples [default: no-multi] │ +│ --recover --no-recover Recover from previous run [default: recover] │ +│ --dirty --no-dirty Run without removing existing files [default: no-dirty] │ +│ --help Show this message and exit. │ +╰──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯ -options: - -h, --help show this help message and exit - --bedfile BEDFILE a full path to bed file to process [Required] - --genome GENOME genome assembly of the sample [Required] - --outfolder OUTFOLDER - Pipeline output folder [Required] - --bigbed BIGBED a full path to the bigbed files - --open-signal-matrix OPEN_SIGNAL_MATRIX - a full path to the openSignalMatrix required for the - tissue specificity plots - --ensdb ENSDB a full path to the ensdb gtf file required for genomes - not in GDdata - -R, --recover Overwrite locks to recover from previous failed run - -N, --new-start Overwrite all results to start a fresh run - -D, --dirty Don't auto-delete intermediate files - -F, --force-follow Always run 'follow' commands - -T, --testmode Only print commands, don't run - --silent Silence logging. Overrides verbosity. - --verbosity V Set logging level (1-5 or logging module level name) - --logdev Expand content of logging message format. ``` -## `bedboss bunch --help` +## `bedboss run-qc --help` ```console -HNSWBackend requires hnswlib. 
Install hnswlib, or ignore this if you don't need HNSWBackend -usage: bedboss bunch [-h] --bedbase-config BEDBASE_CONFIG --bedset-name - BEDSET_NAME --bedset-pep BEDSET_PEP - [--base-api BEDBASE_API] [--cache-path CACHE_PATH] - [--heavy] + + Usage: bedboss run-qc [OPTIONS] + + Run the quality control for a bed file + +╭─ Options ────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮ +│ * --bed-file TEXT Path to the bed file to check the quality control on. [default: None] [required] │ +│ * --outfolder TEXT Path to the output folder [default: None] [required] │ +│ --max-file-size INTEGER Maximum file size threshold to pass the quality [default: 2147483648] │ +│ --max-region-number INTEGER Maximum number of regions threshold to pass the quality [default: 5000000] │ +│ --min-region-width INTEGER Minimum region width threshold to pass the quality [default: 10] │ +│ --multi --no-multi Run multiple samples [default: no-multi] │ +│ --recover --no-recover Recover from previous run [default: recover] │ +│ --dirty --no-dirty Run without removing existing files [default: no-dirty] │ +│ --help Show this message and exit. │ +╰──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯ -options: - -h, --help show this help message and exit - --bedbase-config BEDBASE_CONFIG - a path to the bedbase configuration file [Required] - --bedset-name BEDSET_NAME - a name of the bedset [Required] - --bedset-pep BEDSET_PEP - bedset pep path or pephub registry path containing - bedset pep [Required] - --base-api BEDBASE_API - Bedbase API to use. Default is https://api.bedbase.org - --cache-path CACHE_PATH - Path to the cache folder. 
Default is ./bedabse_cache - --heavy whether to use heavy processing (Calculate and crate - plots using R script). ``` -## `bedboss index --help` +## `bedboss run-stats --help` ```console -HNSWBackend requires hnswlib. Install hnswlib, or ignore this if you don't need HNSWBackend -usage: bedboss index [-h] --bedbase-config BEDBASE_CONFIG - [--bedbase-api BEDBASE_API] + + Usage: bedboss run-stats [OPTIONS] + + Create the statistics for a single bed file. + +╭─ Options ────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮ +│ * --bed-file TEXT Path to the bed file [default: None] [required] │ +│ * --genome TEXT Genome name. Example: 'hg38' [default: None] [required] │ +│ * --outfolder TEXT Path to the output folder [default: None] [required] │ +│ --ensdb TEXT Path to the EnsDb database file [default: None] │ +│ --open-signal-matrix TEXT Path to the open signal matrix file [default: None] │ +│ --just-db-commit --no-just-db-commit Just commit to the database? [default: no-just-db-commit] │ +│ --multi --no-multi Run multiple samples [default: no-multi] │ +│ --recover --no-recover Recover from previous run [default: recover] │ +│ --dirty --no-dirty Run without removing existing files [default: no-dirty] │ +│ --help Show this message and exit. 
│ +╰──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯ -options: - -h, --help show this help message and exit - --bedbase-config BEDBASE_CONFIG - a path to the bedbase configuration file [Required] - --bedbase-api BEDBASE_API - URL of the Bedbase API [Default: - https://api.bedbase.org] ``` diff --git a/requirements/requirements-all.txt b/requirements/requirements-all.txt index cb808c6..40842da 100644 --- a/requirements/requirements-all.txt +++ b/requirements/requirements-all.txt @@ -1,13 +1,13 @@ logmuse>=0.2.7 coloredlogs>=15.0.1 -peppy>=0.40.1 +peppy>=0.40.5 yacman>=0.8.4 requests>=2.28.2 piper>=v0.14.0 -bbconf>=0.5.1 +bbconf>=0.6.0 # bbconf @ git+https://github.com/databio/bbconf.git@dev#egg=bbconf refgenconf>=0.12.2 -pandas>=1.5.3 +pandas>=2.0.0 ubiquerg>=0.6.2 -pephubclient>=0.2.1 -geniml>=0.2.0 \ No newline at end of file +pephubclient>=0.4.4 +geniml>=0.4.0 \ No newline at end of file diff --git a/scripts/bedclassifier_tuning/README.md b/scripts/bedclassifier_tuning/README.md new file mode 100644 index 0000000..88c3fc6 --- /dev/null +++ b/scripts/bedclassifier_tuning/README.md @@ -0,0 +1,2 @@ +Just making a script to download and assess BED files and determine if they are correctly classified. + diff --git a/scripts/bedclassifier_tuning/bedclassifier_output_schema.yaml b/scripts/bedclassifier_tuning/bedclassifier_output_schema.yaml new file mode 100644 index 0000000..1f793a0 --- /dev/null +++ b/scripts/bedclassifier_tuning/bedclassifier_output_schema.yaml @@ -0,0 +1,23 @@ +title: Bed Classifier +description: Output for bed classification results +type: object +properties: + pipeline_name: "bedclassifier" + samples: + type: object + properties: + bedfile_named: + type: string + description: "reported bedfile name e.g. 
narrowpeak" + bedfile_type: + type: string + description: "reported bedfile type" + given_bedfile_type: + type: string + description: "given bed file type" + types_match: + type: boolean + description: "Do the types match?" + gsm: + type: string + description: "given gsm" \ No newline at end of file diff --git a/scripts/bedclassifier_tuning/bedclassify.py b/scripts/bedclassifier_tuning/bedclassify.py new file mode 100644 index 0000000..a4110f3 --- /dev/null +++ b/scripts/bedclassifier_tuning/bedclassify.py @@ -0,0 +1,188 @@ +import gzip +import logging +import os +import shutil + +import pipestat +import pypiper +from typing import Optional + +from bedboss.bedclassifier import get_bed_type +from bedboss.exceptions import BedTypeException + +_LOGGER = logging.getLogger("bedboss") + +from geofetch import Finder, Geofetcher + + +class BedClassifier: + """ + This will take the input of either a .bed or a .bed.gz and classify the type of BED file. + + """ + + def __init__( + self, + input_file: str, + output_dir: Optional[str] = None, + bed_digest: Optional[str] = None, + input_type: Optional[str] = None, + pm: pypiper.PipelineManager = None, + report_to_database: Optional[bool] = False, + psm: pipestat.PipestatManager = None, + gsm: str = None, + ): + # Raise Exception if input_type is given and it is NOT a BED file + # Raise Exception if the input file cannot be resolved + + self.gsm = gsm + self.input_file = input_file + self.bed_digest = bed_digest + self.input_type = input_type + + self.abs_bed_path = os.path.abspath(self.input_file) + self.file_name = os.path.splitext(os.path.basename(self.abs_bed_path))[0] + self.file_extension = os.path.splitext(self.abs_bed_path)[-1] + + # we need this only if unzipping a file + self.output_dir = output_dir or os.path.join( + os.path.dirname(self.abs_bed_path), "temp_processing" + ) + # Use existing Pipeline Manager if it exists + self.pm = pm + + if psm is None: + pephuburl = "donaldcampbelljr/bedclassifier_tuning_geo:default" 
+ self.psm = pipestat.PipestatManager( + pephub_path=pephuburl, schema_path="bedclassifier_output_schema.yaml" + ) + else: + self.psm = psm + + if self.file_extension == ".gz": + unzipped_input_file = os.path.join(self.output_dir, self.file_name) + + with gzip.open(self.input_file, "rb") as f_in: + _LOGGER.info( + f"Unzipping file:{self.input_file} and Creating Unzipped file: {unzipped_input_file}" + ) + with open(unzipped_input_file, "wb") as f_out: + shutil.copyfileobj(f_in, f_out) + self.input_file = unzipped_input_file + if self.pm: + self.pm.clean_add(unzipped_input_file) + + try: + self.bed_type, self.bed_type_named = get_bed_type(self.input_file) + except BedTypeException as e: + _LOGGER.warning(msg=f"FAILED {bed_digest} Exception {e}") + self.bed_type = "unknown_bedtype" + self.bed_type_named = "unknown_bedtype" + + if self.input_type is not None: + if self.bed_type_named != self.input_type: + _LOGGER.warning( + f"BED file classified as different type than given input: {self.bed_type} vs {self.input_type}" + ) + do_types_match = False + else: + do_types_match = True + else: + do_types_match = False + + # Create Value Dict to report via pipestat + + all_values = {} + + if self.input_type: + all_values.update({"given_bedfile_type": self.input_type}) + if self.bed_type: + all_values.update({"bedfile_type": self.bed_type}) + if self.bed_type_named: + all_values.update({"bedfile_named": self.bed_type_named}) + if self.gsm: + all_values.update({"gsm": self.gsm}) + + all_values.update({"types_match": do_types_match}) + + try: + psm.report(record_identifier=bed_digest, values=all_values) + except Exception as e: + _LOGGER.warning(msg=f"FAILED {bed_digest} Exception {e}") + + if self.pm: + self.pm.stop_pipeline() + + +def main(): + # PEP for reporting all classification results + pephuburl = "donaldcampbelljr/bedclassifier_tuning_geo:default" + + # Place these external to pycharm folder!!! 
+ data_output_path = os.path.abspath("data") + results_path = os.path.abspath("results") + logs_dir = os.path.join(results_path, "logs") + + gse_obj = Finder() + + # # Optionally: provide filter string and max number of retrieve elements + # gse_obj = Finder(filters="narrowpeak", retmax=100) + # + # gse_list = gse_obj.get_gse_all() + # gse_obj.generate_file("data/output.txt", gse_list=gse_list) + + pm = pypiper.PipelineManager( + name="bedclassifier", + outfolder=logs_dir, + recover=True, + ) + + pm.start_pipeline() + + # for geo in gse_list: + geofetcher_obj = Geofetcher( + filter="\.(bed|narrowPeak|broadPeak)\.", + filter_size="25MB", + data_source="samples", + geo_folder=data_output_path, + metadata_folder=data_output_path, + processed=True, + max_soft_size="20MB", + discard_soft=True, + ) + + # geofetcher_obj.fetch_all(input="data/output.txt", name="donald_test") + geofetched = geofetcher_obj.get_projects( + input=os.path.join(data_output_path, "output.txt"), just_metadata=False + ) + + samples = geofetched["output_samples"].samples + + psm = pipestat.PipestatManager( + pephub_path=pephuburl, schema_path="bedclassifier_output_schema.yaml" + ) + + for sample in samples: + if isinstance(sample.output_file_path, list): + bedfile = sample.output_file_path[0] + else: + bedfile = sample.output_file_path + geo_accession = sample.sample_geo_accession + sample_name = sample.sample_name + bed_type_from_geo = sample.type.lower() + + bed = BedClassifier( + input_file=bedfile, + bed_digest=sample_name, # TODO FIX THIS IT HOULD BE AN ACTUAL DIGEST + output_dir=results_path, + input_type=bed_type_from_geo, + psm=psm, + pm=pm, + gsm=geo_accession, + ) + + pm.stop_pipeline() + + +if __name__ == "__main__": + main() diff --git a/scripts/update_usage_docs.sh b/scripts/update_usage_docs.sh index 5f432aa..c60bcae 100755 --- a/scripts/update_usage_docs.sh +++ b/scripts/update_usage_docs.sh @@ -2,7 +2,7 @@ cp ../docs/templates/usage.template usage.template # bedboss --help > 
USAGE.temp 2>&1 -for cmd in "--help" "all --help" "insert --help" "make --help" "qc --help" "stat --help" "bunch --help" "index --help" ; do +for cmd in "--help" "check-requirements --help" "delete-bed --help" "delete-bedset --help" "init-config --help" "make-bed --help" "make-bedset --help" "make-bigbed --help" "reindex --help" "run-all --help" "run-pep --help" "run-qc --help" "run-stats --help"; do echo $cmd echo -e "## \`bedboss $cmd\`" > USAGE_header.temp bedboss $cmd --help > USAGE.temp 2>&1 diff --git a/test/test_bedclassifier.py b/test/test_bedclassifier.py index d4c6f19..4b4dd0e 100644 --- a/test/test_bedclassifier.py +++ b/test/test_bedclassifier.py @@ -16,7 +16,6 @@ class TestBedClassifier: - def test_classification( self, ):