Merge pull request #63 from databio/dev

release 0.3.0
databio · Aug 21, 2024 · 813a61a · 813a61a
2 parents 4298e4b + 4301c3d
commit 813a61a
Show file tree

Hide file tree

Showing 24 changed files with 774 additions and 309 deletions.
diff --git a/.github/workflows/run-pytest.yml b/.github/workflows/run-pytest.yml
@@ -12,7 +12,7 @@ jobs:
     runs-on: ${{ matrix.os }}
     strategy:
       matrix:
-        python-version: ["3.8", "3.11"]
+        python-version: ["3.9", "3.11"]
         os: [ubuntu-latest]
 
     steps:

diff --git a/.gitignore b/.gitignore
@@ -139,4 +139,15 @@ openSignalMatrix
 out2023/*
 
 # test data
-test/test_data/*
+test/test_data/*
+/scripts/bedclassifier_tuning/results/
+/scripts/bedclassifier_tuning/data/
+genome_config.yaml
+alias/hg19/fasta/default/hg19.chrom.sizes
+alias/hg19/fasta/default/hg19.fa
+alias/hg19/fasta/default/hg19.fa.fai
+data/baa91c8f6e2780cfd8fd1040ff37f51c379947a2a4820d6c/baa91c8f6e2780cfd8fd1040ff37f51c379947a2a4820d6c__ASDs.json
+data/baa91c8f6e2780cfd8fd1040ff37f51c379947a2a4820d6c/fasta/default/baa91c8f6e2780cfd8fd1040ff37f51c379947a2a4820d6c.chrom.sizes
+data/baa91c8f6e2780cfd8fd1040ff37f51c379947a2a4820d6c/fasta/default/baa91c8f6e2780cfd8fd1040ff37f51c379947a2a4820d6c.fa
+data/baa91c8f6e2780cfd8fd1040ff37f51c379947a2a4820d6c/fasta/default/baa91c8f6e2780cfd8fd1040ff37f51c379947a2a4820d6c.fa.fai
+test/Untitled.ipynb
diff --git a/MANIFEST.in b/MANIFEST.in
@@ -8,4 +8,5 @@ include bedboss/bedqc/*
 include bedboss/qdrant_index/*
 include bedboss/bedbuncher/*
 include bedboss/bedbuncher/tools/*
-include bedboss/bedclassifier/*
+include bedboss/bedclassifier/*
+include bedboss/tokens/*
diff --git a/bedboss/_version.py b/bedboss/_version.py
@@ -1 +1 @@
-__version__ = "0.2.1"
+__version__ = "0.3.0"
diff --git a/bedboss/bedboss.py b/bedboss/bedboss.py
@@ -12,6 +12,7 @@
 from pephubclient.helpers import is_registry_path, MessageHandler as m
 from bbconf.bbagent import BedBaseAgent
 from bbconf.models.base_models import FileModel
+from bbconf.const import DEFAULT_LICENSE
 
 from bedboss.bedstat.bedstat import bedstat
 from bedboss.bedmaker.bedmaker import make_all
@@ -55,6 +56,7 @@ def run_all(
     genome: str,
     bedbase_config: Union[str, bbconf.BedBaseAgent],
     name: str = None,
+    license_id: str = DEFAULT_LICENSE,
     rfg_config: str = None,
     narrowpeak: bool = False,
     check_qc: bool = True,
@@ -67,6 +69,10 @@ def run_all(
     upload_qdrant: bool = False,
     upload_s3: bool = False,
     upload_pephub: bool = False,
+    # Universes
+    universe: bool = False,
+    universe_method: str = None,
+    universe_bedset: str = None,
     pm: pypiper.PipelineManager = None,
 ) -> str:
     """
@@ -78,6 +84,7 @@ def run_all(
     :param str genome: genome_assembly of the sample. [required] options: (hg19, hg38, mm10) # TODO: add more
     :param str name: name of the sample (human-readable name, e.g. "H3K27ac in liver") [optional]
     :param Union[str, bbconf.BedBaseConf] bedbase_config: The path to the bedbase configuration file, or bbconf object.
+    :param str license_id: license identifier [optional] (default: "DUO:0000042"). Find all licenses at bedbase.org
     :param str rfg_config: file path to the genome config file [optional]
     :param bool narrowpeak: whether the regions are narrow. Used to create bed file from bedgraph or bigwig
         (transcription factor implies narrow, histone mark implies broad peaks) [optional]
@@ -92,6 +99,10 @@ def run_all(
     :param bool upload_qdrant: whether to skip qdrant indexing
     :param bool upload_s3: whether to upload to s3
     :param bool upload_pephub: whether to push bedfiles and metadata to pephub (default: False)
+
+    :param bool universe: whether to add the sample as the universe [Default: False]
+    :param str universe_method: method used to create the universe [Default: None]
+    :param str universe_bedset: bedset identifier for the universe [Default: None]
     :param pypiper.PipelineManager pm: pypiper object
     :return str bed_digest: bed digest
     """
@@ -189,6 +200,7 @@ def run_all(
         plots=plots.model_dump(exclude_unset=True),
         files=files.model_dump(exclude_unset=True),
         classification=classification.model_dump(exclude_unset=True),
+        license_id=license_id,
         upload_qdrant=upload_qdrant,
         upload_pephub=upload_pephub,
         upload_s3=upload_s3,
@@ -197,6 +209,13 @@ def run_all(
         nofail=True,
     )
 
+    if universe:
+        bbagent.bed.add_universe(
+            bedfile_id=bed_metadata.bed_digest,
+            bedset_id=universe_bedset,
+            construct_method=universe_method,
+        )
+
     if stop_pipeline:
         pm.stop_pipeline()
 
@@ -211,7 +230,9 @@ def insert_pep(
     bedset_id: str = None,
     bedset_name: str = None,
     rfg_config: str = None,
-    create_bedset: bool = True,
+    license_id: str = DEFAULT_LICENSE,
+    create_bedset: bool = False,
+    bedset_heavy: bool = False,
     check_qc: bool = True,
     ensdb: str = None,
     just_db_commit: bool = False,
@@ -232,7 +253,10 @@ def insert_pep(
     :param str bedset_id: bedset identifier
     :param str bedset_name: bedset name
     :param str rfg_config: path to the genome config file (refgenie)
+    :param str license_id: license identifier [optional] (default: "DUO:0000042"). Find all licenses at bedbase.org
+        This license is used for bed files whose license is not provided in the PEP file
     :param bool create_bedset: whether to create bedset
+    :param bool bedset_heavy: whether to use heavy processing (add all columns to the database)
     :param bool upload_qdrant: whether to upload bedfiles to qdrant
     :param bool check_qc: whether to run quality control during badmaking
     :param str ensdb: a full path to the ensdb gtf file required for genomes not in GDdata
@@ -279,6 +303,7 @@ def insert_pep(
                 genome=pep_sample.genome,
                 name=pep_sample.sample_name,
                 bedbase_config=bbagent,
+                license_id=pep_sample.get("license_id") or license_id,
                 narrowpeak=is_narrow_peak,
                 chrom_sizes=pep_sample.get("chrom_sizes"),
                 open_signal_matrix=pep_sample.get("open_signal_matrix"),
@@ -292,8 +317,12 @@ def insert_pep(
                 upload_qdrant=upload_qdrant,
                 upload_s3=upload_s3,
                 upload_pephub=upload_pephub,
+                universe=pep_sample.get("universe"),
+                universe_method=pep_sample.get("universe_method"),
+                universe_bedset=pep_sample.get("universe_bedset"),
                 pm=pm,
             )
+
             processed_ids.append(bed_id)
         except BedBossException as e:
             _LOGGER.error(f"Failed to process {pep_sample.sample_name}. See {e}")
@@ -308,7 +337,7 @@ def insert_pep(
             name=bedset_name or pep.name,
             output_folder=output_folder,
             description=pep.description,
-            heavy=True,
+            heavy=bedset_heavy,
             upload_pephub=upload_pephub,
             upload_s3=upload_s3,
             no_fail=no_fail,

diff --git a/bedboss/bedbuncher/bedbuncher.py b/bedboss/bedbuncher/bedbuncher.py
@@ -158,7 +158,7 @@ def run_bedbuncher(
         description=description,
         upload_pephub=upload_pephub,
         upload_s3=upload_s3,
-        plots=plots.model_dump(exclude_none=True, exclude_unset=True),
+        plots=plots.model_dump(exclude_none=True, exclude_unset=True) if plots else {},
         local_path=output_folder,
         no_fail=no_fail,
         overwrite=force_overwrite,

diff --git a/bedboss/bedclassifier/bedclassifier.py b/bedboss/bedclassifier/bedclassifier.py
@@ -38,12 +38,39 @@ def get_bed_type(bed: str, no_fail: Optional[bool] = True) -> Tuple[str, str]:
 
     max_rows = 5
     row_count = 0
+
     while row_count <= max_rows:
         try:
             df = pd.read_csv(bed, sep="\t", header=None, nrows=4, skiprows=row_count)
             if row_count > 0:
                 _LOGGER.info(f"Skipped {row_count} rows to parse bed file {bed}")
             break
+        except UnicodeDecodeError as e:
+            try:
+                df = pd.read_csv(
+                    bed,
+                    sep="\t",
+                    header=None,
+                    nrows=4,
+                    skiprows=row_count,
+                    encoding="utf-16",
+                )
+                if row_count > 0:
+                    _LOGGER.info(f"Skipped {row_count} rows to parse bed file {bed}")
+                break
+            except (pandas.errors.ParserError, pandas.errors.EmptyDataError) as e:
+                if row_count <= max_rows:
+                    row_count += 1
+                else:
+                    if no_fail:
+                        _LOGGER.warning(
+                            f"Unable to parse bed file {bed}, due to error {e}, setting bed_type = unknown_bedtype"
+                        )
+                        return "unknown_bedtype", "unknown_bedtype"
+                    else:
+                        raise BedTypeException(
+                            reason=f"Bed type could not be determined due to CSV parse error {e}"
+                        )
         except (pandas.errors.ParserError, pandas.errors.EmptyDataError) as e:
             if row_count <= max_rows:
                 row_count += 1

diff --git a/bedboss/bedmaker/bedmaker.py b/bedboss/bedmaker/bedmaker.py
@@ -169,7 +169,7 @@ def make_bed(
     if input_type not in [member.value for member in InputTypes]:
         raise BedBossException(
             f"Invalid input type: {input_type}. "
-            f"Supported types: {', '.join(InputTypes.__members__.values())}"
+            f"Supported types: {', '.join([k.value for k in InputTypes])}"
         )
 
     if not pm:
@@ -405,7 +405,7 @@ def make_all(
 
     return BedMakerOutput(
         bed_file=output_bed,
-        bigbed_file=output_bigbed,
+        bigbed_file=os.path.abspath(output_bigbed) if output_bigbed else None,
         bed_digest=RegionSet(output_bed).identifier,
         bed_type=bed_type,
         bed_format=bed_format,

diff --git a/bedboss/bedmaker/utils.py b/bedboss/bedmaker/utils.py
@@ -15,6 +15,7 @@
 
 from bedboss.const import (
     REFGENIE_ENV_VAR,
+    DEFAULT_REFGENIE_PATH,
 )
 
 _LOGGER = logging.getLogger("bedboss")
@@ -66,7 +67,7 @@ def get_rgc(rfg_config: Union[str, Path] = None) -> RGC:
     """
     if not rfg_config:
         _LOGGER.info("Creating refgenie genome config file...")
-        cwd = os.getenv(REFGENIE_ENV_VAR, os.getcwd())
+        cwd = os.getenv(REFGENIE_ENV_VAR, DEFAULT_REFGENIE_PATH)
         rfg_config = os.path.join(cwd, "genome_config.yaml")
 
     # get path to the genome config; from arg or env var if arg not provided

diff --git a/bedboss/bedstat/bedstat.py b/bedboss/bedstat/bedstat.py
@@ -17,7 +17,7 @@
     OPEN_SIGNAL_URL,
 )
 from bedboss.utils import download_file
-from bedboss.exceptions import OpenSignalMatrixException
+from bedboss.exceptions import OpenSignalMatrixException, BedBossException
 
 
 _LOGGER = logging.getLogger("bedboss")
@@ -158,7 +158,11 @@ def bedstat(
             f"--ensdb={ensdb} --digest={bed_digest}"
         )
 
-        pm.run(cmd=command, target=json_file_path)
+        try:
+            pm.run(cmd=command, target=json_file_path)
+        except Exception as e:
+            _LOGGER.error(f"Pipeline failed: {e}")
+            raise BedBossException(f"Pipeline failed: {e}")
 
     data = {}
     if os.path.exists(json_file_path):