Skip to content

Commit

Permalink
Merge pull request #67 from databio/dev_bedclassifier_script
Browse files Browse the repository at this point in the history
Dev bedclassifier script
  • Loading branch information
donaldcampbelljr authored May 30, 2024
2 parents 68dd96e + b2e9b12 commit 7cb35e1
Show file tree
Hide file tree
Showing 7 changed files with 252 additions and 3 deletions.
13 changes: 12 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -139,4 +139,15 @@ openSignalMatrix
out2023/*

# test data
test/test_data/*
test/test_data/*
/scripts/bedclassifier_tuning/results/
/scripts/bedclassifier_tuning/data/
genome_config.yaml
alias/hg19/fasta/default/hg19.chrom.sizes
alias/hg19/fasta/default/hg19.fa
alias/hg19/fasta/default/hg19.fa.fai
data/baa91c8f6e2780cfd8fd1040ff37f51c379947a2a4820d6c/baa91c8f6e2780cfd8fd1040ff37f51c379947a2a4820d6c__ASDs.json
data/baa91c8f6e2780cfd8fd1040ff37f51c379947a2a4820d6c/fasta/default/baa91c8f6e2780cfd8fd1040ff37f51c379947a2a4820d6c.chrom.sizes
data/baa91c8f6e2780cfd8fd1040ff37f51c379947a2a4820d6c/fasta/default/baa91c8f6e2780cfd8fd1040ff37f51c379947a2a4820d6c.fa
data/baa91c8f6e2780cfd8fd1040ff37f51c379947a2a4820d6c/fasta/default/baa91c8f6e2780cfd8fd1040ff37f51c379947a2a4820d6c.fa.fai
test/Untitled.ipynb
27 changes: 27 additions & 0 deletions bedboss/bedclassifier/bedclassifier.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,12 +38,39 @@ def get_bed_type(bed: str, no_fail: Optional[bool] = True) -> Tuple[str, str]:

max_rows = 5
row_count = 0

while row_count <= max_rows:
try:
df = pd.read_csv(bed, sep="\t", header=None, nrows=4, skiprows=row_count)
if row_count > 0:
_LOGGER.info(f"Skipped {row_count} rows to parse bed file {bed}")
break
except UnicodeDecodeError as e:
try:
df = pd.read_csv(
bed,
sep="\t",
header=None,
nrows=4,
skiprows=row_count,
encoding="utf-16",
)
if row_count > 0:
_LOGGER.info(f"Skipped {row_count} rows to parse bed file {bed}")
break
except (pandas.errors.ParserError, pandas.errors.EmptyDataError) as e:
if row_count <= max_rows:
row_count += 1
else:
if no_fail:
_LOGGER.warning(
f"Unable to parse bed file {bed}, due to error {e}, setting bed_type = unknown_bedtype"
)
return "unknown_bedtype", "unknown_bedtype"
else:
raise BedTypeException(
reason=f"Bed type could not be determined due to CSV parse error {e}"
)
except (pandas.errors.ParserError, pandas.errors.EmptyDataError) as e:
if row_count <= max_rows:
row_count += 1
Expand Down
1 change: 0 additions & 1 deletion bedboss/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -382,7 +382,6 @@ def make_bedset(
def init_config(
outfolder: str = typer.Option(..., help="Path to the output folder"),
):

from bedboss.utils import save_example_bedbase_config

save_example_bedbase_config(outfolder)
Expand Down
2 changes: 2 additions & 0 deletions scripts/bedclassifier_tuning/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
Just making a script to download and assess BED files and determine if they are correctly classified.

23 changes: 23 additions & 0 deletions scripts/bedclassifier_tuning/bedclassifier_output_schema.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
title: Bed Classifier
description: Output for bed classification results
type: object
properties:
pipeline_name: "bedclassifier"
samples:
type: object
properties:
bedfile_named:
type: string
description: "reported bedfile name e.g. narrowpeak"
bedfile_type:
type: string
description: "reported bedfile type"
given_bedfile_type:
type: string
description: "given bed file type"
types_match:
type: boolean
description: "Do the types match?"
gsm:
type: string
description: "given gsm"
188 changes: 188 additions & 0 deletions scripts/bedclassifier_tuning/bedclassify.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,188 @@
import gzip
import logging
import os
import shutil

import pipestat
import pypiper
from typing import Optional

from bedboss.bedclassifier import get_bed_type
from bedboss.exceptions import BedTypeException

_LOGGER = logging.getLogger("bedboss")

from geofetch import Finder, Geofetcher


class BedClassifier:
"""
This will take the input of either a .bed or a .bed.gz and classify the type of BED file.
"""

def __init__(
self,
input_file: str,
output_dir: Optional[str] = None,
bed_digest: Optional[str] = None,
input_type: Optional[str] = None,
pm: pypiper.PipelineManager = None,
report_to_database: Optional[bool] = False,
psm: pipestat.PipestatManager = None,
gsm: str = None,
):
# Raise Exception if input_type is given and it is NOT a BED file
# Raise Exception if the input file cannot be resolved

self.gsm = gsm
self.input_file = input_file
self.bed_digest = bed_digest
self.input_type = input_type

self.abs_bed_path = os.path.abspath(self.input_file)
self.file_name = os.path.splitext(os.path.basename(self.abs_bed_path))[0]
self.file_extension = os.path.splitext(self.abs_bed_path)[-1]

# we need this only if unzipping a file
self.output_dir = output_dir or os.path.join(
os.path.dirname(self.abs_bed_path), "temp_processing"
)
# Use existing Pipeline Manager if it exists
self.pm = pm

if psm is None:
pephuburl = "donaldcampbelljr/bedclassifier_tuning_geo:default"
self.psm = pipestat.PipestatManager(
pephub_path=pephuburl, schema_path="bedclassifier_output_schema.yaml"
)
else:
self.psm = psm

if self.file_extension == ".gz":
unzipped_input_file = os.path.join(self.output_dir, self.file_name)

with gzip.open(self.input_file, "rb") as f_in:
_LOGGER.info(
f"Unzipping file:{self.input_file} and Creating Unzipped file: {unzipped_input_file}"
)
with open(unzipped_input_file, "wb") as f_out:
shutil.copyfileobj(f_in, f_out)
self.input_file = unzipped_input_file
if self.pm:
self.pm.clean_add(unzipped_input_file)

try:
self.bed_type, self.bed_type_named = get_bed_type(self.input_file)
except BedTypeException as e:
_LOGGER.warning(msg=f"FAILED {bed_digest} Exception {e}")
self.bed_type = "unknown_bedtype"
self.bed_type_named = "unknown_bedtype"

if self.input_type is not None:
if self.bed_type_named != self.input_type:
_LOGGER.warning(
f"BED file classified as different type than given input: {self.bed_type} vs {self.input_type}"
)
do_types_match = False
else:
do_types_match = True
else:
do_types_match = False

# Create Value Dict to report via pipestat

all_values = {}

if self.input_type:
all_values.update({"given_bedfile_type": self.input_type})
if self.bed_type:
all_values.update({"bedfile_type": self.bed_type})
if self.bed_type_named:
all_values.update({"bedfile_named": self.bed_type_named})
if self.gsm:
all_values.update({"gsm": self.gsm})

all_values.update({"types_match": do_types_match})

try:
psm.report(record_identifier=bed_digest, values=all_values)
except Exception as e:
_LOGGER.warning(msg=f"FAILED {bed_digest} Exception {e}")

if self.pm:
self.pm.stop_pipeline()


def main():
# PEP for reporting all classification results
pephuburl = "donaldcampbelljr/bedclassifier_tuning_geo:default"

# Place these external to pycharm folder!!!
data_output_path = os.path.abspath("data")
results_path = os.path.abspath("results")
logs_dir = os.path.join(results_path, "logs")

gse_obj = Finder()

# # Optionally: provide filter string and max number of retrieve elements
# gse_obj = Finder(filters="narrowpeak", retmax=100)
#
# gse_list = gse_obj.get_gse_all()
# gse_obj.generate_file("data/output.txt", gse_list=gse_list)

pm = pypiper.PipelineManager(
name="bedclassifier",
outfolder=logs_dir,
recover=True,
)

pm.start_pipeline()

# for geo in gse_list:
geofetcher_obj = Geofetcher(
filter="\.(bed|narrowPeak|broadPeak)\.",
filter_size="25MB",
data_source="samples",
geo_folder=data_output_path,
metadata_folder=data_output_path,
processed=True,
max_soft_size="20MB",
discard_soft=True,
)

# geofetcher_obj.fetch_all(input="data/output.txt", name="donald_test")
geofetched = geofetcher_obj.get_projects(
input=os.path.join(data_output_path, "output.txt"), just_metadata=False
)

samples = geofetched["output_samples"].samples

psm = pipestat.PipestatManager(
pephub_path=pephuburl, schema_path="bedclassifier_output_schema.yaml"
)

for sample in samples:
if isinstance(sample.output_file_path, list):
bedfile = sample.output_file_path[0]
else:
bedfile = sample.output_file_path
geo_accession = sample.sample_geo_accession
sample_name = sample.sample_name
bed_type_from_geo = sample.type.lower()

bed = BedClassifier(
input_file=bedfile,
bed_digest=sample_name, # TODO FIX THIS IT HOULD BE AN ACTUAL DIGEST
output_dir=results_path,
input_type=bed_type_from_geo,
psm=psm,
pm=pm,
gsm=geo_accession,
)

pm.stop_pipeline()


if __name__ == "__main__":
main()
1 change: 0 additions & 1 deletion test/test_bedclassifier.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,6 @@


class TestBedClassifier:

def test_classification(
self,
):
Expand Down

0 comments on commit 7cb35e1

Please sign in to comment.