Skip to content

Commit

Permalink
add utils/config.py to define names of temporal directory
Browse files Browse the repository at this point in the history
  • Loading branch information
akikuno committed Aug 13, 2023
1 parent 8058ff2 commit 86d947e
Show file tree
Hide file tree
Showing 11 changed files with 133 additions and 160 deletions.
107 changes: 64 additions & 43 deletions src/DAJIN2/core/core_execute.py → src/DAJIN2/core/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,73 +13,97 @@
from collections import defaultdict
from DAJIN2.utils import io
from DAJIN2.core import classification, clustering, consensus, preprocess, report
from DAJIN2.utils.config import TEMP_ROOT_DIR

# limit max memory usage: cap the soft RLIMIT_DATA at 90% of physical RAM
mem_bytes = os.sysconf("SC_PAGE_SIZE") * os.sysconf("SC_PHYS_PAGES")
# Keep the hard limit that is already in place. An unprivileged process is not
# allowed to raise its hard limit, so forcing it to -1 (RLIM_INFINITY) raises
# ValueError whenever a finite hard limit has been set for the process.
_hard_limit = resource.getrlimit(resource.RLIMIT_DATA)[1]
_soft_limit = int(mem_bytes * 9 / 10)
if _hard_limit != resource.RLIM_INFINITY:
    # The soft limit may never exceed the hard limit.
    _soft_limit = min(_soft_limit, _hard_limit)
resource.setrlimit(resource.RLIMIT_DATA, (_soft_limit, _hard_limit))


def parse_arguments(arguments: dict):
    """Unpack the run arguments into a fixed-order tuple.

    Args:
        arguments: Mapping holding "sample", "control", "allele", "name" and
            "threads". When "genome" is present, "blat" and "goldenpath" are
            read as well.

    Returns:
        ``(sample, control, allele, name, threads, genome_urls)``.
        ``genome_urls`` is a ``defaultdict(str)``, so "genome", "blat" and
        "goldenpath" read as "" when no genome was supplied.

    Raises:
        KeyError: If one of the keys listed above is missing.
    """
    genome_urls = defaultdict(str)
    if "genome" in arguments:
        genome_urls.update({key: arguments[key] for key in ("genome", "blat", "goldenpath")})

    return (
        arguments["sample"],
        arguments["control"],
        arguments["allele"],
        arguments["name"],
        arguments["threads"],
        genome_urls,
    )


# Former private name; kept as an alias so call sites still using it keep working.
_parse_arguments = parse_arguments


def convert_inputs_to_posix(sample: str, control: str, allele: str) -> tuple:
    """Pass the three input paths through ``io.convert_to_posix``.

    Returns:
        The converted ``(sample, control, allele)`` in that order.
    """
    return tuple(io.convert_to_posix(path) for path in (sample, control, allele))

def _format_inputs(arguments: dict):
SAMPLE, CONTROL, ALLELE, NAME, THREADS, GENOME_URLS = _parse_arguments(arguments)
SAMPLE = io.convert_to_posix(SAMPLE)
CONTROL = io.convert_to_posix(CONTROL)
ALLELE = io.convert_to_posix(ALLELE)

SAMPLE_NAME: str = preprocess.format_inputs.extract_basename(SAMPLE)
CONTROL_NAME: str = preprocess.format_inputs.extract_basename(CONTROL)
FASTA_ALLELES: dict = preprocess.format_inputs.dictionize_allele(ALLELE)
def create_temporal_directory(name: str, control_name: str) -> Path:
    """Create the working directory for run *name* and return it.

    The directory is ``TEMP_ROOT_DIR/name``; the nested
    ``cache/.igvjs/<control_name>`` folder is created along with any missing
    parents, and existing folders are left as they are.
    """
    run_dir = Path(TEMP_ROOT_DIR, name)
    igvjs_cache = run_dir.joinpath("cache", ".igvjs", control_name)
    igvjs_cache.mkdir(parents=True, exist_ok=True)
    return run_dir

TEMPDIR = Path("DAJINResults", ".tempdir", NAME)
Path(TEMPDIR, "cache", ".igvjs", CONTROL_NAME).mkdir(parents=True, exist_ok=True)

IS_CACHE_CONTROL = preprocess.check_caches.exists_cached_control(CONTROL, TEMPDIR)
IS_CACHE_GENOME = preprocess.check_caches.exists_cached_genome(GENOME_URLS["genome"], TEMPDIR, IS_CACHE_CONTROL)
def check_caches(control: str, tempdir: Path, genome_url: str) -> tuple:
    """Report whether cached control and genome data can be reused.

    Returns:
        ``(is_cache_control, is_cache_genome)``. The genome check receives the
        result of the control check as its third argument.
    """
    cache_checks = preprocess.check_caches
    control_cached = cache_checks.exists_cached_control(control, tempdir)
    genome_cached = cache_checks.exists_cached_genome(genome_url, tempdir, control_cached)
    return control_cached, genome_cached

GENOME_COODINATES = {
"genome": GENOME_URLS["genome"],

def get_genome_coordinates(genome_urls: dict, fasta_alleles: dict, is_cache_genome: bool, tempdir: Path) -> dict:
    """Return the genome coordinates of the control allele.

    Args:
        genome_urls: Mapping with at least a "genome" entry; a falsy value
            means no reference genome was requested.
        fasta_alleles: Allele name -> sequence; must contain "control".
        is_cache_genome: Whether the coordinates cached under *tempdir* can
            be reused.
        tempdir: Working directory that holds ``cache/genome_coodinates.jsonl``.

    Returns:
        A dict with "genome", "chrom_size", "chr", "start", "end" and
        "strand". Without a genome these are placeholder values spanning the
        control sequence.
    """
    genome_coordinates = {
        "genome": genome_urls["genome"],
        "chrom_size": 0,
        "chr": "control",
        "start": 0,
        "end": len(fasta_alleles["control"]) - 1,
        "strand": "+",
    }
    if not genome_urls["genome"]:
        return genome_coordinates

    # The file name keeps its historical spelling so existing caches stay valid.
    cache_path = Path(tempdir, "cache", "genome_coodinates.jsonl")
    if is_cache_genome:
        # The cache is written below as a one-record list, so hand back that
        # record itself rather than the whole collection that is read.
        # NOTE(review): assumes midsv.read_jsonl yields the records in file order.
        return next(iter(midsv.read_jsonl(cache_path)))

    genome_coordinates = preprocess.format_inputs.fetch_coordinate(
        genome_coordinates, genome_urls, fasta_alleles["control"]
    )
    genome_coordinates = preprocess.format_inputs.fetch_chrom_size(genome_coordinates, genome_urls)
    midsv.write_jsonl([genome_coordinates], cache_path)
    return genome_coordinates


def format_inputs(arguments: dict) -> tuple:
    """Turn the raw run arguments into the values the pipeline works with.

    Creates the run's working directory as a side effect.

    Returns:
        ``(sample_name, control_name, fasta_alleles, tempdir,
        genome_coordinates, threads)``.
    """
    sample, control, allele, name, threads, genome_urls = parse_arguments(arguments)
    sample, control, allele = convert_inputs_to_posix(sample, control, allele)

    inputs = preprocess.format_inputs
    sample_name = inputs.extract_basename(sample)
    control_name = inputs.extract_basename(control)
    fasta_alleles = inputs.dictionize_allele(allele)

    tempdir = create_temporal_directory(name, control_name)
    # Only the genome flag is needed from here on.
    _, is_cache_genome = check_caches(control, tempdir, genome_urls["genome"])
    genome_coordinates = get_genome_coordinates(genome_urls, fasta_alleles, is_cache_genome, tempdir)
    return sample_name, control_name, fasta_alleles, tempdir, genome_coordinates, threads


def _dtnow() -> str:
    """Return the current local time formatted as ``YYYY-MM-DD HH:MM:SS``."""
    return f"{datetime.now():%Y-%m-%d %H:%M:%S}"


###########################################################
# main
###########################################################


def execute_control(arguments: dict):
print(f"{_dtnow()}: {arguments['control']} is now processing...", file=sys.stderr)
###########################################################
# Preprocess
###########################################################
SAMPLE, CONTROL, ALLELE, NAME, THREADS, GENOME_URLS = _parse_arguments(arguments)
SAMPLE_NAME, CONTROL_NAME, FASTA_ALLELES, TEMPDIR, GENOME_COODINATES, THREADS = _format_inputs(arguments)
SAMPLE, CONTROL, ALLELE, NAME, THREADS, GENOME_URLS = parse_arguments(arguments)
SAMPLE_NAME, CONTROL_NAME, FASTA_ALLELES, TEMPDIR, GENOME_COODINATES, THREADS = format_inputs(arguments)
preprocess.format_inputs.make_directories(TEMPDIR, CONTROL_NAME, is_control=True)
preprocess.format_inputs.make_report_directories(TEMPDIR, CONTROL_NAME, is_control=True)
###########################################################
Expand Down Expand Up @@ -129,8 +153,8 @@ def execute_sample(arguments: dict):
###########################################################
# Preprocess
###########################################################
SAMPLE, CONTROL, ALLELE, NAME, THREADS, GENOME_URLS = _parse_arguments(arguments)
SAMPLE_NAME, CONTROL_NAME, FASTA_ALLELES, TEMPDIR, GENOME_COODINATES, THREADS = _format_inputs(arguments)
SAMPLE, CONTROL, ALLELE, NAME, THREADS, GENOME_URLS = parse_arguments(arguments)
SAMPLE_NAME, CONTROL_NAME, FASTA_ALLELES, TEMPDIR, GENOME_COODINATES, THREADS = format_inputs(arguments)
preprocess.format_inputs.make_directories(TEMPDIR, SAMPLE_NAME)
preprocess.format_inputs.make_report_directories(TEMPDIR, SAMPLE_NAME)

Expand Down Expand Up @@ -197,9 +221,6 @@ def execute_sample(arguments: dict):
########################################################################
print(f"{_dtnow()}: Consensus calling of {arguments['sample']}...", file=sys.stderr)
# Downsampling to 1000 reads in each LABEL
# MUTATION_LOCI_LABELS = consensus.extract_mutation_loci_by_labels(
# clust_sample, TEMPDIR, FASTA_ALLELES, CONTROL_NAME, SAMPLE_NAME
# )
clust_subset_sample = consensus.subset_clust(clust_sample, 1000)
cons_percentage, cons_sequence = consensus.call_consensus(TEMPDIR, SAMPLE_NAME, clust_subset_sample)
# cons_percentage, cons_sequence = consensus.call_consensus(clust_subset_sample, MUTATION_LOCI_LABELS)
Expand Down
2 changes: 1 addition & 1 deletion src/DAJIN2/core/preprocess/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,5 +8,5 @@

# from DAJIN2.core.preprocess.extract_mutation_loci import process_mutation_loci
from DAJIN2.core.preprocess.extract_mutation_loci import extract_mutation_loci
from DAJIN2.core.preprocess.get_index_mapping import save_index_mapping
from DAJIN2.core.preprocess.tmp_get_index_mapping import save_index_mapping
from DAJIN2.core.preprocess.generate_insertion_fasta import generate_insertion_fasta
34 changes: 34 additions & 0 deletions src/DAJIN2/core/preprocess/directories.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
from __future__ import annotations

from pathlib import Path


def create_temporal(TEMPDIR: Path, NAME: str, is_control=False) -> None:
    """Create the working sub-directories for one sample under *TEMPDIR*.

    ``TEMPDIR/result`` is always created. Under ``TEMPDIR/NAME`` a control
    gets fasta, sam, midsv, mutation_loci and clustering; any other sample
    additionally gets knockin_loci, classification and consensus. Existing
    directories are left untouched.
    """
    Path(TEMPDIR, "result").mkdir(parents=True, exist_ok=True)
    subdir_names = (
        ("fasta", "sam", "midsv", "mutation_loci", "clustering")
        if is_control
        else (
            "fasta",
            "sam",
            "midsv",
            "mutation_loci",
            "knockin_loci",
            "classification",
            "clustering",
            "consensus",
        )
    )
    sample_root = Path(TEMPDIR, NAME)
    for subdir_name in subdir_names:
        (sample_root / subdir_name).mkdir(parents=True, exist_ok=True)


def create_report(TEMPDIR: Path, NAME: str, is_control=False) -> None:
    """Create the report directories for one sample under ``TEMPDIR/report``.

    A control only gets ``BAM/NAME``. Any other sample gets ``NAME`` folders
    inside HTML, FASTA, BAM and .igvjs, plus a ``MUTATION_INFO`` folder that
    has no per-sample sub-folder. Existing directories are left untouched.
    """
    report_root = Path(TEMPDIR, "report")
    if is_control:
        targets = [report_root / "BAM" / NAME]
    else:
        targets = [
            report_root / "HTML" / NAME,
            report_root / "FASTA" / NAME,
            report_root / "BAM" / NAME,
            report_root / "MUTATION_INFO",
            report_root / ".igvjs" / NAME,
        ]
    for target in targets:
        target.mkdir(parents=True, exist_ok=True)
98 changes: 0 additions & 98 deletions src/DAJIN2/core/preprocess/get_index_mapping.py

This file was deleted.

4 changes: 3 additions & 1 deletion src/DAJIN2/gui.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@
from waitress import serve
from werkzeug.utils import secure_filename

from DAJIN2.utils.config import TEMP_ROOT_DIR

from DAJIN2 import main


Expand Down Expand Up @@ -46,7 +48,7 @@ def root_page():
@app.route("/submit", methods=["POST"])
def submit():
name = request.form.get("name")
TEMPDIR = Path("DAJINResults", ".tempdir", name)
TEMPDIR = Path(TEMP_ROOT_DIR, name)
if TEMPDIR.exists():
shutil.rmtree(TEMPDIR)
UPLOAD_FOLDER = Path(TEMPDIR, "upload")
Expand Down
15 changes: 8 additions & 7 deletions src/DAJIN2/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,8 @@

from DAJIN2 import gui, view
from DAJIN2.utils import io, report_generator, input_validator
from DAJIN2.core import core_execute
from DAJIN2.core import core
from DAJIN2.utils.config import DAJIN_RESULTS_DIR


VERSION = "0.3.1"
Expand All @@ -35,7 +36,7 @@ def update_threads(threads: int) -> int:
def generate_report(name: str) -> None:
    """Build the report for run *name* and tell the user where to find it.

    The location printed to stderr is derived from ``DAJIN_RESULTS_DIR``
    instead of a hard-coded folder name, so the message follows the
    configured results directory.
    """
    report_generator.report(name)
    print(
        f"\N{party popper} Finished! Open {DAJIN_RESULTS_DIR}/{name} to see the report.",
        file=sys.stderr,
    )

Expand All @@ -49,8 +50,8 @@ def execute_single_mode(arguments: dict[str]):
input_validator.validate_files(arguments["sample"], arguments["control"], arguments["allele"])
if "genome" in arguments:
arguments.update(input_validator.validate_genome_and_fetch_urls(arguments["genome"]))
core_execute.execute_control(arguments)
core_execute.execute_sample(arguments)
core.execute_control(arguments)
core.execute_sample(arguments)
generate_report(arguments["name"])


Expand Down Expand Up @@ -168,7 +169,7 @@ def execute_batch_mode(arguments: dict[str]):
args.update(cache_urls_genome[args["genome"]])
contents_control.append(args)
contents_control_unique = [dict(item) for item in set(frozenset(d.items()) for d in contents_control)]
run_multiprocess(core_execute.execute_control, contents_control_unique, arguments["threads"])
run_multiprocess(core.execute_control, contents_control_unique, arguments["threads"])

# Handle samples
contents_sample = []
Expand All @@ -181,7 +182,7 @@ def execute_batch_mode(arguments: dict[str]):
args.update(cache_urls_genome[args["genome"]])
contents_sample.append(args)
contents_sample_unique = [dict(item) for item in set(frozenset(d.items()) for d in contents_sample)]
run_multiprocess(core_execute.execute_sample, contents_sample_unique, arguments["threads"])
run_multiprocess(core.execute_sample, contents_sample_unique, arguments["threads"])
# Finish
generate_report(name)

Expand All @@ -196,7 +197,7 @@ def execute():
parser.add_argument("-s", "--sample", type=str, help="Full path to a sample FASTQ file")
parser.add_argument("-c", "--control", type=str, help="Full path to a control FASTQ file")
parser.add_argument("-a", "--allele", type=str, help="Full path to a FASTA file")
parser.add_argument("-n", "--name", type=str, help="Output directory name")
parser.add_argument("-n", "--name", type=str, help="Output directory name", default="DAJIN2-results")
parser.add_argument(
"-g", "--genome", type=str, default="", help="Reference genome ID (e.g hg38, mm39) [default: '']"
)
Expand Down
4 changes: 4 additions & 0 deletions src/DAJIN2/utils/config.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
from pathlib import Path

# Top-level folder that the finished reports are placed under.
DAJIN_RESULTS_DIR = Path("DAJIN_Results")
# Parent of the per-run working directories (one sub-folder per run name).
TEMP_ROOT_DIR = DAJIN_RESULTS_DIR / ".tempdir"
Loading

0 comments on commit 86d947e

Please sign in to comment.