diff --git a/src/ephemeris/_config_models.py b/src/ephemeris/_config_models.py index e06dcce..afb9996 100644 --- a/src/ephemeris/_config_models.py +++ b/src/ephemeris/_config_models.py @@ -42,10 +42,17 @@ class RepositoryInstallTargets(BaseModel): tools: List[RepositoryInstallTarget] +class DictOrValue(BaseModel): + __root__: Union[Dict[str, Union[str, int, float, bool, "DictOrValue"]], Union[str, int, float, bool]] + + +DictOrValue.update_forward_refs() + + class DataManager(BaseModel, extra=Extra.forbid): tags: List[str] tool_id: str - parameters: Optional[List[Dict[str, str]]] = None + parameters: Optional[DictOrValue] = None class DataManagers(BaseModel, extra=Extra.forbid): diff --git a/src/ephemeris/_idc_split_data_manager_genomes.py b/src/ephemeris/_idc_split_data_manager_genomes.py index a15fcf1..51aa04a 100644 --- a/src/ephemeris/_idc_split_data_manager_genomes.py +++ b/src/ephemeris/_idc_split_data_manager_genomes.py @@ -6,6 +6,7 @@ by genomes.yml that have already been executed and appear in the target installed data table configuration. """ +import json import logging import os import re @@ -38,6 +39,7 @@ from ._config_models import ( DataManager, DataManagers, + DictOrValue, read_data_managers, ) from .common_parser import get_common_args @@ -96,8 +98,7 @@ def tool_id_for(indexer: str, data_managers: DataManagers, mode: str) -> str: class RunDataManager(BaseModel): id: str items: Optional[List[Any]] = None - params: Optional[List[Any]] = None - data_table_reload: Optional[List[str]] = None + params: Optional[DictOrValue] = None class RunDataManagers(BaseModel): @@ -172,36 +173,34 @@ def walk_over_incomplete_runs(split_options: SplitOptions): if do_fetch and not split_options.is_build_complete(build_id, fetch_indexer): log.info(f"Fetching: {build_id}") fetch_tool_id = tool_id_for(fetch_indexer, data_managers, split_options.tool_id_mode) - fetch_params = [] - fetch_params.append({"dbkey_source|dbkey_source_selector": "new"}) - fetch_params.append({"dbkey_source|dbkey": genome["id"]}) description = genome.get("description") + fetch_params = { + "dbkey_source": {"dbkey_source_selector": "new", "dbkey": genome["id"]}, + "sequence_id": genome["id"], + "sequence_name": description, + } source = genome.get("source") if source == "ucsc": if not description: - description = ucsc_description_for_build(genome["id"]) - fetch_params.append({"reference_source|reference_source_selector": "ucsc"}) - fetch_params.append({"reference_source|requested_dbkey": genome["id"]}) - fetch_params.append({"sequence_name": description}) + fetch_params["sequence_name"] = ucsc_description_for_build(genome["id"]) + fetch_params["reference_source"] = { + "reference_source_selector": "ucsc", + "requested_dbkey": genome["id"], + } elif re.match("^[A-Z_]+[0-9.]+", source): - fetch_params.append({"reference_source|reference_source_selector": "ncbi"}) - fetch_params.append({"reference_source|requested_identifier": source}) - fetch_params.append({"sequence_name": genome["description"]}) - fetch_params.append({"sequence.id": genome["id"]}) + fetch_params["reference_source"] = { + "reference_source_selector": "ncbi", + "requested_identifier": source, + } elif re.match("^http", source): - fetch_params.append({"reference_source|reference_source_selector": "url"}) - fetch_params.append({"reference_source|user_url": source}) - fetch_params.append({"sequence_name": genome["description"]}) - fetch_params.append({"sequence.id": genome["id"]}) + fetch_params["reference_source"] = {"reference_source_selector": "url", "user_url": source} if description: - fetch_params.append({"dbkey_source|dbkey_name": description}) + fetch_params["dbkey_source"]["dbkey_name"] = description fetch_run_data_manager = RunDataManager( id=fetch_tool_id, params=fetch_params, - # Not needed according to Marius - # data_table_reload=["all_fasta", "__dbkeys__"], ) yield (build_id, fetch_indexer, fetch_run_data_manager) else: @@ -223,18 +222,17 @@ def walk_over_incomplete_runs(split_options: SplitOptions): tool_id = tool_id_for(indexer, data_managers, split_options.tool_id_mode) data_manager = data_managers.__root__[indexer] - params = data_manager.parameters + params = {} + if data_manager.parameters: + params = json.loads(data_manager.parameters.json()) or {} + genome_params = genome.pop("parameters", None) or {} + params.update(genome_params) if params is None: - params = [ - {"all_fasta_source": "{{ item.id }}"}, - {"sequence_name": "{{ item.name }}"}, - {"sequence_id": "{{ item.id }}"}, - ] - # why is this not pulled from the data managers conf? -nate - if re.search("bwa", tool_id): - params.append({"index_algorithm": "bwtsw"}) - if re.search("color_space", tool_id): - continue + params = { + "all_fasta_source": "{{ item.id }}", + "sequence_name": "{{ item.name }}", + "sequence_id": "{{ item.id }}", + } item = deepcopy(genome) item.pop("indexers", None) diff --git a/tests/test_split_genomes.py b/tests/test_split_genomes.py index 8e186f3..fbfb8cd 100644 --- a/tests/test_split_genomes.py +++ b/tests/test_split_genomes.py @@ -49,13 +49,36 @@ - genome """ +DATA_MANAGER_YAML_WITH_PARAMS = """ +the_data_manager: + tool_id: toolshed.g2.bx.psu.edu/repos/iuc/the_data_manager/the_data_manager/0.0.1' + parameters: + conditional: + param_a: a + param_b: b + tags: + - dm_tag +""" + +GENOMES_WITH_PARAMS = """ +genomes: + - dbkey: cat + description: fluffy + id: cat + indexers: + - the_data_manager + parameters: + conditional: + param_c: c +""" + -def setup_mock_idc_dir(directory: Path): +def setup_mock_idc_dir(directory: Path, genomes=MERGED_YAML_STR, data_managers=DATA_MANAGER_YAML_STR): merged = directory / "genomes.yml" - merged.write_text(MERGED_YAML_STR) + merged.write_text(genomes) - data_managers = directory / "data_managers.yml" - data_managers.write_text(DATA_MANAGER_YAML_STR) + data_managers_path = directory / "data_managers.yml" + data_managers_path.write_text(data_managers) def read_and_validate_run_data_manager_yaml(path): @@ -98,6 +121,20 @@ def test_split_genomes(tmp_path: Path): assert data_manager.items[0]["dbkey"] == "hg19_rCRS_pUC18_phiX174" +def test_split_genomes_with_params(tmp_path): + setup_mock_idc_dir(tmp_path, GENOMES_WITH_PARAMS, DATA_MANAGER_YAML_WITH_PARAMS) + split_path = tmp_path / "split" + split_options = split_options_for(tmp_path) + split_genomes(split_options) + new_task = split_path / "cat" / "the_data_manager" + new_task_run_yaml = new_task / "run_data_managers.yaml" + run = read_and_validate_run_data_manager_yaml(new_task_run_yaml) + assert len(run.data_managers) == 1 + data_manager = run.data_managers[0] + # genome config overwrites data manager config + assert data_manager.params.json() == '{"conditional": {"param_c": "c"}}' + + def test_split_genomes_short_ids(tmp_path: Path): setup_mock_idc_dir(tmp_path) split_path = tmp_path / "split"