diff --git a/.dockstore.yml b/.dockstore.yml index 3876d50..56eb941 100644 --- a/.dockstore.yml +++ b/.dockstore.yml @@ -40,3 +40,8 @@ workflows: primaryDescriptorPath: /workflows/wf_transfer_column.wdl testParameterFiles: - empty.json + - name: Transfer_Assembly_Files + subclass: WDL + primaryDescriptorPath: /workflows/wf_transfer_assembly_files.wdl + testParameterFiles: + - empty.json diff --git a/tasks/task_fapi.wdl b/tasks/task_fapi.wdl new file mode 100644 index 0000000..cbc2afa --- /dev/null +++ b/tasks/task_fapi.wdl @@ -0,0 +1,21 @@ +version 1.0 + +task import_terra_table { + input { + String terra_project + String workspace_name + File terra_table + + } + command { + python3 /scripts/import_large_tsv/import_large_tsv.py --project ~{terra_project} --workspace ~{workspace_name} --tsv ~{terra_table} + } + output { + } + runtime { + memory: "4 GB" + cpu: 2 + docker: "broadinstitute/terra-tools:tqdm" + disks: "local-disk 100 HDD" + } +} \ No newline at end of file diff --git a/tasks/task_file_handling.wdl b/tasks/task_file_handling.wdl index ecc52d3..6eb6fa5 100644 --- a/tasks/task_file_handling.wdl +++ b/tasks/task_file_handling.wdl @@ -153,19 +153,65 @@ task zip_files { task transfer_files { input { Array[String] files_to_transfer + Array[String] samplenames String target_bucket + Boolean create_terra_table = true + String target_root_entity="transferred_files" + String transferred_file_column_header="transferred_file" Int CPUs = 4 Int mem_size_gb = 8 String? docker_image = "quay.io/theiagen/utility:1.1" } command <<< - file_path_array="~{sep=' ' files_to_transfer}" - - gsutil -m cp -n ${file_path_array[@]} ~{target_bucket} + file_path_array=(~{sep=' ' files_to_transfer}) + file_path_array_len=$(echo "${#file_path_array[@]}") + file_path_string_array="~{sep=' ' files_to_transfer}" + samplename_array=(~{sep=' ' samplenames}) + samplename_array_len=$(echo "${#samplename_array[@]}") + + # Set output header + if ~{create_terra_table}; then + # create header for terra data table outout + echo -e "entity:~{target_root_entity}_id\t~{transferred_file_column_header}" > transferred_files.tsv + # Ensure samplename and assembly array legnths are of equal length + echo "Samples: $samplename_array_len, file_paths: $file_path_array_len" + if [ "$samplename_array_len" -ne "$file_path_array_len" ]; then + echo "Input arrays are of unequal length. Samplenames: $samplename_array_len, file_paths: $file_path_array_len" >&2 + exit 1 + else + echo "Input arrays are of equal length. Samplenames: $samplename_array_len, file_paths: $file_path_array_len" + fi + else + #create header for transferred files output (non terra data table) + echo -e "~{transferred_file_column_header}" > transferred_files.tsv + fi + + #transfer files to target bucket + echo "Running gsutil -m cp -n ${file_path_string_array[@]} ~{target_bucket}" + gsutil -m cp -n ${file_path_string_array[@]} ~{target_bucket} - echo "transferred_files" > transferred_files.tsv - gsutil ls ~{target_bucket} >> transferred_files.tsv + #create datatable for transferred files + for index in ${!file_path_array[@]}; do + transferred_file=${file_path_array[$index]} + transferred_file=$(echo ${transferred_file} | awk -F "/" '{print $NF}') + samplename=${samplename_array[$index]} + gcp_address="~{target_bucket}${transferred_file}" + echo "GCP address: ${gcp_address}" + + if [ $(gsutil -q stat ${gcp_address}; echo $?) == 1 ]; then + echo "${transferred_file} does not exist in ~{target_bucket}" >&2 + else + echo "${transferred_file} found in ~{target_bucket}" + # populate transferred_files.tsv + if ~{create_terra_table}; then + echo -e "${samplename}\t${gcp_address}" >> transferred_files.tsv + else + echo "${gcp_address}" >> transferred_files.tsv + fi + fi + done + >>> output { File transferred_files = "transferred_files.tsv" diff --git a/workflows/wf_transfer_assembly_files.wdl b/workflows/wf_transfer_assembly_files.wdl new file mode 100644 index 0000000..29ab1b3 --- /dev/null +++ b/workflows/wf_transfer_assembly_files.wdl @@ -0,0 +1,136 @@ +version 1.0 + +import "../tasks/task_file_handling.wdl" as file_handling +import "../tasks/task_versioning.wdl" as versioning +import "../tasks/task_fapi.wdl" as fapi + +workflow transfer_assembly_files { + input { + Array[String] assemblies + Array[String] samplenames + Array[Float] percent_reference_coverage + Boolean import_terra_table = false + String target_terra_project = "NA" + String target_workspace_name = "NA" + String target_bucket + String target_root_entity + String transferred_file_column_header + + } + call filter_assemblies{ + input: + assemblies = assemblies, + samplenames = samplenames, + percent_reference_coverage = percent_reference_coverage + } + call file_handling.transfer_files{ + input: + files_to_transfer = filter_assemblies.assemblies_filterred, + samplenames = filter_assemblies.samplenames_filterred, + target_bucket = target_bucket, + target_root_entity = target_root_entity, + transferred_file_column_header = transferred_file_column_header, + create_terra_table = true + } + if(import_terra_table){ + call fapi.import_terra_table as import_table { + input: + terra_project = target_terra_project, + workspace_name = target_workspace_name, + terra_table = transfer_files.transferred_files + } + } + call versioning.version_capture{ + input: + } + output { + String transfer_assembies_version = version_capture.terra_utilities_version + String transfer_assemblies_analysis_date = version_capture.date + + File assembly_data_table = transfer_files.transferred_files + } +} + +task filter_assemblies { + input { + Array[String] assemblies + Array[Float] percent_reference_coverage + Int percent_reference_coverage_threshold = 90 + Array[String] samplenames + String? docker_image = "quay.io/theiagen/utility:1.1" + } + command <<< + assembly_array=(~{sep=' ' assemblies}) + assembly_array_len=$(echo "${#assembly_array[@]}") + reference_coverage_array=(~{sep=' ' percent_reference_coverage}) + referece_coverage_array_len=$(echo "${#reference_coverage_array[@]}") + samplename_array=(~{sep=' ' samplenames}) + samplename_array_len=$(echo "${#samplename_array[@]}") + mkdir ./passed_assemblies + passed_assemblies="" + passed_samplenames="" + + #Create files to capture batched and excluded samples + echo -e "Samplename\tPercent reference coverage" > assemblies_included.tsv + echo -e "Samplename\tPercent reference coverage" > assemblies_filterred.tsv + + #Ensure assembly, meta, and vadr arrays are of equal length + echo "Samples: $samplename_array_len, Assemblies: $assembly_array_len, percent_reference_coverages: $referece_coverage_array_len" + if [ "$samplename_array_len" -ne "$referece_coverage_array_len" ] && [ "$samplename_array_len" -ne "$assembly_array_len" ]; then + echo "Input arrays are of unequal length. Samples: $samplename_array_len, Assemblies: $assembly_array_len, percent_reference_coverages: $referece_coverage_array_len" >&2 + exit 1 + else + echo "Input arrays are of equal length. Samples: $samplename_array_len, Assemblies: $assembly_array_len, percent_reference_coverages: $referece_coverage_array_len" + fi + + echo "name array: ${samplename_array[@]}" + echo "reference_coverage array: ${reference_coverage_array[@]}" + + #remove samples that do not meet coverage threshold + for index in ${!samplename_array[@]}; do + samplename=${samplename_array[$index]} + assembly=${assembly_array[$index]} + reference_coverage=${reference_coverage_array[$index]} + reference_coverage=${reference_coverage%.*} + + if [ "${reference_coverage}" -ge "~{percent_reference_coverage_threshold}" ] ; then + passed_assemblies=( "${passed_assemblies[@]}" "${assembly}") + passed_samplenames=( "${passed_samplenames[@]}" "${samplename}") + echo -e "\t$samplename coverage (${reference_coverage}) passes threshold (~{percent_reference_coverage_threshold})" + echo -e "$samplename\t$reference_coverage" >> assemblies_included.tsv + else + echo -e "\t$samplename coverage (${reference_coverage}) does not meet coverage threshold (~{percent_reference_coverage_threshold})" + echo -e "$samplename\t$reference_coverage" >> assemblies_filterred.tsv + fi + done + + passed_assemblies_len=$(echo "${#passed_assemblies[@]}") + passed_samplenames_len=$(echo "${#passed_samplenames[@]}") + # sanity check before completing task + if [ "$passed_assemblies_len" -ne "$passed_samplenames_len" ] ; then + echo "OUTPUT arrays are of unequal length. samplenames:$passed_samplenames_len; assemblies: $passed_assemblies_len" >&2 + exit 1 + elif [ "${passed_assemblies_len}" == 1 ] ; then + echo "No assemblies passed coverage threshold" >&2 + exit 1 + else + echo "OUTPUT arrays are of equal length. samplenames:$passed_samplenames_len; assemblies: $passed_assemblies_len" + fi + + printf '%s\n' "${passed_assemblies[@]}" > PASSED_ASSEMBLIES + printf '%s\n' "${passed_samplenames[@]}" > PASSED_SAMPLENAMES + +>>> + output { + Array[String] assemblies_filterred = read_lines("PASSED_ASSEMBLIES") + Array[String] samplenames_filterred = read_lines("PASSED_SAMPLENAMES") + } + runtime { + docker: "~{docker_image}" + memory: "1 GB" + cpu: 1 + disks: "local-disk 10 HDD" + preemptible: 0 + } +} + diff --git a/workflows/wf_transfer_column.wdl b/workflows/wf_transfer_column.wdl index 2e4daec..b2c33bc 100644 --- a/workflows/wf_transfer_column.wdl +++ b/workflows/wf_transfer_column.wdl @@ -6,11 +6,13 @@ import "../tasks/task_versioning.wdl" as versioning workflow transfer_column_content { input { Array[String] files_to_transfer + Array[String] samplenames String target_bucket } call file_handling.transfer_files{ input: files_to_transfer=files_to_transfer, + samplenames=samplenames, target_bucket=target_bucket } call versioning.version_capture{