Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Transfer Assembly Files Workflow #18

Draft
wants to merge 17 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions .dockstore.yml
Original file line number Diff line number Diff line change
Expand Up @@ -40,3 +40,8 @@ workflows:
primaryDescriptorPath: /workflows/wf_transfer_column.wdl
testParameterFiles:
- empty.json
- name: Transfer_Assembly_Files
subclass: WDL
primaryDescriptorPath: /workflows/wf_transfer_assembly_files.wdl
testParameterFiles:
- empty.json
21 changes: 21 additions & 0 deletions tasks/task_fapi.wdl
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
version 1.0

task import_terra_table {
input {
String terra_project
String workspace_name
File terra_table

}
command {
python3 /scripts/import_large_tsv/import_large_tsv.py --project ~{terra_project} --workspace ~{workspace_name} --tsv ~{terra_table}
}
output {
}
runtime {
memory: "4 GB"
cpu: 2
docker: "broadinstitute/terra-tools:tqdm"
disks: "local-disk 100 HDD"
}
}
56 changes: 51 additions & 5 deletions tasks/task_file_handling.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -153,19 +153,65 @@ task zip_files {
task transfer_files {
input {
Array[String] files_to_transfer
Array[String] samplenames
String target_bucket
Boolean create_terra_table = true
String target_root_entity="transferred_files"
String transferred_file_column_header="transferred_file"
Int CPUs = 4
Int mem_size_gb = 8
String? docker_image = "quay.io/theiagen/utility:1.1"
}
command <<<
file_path_array="~{sep=' ' files_to_transfer}"

gsutil -m cp -n ${file_path_array[@]} ~{target_bucket}
file_path_array=(~{sep=' ' files_to_transfer})
file_path_array_len=$(echo "${#file_path_array[@]}")
file_path_string_array="~{sep=' ' files_to_transfer}"
samplename_array=(~{sep=' ' samplenames})
samplename_array_len=$(echo "${#samplename_array[@]}")

# Set output header
if ~{create_terra_table}; then
# create header for terra data table outout
echo -e "entity:~{target_root_entity}_id\t~{transferred_file_column_header}" > transferred_files.tsv
# Ensure samplename and assembly array legnths are of equal length
echo "Samples: $samplename_array_len, file_paths: $file_path_array_len"
if [ "$samplename_array_len" -ne "$file_path_array_len" ]; then
echo "Input arrays are of unequal length. Samplenames: $samplename_array_len, file_paths: $file_path_array_len" >&2
exit 1
else
echo "Input arrays are of equal length. Samplenames: $samplename_array_len, file_paths: $file_path_array_len"
fi
else
#create header for transferred files output (non terra data table)
echo -e "~{transferred_file_column_header}" > transferred_files.tsv
fi

#transfer files to target bucket
echo "Running gsutil -m cp -n ${file_path_string_array[@]} ~{target_bucket}"
gsutil -m cp -n ${file_path_string_array[@]} ~{target_bucket}

echo "transferred_files" > transferred_files.tsv
gsutil ls ~{target_bucket} >> transferred_files.tsv
#create datatable for transferred files
for index in ${!file_path_array[@]}; do
transferred_file=${file_path_array[$index]}
transferred_file=$(echo ${transferred_file} | awk -F "/" '{print $NF}')
samplename=${samplename_array[$index]}
gcp_address="~{target_bucket}${transferred_file}"
echo "GCP address: ${gcp_address}"

if [ $(gsutil -q stat ${gcp_address}; echo $?) == 1 ]; then
echo "${transferred_file} does not exist in ~{target_bucket}" >&2
else
echo "${transferred_file} found in ~{target_bucket}"
# populate transferred_files.tsv
if ~{create_terra_table}; then
echo -e "${samplename}\t${gcp_address}" >> transferred_files.tsv
else
echo "${gcp_address}" >> transferred_files.tsv
fi
fi
done


>>>
output {
File transferred_files = "transferred_files.tsv"
Expand Down
136 changes: 136 additions & 0 deletions workflows/wf_transfer_assembly_files.wdl
Original file line number Diff line number Diff line change
@@ -0,0 +1,136 @@
version 1.0

import "../tasks/task_file_handling.wdl" as file_handling
import "../tasks/task_versioning.wdl" as versioning
import "../tasks/task_fapi.wdl" as fapi

workflow transfer_assembly_files {
input {
Array[String] assemblies
Array[String] samplenames
Array[Float] percent_reference_coverage
Boolean import_terra_table = false
String target_terra_project = "NA"
String target_workspace_name = "NA"
String target_bucket
String target_root_entity
String transferred_file_column_header

}
call filter_assemblies{
input:
assemblies = assemblies,
samplenames = samplenames,
percent_reference_coverage = percent_reference_coverage
}
call file_handling.transfer_files{
input:
files_to_transfer = filter_assemblies.assemblies_filterred,
samplenames = filter_assemblies.samplenames_filterred,
target_bucket = target_bucket,
target_root_entity = target_root_entity,
transferred_file_column_header = transferred_file_column_header,
create_terra_table = true
}
if(import_terra_table){
call fapi.import_terra_table as import_table {
input:
terra_project = target_terra_project,
workspace_name = target_workspace_name,
terra_table = transfer_files.transferred_files
}
}
call versioning.version_capture{
input:
}
output {
String transfer_assembies_version = version_capture.terra_utilities_version
String transfer_assemblies_analysis_date = version_capture.date

File assembly_data_table = transfer_files.transferred_files
}
}

task filter_assemblies {
input {
Array[String] assemblies
Array[Float] percent_reference_coverage
Int percent_reference_coverage_threshold = 90
Array[String] samplenames
String? docker_image = "quay.io/theiagen/utility:1.1"
}
command <<<
assembly_array=(~{sep=' ' assemblies})
assembly_array_len=$(echo "${#assembly_array[@]}")
reference_coverage_array=(~{sep=' ' percent_reference_coverage})
referece_coverage_array_len=$(echo "${#reference_coverage_array[@]}")
samplename_array=(~{sep=' ' samplenames})
samplename_array_len=$(echo "${#samplename_array[@]}")
mkdir ./passed_assemblies
passed_assemblies=""
passed_samplenames=""

#Create files to capture batched and excluded samples
echo -e "Samplename\tPercent reference coverage" > assemblies_included.tsv
echo -e "Samplename\tPercent reference coverage" > assemblies_filterred.tsv

#Ensure assembly, meta, and vadr arrays are of equal length
echo "Samples: $samplename_array_len, Assemblies: $assembly_array_len, percent_reference_coverages: $referece_coverage_array_len"
if [ "$samplename_array_len" -ne "$referece_coverage_array_len" ] && [ "$samplename_array_len" -ne "$assembly_array_len" ]; then
echo "Input arrays are of unequal length. Samples: $samplename_array_len, Assemblies: $assembly_array_len, percent_reference_coverages: $referece_coverage_array_len" >&2
exit 1
else
echo "Input arrays are of equal length. Samples: $samplename_array_len, Assemblies: $assembly_array_len, percent_reference_coverages: $referece_coverage_array_len"
fi

echo "name array: ${samplename_array[@]}"
echo "reference_coverage array: ${reference_coverage_array[@]}"

#remove samples that do not meet coverage threshold
for index in ${!samplename_array[@]}; do
samplename=${samplename_array[$index]}
assembly=${assembly_array[$index]}
reference_coverage=${reference_coverage_array[$index]}
reference_coverage=${reference_coverage%.*}

if [ "${reference_coverage}" -ge "~{percent_reference_coverage_threshold}" ] ; then
passed_assemblies=( "${passed_assemblies[@]}" "${assembly}")
passed_samplenames=( "${passed_samplenames[@]}" "${samplename}")
echo -e "\t$samplename coverage (${reference_coverage}) passes threshold (~{percent_reference_coverage_threshold})"
echo -e "$samplename\t$reference_coverage" >> assemblies_included.tsv
else
echo -e "\t$samplename coverage (${reference_coverage}) does not meet coverage threshold (~{percent_reference_coverage_threshold})"
echo -e "$samplename\t$reference_coverage" >> assemblies_filterred.tsv
fi
done

passed_assemblies_len=$(echo "${#passed_assemblies[@]}")
passed_samplenames_len=$(echo "${#passed_samplenames[@]}")
# sanity check before completing task
if [ "$passed_assemblies_len" -ne "$passed_samplenames_len" ] ; then
echo "OUTPUT arrays are of unequal length. samplenames:$passed_samplenames_len; assemblies: $passed_assemblies_len" >&2
exit 1
elif [ "${passed_assemblies_len}" == 1 ] ; then
echo "No assemblies passed coverage threshold" >&2
exit 1
else
echo "OUTPUT arrays are of equal length. samplenames:$passed_samplenames_len; assemblies: $passed_assemblies_len"
fi

printf '%s\n' "${passed_assemblies[@]}" > PASSED_ASSEMBLIES
printf '%s\n' "${passed_samplenames[@]}" > PASSED_SAMPLENAMES

>>>
output {
Array[String] assemblies_filterred = read_lines("PASSED_ASSEMBLIES")
Array[String] samplenames_filterred = read_lines("PASSED_SAMPLENAMES")
}
runtime {
docker: "~{docker_image}"
memory: "1 GB"
cpu: 1
disks: "local-disk 10 HDD"
preemptible: 0
}
}

2 changes: 2 additions & 0 deletions workflows/wf_transfer_column.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -6,11 +6,13 @@ import "../tasks/task_versioning.wdl" as versioning
workflow transfer_column_content {
input {
Array[String] files_to_transfer
Array[String] samplenames
String target_bucket
}
call file_handling.transfer_files{
input:
files_to_transfer=files_to_transfer,
samplenames=samplenames,
target_bucket=target_bucket
}
call versioning.version_capture{
Expand Down