Skip to content

Commit

Permalink
Merge pull request #587 from umccr/enhancement/generate-md5sums-for-r…
Browse files Browse the repository at this point in the history
…aw-fastqs

Generate md5sums for raw fastq inputs, not gzipped inputs
  • Loading branch information
alexiswl authored Nov 15, 2024
2 parents 7458058 + 1b365e8 commit 11599ee
Show file tree
Hide file tree
Showing 9 changed files with 77 additions and 77 deletions.
2 changes: 1 addition & 1 deletion config/workflow.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -316,5 +316,5 @@ workflows:
versions:
- name: 4.2.4
path: 4.2.4/dragen-instrument-run-fastq-to-ora-pipeline__4.2.4.cwl
md5sum: da57d2421f8ffb47ec102dd02a4d5db7
md5sum: 9df445c3fdd3b101f1061ae0a29d7fa0
categories: []
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,7 @@ requirements:

(
echo "\$(date -Iseconds): Collecting md5sums of gzipped fastq files" 1>&2 && \\
bash "$(get_fastq_gz_md5sum_files_script_path())" > "$(inputs.output_directory_name)/fastq_gzipped.md5.txt" && \\
bash "$(get_fastq_raw_md5sum_files_script_path())" > "$(inputs.output_directory_name)/fastq_raw.md5.txt" && \\
echo "\$(date -Iseconds): Md5sum complete" 1>&2 && \\
echo "\$(date -Iseconds): Collecting file sizes of gzipped fastq files" 1>&2 && \\
bash "$(get_fastq_gz_file_sizes_script_path())" > "$(inputs.output_directory_name)/fastq_gzipped.filesizes.tsv" && \\
Expand Down
42 changes: 21 additions & 21 deletions typescript-expressions/dragen-tools/4.0.3/dragen-tools__4.0.3.cwljs
Original file line number Diff line number Diff line change
Expand Up @@ -78,11 +78,11 @@ function get_new_fastq_list_csv_script_path() {
*/
return "generate-new-fastq-list-csv.sh";
}
function get_fastq_gz_md5sum_files_script_path() {
function get_fastq_raw_md5sum_files_script_path() {
/*
Get the path of the script that generates the md5sum of the raw (decompressed) contents of each fastq gzip file
*/
return "generate-md5sum-for-fastq-gz-files.sh";
return "generate-md5sum-for-fastq-raw-files.sh";
}
function get_fastq_gz_file_sizes_script_path() {
/*
Expand Down Expand Up @@ -469,42 +469,42 @@ function generate_ora_mv_files_script(fastq_list_rows, input_directory, output_d
contents:ora_mv_files_script
};
}
function generate_fastq_gz_md5sum_files_script(fastq_list_rows, input_directory) {
function get_md5sum_fastq_raw_script(fastq_list_rows, input_directory) {
/*
Generate the script that computes the md5sum of the raw (decompressed) contents of each fastq gzip file; the script prints its results to stdout
*/
var get_md5sum_fastq_gz_script = "#!/usr/bin/env bash\n\n";
get_md5sum_fastq_gz_script += "# Exit on failure\n";
get_md5sum_fastq_gz_script += "set -euo pipefail\n\n";
var get_md5sum_fastq_raw_script_contents = "#!/usr/bin/env bash\n\n";
get_md5sum_fastq_raw_script_contents += "# Exit on failure\n";
get_md5sum_fastq_raw_script_contents += "set -euo pipefail\n\n";
/* Initialise the bash array */
get_md5sum_fastq_gz_script += "# Get fastq gz paths\n";
get_md5sum_fastq_gz_script += "FASTQ_GZ_PATHS=(\n";
get_md5sum_fastq_raw_script_contents += "# Get fastq gz paths\n";
get_md5sum_fastq_raw_script_contents += "FASTQ_GZ_PATHS=(\n";
/* Iterate over all files */
for (var _i = 0, fastq_list_rows_4 = fastq_list_rows; _i < fastq_list_rows_4.length; _i++) {
var fastq_list_row = fastq_list_rows_4[_i];
/* Confirm read 1 is a file type */
if ("class" in fastq_list_row.read_1 && fastq_list_row.read_1.class === "File") {
/* Add relative path of read 1 */
get_md5sum_fastq_gz_script += " \"".concat(fastq_list_row.read_1.path.replace(input_directory.path + "/", ''), "\" \\\n");
get_md5sum_fastq_raw_script_contents += " \"".concat(fastq_list_row.read_1.path.replace(input_directory.path + "/", ''), "\" \\\n");
}
/* Confirm read 2 is a file type */
if (fastq_list_row.read_2 !== null && "class" in fastq_list_row.read_2 && fastq_list_row.read_2.class === "File") {
get_md5sum_fastq_gz_script += " \"".concat(fastq_list_row.read_2.path.replace(input_directory.path + "/", ''), "\" \\\n");
get_md5sum_fastq_raw_script_contents += " \"".concat(fastq_list_row.read_2.path.replace(input_directory.path + "/", ''), "\" \\\n");
}
}
/* Complete the bash array */
get_md5sum_fastq_gz_script += ")\n\n";
get_md5sum_fastq_raw_script_contents += ")\n\n";
/* Build the for loop */
get_md5sum_fastq_gz_script += "# Generate md5sums for the input fastq gz files\n";
get_md5sum_fastq_gz_script += "for fastq_gz_path in \"${FASTQ_GZ_PATHS[@]}\"; do\n";
get_md5sum_fastq_gz_script += " full_input_path=\"".concat(input_directory.path, "/${fastq_gz_path}\"\n");
get_md5sum_fastq_gz_script += " md5sum \"${full_input_path}\" | sed \"s%${full_input_path}%${fastq_gz_path}%\"\n";
get_md5sum_fastq_gz_script += "done\n\n";
get_md5sum_fastq_gz_script += "# Md5sum script complete\n";
get_md5sum_fastq_raw_script_contents += "# Generate md5sums for the input fastq gz files\n";
get_md5sum_fastq_raw_script_contents += "for fastq_gz_path in \"${FASTQ_GZ_PATHS[@]}\"; do\n";
get_md5sum_fastq_raw_script_contents += " full_input_path=\"".concat(input_directory.path, "/${fastq_gz_path}\"\n");
get_md5sum_fastq_raw_script_contents += " zcat \"${full_input_path}\" | md5sum | sed \"s%-%${fastq_gz_path//.gz/}%\"\n";
get_md5sum_fastq_raw_script_contents += "done\n\n";
get_md5sum_fastq_raw_script_contents += "# Md5sum script complete\n";
return {
class:"File",
basename:get_fastq_gz_md5sum_files_script_path(),
contents:get_md5sum_fastq_gz_script
basename:get_fastq_raw_md5sum_files_script_path(),
contents:get_md5sum_fastq_raw_script_contents
};
}
function generate_fastq_gz_file_sizes_script(fastq_list_rows, input_directory) {
Expand Down Expand Up @@ -814,8 +814,8 @@ function generate_ora_mount_points(input_run, output_directory_path, sample_id_l
});
/* Generate the script to generate the md5sums of the input gzipped fastq files */
e.push({
"entryname":get_fastq_gz_md5sum_files_script_path(),
"entry":generate_fastq_gz_md5sum_files_script(fastq_list_rows, input_run)
"entryname":get_fastq_raw_md5sum_files_script_path(),
"entry":get_md5sum_fastq_raw_script(fastq_list_rows, input_run)
});
/* Generate the script to generate the filesizes of the input gzipped fastq files */
e.push({
Expand Down
46 changes: 23 additions & 23 deletions typescript-expressions/dragen-tools/4.0.3/dragen-tools__4.0.3.js
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ exports.get_fastq_list_csv_path = get_fastq_list_csv_path;
exports.get_tumor_fastq_list_csv_path = get_tumor_fastq_list_csv_path;
exports.get_ora_mv_files_script_path = get_ora_mv_files_script_path;
exports.get_new_fastq_list_csv_script_path = get_new_fastq_list_csv_script_path;
exports.get_fastq_gz_md5sum_files_script_path = get_fastq_gz_md5sum_files_script_path;
exports.get_fastq_raw_md5sum_files_script_path = get_fastq_raw_md5sum_files_script_path;
exports.get_fastq_gz_file_sizes_script_path = get_fastq_gz_file_sizes_script_path;
exports.get_fastq_ora_md5sum_files_script_path = get_fastq_ora_md5sum_files_script_path;
exports.get_fastq_ora_file_sizes_script_path = get_fastq_ora_file_sizes_script_path;
Expand All @@ -30,7 +30,7 @@ exports.get_mask_dir = get_mask_dir;
exports.get_ref_scratch_dir = get_ref_scratch_dir;
exports.get_ora_intermediate_output_dir = get_ora_intermediate_output_dir;
exports.generate_ora_mv_files_script = generate_ora_mv_files_script;
exports.generate_fastq_gz_md5sum_files_script = generate_fastq_gz_md5sum_files_script;
exports.get_md5sum_fastq_raw_script = get_md5sum_fastq_raw_script;
exports.generate_fastq_gz_file_sizes_script = generate_fastq_gz_file_sizes_script;
exports.generate_fastq_ora_md5sum_files_script = generate_fastq_ora_md5sum_files_script;
exports.generate_fastq_ora_file_sizes_script = generate_fastq_ora_file_sizes_script;
Expand Down Expand Up @@ -120,11 +120,11 @@ function get_new_fastq_list_csv_script_path() {
*/
return "generate-new-fastq-list-csv.sh";
}
function get_fastq_gz_md5sum_files_script_path() {
function get_fastq_raw_md5sum_files_script_path() {
/*
Get the path of the script that generates the md5sum of the raw (decompressed) contents of each fastq gzip file
*/
return "generate-md5sum-for-fastq-gz-files.sh";
return "generate-md5sum-for-fastq-raw-files.sh";
}
function get_fastq_gz_file_sizes_script_path() {
/*
Expand Down Expand Up @@ -511,42 +511,42 @@ function generate_ora_mv_files_script(fastq_list_rows, input_directory, output_d
contents: ora_mv_files_script
};
}
function generate_fastq_gz_md5sum_files_script(fastq_list_rows, input_directory) {
function get_md5sum_fastq_raw_script(fastq_list_rows, input_directory) {
/*
Generate the script that computes the md5sum of the raw (decompressed) contents of each fastq gzip file; the script prints its results to stdout
*/
var get_md5sum_fastq_gz_script = "#!/usr/bin/env bash\n\n";
get_md5sum_fastq_gz_script += "# Exit on failure\n";
get_md5sum_fastq_gz_script += "set -euo pipefail\n\n";
var get_md5sum_fastq_raw_script_contents = "#!/usr/bin/env bash\n\n";
get_md5sum_fastq_raw_script_contents += "# Exit on failure\n";
get_md5sum_fastq_raw_script_contents += "set -euo pipefail\n\n";
// Initialise the bash array
get_md5sum_fastq_gz_script += "# Get fastq gz paths\n";
get_md5sum_fastq_gz_script += "FASTQ_GZ_PATHS=(\n";
get_md5sum_fastq_raw_script_contents += "# Get fastq gz paths\n";
get_md5sum_fastq_raw_script_contents += "FASTQ_GZ_PATHS=(\n";
// Iterate over all files
for (var _i = 0, fastq_list_rows_4 = fastq_list_rows; _i < fastq_list_rows_4.length; _i++) {
var fastq_list_row = fastq_list_rows_4[_i];
// Confirm read 1 is a file type
if ("class_" in fastq_list_row.read_1 && fastq_list_row.read_1.class_ === cwl_ts_auto_1.File_class.FILE) {
// Add relative path of read 1
get_md5sum_fastq_gz_script += " \"".concat(fastq_list_row.read_1.path.replace(input_directory.path + "/", ''), "\" \\\n");
get_md5sum_fastq_raw_script_contents += " \"".concat(fastq_list_row.read_1.path.replace(input_directory.path + "/", ''), "\" \\\n");
}
// Confirm read 2 is a file type
if (fastq_list_row.read_2 !== null && "class_" in fastq_list_row.read_2 && fastq_list_row.read_2.class_ === cwl_ts_auto_1.File_class.FILE) {
get_md5sum_fastq_gz_script += " \"".concat(fastq_list_row.read_2.path.replace(input_directory.path + "/", ''), "\" \\\n");
get_md5sum_fastq_raw_script_contents += " \"".concat(fastq_list_row.read_2.path.replace(input_directory.path + "/", ''), "\" \\\n");
}
}
// Complete the bash array
get_md5sum_fastq_gz_script += ")\n\n";
get_md5sum_fastq_raw_script_contents += ")\n\n";
// Build the for loop
get_md5sum_fastq_gz_script += "# Generate md5sums for the input fastq gz files\n";
get_md5sum_fastq_gz_script += "for fastq_gz_path in \"${FASTQ_GZ_PATHS[@]}\"; do\n";
get_md5sum_fastq_gz_script += " full_input_path=\"".concat(input_directory.path, "/${fastq_gz_path}\"\n");
get_md5sum_fastq_gz_script += " md5sum \"${full_input_path}\" | sed \"s%${full_input_path}%${fastq_gz_path}%\"\n";
get_md5sum_fastq_gz_script += "done\n\n";
get_md5sum_fastq_gz_script += "# Md5sum script complete\n";
get_md5sum_fastq_raw_script_contents += "# Generate md5sums for the input fastq gz files\n";
get_md5sum_fastq_raw_script_contents += "for fastq_gz_path in \"${FASTQ_GZ_PATHS[@]}\"; do\n";
get_md5sum_fastq_raw_script_contents += " full_input_path=\"".concat(input_directory.path, "/${fastq_gz_path}\"\n");
get_md5sum_fastq_raw_script_contents += " zcat \"${full_input_path}\" | md5sum | sed \"s%-%${fastq_gz_path//.gz/}%\"\n";
get_md5sum_fastq_raw_script_contents += "done\n\n";
get_md5sum_fastq_raw_script_contents += "# Md5sum script complete\n";
return {
class_: cwl_ts_auto_1.File_class.FILE,
basename: get_fastq_gz_md5sum_files_script_path(),
contents: get_md5sum_fastq_gz_script
basename: get_fastq_raw_md5sum_files_script_path(),
contents: get_md5sum_fastq_raw_script_contents
};
}
function generate_fastq_gz_file_sizes_script(fastq_list_rows, input_directory) {
Expand Down Expand Up @@ -856,8 +856,8 @@ function generate_ora_mount_points(input_run, output_directory_path, sample_id_l
});
// Generate the script to generate the md5sums of the input gzipped fastq files
e.push({
"entryname": get_fastq_gz_md5sum_files_script_path(),
"entry": generate_fastq_gz_md5sum_files_script(fastq_list_rows, input_run)
"entryname": get_fastq_raw_md5sum_files_script_path(),
"entry": get_md5sum_fastq_raw_script(fastq_list_rows, input_run)
});
// Generate the script to generate the filesizes of the input gzipped fastq files
e.push({
Expand Down
42 changes: 21 additions & 21 deletions typescript-expressions/dragen-tools/4.0.3/dragen-tools__4.0.3.ts
Original file line number Diff line number Diff line change
Expand Up @@ -112,11 +112,11 @@ export function get_new_fastq_list_csv_script_path(): string {
*/
return "generate-new-fastq-list-csv.sh"
}
export function get_fastq_gz_md5sum_files_script_path(): string {
export function get_fastq_raw_md5sum_files_script_path(): string {
/*
Get the path of the script that generates the md5sum of the raw (decompressed) contents of each fastq gzip file
*/
return "generate-md5sum-for-fastq-gz-files.sh"
return "generate-md5sum-for-fastq-raw-files.sh"
}

export function get_fastq_gz_file_sizes_script_path(): string {
Expand Down Expand Up @@ -566,48 +566,48 @@ export function generate_ora_mv_files_script(fastq_list_rows: FastqListRow[], in
}


export function generate_fastq_gz_md5sum_files_script(fastq_list_rows: FastqListRow[], input_directory: IDirectory): IFile {
export function get_md5sum_fastq_raw_script(fastq_list_rows: FastqListRow[], input_directory: IDirectory): IFile {
/*
Generate the script that computes the md5sum of the raw (decompressed) contents of each fastq gzip file; the script prints its results to stdout
*/
let get_md5sum_fastq_gz_script = "#!/usr/bin/env bash\n\n"
let get_md5sum_fastq_raw_script_contents = "#!/usr/bin/env bash\n\n"

get_md5sum_fastq_gz_script += `# Exit on failure\n`
get_md5sum_fastq_gz_script += `set -euo pipefail\n\n`
get_md5sum_fastq_raw_script_contents += `# Exit on failure\n`
get_md5sum_fastq_raw_script_contents += `set -euo pipefail\n\n`

// Initialise the bash array
get_md5sum_fastq_gz_script += `# Get fastq gz paths\n`
get_md5sum_fastq_gz_script += `FASTQ_GZ_PATHS=(\n`
get_md5sum_fastq_raw_script_contents += `# Get fastq gz paths\n`
get_md5sum_fastq_raw_script_contents += `FASTQ_GZ_PATHS=(\n`

// Iterate over all files
for (let fastq_list_row of fastq_list_rows) {
// Confirm read 1 is a file type
if ("class_" in fastq_list_row.read_1 && fastq_list_row.read_1.class_ === File_class.FILE) {
// Add relative path of read 1
get_md5sum_fastq_gz_script += ` "${fastq_list_row.read_1.path.replace(input_directory.path + "/", '')}" \\\n`
get_md5sum_fastq_raw_script_contents += ` "${fastq_list_row.read_1.path.replace(input_directory.path + "/", '')}" \\\n`
}
// Confirm read 2 is a file type
if (fastq_list_row.read_2 !== null && "class_" in fastq_list_row.read_2 && fastq_list_row.read_2.class_ === File_class.FILE) {
get_md5sum_fastq_gz_script += ` "${fastq_list_row.read_2.path.replace(input_directory.path + "/", '')}" \\\n`
get_md5sum_fastq_raw_script_contents += ` "${fastq_list_row.read_2.path.replace(input_directory.path + "/", '')}" \\\n`
}
}

// Complete the bash array
get_md5sum_fastq_gz_script += `)\n\n`
get_md5sum_fastq_raw_script_contents += `)\n\n`

// Build the for loop
get_md5sum_fastq_gz_script += `# Generate md5sums for the input fastq gz files\n`
get_md5sum_fastq_gz_script += `for fastq_gz_path in "\${FASTQ_GZ_PATHS[@]}"; do\n`
get_md5sum_fastq_gz_script += ` full_input_path="${input_directory.path}/\${fastq_gz_path}"\n`
get_md5sum_fastq_gz_script += ` md5sum "\${full_input_path}" | sed "s%\${full_input_path}%\${fastq_gz_path}%"\n`
get_md5sum_fastq_gz_script += `done\n\n`
get_md5sum_fastq_raw_script_contents += `# Generate md5sums for the input fastq gz files\n`
get_md5sum_fastq_raw_script_contents += `for fastq_gz_path in "\${FASTQ_GZ_PATHS[@]}"; do\n`
get_md5sum_fastq_raw_script_contents += ` full_input_path="${input_directory.path}/\${fastq_gz_path}"\n`
get_md5sum_fastq_raw_script_contents += ` zcat "\${full_input_path}" | md5sum | sed "s%-%\${fastq_gz_path//.gz/}%"\n`
get_md5sum_fastq_raw_script_contents += `done\n\n`

get_md5sum_fastq_gz_script += `# Md5sum script complete\n`
get_md5sum_fastq_raw_script_contents += `# Md5sum script complete\n`

return {
class_: File_class.FILE,
basename: get_fastq_gz_md5sum_files_script_path(),
contents: get_md5sum_fastq_gz_script
basename: get_fastq_raw_md5sum_files_script_path(),
contents: get_md5sum_fastq_raw_script_contents
}
}

Expand Down Expand Up @@ -958,8 +958,8 @@ export function generate_ora_mount_points(input_run: IDirectory, output_director

// Generate the script to generate the md5sums of the input gzipped fastq files
e.push({
"entryname": get_fastq_gz_md5sum_files_script_path(),
"entry": generate_fastq_gz_md5sum_files_script(fastq_list_rows, input_run)
"entryname": get_fastq_raw_md5sum_files_script_path(),
"entry": get_md5sum_fastq_raw_script(fastq_list_rows, input_run)
})

// Generate the script to generate the filesizes of the input gzipped fastq files
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ FASTQ_GZ_PATHS=(
# Generate md5sums for the input fastq gz files
for fastq_gz_path in "${FASTQ_GZ_PATHS[@]}"; do
full_input_path="data/${fastq_gz_path}"
md5sum "${full_input_path}" | sed "s%${full_input_path}%${fastq_gz_path}%"
zcat "${full_input_path}" | md5sum | sed "s%-%${fastq_gz_path//.gz/}%"
done

# Md5sum script complete
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ var TUMOR_FASTQ_LIST_CSV_FILE_PATH = "tests/data/tumor_fastq_list.csv";
var ORA_FASTQ_LIST_CSV_FILE_PATH = "tests/data/fastq_list.ora.csv";
var MV_ORA_FILE_PATH = "tests/data/mv-ora.sh";
var GENERATE_NEW_FASTQ_LIST_CSV_SH_PATH = "tests/data/generate-new-fastq-list-csv.sh";
var GENERATE_MD5SUM_FOR_FASTQ_GZ_FILES_SH_PATH = "tests/data/generate-md5sum-for-fastq-gz-files.sh";
var GENERATE_MD5SUM_FOR_FASTQ_GZ_FILES_SH_PATH = "tests/data/generate-md5sum-for-fastq-raw-files.sh";
var GENERATE_MD5SUM_FOR_FASTQ_ORA_FILES_SH_PATH = "tests/data/generate-md5sum-for-fastq-ora-files.sh";
var GENERATE_FILE_SIZES_FOR_FASTQ_GZ_FILES_SH_PATH = "tests/data/generate-file-sizes-for-fastq-gz-files.sh";
var GENERATE_FILE_SIZES_FOR_FASTQ_ORA_FILES_SH_PATH = "tests/data/generate-file-sizes-for-fastq-ora-files.sh";
Expand Down Expand Up @@ -184,7 +184,7 @@ var EXPECTED_ORA_NEW_FASTQ_LIST_CSV_SH_OUTPUT = {
};
var EXPECTED_MD5SUM_FOR_FASTQ_GZ_FILES_SH_OUTPUT = {
class_: cwl_ts_auto_1.File_class.FILE,
basename: "generate-md5sum-for-fastq-gz-files.sh",
basename: "generate-md5sum-for-fastq-raw-files.sh",
contents: (0, fs_1.readFileSync)(GENERATE_MD5SUM_FOR_FASTQ_GZ_FILES_SH_PATH, "utf8")
};
var EXPECTED_MD5SUM_FOR_FASTQ_ORA_FILES_SH_OUTPUT = {
Expand Down Expand Up @@ -343,7 +343,7 @@ describe('Test ora mount points', function () {
"entry": EXPECTED_ORA_NEW_FASTQ_LIST_CSV_SH_OUTPUT
},
{
"entryname": "generate-md5sum-for-fastq-gz-files.sh",
"entryname": "generate-md5sum-for-fastq-raw-files.sh",
"entry": EXPECTED_MD5SUM_FOR_FASTQ_GZ_FILES_SH_OUTPUT
},
{
Expand Down
Loading

0 comments on commit 11599ee

Please sign in to comment.