From 5c66be951e78b6d35ff62e639596f510ed898c7f Mon Sep 17 00:00:00 2001 From: Donald Campbell <125581724+donaldcampbelljr@users.noreply.github.com> Date: Thu, 9 Jan 2025 12:32:41 -0500 Subject: [PATCH 1/6] potential fix for #64 --- gtars/src/uniwig/counting.rs | 11 ++++------- gtars/src/uniwig/mod.rs | 33 ++++++++++++++++++++------------- gtars/tests/data/out/_core.wig | 5 +++-- gtars/tests/data/out/_end.wig | 1 + gtars/tests/data/out/_start.wig | 3 ++- gtars/tests/test.rs | 10 +++++++--- 6 files changed, 37 insertions(+), 26 deletions(-) diff --git a/gtars/src/uniwig/counting.rs b/gtars/src/uniwig/counting.rs index f5bcdf4..1165c9f 100644 --- a/gtars/src/uniwig/counting.rs +++ b/gtars/src/uniwig/counting.rs @@ -34,9 +34,7 @@ pub fn start_end_counts( chrom_size: i32, smoothsize: i32, stepsize: i32, - shift: i32, ) -> (Vec, Vec) { - //let vin_iter = starts_vector.iter(); let mut v_coordinate_positions: Vec = Vec::new(); // these are the final coordinates after any adjustments let mut v_coord_counts: Vec = Vec::new(); // u8 stores 0:255 This may be insufficient. u16 max is 65535 @@ -55,7 +53,7 @@ pub fn start_end_counts( adjusted_start_site = starts_vector[0]; // get first coordinate position - adjusted_start_site.0 = adjusted_start_site.0 - smoothsize + shift; + adjusted_start_site.0 = adjusted_start_site.0 - smoothsize; current_end_site = adjusted_start_site; current_end_site.0 = adjusted_start_site.0 + 1 + smoothsize * 2; @@ -74,7 +72,7 @@ pub fn start_end_counts( coordinate_value = *coord; adjusted_start_site = coordinate_value; - adjusted_start_site.0 = coordinate_value.0 - smoothsize + shift; + adjusted_start_site.0 = coordinate_value.0 - smoothsize; let current_score = adjusted_start_site.1; @@ -164,7 +162,6 @@ pub fn core_counts( ends_vector: &[(i32, i32)], chrom_size: i32, stepsize: i32, - shift: i32, ) -> (Vec, Vec) { let mut v_coordinate_positions: Vec = Vec::new(); // these are the final coordinates after any adjustments let mut v_coord_counts: Vec = Vec::new(); // u8 stores 0:255 This may be insufficient. u16 max is 65535 @@ -184,7 +181,7 @@ pub fn core_counts( current_start_site = starts_vector[0]; // get first coordinate position current_end_site = ends_vector[0]; - current_start_site.0 = current_start_site.0 + shift; + current_start_site.0 = current_start_site.0; if current_start_site.0 < 1 { current_start_site.0 = 1; @@ -201,7 +198,7 @@ pub fn core_counts( current_start_site = coordinate_value; - current_start_site.0 = current_start_site.0 + shift; + current_start_site.0 = current_start_site.0; let current_score = current_start_site.1; count += current_score; diff --git a/gtars/src/uniwig/mod.rs b/gtars/src/uniwig/mod.rs index 17c996e..38803c0 100644 --- a/gtars/src/uniwig/mod.rs +++ b/gtars/src/uniwig/mod.rs @@ -191,8 +191,8 @@ pub fn run_uniwig(matches: &ArgMatches) { } /// Ensures that the start position is at a minimum equal to `1` -fn clamped_start_position(start: i32, smoothsize: i32) -> i32 { - std::cmp::max(1, start - smoothsize) +fn clamped_start_position(start: i32, smoothsize: i32, wig_shift:i32) -> i32 { + std::cmp::max(1, start - smoothsize + wig_shift) } /// Ensure that the start position is at a minimum equal to `0` fn clamped_start_position_zero_pos(start: i32, smoothsize: i32) -> i32 { @@ -222,7 +222,6 @@ pub fn uniwig_main( .build() .unwrap(); - let mut wig_shift: i32 = 0; // This will be set to 1 when writing to wiggle files, else always set to 0 // Determine Input File Type let input_filetype = FileType::from_str(filetype.to_lowercase().as_str()); @@ -258,9 +257,6 @@ pub fn uniwig_main( if output_type == "bedgraph" || output_type == "bw" || output_type == "bigwig" { output_type = "bedGraph" // we must create bedgraphs first before creating bigwig files } - if output_type == "wig" { - wig_shift = 1; - } // Pare down chromosomes if necessary let mut final_chromosomes = @@ -299,7 +295,6 @@ pub fn uniwig_main( current_chrom_size, smoothsize, stepsize, - wig_shift, ); match output_type { @@ -322,6 +317,7 @@ pub fn uniwig_main( clamped_start_position( primary_start.0, smoothsize, + 1 //must shift wiggle starts and core by 1 since it is 1 based ), stepsize, current_chrom_size, @@ -390,7 +386,6 @@ pub fn uniwig_main( current_chrom_size, smoothsize, stepsize, - wig_shift, ); match output_type { "file" => { @@ -411,6 +406,7 @@ pub fn uniwig_main( clamped_start_position( primary_end.0, smoothsize, + 0, ), ); write_to_bed_graph_file( @@ -432,6 +428,7 @@ pub fn uniwig_main( clamped_start_position( primary_end.0, smoothsize, + 0, // ends already 1 based, do not shift further ), stepsize, current_chrom_size, @@ -450,6 +447,7 @@ pub fn uniwig_main( clamped_start_position( primary_end.0, smoothsize, + 0 ), stepsize, meta_data_file_names[1].clone(), @@ -468,6 +466,7 @@ pub fn uniwig_main( clamped_start_position( primary_end.0, smoothsize, + 0 ), stepsize, meta_data_file_names[1].clone(), @@ -481,7 +480,6 @@ pub fn uniwig_main( &chromosome.ends, current_chrom_size, stepsize, - wig_shift, ); match output_type { "file" => { @@ -499,7 +497,10 @@ pub fn uniwig_main( let count_info: (Vec, Vec, Vec) = compress_counts( &mut core_results, - primary_start.0, + clamped_start_position_zero_pos( + primary_start.0, + 0, + ), ); write_to_bed_graph_file( &count_info, @@ -517,7 +518,7 @@ pub fn uniwig_main( &core_results.0, file_name.clone(), chrom_name.clone(), - clamped_start_position(primary_start.0, 0), + clamped_start_position(primary_start.0, 0,1), //starts are 1 based must be shifted by 1 stepsize, current_chrom_size, ); @@ -531,7 +532,10 @@ pub fn uniwig_main( &core_results.0, file_name.clone(), chrom_name.clone(), - primary_start.0, + clamped_start_position_zero_pos( + primary_start.0, + 0, + ), stepsize, meta_data_file_names[2].clone(), ); @@ -546,7 +550,10 @@ pub fn uniwig_main( &core_results.0, file_name.clone(), chrom_name.clone(), - primary_start.0, + clamped_start_position_zero_pos( + primary_start.0, + 0, + ), stepsize, meta_data_file_names[2].clone(), ); diff --git a/gtars/tests/data/out/_core.wig b/gtars/tests/data/out/_core.wig index 81ae5e9..7142f6c 100644 --- a/gtars/tests/data/out/_core.wig +++ b/gtars/tests/data/out/_core.wig @@ -1,9 +1,10 @@ -fixedStep chrom=chr1 start=2 step=1 +fixedStep chrom=chr1 start=3 step=1 2 2 3 +4 +2 2 -1 2 1 1 diff --git a/gtars/tests/data/out/_end.wig b/gtars/tests/data/out/_end.wig index f3119c1..306e8c4 100644 --- a/gtars/tests/data/out/_end.wig +++ b/gtars/tests/data/out/_end.wig @@ -12,4 +12,5 @@ fixedStep chrom=chr1 start=5 step=1 0 0 0 +0 0 \ No newline at end of file diff --git a/gtars/tests/data/out/_start.wig b/gtars/tests/data/out/_start.wig index b08c334..a8481c0 100644 --- a/gtars/tests/data/out/_start.wig +++ b/gtars/tests/data/out/_start.wig @@ -1,4 +1,4 @@ -fixedStep chrom=chr1 start=1 step=1 +fixedStep chrom=chr1 start=2 step=1 2 2 3 @@ -16,4 +16,5 @@ fixedStep chrom=chr1 start=1 step=1 0 0 0 +0 0 \ No newline at end of file diff --git a/gtars/tests/test.rs b/gtars/tests/test.rs index e197edc..e869197 100644 --- a/gtars/tests/test.rs +++ b/gtars/tests/test.rs @@ -486,7 +486,6 @@ mod tests { &chromosome.ends, current_chrom_size, stepsize, - 0, ); } } @@ -507,7 +506,6 @@ mod tests { current_chrom_size, smooth_size, stepsize, - 0, ); } } @@ -675,8 +673,10 @@ mod tests { let tempbedpath = format!("{}{}", path_to_crate, "/tests/data/test5.bed"); let combinedbedpath = tempbedpath.as_str(); + //let combinedbedpath = "/home/drc/Downloads/unwig_testing_19dec2024/input/dummy4.bed"; let chromsizerefpath = combinedbedpath; + //let chromsizerefpath = "/home/drc/Downloads/unwig_testing_19dec2024/input/dummy.chrom.sizes"; let tempdir = tempfile::tempdir().unwrap(); let path = PathBuf::from(&tempdir.path()); @@ -685,8 +685,12 @@ mod tests { let bwfileheader_path = path.into_os_string().into_string().unwrap(); let bwfileheader = bwfileheader_path.as_str(); - let smoothsize: i32 = 5; + //let bwfileheader = "/home/drc/Downloads/unwig_testing_19dec2024/output/npy_output/"; + //let bwfileheader = "/home/drc/Downloads/unwig_testing_19dec2024/output/wig_output/"; + + let smoothsize: i32 = 2; let output_type = "npy"; + //let output_type = "wig"; let filetype = "bed"; let num_threads = 6; let score = false; From 27d52f5995ae9452de13de1f3ed43e195e9c2a99 Mon Sep 17 00:00:00 2001 From: Donald Campbell <125581724+donaldcampbelljr@users.noreply.github.com> Date: Thu, 9 Jan 2025 16:47:21 -0500 Subject: [PATCH 2/6] attempt to use shared hashmap for #65 does not work --- gtars/Cargo.toml | 1 + gtars/src/uniwig/mod.rs | 64 ++++++++++++++++++++++++++++--------- gtars/src/uniwig/writing.rs | 34 +++++++------------- gtars/tests/test.rs | 12 ++++--- 4 files changed, 68 insertions(+), 43 deletions(-) diff --git a/gtars/Cargo.toml b/gtars/Cargo.toml index 462af9a..a5708eb 100644 --- a/gtars/Cargo.toml +++ b/gtars/Cargo.toml @@ -32,6 +32,7 @@ base64-url = "2.0.0" sha2 = "0.10.7" md-5 = "0.10.5" seq_io = "0.3.2" +serde_json = "1.0.135" [dev-dependencies] diff --git a/gtars/src/uniwig/mod.rs b/gtars/src/uniwig/mod.rs index 38803c0..1f728ae 100644 --- a/gtars/src/uniwig/mod.rs +++ b/gtars/src/uniwig/mod.rs @@ -34,6 +34,7 @@ use std::str::FromStr; use std::sync::{Arc, Mutex}; use std::thread; use tokio::runtime; +use serde_json; pub mod cli; pub mod counting; @@ -248,9 +249,17 @@ pub fn uniwig_main( } }; + let mut npy_meta_data: HashMap> = HashMap::new(); + let mut arc_npy_meta_data = Arc::new(Mutex::new(npy_meta_data)); + let mut chromosome_data_clone = Arc::clone(&arc_npy_meta_data); + match input_filetype { //BED AND NARROWPEAK WORKFLOW Ok(FileType::BED) | Ok(FileType::NARROWPEAK) => { + // Pare down chromosomes if necessary + let mut final_chromosomes = + get_final_chromosomes(&input_filetype, filepath, &chrom_sizes, score); + // Some housekeeping depending on output type let og_output_type = output_type; // need this later for conversion let mut output_type = output_type; @@ -258,9 +267,25 @@ pub fn uniwig_main( output_type = "bedGraph" // we must create bedgraphs first before creating bigwig files } - // Pare down chromosomes if necessary - let mut final_chromosomes = - get_final_chromosomes(&input_filetype, filepath, &chrom_sizes, score); + if output_type == "npy"{ + // populate hashmap for the npy meta data + let mut arc_npy_meta_data_locked = arc_npy_meta_data.lock().unwrap(); + for chromosome in final_chromosomes.iter(){ + let chr_name = chromosome.chrom.clone(); + let current_chrom_size = + *chrom_sizes.get(&chromosome.chrom).unwrap() as i32; + + arc_npy_meta_data_locked.insert( + chr_name, + HashMap::from([ + ("stepsize".to_string(), stepsize), + ("reported_chrom_size".to_string(), current_chrom_size), + ]), + ); + + } + + } let bar = ProgressBar::new(final_chromosomes.len() as u64); @@ -348,6 +373,7 @@ pub fn uniwig_main( "{}{}_{}.{}", bwfileheader, chrom_name, "start", output_type ); + write_to_npy_file( &count_result.0, file_name.clone(), @@ -356,8 +382,8 @@ pub fn uniwig_main( primary_start.0, smoothsize, ), - stepsize, - meta_data_file_names[0].clone(), + &mut chromosome_data_clone, + "start", ); } _ => { @@ -374,8 +400,8 @@ pub fn uniwig_main( primary_start.0, smoothsize, ), - stepsize, - meta_data_file_names[0].clone(), + &mut chromosome_data_clone, + "start", ); } } @@ -449,8 +475,8 @@ pub fn uniwig_main( smoothsize, 0 ), - stepsize, - meta_data_file_names[1].clone(), + &mut chromosome_data_clone, + "end", ); } _ => { @@ -468,8 +494,8 @@ pub fn uniwig_main( smoothsize, 0 ), - stepsize, - meta_data_file_names[1].clone(), + &mut chromosome_data_clone, + "end", ); } } @@ -536,8 +562,8 @@ pub fn uniwig_main( primary_start.0, 0, ), - stepsize, - meta_data_file_names[2].clone(), + &mut chromosome_data_clone, + "core", ); } _ => { @@ -554,8 +580,8 @@ pub fn uniwig_main( primary_start.0, 0, ), - stepsize, - meta_data_file_names[2].clone(), + &mut chromosome_data_clone, + "core", ); } } @@ -587,6 +613,14 @@ pub fn uniwig_main( ); } } + "npy" => { + //write combined metadata + let json_string = serde_json::to_string_pretty(&npy_meta_data).unwrap(); + let combined_npy_meta_file_path = format!("{}{}.{}", bwfileheader, "npy_meta", "json"); + let mut file = File::create(combined_npy_meta_file_path).unwrap(); + file.write_all(json_string.as_bytes()).unwrap(); + + } _ => {} } bar.finish(); diff --git a/gtars/src/uniwig/writing.rs b/gtars/src/uniwig/writing.rs index baebb37..14b82c9 100644 --- a/gtars/src/uniwig/writing.rs +++ b/gtars/src/uniwig/writing.rs @@ -8,16 +8,20 @@ use std::fs::{create_dir_all, remove_file, File, OpenOptions}; use std::io::{BufWriter, Write}; use std::path::PathBuf; use std::{fs, io}; +use std::collections::HashMap; +use std::sync::{Arc, Mutex}; -/// Write output to npy files +/// Write output to npy files AND update the meta_data hashmap pub fn write_to_npy_file( counts: &[u32], filename: String, chromname: String, start_position: i32, - stepsize: i32, - metafilename: String, + npy_meta_data_map: &mut Arc>>>, + out_selection: &str, ) { + let mut chromosome_data_guard = npy_meta_data_map.lock().unwrap(); + // For future reference `&Vec` is a SLICE and thus we must use the `to_vec` function below when creating an array // https://users.rust-lang.org/t/why-does-std-to-vec-exist/45893/9 @@ -25,27 +29,11 @@ pub fn write_to_npy_file( let arr = Array::from_vec(counts.to_vec()); let _ = write_npy(filename, &arr); - // Write to the metadata file. - // Note: there should be a single metadata file for starts, ends and core - - let path = std::path::Path::new(&metafilename).parent().unwrap(); - let _ = create_dir_all(path); - - let mut file = OpenOptions::new() - .create(true) // Create the file if it doesn't exist - .append(true) // Append data to the existing file if it does exist - .open(metafilename) - .unwrap(); + // Write to the metadata hashmap + if let Some(current_chr_data) = chromosome_data_guard.get_mut(chromname.as_str()) { + current_chr_data.insert(out_selection.to_string(), start_position); + } - // The original wiggle file header. This can be anything we wish it to be. Currently space delimited. - let mut wig_header = "fixedStep chrom=".to_string() - + chromname.as_str() - + " start=" - + start_position.to_string().as_str() - + " step=" - + stepsize.to_string().as_str(); - wig_header.push('\n'); - file.write_all(wig_header.as_ref()).unwrap(); } /// Write either combined bedGraph, wiggle files, and bed files diff --git a/gtars/tests/test.rs b/gtars/tests/test.rs index e869197..433ce80 100644 --- a/gtars/tests/test.rs +++ b/gtars/tests/test.rs @@ -673,10 +673,12 @@ mod tests { let tempbedpath = format!("{}{}", path_to_crate, "/tests/data/test5.bed"); let combinedbedpath = tempbedpath.as_str(); - //let combinedbedpath = "/home/drc/Downloads/unwig_testing_19dec2024/input/dummy4.bed"; + let combinedbedpath = "/home/drc/Downloads/unwig_testing_19dec2024/input/dummy3.bed"; + //let combinedbedpath = "/home/drc/Downloads/unwig_testing_19dec2024/input/chr1415.bed"; let chromsizerefpath = combinedbedpath; - //let chromsizerefpath = "/home/drc/Downloads/unwig_testing_19dec2024/input/dummy.chrom.sizes"; + let chromsizerefpath = "/home/drc/Downloads/unwig_testing_19dec2024/input/dummy.chrom.sizes"; + //let chromsizerefpath = "/home/drc/Downloads/unwig_testing_19dec2024/input/test.chrom.sizes"; let tempdir = tempfile::tempdir().unwrap(); let path = PathBuf::from(&tempdir.path()); @@ -685,16 +687,16 @@ mod tests { let bwfileheader_path = path.into_os_string().into_string().unwrap(); let bwfileheader = bwfileheader_path.as_str(); - //let bwfileheader = "/home/drc/Downloads/unwig_testing_19dec2024/output/npy_output/"; + let bwfileheader = "/home/drc/Downloads/unwig_testing_19dec2024/output/npy_output/"; //let bwfileheader = "/home/drc/Downloads/unwig_testing_19dec2024/output/wig_output/"; - let smoothsize: i32 = 2; + let smoothsize: i32 = 10; let output_type = "npy"; //let output_type = "wig"; let filetype = "bed"; let num_threads = 6; let score = false; - let stepsize = 1; + let stepsize = 5; let zoom = 0; let vec_count_type = vec!["start", "end", "core"]; From 391ba686a8adbc924abfb500f8a782af596fd994 Mon Sep 17 00:00:00 2001 From: Donald Campbell <125581724+donaldcampbelljr@users.noreply.github.com> Date: Thu, 9 Jan 2025 16:52:03 -0500 Subject: [PATCH 3/6] Revert "attempt to use shared hashmap for #65 does not work" This reverts commit 27d52f5995ae9452de13de1f3ed43e195e9c2a99. --- gtars/Cargo.toml | 1 - gtars/src/uniwig/mod.rs | 64 +++++++++---------------------------- gtars/src/uniwig/writing.rs | 34 +++++++++++++------- gtars/tests/test.rs | 12 +++---- 4 files changed, 43 insertions(+), 68 deletions(-) diff --git a/gtars/Cargo.toml b/gtars/Cargo.toml index a5708eb..462af9a 100644 --- a/gtars/Cargo.toml +++ b/gtars/Cargo.toml @@ -32,7 +32,6 @@ base64-url = "2.0.0" sha2 = "0.10.7" md-5 = "0.10.5" seq_io = "0.3.2" -serde_json = "1.0.135" [dev-dependencies] diff --git a/gtars/src/uniwig/mod.rs b/gtars/src/uniwig/mod.rs index 1f728ae..38803c0 100644 --- a/gtars/src/uniwig/mod.rs +++ b/gtars/src/uniwig/mod.rs @@ -34,7 +34,6 @@ use std::str::FromStr; use std::sync::{Arc, Mutex}; use std::thread; use tokio::runtime; -use serde_json; pub mod cli; pub mod counting; @@ -249,17 +248,9 @@ pub fn uniwig_main( } }; - let mut npy_meta_data: HashMap> = HashMap::new(); - let mut arc_npy_meta_data = Arc::new(Mutex::new(npy_meta_data)); - let mut chromosome_data_clone = Arc::clone(&arc_npy_meta_data); - match input_filetype { //BED AND NARROWPEAK WORKFLOW Ok(FileType::BED) | Ok(FileType::NARROWPEAK) => { - // Pare down chromosomes if necessary - let mut final_chromosomes = - get_final_chromosomes(&input_filetype, filepath, &chrom_sizes, score); - // Some housekeeping depending on output type let og_output_type = output_type; // need this later for conversion let mut output_type = output_type; @@ -267,25 +258,9 @@ pub fn uniwig_main( output_type = "bedGraph" // we must create bedgraphs first before creating bigwig files } - if output_type == "npy"{ - // populate hashmap for the npy meta data - let mut arc_npy_meta_data_locked = arc_npy_meta_data.lock().unwrap(); - for chromosome in final_chromosomes.iter(){ - let chr_name = chromosome.chrom.clone(); - let current_chrom_size = - *chrom_sizes.get(&chromosome.chrom).unwrap() as i32; - - arc_npy_meta_data_locked.insert( - chr_name, - HashMap::from([ - ("stepsize".to_string(), stepsize), - ("reported_chrom_size".to_string(), current_chrom_size), - ]), - ); - - } - - } + // Pare down chromosomes if necessary + let mut final_chromosomes = + get_final_chromosomes(&input_filetype, filepath, &chrom_sizes, score); let bar = ProgressBar::new(final_chromosomes.len() as u64); @@ -373,7 +348,6 @@ pub fn uniwig_main( "{}{}_{}.{}", bwfileheader, chrom_name, "start", output_type ); - write_to_npy_file( &count_result.0, file_name.clone(), @@ -382,8 +356,8 @@ pub fn uniwig_main( primary_start.0, smoothsize, ), - &mut chromosome_data_clone, - "start", + stepsize, + meta_data_file_names[0].clone(), ); } _ => { @@ -400,8 +374,8 @@ pub fn uniwig_main( primary_start.0, smoothsize, ), - &mut chromosome_data_clone, - "start", + stepsize, + meta_data_file_names[0].clone(), ); } } @@ -475,8 +449,8 @@ pub fn uniwig_main( smoothsize, 0 ), - &mut chromosome_data_clone, - "end", + stepsize, + meta_data_file_names[1].clone(), ); } _ => { @@ -494,8 +468,8 @@ pub fn uniwig_main( smoothsize, 0 ), - &mut chromosome_data_clone, - "end", + stepsize, + meta_data_file_names[1].clone(), ); } } @@ -562,8 +536,8 @@ pub fn uniwig_main( primary_start.0, 0, ), - &mut chromosome_data_clone, - "core", + stepsize, + meta_data_file_names[2].clone(), ); } _ => { @@ -580,8 +554,8 @@ pub fn uniwig_main( primary_start.0, 0, ), - &mut chromosome_data_clone, - "core", + stepsize, + meta_data_file_names[2].clone(), ); } } @@ -613,14 +587,6 @@ pub fn uniwig_main( ); } } - "npy" => { - //write combined metadata - let json_string = serde_json::to_string_pretty(&npy_meta_data).unwrap(); - let combined_npy_meta_file_path = format!("{}{}.{}", bwfileheader, "npy_meta", "json"); - let mut file = File::create(combined_npy_meta_file_path).unwrap(); - file.write_all(json_string.as_bytes()).unwrap(); - - } _ => {} } bar.finish(); diff --git a/gtars/src/uniwig/writing.rs b/gtars/src/uniwig/writing.rs index 14b82c9..baebb37 100644 --- a/gtars/src/uniwig/writing.rs +++ b/gtars/src/uniwig/writing.rs @@ -8,20 +8,16 @@ use std::fs::{create_dir_all, remove_file, File, OpenOptions}; use std::io::{BufWriter, Write}; use std::path::PathBuf; use std::{fs, io}; -use std::collections::HashMap; -use std::sync::{Arc, Mutex}; -/// Write output to npy files AND update the meta_data hashmap +/// Write output to npy files pub fn write_to_npy_file( counts: &[u32], filename: String, chromname: String, start_position: i32, - npy_meta_data_map: &mut Arc>>>, - out_selection: &str, + stepsize: i32, + metafilename: String, ) { - let mut chromosome_data_guard = npy_meta_data_map.lock().unwrap(); - // For future reference `&Vec` is a SLICE and thus we must use the `to_vec` function below when creating an array // https://users.rust-lang.org/t/why-does-std-to-vec-exist/45893/9 @@ -29,11 +25,27 @@ pub fn write_to_npy_file( let arr = Array::from_vec(counts.to_vec()); let _ = write_npy(filename, &arr); - // Write to the metadata hashmap - if let Some(current_chr_data) = chromosome_data_guard.get_mut(chromname.as_str()) { - current_chr_data.insert(out_selection.to_string(), start_position); - } + // Write to the metadata file. + // Note: there should be a single metadata file for starts, ends and core + + let path = std::path::Path::new(&metafilename).parent().unwrap(); + let _ = create_dir_all(path); + + let mut file = OpenOptions::new() + .create(true) // Create the file if it doesn't exist + .append(true) // Append data to the existing file if it does exist + .open(metafilename) + .unwrap(); + // The original wiggle file header. This can be anything we wish it to be. Currently space delimited. + let mut wig_header = "fixedStep chrom=".to_string() + + chromname.as_str() + + " start=" + + start_position.to_string().as_str() + + " step=" + + stepsize.to_string().as_str(); + wig_header.push('\n'); + file.write_all(wig_header.as_ref()).unwrap(); } /// Write either combined bedGraph, wiggle files, and bed files diff --git a/gtars/tests/test.rs b/gtars/tests/test.rs index 433ce80..e869197 100644 --- a/gtars/tests/test.rs +++ b/gtars/tests/test.rs @@ -673,12 +673,10 @@ mod tests { let tempbedpath = format!("{}{}", path_to_crate, "/tests/data/test5.bed"); let combinedbedpath = tempbedpath.as_str(); - let combinedbedpath = "/home/drc/Downloads/unwig_testing_19dec2024/input/dummy3.bed"; - //let combinedbedpath = "/home/drc/Downloads/unwig_testing_19dec2024/input/chr1415.bed"; + //let combinedbedpath = "/home/drc/Downloads/unwig_testing_19dec2024/input/dummy4.bed"; let chromsizerefpath = combinedbedpath; - let chromsizerefpath = "/home/drc/Downloads/unwig_testing_19dec2024/input/dummy.chrom.sizes"; - //let chromsizerefpath = "/home/drc/Downloads/unwig_testing_19dec2024/input/test.chrom.sizes"; + //let chromsizerefpath = "/home/drc/Downloads/unwig_testing_19dec2024/input/dummy.chrom.sizes"; let tempdir = tempfile::tempdir().unwrap(); let path = PathBuf::from(&tempdir.path()); @@ -687,16 +685,16 @@ mod tests { let bwfileheader_path = path.into_os_string().into_string().unwrap(); let bwfileheader = bwfileheader_path.as_str(); - let bwfileheader = "/home/drc/Downloads/unwig_testing_19dec2024/output/npy_output/"; + //let bwfileheader = "/home/drc/Downloads/unwig_testing_19dec2024/output/npy_output/"; //let bwfileheader = "/home/drc/Downloads/unwig_testing_19dec2024/output/wig_output/"; - let smoothsize: i32 = 10; + let smoothsize: i32 = 2; let output_type = "npy"; //let output_type = "wig"; let filetype = "bed"; let num_threads = 6; let score = false; - let stepsize = 5; + let stepsize = 1; let zoom = 0; let vec_count_type = vec!["start", "end", "core"]; From 5f5973bfd216512652579b668bc8fb2dfc21a328 Mon Sep 17 00:00:00 2001 From: Donald Campbell <125581724+donaldcampbelljr@users.noreply.github.com> Date: Thu, 9 Jan 2025 17:46:46 -0500 Subject: [PATCH 4/6] working solution for #65 --- gtars/Cargo.toml | 1 + gtars/src/uniwig/mod.rs | 65 +++++++++++++++++++++++++++++++++++++---- gtars/tests/test.rs | 6 ---- 3 files changed, 61 insertions(+), 11 deletions(-) diff --git a/gtars/Cargo.toml b/gtars/Cargo.toml index 462af9a..a5708eb 100644 --- a/gtars/Cargo.toml +++ b/gtars/Cargo.toml @@ -32,6 +32,7 @@ base64-url = "2.0.0" sha2 = "0.10.7" md-5 = "0.10.5" seq_io = "0.3.2" +serde_json = "1.0.135" [dev-dependencies] diff --git a/gtars/src/uniwig/mod.rs b/gtars/src/uniwig/mod.rs index 38803c0..9feb9d5 100644 --- a/gtars/src/uniwig/mod.rs +++ b/gtars/src/uniwig/mod.rs @@ -5,7 +5,7 @@ use indicatif::ProgressBar; use rayon::prelude::*; use std::error::Error; -use std::fs::File; +use std::fs::{remove_file, File}; use std::io::{BufRead, BufReader, BufWriter, Write}; use crate::uniwig::counting::{ @@ -237,6 +237,8 @@ pub fn uniwig_main( meta_data_file_names[1] = format!("{}{}.{}", bwfileheader, "end", "meta"); meta_data_file_names[2] = format!("{}{}.{}", bwfileheader, "core", "meta"); + let mut npy_meta_data_map: HashMap> = HashMap::new(); + let chrom_sizes = match read_chromosome_sizes(chromsizerefpath) { // original program gets chromosome size from a .sizes file, e.g. chr1 248956422 // the original program simply pushes 0's until the end of the chromosome length and writes these to file. @@ -251,6 +253,10 @@ pub fn uniwig_main( match input_filetype { //BED AND NARROWPEAK WORKFLOW Ok(FileType::BED) | Ok(FileType::NARROWPEAK) => { + // Pare down chromosomes if necessary + let mut final_chromosomes = + get_final_chromosomes(&input_filetype, filepath, &chrom_sizes, score); + // Some housekeeping depending on output type let og_output_type = output_type; // need this later for conversion let mut output_type = output_type; @@ -258,10 +264,6 @@ pub fn uniwig_main( output_type = "bedGraph" // we must create bedgraphs first before creating bigwig files } - // Pare down chromosomes if necessary - let mut final_chromosomes = - get_final_chromosomes(&input_filetype, filepath, &chrom_sizes, score); - let bar = ProgressBar::new(final_chromosomes.len() as u64); // Pool installs iterator via rayon crate @@ -587,6 +589,59 @@ pub fn uniwig_main( ); } } + "npy" =>{ + // populate hashmap for the npy meta data + for chromosome in final_chromosomes.iter(){ + let chr_name = chromosome.chrom.clone(); + let current_chrom_size = + *chrom_sizes.get(&chromosome.chrom).unwrap() as i32; + npy_meta_data_map.insert( + chr_name, + HashMap::from([ + ("stepsize".to_string(), stepsize), + ("reported_chrom_size".to_string(), current_chrom_size), + ]), + ); + } + + for location in vec_count_type.iter() { + + let temp_meta_file_name = format!("{}{}.{}", bwfileheader, *location, "meta"); + + if let Ok(file) = File::open(&temp_meta_file_name) { + + let reader = BufReader::new(file); + + for line in reader.lines() { + let line = line.unwrap(); + let parts: Vec<&str> = line.split_whitespace().collect(); + if parts.len() >= 3 { + let chrom = parts[1].split('=') + .nth(1) + .expect("Processing npy metadata file: Missing chromosome in line"); + let start_str = parts[2].split('=') + .nth(1) + .expect("Processing npy metadata file: Missing start position in line"); + let starting_position: i32 = start_str.parse().expect("Processing npy metadata file: Invalid start position"); + + if let Some(current_chr_data) = npy_meta_data_map.get_mut(chrom) { + current_chr_data.insert((*location.to_string()).parse().unwrap(), starting_position); + } + } + } + // Remove the file after it is used. + let path = std::path::Path::new(&temp_meta_file_name); + let _ = remove_file(path).unwrap(); + } + + } + //write combined metadata as json + let json_string = serde_json::to_string_pretty(&npy_meta_data_map).unwrap(); + let combined_npy_meta_file_path = format!("{}{}.{}", bwfileheader, "npy_meta", "json"); + let mut file = File::create(combined_npy_meta_file_path).unwrap(); + file.write_all(json_string.as_bytes()).unwrap(); + + } _ => {} } bar.finish(); diff --git a/gtars/tests/test.rs b/gtars/tests/test.rs index e869197..5eb8993 100644 --- a/gtars/tests/test.rs +++ b/gtars/tests/test.rs @@ -673,10 +673,8 @@ mod tests { let tempbedpath = format!("{}{}", path_to_crate, "/tests/data/test5.bed"); let combinedbedpath = tempbedpath.as_str(); - //let combinedbedpath = "/home/drc/Downloads/unwig_testing_19dec2024/input/dummy4.bed"; let chromsizerefpath = combinedbedpath; - //let chromsizerefpath = "/home/drc/Downloads/unwig_testing_19dec2024/input/dummy.chrom.sizes"; let tempdir = tempfile::tempdir().unwrap(); let path = PathBuf::from(&tempdir.path()); @@ -685,12 +683,8 @@ mod tests { let bwfileheader_path = path.into_os_string().into_string().unwrap(); let bwfileheader = bwfileheader_path.as_str(); - //let bwfileheader = "/home/drc/Downloads/unwig_testing_19dec2024/output/npy_output/"; - //let bwfileheader = "/home/drc/Downloads/unwig_testing_19dec2024/output/wig_output/"; - let smoothsize: i32 = 2; let output_type = "npy"; - //let output_type = "wig"; let filetype = "bed"; let num_threads = 6; let score = false; From 8ae3d414d11b9272a78ae63f3e8d69eec5eff3d0 Mon Sep 17 00:00:00 2001 From: Donald Campbell <125581724+donaldcampbelljr@users.noreply.github.com> Date: Thu, 9 Jan 2025 17:47:05 -0500 Subject: [PATCH 5/6] cargo fmt --- gtars/src/common/utils.rs | 5 ++--- gtars/src/digests/mod.rs | 18 +++++++-------- gtars/src/uniwig/counting.rs | 1 - gtars/src/uniwig/mod.rs | 43 +++++++++++++++++++----------------- 4 files changed, 34 insertions(+), 33 deletions(-) diff --git a/gtars/src/common/utils.rs b/gtars/src/common/utils.rs index f1d5fc1..4a4bec0 100644 --- a/gtars/src/common/utils.rs +++ b/gtars/src/common/utils.rs @@ -39,8 +39,8 @@ pub fn get_dynamic_reader(path: &Path) -> Result>> { /// /// - file_path: path to the file to read, or '-' for stdin /// -/// # Returns -/// +/// # Returns +/// /// A `BufReader` object for a given file path or stdin. pub fn get_dynamic_reader_w_stdin(file_path_str: &str) -> Result>> { if file_path_str == "-" { @@ -51,7 +51,6 @@ pub fn get_dynamic_reader_w_stdin(file_path_str: &str) -> Result String { format!("{:x}", result) } - /// Processes a FASTA file to compute the digests of each sequence in the file. /// /// This function reads a FASTA file, computes the SHA-512 and MD5 digests for each sequence, @@ -103,7 +102,8 @@ pub fn digest_fasta(file_path: &str) -> Result> { let file_reader = get_dynamic_reader(&path)?; let mut fasta_reader = Reader::new(file_reader); let mut results = Vec::new(); - while let Some(record) = fasta_reader.next() { // returns a RefRecord object + while let Some(record) = fasta_reader.next() { + // returns a RefRecord object let record = record.expect("Error found when retrieving next record."); let id = record.id().expect("No ID found for the FASTA record"); let mut sha512_hasher = Sha512::new(); @@ -123,7 +123,7 @@ pub fn digest_fasta(file_path: &str) -> Result> { id: id.to_string(), length: length, sha512t24u: sha512, - md5: md5 + md5: md5, }); } Ok(results) @@ -169,10 +169,10 @@ mod tests { assert_eq!(results[0].sha512t24u, "iYtREV555dUFKg2_agSJW6suquUyPpMw"); assert_eq!(results[0].md5, "5f63cfaa3ef61f88c9635fb9d18ec945"); } - + #[test] fn bogus_fasta_file() { let result = digest_fasta("tests/data/bogus.fa"); assert!(result.is_err(), "Expected an error for a bogus fasta file"); } -} \ No newline at end of file +} diff --git a/gtars/src/uniwig/counting.rs b/gtars/src/uniwig/counting.rs index 1165c9f..4b3415d 100644 --- a/gtars/src/uniwig/counting.rs +++ b/gtars/src/uniwig/counting.rs @@ -35,7 +35,6 @@ pub fn start_end_counts( smoothsize: i32, stepsize: i32, ) -> (Vec, Vec) { - let mut v_coordinate_positions: Vec = Vec::new(); // these are the final coordinates after any adjustments let mut v_coord_counts: Vec = Vec::new(); // u8 stores 0:255 This may be insufficient. u16 max is 65535 diff --git a/gtars/src/uniwig/mod.rs b/gtars/src/uniwig/mod.rs index 9feb9d5..7b364cc 100644 --- a/gtars/src/uniwig/mod.rs +++ b/gtars/src/uniwig/mod.rs @@ -191,7 +191,7 @@ pub fn run_uniwig(matches: &ArgMatches) { } /// Ensures that the start position is at a minimum equal to `1` -fn clamped_start_position(start: i32, smoothsize: i32, wig_shift:i32) -> i32 { +fn clamped_start_position(start: i32, smoothsize: i32, wig_shift: i32) -> i32 { std::cmp::max(1, start - smoothsize + wig_shift) } /// Ensure that the start position is at a minimum equal to `0` @@ -222,7 +222,6 @@ pub fn uniwig_main( .build() .unwrap(); - // Determine Input File Type let input_filetype = FileType::from_str(filetype.to_lowercase().as_str()); // Set up output file names @@ -319,7 +318,7 @@ pub fn uniwig_main( clamped_start_position( primary_start.0, smoothsize, - 1 //must shift wiggle starts and core by 1 since it is 1 based + 1, //must shift wiggle starts and core by 1 since it is 1 based ), stepsize, current_chrom_size, @@ -449,7 +448,7 @@ pub fn uniwig_main( clamped_start_position( primary_end.0, smoothsize, - 0 + 0, ), stepsize, meta_data_file_names[1].clone(), @@ -468,7 +467,7 @@ pub fn uniwig_main( clamped_start_position( primary_end.0, smoothsize, - 0 + 0, ), stepsize, meta_data_file_names[1].clone(), @@ -520,7 +519,7 @@ pub fn uniwig_main( &core_results.0, file_name.clone(), chrom_name.clone(), - clamped_start_position(primary_start.0, 0,1), //starts are 1 based must be shifted by 1 + clamped_start_position(primary_start.0, 0, 1), //starts are 1 based must be shifted by 1 stepsize, current_chrom_size, ); @@ -589,9 +588,9 @@ pub fn uniwig_main( ); } } - "npy" =>{ + "npy" => { // populate hashmap for the npy meta data - for chromosome in final_chromosomes.iter(){ + for chromosome in final_chromosomes.iter() { let chr_name = chromosome.chrom.clone(); let current_chrom_size = *chrom_sizes.get(&chromosome.chrom).unwrap() as i32; @@ -605,27 +604,32 @@ pub fn uniwig_main( } for location in vec_count_type.iter() { - - let temp_meta_file_name = format!("{}{}.{}", bwfileheader, *location, "meta"); + let temp_meta_file_name = + format!("{}{}.{}", bwfileheader, *location, "meta"); if let Ok(file) = File::open(&temp_meta_file_name) { - let reader = BufReader::new(file); for line in reader.lines() { let line = line.unwrap(); let parts: Vec<&str> = line.split_whitespace().collect(); if parts.len() >= 3 { - let chrom = parts[1].split('=') - .nth(1) - .expect("Processing npy metadata file: Missing chromosome in line"); + let chrom = parts[1].split('=').nth(1).expect( + "Processing npy metadata file: Missing chromosome in line", + ); let start_str = parts[2].split('=') .nth(1) .expect("Processing npy metadata file: Missing start position in line"); - let starting_position: i32 = start_str.parse().expect("Processing npy metadata file: Invalid start position"); + let starting_position: i32 = start_str.parse().expect( + "Processing npy metadata file: Invalid start position", + ); - if let Some(current_chr_data) = npy_meta_data_map.get_mut(chrom) { - current_chr_data.insert((*location.to_string()).parse().unwrap(), starting_position); + if let Some(current_chr_data) = npy_meta_data_map.get_mut(chrom) + { + current_chr_data.insert( + (*location.to_string()).parse().unwrap(), + starting_position, + ); } } } @@ -633,14 +637,13 @@ pub fn uniwig_main( let path = std::path::Path::new(&temp_meta_file_name); let _ = remove_file(path).unwrap(); } - } //write combined metadata as json let json_string = serde_json::to_string_pretty(&npy_meta_data_map).unwrap(); - let combined_npy_meta_file_path = format!("{}{}.{}", bwfileheader, "npy_meta", "json"); + let combined_npy_meta_file_path = + format!("{}{}.{}", bwfileheader, "npy_meta", "json"); let mut file = File::create(combined_npy_meta_file_path).unwrap(); file.write_all(json_string.as_bytes()).unwrap(); - } _ => {} } From bb5bc897ca4d11435d9c0c63780c16a2c1faa810 Mon Sep 17 00:00:00 2001 From: Donald Campbell <125581724+donaldcampbelljr@users.noreply.github.com> Date: Fri, 10 Jan 2025 11:42:05 -0500 Subject: [PATCH 6/6] comment out r-devel test --- .github/workflows/R-CMD-check.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/R-CMD-check.yaml b/.github/workflows/R-CMD-check.yaml index 15a1bce..1d20979 100644 --- a/.github/workflows/R-CMD-check.yaml +++ b/.github/workflows/R-CMD-check.yaml @@ -19,7 +19,7 @@ jobs: # - {os: windows-latest, r: 'release', rust-version: 'stable-msvc', rust-target: 'x86_64-pc-windows-gnu'} - {os: macOS-latest, r: 'release', rust-version: 'stable'} - {os: ubuntu-latest, r: 'release', rust-version: 'stable'} - - {os: ubuntu-latest, r: 'devel', rust-version: 'stable'} + #- {os: ubuntu-latest, r: 'devel', rust-version: 'stable'} env: R_REMOTES_NO_ERRORS_FROM_WARNINGS: true steps: