Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fixes for #64 and #65 #66

Open
wants to merge 5 commits into
base: dev
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions gtars/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ base64-url = "2.0.0"
sha2 = "0.10.7"
md-5 = "0.10.5"
seq_io = "0.3.2"
serde_json = "1.0.135"


[dev-dependencies]
Expand Down
5 changes: 2 additions & 3 deletions gtars/src/common/utils.rs
Original file line number Diff line number Diff line change
Expand Up @@ -39,8 +39,8 @@ pub fn get_dynamic_reader(path: &Path) -> Result<BufReader<Box<dyn Read>>> {
///
/// - file_path: path to the file to read, or '-' for stdin
///
/// # Returns
///
/// # Returns
///
/// A `BufReader` object for a given file path or stdin.
pub fn get_dynamic_reader_w_stdin(file_path_str: &str) -> Result<BufReader<Box<dyn Read>>> {
if file_path_str == "-" {
Expand All @@ -51,7 +51,6 @@ pub fn get_dynamic_reader_w_stdin(file_path_str: &str) -> Result<BufReader<Box<d
}
}


///
/// Create a region-to-id hash-map from a list of regions
///
Expand Down
18 changes: 9 additions & 9 deletions gtars/src/digests/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -13,21 +13,21 @@
//! # Usage
//!
//! The `sha512t24u` function can be used to compute the GA4GH sha512t24 checksum of a string.
//!
//!
//! ```rust
//! use gtars::digests::sha512t24u;
//!
//! let digest = sha512t24u("hello world");
//! ```
use std::io::prelude::{Read, Write};
use std::io;
use std::fs::File;
use std::io;
use std::io::prelude::{Read, Write};
use std::path::Path;

use anyhow::Result;
use md5::Md5;
use seq_io::fasta::{Reader, Record, RefRecord};
use sha2::{Digest, Sha512};
use seq_io::fasta::{Reader, RefRecord, Record};

use crate::common::utils::get_dynamic_reader;

Expand Down Expand Up @@ -75,7 +75,6 @@ pub fn md5(string: &str) -> String {
format!("{:x}", result)
}


/// Processes a FASTA file to compute the digests of each sequence in the file.
///
/// This function reads a FASTA file, computes the SHA-512 and MD5 digests for each sequence,
Expand Down Expand Up @@ -103,7 +102,8 @@ pub fn digest_fasta(file_path: &str) -> Result<Vec<DigestResult>> {
let file_reader = get_dynamic_reader(&path)?;
let mut fasta_reader = Reader::new(file_reader);
let mut results = Vec::new();
while let Some(record) = fasta_reader.next() { // returns a RefRecord object
while let Some(record) = fasta_reader.next() {
// returns a RefRecord object
let record = record.expect("Error found when retrieving next record.");
let id = record.id().expect("No ID found for the FASTA record");
let mut sha512_hasher = Sha512::new();
Expand All @@ -123,7 +123,7 @@ pub fn digest_fasta(file_path: &str) -> Result<Vec<DigestResult>> {
id: id.to_string(),
length: length,
sha512t24u: sha512,
md5: md5
md5: md5,
});
}
Ok(results)
Expand Down Expand Up @@ -169,10 +169,10 @@ mod tests {
assert_eq!(results[0].sha512t24u, "iYtREV555dUFKg2_agSJW6suquUyPpMw");
assert_eq!(results[0].md5, "5f63cfaa3ef61f88c9635fb9d18ec945");
}

#[test]
fn bogus_fasta_file() {
let result = digest_fasta("tests/data/bogus.fa");
assert!(result.is_err(), "Expected an error for a bogus fasta file");
}
}
}
12 changes: 4 additions & 8 deletions gtars/src/uniwig/counting.rs
Original file line number Diff line number Diff line change
Expand Up @@ -34,10 +34,7 @@ pub fn start_end_counts(
chrom_size: i32,
smoothsize: i32,
stepsize: i32,
shift: i32,
) -> (Vec<u32>, Vec<i32>) {
//let vin_iter = starts_vector.iter();

let mut v_coordinate_positions: Vec<i32> = Vec::new(); // these are the final coordinates after any adjustments
let mut v_coord_counts: Vec<u32> = Vec::new(); // u8 stores 0:255 This may be insufficient. u16 max is 65535

Expand All @@ -55,7 +52,7 @@ pub fn start_end_counts(

adjusted_start_site = starts_vector[0]; // get first coordinate position

adjusted_start_site.0 = adjusted_start_site.0 - smoothsize + shift;
adjusted_start_site.0 = adjusted_start_site.0 - smoothsize;

current_end_site = adjusted_start_site;
current_end_site.0 = adjusted_start_site.0 + 1 + smoothsize * 2;
Expand All @@ -74,7 +71,7 @@ pub fn start_end_counts(
coordinate_value = *coord;

adjusted_start_site = coordinate_value;
adjusted_start_site.0 = coordinate_value.0 - smoothsize + shift;
adjusted_start_site.0 = coordinate_value.0 - smoothsize;

let current_score = adjusted_start_site.1;

Expand Down Expand Up @@ -164,7 +161,6 @@ pub fn core_counts(
ends_vector: &[(i32, i32)],
chrom_size: i32,
stepsize: i32,
shift: i32,
) -> (Vec<u32>, Vec<i32>) {
let mut v_coordinate_positions: Vec<i32> = Vec::new(); // these are the final coordinates after any adjustments
let mut v_coord_counts: Vec<u32> = Vec::new(); // u8 stores 0:255 This may be insufficient. u16 max is 65535
Expand All @@ -184,7 +180,7 @@ pub fn core_counts(
current_start_site = starts_vector[0]; // get first coordinate position
current_end_site = ends_vector[0];

current_start_site.0 = current_start_site.0 + shift;
current_start_site.0 = current_start_site.0;

if current_start_site.0 < 1 {
current_start_site.0 = 1;
Expand All @@ -201,7 +197,7 @@ pub fn core_counts(

current_start_site = coordinate_value;

current_start_site.0 = current_start_site.0 + shift;
current_start_site.0 = current_start_site.0;

let current_score = current_start_site.1;
count += current_score;
Expand Down
103 changes: 84 additions & 19 deletions gtars/src/uniwig/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@

use rayon::prelude::*;
use std::error::Error;
use std::fs::File;
use std::fs::{remove_file, File};
use std::io::{BufRead, BufReader, BufWriter, Write};

use crate::uniwig::counting::{
Expand Down Expand Up @@ -191,8 +191,8 @@
}

/// Ensures that the start position is at a minimum equal to `1`
fn clamped_start_position(start: i32, smoothsize: i32) -> i32 {
std::cmp::max(1, start - smoothsize)
fn clamped_start_position(start: i32, smoothsize: i32, wig_shift: i32) -> i32 {
std::cmp::max(1, start - smoothsize + wig_shift)
}
/// Ensure that the start position is at a minimum equal to `0`
fn clamped_start_position_zero_pos(start: i32, smoothsize: i32) -> i32 {
Expand Down Expand Up @@ -222,8 +222,6 @@
.build()
.unwrap();

let mut wig_shift: i32 = 0; // This will be set to 1 when writing to wiggle files, else always set to 0

// Determine Input File Type
let input_filetype = FileType::from_str(filetype.to_lowercase().as_str());
// Set up output file names
Expand All @@ -238,6 +236,8 @@
meta_data_file_names[1] = format!("{}{}.{}", bwfileheader, "end", "meta");
meta_data_file_names[2] = format!("{}{}.{}", bwfileheader, "core", "meta");

let mut npy_meta_data_map: HashMap<String, HashMap<String, i32>> = HashMap::new();

let chrom_sizes = match read_chromosome_sizes(chromsizerefpath) {
// original program gets chromosome size from a .sizes file, e.g. chr1 248956422
// the original program simply pushes 0's until the end of the chromosome length and writes these to file.
Expand All @@ -252,19 +252,16 @@
match input_filetype {
//BED AND NARROWPEAK WORKFLOW
Ok(FileType::BED) | Ok(FileType::NARROWPEAK) => {
// Pare down chromosomes if necessary
let mut final_chromosomes =
get_final_chromosomes(&input_filetype, filepath, &chrom_sizes, score);

// Some housekeeping depending on output type
let og_output_type = output_type; // need this later for conversion
let mut output_type = output_type;
if output_type == "bedgraph" || output_type == "bw" || output_type == "bigwig" {
output_type = "bedGraph" // we must create bedgraphs first before creating bigwig files
}
if output_type == "wig" {
wig_shift = 1;
}

// Pare down chromosomes if necessary
let mut final_chromosomes =
get_final_chromosomes(&input_filetype, filepath, &chrom_sizes, score);

let bar = ProgressBar::new(final_chromosomes.len() as u64);

Expand Down Expand Up @@ -299,7 +296,6 @@
current_chrom_size,
smoothsize,
stepsize,
wig_shift,
);

match output_type {
Expand All @@ -322,6 +318,7 @@
clamped_start_position(
primary_start.0,
smoothsize,
1, //must shift wiggle starts and core by 1 since it is 1 based
),
stepsize,
current_chrom_size,
Expand Down Expand Up @@ -390,7 +387,6 @@
current_chrom_size,
smoothsize,
stepsize,
wig_shift,
);
match output_type {
"file" => {
Expand All @@ -411,6 +407,7 @@
clamped_start_position(
primary_end.0,
smoothsize,
0,
),
);
write_to_bed_graph_file(
Expand All @@ -432,6 +429,7 @@
clamped_start_position(
primary_end.0,
smoothsize,
0, // ends already 1 based, do not shift further
),
stepsize,
current_chrom_size,
Expand All @@ -450,6 +448,7 @@
clamped_start_position(
primary_end.0,
smoothsize,
0,
),
stepsize,
meta_data_file_names[1].clone(),
Expand All @@ -468,6 +467,7 @@
clamped_start_position(
primary_end.0,
smoothsize,
0,

Check warning on line 470 in gtars/src/uniwig/mod.rs

View check run for this annotation

Codecov / codecov/patch

gtars/src/uniwig/mod.rs#L470

Added line #L470 was not covered by tests
),
stepsize,
meta_data_file_names[1].clone(),
Expand All @@ -481,7 +481,6 @@
&chromosome.ends,
current_chrom_size,
stepsize,
wig_shift,
);
match output_type {
"file" => {
Expand All @@ -499,7 +498,10 @@
let count_info: (Vec<u32>, Vec<u32>, Vec<u32>) =
compress_counts(
&mut core_results,
primary_start.0,
clamped_start_position_zero_pos(
primary_start.0,
0,
),
);
write_to_bed_graph_file(
&count_info,
Expand All @@ -517,7 +519,7 @@
&core_results.0,
file_name.clone(),
chrom_name.clone(),
clamped_start_position(primary_start.0, 0),
clamped_start_position(primary_start.0, 0, 1), //starts are 1 based must be shifted by 1
stepsize,
current_chrom_size,
);
Expand All @@ -531,7 +533,10 @@
&core_results.0,
file_name.clone(),
chrom_name.clone(),
primary_start.0,
clamped_start_position_zero_pos(
primary_start.0,
0,
),
stepsize,
meta_data_file_names[2].clone(),
);
Expand All @@ -546,7 +551,10 @@
&core_results.0,
file_name.clone(),
chrom_name.clone(),
primary_start.0,
clamped_start_position_zero_pos(
primary_start.0,
0,
),

Check warning on line 557 in gtars/src/uniwig/mod.rs

View check run for this annotation

Codecov / codecov/patch

gtars/src/uniwig/mod.rs#L554-L557

Added lines #L554 - L557 were not covered by tests
stepsize,
meta_data_file_names[2].clone(),
);
Expand Down Expand Up @@ -580,6 +588,63 @@
);
}
}
"npy" => {
// populate hashmap for the npy meta data
for chromosome in final_chromosomes.iter() {
let chr_name = chromosome.chrom.clone();
let current_chrom_size =
*chrom_sizes.get(&chromosome.chrom).unwrap() as i32;
npy_meta_data_map.insert(
chr_name,
HashMap::from([
("stepsize".to_string(), stepsize),
("reported_chrom_size".to_string(), current_chrom_size),
]),
);
}

for location in vec_count_type.iter() {
let temp_meta_file_name =
format!("{}{}.{}", bwfileheader, *location, "meta");

if let Ok(file) = File::open(&temp_meta_file_name) {
let reader = BufReader::new(file);

for line in reader.lines() {
let line = line.unwrap();
let parts: Vec<&str> = line.split_whitespace().collect();
if parts.len() >= 3 {
let chrom = parts[1].split('=').nth(1).expect(
"Processing npy metadata file: Missing chromosome in line",
);
let start_str = parts[2].split('=')
.nth(1)
.expect("Processing npy metadata file: Missing start position in line");
let starting_position: i32 = start_str.parse().expect(
"Processing npy metadata file: Invalid start position",
);

if let Some(current_chr_data) = npy_meta_data_map.get_mut(chrom)
{
current_chr_data.insert(
(*location.to_string()).parse().unwrap(),
starting_position,
);
}
}

Check warning on line 634 in gtars/src/uniwig/mod.rs

View check run for this annotation

Codecov / codecov/patch

gtars/src/uniwig/mod.rs#L634

Added line #L634 was not covered by tests
}
// Remove the file after it is used.
let path = std::path::Path::new(&temp_meta_file_name);
let _ = remove_file(path).unwrap();
}
}
//write combined metadata as json
let json_string = serde_json::to_string_pretty(&npy_meta_data_map).unwrap();
let combined_npy_meta_file_path =
format!("{}{}.{}", bwfileheader, "npy_meta", "json");
let mut file = File::create(combined_npy_meta_file_path).unwrap();
file.write_all(json_string.as_bytes()).unwrap();
}
_ => {}
}
bar.finish();
Expand Down
5 changes: 3 additions & 2 deletions gtars/tests/data/out/_core.wig
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
fixedStep chrom=chr1 start=2 step=1
fixedStep chrom=chr1 start=3 step=1
2
2
3
4
2
2
1
2
1
1
Expand Down
Loading
Loading