Skip to content

Commit

Permalink
applicativesystem
Browse files Browse the repository at this point in the history
finished writing the code today and had no access to internet and now pushing it ahead. Final and binary release.
  • Loading branch information
applicativesystem committed Dec 18, 2024
1 parent 6001d41 commit 7316a4f
Show file tree
Hide file tree
Showing 8 changed files with 162 additions and 66 deletions.
23 changes: 20 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,16 +1,33 @@
# rust-suffix-arrays

- This rust emphasis on the following part for the implementation.
- This rust emphasis on the following part for the implementation.
- rust suffix array building from two points: 1. LCP and 2. Kasai path generation.
- please see the other trait for the Kasai and Ukonnen implemented using this crate to build the graph.
- general note: Incase of Golang and RUST, please see the last commit message and if it says compiled binary then it is completed or else still in development version.

- rust suffix array
- rust wavelet structure
- rust lcp from suffix arrays for suffix tries.


```
cargo build
```

```
➜ gauravsablok rust-suffix-arrays-tries git:(main) ✗ ./target/debug/rust-suffix-arrays -h
Usage: rust-suffix-arrays <SUFFIXARRAY_ARG>
Arguments:
<SUFFIXARRAY_ARG> please provide the path to the fastq file
Options:
-h, --help Print help
-V, --version Print version
```
- to run the binary
```
./target/debug/rust-suffix-arrays ./sample-files/sample.fasta
```

Gaurav Sablok
Binary file added rust-suffix-arrays
Binary file not shown.
Binary file added rust-suffix-arrays.tar
Binary file not shown.
2 changes: 2 additions & 0 deletions sample-files/sample.fasta
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
@samplefasta
ATATAGCATGATCATAGACTGATCA
2 changes: 2 additions & 0 deletions sample-files/suffix-sorted-trees.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
ATATAGCATGATCATAGACTGATCA [(24, "A"), (17, "ACTGATCA"), (15, "AGACTGATCA"), (4, "AGCATGATCATAGACTGATCA"), (13, "ATAGACTGATCA"), (2, "ATAGCATGATCATAGACTGATCA"), (0, "ATATAGCATGATCATAGACTGATCA"), (21, "ATCA"), (10, "ATCATAGACTGATCA"), (7, "ATGATCATAGACTGATCA"), (23, "CA"), (12, "CATAGACTGATCA"), (6, "CATGATCATAGACTGATCA"), (18, "CTGATCA"), (16, "GACTGATCA"), (20, "GATCA"), (9, "GATCATAGACTGATCA"), (5, "GCATGATCATAGACTGATCA"), (14, "TAGACTGATCA"), (3, "TAGCATGATCATAGACTGATCA"), (1, "TATAGCATGATCATAGACTGATCA"), (22, "TCA"), (11, "TCATAGACTGATCA"), (19, "TGATCA"), (8, "TGATCATAGACTGATCA")]

50 changes: 50 additions & 0 deletions sample-files/tuple_suffix_array.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
24 A

17 ACTGATCA

15 AGACTGATCA

4 AGCATGATCATAGACTGATCA

13 ATAGACTGATCA

2 ATAGCATGATCATAGACTGATCA

0 ATATAGCATGATCATAGACTGATCA

21 ATCA

10 ATCATAGACTGATCA

7 ATGATCATAGACTGATCA

23 CA

12 CATAGACTGATCA

6 CATGATCATAGACTGATCA

18 CTGATCA

16 GACTGATCA

20 GATCA

9 GATCATAGACTGATCA

5 GCATGATCATAGACTGATCA

14 TAGACTGATCA

3 TAGCATGATCATAGACTGATCA

1 TATAGCATGATCATAGACTGATCA

22 TCA

11 TCATAGACTGATCA

19 TGATCA

8 TGATCATAGACTGATCA

6 changes: 2 additions & 4 deletions src/args.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,7 @@ use clap::Parser;
#[derive(Debug, Parser)]
#[clap(version)]

pub struct DEBruijnArgs {
pub struct SuffixArraysArgs {
/// please provide the path to the fastq file
pub debruijn_arg: String,
/// please provide the kmer for the graph construction
pub kmer_arg: usize,
pub suffixarray_arg: String
}
145 changes: 86 additions & 59 deletions src/main.rs
Original file line number Diff line number Diff line change
@@ -1,39 +1,31 @@
mod args;
use args::DEBruijnArgs;
use args::SuffixArraysArgs;
use clap::Parser;
use std::collections::BTreeMap;
use std::collections::HashSet;
use std::error::Error;
use std::fs::File;
use std::io::{BufRead, BufReader, Write};

/*
*Author Gaurav Sablok
*Universitat Potsdam
*Date 2024-12-17
rust-debruijn-algorithm: implementation of debruijns algorithm and
graph traversal in RUST. This implements three data structure, one of the
data structure is the BTreeMap which allows for for the node to the edge representation.
This alow to see that if the key is present in the BTreemap then visit the ode and retrieve
the edges or else dont for the tree traversal.
*Date 2024-12-18
rust-suffix-arrays: implementing the suffix arrays building as BtreeMaps,
so that when you have to get the edge of the suffix array, you dont have to traverse
throug the entire array. This will maintain the order of the heap for
later implementation of the LCP block or tree construction using the Kasai algorithm.
* */

fn main() {
let args = DEBruijnArgs::parse();
let debruijn_longread_output = debruijn_longread(&args.debruijn_arg, args.kmer_arg).unwrap();
println!("The bwt has been written: {}", debruijn_longread_output);
let args = SuffixArraysArgs::parse();
let suffix_output = suffix_array(&args.suffixarray_arg).unwrap();
sorted_suffix_array();
println!("The suffix arrays has been written: {}", suffix_output);
}

fn debruijn_longread(path: &str, overlap: usize) -> Result<String, Box<dyn Error>> {
/* the debruijn graph is constructed and hold the nodes and egdes into the Btreemap and
* TheBRtreeMap is further inserted into the Vector which allows for he faster revtriveal
* of the data.
*
* */

fn suffix_array(path: &str) -> Result<String, Box<dyn Error>> {
let file_appear = File::open(path).expect("file not present");
let fileread_appear = BufReader::new(file_appear);
let mut filecontent: Vec<String> = Vec::new();
Expand All @@ -51,57 +43,92 @@ fn debruijn_longread(path: &str, overlap: usize) -> Result<String, Box<dyn Error
}
}

#[derive(Debug, Clone, PartialEq, PartialOrd)]
struct NodeEdgeHold {
node: String,
edge: String,
}
// forming a binary heap for the suffix arrays so that the order of the suffixes
// can be kept intact and on the heap so that i can look for the top prefix using
// traversal or implementation for a inverse suffix.

#[derive(Debug, Clone, PartialEq, PartialOrd)]
struct MapStore {
node_edge_relation: Vec<NodeEdgeHold>,
uniq_node: Vec<String>,
}
#[derive(Debug, Clone)]

let mut debruijn: BTreeMap<String, MapStore> = BTreeMap::new();
struct SuffixMap {
indices: Vec<usize>,
array: Vec<String>,
}
let mut final_suffix_array: BTreeMap<String, SuffixMap> = BTreeMap::new();
let mut final_suffix_sorted_btreemap: BTreeMap<String, Vec<(usize, String)>> = BTreeMap::new();
for i in 0..filecontent_seq.len() {
let contentseq: Vec<String> = filecontent_seq[i]
let seqhold: Vec<_> = filecontent_seq[i]
.chars()
.map(|x| String::from(x))
.collect::<Vec<String>>();
let mut edge_map: Vec<String> = Vec::new();
let mut node_map: Vec<String> = Vec::new();
for i in 0..=contentseq.len() - overlap {
node_map.push(contentseq[i..i + overlap - 1].join(""));
edge_map.push(contentseq[i + 1..i + overlap].join(""));
}
let unique_node: HashSet<_> = node_map
.iter()
.collect::<HashSet<_>>();
let mut final_node_unique:Vec<String> = Vec::new();
for i in unique_node.iter(){
final_node_unique.push(i.to_string());
.collect::<Vec<_>>();
let mut heap_indices: Vec<usize> = Vec::new();
let mut heap_string: Vec<String> = Vec::new();
for i in 0..seqhold.len() {
heap_indices.push(i);
heap_string.push(seqhold[i..seqhold.len()].join("").to_string());
}
let mut node_edge_tuple: Vec<NodeEdgeHold> = Vec::new();
for (nodeiter, edgeiter) in node_map.iter().zip(edge_map.iter()) {
node_edge_tuple.push(NodeEdgeHold {
node: nodeiter.to_string(),
edge: edgeiter.to_string(),
})
let mut suffix_hold: Vec<(usize, String)> = Vec::new();
for (indices, str) in heap_indices.iter().zip(heap_string.iter()) {
suffix_hold.push((*indices, str.to_string()));
}
debruijn.insert(
filecontent_seq[i].clone(),
MapStore {
node_edge_relation: node_edge_tuple,
uniq_node: final_node_unique,
suffix_hold.sort_by(|a, b| a.1.cmp(&b.1));

final_suffix_sorted_btreemap.insert(filecontent_seq[i].clone(), suffix_hold);
final_suffix_array.insert(
seqhold.join(""),
SuffixMap {
indices: heap_indices,
array: heap_string,
},
);
}

let mut file_debruijn_write = File::create("debruijn-graph.txt").expect("file not pesent");
for i in debruijn.iter() {
writeln!(file_debruijn_write, "{:?}\t{:?}\n", i.0, i.1).expect("file not present");
let mut suffix_sorted_btreemap =
File::create("suffix-sorted-trees.txt").expect("file not present");

for (i,j) in final_suffix_sorted_btreemap.iter() {
writeln!(suffix_sorted_btreemap, "{}\t{:?}\n", i,j ).expect("line not present");
}

Ok("The debruijn graph construction has finished".to_string())
Ok("Binary Heaped Suffix arrays have been written".to_string())
}

fn sorted_suffix_array() {
/* This ives you a tuple based approach so that you can access the
* indices and the corresponding suffices by their value coordinates.
* */
let argsparse = SuffixArraysArgs::parse();
let fileopen = File::open(&argsparse.suffixarray_arg).unwrap();
let fileread = BufReader::new(fileopen);
let mut sequence: Vec<String> = Vec::new();
for i in fileread.lines() {
let line = i.expect("line not present");
if !line.starts_with("@") {
sequence.push(line);
}
}

let mut final_array: Vec<(usize, String)> = Vec::new();
let mut suffix_indices: Vec<usize> = Vec::new();
let mut suffix_strings: Vec<String> = Vec::new();
for i in 0..sequence.len() {
let seq_hold = sequence[i]
.chars()
.map(|x| String::from(x))
.collect::<Vec<_>>();
for i in 0..seq_hold.len() {
suffix_indices.push(i);
suffix_strings.push(seq_hold[i..seq_hold.len()].join("").to_string());
}

for (indices, string) in suffix_indices.iter().zip(suffix_strings.iter()) {
final_array.push((*indices, string.to_string()));
}
}

final_array.sort_by(|a, b| a.1.cmp(&b.1));

let mut final_tuple_write = File::create("tuple_suffix_array.txt").expect("file not present");
for i in final_array.iter() {
writeln!(final_tuple_write, "{}\t{}\n", i.0, i.1).expect("line not present");
}
}

0 comments on commit 7316a4f

Please sign in to comment.