Skip to content

Commit

Permalink
Merge pull request #39 from stijndcl/feature/rust-dat-parser
Browse files Browse the repository at this point in the history
Rust DAT parser
  • Loading branch information
stijndcl authored Feb 13, 2024
2 parents 934a19b + 293bcb7 commit 253afe6
Show file tree
Hide file tree
Showing 15 changed files with 938 additions and 26 deletions.
17 changes: 14 additions & 3 deletions .github/workflows/rust.yml
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ jobs:
build:
name: Build binaries
runs-on: ubuntu-latest
needs: [dependencies]
needs: [ dependencies ]
steps:
- uses: actions/checkout@v4
- uses: dtolnay/rust-toolchain@stable
Expand All @@ -37,7 +37,7 @@ jobs:
format:
name: Check formatting
runs-on: ubuntu-latest
needs: [dependencies]
needs: [ dependencies, build ]
steps:
- uses: actions/checkout@v4
- uses: dtolnay/rust-toolchain@stable
Expand All @@ -51,7 +51,7 @@ jobs:
lint:
name: Linting
runs-on: ubuntu-latest
needs: [dependencies]
needs: [ dependencies, build ]
steps:
- uses: actions/checkout@v4
- uses: dtolnay/rust-toolchain@stable
Expand All @@ -68,3 +68,14 @@ jobs:
github_token: ${{ secrets.GITHUB_TOKEN }}
workdir: scripts/helper_scripts/unipept-database-rs

test:
name: Run tests
runs-on: ubuntu-latest
needs: [ dependencies, build ]
steps:
- uses: actions/checkout@v4
- uses: dtolnay/rust-toolchain@stable
- uses: Swatinem/rust-cache@v2
with:
shared-key: ${{ env.CACHE_KEY }}
- run: cd scripts/helper_scripts/unipept-database-rs && cargo test
15 changes: 7 additions & 8 deletions scripts/helper_scripts/unipept-database-rs/Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 3 additions & 0 deletions scripts/helper_scripts/unipept-database-rs/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,11 @@ anyhow = "1.0.75"
bit-vec = "0.6.3"
chrono = "0.4.31"
clap = { version = "4.4.6", features = ["derive"] }
crossbeam-channel = "0.5.11"
regex = "1.10.2"
smartstring = { version = "1.0" }
strum = "0.25.0"
strum_macros = "0.25.3"
uniprot = "0.7.0"
lazy_static = "1.4.0"
num_cpus = "1.16.0"
31 changes: 31 additions & 0 deletions scripts/helper_scripts/unipept-database-rs/src/bin/dat-parser.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
use anyhow::{Context, Result};
use clap::Parser;
use unipept_database::dat_parser::uniprot_dat_parser;
use unipept_database::dat_parser::utils::write_header;
use unipept_database::uniprot::UniprotType;

use unipept_database::utils::files::open_sin;

fn main() -> Result<()> {
let args = Cli::parse();
let reader = open_sin();

write_header();
let parser = uniprot_dat_parser(reader, args.threads);

for entry in parser {
entry
.context("Error parsing DAT entry")?
.write(&args.db_type);
}

Ok(())
}

// Command-line arguments of the DAT parser binary.
// (Plain `//` comments are used on purpose: clap turns `///` doc comments into help text.)
#[derive(Parser, Debug)]
struct Cli {
// Database type handed to every entry's `write` call.
// Set with `-t` / `--db-type`; defaults to `UniprotType::Swissprot`.
#[clap(value_enum, short = 't', long, default_value_t = UniprotType::Swissprot)]
db_type: UniprotType,
// Thread count forwarded to `uniprot_dat_parser`. Set with `--threads`; defaults to 0.
// NOTE(review): 0 presumably means "let the parser pick a thread count" - confirm in
// `uniprot_dat_parser`.
#[clap(long, default_value_t = 0)]
threads: usize,
}
16 changes: 1 addition & 15 deletions scripts/helper_scripts/unipept-database-rs/src/bin/xml-parser.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ use std::num::NonZeroUsize;
use anyhow::{Context, Result};
use clap::Parser;
use smartstring::{LazyCompact, SmartString};
use unipept_database::uniprot::UniprotType;
use uniprot::uniprot::{SequentialParser, ThreadedParser};

use unipept_database::utils::files::open_sin;
Expand Down Expand Up @@ -46,21 +47,6 @@ fn main() -> Result<()> {

type SmartStr = SmartString<LazyCompact>;

// The UniProt database types this tool distinguishes between.
// Deriving `clap::ValueEnum` lets the enum be used directly as a command-line argument value.
#[derive(clap::ValueEnum, Clone, Debug)]
enum UniprotType {
Swissprot,
Trembl,
}

impl UniprotType {
pub fn to_str(&self) -> &str {
match self {
UniprotType::Swissprot => "swissprot",
UniprotType::Trembl => "trembl",
}
}
}

// Parse a Uniprot XML file and convert it into a TSV-file
#[derive(Parser, Debug)]
struct Cli {
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
use crate::dat_parser::entry::UniProtDATEntry;
use anyhow::{Context, Result};
use crossbeam_channel::{Receiver, Sender};
use std::thread;
use std::thread::JoinHandle;

/// A Consumer runs in a thread and constantly listens to a Receiver channel for raw data,
/// publishing parsed [`UniProtDATEntry`]s to a Sender channel.
#[derive(Default)]
pub struct Consumer {
    /// Handle of the worker thread: `None` until [`Consumer::start`] is called and again
    /// after [`Consumer::join`] has collected it.
    handle: Option<JoinHandle<()>>,
}

impl Consumer {
    /// Length in bytes of the `\n//` terminator that is cut off the end of every raw entry.
    const TERMINATOR_LEN: usize = 3;

    /// Creates a consumer whose worker thread has not been started yet.
    pub fn new() -> Self {
        Self::default()
    }

    /// Spawns the worker thread.
    ///
    /// For every raw chunk taken from `receiver`, the worker strips the trailing terminator,
    /// splits the (lossily UTF-8 decoded) text on `'\n'`, parses the lines with
    /// `UniProtDATEntry::from_lines`, and sends the result, success or error, to `sender`.
    /// The worker ends when `receiver` yields no more data or when nobody is listening on
    /// `sender` anymore.
    ///
    /// Calling `start` again replaces the stored handle; the previous thread keeps running
    /// but can no longer be joined through this consumer.
    pub fn start(&mut self, receiver: Receiver<Vec<u8>>, sender: Sender<Result<UniProtDATEntry>>) {
        self.handle = Some(thread::spawn(move || {
            for data in receiver {
                // Cut out the \n// at the end. `saturating_sub` keeps a chunk shorter than
                // the terminator from underflowing (and panicking this thread); such a chunk
                // is handed to the parser as a single empty line instead.
                let end = data.len().saturating_sub(Self::TERMINATOR_LEN);
                let mut lines: Vec<String> = String::from_utf8_lossy(&data[..end])
                    .split('\n')
                    .map(String::from)
                    .collect();

                let entry =
                    UniProtDATEntry::from_lines(&mut lines).context("Error parsing DAT entry");

                // `send` only fails once every receiver has been dropped (e.g. the reader of
                // the parsed entries bailed out early). Nobody is left to consume results,
                // so stop working instead of panicking.
                if sender.send(entry).is_err() {
                    break;
                }
            }
        }));
    }

    /// Waits for the worker thread to finish; does nothing if no thread is running.
    ///
    /// # Panics
    ///
    /// Panics if the worker thread itself panicked.
    pub fn join(&mut self) {
        // `take` already leaves `None` behind, so the handle needs no further reset.
        if let Some(handle) = self.handle.take() {
            handle.join().expect("DAT consumer thread panicked");
        }
    }
}
Loading

0 comments on commit 253afe6

Please sign in to comment.