Skip to content

Commit

Permalink
Merge branch 'master' of https://github.com/unipept/make-database
Browse files Browse the repository at this point in the history
  • Loading branch information
pverscha committed Jan 29, 2024
2 parents 551350a + ac980f2 commit 90a8ad9
Show file tree
Hide file tree
Showing 13 changed files with 574 additions and 157 deletions.
27 changes: 2 additions & 25 deletions .github/workflows/static_database.yml
Original file line number Diff line number Diff line change
Expand Up @@ -44,35 +44,12 @@ jobs:
# Compress the database before uploading it to a Github release
zip output.zip output.db
- name: Update database versioning
shell: bash
run: |
rm workflows/static_database/version.txt
echo "${{ steps.date.outputs.date }}" > workflows/static_database/version.txt
- name: Update resources
uses: test-room-7/action-update-file@v1
with:
file-path: workflows/static_database/version.txt
commit-msg: Bump db version to ${{ steps.date.outputs.date }}
github-token: ${{ secrets.GITHUB_TOKEN }}
- name: Get newly made commit sha
id: commit_sha
shell: bash
run: |
echo "::set-output name=sha::$(git rev-parse HEAD)"
- name: Create new tag
uses: octokit/request-action@v2.x
id: create_new_tag
uses: rickstaa/action-create-tag@v1
id: "tag_create"
with:
route: POST /repos/:owner/:repo/git/tags
owner: unipept
repo: make-database
tag: database-${{ steps.date.outputs.date }}
message: "Static information database built on ${{ steps.date.outputs.date }}"
object: ${{ steps.commit_sha.outputs.sha }}
type: commit
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
- name: Create Release
id: create_release
uses: actions/create-release@v1
Expand Down
66 changes: 66 additions & 0 deletions scripts/helper_scripts/unipept-database-rs/Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 3 additions & 0 deletions scripts/helper_scripts/unipept-database-rs/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -15,5 +15,8 @@ anyhow = "1.0.75"
bit-vec = "0.6.3"
chrono = "0.4.31"
clap = { version = "4.4.6", features = ["derive"] }
regex = "1.10.2"
smartstring = { version = "1.0" }
strum = "0.25.0"
strum_macros = "0.25.3"
uniprot = "0.7.0"
21 changes: 21 additions & 0 deletions scripts/helper_scripts/unipept-database-rs/src/bin/lcas.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
use anyhow::{Context, Result};
use clap::Parser;
use std::path::PathBuf;
use unipept_database::calculate_lcas::taxonomy::Taxonomy;
use unipept_database::taxons_uniprots_tables::utils::now_str;

#[derive(Parser)]
struct Cli {
#[clap(long)]
infile: PathBuf,
}

fn main() -> Result<()> {
let args = Cli::parse();

eprintln!("{}: reading taxonomy", now_str());
let tax = Taxonomy::build(&args.infile).context("Unable to build taxonomy")?;

eprintln!("{}: reading sequences", now_str());
tax.calculate_lcas()
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
use anyhow::{Context, Result};
use clap::Parser;
use std::path::PathBuf;
use unipept_database::taxons_lineages::taxon_list::TaxonList;

fn main() -> Result<()> {
let args = Cli::parse();

let mut tl = TaxonList::from_dumps(&args.names, &args.nodes)
.context("Failed to parse TaxonList from dumps")?;
tl.invalidate().context("Failed to validate TaxonList")?;
tl.write_taxons(&args.taxons)
.context("Failed to write TaxonList")?;
tl.write_lineages(&args.lineages)
.context("Failed to write lineages")?;

Ok(())
}

#[derive(Parser, Debug)]
struct Cli {
#[clap(short, long)]
names: PathBuf,
#[clap(short, long)]
nodes: PathBuf,
#[clap(short, long)]
taxons: PathBuf,
#[clap(short, long)]
lineages: PathBuf,
}
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
pub mod taxonomy;
Original file line number Diff line number Diff line change
@@ -0,0 +1,149 @@
use std::collections::HashMap;
use std::io::BufRead;
use std::path::PathBuf;

use anyhow::{Context, Result};

use crate::taxons_uniprots_tables::utils::now_str;
use crate::utils::files::{open_read, open_sin};

const GENUS: u8 = 18;
const RANKS: u8 = 27;
const SPECIES: u8 = 22;
const NULL_STRING: &str = "\\N";
const SEPARATOR: &str = "\t";

pub struct Taxonomy {
taxonomy: Vec<Vec<i32>>,
}

impl Taxonomy {
pub fn build(infile: &PathBuf) -> Result<Self> {
let mut taxonomy_map: HashMap<i32, Vec<i32>> = HashMap::new();
let reader = open_read(infile).context("Unable to open input file")?;

let mut max = i32::MIN;

for line in reader.lines() {
let line = line.with_context(|| {
format!("Error reading line from input file {}", infile.display())
})?;
let mut elements = line.splitn(28, SEPARATOR).map(parse_int);
let key = elements
.next()
.context("Unable to access key at first index of line")??;

// Note on the collect::<> here: "?" can't be used inside of map() as it is a closure
// Collecting into a Result<Vec<_>> will stop instantly when it receives one Error
// https://doc.rust-lang.org/rust-by-example/error/iter_result.html#fail-the-entire-operation-with-collect
let lineage = elements.collect::<Result<Vec<i32>>>()?;
taxonomy_map.insert(key, lineage);

// Keep track of highest key
if key > max {
max = key;
}
}

let mut taxonomy = vec![Vec::new(); (max + 1) as usize];

for (key, value) in taxonomy_map {
taxonomy[key as usize] = value;
}

Ok(Taxonomy { taxonomy })
}

pub fn calculate_lcas(&self) -> Result<()> {
let reader = open_sin();

let mut current_sequence = String::new();
let mut taxa: Vec<i32> = Vec::new();

for (i, line) in reader.lines().enumerate() {
if i % 10000000 == 0 && i != 0 {
eprintln!("{}: {}", now_str(), i);
}

let line = line.context("error reading line from stdin")?;

let (sequence, taxon_id) =
line.split_once(SEPARATOR).context("error splitting line")?;
let taxon_id: i32 = taxon_id
.trim_end()
.parse()
.context("error parsing taxon id to int")?;

if current_sequence.is_empty() || current_sequence != sequence {
if !current_sequence.is_empty() {
self.handle_lca(&current_sequence, self.calculate_lca(&taxa));
}

current_sequence = sequence.to_string();
taxa.clear();
}

taxa.push(taxon_id);
}

self.handle_lca(&current_sequence, self.calculate_lca(&taxa));
Ok(())
}

fn calculate_lca(&self, taxa: &[i32]) -> i32 {
let mut lca = 1;

let lineages: Vec<&Vec<i32>> = taxa
.iter()
.map(|x| &self.taxonomy[*x as usize])
.filter(|x| !x.is_empty())
.collect();

for rank in 0..RANKS {
let final_rank = rank;
let mut value = -1;

let iterator = lineages
.iter()
.map(|&x| x[final_rank as usize])
.filter(|&x| {
if final_rank == GENUS || final_rank == SPECIES {
x > 0
} else {
x >= 0
}
});

// Check if all elements in the iterator are the same
// This was near-impossible to do with the iterators above,
// so we're using a simplified loop here
for item in iterator {
if value == -1 {
value = item;
} else if item != value {
return lca;
}
}

// If we found a new value that matched for all of them, use this as the new best
if value > 0 {
lca = value;
}
}

lca
}

fn handle_lca(&self, sequence: &String, lca: i32) {
println!("{}\t{}", sequence, lca);
}
}

fn parse_int(s: &str) -> Result<i32> {
if s == NULL_STRING {
return Ok(0);
}

s.parse::<i32>()
.with_context(|| format!("Error parsing {} as an integer", s))
}
2 changes: 2 additions & 0 deletions scripts/helper_scripts/unipept-database-rs/src/lib.rs
Original file line number Diff line number Diff line change
@@ -1,2 +1,4 @@
pub mod calculate_lcas;
pub mod taxons_lineages;
pub mod taxons_uniprots_tables;
pub mod utils;
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
pub mod taxon_list;
Loading

0 comments on commit 90a8ad9

Please sign in to comment.