Skip to content

Commit

Permalink
feat: use/provide HGNC gene IDs (#10) (#11)
Browse files Browse the repository at this point in the history
  • Loading branch information
holtgrewe authored Jun 11, 2023
1 parent e573957 commit 0a1da92
Show file tree
Hide file tree
Showing 46 changed files with 10,214 additions and 6,010 deletions.
65 changes: 65 additions & 0 deletions src/common.rs
Original file line number Diff line number Diff line change
Expand Up @@ -264,3 +264,68 @@ impl Version {
}
}
}

/// Code related to the HGNC xlink table.
pub mod hgnc_xlink {
use std::collections::HashMap;

/// One row of the HGNC cross-link (`hgnc_xlink.tsv`) table.
///
/// Optional fields that are `None` are omitted when an entry is serialized.
// NOTE: `skip_serializing_none` rewrites the field attributes that the serde derives read,
// so it has to be listed *before* `#[derive]`; placed after it, it has no effect.
#[serde_with::skip_serializing_none]
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct Entry {
    /// HGNC gene ID.
    pub hgnc_id: String,
    /// Ensembl gene ID, if the row provides one.
    pub ensembl_gene_id: Option<String>,
    /// Entrez (NCBI) gene ID, if the row provides one.
    ///
    /// On deserialization the column may also be named `entrez_id`.  The field name is
    /// spelled `ncgi` (not `ncbi`); it is kept unchanged because renaming a public field
    /// would break existing users of this struct.
    #[serde(alias = "entrez_id")]
    pub ncgi_gene_id: Option<u32>,
    /// Gene symbol.
    pub gene_symbol: String,
}

/// Read the `hgnc_xlink.tsv` file using the `csv` crate via serde.
///
/// # Errors
///
/// In the case that the file could not be read.
pub fn load_entries<P: AsRef<std::path::Path>>(path: &P) -> Result<Vec<Entry>, anyhow::Error> {
let mut rdr = csv::ReaderBuilder::new()
.delimiter(b'\t')
.has_headers(true)
.from_path(path.as_ref())?;
let mut entries = Vec::new();
for result in rdr.deserialize() {
let entry: Entry = result?;
entries.push(entry);
}
Ok(entries)
}

/// Load `hgnc_xlink.tsv` and build a lookup table from NCBI gene ID to HGNC gene ID.
///
/// Rows without an NCBI gene ID are ignored.  If an NCBI gene ID occurs in more than one
/// row, the HGNC ID of the last such row is kept.
///
/// # Errors
///
/// Returns an error if the file cannot be read or parsed.
pub fn load_ncbi_to_hgnc<P: AsRef<std::path::Path>>(
    path: P,
) -> Result<HashMap<u32, String>, anyhow::Error> {
    let ncbi_to_hgnc: HashMap<u32, String> = load_entries(&path)?
        .into_iter()
        .filter_map(|entry| {
            entry
                .ncgi_gene_id
                .map(|ncbi_gene_id| (ncbi_gene_id, entry.hgnc_id))
        })
        .collect();
    Ok(ncbi_to_hgnc)
}

/// Utility function that builds the inverse of a `HashMap`, mapping each value to its key.
///
/// Keys and values are cloned into the new map.  If several keys share the same value,
/// only one of them ends up in the result; which one depends on the iteration order of
/// `map`.
pub fn inverse_hashmap<K, V, S>(map: &HashMap<K, V, S>) -> HashMap<V, K, S>
where
    K: std::hash::Hash + Eq + Clone,
    V: std::hash::Hash + Eq + Clone,
    S: std::hash::BuildHasher + Default,
{
    let mut inverse = HashMap::with_hasher(S::default());
    for (key, value) in map {
        inverse.insert(value.clone(), key.clone());
    }
    inverse
}
}
26 changes: 23 additions & 3 deletions src/query/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ use hpo::similarity::Builtins;
use prost::Message;
use rocksdb::{DBWithThreadMode, MultiThreaded};
use serde::{Deserialize, Serialize};
use std::collections::HashMap;
use std::time::Instant;

use clap::Parser;
Expand All @@ -20,6 +21,9 @@ pub struct Args {
/// Path to the directory with the HPO files.
#[arg(long, required = true)]
pub path_hpo_dir: String,
/// Path to the TSV file with the HGNC xlink data.
#[arg(long, required = true)]
pub path_hgnc_xlink: String,

/// Path to JSON file with the genes to rank.
#[arg(long)]
Expand Down Expand Up @@ -54,11 +58,14 @@ pub mod query_result {
// NOTE: `skip_serializing_none` must come *before* `#[derive]`; otherwise the serde derives
// never see the `skip_serializing_if` attributes it adds and `None` fields are still written.
#[serde_with::skip_serializing_none]
#[derive(
    serde::Serialize, serde::Deserialize, PartialEq, Eq, PartialOrd, Ord, Debug, Clone,
)]
pub struct Gene {
    /// The NCBI gene ID.
    pub entrez_id: u32,
    /// The gene symbol.
    pub gene_symbol: String,
    /// The HGNC ID, if one is known for the gene; omitted from serialized output when `None`.
    pub hgnc_id: Option<String>,
}

/// The performed query.
Expand Down Expand Up @@ -131,12 +138,16 @@ pub mod query_result {
#[allow(clippy::cast_possible_truncation)]
#[allow(clippy::cast_precision_loss)]
#[allow(clippy::too_many_lines)]
pub fn run_query(
pub fn run_query<S>(
patient: &HpoGroup,
genes: &Vec<&hpo::annotations::Gene>,
hpo: &Ontology,
db: &DBWithThreadMode<MultiThreaded>,
) -> Result<query_result::Container, anyhow::Error> {
ncbi_to_hgnc: &HashMap<u32, String, S>,
) -> Result<query_result::Container, anyhow::Error>
where
S: std::hash::BuildHasher,
{
let cf_resnik = db
.cf_handle("scores")
.expect("database is missing 'scores' column family");
Expand Down Expand Up @@ -234,6 +245,7 @@ pub fn run_query(
result.query.genes.push(query_result::Gene {
entrez_id: ncbi_gene_id,
gene_symbol: gene.name().to_string(),
hgnc_id: ncbi_to_hgnc.get(&ncbi_gene_id).cloned(),
});

result.result.push(query_result::Record {
Expand Down Expand Up @@ -336,9 +348,17 @@ pub fn run(args_common: &crate::common::Args, args: &Args) -> Result<(), anyhow:
before_load_genes.elapsed()
);

tracing::info!("Loading HGNC xlink...");
let before_load_xlink = Instant::now();
let ncbi_to_hgnc = crate::common::hgnc_xlink::load_ncbi_to_hgnc(&args.path_hgnc_xlink)?;
tracing::info!(
"... done loading HGNC xlink in {:?}",
before_load_xlink.elapsed()
);

tracing::info!("Starting priorization...");
let before_priorization = Instant::now();
let result = run_query(&query, &genes, &hpo, &db)?;
let result = run_query(&query, &genes, &hpo, &db, &ncbi_to_hgnc)?;
tracing::info!(
"... done with prioritization in {:?}",
before_priorization.elapsed()
Expand Down
63 changes: 50 additions & 13 deletions src/server/actix_server/hpo_genes.rs
Original file line number Diff line number Diff line change
Expand Up @@ -19,15 +19,15 @@ use super::{CustomError, Match, ResultHpoTerm};
/// This allows to query for genes. The first given of the following is
/// interpreted.
///
/// - `gene_id` -- specify gene ID
/// - `gene_id` -- specify gene ID (either NCBI or HGNC gene ID)
/// - `gene_symbol` -- specify the gene symbol
/// - `max_results` -- the maximum number of records to return
/// - `hpo_terms` -- whether to include `"hpo_terms"` in result
///
/// The following property defines how matches are performed:
///
/// - `match` -- how to match
#[derive(serde::Deserialize, Debug, Clone)]
#[derive(serde::Serialize, serde::Deserialize, Debug, Clone)]
struct Query {
/// The gene ID to search for.
pub gene_id: Option<String>,
Expand Down Expand Up @@ -91,6 +91,17 @@ impl ResultEntry {
}
}

/// Response container returned by the `/hpo/genes` handler.
#[derive(Debug, serde::Serialize, serde::Deserialize)]
struct Container {
/// Version information; the handler builds it from the ontology's HPO version.
pub version: crate::common::Version,
/// The query as received by the handler, echoed back in the response.
pub query: Query,
/// The gene entries found for the query; the handler sorts them before returning.
pub result: Vec<ResultEntry>,
}

/// Query for genes in the HPO database.
#[allow(clippy::unused_async)]
#[get("/hpo/genes")]
Expand All @@ -104,12 +115,14 @@ async fn handle(
let mut result: Vec<ResultEntry> = Vec::new();

if match_ == Match::Exact {
let gene = if let Some(gene_ncbi_id) = &query.gene_id {
let gene_id = GeneId::from(
gene_ncbi_id
.parse::<u32>()
.map_err(|e| CustomError::new(anyhow::anyhow!(e)))?,
);
let gene = if let Some(gene_id) = &query.gene_id {
let gene_id = if let Ok(ncbi_gene_id) = gene_id.parse::<u32>() {
Ok(GeneId::from(ncbi_gene_id))
} else if let Some(ncbi_gene_id) = data.hgnc_to_ncbi.get(gene_id) {
Ok(GeneId::from(*ncbi_gene_id))
} else {
Err(CustomError::new(anyhow::anyhow!("could not parse gene ID")))
}?;
ontology.gene(&gene_id)
} else if let Some(gene_symbol) = &query.gene_symbol {
ontology.gene_by_name(gene_symbol)
Expand Down Expand Up @@ -148,45 +161,69 @@ async fn handle(

result.sort();

let result = Container {
version: crate::common::Version::new(&data.ontology.hpo_version()),
query: query.into_inner(),
result,
};

Ok(Json(result))
}

#[cfg(test)]
mod test {
/// Helper function for running a query.
#[allow(dead_code)]
async fn run_query(uri: &str) -> Result<Vec<super::ResultEntry>, anyhow::Error> {
async fn run_query(uri: &str) -> Result<super::Container, anyhow::Error> {
let ontology = crate::common::load_hpo("tests/data/hpo")?;
let ncbi_to_hgnc =
crate::common::hgnc_xlink::load_ncbi_to_hgnc("tests/data/hgnc_xlink.tsv")?;
let hgnc_to_ncbi = crate::common::hgnc_xlink::inverse_hashmap(&ncbi_to_hgnc);
let app = actix_web::test::init_service(
actix_web::App::new()
.app_data(actix_web::web::Data::new(crate::server::WebServerData {
ontology,
db: None,
ncbi_to_hgnc,
hgnc_to_ncbi,
}))
.service(super::handle),
)
.await;
let req = actix_web::test::TestRequest::get().uri(uri).to_request();
let resp: Vec<super::ResultEntry> =
actix_web::test::call_and_read_body_json(&app, req).await;
let resp: super::Container = actix_web::test::call_and_read_body_json(&app, req).await;

Ok(resp)
}

#[actix_web::test]
async fn hpo_genes_gene_id_exact_no_hpo_terms() -> Result<(), anyhow::Error> {
async fn hpo_genes_ncbi_gene_id_exact_no_hpo_terms() -> Result<(), anyhow::Error> {
Ok(insta::assert_yaml_snapshot!(
&run_query("/hpo/genes?gene_id=2348").await?
))
}

#[actix_web::test]
async fn hpo_genes_gene_id_exact_with_hpo_terms() -> Result<(), anyhow::Error> {
async fn hpo_genes_ncbi_gene_id_exact_with_hpo_terms() -> Result<(), anyhow::Error> {
Ok(insta::assert_yaml_snapshot!(
&run_query("/hpo/genes?gene_id=2348&hpo_terms=true").await?
))
}

/// Snapshot test: requests `/hpo/genes?gene_id=HGNC:3791` (gene given by HGNC ID, no
/// `hpo_terms` parameter) and compares the response with the stored YAML snapshot.
#[actix_web::test]
async fn hpo_genes_hgnc_gene_id_exact_no_hpo_terms() -> Result<(), anyhow::Error> {
Ok(insta::assert_yaml_snapshot!(
&run_query("/hpo/genes?gene_id=HGNC:3791").await?
))
}

/// Snapshot test: requests `/hpo/genes?gene_id=HGNC:3791&hpo_terms=true` (gene given by
/// HGNC ID, HPO terms requested) and compares the response with the stored YAML snapshot.
#[actix_web::test]
async fn hpo_genes_hgnc_gene_id_exact_with_hpo_terms() -> Result<(), anyhow::Error> {
Ok(insta::assert_yaml_snapshot!(
&run_query("/hpo/genes?gene_id=HGNC:3791&hpo_terms=true").await?
))
}

#[actix_web::test]
async fn hpo_genes_gene_symbol_exact_no_hpo_terms() -> Result<(), anyhow::Error> {
Ok(insta::assert_yaml_snapshot!(
Expand Down
29 changes: 25 additions & 4 deletions src/server/actix_server/hpo_omims.rs
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ use super::{CustomError, Match, ResultHpoTerm};
/// The following property defines how matches are performed:
///
/// - `match` -- how to match
#[derive(serde::Deserialize, Debug, Clone)]
#[derive(serde::Serialize, serde::Deserialize, Debug, Clone)]
struct Query {
/// The OMIM ID to search for.
pub omim_id: Option<String>,
Expand Down Expand Up @@ -131,6 +131,17 @@ impl ResultEntry {
}
}

/// Response container returned by the `/hpo/omims` handler.
#[derive(Debug, serde::Serialize, serde::Deserialize)]
struct Container {
/// Version information; the handler builds it from the ontology's HPO version.
pub version: crate::common::Version,
/// The query as received by the handler, echoed back in the response.
pub query: Query,
/// The result entries found for the query; the handler sorts them before returning.
pub result: Vec<ResultEntry>,
}

/// Query for OMIM diseases in the HPO database.
#[allow(clippy::unused_async)]
#[get("/hpo/omims")]
Expand Down Expand Up @@ -194,27 +205,37 @@ async fn handle(

result.sort();

let result = Container {
version: crate::common::Version::new(&data.ontology.hpo_version()),
query: query.into_inner(),
result,
};

Ok(Json(result))
}

#[cfg(test)]
mod test {
/// Helper function for running a query.
#[allow(dead_code)]
async fn run_query(uri: &str) -> Result<Vec<super::ResultEntry>, anyhow::Error> {
async fn run_query(uri: &str) -> Result<super::Container, anyhow::Error> {
let ontology = crate::common::load_hpo("tests/data/hpo")?;
let ncbi_to_hgnc =
crate::common::hgnc_xlink::load_ncbi_to_hgnc("tests/data/hgnc_xlink.tsv")?;
let hgnc_to_ncbi = crate::common::hgnc_xlink::inverse_hashmap(&ncbi_to_hgnc);
let app = actix_web::test::init_service(
actix_web::App::new()
.app_data(actix_web::web::Data::new(crate::server::WebServerData {
ontology,
db: None,
ncbi_to_hgnc,
hgnc_to_ncbi,
}))
.service(super::handle),
)
.await;
let req = actix_web::test::TestRequest::get().uri(uri).to_request();
let resp: Vec<super::ResultEntry> =
actix_web::test::call_and_read_body_json(&app, req).await;
let resp: super::Container = actix_web::test::call_and_read_body_json(&app, req).await;

Ok(resp)
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,10 @@ query:
genes:
- entrez_id: 7273
gene_symbol: TTN
hgnc_id: "HGNC:12403"
- entrez_id: 23483
gene_symbol: TGDS
hgnc_id: "HGNC:20324"
result:
- gene_symbol: TGDS
p_value: 0.1
Expand Down
Loading

0 comments on commit 0a1da92

Please sign in to comment.