Skip to content

Commit

Permalink
feat: use transcript infos from mehari server (#115) (#118)
Browse files Browse the repository at this point in the history
Release-As: 0.6.1
  • Loading branch information
holtgrewe authored Feb 9, 2024
1 parent 1507600 commit 94c5e2e
Show file tree
Hide file tree
Showing 24 changed files with 4,257 additions and 88,895 deletions.
47 changes: 26 additions & 21 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -65,36 +65,41 @@ serve:
# Run the `storybook` npm script with `--host=0.0.0.0` forwarded to it
# (npm passes everything after the bare `--` on to the script).
# NOTE(review): presumably this makes the dev server reachable from other
# hosts rather than only from localhost -- confirm.
serve-public:
npm run storybook -- --host=0.0.0.0

# Base URLs (raw GitHub content of the `main` branch) from which the protobuf
# definitions are downloaded.
#
# `PROTO_BASE` holds the same URL as `PROTO_BASE_ANNONARS`.
PROTO_BASE := https://raw.githubusercontent.com/varfish-org/annonars/main
# Base URL of the annonars repository.
PROTO_BASE_ANNONARS := https://raw.githubusercontent.com/varfish-org/annonars/main
# Base URL of the mehari repository.
PROTO_BASE_MEHARI := https://raw.githubusercontent.com/varfish-org/mehari/main

# Download the protobuf definitions of annonars and mehari into `protos/`.
#
# The annonars sub-directories are spelled out with `$(addprefix ...)` instead
# of shell brace expansion (`{a,b,...}`): brace expansion is a bash feature and
# a POSIX `/bin/sh` such as dash would create one literally named directory,
# making the `wget -O` calls below fail.
#
# Every file is fetched exactly once, from the per-project base URL
# (`PROTO_BASE_ANNONARS` / `PROTO_BASE_MEHARI`).
.PHONY: proto-fetch
proto-fetch:
	mkdir -p $(addprefix protos/annonars/,clinvar cons dbsnp functional genes gnomad helixmtdb regions)
	wget -O protos/annonars/clinvar/minimal.proto $(PROTO_BASE_ANNONARS)/protos/annonars/clinvar/minimal.proto
	wget -O protos/annonars/clinvar/per_gene.proto $(PROTO_BASE_ANNONARS)/protos/annonars/clinvar/per_gene.proto
	wget -O protos/annonars/clinvar/sv.proto $(PROTO_BASE_ANNONARS)/protos/annonars/clinvar/sv.proto
	wget -O protos/annonars/cons/base.proto $(PROTO_BASE_ANNONARS)/protos/annonars/cons/base.proto
	wget -O protos/annonars/dbsnp/base.proto $(PROTO_BASE_ANNONARS)/protos/annonars/dbsnp/base.proto
	wget -O protos/annonars/functional/refseq.proto $(PROTO_BASE_ANNONARS)/protos/annonars/functional/refseq.proto
	wget -O protos/annonars/gnomad/exac_cnv.proto $(PROTO_BASE_ANNONARS)/protos/annonars/gnomad/exac_cnv.proto
	wget -O protos/annonars/gnomad/gnomad2.proto $(PROTO_BASE_ANNONARS)/protos/annonars/gnomad/gnomad2.proto
	wget -O protos/annonars/gnomad/gnomad3.proto $(PROTO_BASE_ANNONARS)/protos/annonars/gnomad/gnomad3.proto
	wget -O protos/annonars/gnomad/gnomad4.proto $(PROTO_BASE_ANNONARS)/protos/annonars/gnomad/gnomad4.proto
	wget -O protos/annonars/gnomad/gnomad_cnv4.proto $(PROTO_BASE_ANNONARS)/protos/annonars/gnomad/gnomad_cnv4.proto
	wget -O protos/annonars/gnomad/gnomad_sv2.proto $(PROTO_BASE_ANNONARS)/protos/annonars/gnomad/gnomad_sv2.proto
	wget -O protos/annonars/gnomad/gnomad_sv4.proto $(PROTO_BASE_ANNONARS)/protos/annonars/gnomad/gnomad_sv4.proto
	wget -O protos/annonars/gnomad/mtdna.proto $(PROTO_BASE_ANNONARS)/protos/annonars/gnomad/mtdna.proto
	wget -O protos/annonars/gnomad/vep_common.proto $(PROTO_BASE_ANNONARS)/protos/annonars/gnomad/vep_common.proto
	wget -O protos/annonars/gnomad/vep_gnomad2.proto $(PROTO_BASE_ANNONARS)/protos/annonars/gnomad/vep_gnomad2.proto
	wget -O protos/annonars/gnomad/vep_gnomad3.proto $(PROTO_BASE_ANNONARS)/protos/annonars/gnomad/vep_gnomad3.proto
	wget -O protos/annonars/gnomad/vep_gnomad4.proto $(PROTO_BASE_ANNONARS)/protos/annonars/gnomad/vep_gnomad4.proto
	wget -O protos/annonars/helixmtdb/base.proto $(PROTO_BASE_ANNONARS)/protos/annonars/helixmtdb/base.proto
	wget -O protos/annonars/regions/clingen.proto $(PROTO_BASE_ANNONARS)/protos/annonars/regions/clingen.proto
	mkdir -p protos/mehari
	wget -O protos/mehari/server.proto $(PROTO_BASE_MEHARI)/protos/mehari/server.proto
	wget -O protos/mehari/txs.proto $(PROTO_BASE_MEHARI)/protos/mehari/txs.proto

# Generate TypeScript sources from the protobuf definitions below `protos/`.
#
# Output goes to `src/pbs`, which is created first if missing.  `protoc` is run
# through `npx`, once for the annonars files (`protos/annonars/*/*.proto`) and
# once for the mehari files (`protos/mehari/*.proto`); both runs use `protos`
# as the import root and pass the `keep_enum_prefix` option to the TypeScript
# plugin.
.PHONY: proto-ts
proto-ts:
mkdir -p src/pbs
npx protoc --ts_opt keep_enum_prefix --ts_out src/pbs --proto_path protos protos/annonars/*/*.proto
npx protoc --ts_opt keep_enum_prefix --ts_out src/pbs --proto_path protos protos/mehari/*.proto

# Aggregate target: depends on `proto-fetch`, `proto-ts`, `format` and `lint`
# (the latter two are presumably defined outside the section shown here).
# NOTE(review): the prerequisites are listed in the order they are meant to
# run, but make only guarantees that order for a serial build, not `make -j`.
.PHONY: proto
proto: proto-fetch proto-ts format lint
29 changes: 29 additions & 0 deletions protos/mehari/server.proto
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
// Protobuf messages for the server.

syntax = "proto3";

package mehari.server;

import "mehari/txs.proto";

// Query for transcripts of a certain gene.
//
// All fields are declared `optional`.  Paging is controlled through
// `page_size` and `next_page_token`.
message GeneTranscriptsQuery {
// HGNC gene identifier to query for.
optional string hgnc_id = 1;
// Genome build to use, cf. `mehari.txs.GenomeBuild`.
optional mehari.txs.GenomeBuild genome_build = 2;

// The number of entries to return per page.
optional int32 page_size = 3;
// The token to continue from a previous query.
// NOTE(review): presumably the `next_page_token` of an earlier
// `GeneTranscriptsResponse` -- confirm against the server implementation.
optional string next_page_token = 4;
}

// Container for a response to `GeneTranscriptsQuery`.
message GeneTranscriptsResponse {
// The transcripts for the gene.
repeated mehari.txs.Transcript transcripts = 1;

// Paging token; the field is `optional`.
// NOTE(review): presumably meant to be sent back as
// `GeneTranscriptsQuery.next_page_token` to fetch the next page, and left
// unset on the last page -- confirm against the server implementation.
optional string next_page_token = 2;
}
148 changes: 148 additions & 0 deletions protos/mehari/txs.proto
Original file line number Diff line number Diff line change
@@ -0,0 +1,148 @@
syntax = "proto3";

package mehari.txs;

// Stores a long array of sequences together with an "index" that maps
// sequence names (aliases) to their position in that array.
//
// The fields `aliases` and `aliases_idx` have the same length and `aliases_idx[i]`
// stores the index into `seqs` for the sequence `aliases[i]`.  In other words,
// `seqs[aliases_idx[i]]` stores the sequence for `aliases[i]`.
message SequenceDb {
// The sequence aliases, cf. `aliases_idx`.
repeated string aliases = 1;
// The corresponding index in `seqs`, cf. `aliases`.
repeated uint32 aliases_idx = 2;
// The sequences themselves, addressed through `aliases_idx`.
repeated string seqs = 3;
}

// Mapping from one gene to the IDs of its transcripts.
message GeneToTxId {
// Gene HGNC ID; serves as gene identifier.
string gene_id = 1;
// Vector of all transcript IDs for this gene.
repeated string tx_ids = 2;
}

// Container for the transcript-related database.
message TranscriptDb {
// Vector of all transcripts.
repeated Transcript transcripts = 1;
// Mapping from gene ID to vector of all transcript IDs; one `GeneToTxId`
// entry per gene.
repeated GeneToTxId gene_to_tx = 2;
}

// Enumeration for `Transcript::biotype`.
enum TranscriptBiotype {
// Biotype is unknown (zero value).
TRANSCRIPT_BIOTYPE_UNKNOWN = 0;
// Coding transcript.
TRANSCRIPT_BIOTYPE_CODING = 1;
// Non-coding transcript.
TRANSCRIPT_BIOTYPE_NON_CODING = 2;
}

// Enumeration of the transcript tags, used by `Transcript::tags`.
//
// NOTE(review): this was originally described as "bit values", but the
// numbers below are consecutive (0..6) rather than powers of two -- confirm
// how consumers are expected to interpret them.
enum TranscriptTag {
// Tag is unknown (zero value).
TRANSCRIPT_TAG_UNKNOWN = 0;
// Member of Ensembl basic.
TRANSCRIPT_TAG_BASIC = 1;
// Member of Ensembl canonical.
TRANSCRIPT_TAG_ENSEMBL_CANONICAL = 2;
// Member of MANE Select.
TRANSCRIPT_TAG_MANE_SELECT = 3;
// Member of MANE Plus Clinical.
TRANSCRIPT_TAG_MANE_PLUS_CLINICAL = 4;
// Member of RefSeq Select.
TRANSCRIPT_TAG_REF_SEQ_SELECT = 5;
// Flagged as being a selenoprotein (UGA => selenon).
// NOTE(review): "selenon" presumably means that UGA is read as
// selenocysteine instead of stop -- confirm.
TRANSCRIPT_TAG_SELENOPROTEIN = 6;
}

// Store information about a transcript.
message Transcript {
// Transcript accession with version, e.g., `"NM_007294.3"` or `"ENST00000461574.1"` for BRCA1.
string id = 1;
// HGNC symbol, e.g., `"BRCA1"`
string gene_symbol = 2;
// HGNC gene identifier, e.g., `"1100"` for BRCA1.
string gene_id = 3;
// Transcript biotype, cf. `TranscriptBiotype`.
TranscriptBiotype biotype = 4;
// Transcript flags, cf. `TranscriptTag`.
repeated TranscriptTag tags = 5;
// Identifier of the corresponding protein; field is `optional`.
optional string protein = 6;
// CDS start codon; field is `optional`.
// NOTE(review): coordinate system / base of this position is not stated here.
optional int32 start_codon = 7;
// CDS stop codon; field is `optional`.
optional int32 stop_codon = 8;
// Alignments on the different genome builds, one `GenomeAlignment` each.
repeated GenomeAlignment genome_alignments = 9;
}

// Enumeration for the known genome builds.
enum GenomeBuild {
// Genome build is unknown (zero value).
GENOME_BUILD_UNKNOWN = 0;
// GRCh37.
GENOME_BUILD_GRCH37 = 1;
// GRCh38.
GENOME_BUILD_GRCH38 = 2;
}

// Enumeration for the two strands of the genome.
enum Strand {
// Strand is unknown (zero value).
STRAND_UNKNOWN = 0;
// Forward / plus
STRAND_PLUS = 1;
// Reverse / minus
STRAND_MINUS = 2;
}

// Store information about a transcript aligning to a genome.
message GenomeAlignment {
// The genome build identifier.
GenomeBuild genome_build = 1;
// Accession of the contig sequence.
string contig = 2;
// CDS start position, `-1` to indicate `None`.
// NOTE(review): the field is also declared `optional`; confirm whether an
// absent value or `-1` is what producers actually use for "no CDS".
optional int32 cds_start = 3;
// CDS end position, `-1` to indicate `None`.
optional int32 cds_end = 4;
// The strand.
Strand strand = 5;
// Exons of the alignment.
repeated ExonAlignment exons = 6;
}

// Store the alignment of one exon to the reference.
message ExonAlignment {
// Start position on reference.
int32 alt_start_i = 1;
// End position on reference.
int32 alt_end_i = 2;
// Exon number.
int32 ord = 3;
// CDS start coordinate; field is `optional`.
optional int32 alt_cds_start_i = 4;
// CDS end coordinate; field is `optional`.
optional int32 alt_cds_end_i = 5;
// CIGAR string of alignment, empty indicates full matches.
string cigar = 6;
}

// Database of transcripts with sequences.
message TxSeqDatabase {
// Store transcripts with their aliases.
TranscriptDb tx_db = 1;
// Store sequences with their aliases.
SequenceDb seq_db = 2;
// The version of the database; field is `optional`.
optional string version = 3;
// The reference assembly that this database refers to; field is `optional`.
optional string genome_release = 4;
}
Loading

0 comments on commit 94c5e2e

Please sign in to comment.