feat: use transcript infos from mehari server (#115) (#118)

Release-As: 0.6.1
bihealth · Feb 9, 2024 · 94c5e2e · 94c5e2e
1 parent 1507600
commit 94c5e2e
Show file tree

Hide file tree

Showing 24 changed files with 4,257 additions and 88,895 deletions.
diff --git a/Makefile b/Makefile
@@ -65,36 +65,49 @@ serve:
 serve-public:
 	npm run storybook -- --host=0.0.0.0
 
-PROTO_BASE := https://raw.githubusercontent.com/varfish-org/annonars/main
+# Base URLs (raw GitHub content, `main` branch) of the repositories that the
+# .proto files are downloaded from.
+PROTO_BASE_ANNONARS := https://raw.githubusercontent.com/varfish-org/annonars/main
+PROTO_BASE_MEHARI := https://raw.githubusercontent.com/varfish-org/mehari/main
 
+# Download the annonars and mehari .proto files into protos/.  The directory
+# list is expanded by make via $(addprefix ...) because shell brace expansion
+# is a bashism that make's default /bin/sh need not support.
 .PHONY: proto-fetch
 proto-fetch:
-	mkdir -p protos/annonars/{clinvar,cons,dbsnp,functional,genes,gnomad,helixmtdb,regions}
-	wget -O protos/annonars/clinvar/minimal.proto $(PROTO_BASE)/protos/annonars/clinvar/minimal.proto
-	wget -O protos/annonars/clinvar/per_gene.proto $(PROTO_BASE)/protos/annonars/clinvar/per_gene.proto
-	wget -O protos/annonars/clinvar/sv.proto $(PROTO_BASE)/protos/annonars/clinvar/sv.proto
-	wget -O protos/annonars/cons/base.proto $(PROTO_BASE)/protos/annonars/cons/base.proto
-	wget -O protos/annonars/dbsnp/base.proto $(PROTO_BASE)/protos/annonars/dbsnp/base.proto
-	wget -O protos/annonars/functional/refseq.proto $(PROTO_BASE)/protos/annonars/functional/refseq.proto
-	wget -O protos/annonars/gnomad/exac_cnv.proto $(PROTO_BASE)/protos/annonars/gnomad/exac_cnv.proto
-	wget -O protos/annonars/gnomad/gnomad2.proto $(PROTO_BASE)/protos/annonars/gnomad/gnomad2.proto
-	wget -O protos/annonars/gnomad/gnomad3.proto $(PROTO_BASE)/protos/annonars/gnomad/gnomad3.proto
-	wget -O protos/annonars/gnomad/gnomad4.proto $(PROTO_BASE)/protos/annonars/gnomad/gnomad4.proto
-	wget -O protos/annonars/gnomad/gnomad_cnv4.proto $(PROTO_BASE)/protos/annonars/gnomad/gnomad_cnv4.proto
-	wget -O protos/annonars/gnomad/gnomad_sv2.proto $(PROTO_BASE)/protos/annonars/gnomad/gnomad_sv2.proto
-	wget -O protos/annonars/gnomad/gnomad_sv4.proto $(PROTO_BASE)/protos/annonars/gnomad/gnomad_sv4.proto
-	wget -O protos/annonars/gnomad/mtdna.proto $(PROTO_BASE)/protos/annonars/gnomad/mtdna.proto
-	wget -O protos/annonars/gnomad/vep_common.proto $(PROTO_BASE)/protos/annonars/gnomad/vep_common.proto
-	wget -O protos/annonars/gnomad/vep_gnomad2.proto $(PROTO_BASE)/protos/annonars/gnomad/vep_gnomad2.proto
-	wget -O protos/annonars/gnomad/vep_gnomad3.proto $(PROTO_BASE)/protos/annonars/gnomad/vep_gnomad3.proto
-	wget -O protos/annonars/gnomad/vep_gnomad4.proto $(PROTO_BASE)/protos/annonars/gnomad/vep_gnomad4.proto
-	wget -O protos/annonars/helixmtdb/base.proto $(PROTO_BASE)/protos/annonars/helixmtdb/base.proto
-	wget -O protos/annonars/regions/clingen.proto $(PROTO_BASE)/protos/annonars/regions/clingen.proto
+	mkdir -p $(addprefix protos/annonars/,clinvar cons dbsnp functional genes gnomad helixmtdb regions)
+	wget -O protos/annonars/clinvar/minimal.proto $(PROTO_BASE_ANNONARS)/protos/annonars/clinvar/minimal.proto
+	wget -O protos/annonars/clinvar/per_gene.proto $(PROTO_BASE_ANNONARS)/protos/annonars/clinvar/per_gene.proto
+	wget -O protos/annonars/clinvar/sv.proto $(PROTO_BASE_ANNONARS)/protos/annonars/clinvar/sv.proto
+	wget -O protos/annonars/cons/base.proto $(PROTO_BASE_ANNONARS)/protos/annonars/cons/base.proto
+	wget -O protos/annonars/dbsnp/base.proto $(PROTO_BASE_ANNONARS)/protos/annonars/dbsnp/base.proto
+	wget -O protos/annonars/functional/refseq.proto $(PROTO_BASE_ANNONARS)/protos/annonars/functional/refseq.proto
+	wget -O protos/annonars/gnomad/exac_cnv.proto $(PROTO_BASE_ANNONARS)/protos/annonars/gnomad/exac_cnv.proto
+	wget -O protos/annonars/gnomad/gnomad2.proto $(PROTO_BASE_ANNONARS)/protos/annonars/gnomad/gnomad2.proto
+	wget -O protos/annonars/gnomad/gnomad3.proto $(PROTO_BASE_ANNONARS)/protos/annonars/gnomad/gnomad3.proto
+	wget -O protos/annonars/gnomad/gnomad4.proto $(PROTO_BASE_ANNONARS)/protos/annonars/gnomad/gnomad4.proto
+	wget -O protos/annonars/gnomad/gnomad_cnv4.proto $(PROTO_BASE_ANNONARS)/protos/annonars/gnomad/gnomad_cnv4.proto
+	wget -O protos/annonars/gnomad/gnomad_sv2.proto $(PROTO_BASE_ANNONARS)/protos/annonars/gnomad/gnomad_sv2.proto
+	wget -O protos/annonars/gnomad/gnomad_sv4.proto $(PROTO_BASE_ANNONARS)/protos/annonars/gnomad/gnomad_sv4.proto
+	wget -O protos/annonars/gnomad/mtdna.proto $(PROTO_BASE_ANNONARS)/protos/annonars/gnomad/mtdna.proto
+	wget -O protos/annonars/gnomad/vep_common.proto $(PROTO_BASE_ANNONARS)/protos/annonars/gnomad/vep_common.proto
+	wget -O protos/annonars/gnomad/vep_gnomad2.proto $(PROTO_BASE_ANNONARS)/protos/annonars/gnomad/vep_gnomad2.proto
+	wget -O protos/annonars/gnomad/vep_gnomad3.proto $(PROTO_BASE_ANNONARS)/protos/annonars/gnomad/vep_gnomad3.proto
+	wget -O protos/annonars/gnomad/vep_gnomad4.proto $(PROTO_BASE_ANNONARS)/protos/annonars/gnomad/vep_gnomad4.proto
+	wget -O protos/annonars/helixmtdb/base.proto $(PROTO_BASE_ANNONARS)/protos/annonars/helixmtdb/base.proto
+	wget -O protos/annonars/regions/clingen.proto $(PROTO_BASE_ANNONARS)/protos/annonars/regions/clingen.proto
+	mkdir -p protos/mehari
+	wget -O protos/mehari/server.proto $(PROTO_BASE_MEHARI)/protos/mehari/server.proto
+	wget -O protos/mehari/txs.proto $(PROTO_BASE_MEHARI)/protos/mehari/txs.proto
 
+# Generate TypeScript code into src/pbs from the .proto files below protos/.
 .PHONY: proto-ts
 proto-ts:
 	mkdir -p src/pbs
 	npx protoc --ts_opt keep_enum_prefix --ts_out src/pbs --proto_path protos protos/annonars/*/*.proto
+	npx protoc --ts_opt keep_enum_prefix --ts_out src/pbs --proto_path protos protos/mehari/*.proto
 
+# Run proto-fetch, proto-ts, format and lint.  NOTE: the intended order is only
+# given by the prerequisite order, so do not run this target with `make -j`.
 .PHONY: proto
 proto: proto-fetch proto-ts format lint
diff --git a/protos/mehari/server.proto b/protos/mehari/server.proto
@@ -0,0 +1,29 @@
+// Protobuf messages for the server.
+
+syntax = "proto3";
+
+package mehari.server;
+
+import "mehari/txs.proto";
+
+// Query for the transcripts of a certain gene; the result is paginated.
+message GeneTranscriptsQuery {
+    // HGNC identifier of the gene to query for.
+    optional string hgnc_id = 1;
+    // Genome build to use, cf. `mehari.txs.GenomeBuild`.
+    optional mehari.txs.GenomeBuild genome_build = 2;
+
+    // The number of entries to return per page.
+    optional int32 page_size = 3;
+    // The token to continue from a previous query.
+    optional string next_page_token = 4;
+}
+
+// Container for a response to `GeneTranscriptsQuery`.
+message GeneTranscriptsResponse {
+    // The transcripts for the gene.
+    repeated mehari.txs.Transcript transcripts = 1;
+
+    // The token for a follow-up query, presumably via `GeneTranscriptsQuery.next_page_token`.
+    optional string next_page_token = 2;
+}
diff --git a/protos/mehari/txs.proto b/protos/mehari/txs.proto
@@ -0,0 +1,148 @@
+syntax = "proto3";
+
+package mehari.txs;
+
+// Stores a long array of sequences with an "index" of sequence names to their
+// index.
+//
+// The fields `aliases` and `aliases_idx` have the same length and `aliases_idx[i]`
+// stores the index into `seqs` for the sequence `aliases[i]`.  In other words,
+// `seqs[aliases_idx[i]]` stores the sequence for `aliases[i]`.
+message SequenceDb {
+    // The sequence aliases, cf. `aliases_idx`.
+    repeated string aliases = 1;
+    // The corresponding index in `seqs`, cf. `aliases`.
+    repeated uint32 aliases_idx = 2;
+    // The corresponding sequences.
+    repeated string seqs = 3;
+}
+
+// Mapping from one gene to its transcript IDs.
+message GeneToTxId {
+    // Gene HGNC ID; serves as gene identifier.
+    string gene_id = 1;
+    // Vector of all transcript IDs of the gene.
+    repeated string tx_ids = 2;
+}
+
+// Container for the transcript-related database.
+message TranscriptDb {
+    // Vector of all transcripts.
+    repeated Transcript transcripts = 1;
+    // Mapping from gene ID to vector of all transcript IDs, cf. `GeneToTxId`.
+    repeated GeneToTxId gene_to_tx = 2;
+}
+
+// Enumeration for `Transcript::biotype`.
+enum TranscriptBiotype {
+    // Unknown biotype.
+    TRANSCRIPT_BIOTYPE_UNKNOWN = 0;
+    // Coding transcript.
+    TRANSCRIPT_BIOTYPE_CODING = 1;
+    // Non-coding transcript.
+    TRANSCRIPT_BIOTYPE_NON_CODING = 2;
+}
+
+// Enumeration for the entries of `Transcript::tags` (sequential values, not bit flags).
+enum TranscriptTag {
+    // Unknown tag.
+    TRANSCRIPT_TAG_UNKNOWN = 0;
+    // Member of Ensembl basic.
+    TRANSCRIPT_TAG_BASIC = 1;
+    // Member of Ensembl canonical.
+    TRANSCRIPT_TAG_ENSEMBL_CANONICAL = 2;
+    // Member of MANE Select.
+    TRANSCRIPT_TAG_MANE_SELECT = 3;
+    // Member of MANE Plus Clinical.
+    TRANSCRIPT_TAG_MANE_PLUS_CLINICAL = 4;
+    // Member of RefSeq Select.
+    TRANSCRIPT_TAG_REF_SEQ_SELECT = 5;
+    // Flagged as being a selenoprotein (UGA => selenon).
+    TRANSCRIPT_TAG_SELENOPROTEIN = 6;
+}
+
+// Store information about a transcript.
+message Transcript {
+    // Transcript accession with version, e.g., `"NM_007294.3"` or `"ENST00000461574.1"` for BRCA1.
+    string id = 1;
+    // HGNC symbol, e.g., `"BRCA1"`.
+    string gene_symbol = 2;
+    // HGNC gene identifier, e.g., `"1100"` for BRCA1.
+    string gene_id = 3;
+    // Transcript biotype, cf. `TranscriptBiotype`.
+    TranscriptBiotype biotype = 4;
+    // Transcript tags, cf. `TranscriptTag`.
+    repeated TranscriptTag tags = 5;
+    // Identifier of the corresponding protein.
+    optional string protein = 6;
+    // CDS start codon.
+    optional int32 start_codon = 7;
+    // CDS stop codon.
+    optional int32 stop_codon = 8;
+    // Alignments on the different genome builds.
+    repeated GenomeAlignment genome_alignments = 9;
+}
+
+// Enumeration for the known genome builds.
+enum GenomeBuild {
+    // Unknown genome build.
+    GENOME_BUILD_UNKNOWN = 0;
+    // GRCh37.
+    GENOME_BUILD_GRCH37 = 1;
+    // GRCh38.
+    GENOME_BUILD_GRCH38 = 2;
+}
+
+// Enumeration for the two strands of the genome.
+enum Strand {
+    // Unknown strand.
+    STRAND_UNKNOWN = 0;
+    // Forward / plus.
+    STRAND_PLUS = 1;
+    // Reverse / minus.
+    STRAND_MINUS = 2;
+}
+
+// Store information about a transcript aligning to a genome.
+message GenomeAlignment {
+    // The genome build identifier.
+    GenomeBuild genome_build = 1;
+    // Accession of the contig sequence.
+    string contig = 2;
+    // CDS start position, `-1` to indicate `None`.
+    optional int32 cds_start = 3;
+    // CDS end position, `-1` to indicate `None`.
+    optional int32 cds_end = 4;
+    // The strand.
+    Strand strand = 5;
+    // Exons of the alignment.
+    repeated ExonAlignment exons = 6;
+}
+
+// Store the alignment of one exon to the reference, cf. `GenomeAlignment::exons`.
+message ExonAlignment {
+    // Start position on reference.
+    int32 alt_start_i = 1;
+    // End position on reference.
+    int32 alt_end_i = 2;
+    // Exon number.
+    int32 ord = 3;
+    // CDS start coordinate.
+    optional int32 alt_cds_start_i = 4;
+    // CDS end coordinate.
+    optional int32 alt_cds_end_i = 5;
+    // CIGAR string of the alignment; an empty string indicates full matches.
+    string cigar = 6;
+}
+
+// Database of transcripts with sequences.
+message TxSeqDatabase {
+    // Store transcripts with their gene-to-transcript mapping, cf. `TranscriptDb`.
+    TranscriptDb tx_db = 1;
+    // Store sequences with their aliases, cf. `SequenceDb`.
+    SequenceDb seq_db = 2;
+    // The version of the database.
+    optional string version = 3;
+    // The reference assembly that this database refers to.
+    optional string genome_release = 4;
+}