Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add loftee_labels and no_lof_flags parameters to filter_vep_transcript_csqs_expr for filtering by loftee labels and flags #753

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 22 additions & 2 deletions gnomad/utils/vep.py
Original file line number Diff line number Diff line change
Expand Up @@ -867,6 +867,8 @@ def filter_vep_transcript_csqs_expr(
mane_select: bool = False,
ensembl_only: bool = False,
protein_coding: bool = False,
loftee_labels: Optional[List[str]] = None,
no_lof_flags: bool = False,
csqs: Optional[List[str]] = None,
keep_csqs: bool = True,
genes: Optional[List[str]] = None,
Expand Down Expand Up @@ -896,6 +898,10 @@ def filter_vep_transcript_csqs_expr(
Ensembl. Default is False.
:param protein_coding: Whether to filter to only protein-coding transcripts.
Default is False.
:param loftee_labels: List of LOFTEE labels to filter to. Default is None, which
applies no filtering by LOFTEE label.
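:param no_lof_flags: Whether to filter to consequences with no LOFTEE flags
(i.e. 'lof_flags' is missing or an empty string). If 'lof_flags' is not present
in the consequence struct, a warning is logged and no filtering on flags is
applied. Default is False.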
:param csqs: Optional list of consequence terms to filter to. Transcript
consequences are filtered to those where 'most_severe_consequence' is in the
list of consequence terms `csqs`. Default is None.
Expand All @@ -919,9 +925,9 @@ def filter_vep_transcript_csqs_expr(
logger.info("Filtering to most severe consequence of synonymous_variant...")
csqs = ["synonymous_variant"]

csq_fields = csq_expr if is_struct else csq_expr.dtype.element_type.fields
if csqs is not None:
fields = csq_expr if is_struct else csq_expr.dtype.element_type.fields
if "most_severe_consequence" not in fields:
if "most_severe_consequence" not in csq_fields:
logger.info("Adding most_severe_consequence annotation...")
csq_expr = add_most_severe_consequence_to_consequence(csq_expr)

Expand Down Expand Up @@ -955,6 +961,20 @@ def _filter_vep_csq_expr(
if protein_coding:
logger.info("Filtering to protein coding transcripts...")
criteria &= csq.biotype == "protein_coding"
if loftee_labels:
logger.info(
"Filtering to consequences with LOFTEE labels: %s...", loftee_labels
)
criteria &= hl.set(loftee_labels).contains(csq.lof)
if no_lof_flags:
logger.info("Filtering to consequences with no LOFTEE flags...")
if "lof_flags" in csq_fields:
criteria &= hl.is_missing(csq.lof_flags) | (csq.lof_flags == "")
else:
logger.warning(
"'lof_flags' not present in consequence struct, no consequences "
"are filtered based on LOFTEE flags"
)
if genes is not None:
logger.info("Filtering to genes of interest...")
gene_field = "gene_symbol" if match_by_gene_symbol else "gene_id"
Expand Down
Loading