Skip to content

Commit

Permalink
Updated moalmanac db to 2024-04-11 release (#19)
Browse files Browse the repository at this point in the history
  • Loading branch information
brendanreardon authored May 15, 2024
1 parent 40fd946 commit 9c6823c
Show file tree
Hide file tree
Showing 4 changed files with 2,095 additions and 379 deletions.
2 changes: 1 addition & 1 deletion moalmanac/config.ini
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ plot_preclinical_efficacy = on

[versions]
interpreter = 0.6.0
database = v.2023-11-09
database = v.2024-04-11

[exac]
exac_common_af_threshold = 0.001
Expand Down
8 changes: 5 additions & 3 deletions moalmanac/datasources/moalmanac/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,12 +7,14 @@ The Molecular Oncology Almanac attempts to capture the current body of knowledge
Several other services exist within the Molecular Oncology Almanac ecosystem. See [this repository's docs folder](/docs/) for more information.

## Usage: Formatting the database for use
This method uses a document-based format of the database, which is built using the [database repository](https://github.com/vanallenlab/moalmanac-db) and `create_almanac_db.py`. If MOAlmanac is updated, **please also regenerate [preclinical datasources](../preclinical/)**.
This method uses a JSON-based format of the database, which is built using the [database repository](https://github.com/vanallenlab/moalmanac-db) and `create_almanac_db.py`. If MOAlmanac is updated, **please also regenerate [preclinical datasources](../preclinical/)**.

Arguments:
```
--directory, -d <string> path to the moalmanac-db repository's content folder
--version, -v <string> release name for the database content being used, should match the release from moalmanac-db
--config, -c <string> path to config.ini file, "../../config.ini" by default
--file, -f <string> path to moalmanac-db JSON file
--release, -r <string> date of moalmanac-db release, should match the release from moalmanac-db
--version, -v <string> database version of the moalmanac-db schema
```

This should be run with this repository's virtual environment enabled.
136 changes: 83 additions & 53 deletions moalmanac/datasources/moalmanac/create_almanac_db.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,25 +3,6 @@
import json
import sys


def create_config():
config = configparser.ConfigParser()
config.read('../../config.ini')
return config


CONFIG = create_config()
feature_type_section = 'feature_types'
FEATURE_TYPE_SOMATIC = CONFIG[feature_type_section]['mut']
FEATURE_TYPE_GERMLINE = CONFIG[feature_type_section]['germline']
FEATURE_TYPE_COPY_NUMBER = CONFIG[feature_type_section]['cna']
FEATURE_TYPE_FUSION = CONFIG[feature_type_section]['fusion']
FEATURE_TYPE_MUTATIONAL_BURDEN = CONFIG[feature_type_section]['burden']
FEATURE_TYPE_MUTATIONAL_SIGNATURE = CONFIG[feature_type_section]['signature']
FEATURE_TYPE_MICROSATELLITE_STABILITY = CONFIG[feature_type_section]['microsatellite']
FEATURE_TYPE_ANEUPLOIDY = CONFIG[feature_type_section]['aneuploidy']
FEATURE_TYPE_KNOCKDOWN = CONFIG[feature_type_section]['knockdown']

ASSERTION_FIELDS = [
'disease', 'context', 'oncotree_term', 'oncotree_code',
'therapy_name', 'therapy_strategy', 'therapy_type',
Expand Down Expand Up @@ -135,42 +116,59 @@ def somatic_variant(cls, record):
return f"{gene}"

@classmethod
def generate(cls, record):
def generate(cls, record, config_dictionary):
feature_type_dictionary = generate_feature_type_string_lookup_dictionary(config_dictionary)
feature_type = record['feature_type']
if feature_type == FEATURE_TYPE_SOMATIC:
if feature_type == feature_type_dictionary['somatic_variants']:
return cls.somatic_variant(record)
elif feature_type == FEATURE_TYPE_GERMLINE:
elif feature_type == feature_type_dictionary['germline_variants']:
return cls.germline_variant(record)
elif feature_type == FEATURE_TYPE_COPY_NUMBER:
elif feature_type == feature_type_dictionary['copy_number_alterations']:
return cls.copy_number(record)
elif feature_type == FEATURE_TYPE_FUSION:
elif feature_type == feature_type_dictionary['fusions']:
return cls.rearrangements(record)
elif feature_type == FEATURE_TYPE_ANEUPLOIDY:
elif feature_type == feature_type_dictionary['aneuploidy']:
return cls.aneuploidy(record)
elif feature_type == FEATURE_TYPE_MUTATIONAL_BURDEN:
elif feature_type == feature_type_dictionary['tmb']:
return cls.mutational_burden(record)
elif feature_type == FEATURE_TYPE_MICROSATELLITE_STABILITY:
elif feature_type == feature_type_dictionary['microsatellite']:
return cls.microsatellite_stability(record)
elif feature_type == FEATURE_TYPE_MUTATIONAL_SIGNATURE:
elif feature_type == feature_type_dictionary['signature']:
return cls.cosmic_mutational_signature(record)
elif feature_type == FEATURE_TYPE_KNOCKDOWN:
elif feature_type == feature_type_dictionary['knockdown']:
return cls.knockdown(record)
else:
print(f'ERROR: {feature_type} does not have a feature display format function.')


def generate_feature_type_string_lookup_dictionary(config):
    """Build a lookup from this script's feature type names to config strings.

    Args:
        config: Mapping-like configuration (for example a
            ``configparser.ConfigParser``) containing a ``feature_types``
            section with the keys ``mut``, ``germline``, ``cna``, ``fusion``,
            ``burden``, ``signature``, ``microsatellite``, ``aneuploidy`` and
            ``knockdown``.

    Returns:
        dict: Keys are the names used within this script
        (``somatic_variants``, ``germline_variants``, ...); values are the
        strings stored under the corresponding ``feature_types`` key.

    Raises:
        KeyError: If the ``feature_types`` section or one of its keys is
            missing from ``config``.
    """
    # Resolve the section once instead of once per feature type.
    feature_types = config['feature_types']
    config_key_by_name = {
        'somatic_variants': 'mut',
        'germline_variants': 'germline',
        'copy_number_alterations': 'cna',
        'fusions': 'fusion',
        'tmb': 'burden',
        'signature': 'signature',
        'microsatellite': 'microsatellite',
        'aneuploidy': 'aneuploidy',
        'knockdown': 'knockdown',
    }
    return {
        name: feature_types[config_key]
        for name, config_key in config_key_by_name.items()
    }


def check_fields(fields, record):
    """Exit the program if a required field is absent from a record.

    Args:
        fields: Iterable of field names that must be present in ``record``.
        record: Mapping holding a single database record.

    Raises:
        SystemExit: On the first name in ``fields`` that is not a key of
            ``record``. The exit message names the field and includes the
            record itself.
    """
    for field in fields:
        # Membership on the mapping itself; no need to build a keys view.
        if field not in record:
            sys.exit(f'{field} not present in record.\n{record}')


def check_format(record):
def check_format(record, config_dictionary):
check_fields(SOURCE_FIELDS, record)
check_fields(ASSERTION_FIELDS, record)

feature_type = record['feature_type']
feature_type_fields = get_feature_type_fields(feature_type)
feature_type_fields = get_feature_type_fields(feature_type, config_dictionary)
check_fields(feature_type_fields, record)


Expand All @@ -186,24 +184,25 @@ def extract_genes(records):
return genes_sorted


def get_feature_type_fields(feature_type):
if feature_type == FEATURE_TYPE_SOMATIC:
def get_feature_type_fields(feature_type, config_dictionary):
feature_type_dictionary = generate_feature_type_string_lookup_dictionary(config_dictionary)
if feature_type == feature_type_dictionary['somatic_variants']:
return VARIANT_FIELDS
elif feature_type == FEATURE_TYPE_GERMLINE:
elif feature_type == feature_type_dictionary['germline_variants']:
return GERMLINE_FIELDS
elif feature_type == FEATURE_TYPE_COPY_NUMBER:
elif feature_type == feature_type_dictionary['copy_number_alterations']:
return COPY_NUMBER_FIELDS
elif feature_type == FEATURE_TYPE_FUSION:
elif feature_type == feature_type_dictionary['fusions']:
return REARRANGEMENT_FIELDS
elif feature_type == FEATURE_TYPE_MUTATIONAL_BURDEN:
elif feature_type == feature_type_dictionary['tmb']:
return MUTATIONAL_BURDEN_FIELDS
elif feature_type == FEATURE_TYPE_MUTATIONAL_SIGNATURE:
elif feature_type == feature_type_dictionary['signature']:
return MUTATIONAL_SIGNATURE_FIELDS
elif feature_type == FEATURE_TYPE_MICROSATELLITE_STABILITY:
elif feature_type == feature_type_dictionary['microsatellite']:
return MICROSATELLITE_FIELDS
elif feature_type == FEATURE_TYPE_ANEUPLOIDY:
elif feature_type == feature_type_dictionary['aneuploidy']:
return ANEUPLOIDY_FIELDS
elif feature_type == FEATURE_TYPE_KNOCKDOWN:
elif feature_type == feature_type_dictionary['knockdown']:
return KNOCKDOWN_FIELDS
else:
sys.exit(f'feature type {feature_type} present in database but not accounted for.')
Expand All @@ -218,6 +217,12 @@ def initialize():
}


def load_config(file_path):
    """Read an INI configuration file into a ``ConfigParser``.

    Args:
        file_path: Path to the configuration file.

    Returns:
        configparser.ConfigParser: Parser populated from ``file_path``.

    Raises:
        SystemExit: If ``file_path`` could not be read.
        configparser.Error: If the file is read but cannot be parsed.
    """
    config = configparser.ConfigParser()
    # ConfigParser.read() skips unreadable files without raising and returns
    # the list of files it parsed; an empty list means nothing was loaded.
    # Fail here rather than with a KeyError on a missing section later on.
    if not config.read(file_path):
        sys.exit(f'Unable to read configuration file: {file_path}')
    return config


def load_json(json_file):
with open(json_file, 'r') as f:
json_data = json.load(f)
Expand All @@ -229,7 +234,7 @@ def write_json(file, data):
json.dump(data, f, ensure_ascii=False, indent=4)


def main(version, release, content):
def main(version, release, content, config):
db = initialize()
db['release'] = release
db['version'] = version
Expand All @@ -238,23 +243,48 @@ def main(version, release, content):
db['genes'] = db_genes

for record in content:
check_format(record)
record['feature_display'] = DisplayAlteration.generate(record)
if record['_deprecated']:
continue
check_format(record, config)
record['feature_display'] = DisplayAlteration.generate(record, config)
db['content'] = content
write_json('molecular-oncology-almanac.json', db)


if __name__ == "__main__":
arg_parser = argparse.ArgumentParser(prog='Create Molecular Oncology Almanac datasource',
description='Compiles MOAlmanac db for use with algorithm')
arg_parser.add_argument('--file', '-f',
help='Molecular Oncology Almanac db file from https://github.com/vanallenlab/moalmanac-db')
arg_parser.add_argument('--version', '-v',
help='Database version; e.g. 1.0.0')
arg_parser.add_argument('--release', '-r',
help='Database content release; e.g. v.2022-12-01')
arg_parser = argparse.ArgumentParser(
prog='Create Molecular Oncology Almanac datasource',
description='Compiles MOAlmanac db for use with algorithm'
)
arg_parser.add_argument(
'--config',
'-c',
help='MOAlmanac configuration file',
default='../../config.ini'
)
arg_parser.add_argument(
'--file',
'-f',
help='Molecular Oncology Almanac db file from https://github.com/vanallenlab/moalmanac-db'
)
arg_parser.add_argument(
'--release',
'-r',
help='Database content release; e.g. v.2022-12-01'
)
arg_parser.add_argument(
'--version',
'-v',
help='Database version; e.g. 1.0.0'
)
args = arg_parser.parse_args()
print(args)

moalmanac_json = load_json(args.file)
main(args.version, args.release, moalmanac_json)
config_ini = load_config(args.config)
main(
version=args.version,
release=args.release,
content=moalmanac_json,
config=config_ini
)
Loading

0 comments on commit 9c6823c

Please sign in to comment.