Skip to content

Commit

Permalink
allele frequencies modifications
Browse files Browse the repository at this point in the history
  • Loading branch information
costero-e committed Oct 17, 2024
1 parent 9b2d91f commit 50aaf62
Show file tree
Hide file tree
Showing 2 changed files with 52 additions and 37 deletions.
4 changes: 2 additions & 2 deletions conf/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,8 @@
#### VCF Conversion config parameters ####
allele_frequency=1 # enter a float; leave 1 if you want to convert all the variants
reference_genome='GRCh38' # Choose one of NCBI36, GRCh37, GRCh38
datasetId='CINECA_synthetic_cohort_EUROPE_UK1'
case_level_data=True
datasetId='COVID_pop11_fin_2'
case_level_data=False

### MongoDB parameters ###
database_host = 'mongo'
Expand Down
85 changes: 50 additions & 35 deletions genomicVariations_vcf.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,7 @@ def num_rows_in_vcf_files():
total_lines += sum(1 for line in f if not line.startswith('#'))
return total_lines

num_rows = num_rows_in_vcf_files()
num_rows = 1000000000

def generate(dict_properties):
total_dict =[]
Expand All @@ -105,13 +105,7 @@ def generate(dict_properties):
#print(v)
dict_to_xls={}
vstringed = str(v)
for population_splitted in pipeline['frequencyInPopulations|frequencies|population']:
splitword=population_splitted['fullname']+'='
v_splitted = vstringed.split(splitword)
if len(v_splitted) < 2:
continue
else:
population = population_splitted['shortname']
population='COVID_pop11_fin_2'

if conf.case_level_data == True:
clinicalRelevanceword=pipeline['caseLevelData|clinicalInterpretations|clinicalRelevance']+'='
Expand Down Expand Up @@ -159,36 +153,53 @@ def generate(dict_properties):
except Exception:
pass
try:
v_resplitted = v_splitted[1].split(';')
allele_frequency = v_resplitted[0]
try:
allele_frequency = float(allele_frequency)
except Exception:
allele_frequency = allele_frequency.split(',')
allele_frequency=float(allele_frequency[0])
if allele_frequency != '' and population != '':
dict_to_xls['frequencyInPopulations|sourceReference']='gnomad.broadinstitute.org/'
dict_to_xls['frequencyInPopulations|source']='The Genome Aggregation Database (gnomAD)'
dict_to_xls['frequencyInPopulations|frequencies|population']=population
dict_to_xls['frequencyInPopulations|frequencies|alleleFrequency']=allele_frequency
except Exception:
try:
allele_frequency=v.INFO.get('AF')
if isinstance(allele_frequency, tuple):
allele_frequency=list(allele_frequency)
allele_frequency[0]
else:
allele_frequency = float(v.INFO.get('AF'))
allele_number = float(v.INFO.get('AN'))
allele_frequency=v.INFO.get('AF')
if isinstance(allele_frequency, tuple):
allele_frequency=list(allele_frequency)
allele_frequency[0]
else:
allele_frequency = float(v.INFO.get('AF'))
if allele_frequency == 0.0:
continue
allele_number=v.INFO.get('AN')
if allele_number == None:
pass
elif isinstance(allele_number, tuple):
allele_number=list(allele_number)
allele_number[0]
else:
allele_number = float(v.INFO.get('AC'))
allele_count=v.INFO.get('AC')
if allele_count == None:
pass
elif isinstance(allele_count, tuple):
allele_count=list(allele_count)
allele_count[0]
else:
allele_count = float(v.INFO.get('AC'))
ac_hom=v.INFO.get('AC_Hom')
if ac_hom == None:
pass
elif isinstance(ac_hom, tuple):
ac_hom=list(ac_hom)
ac_hom[0]
else:
ac_hom = float(v.INFO.get('AC_Hom'))
ac_het= float(v.INFO.get('AC_Het'))
dict_to_xls['frequencyInPopulations|sourceReference']=pipeline["frequencyInPopulations|sourceReference"]
dict_to_xls['frequencyInPopulations|source']=pipeline["frequencyInPopulations|source"]
dict_to_xls['frequencyInPopulations|frequencies|population']=pipeline["frequencyInPopulations|frequencies|population"][0]["fullname"]
dict_to_xls['frequencyInPopulations|frequencies|alleleFrequency']=allele_frequency
except Exception:
ac_het=v.INFO.get('AC_Het')
if ac_het == None:
pass
elif isinstance(ac_het, tuple):
ac_het=list(ac_het)
ac_het[0]
else:
ac_het = float(v.INFO.get('AC_Het'))

dict_to_xls['frequencyInPopulations|sourceReference']=pipeline["frequencyInPopulations|sourceReference"]
dict_to_xls['frequencyInPopulations|source']=pipeline["frequencyInPopulations|source"]
dict_to_xls['frequencyInPopulations|frequencies|population']=population
dict_to_xls['frequencyInPopulations|frequencies|alleleFrequency']=allele_frequency
except Exception as e:
continue
try:
if v.INFO.get('VT') == 'SV': continue
except Exception:
Expand Down Expand Up @@ -222,6 +233,7 @@ def generate(dict_properties):
except Exception:
dict_to_xls['variation|variantType']='UNKNOWN'
#print(v.INFO.get('ANN'))
'''
if v.INFO.get('ANN') is not None:
annot = v.INFO.get('ANN')
transcripts = annot.split(',')
Expand Down Expand Up @@ -354,6 +366,9 @@ def generate(dict_properties):
dict_to_xls['molecularAttributes|aminoacidChanges'] = annotations[10]
dict_to_xls['molecularAttributes|geneIds'] = annotations[4]
'''



zigosity={}
Expand Down

0 comments on commit 50aaf62

Please sign in to comment.