Skip to content

Commit

Permalink
added customized signalp scripts
Browse files Browse the repository at this point in the history
  • Loading branch information
gurdeep330 committed Nov 7, 2022
1 parent 66550bf commit b7ef804
Show file tree
Hide file tree
Showing 280 changed files with 458,452 additions and 3 deletions.
20 changes: 17 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -47,17 +47,31 @@ We assigned mutational information provided in (a)
#### N-terminal analysis
| Location | Description |
| ------ | ----------- |
| pyScripts/fetch_candidates.py | Script to extract candidates from UniProt/SwissProt's human.dat.gz file and run modified SignalP4.1. |
| pyScripts/plot_scatter.py | Script to generate and plot the data. |
| data/part_a_fasta | directory with peptide FASTA sequences given as input to SignalP. |
| data/part_a_signalp | output of SignalP peptide sequence prediction. |
| data/part_a_data.tsv | output of SignalP and protein information in TSV format. |

#### Internal TMDs analysis
| Location | Description |
| ------ | ----------- |
| pyScripts/fetch_candidates_part_C.py | Script to extract candidates from UniProt/SwissProt's human.dat.gz file and run modified SignalP4.1. |
| pyScripts/plot_scatter_part_C.py | Script to generate and plot the data. |
| data/part_c_annotations.tsv | Annotations of TM domains in proteins. |
| data/part_a_fasta | directory with peptide FASTA sequences given as input to SignalP. |
| data/part_a_signalp | output of SignalP peptide sequence prediction. |
| data/part_a_data.tsv | output of SignalP and protein information in TSV format. |
| data/part_c_fasta | directory with peptide FASTA sequences given as input to SignalP. |
| data/part_c_signalp | output of SignalP peptide sequence prediction. |
| data/part_c_data.tsv | output of SignalP and protein information in TSV format. |

#### signalp-4.1
| Location | Description |
| ------ | ----------- |
| signalp-4.1/signalp | Original signalp-4.1 script. |
| signalp-4.1/signalp_TM | Customized signalp-4.1 script in which the TM_TRESHOLD is set to -1 so that it is always less than TMCount and thus forces the program to run in the TM network mode. |
| signalp-4.1/signalp_noTM | Customized signalp-4.1 script in which the TM_TRESHOLD is set to 100 so that it is always greater than TMCount and thus forces the program to run in the no-TM network mode. |


**Note:** The information about the data downloaded from UniProt, COSMIC and ClinVar can be retrieved from Materials and Methods of the accompanying manuscript.

#### [webApp](http://spc.russelllab.org)
| Location | Description |
Expand Down
107 changes: 107 additions & 0 deletions pyScripts/fetch_candidates.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
## Script to extract candidates from
## UniProt/SwissProt's human.dat.gz file
import os, gzip, sys

## Select the species
species = 'caeel'
#species = 'human'
sprot = 1
trembl = 1

if sprot == 1 and trembl == 1:
#files = ['../../DB/uniprot/uniprot_sprot_human.dat.gz', '../../DB/uniprot/uniprot_trembl_human.dat.gz']
files = ['../../DB/uniprot/uniprot_sprot_caeel.dat.gz', '../../DB/uniprot/uniprot_trembl_caeel.dat.gz']
fastaFiles = ['../../DB/uniprot/uniprot_sprot.fasta.gz', '../../DB/uniprot/uniprot_trembl.fasta.gz']
elif sprot == 1 and trembl == 0:
#files = ['../../DB/uniprot/uniprot_sprot_human.dat.gz']
files = ['../../DB/uniprot/uniprot_sprot_caeel.dat.gz']
fastaFiles = ['../../DB/uniprot/uniprot_sprot.fasta.gz']
elif sprot == 0 and trembl == 1:
#files = ['../../DB/uniprot/uniprot_trembl_human.dat.gz']
files = ['../../DB/uniprot/uniprot_trembl_caeel.dat.gz']
fastaFiles = ['../../DB/uniprot/uniprot_trembl.fasta.gz']

class spc:
def __init__(self, id):
self.id = ''
self.kw = 'noTM'
self.signal = 0
self.topo = ''
self.fasta = ''

x = []
flag = 0
dic = {}
for file in files:
for line in gzip.open(file, 'rt'):
#for line in gzip.open('../../DB/uniprot/uniprot_sprot_human.dat.gz', 'rt'):
#print (line)
if line[:2] == 'ID':
id = line.split()[1]
dic[id] = spc(id)
flag = 0
elif line[:2] == 'KW' and dic[id].kw == 'noTM':
if 'Transmembrane' in line:
dic[id].kw = 'TM'
elif line[:2] == 'FT':
if line.split()[1] in ['SIGNAL']:
dic[id].signal = 1
if line.split()[1] in ['SIGNAL', 'TRANSMEM', 'TOPO_DOM']:
topo = line.split()[1]
start = line.split()[2].split('.')[0]
end = line.split()[2].split('.')[-1].replace('\n', '')
#print (id, line, end.isnumeric())
if '?' not in start:
if int(start) <= 70 and line.split()[1] != 'SIGNAL':
flag = 1
dic[id].topo += topo+':'+start+':'+end+':'
elif line.split()[1] == 'SIGNAL':
dic[id].topo += topo+':'+start+':'+end + ':\t'
flag = 0
else:
flag = 0
else:
if '/note=' in line.split()[1] and flag == 1:
dic[id].topo += line.split()[1].split('"')[1] + '\t'
flag = 0
#break

print (len(dic))

for file in fastaFiles:
for line in gzip.open(file, 'rt'):
if line[0] == '>':
id = line.split('|')[2].split()[0]
if id in dic:
flag = 1
else:
flag = 0
elif flag == 1:
if len(dic[id].fasta) < 70:
for char in line.replace('\n', ''):
dic[id].fasta += char
#if id == 'YO002_HUMAN':
# print (dic[id].fasta, len(dic[id].fasta))
if len(dic[id].fasta) == 70:
flag = 0
break

print (len(dic))
#sys.exit()
'''
for id in dic:
if dic[id].kw == 'noTM' and dic[id].signal == 1:
#if id == 'HPTR_HUMAN':
print (id, dic[id].kw, dic[id].topo, dic[id].fasta)
'''
l = ''
for id in dic:
if dic[id].fasta != '':
m = '>'+id + '\n' + dic[id].fasta + '\n'
l += m
open('../data/fasta_'+species+'/'+id+'.fasta', 'w').write(m)
os.system('/home/gurdeep/Downloads/signalp-4.1/signalp_TM -f long -s best ../data/fasta_'+species+'/'+id+'.fasta > ../data/signalp_'+species+'/'+id+'_long_TM.out')
os.system('/home/gurdeep/Downloads/signalp-4.1/signalp_noTM -f long -s best ../data/fasta_'+species+'/'+id+'.fasta > ../data/signalp_'+species+'/'+id+'_long_noTM.out')

#open('../data/uniprot_sprot_human_signalp_input.fasta', 'w').write(l)
open('../data/uniprot_'+species+'_signalp_input.fasta', 'w').write(l)
210 changes: 210 additions & 0 deletions pyScripts/fetch_candidates_part_C.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,210 @@
## To consider TM_start + x AAs such that the total is 70

## Script to extract candidates from UniProt/SwissProt's human.dat.gz file
## for Internal TMD analysis
import os, sys, gzip, requests


## Select the species
species = 'caeel'
#species = 'human'
sprot = 1
trembl = 1

if sprot == 1 and trembl == 1:
#files = ['../../DB/uniprot/uniprot_sprot_human.dat.gz', '../../DB/uniprot/uniprot_trembl_human.dat.gz']
files = ['../../DB/uniprot/uniprot_sprot_caeel.dat.gz', '../../DB/uniprot/uniprot_trembl_caeel.dat.gz']
fastaFiles = ['../../DB/uniprot/uniprot_sprot.fasta.gz', '../../DB/uniprot/uniprot_trembl.fasta.gz']
elif sprot == 1 and trembl == 0:
#files = ['../../DB/uniprot/uniprot_sprot_human.dat.gz']
files = ['../../DB/uniprot/uniprot_sprot_caeel.dat.gz']
fastaFiles = ['../../DB/uniprot/uniprot_sprot.fasta.gz']
elif sprot == 0 and trembl == 1:
#files = ['../../DB/uniprot/uniprot_trembl_human.dat.gz']
files = ['../../DB/uniprot/uniprot_trembl_caeel.dat.gz']
fastaFiles = ['../../DB/uniprot/uniprot_trembl.fasta.gz']


class spc:
def __init__(self, id):
self.id = ''
self.acc = '-'
self.gene = '-'
self.kw = 'noTM'
self.signal = 0
self.topo = ''
self.fasta = ''
self.positions = {}
self.transmem = {}

x = []
flag = 0
dic = {}
for file in files:
for line in gzip.open(file, 'rt'):
#print (line)
if line[:2] == 'ID':
id = line.split()[1]
dic[id] = spc(id)
type = 'Unknown'
flag = 0
count = 0
elif line[:2] == 'AC' and dic[id].acc == '-':
dic[id].acc = line.split('AC')[1].replace(' ', '').replace('\n', '').split(';')[0]
elif line[:2] == 'GN' and dic[id].gene == '-':
#print (line)
dic[id].gene = line.split('=')[1].split(' ')[0].replace(' ', '').replace(',', '').replace(';', '').replace('\n', '')
elif line[:2] == 'KW' and dic[id].kw == 'noTM':
if 'Transmembrane' in line:
dic[id].kw = 'TM'
elif line[:2] == 'FT':
if line.split()[1] in ['SIGNAL']:
dic[id].signal = 1
if line.split()[1] in ['SIGNAL', 'TRANSMEM', 'TOPO_DOM']:
topo = line.split()[1]
if topo == 'TOPO_DOM':
flag = 1
continue
elif topo == 'TRANSMEM':
count += 1
start = line.split()[2].split('.')[0]
end = line.split()[2].split('.')[-1].replace('\n', '')
if '?' not in start and '?' not in end:
start = int(start)
end = int(end)
flag = 0
#if id == 'CXB1_HUMAN':
#print (count, id, line, type)
dic[id].transmem[topo+str(count)] = (type,start,end)
'''
if '?' not in start:
if int(start) <= 70 and line.split()[1] != 'SIGNAL':
flag = 1
dic[id].topo += topo+':'+start+':'+end+':'
elif line.split()[1] == 'SIGNAL':
dic[id].topo += topo+':'+start+':'+end + ':\t'
flag = 0
else:
flag = 0
'''
else:
'''
if '/note=' in line.split()[1] and flag == 1:
dic[id].topo += line.split()[1].split('"')[1] + '\t'
flag = 0
'''
if '/note=' in line.split()[1] and flag == 1:
type = line.split()[1].split('"')[1]
flag = 0

#break

#print (dic['FIMP_HUMAN'].transmem)
#print (dic['UD2A2_HUMAN'].transmem)
#print (dic['T2R33_HUMAN'].transmem)
#sys.exit()

for file in fastaFiles:
for line in gzip.open(file, 'rt'):
if line[0] == '>':
id = line.split('|')[2].split()[0]
if id in dic:
flag = 1
else:
flag = 0
elif flag == 1:
dic[id].fasta += line.replace('\n', '')

#print (dic['FIMP_HUMAN'].fasta)
for id in dic:
if dic[id].fasta == '':
r = requests.get('https://www.uniprot.org/uniprot/'+dic[id].acc+'.fasta')
dic[id].fasta = r.text
print ('Fetched', id)

#sys.exit()

## Extract Type II TMDs (N-term: Cytoplasmic; C-term: Extracellular)
l = ''
for id in dic:
if dic[id].fasta != '' and len(dic[id].fasta) >= 70:
if dic[id].kw == 'TM' and dic[id].signal == 0:
for tm in dic[id].transmem:
type, start, end = dic[id].transmem[tm]
if type == 'Cytoplasmic':
name = id+'_'+str(tm)
start_seq = start-1
end_seq = start_seq+70
if end_seq > len(dic[id].fasta):
#start_seq = len(dic[id].fasta) - 70
end_seq = len(dic[id].fasta)
'''
if end_seq > len(dic[id].fasta):
start_seq = len(dic[id].fasta) - 70
end_seq = len(dic[id].fasta)
'''
'''
if start <= 5:
start_seq = 0
end_seq = 70
else:
start_seq = start-5-1
end_seq = start_seq+70
if end_seq > len(dic[id].fasta):
start_seq = len(dic[id].fasta) - 70
end_seq = len(dic[id].fasta)
'''
l += '>'+name+'\n'+dic[id].fasta[start_seq:end_seq]+'\n'
#if len(dic[id].fasta[start_seq:end_seq]) != 70:
# print (id, name, start_seq, end_seq, len(dic[id].fasta))
#sys.exit()

#print (l)
#sys.exit()
'''
for id in dic:
if dic[id].kw == 'noTM' and dic[id].signal == 1:
#if id == 'HPTR_HUMAN':
print (id, dic[id].kw, dic[id].topo, dic[id].fasta)
'''
l = ''
z = '#Name\tPEP_START\tPEP_END\tDOM_START\tDOM_END\n'
for id in dic:
if dic[id].fasta != '' and len(dic[id].fasta) >= 70:
for tm in dic[id].transmem:
type, start, end = dic[id].transmem[tm]
name = id+'_'+str(tm)
start_seq = start-1
end_seq = start_seq+70
'''
if end_seq > len(dic[id].fasta):
start_seq = len(dic[id].fasta) - 70
end_seq = len(dic[id].fasta)
'''
if end_seq > len(dic[id].fasta):
#start_seq = len(dic[id].fasta) - 70
end_seq = len(dic[id].fasta)

if id == 'FIMP_HUMAN':
print (dic[id].transmem)
m = '>'+name+'\n'+dic[id].fasta[start_seq:end_seq]
l += m + '\n'
if os.path.isfile('../data/part_c_fasta_'+species+'/'+name+'.fasta') == False:
open('../data/part_c_fasta_'+species+'/'+name+'.fasta', 'w').write(m)
z += name + '\t' + str(start) + '\t' + str(start_seq+70) + '\t' + str(start) + '\t' + str(end) + '\n'
if os.path.isfile('../data/part_c_signalp_'+species+'/'+name+'_long_TM.out') == False:
print (name)
os.system('/home/gurdeep/Downloads/signalp-4.1/signalp_TM -f long -s best ../data/part_c_fasta_'+species+'/'+name+'.fasta > ../data/part_c_signalp_'+species+'/'+name+'_long_TM.out')
if os.path.isfile('../data/part_c_signalp_'+species+'/'+name+'_long_noTM.out') == False:
print (name)
os.system('/home/gurdeep/Downloads/signalp-4.1/signalp_noTM -f long -s best ../data/part_c_fasta_'+species+'/'+name+'.fasta > ../data/part_c_signalp_'+species+'/'+name+'_long_noTM.out')
#sys.exit()
elif dic[id].fasta == '' and dic[id].transmem != {}:
print ('No FASTA found for', id)
#elif len(dic[id].fasta) < 70 and dic[id].transmem != {}:
# print ('FASTA seq < 70 aa for', id)


#open('../data/uniprot_sprot_human_signalp_input_part_c.fasta', 'w').write(l)
open('../data/uniprot_'+species+'_signalp_input_part_c.fasta', 'w').write(l)
open('../data/part_c_'+species+'_annotations.tsv', 'w').write(z)
Loading

0 comments on commit b7ef804

Please sign in to comment.