diff --git a/.gitignore b/.gitignore
index 2c7b77f..424dcaa 100644
--- a/.gitignore
+++ b/.gitignore
@@ -12,3 +12,7 @@ IRIS.egg-info/
 build/
 dist/
 __pycache__/
+conda_env_2/
+conda_env_3/
+/.snakemake/
+/readline/
diff --git a/IRIS/IRIS_annotate_ijc.py b/IRIS/IRIS_annotate_ijc.py
new file mode 100644
index 0000000..e2d39d2
--- /dev/null
+++ b/IRIS/IRIS_annotate_ijc.py
@@ -0,0 +1,220 @@
+import numpy as np
+import sys
+import os, glob, pyBigWig, argparse
+from scipy import stats
+import statsmodels.stats.weightstats as smw
+from . import config
+import warnings
+warnings.filterwarnings("ignore")
+
+#python retreive_SJ_info.py test_sj.para SE event_list_test.txt sj_info
+def read_SJMatrix_index(fn,outdir):
+	index = {}
+	for line in open(outdir+'/'+fn.split('/')[-1]+'.idx', 'r'):
+		ele = line.strip().split()
+		index[ele[0]] = int(ele[1])
+	return index
+
+def fetch_SJMatrix(eid, fn, delim, index, head_only):
+	with open(fn, 'r') as f:
+		if head_only:
+			ele = f.readline().strip().split(delim)
+			retrieved_text = np.asarray([ x.split('.aln')[0] for x in ele ])
+		else:
+			f.seek(index[eid], 0)
+			retrieved_text = np.asarray(f.readline().strip().split(delim))
+	return retrieved_text
+
+
+def loadParametersRow(filter_para, panel_list):
+	filter_cutoffs=''
+	if filter_para.strip()!='':
+		filter_cutoffs = map(float,filter_para.strip().split(' ')[0].split(','))
+		filter_panel_list = filter_para.strip().split(' ')[1].split(',')
+		panel_list+=filter_panel_list
+	else:
+		filter_panel_list =[]
+	return filter_cutoffs, filter_panel_list, panel_list
+
+
+
+def readEventRow(row, header_line):
+	if header_line=='' or header_line==False:
+		rs=row.strip().split('\t')
+		return rs
+	else:
+		rs=row.strip().split('\t')
+		return dict(zip(header_line, rs))
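read_SJMatrix_index and fetch_SJMatrix implement simple byte-offset indexing: each line of the .idx sidecar file stores a junction ID and the byte offset of that junction's row in the SJ count matrix, so a single seek() retrieves one row without scanning the file. A minimal usage sketch (the matrix/index file names and the junction ID below are illustrative, not actual IRIS paths):

    index = read_SJMatrix_index('sj_dir/SJ_count.demo.txt', 'sj_dir')
    # header row: sample names (eid is ignored when head_only=True)
    samples = fetch_SJMatrix('', 'sj_dir/SJ_count.demo.txt', '\t', index, True)[1:]
    # one data row: read counts for a single junction, one column per sample
    counts = fetch_SJMatrix('chr1:901:1000', 'sj_dir/SJ_count.demo.txt', '\t', index, False)[1:]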
+
+def convertAS2SJevent(line_dict, splicing_event_type):#only for this script
+	as_event=line_dict['as_event']
+	event_s=as_event.split(':')
+	if splicing_event_type=='SE':
+		event_row_list=[event_s[2]+':'+str(int(event_s[6])+1)+':'+event_s[4], event_s[2]+':'+str(int(event_s[5])+1)+':'+event_s[7], event_s[2]+':'+str(int(event_s[6])+1)+':'+event_s[7]]
+
+	elif splicing_event_type=='A5SS':# Only use one junction for inc. Need to improve by updating db later
+		event_row_list=[line_dict['chr']+':'+str(int(event_s[5])+1)+':'+event_s[8], line_dict['chr']+':'+str(int(event_s[7])+1)+':'+event_s[8]]
+
+	elif splicing_event_type=='A3SS': # Only use one junction for inc. Need to improve by updating db later
+		event_row_list=[line_dict['chr']+':'+str(int(event_s[9])+1)+':'+event_s[4],line_dict['chr']+':'+str(int(event_s[9])+1)+':'+event_s[6]]
+
+	else:
+		exit('[Error] Splicing event type not supported. Exiting.')
+	return event_row_list, as_event
+
+def summarizeSJ2ASevent(event_list_fin, splicing_event_type, sig_junction, outdir, out_prefix):
+	fout_summary_fname=outdir+'/SJ.'+out_prefix+'.'+splicing_event_type+'.summary_by_sig_event.txt'
+	fout_summary=open(fout_summary_fname,'w')
+	for event_idx, event_row in enumerate(open(event_list_fin)):
+		if event_idx==0:
+			header_list=readEventRow(event_row,'')
+			continue
+		line_dict=readEventRow(event_row, header_list)
+		event_row_list, as_event=convertAS2SJevent(line_dict, splicing_event_type)
+		as_event_result=[]
+		as_event_result_list=[]
+		for k in event_row_list:
+			if k not in sig_junction:
+				as_event_result.append(False)
+			else:
+				as_event_result.append(True)
+				as_event_result_list.append(k)
+		if as_event_result[0]==as_event_result[1]==as_event_result[2]==True:
+			fout_summary.write(as_event+'\tAll junctions\t'+';'.join(as_event_result_list)+'\n')
+		elif as_event_result[0]==as_event_result[1]==as_event_result[2]==False:
+			continue
+		else:
+			if as_event_result[0]==as_event_result[1]!=as_event_result[2]:
+				fout_summary.write(as_event+'\tOnly alternative junctions\t'+';'.join(as_event_result_list)+'\n')
+			else:
+				fout_summary.write(as_event+'\tOther combination\t'+';'.join(as_event_result_list)+'\n')
+	fout_summary.close()
+	return fout_summary_fname
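For reference, a worked example of the SE branch of convertAS2SJevent above. The as_event fields follow the AC:geneName:chr:strand:exonStart:exonEnd:upstreamEE:downstreamES layout of the splicing matrix; the coordinates are invented for illustration:

    demo = {'as_event': 'ENSG00000000001:GENE1:chr1:+:1000:1100:900:1300'}
    junctions, name = convertAS2SJevent(demo, 'SE')
    # junctions == ['chr1:901:1000', 'chr1:1101:1300', 'chr1:901:1300']
    # i.e. upstream inclusion, downstream inclusion, and skipping junction,
    # with rMATS 0-based starts shifted to 1-based intron starts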
+
+
+
+def main(args):
+	###Loading Parameters####
+	para_fin=args.parameter_file
+	splicing_event_type=args.splicing_event_type
+	if splicing_event_type!='SE':
+		exit('[Error] Invalid AS event type.')
+	event_list_fin=args.screening_result_event_list
+	outdir=args.outdir.rstrip('/')
+
+	os.system('mkdir -p '+outdir)
+	fetching_sj_col=1
+	out_prefix,db_dir,filter1_para,filter2_para,filter3_para=[l.strip() for l in open(para_fin)][:5]
+	db_dir=db_dir.rstrip('/')
+	if os.path.isdir(db_dir+'_sjc'): #automatically use db_sjc if in the same dir. Otherwise, use the user input db_dir
+		db_dir=db_dir+'_sjc'
+	panel_list=[out_prefix]
+
+	filter1_cutoffs, filter1_panel_list, panel_list = loadParametersRow(filter1_para, panel_list)
+	filter2_cutoffs, filter2_panel_list, panel_list = loadParametersRow(filter2_para, panel_list)
+	filter3_cutoffs, filter3_panel_list, panel_list = loadParametersRow(filter3_para, panel_list)
+	tumor_dict=dict.fromkeys(filter2_panel_list,'')
+	tumor_dict[out_prefix]=''
+	pvalue_cutoff_normal=''; pvalue_cutoff_tumor=''
+	filter1_group_cutoff=''; filter2_group_cutoff=''; filter3_group_cutoff='';
+	if filter1_cutoffs!='':
+		pvalue_cutoff_normal,filter1_group_cutoff=filter1_cutoffs[3:]
+	if filter2_cutoffs!='':
+		pvalue_cutoff_tumor,filter2_group_cutoff=filter2_cutoffs[3:]
+	if filter3_cutoffs!='':
+		pvalue_cutoff_normal,filter3_group_cutoff=filter3_cutoffs[3:]
+
+	inc_read_cov_cutoff=int(args.inc_read_cov_cutoff)#2
+	event_read_cov_cutoff=int(args.event_read_cov_cutoff)#10
+
+	##Load IRIS reference panels to 'fin_list', 'index'
+	index={}
+	fin_list={}
+	for group_name in panel_list:
+		fin_list[group_name]=db_dir+'/'+group_name+'/sjc_matrix/SJ_count.'+group_name+'.txt'
+	for group in fin_list.keys():
+		if not os.path.isfile(fin_list[group]+'.idx'):
+			exit('[Error] Need to index '+fin_list[group])
+		index[group]=read_SJMatrix_index(fin_list[group],'/'.join(fin_list[group].split('/')[:-1]))
+
+	print('[INFO] Done loading index '+' '.join(panel_list))
+	tot=config.file_len(event_list_fin)-1
+	fout_ijc=open(outdir+'/'+event_list_fin.split('/')[-1]+'.ijc_info.txt','w')
+	header_line=[]
+	sample_size={}
+
+	header_line=['ijc_ratio', 'mean_ijc_by_group', 'percent_sample_imbalanced', 'sample_imbalanced_by_group']
+	header_list=[]
+	print('[INFO] Retrieving inclusion junction info')
+	for event_idx, event_row in enumerate(open(event_list_fin)):
+		config.update_progress(event_idx/(0.0+tot))
+		if event_idx==0:
+			header_list=readEventRow(event_row,'')
+			fout_ijc.write(event_row.strip()+'\t'+'\t'.join(header_line)+'\n')
+			continue
+		line_dict=readEventRow(event_row, header_list)
+		event_row_list, as_event=convertAS2SJevent(line_dict, splicing_event_type)
+
+		forms={}
+		if splicing_event_type=='SE':
+			inc1, inc2, skp= event_row_list
+			forms={'inc1':inc1,'inc2':inc2,'skp':skp}
+		else:
+			exit('[Error] Invalid AS event type.')
+		#Initiate psi matrix by each row to 'sj'
+		sj={}
+		for form in forms:
+			k=forms[form]
+			sj[form]={}
+			for group in panel_list:
+				random_key=index[group].keys()[0]
+				sample_names=map(str,fetch_SJMatrix(random_key,fin_list[group],'\t',index[group],True)[fetching_sj_col:])
+				sample_size[group]=len(sample_names)
+				if k in index[group]:
+					sj[form][group]=map(int,fetch_SJMatrix(k,fin_list[group],'\t',index[group], False)[fetching_sj_col:])
+				else:
+					sj[form][group]=[0]*sample_size[group]
+
+		imbalance={}
+		total_sample={}
+		inc_ratio={}
+		i1_list={}
+		i2_list={}
+		for group in panel_list:
+			i1=map(int,sj['inc1'][group])
+			i2=map(int,sj['inc2'][group])
+			s=map(int,sj['skp'][group])
+			i1_list[group]=[]
+			i2_list[group]=[]
+			imbalance[group]=0
+			total_sample[group]=0
+			for pidx in range(0, len(sj['inc1'][group])):
+				if i1[pidx]+i2[pidx]<inc_read_cov_cutoff or i1[pidx]+i2[pidx]+s[pidx]<event_read_cov_cutoff:
+					continue
+				total_sample[group]+=1
+				ratio=i1[pidx]/(i2[pidx]+0.01)
+				if ratio>2 or ratio<0.5:
+					imbalance[group]+=1
+				i1_list[group].append(i1[pidx])
+				i2_list[group].append(i2[pidx])
+
+		i_by_group='|'.join([str(round(np.mean(i1_list[group]),1))+','+str(round(np.mean(i2_list[group]),1)) for group in panel_list])
+		i1_sum=sum(sum(i1_list[g]) for g in panel_list)
+		i2_sum=sum(sum(i2_list[g]) for g in panel_list)
+		ratio=round(i1_sum/(i2_sum+0.01),3)
+
+
imbalance_by_group='|'.join([str(imbalance[group])+','+str(total_sample[group]) for group in panel_list]) + imb_count=sum(imbalance[group] for group in panel_list) + tot_count=sum(total_sample[group] for group in panel_list) + imb_perc='-' + if tot_count!=0: + imb_perc=round(imb_count/(tot_count+0.0),3) + fout_ijc.write(event_row.rstrip()+'\t'+'\t'.join([as_event,str(ratio), i_by_group, str(imb_perc),imbalance_by_group])+'\n') + + fout_ijc.close() + + +if __name__ == '__main__': + main() diff --git a/IRIS/IRIS_append_cpm.py b/IRIS/IRIS_append_cpm.py new file mode 100644 index 0000000..d3bd688 --- /dev/null +++ b/IRIS/IRIS_append_cpm.py @@ -0,0 +1,57 @@ +import sys, glob, os +from . import config +#append annotation to all screening result and epitope result in a specified screening folder +def loadPTsummary(fin): + PT_event={} + for l in open(fin): + ls=l.strip().split('\t') + PT_event[ls[0]]=ls[1]+'|'+ls[2]+'|'+ls[3]+'|'+ls[4] + return PT_event + + +def annotateRAbyPTEvent(fin, event_col, CPM_event, fout_fname): + fout=open(fout_fname,'w') + for n,l in enumerate(open(fin)): + if n==0: + annotation='cpm_test_summary' + fout.write(l.strip()+'\t'+annotation+'\n') + continue + ls=l.strip().split('\t') + annotation='-' + if CPM_event.has_key(ls[event_col]): + annotation=CPM_event[ls[event_col]] + fout.write(l.strip()+'\t'+annotation+'\n') + fout.close() + +def main(args): + CPMsummary=args.cpm_summary + CPM_event=loadPTsummary(CPMsummary) + print('[INFO] Number of event in CPM screen summary:'+str(len(CPM_event))) + splicing_event_type=args.splicing_event_type + screening_result_dir=args.outdir.rstrip('/') + + epitope_file_junction_file_list= glob.glob(screening_result_dir+'/'+splicing_event_type+'.*/epitope_summary.junction-based.txt') + epitope_file_peptide_file_list= glob.glob(screening_result_dir+'/'+splicing_event_type+'.*/epitope_summary.peptide-based.txt') + extracellular_as_file_list= glob.glob(screening_result_dir+'/*'+splicing_event_type+'.*.ExtraCellularAS.txt') + screening_file_list=glob.glob(screening_result_dir+'/*.'+splicing_event_type+'.tier1.txt')+glob.glob(screening_result_dir+'/*.'+splicing_event_type+'.tier2tier3.txt') + + + for fin_fname in screening_file_list: + print '[INFO] Integrating CPM test result to', fin_fname + annotateRAbyPTEvent(fin_fname, 0, CPM_event, fin_fname+'.integratedCPM.txt') + + for fin_fname in extracellular_as_file_list: + print '[INFO] Integrating CPM test result to', fin_fname + annotateRAbyPTEvent(fin_fname, 0, CPM_event, fin_fname+'.integratedCPM.txt') + + for fin_fname in epitope_file_peptide_file_list: + print '[INFO] Integrating CPM test result to', fin_fname + annotateRAbyPTEvent(fin_fname, 1, CPM_event, fin_fname+'.integratedCPM.txt') + + for fin_fname in epitope_file_junction_file_list: + print '[INFO] Integrating CPM test result to', fin_fname + annotateRAbyPTEvent(fin_fname, 0, CPM_event, fin_fname+'.integratedCPM.txt') + + +if __name__ == '__main__': + main() diff --git a/IRIS/IRIS_append_sjc.py b/IRIS/IRIS_append_sjc.py new file mode 100644 index 0000000..3421e0d --- /dev/null +++ b/IRIS/IRIS_append_sjc.py @@ -0,0 +1,95 @@ +import sys, glob, os +from . 
import config +#append annotation to all screening result and epitope result in a specified screening folder +def loadPTsummary(fin): + PT_event={} + for l in open(fin): + ls=l.strip().split('\t') + PT_event[ls[0]]=ls[1]+'|'+ls[2] + return PT_event + +def loadIJCsummary(fin): + IJC_info={} + for i,l in enumerate(open(fin)): + if i==0: + continue + ls=l.strip().split('\t') + IJC_info[ls[0]]=ls[-4]+'\t'+ls[-3]+'\t'+ls[-2]+'\t'+ls[-1] + return IJC_info + +def annotateRAbyPTEvent(fin, event_col, PT_event, fout_fname, IJC_info): + fout=open(fout_fname,'w') + for n,l in enumerate(open(fin)): + if n==0: + annotation='sjc_test_summary' + if IJC_info!={}: + annotation+='\t'+'\t'.join(['ijc_ratio', 'mean_ijc_by_group', 'percent_sample_imbalanced', 'sample_imbalanced_by_group']) + fout.write(l.strip()+'\t'+annotation+'\n') + continue + ls=l.strip().split('\t') + annotation='-' + if PT_event.has_key(ls[event_col]): + annotation=PT_event[ls[event_col]] + if IJC_info!={}: + if IJC_info.has_key(ls[event_col]): + annotation+='\t'+IJC_info[ls[event_col]] + else: + annotation+='\t'+'\t'.join(['-']*4) + fout.write(l.strip()+'\t'+annotation+'\n') + fout.close() + +def main(args): + PTsummary=args.sjc_summary + PT_event=loadPTsummary(PTsummary) + print('[INFO] Number of event in SJ count screen summary:'+str(len(PT_event))) + splicing_event_type=args.splicing_event_type + screening_result_dir=args.outdir.rstrip('/') + + epitope_file_junction_file_list= glob.glob(screening_result_dir+'/'+splicing_event_type+'.*/epitope_summary.junction-based.txt') + epitope_file_peptide_file_list= glob.glob(screening_result_dir+'/'+splicing_event_type+'.*/epitope_summary.peptide-based.txt') + extracellular_as_file_list= glob.glob(screening_result_dir+'/*'+splicing_event_type+'.*.ExtraCellularAS.txt') + screening_file_list=glob.glob(screening_result_dir+'/*.'+splicing_event_type+'.tier1.txt')+glob.glob(screening_result_dir+'/*.'+splicing_event_type+'.tier2tier3.txt') + + #optional, if ijc imbalance + add_ijc_info=args.add_ijc_info + use_existing_result=args.use_existing_result#use existing ijc result + para_file=args.parameter_file + event_list_file=args.screening_result_event_list + inc_read_cov_cutoff=args.inc_read_cov_cutoff#2 + event_read_cov_cutoff=args.event_read_cov_cutoff#10 + IJC_info={} + if add_ijc_info or use_existing_result: + if para_file=='' or event_list_file=='': + exit('[Error] Specify parameters and event list file for retrieving inclusion junction information.') + + if add_ijc_info and use_existing_result==False: + cmd='IRIS annotate_ijc -p '+para_file+' --splicing-event-type '+splicing_event_type+' -e '+event_list_file+' -o '+screening_result_dir+' --inc-read-cov-cutoff '+str(inc_read_cov_cutoff)+' --event-read-cov-cutoff '+str(event_read_cov_cutoff) + print('[INFO] Annotating inclusion junction info to '+event_list_file) + print(cmd) + os.system(cmd) + if use_existing_result or add_ijc_info: + file_path=screening_result_dir+'/'+event_list_file.split('/')[-1]+'.ijc_info.txt' + if os.path.exists(file_path)==False: + exit('[Error] Result IJC annotation file not found in path '+file_path) + print('[INFO] Loading inclusion junction info from '+file_path) + IJC_info=loadIJCsummary(file_path) + + for fin_fname in screening_file_list: + print '[INFO] Integrating SJC test result to', fin_fname + annotateRAbyPTEvent(fin_fname, 0, PT_event, fin_fname+'.integratedSJC.txt',IJC_info) + + for fin_fname in extracellular_as_file_list: + print '[INFO] Integrating SJC test result to', fin_fname + 
annotateRAbyPTEvent(fin_fname, 0, PT_event, fin_fname+'.integratedSJC.txt',IJC_info)
+
+	for fin_fname in epitope_file_peptide_file_list:
+		print '[INFO] Integrating SJC test result to', fin_fname
+		annotateRAbyPTEvent(fin_fname, 1, PT_event, fin_fname+'.integratedSJC.txt',IJC_info)
+
+	for fin_fname in epitope_file_junction_file_list:
+		print '[INFO] Integrating SJC test result to', fin_fname
+		annotateRAbyPTEvent(fin_fname, 0, PT_event, fin_fname+'.integratedSJC.txt',IJC_info)
+
+
+if __name__ == '__main__':
+	main()
diff --git a/IRIS/IRIS_epitope_post.py b/IRIS/IRIS_epitope_post.py
index bc9d49d..ee0eb8a 100644
--- a/IRIS/IRIS_epitope_post.py
+++ b/IRIS/IRIS_epitope_post.py
@@ -3,6 +3,7 @@ from . import config
 # SUPPORT -u mode. will parse both form
+
 def parsePredFile(path_to_peptide_file,fin,fout_pass,IC50_cutoff,gene_name,junction,pred_med,length):
 	seq_list=[]
 	seq_list_seq=[]
@@ -25,9 +26,30 @@
 			if med_ic50<=IC50_cutoff:
 				fout_pass.write('{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n'.format(gene_name, junction,seq_list[int(l['seq_num'])-1],l['start'],l['end'],length,pred_med, l['allele'],l['peptide'],junc_pep,med_ic50))
 
-def writePositivePrediction(output_path, IC50_cutoff):
+def retrievePrioritizedfromPrimary(output_path, splicing_event_type):
+	fin_list=glob.glob(output_path+'/tmp/prot.compared/skp/*.fa')
+	fin_list=fin_list+glob.glob(output_path+'/tmp/prot.compared/inc/*.fa')
+	pred_fin_list=[]
+	retrieve_path='/'.join(output_path.split('/')[:-1])+'/'+splicing_event_type+'.tier1'
+	for fin in fin_list:
+		event_name_list=fin.split('/')[-1].split('.')
+		prediction_result_fn=retrieve_path+'/tmp/pred/'+event_name_list[1].split('_')[0]+'/'+'.'.join(event_name_list[:-2])##WORKING ON
+		prediction_result_fin_list=glob.glob(prediction_result_fn+'*')
+		pred_fin_list+=prediction_result_fin_list
+	return pred_fin_list
+
+def writePositivePrediction(output_path, IC50_cutoff, splicing_event_type, retrieve_prioritized):
 	print '[INFO] Collecting binding predictions:'+output_path.rstrip('/').split('/')[-1]
-	pred_fin_list=glob.glob(output_path+'/tmp/pred/*/*')
+	pred_fin_list=[]
+	if retrieve_prioritized:
+		if glob.glob(output_path+'/tmp/pred/*/*')!=[] and glob.glob('/'.join(output_path.split('/')[:-1])+'/'+splicing_event_type+'.tier1'+'/tmp/pred/*/*')==[]:
+			print '[INFO] Tier1 comparison result is empty. Retrieving tier2&tier3 prediction.'
+			pred_fin_list=glob.glob(output_path+'/tmp/pred/*/*')
+		else:
+			print '[INFO] Retrieving prediction from tier1 comparison.'
+ pred_fin_list= retrievePrioritizedfromPrimary(output_path, splicing_event_type) + else: + pred_fin_list=glob.glob(output_path+'/tmp/pred/*/*') fout_pass=open(output_path+'/pred_filtered.score'+str(IC50_cutoff)+'.txt','w') tot=len(pred_fin_list)-1 for i,fin in enumerate(pred_fin_list): @@ -86,17 +108,69 @@ def loadGeneExp(gene_exp_matrix_fin): Exp[name]=map(str,[mean, Q1, Q3]) return Exp,['meanGeneExp','Q1GeneExp','Q3GeneExp'] +def buildJunctionKmer(seq, kmer_length, kmer_dict, name, keeper, anno): + if anno in keeper: + return kmer_dict + + else: + keeper[anno]='' + for k in kmer_length: + for i in range(len(seq)-k+1): + s=seq[i:i+k] + if s not in kmer_dict: + kmer_dict[s]=[] + kmer_dict[s].append(name) + return kmer_dict + +def epitopeUniquenessAnnotation(kmer_length, screening_dir): + kmer={} + keeper={} + peptide_fin_dir=screening_dir+'/*.*/tmp/prot/*' + peptide_fin_list=glob.glob(peptide_fin_dir) + print '[INFO] Total pool of splice junctions:',len(peptide_fin_list) + tot=len(peptide_fin_list)-1 + for m, peptide_fin in enumerate(peptide_fin_list): + config.update_progress(m/(0.0+tot)) + pep_gene_name=peptide_fin.split('/')[-1].split('_')[0] + anno='' + for l in open(peptide_fin): + if l[0]=='>': + anno=l.strip() + continue + ls=l.strip().upper() + kmer=buildJunctionKmer(ls, kmer_length, kmer, pep_gene_name, keeper, anno) + anno='' + print '[INFO] Total number of AS junction k mers from the analyzed sequencing data:',len(kmer) + return kmer + + +def loadKmer(kmer_file): + kmer={} + for l in open(kmer_file): + ls=l.strip().split('\t') + kmer[ls[0]]=ls[1].strip(';') + return kmer -def writePeptideSummary(output_path, screening_result_path, gene_exp_path, IC50_cutoff, sample_HLA, sample_list): +def writePeptideSummary(output_path, screening_result_path, gene_exp_path, IC50_cutoff, sample_HLA, sample_list, match_normal, uniqueness, epitope_len_list, db_dir): pep_dict={} - screening_result_dict, screening_result_header=loadScreening(screening_result_path) + kmer_dict={} + header_ext=[] if gene_exp_path: gene_exp_dict, gene_exp_header=loadGeneExp(gene_exp_path)#exp. or fpkm1. 
matrix
+		header_ext+=gene_exp_header
+	if match_normal:
+		print '[INFO] Loading kmers for checking canonical proteome: '+','.join(map(str,epitope_len_list))
+		for kmer_len in epitope_len_list:
+			kmer_dict[kmer_len]=loadKmer(db_dir+'/resources/kmers/uniprot-all.fasta.'+str(kmer_len)+'mer_dict.txt')
+		header_ext+=['canonical_match']
+	if uniqueness:
+		junction_kmer=epitopeUniquenessAnnotation(epitope_len_list, '/'.join(screening_result_path.split('/')[:-1]))
+		header_ext+=['uniqueness']
+	screening_result_dict, screening_result_header=loadScreening(screening_result_path)
+
 	fout_screening=open(output_path+'/epitope_summary.peptide-based.txt','w')
-	if gene_exp_path:
-		fout_screening.write('\t'.join(['epitope','as_event','junction_peptide_form','num_hla','num_sample']+sample_list+screening_result_header+gene_exp_header)+'\n')
-	else:
-		fout_screening.write('\t'.join(['epitope','as_event','junction_peptide_form','num_hla','num_sample']+sample_list+screening_result_header)+'\n')
+	fout_screening.write('\t'.join(['epitope','as_event','junction_peptide_form','inclusion_form','num_hla','num_sample','hla_types']+sample_list+screening_result_header+header_ext)+'\n')
+
 	for l in open(output_path+'/pred_filtered.score'+str(IC50_cutoff)+'.txt'):
 		ls=l.strip().split('\t')
@@ -105,15 +179,18 @@
 		pred_score=ls[10]
 		splicing=(ls[0]+'_'+ls[1]).replace('_',':')
 		form=ls[2].split(':')[5][:3]
+		inc_form=''
+		if form=='inc':
+			inc_form=ls[2].split(':')[5][:4]
-		if peptide+'|'+splicing+form not in pep_dict:
-			pep_dict[peptide+'|'+splicing+form]=[]
-		pep_dict[peptide+'|'+splicing+form].append(hla_type+'|'+pred_score)
-#		print len(pep_dict)
+		if peptide+'|'+splicing+form+'|'+inc_form not in pep_dict:
+			pep_dict[peptide+'|'+splicing+form+'|'+inc_form]=[]
+		pep_dict[peptide+'|'+splicing+form+'|'+inc_form].append(hla_type+'|'+pred_score)
 	for k in pep_dict:
 		ks=k.split('|')
 		peptide=ks[0]
 		splicing=ks[1][:-3]
 		form=ks[1][-3:]
+		inc_form=ks[2]
 		hla_info=pep_dict[k]
 		hla={}
 		for info in hla_info:
@@ -122,7 +199,7 @@
 			pred_score=info_split[1]
 			hla[hla_type]=pred_score
 		num_hla=len(hla)
-		line= [peptide, splicing, form, str(num_hla)]
+		line= [peptide, splicing, form, inc_form, str(num_hla), ';'.join(sorted(hla.keys()))]
 		patient_count=len(sample_list)
 		for p in sample_list:
 			patient_hla_info=[]
@@ -135,11 +212,25 @@
 			patient_hla_info_line=';'.join(patient_hla_info)
 			line.append(patient_hla_info_line)
 		line.insert(4,str(patient_count))
+		optional_annotations=[]
 		if gene_exp_path:
 			gene_exp_list=gene_exp_dict[splicing.split(':')[0]]
-			fout_screening.write('\t'.join(line+screening_result_dict[splicing]+gene_exp_list)+'\n')
-		else:
-			fout_screening.write('\t'.join(line+screening_result_dict[splicing])+'\n')
+			optional_annotations+=gene_exp_list
+		if match_normal:
+			matched_protein='NonCanonical' if peptide not in kmer_dict[len(peptide)] else kmer_dict[len(peptide)][peptide]
+			optional_annotations+=[matched_protein]
+		if uniqueness:
+			if peptide not in junction_kmer:
+				print 'error', peptide
+			else:
+				uniqueness_annotation='-'
+				if len(junction_kmer[peptide])==1:
+					uniqueness_annotation='unique:'+junction_kmer[peptide][0]
+				else:
+					uniqueness_annotation='multi:'+';'.join(junction_kmer[peptide])
+				optional_annotations+=[uniqueness_annotation]
+		fout_screening.write('\t'.join(line+screening_result_dict[splicing]+optional_annotations)+'\n')
+
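The uniqueness annotation above leans on buildJunctionKmer, defined earlier in this file, to pool k-mers across all junction peptides in the screening directory. A toy illustration of its behavior (peptide, gene name, and FASTA header invented):

    kmers = buildJunctionKmer('MKTAYIADQR', [9], {}, 'GENE1', {}, '>toy_header')
    # kmers == {'MKTAYIADQ': ['GENE1'], 'KTAYIADQR': ['GENE1']}
    # a 9-mer epitope seen in exactly one gene is later written as 'unique:GENE1',
    # one seen in several genes as 'multi:GENE1;GENE2;...'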
 def writeJunctionSummary(output_path, screening_result_path, gene_exp_path, IC50_cutoff, sample_HLA, sample_list):
 	junction_dict={}
@@ -159,13 +250,9 @@
 		pred_score=ls[10]
 		splicing=(ls[0]+'_'+ls[1]).replace('_',':')
 		form=ls[2].split(':')[5][:3]
-		# if peptide+'|'+splicing+form not in pep_dict:
-		# 	pep_dict[peptide+'|'+splicing+form]=[]
-		# pep_dict[peptide+'|'+splicing+form].append(hla_type+'|'+pred_score)
 		if splicing+form not in junction_dict:
 			junction_dict[splicing+form]=[]
 		junction_dict[splicing+form].append(hla_type+'|'+pred_score)
-#	print len(pep_dict)
 	for k in junction_dict:
 		splicing=k[:-3]
 		form=k[-3:]
@@ -192,28 +279,53 @@
 			patient_hla_info_line=';'.join(patient_hla_info)
 			line.append(patient_hla_info_line)
 		line.insert(3,str(patient_count))
+		optional_annotations=[]
 		if gene_exp_path:
 			gene_exp_list=gene_exp_dict[splicing.split(':')[0]]
-			fout_screening.write('\t'.join(line+screening_result_dict[splicing]+gene_exp_list)+'\n')
-		else:
-			fout_screening.write('\t'.join(line+screening_result_dict[splicing])+'\n')
+			optional_annotations+=gene_exp_list
+		fout_screening.write('\t'.join(line+screening_result_dict[splicing]+optional_annotations)+'\n')
 
 def main(args):
 	outdir=args.outdir
+	splicing_event_type=args.splicing_event_type
 	IC50_cutoff=args.ic50_cut_off
 	gene_exp_matrix=args.gene_exp_matrix
+	match_normal=True if args.no_match_to_canonical_proteome==False else False
+	uniqueness=True if args.no_uniqueness_annotation==False else False
+	prioritized_only=args.tier3_only
+	keep_exist=args.keep_exist
+
+	write_positive=True
+	if keep_exist==True:
+		if os.path.exists(outdir+'/'+splicing_event_type+'.tier2tier3/pred_filtered.score'+str(IC50_cutoff)+'.txt') or os.path.exists(outdir+'/'+splicing_event_type+'.tier1/pred_filtered.score'+str(IC50_cutoff)+'.txt'):
+			write_positive=False
 
-	writePositivePrediction(outdir+'/primary', IC50_cutoff)
-	writePositivePrediction(outdir+'/prioritized', IC50_cutoff)
+	sample_list,sample_HLA=loadSampleHLA(args.mhc_by_sample)
+	analysis_name,db_dir=[l.strip() for l in open(args.parameter_fin)][0:2]
+	db_dir='/'.join(db_dir.rstrip('/').split('/')[:-1])
+	analysis_name=analysis_name+'.'+splicing_event_type
+	screening_result_path=outdir+'/'+analysis_name
+	if config.file_len(screening_result_path+'.tier1.txt')==1 and prioritized_only==False:
+		prioritized_only=True
+		print "[INFO] No tier1 comparisons (tissue-matched normal) found. Using tier2tier3-only mode."
+
+	epitope_len_list=map(int,args.epitope_len_list.split(','))
+	if write_positive:
+		if prioritized_only:
+			writePositivePrediction(outdir+'/'+splicing_event_type+'.tier2tier3', IC50_cutoff, splicing_event_type, False)
+		else:
+			writePositivePrediction(outdir+'/'+splicing_event_type+'.tier1', IC50_cutoff, splicing_event_type, False)
+			writePositivePrediction(outdir+'/'+splicing_event_type+'.tier2tier3', IC50_cutoff, splicing_event_type, True)
+	else:
+		print '[INFO] File(s) '+outdir+'/'+splicing_event_type+'.*/pred_filtered.score'+str(IC50_cutoff)+'.txt'+' exist. Skipping the step generating this output.'
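writeJunctionSummary above keys its aggregation on splicing+form, pooling passing hla_type|IC50 pairs per junction before the per-sample columns are written. A toy illustration of that keying (alleles, scores, and the event string are invented):

    junction_dict = {}
    for hla_type, score, splicing, form in [
            ('HLA-A*02:01', '35.2', 'ENSG00000000001:GENE1:chr1:+:1000:1100:900:1300', 'skp'),
            ('HLA-B*07:02', '120.8', 'ENSG00000000001:GENE1:chr1:+:1000:1100:900:1300', 'skp')]:
        junction_dict.setdefault(splicing + form, []).append(hla_type + '|' + score)
    # one junction, two passing HLA alleles -> num_hla == 2;
    # k[:-3] and k[-3:] later split the key back into event and form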
+ + if prioritized_only==False: + writePeptideSummary(outdir+'/'+splicing_event_type+'.tier1', outdir+'/'+analysis_name+'.tier1.txt', gene_exp_matrix, IC50_cutoff, sample_HLA, sample_list, match_normal, uniqueness, epitope_len_list, db_dir) + writePeptideSummary(outdir+'/'+splicing_event_type+'.tier2tier3', outdir+'/'+analysis_name+'.tier2tier3.txt', gene_exp_matrix, IC50_cutoff, sample_HLA, sample_list, match_normal, uniqueness, epitope_len_list, db_dir) - sample_list,sample_HLA=loadSampleHLA(args.mhc_by_sample) - analysis_name=[l.strip() for l in open(args.parameter_fin)][0] - screening_result_path=outdir+'/'+analysis_name+'.primary.txt' - writePeptideSummary(outdir+'/primary', outdir+'/'+analysis_name+'.primary.txt', gene_exp_matrix, IC50_cutoff, sample_HLA, sample_list) - writePeptideSummary(outdir+'/prioritized', outdir+'/'+analysis_name+'.prioritized.txt', gene_exp_matrix, IC50_cutoff, sample_HLA, sample_list) - - writeJunctionSummary(outdir+'/primary', outdir+'/'+analysis_name+'.primary.txt', gene_exp_matrix, IC50_cutoff, sample_HLA, sample_list) - writeJunctionSummary(outdir+'/prioritized', outdir+'/'+analysis_name+'.prioritized.txt', gene_exp_matrix, IC50_cutoff, sample_HLA, sample_list) + if prioritized_only==False: + writeJunctionSummary(outdir+'/'+splicing_event_type+'.tier1', outdir+'/'+analysis_name+'.tier1.txt', gene_exp_matrix, IC50_cutoff, sample_HLA, sample_list) + writeJunctionSummary(outdir+'/'+splicing_event_type+'.tier2tier3', outdir+'/'+analysis_name+'.tier2tier3.txt', gene_exp_matrix, IC50_cutoff, sample_HLA, sample_list) if __name__ == '__main__': diff --git a/IRIS/IRIS_extract_sjc.py b/IRIS/IRIS_extract_sjc.py new file mode 100644 index 0000000..406937e --- /dev/null +++ b/IRIS/IRIS_extract_sjc.py @@ -0,0 +1,238 @@ +import argparse +import pysam +import os +# Adopted by Yang Pan 2020.12.20 (panyang@ucla.edu) +# Author: Robert Wang, PhD Student (Xing Lab) +# Date: 2020.10.02 +# E-mail: robwang@pennmedicine.upenn.edu + +# This is a script that extracts splice junctions from a STAR-aligned BAM +# file and annotates each splice junction with the number of uniquely +# mapped reads that support that splice junction. Supporting reads for +# annotated splice junctions are by default only required to have a +# minimum overhang of 1 bp. Supporting reads for unannotated and +# canonical splice junctions (GT-AG/CT-AC; GC-AG/CT-GC; AT-AC/GT-AT) are +# by default only required to have a minimum overhang of 8 bp. Supporting +# reads for unannotated and non-canonical splice junctions however are +# by default required to have a minimum overhang of 10 bp. The resulting +# output file is a TSV with two fields: (1) a splice junction ID with +# format [chr#]:[start]:[end], (2) number of uniquely mapped reads +# supporting the splice junction. All genomic coordinates used in +# describing each splice junction are 1-based. Make sure that the BAM +# file and genome FASTA file have been indexed. 
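As described above, the extractor's output is a two-column TSV: a splice junction ID in [chr#]:[start]:[end] form (1-based) and the number of uniquely mapped supporting reads. A minimal reader sketch (the file name is whatever was passed to -o; 'SJcount.txt' here is illustrative):

    sj_counts = {}
    for line in open('SJcount.txt'):
        sj_id, count = line.strip().split('\t')
        sj_counts[sj_id] = int(count)
    # e.g. sj_counts.get('chr1:901:1000', 0) -> unique-read support for that junction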
+ +# UPDATE (2020.10.02): Users have the option of strictly filtering +# for reads of a specified size using the -r option + +# Usage: +# python extract_SJ.py -i /path/to/BAM/file \ +# -g /path/to/annotation/GTF/file \ +# -a [minimum overhang length for annotated SJs, default: 1] +# -c [minimum overhang length for unannotated canonical SJs, default: 8] \ +# -n [minimum overhang length for unannotated non-canonical SJs, default: 10] \ +# -r [length of reads to keep when counting junction reads] \ +# -f /path/to/genome/fasta/file \ +# -o /path/to/output/file + +# Dependencies: +# * argparse +# * pysam + +def get_introns(blocks): + introns = [] + + # N blocks are associated with N-1 introns + for i in range(len(blocks)-1): + intronStart = blocks[i][1] + 1 + intronEnd = blocks[i+1][0] + introns += [(intronStart, intronEnd)] + + return introns + +def isCanonical(dn1, dn2): + # Construct tuple representing SJ dinucleotides + sjDN = (dn1, dn2) + + # Establish canonical SJs + canonicalSJ = [('GT', 'AG'), ('CT', 'AC'), ('GC', 'AG'), + ('CT', 'GC'), ('AT', 'AC'), ('GT', 'AT')] + + return sjDN in canonicalSJ + +def get_threshold(chr, introns, genome, annoSJ, minOverhang, minOverhangC, minOverhangNC): + isAnno = [] + isCanon = [] + + # Iterate through each intron + for i in range(len(introns)): + # Construct splice junction ID + sjInfo = [chr, introns[i][0], introns[i][1]] + sjID = ':'.join(map(str, sjInfo)) + + # Retrieve dinucleotides for splice junction + dn1 = genome.fetch(chr, introns[i][0]-1, introns[i][0]+1) + dn2 = genome.fetch(chr, introns[i][1]-2, introns[i][1]) + + isAnno.append(sjID in annoSJ) + isCanon.append(isCanonical(dn1, dn2)) + + if all(isAnno): + threshold = minOverhang + elif all(isCanon): + threshold = minOverhangC + else: + threshold = minOverhangNC + + return threshold + +def update_SJdb(read, sjDB, annoSJ, genome, minOverhang, minOverhangC, minOverhangNC): + # Get chromosome for read + chr = read.reference_name + + # Extract coordinates of blocks for each read + blocks = read.get_blocks() + introns = get_introns(blocks) + + # Determine threshold for read anchor lengths + threshold = get_threshold(chr, introns, genome, annoSJ, minOverhang, minOverhangC, minOverhangNC) + + # Compute anchor lengths + leftAnchorLen = blocks[0][1] - blocks[0][0] + rightAnchorLen = blocks[len(introns)][1] - blocks[len(introns)][0] + + # Only keep reads that satisfy appropriate anchor length + # threshold + if min(leftAnchorLen, rightAnchorLen) >= threshold: + # Iterate through introns and check if SJ is canonical + for i in range(len(introns)): + # Construct splice junction ID + sjInfo = [chr, introns[i][0], introns[i][1]] + sjID = ':'.join(map(str, sjInfo)) + + # Update sjDB with SJ + # Check if SJ exists in dictionary + if sjID in sjDB: + # Increment current read count by 1 + sjDB[sjID] += 1 + else: + # Add sjID to sjDB with one read + sjDB[sjID] = 1 + + # Return the updated dictionary + return sjDB + +def get_transcript_ID(infoString): + infoStringArr = infoString.split(';') + # Get position in the INFO string that has the transcript ID + idx = [i for i, s in enumerate(infoStringArr) if 'transcript_id' in s][0] + return infoStringArr[idx].split('"')[1] + +def build_anno_SJdb(gtfPath, chrList): + annoSJ = {} + + # Create dictionary object to store exons of a transcript + exons = {} + # Create dictionary object to store chr of a transcript + chrDict = {} + + # Read through every line of the gtfPath + with open(gtfPath) as f: + for line in f: + if not line.startswith('#'): + info = 
line.strip().split('\t') + if info[0] in chrList and info[2] == 'exon': + exonStart = int(info[3]) + exonEnd = int(info[4]) + transcriptID = get_transcript_ID(info[8]) + # Check if transcript is included in exons + if transcriptID in exons: + exons[transcriptID] += [(exonStart, exonEnd)] + else: + exons[transcriptID] = [(exonStart, exonEnd)] + + # Check if transcript is included in chrDict + if transcriptID not in chrDict: + chrDict[transcriptID] = info[0] + + # Extract splice junctions from exons + for transcriptID in exons: + chr = chrDict[transcriptID] + + # Sort the tuple of exons + exonList = sorted(exons[transcriptID]) + + # Build annoSJ + for i in range(len(exonList)-1): + sjID = ':'.join(map(str,[chr, exonList[i][1]+1, exonList[i+1][0]-1])) + if sjID not in annoSJ: + annoSJ[sjID] = 0 + + return annoSJ + +def write_output(sjDB, outfile): + f = open(outfile, 'w') + + # Iterate through all sjIDs in sjDB + for sjID in list(sjDB.keys()): + f.write('\t'.join(map(str,[sjID, sjDB[sjID]]))+'\n') + + f.close() + +def check_cigar(cigarString): + # Only keep read if CIGAR string only contains 'N' and 'M' + return set([i for i in cigarString if i.isalpha()]) == set(['N', 'M']) + +def main(args): + + # Parse command-line arguments + bamPath, gtfPath, fastaPath, outfile = args.bam_path, args.gtf, args.genome_fasta, args.outdir + minOverhang, minOverhangC, minOverhangNC, filterRL = int(args.minimum_overhang_length_annotated), int(args.minimum_overhang_length_unannotated_canonical), int(args.minimum_overhang_length_unannotated_noncanonical), int(args.read_length) + + if os.path.exists(bamPath+'.bai') == False: + pysam.index(bamPath) + # Create list of target chromosomes + chrList = ['chrX', 'chrY'] + for i in range(22): + chrList.append('chr' + str(i+1)) + + # Initialize splice junction database (dictionary object) + sjDB = {} + + # Create dictionary object to store all annotated SJs in the GTF + annoSJ = build_anno_SJdb(gtfPath, chrList) + + # Open the BAM file as a SAM + samfile = pysam.AlignmentFile(bamPath, 'rb') + + # Open FASTA file + genome = pysam.Fastafile(fastaPath) + + # Iterate through the reads in the SAM file and update the + # sjDB object + for read in samfile.fetch(): + # Check if read satisfies the following: + # * is proper pair + # * is uniquely mapped by STAR + # * maps to chromosomes 1-22, X, Y + # * CIGAR string only contains M and N + if (read.is_proper_pair + and read.mapping_quality == 255 + and read.reference_name in chrList + and check_cigar(read.cigarstring)): + # Check if read has the specified read length + # if provided + if filterRL == -1: + # Update sjDB with given read + sjDB = update_SJdb(read, sjDB, annoSJ, genome, + minOverhang, minOverhangC, minOverhangNC) + else: + if read.query_length == filterRL: + # Query read has expected length + sjDB = update_SJdb(read, sjDB, annoSJ, genome, + minOverhang, minOverhangC, minOverhangNC) + + # Print out sjDB to outfile + write_output(sjDB, outfile) + +if __name__ == '__main__': + main() diff --git a/IRIS/IRIS_formatting.py b/IRIS/IRIS_formatting.py index 408378a..feebfe2 100644 --- a/IRIS/IRIS_formatting.py +++ b/IRIS/IRIS_formatting.py @@ -5,7 +5,7 @@ def loadSamplelist(fin_samples, sample_fin_list, sample_header, sample_name_fiel ls=l.strip() sample_fin_list.append(ls) for r in open(ls): - rs=map(lambda x:x.split('/')[-sample_name_field].split('.bam')[0],r.strip().strip(',').split(',')) + rs=map(lambda x:x.split('/')[-sample_name_field].split('.bam')[0].split('.aln')[0],r.strip().strip(',').split(',')) #rs=map(lambda 
x:x.split('/')[-2],r.strip().strip(',').split(',')) if sample_name_field==2: sn_list=r.strip().strip(',').split(',') @@ -16,30 +16,96 @@ def loadSamplelist(fin_samples, sample_fin_list, sample_header, sample_name_fiel sample_size[ls]=len(r.split(',')) return sample_fin_list, sample_header, sample_size -def mergeEvents(events_fin_list): +def parseEventRowSE(line_split): + return line_split[1].strip('"')+'\t'+line_split[2].strip('"')+'\t'+'\t'.join(line_split[3:7]+line_split[8:10]) + +def parseEventRow(line_split): + return line_split[1].strip('"')+'\t'+line_split[2].strip('"')+'\t'+'\t'.join(line_split[3:11]) + +def loadGTF(gtf): + exon_start_dict={} + exon_end_dict={} + for l in open(gtf): + if l.startswith('#'): + continue + ls=l.strip().split('\t') + if ls[2]=='exon': + chrom=ls[0] + if chrom.startswith('chr')==False: + chrom='chr'+chrom + exon_start_dict[ls[6]+':'+chrom+':'+ls[3]]='' + exon_end_dict[ls[6]+':'+chrom+':'+ls[4]]='' + return exon_start_dict, exon_end_dict + +def checkNovelSS(head, ls, splicing_event_type, exon_start_dict, exon_end_dict):# This is a conservative def of novelSS than rMATS4.1 (0.4% events less- complex cases ) + ld=dict(zip(head, ls)) + strand=ld['strand'] + chrom=ld['chr'] + if splicing_event_type=='SE': + check1=strand+':'+chrom+':'+str(int(ld['exonStart_0base'])+1) not in exon_start_dict + check2=strand+':'+chrom+':'+str(int(ld['downstreamES'])+1) not in exon_start_dict + check3=strand+':'+chrom+':'+ld['exonEnd'] not in exon_end_dict + check4=strand+':'+chrom+':'+ld['upstreamEE'] not in exon_end_dict + elif splicing_event_type=='A5SS': + check1=strand+':'+chrom+':'+str(int(ld['shortES'])+1) not in exon_start_dict + check2=strand+':'+chrom+':'+str(int(ld['flankingES'])+1) not in exon_start_dict + check3=strand+':'+chrom+':'+ld['longExonEnd'] not in exon_end_dict + check4=False + #check4=strand+':'+chrom+':'+str(int(ld['shortES'])+1) not in exon_end_dict + elif splicing_event_type=='A3SS': + check1=strand+':'+chrom+':'+str(int(ld['longExonStart_0base'])+1) not in exon_start_dict + check2=strand+':'+chrom+':'+str(int(ld['shortES'])+1) not in exon_start_dict + check3=strand+':'+chrom+':'+ld['flankingEE'] not in exon_end_dict + check4=False + + elif splicing_event_type=='RI': + check1,check2,check3,check4=[False,False,False,False] + else: + exit('choose AS type.') + return check1, check2, check3, check4 + + +def mergeEvents(events_fin_list, splicing_event_type, novelSS, exon_start_dict, exon_end_dict): + parseRow=parseEventRow + if splicing_event_type=='SE': + parseRow=parseEventRowSE total_event_dict={} - for events_fin in events_fin_list: + for i, events_fin in enumerate(events_fin_list): + head=[] for index,event_l in enumerate(open(events_fin)): if index==0: + head=event_l.strip().split('\t') continue event_ls=event_l.strip().split('\t') - events_cord=event_ls[1].strip('"')+'\t'+event_ls[2].strip('"')+'\t'+'\t'.join(event_ls[3:7]+event_ls[8:10]) + if novelSS: + check1, check2, check3, check4= checkNovelSS(head, event_ls, splicing_event_type, exon_start_dict, exon_end_dict) + novel=True if check1 or check2 or check3 or check4 else False + if novel==False: # if no novel, will not parse the row and save + continue + events_cord=parseRow(event_ls) if events_cord in total_event_dict: continue total_event_dict[events_cord]='' return total_event_dict -def writeMergedEvents(events_fin_list, splicing_event_type, cov_cutoff, data_name, fout_path): - total_event_dict=mergeEvents(events_fin_list) - print len(total_event_dict) +def writeMergedEvents(events_fin_list, 
splicing_event_type, cov_cutoff, data_name, fout_path, novelSS, exon_start_dict, exon_end_dict): + total_event_dict=mergeEvents(events_fin_list, splicing_event_type, novelSS, exon_start_dict, exon_end_dict) + novelss_tag='' + if novelSS: + novelss_tag='.novelSS' total_event_list=sorted(total_event_dict.keys()) - fout=open(fout_path+'/prefilter_events.splicing_matrix.'+splicing_event_type+'.cov'+str(cov_cutoff)+'.'+data_name+'.txt','w') + fout=open(fout_path+'/prefilter_events.splicing_matrix.'+splicing_event_type+novelss_tag+'.cov'+str(cov_cutoff)+'.'+data_name+'.txt','w') for e in total_event_list: fout.write(e.strip()+'\n') fout.close() + return total_event_list -def mergeMatrixInBatch(fin_list, events_fin_list, sample_fin_list, cov_cutoff, data_name, splicing_event_type, sample_header, sample_size, total_event_list, file_batch_list, batch, fout_path): +def mergeMatrixInBatch(fin_list, events_fin_list, sample_fin_list, cov_cutoff, data_name, splicing_event_type, sample_header, sample_size, total_event_list, file_batch_list, batch, fout_path, individual_filter, novelSS): + parseRow=parseEventRow + if splicing_event_type=='SE': + parseRow=parseEventRowSE + for b in range(0,len(total_event_list),batch): Intercep_Matrix={} print '[INFO] Merging in progress. Working on batch ',b @@ -51,7 +117,7 @@ def mergeMatrixInBatch(fin_list, events_fin_list, sample_fin_list, cov_cutoff, d if index==0: continue event_ls=event_l.strip().split('\t') - event_cord=event_ls[1].strip('"')+'\t'+event_ls[2].strip('"')+'\t'+'\t'.join(event_ls[3:7]+event_ls[8:10]) + event_cord=parseRow(event_ls) if event_cord in batch_event_dict: eventID[event_ls[0]]=event_cord print '[INFO] Merging file: ', fin, len(eventID) @@ -66,10 +132,16 @@ def mergeMatrixInBatch(fin_list, events_fin_list, sample_fin_list, cov_cutoff, d Cov=[num+Skip[o] for o,num in enumerate(Incl)] psi_values=[] for i,I in enumerate(Incl): - if int(I)+int(Skip[i])==0: - psi_values.append('NaN') + if individual_filter: # individual_filter. 
Use Cov[i] for each individual sample
+					if Cov[i]< cov_cutoff:
+						psi_values.append('NaN')
+					else:
+						psi_values.append(str(round(I/int(rs[5])/(I/int(rs[5])+Skip[i]/int(rs[6])),4)))
 				else:
-					psi_values.append(str(round(I/int(rs[5])/(I/int(rs[5])+Skip[i]/int(rs[6])),4)))
+					if int(I)+int(Skip[i])==0:
+						psi_values.append('NaN')
+					else:
+						psi_values.append(str(round(I/int(rs[5])/(I/int(rs[5])+Skip[i]/int(rs[6])),4)))
 
 				if eventID[rs[0]] not in Intercep_Matrix:
 					Intercep_Matrix[eventID[rs[0]]]={}
@@ -78,9 +150,20 @@
 			if len(psi_values)!=sample_size[sample_fin_list[n]]:
 				exit('[Abort] Sample number does not match observations in JC file.')
 
-		file_batch_list.append(fout_path+'/splicing_matrix/splicing_matrix.'+splicing_event_type+'.cov'+str(cov_cutoff)+'.'+data_name+'.txt.batch_'+str(b)+'.txt')
-		fout=open(fout_path+'/splicing_matrix/splicing_matrix.'+splicing_event_type+'.cov'+str(cov_cutoff)+'.'+data_name+'.txt.batch_'+str(b)+'.txt','w')
-		fout.write('AC\tGeneName\tchr\tstrand\texonStart\texonEnd\tupstreamEE\tdownstreamES\t'+'\t'.join(sample_header)+'\n')
+		novelss_tag=''
+		if novelSS:
+			novelss_tag='.novelSS'
+		file_path_name=fout_path+'/splicing_matrix/splicing_matrix.'+splicing_event_type+novelss_tag+'.cov'+str(cov_cutoff)+'.'+data_name+'.txt.batch_'+str(b)+'.txt'
+		file_batch_list.append(file_path_name)
+		fout=open(file_path_name,'w')
+		header_line='AC\tGeneName\tchr\tstrand\texonStart\texonEnd\tupstreamEE\tdownstreamES\t'+'\t'.join(sample_header)
+		if splicing_event_type=='A5SS':
+			header_line='AC\tGeneName\tchr\tstrand\tlongExonStart\tlongExonEnd\tshortES\tshortEE\tflankingES\tflankingEE\t'+'\t'.join(sample_header)
+		if splicing_event_type=='A3SS':
+			header_line='AC\tGeneName\tchr\tstrand\tlongExonStart\tlongExonEnd\tshortES\tshortEE\tflankingES\tflankingEE\t'+'\t'.join(sample_header)
+		if splicing_event_type=='RI':
+			header_line='AC\tGeneName\tchr\tstrand\triExonStart\triExonEnd\tupstreamES\tupstreamEE\tdownstreamES\tdownstreamEE\t'+'\t'.join(sample_header)
+		fout.write(header_line+'\n')
 		for k in sorted(Intercep_Matrix.keys()):
 			psi_value_all=[]
 			cov_all=[]
@@ -90,15 +173,22 @@
 					cov_all+=Intercep_Matrix[k][sample][1]
 				else:
 					psi_value_all+=['NaN']*sample_size[sample]
+			if individual_filter==False: #if filtering by group and cov < cov_cutoff, skip this event k; otherwise psi_value_all is written to the output
+ mean=numpy.mean(cov_all) + if mean < cov_cutoff: + continue + if set(psi_value_all)==set(['NaN']): #remove full NaN events 2020 + continue + fout.write(k+'\t'+'\t'.join(psi_value_all)+'\n') - mean=numpy.mean(cov_all) - if mean>=cov_cutoff: - fout.write(k+'\t'+'\t'.join(psi_value_all)+'\n') fout.close() return file_batch_list -def mergeMatrixInOne(file_batch_list, cov_cutoff, data_name, splicing_event_type, fout_path): - fout_merge=open(fout_path+'/splicing_matrix/splicing_matrix.'+splicing_event_type+'.cov'+str(cov_cutoff)+'.'+data_name+'.txt','w') +def mergeMatrixInOne(file_batch_list, cov_cutoff, data_name, splicing_event_type, fout_path, novelSS): + novelss_tag='' + if novelSS: + novelss_tag='.novelSS' + fout_merge=open(fout_path+'/splicing_matrix/splicing_matrix.'+splicing_event_type+novelss_tag+'.cov'+str(cov_cutoff)+'.'+data_name+'.txt','w') header=0 for file_batch in file_batch_list: for j,l in enumerate(open(file_batch)): @@ -109,19 +199,22 @@ def mergeMatrixInOne(file_batch_list, cov_cutoff, data_name, splicing_event_type continue fout_merge.write(l) fout_merge.close() - os.system('rm '+fout_path+'/splicing_matrix/splicing_matrix.'+splicing_event_type+'.cov'+str(cov_cutoff)+'.'+data_name+'.txt.batch_*.txt') - return 'splicing_matrix.'+splicing_event_type+'.cov'+str(cov_cutoff)+'.'+data_name+'.txt' + os.system('rm '+fout_path+'/splicing_matrix/splicing_matrix.'+splicing_event_type+novelss_tag+'.cov'+str(cov_cutoff)+'.'+data_name+'.txt.batch_*.txt') + return 'splicing_matrix.'+splicing_event_type+novelss_tag+'.cov'+str(cov_cutoff)+'.'+data_name+'.txt' -def index_PsiMatrix(fn,outdir,delim): +def index_PsiMatrix(fn, outdir, delim, splicing_event_type): out_fp = outdir+'/'+fn.split('/')[-1]+'.idx' line_formatter = "{id}\t{offset}\n" - offset = 0 + offset = 0 + col_index=10 + if splicing_event_type=='SE':#handle SE and other types of AS events + col_index=8 with open(fn, 'r') as fin: with open(out_fp, 'w') as fout: offset += len(fin.readline()) for line in fin: ele = line.strip().split(delim) - eid = ':'.join([ele[0].split('_')[0].split('.')[0]]+ele[1:8]) + eid = ':'.join([ele[0].split('_')[0].split('.')[0]]+ele[1:col_index]) fout.write( line_formatter.format(id=eid, offset=offset) ) offset += len(line) return @@ -131,6 +224,14 @@ def main(args): data_name=args.data_name sample_name_field=args.sample_name_field splicing_event_type=args.splicing_event_type + individual_filter= args.sample_based_filter + novelSS= args.novelSS + exon_start_dict={} + exon_end_dict={} + if novelSS: + gtf=args.gtf + exon_start_dict, exon_end_dict= loadGTF(gtf) + if sample_name_field==1: print '[INFO] Sample name parsed from bam file. (alternatively can be parsed from up level folder)' if sample_name_field==2: @@ -154,21 +255,21 @@ def main(args): sample_fin_list, sample_header, sample_size= loadSamplelist(args.rmats_sample_order,sample_fin_list, sample_header,sample_name_field, sample_size) #MAKING MERGED EVENTS LIST - total_event_list= writeMergedEvents(events_fin_list, splicing_event_type, cov_cutoff, data_name, fout_path) + total_event_list= writeMergedEvents(events_fin_list, splicing_event_type, cov_cutoff, data_name, fout_path, novelSS, exon_start_dict, exon_end_dict) if args.merge_events_only: exit('[INFO] Done merging events only.') - print '[INFO] Done loading file dir', len(total_event_list) + print '[INFO] Done loading file dir. Total events:', len(total_event_list) #START MERGING MATRICES IN BATCH MODE FOLLOWING EVENTS LIST GENERATED. 
batch=20000 - file_batch_list=mergeMatrixInBatch(fin_list, events_fin_list, sample_fin_list, cov_cutoff, data_name, splicing_event_type, sample_header, sample_size, total_event_list, file_batch_list, batch, fout_path) + file_batch_list=mergeMatrixInBatch(fin_list, events_fin_list, sample_fin_list, cov_cutoff, data_name, splicing_event_type, sample_header, sample_size, total_event_list, file_batch_list, batch, fout_path, individual_filter, novelSS) print '[INFO] Done merging matrices by batch.' - merged_file_name=mergeMatrixInOne(file_batch_list, cov_cutoff, data_name, splicing_event_type, fout_path) + merged_file_name=mergeMatrixInOne(file_batch_list, cov_cutoff, data_name, splicing_event_type, fout_path, novelSS) print '[INFO] Done merging matrices: '+merged_file_name #create index in IRIS db directory - index_PsiMatrix(fout_path+'/splicing_matrix/'+merged_file_name,fout_path+'/splicing_matrix','\t') + index_PsiMatrix(fout_path+'/splicing_matrix/'+merged_file_name,fout_path+'/splicing_matrix','\t', splicing_event_type) print '[INFO] Finished. Created matrix: '+fout_path if __name__ == '__main__': diff --git a/IRIS/IRIS_indexing.py b/IRIS/IRIS_indexing.py index a2a64bc..cdfc879 100644 --- a/IRIS/IRIS_indexing.py +++ b/IRIS/IRIS_indexing.py @@ -3,32 +3,32 @@ from scipy import stats import statsmodels.stats.weightstats as smw -def index_PsiMatrix(fn,outdir,delim): - out_fp = outdir+'/'+fn.split('/')[-1]+'.idx' +def index_PsiMatrix(fn, outdir, delim, splicing_event_type, out_fp): line_formatter = "{id}\t{offset}\n" offset = 0 + col_index=10 + if splicing_event_type=='SE':#handle SE and other types of AS events + col_index=8 with open(fn, 'r') as fin: with open(out_fp, 'w') as fout: offset += len(fin.readline()) for line in fin: ele = line.strip().split(delim) - eid = ':'.join([ele[0].split('_')[0].split('.')[0]]+ele[1:8]) + eid = ':'.join([ele[0].split('_')[0].split('.')[0]]+ele[1:col_index]) fout.write( line_formatter.format(id=eid, offset=offset) ) offset += len(line) return def main(args): fin=args.splicing_matrix + splicing_event_type=args.splicing_event_type + cov_cutoff=args.cov_cutoff data_name=args.data_name - db_dir=args.db_dir.rstrip('/') - - #prepare files/folders in IRIS db directory - os.system('mkdir -p '+db_dir+'/'+data_name+' '+db_dir+'/'+data_name+'/splicing_matrix') - new_dir_fin=db_dir+'/'+data_name+'/splicing_matrix/splicing_matrix.SE.cov10.'+data_name+'.txt' - os.system('mv '+fin+' '+new_dir_fin) - #create index in IRIS db directory - index_PsiMatrix(new_dir_fin,db_dir+'/'+data_name+'/splicing_matrix','\t') - print '[INFO] Finished. Created matrix: '+new_dir_fin + outdir=args.outdir.rstrip('/') + out_fp = outdir+'/'+fin.split('/')[-1]+'.idx' + #create index in the current directory + index_PsiMatrix(fin,outdir,'\t',splicing_event_type, out_fp) + print '[INFO] Finished. Created matrix: '+out_fp if __name__ == '__main__': main() diff --git a/IRIS/IRIS_makeqsub_mapping.py b/IRIS/IRIS_makeqsub_mapping.py deleted file mode 100644 index e740129..0000000 --- a/IRIS/IRIS_makeqsub_mapping.py +++ /dev/null @@ -1,69 +0,0 @@ -import sys,glob,os -from . 
import config -#python make.shellsubmiter.py Out_dir Fastq_dir -# two inputs: 1)output folder/sh name/prefix 2)dir of all RNAseq files (if more than 2, recognize and sort by R1 R2 for star input) -def makeSubmit(out_prefix, fin, label_string, starGenomeDir, gtf): - fq_dir=fin - fqs=glob.glob(fq_dir+'/*') - #abs_path=os.path.abspath(sys.argv[2]) - r1=[] - r2=[] - for fq in fqs: - if fq.find('1'+label_string+'f')!=-1: - r1.append(os.path.abspath(fq)) - elif fq.find('2'+label_string+'f')!=-1: - r2.append(os.path.abspath(fq)) - if len(r1)!=len(r2) or len(r1)==0 or len(r2)==0: - print 'file name can not be recognize' - return '','' - out_dir=out_prefix.rstrip('/')+'.aln' - sample_name=out_dir.split('/')[-1].split('.')[0] - fout1=open('submit.STARmap.'+sample_name+'.sh','w') - fout2=open('submit.Cuffquant.'+sample_name+'.sh','w') - fq_path=','.join(sorted(r1)+sorted(r2)) - cmd1='IRIS process_rnaseq --starGenomeDir '+starGenomeDir+' --gtf '+gtf+' --mapping --sort -p '+out_dir+' '+fq_path - fout1.write(cmd1+'\n') - cmd2='IRIS process_rnaseq --starGenomeDir '+starGenomeDir+' --gtf '+gtf+' --quant -p '+out_dir+' '+fq_path - fout2.write(cmd2+'\n') - return 'submit.STARmap.'+sample_name+'.sh','submit.Cuffquant.'+sample_name+'.sh' - -def main(args): - starGenomeDir=args.starGenomeDir - gtf=args.gtf - fastq_folder_dir=args.fastq_folder_dir - fastq_folder_list=glob.glob(fastq_folder_dir+'') - out_dir=args.out_dir.rstrip('/') - task_name=args.data_name - label_string=args.label_string - os.system('mkdir -p '+out_dir) - - fout1=open('cmdlist.STAR.'+task_name,'w') - fout2=open('cmdlist.Cufflinks.'+task_name,'w') - i=0 - for folder in fastq_folder_list: - print folder - fn1,fn2=makeSubmit(out_dir+'/'+folder.split('/')[-1], folder, label_string, starGenomeDir) - if fn1=='': - continue - i+=1 - fout1.write(fn1+'\n') - fout2.write(fn2+'\n') - fout1.close() - fout2.close() - - fout_qsub1=open('qsub.STARmapping.'+task_name+'.sh','w') - cmd='qsub -t 1-'+str(i)+':1 qsub.STARmapping.'+task_name+'.sh' - fout_qsub1.write('#!/bin/bash\n#$ -N STARmapping\n#$ -S /bin/bash\n#$ -R y\n#$ -l '+config.QSUB_ALIGNMENT_CONFIG+'\n#$ -V\n#$ -cwd\n#$ -j y\n#$ -m bea\n') - fout_qsub1.write('export s=`sed -n ${SGE_TASK_ID}p '+'cmdlist.STAR.'+task_name+'`\necho $s\nbash $s') - fout_qsub1.close() - print cmd - - fout_qsub2=open('qsub.Cufflinks.'+task_name+'.sh','w') - cmd='qsub -t 1-'+str(i)+':1 qsub.Cufflinks.'+task_name+'.sh' - fout_qsub2.write('#!/bin/bash\n#$ -N Cufflinks\n#$ -S /bin/bash\n#$ -R y\n#$ -l '+config.QSUB_EXPRESSION_CONFIG+'\n#$ -V\n#$ -cwd\n#$ -j y\n#$ -m bea\n') - fout_qsub2.write('export s=`sed -n ${SGE_TASK_ID}p '+'cmdlist.Cufflinks.'+task_name+'`\necho $s\nnbash $s') - fout_qsub2.close() - print cmd - -if __name__ == '__main__': - main() \ No newline at end of file diff --git a/IRIS/IRIS_makeqsub_rmats.py b/IRIS/IRIS_makeqsub_rmats.py deleted file mode 100644 index 76f91c8..0000000 --- a/IRIS/IRIS_makeqsub_rmats.py +++ /dev/null @@ -1,32 +0,0 @@ -import sys, csv, glob, os -from . 
import config
-def main(args):
-
-	bam_dir= args.bam_dir
-	file_list=glob.glob(bam_dir.rstrip('/')+'/*/*.bam')
-	length=str(args.read_length)
-	gtf=args.gtf
-	task_name=args.data_name
-	rMATS_path=args.rMATS_path#/u/home/s/shiehshi/rMATS-2017-3-15/rmats.py
-	list_name='cmdlist.rMATS_prep_'+task_name
-
-	i=0
-	fout=open(list_name,'w')
-	for fin in file_list:
-		fin_name='/'.join(fin.split('/')[:-1]).rstrip('/')
-		i+=1
-		fout_local=open(fin_name+'/bam_list.txt','w')
-		fout_local.write(fin)
-		fout_local.close()
-		fout.write('python '+args.rMATS_path+' --b1 '+fin_name+'/bam_list.txt --od '+fin_name+' --tmp '+fin_name+'.tmp --anchorLength 1 --readLength '+length+' --gtf '+gtf+' -t paired --task prep --nthread 8 --statoff\n')
-	fout.close()
-
-	fout_qsub=open('qsub.rMATSturboPrep.'+task_name+'.sh','w')
-	cmd='qsub -t 1-'+str(i)+':1 qsub.rMATSturboPrep.'+task_name+'.sh'
-	fout_qsub.write('#!/bin/bash\n#$ -N rmats_prep\n#$ -S /bin/bash\n#$ -R y\n#$ -l '+config.QSUB_RMATS_PREP_CONFIG+'\n#$ -V\n#$ -cwd\n#$ -j y\n#$ -m bea\n')
-	fout_qsub.write('export s=`sed -n ${SGE_TASK_ID}p '+list_name+'`\necho $s\n$s')
-	fout_qsub.close()
-	print cmd
-
-if __name__ == '__main__':
-	main()
diff --git a/IRIS/IRIS_makesubsh_extractsj.py b/IRIS/IRIS_makesubsh_extractsj.py
new file mode 100644
index 0000000..84851de
--- /dev/null
+++ b/IRIS/IRIS_makesubsh_extractsj.py
@@ -0,0 +1,55 @@
+import sys, csv, glob, os, argparse
+
+def parseMappingLog(log_fin):
+	read_length=''
+	for l in open(log_fin):
+		if l.find('Average input read length')!=-1:
+			ls=l.strip().split('|')
+			read_length=ls[1].strip().strip(' ')
+	if read_length=='':
+		exit('cannot find log file for read length')
+	return str(int(round(int(read_length)/2)))
+
+def main(args):
+	#extractSJ_path=args.extractSJ_path
+	BAM_prefix=args.BAM_prefix
+	bam_folder_list=args.bam_folder_list
+	rl=args.rmats_used_read_length
+	parserl=False
+	if rl=='':
+		parserl=True
+		print('checking read length')
+	else:
+		print('using user-specified read length')
+	gtf=args.gtf
+	task_name= args.task_name
+	genome_fasta= args.genome_fasta
+	task_dir = args.task_dir
+	if not task_dir:
+		task_dir = os.getcwd()
+
+	list_name='cmdlist.extract_sjc.'+task_name
+	list_name = os.path.join(task_dir, list_name)
+
+	n=0
+	fout=open(list_name,'w')
+	for bam_folder in open(bam_folder_list):
+		n+=1
+		bam_folder=bam_folder.strip()
+		if parserl:
+			rl=parseMappingLog(bam_folder+'/Log.final.out')
+		fout.write('IRIS extract_sjc -i '+bam_folder+'/'+BAM_prefix+'.bam -g '+gtf+' -r '+rl+' -f '+genome_fasta+' -o '+bam_folder+'/SJcount.txt \n')
+	fout.close()
+
+	sh_file_name = 'subsh.extract_sjc.{}.sh'.format(task_name)
+	sh_file_name = os.path.join(task_dir, sh_file_name)
+	fout_qsub=open(sh_file_name,'w')
+
+	cmd='sbatch --array=1-{} {}'.format(str(n), sh_file_name)
+
+	fout_qsub.write('#!/bin/bash\n#SBATCH --job-name=extract_sjc\n#SBATCH --mem=5G\n#SBATCH -t 15:00:00\n')
+	fout_qsub.write('export s=`sed -n ${SLURM_ARRAY_TASK_ID}p '+list_name+'`\necho $s\n$s')
+	fout_qsub.close()
+	print(cmd)
+if __name__ == '__main__':
+	main()
diff --git a/IRIS/IRIS_makesubsh_hla.py b/IRIS/IRIS_makesubsh_hla.py
new file mode 100644
index 0000000..d8b0aee
--- /dev/null
+++ b/IRIS/IRIS_makesubsh_hla.py
@@ -0,0 +1,58 @@
+import sys,glob,os
+from .
import config + + +def write_task_script(out_prefix, fin, label_string, task_dir): + fq_dir=fin + fqs=glob.glob(fq_dir+'/*') + r1=[] + r2=[] + for fq in fqs: + if fq.find('1'+label_string+'f')!=-1: + r1.append(os.path.abspath(fq)) + elif fq.find('2'+label_string+'f')!=-1: + r2.append(os.path.abspath(fq)) + if len(r1)!=len(r2) or len(r1)==0 or len(r2)==0: + print '[Error] File name can not be recognized' + return + + out_dir=out_prefix.rstrip('/') + os.system('mkdir -p '+out_dir) + sample_name=out_dir.split('/')[-1].split('.')[0] + task_script_base = 'seq2hla.{}.sh'.format(sample_name) + task_script = os.path.join(task_dir, task_script_base) + fout=open(task_script,'w') + suffix=r1[0].split(label_string)[1] + path='/'.join(r1[0].split('/')[:-1]) + fq_path1=' '.join(sorted(r1)) + cmd_fq1='cat '+fq_path1+' > '+path+'/fq1.'+suffix + fout.write('#!/bin/bash\n'+cmd_fq1+'\n') + + fq_path2=' '.join(sorted(r2)) + cmd_fq2='cat '+fq_path2+' > '+path+'/fq2.'+suffix + fout.write(cmd_fq2+'\n') + + cmd1='seq2HLA -1 '+path+'/fq1.'+suffix+' -2 '+path+'/fq2.'+suffix+' -r '+out_dir+'/'+sample_name+' > '+out_dir+'/seq2hla.log 2>&1' + fout.write(cmd1+'\n') + fout.write('rm '+path+'/fq1.'+suffix+'\nrm '+path+'/fq2.'+suffix+'\n') + fout.close() + + +def main(args): + fastq_folder_dir=args.fastq_folder_dir.rstrip('/') + fastq_folder_list=glob.glob(fastq_folder_dir+'/*') + out_dir=args.outdir.rstrip('/') + task_name=args.data_name + label_string=args.label_string + os.system('mkdir -p '+out_dir) + task_dir=args.task_dir + if not os.path.exists(task_dir): + os.makedirs(task_dir) + + for folder in fastq_folder_list: + print folder + write_task_script(out_dir+'/'+folder.split('/')[-1], folder, label_string, task_dir) + + +if __name__ == '__main__': + main() diff --git a/IRIS/IRIS_makesubsh_mapping.py b/IRIS/IRIS_makesubsh_mapping.py new file mode 100644 index 0000000..1345fac --- /dev/null +++ b/IRIS/IRIS_makesubsh_mapping.py @@ -0,0 +1,55 @@ +import sys,glob,os +from . 
import config
+
+def write_task_script(out_prefix, fin, label_string, starGenomeDir, gtf, task_dir):
+    fq_dir=fin
+    fqs=glob.glob(fq_dir+'/*')
+    r1=[]
+    r2=[]
+    for fq in fqs:
+        if fq.find('1'+label_string+'f')!=-1:
+            r1.append(os.path.abspath(fq))
+        elif fq.find('2'+label_string+'f')!=-1:
+            r2.append(os.path.abspath(fq))
+    if len(r1)!=len(r2) or len(r1)==0 or len(r2)==0:
+        print '[Error] File name cannot be recognized'
+        return '',''
+
+    out_dir=out_prefix.rstrip('/')+'.aln'
+    sample_name=out_dir.split('/')[-1].split('.')[0]
+    task_script_base1 = 'STARmap.{}.sh'.format(sample_name)
+    task_script1 = os.path.join(task_dir, task_script_base1)
+    task_script_base2 = 'Cuffquant.{}.sh'.format(sample_name)
+    task_script2 = os.path.join(task_dir, task_script_base2)
+    fout1=open(task_script1,'w')
+    fout2=open(task_script2,'w')
+    fq_path=','.join(sorted(r1)+sorted(r2))
+
+    cmd1='IRIS process_rnaseq --starGenomeDir '+starGenomeDir+' --gtf '+gtf+' --mapping --sort -p '+out_dir+' '+fq_path
+    fout1.write('#!/bin/bash\n'+cmd1+'\n')
+    cmd2='IRIS process_rnaseq --starGenomeDir '+starGenomeDir+' --gtf '+gtf+' --quant -p '+out_dir+' '+fq_path
+    fout2.write('#!/bin/bash\n'+cmd2+'\n')
+    return task_script1,task_script2
+
+def main(args):
+    starGenomeDir=args.starGenomeDir
+    gtf=args.gtf
+    fastq_folder_dir=args.fastq_folder_dir.rstrip('/')
+    fastq_folder_list=glob.glob(fastq_folder_dir+'/*')
+    out_dir=args.outdir.rstrip('/')
+    task_name=args.data_name
+    label_string=args.label_string
+    os.system('mkdir -p '+out_dir)
+    task_dir=args.task_dir
+    if not os.path.exists(task_dir):
+        os.makedirs(task_dir)
+
+    for folder in fastq_folder_list:
+        print folder
+        fn1,fn2=write_task_script(out_dir+'/'+folder.split('/')[-1], folder, label_string, starGenomeDir, gtf, task_dir)
+        if fn1=='':
+            continue
+
+
+if __name__ == '__main__':
+    main()
diff --git a/IRIS/IRIS_makesubsh_rmats.py b/IRIS/IRIS_makesubsh_rmats.py
new file mode 100644
index 0000000..f4c9e4a
--- /dev/null
+++ b/IRIS/IRIS_makesubsh_rmats.py
@@ -0,0 +1,66 @@
+import sys, csv, glob, os
+from . import config
+
+def writeShell(rMATS_path, fin_name, folder_name, bam_dir, read_length_argument, gtf, novelSS, task_name, task_dir):
+    fout_local=open(folder_name+'/bam_list.txt','w')
+    fout_local.write(fin_name)
+    fout_local.close()
+
+    sample_name=folder_name.split('/')[-1].split('.')[0]
+    task_script_base = 'rMATS_prep.{}.sh'.format(sample_name)
+    task_script = os.path.join(task_dir, task_script_base)
+    fout=open(task_script,'w')
+    fout.write('#!/bin/bash\n')
+    novelSS_str=''
+    if novelSS:
+        novelSS_str='--novelSS '
+    # TODO the '|| true' at the end of this command ignores a
+    # failure return code from the python command.
+    # rMATS produces the desired output file despite the error return.
+    # A future version of rMATS may fix this behavior.
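
A note on the '|| true' idiom flagged in the TODO above: the shell OR-list replaces a non-zero exit status with success, so an SGE/SLURM array task is not marked failed even though rMATS already wrote its prep output. A minimal, self-contained illustration (using 'false' as a stand-in for the failing rMATS invocation; this is not the actual pipeline command):

    import subprocess

    # 'false' stands in for a python/rMATS call that exits non-zero (POSIX shell).
    rc_plain = subprocess.call('false', shell=True)           # rc_plain == 1
    rc_masked = subprocess.call('false || true', shell=True)  # rc_masked == 0
    print(rc_plain, rc_masked)
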
+ fout.write('python {} --b1 {}/bam_list.txt --od {} --tmp {}/{}.RL{}/{}.tmp --anchorLength 1 --readLength {} --gtf {} -t paired --task prep --nthread 8 --statoff {}|| true\n'.format(rMATS_path, folder_name, folder_name, bam_dir, task_name, read_length_argument, sample_name, read_length_argument, gtf, novelSS_str)) + fout.close() + +def organizeReadLength(rMATS_path, file_list_mapping, gtf, novelSS, bam_prefix, task_name, task_dir): + rl_dict={} + folder_names={} + for fin_name in file_list_mapping: + for l in open(fin_name): + if l.find('Average input read length |')!=-1: + map_rl=int(round(float(l.split('Average input read length |')[-1].strip())/2,0)) + rl_dict['/'.join(fin_name.split('/')[:-1])]=map_rl + folder_names[map_rl]='' + break + bam_dir='/'.join(file_list_mapping[0].split('/')[:-2]) + for folder_name in folder_names: + os.system('mkdir -p '+bam_dir+'/'+task_name+'.RL'+str(folder_name)) + for folder_name in rl_dict: + writeShell(rMATS_path, folder_name+'/'+bam_prefix+'.bam', folder_name, bam_dir, str(rl_dict[folder_name]), gtf, novelSS, task_name, task_dir) + +def main(args): + gtf=args.gtf + task_name=args.data_name + rMATS_path=args.rMATS_path + bam_dir= args.bam_dir.rstrip('/') + bam_prefix=args.bam_prefix + novelSS=args.novelSS + task_dir=args.task_dir + if not os.path.exists(task_dir): + os.makedirs(task_dir) + if args.read_length: + read_length= int(args.read_length) + + print 'preparing rMATS-turbo prep directories' + if args.read_length==False: + mapping_log_file_list=glob.glob(bam_dir+'/*/Log.final.out') + organizeReadLength(rMATS_path, mapping_log_file_list, gtf, novelSS, bam_prefix, task_name, task_dir) #relocated based on the read length + else: + mapping_bam_list=glob.glob(bam_dir+'/*/'+bam_prefix+'.bam') + os.system('mkdir -p '+bam_dir+'/'+task_name+'.RL'+str(read_length)) + for fin_name in mapping_bam_list: + folder_name= '/'.join(fin_name.split('/')[:-1]) + writeShell(rMATS_path, fin_name, folder_name, bam_dir, str(read_length), gtf, novelSS, task_name, task_dir) + + +if __name__ == '__main__': + main() diff --git a/IRIS/IRIS_makesubsh_rmatspost.py b/IRIS/IRIS_makesubsh_rmatspost.py new file mode 100644 index 0000000..aec5195 --- /dev/null +++ b/IRIS/IRIS_makesubsh_rmatspost.py @@ -0,0 +1,48 @@ +import sys,glob,os,argparse +from . import config + +def write_task_script(rMATS_path, bam_dir, task_name, gtf, novelSS, task_dir): + read_length=int(bam_dir.split('/')[-1].split('.')[-1][2:]) + dir_name=task_name+'_RL'+str(read_length) + graphlist=glob.glob(bam_dir+'/*.tmp/*') + os.system('mkdir -p '+bam_dir+'/'+dir_name+'.graph') + os.system('cp '+bam_dir+'/*.tmp/* '+bam_dir+'/'+dir_name+'.graph/.') + print '[INFO] Done copy' + cmd='head -n1 -q '+bam_dir+'/'+dir_name+'.graph/*.rmats |paste -d, -s >'+bam_dir+'/'+dir_name+'_rmatspost_list.txt' + print cmd + os.system(cmd) + + task_script_base = 'rMATS_post.{}.sh'.format(dir_name) + task_script = os.path.join(task_dir, task_script_base) + fout=open(task_script,'w') + + novelSS_str='' + if novelSS: + novelSS_str='--novelSS ' + # TODO the '|| true' at the end of this command ignores a + # failure return code from the python command. + # rMATS produces the desired output files despite the error return. + # A future version of rMATS may fix this behavior. 
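
Both the prep scripts above and the post step below are keyed to a per-read-length directory name (<task_name>.RL<length>), with the length inferred from STAR's Log.final.out. A minimal sketch of that inference, assuming paired-end data (STAR reports the combined mate1+mate2 length, hence the halving seen in parseMappingLog and organizeReadLength):

    def star_read_length(log_final_out):
        # Parse 'Average input read length |  <N>' from STAR's Log.final.out
        # and halve it to get the per-mate read length passed to rMATS.
        for line in open(log_final_out):
            if 'Average input read length' in line:
                total = float(line.split('|')[-1].strip())
                return int(round(total / 2, 0))
        raise ValueError('Average input read length not found in ' + log_final_out)
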
+ fout.write('#!/bin/bash\npython '+rMATS_path+' --b1 '+bam_dir+'/'+dir_name+'_rmatspost_list.txt --od '+bam_dir+'/'+dir_name+'.matrix --tmp '+bam_dir+'/'+dir_name+'.graph/ --anchorLength 1 --readLength '+str(read_length)+' --gtf '+gtf+' -t paired --nthread 8 --task post --statoff '+novelSS_str+'|| true\n') + fout.close() + return + + +def main(args): + rMATS_path=args.rMATS_path + prep_dir=args.bam_dir.rstrip('/') + gtf=args.gtf + task_name=args.data_name + novelSS=args.novelSS + task_dir=args.task_dir + if not os.path.exists(task_dir): + os.makedirs(task_dir) + + rl_bam_folders=glob.glob(prep_dir+'/'+task_name+'.RL*') + for bam_folder in rl_bam_folders: + print bam_folder + write_task_script(rMATS_path, bam_folder,task_name, gtf, novelSS, task_dir) + + +if __name__ == '__main__': + main() diff --git a/IRIS/IRIS_ms_makedb.py b/IRIS/IRIS_ms_makedb.py index 65eb75b..effe056 100644 --- a/IRIS/IRIS_ms_makedb.py +++ b/IRIS/IRIS_ms_makedb.py @@ -67,6 +67,9 @@ def main(args): outdir=args.outdir.rstrip('/') exp_fin_list=args.exp_fin_list uniprot_fasta=args.uniprot_fasta + java_path = args.java_path + MSGF_path = args.MSGF_path + print '##Creating ProteoTransicritomic Ref' makeProteoTranscriptomeRef(exp_fin_list, uniprot_fasta, outdir) @@ -91,7 +94,7 @@ def main(args): print cmd1 os.system(cmd1) - cmd2='/u/local/apps/java/jdk1.8.0_111/bin/java -Xmx8g -cp ~/MSGFPlus/MSGFPlus.jar edu.ucsd.msjava.msdbsearch.BuildSA -d '+outdir+'/tmp/proteome_ref_combined.fa' + cmd2=java_path+' -Xmx8g -cp '+MSGF_path+' edu.ucsd.msjava.msdbsearch.BuildSA -d '+outdir+'/tmp/proteome_ref_combined.fa' print '##Indexing the proteotranscriptomic db' print cmd2 os.system(cmd2) diff --git a/IRIS/IRIS_parse_hla.py b/IRIS/IRIS_parse_hla.py index 691026a..f2dc2ea 100644 --- a/IRIS/IRIS_parse_hla.py +++ b/IRIS/IRIS_parse_hla.py @@ -5,15 +5,14 @@ def main(args): outdir=args.outdir.rstrip('/') - fin_list=glob.glob(outdir+'/*/hla_types/hla_types-ClassI.HLAgenotype4digits') + fin_list=glob.glob(outdir+'/*/*-ClassI.HLAgenotype4digits') HLA2patients={} HLA_list=set() for fin in fin_list: - name=fin.split('/')[-3] - print name + name=fin.split('/')[-2] n=0 if name in HLA2patients: - print 'dup' + print 'Duplicated name '+name+'. Exit!' exit() HLA2patients[name]=[] for l in open(fin): @@ -21,12 +20,15 @@ def main(args): n+=1 continue ls=l.strip().split('\t') - if float(ls[2])<=0.05: - HLA2patients[name].append('HLA-'+ls[1].rstrip("'")) - HLA_list.add('HLA-'+ls[1].rstrip("'")) - if float(ls[4])<=0.05: - HLA2patients[name].append('HLA-'+ls[3].rstrip("'")) - HLA_list.add('HLA-'+ls[3].rstrip("'")) + if ls[2]!='NA': + if float(ls[2])<=0.05: + HLA2patients[name].append('HLA-'+ls[1].rstrip("'")) + HLA_list.add('HLA-'+ls[1].rstrip("'")) + if ls[4]!='NA': + if float(ls[4])<=0.05: + HLA2patients[name].append('HLA-'+ls[3].rstrip("'")) + HLA_list.add('HLA-'+ls[3].rstrip("'")) + fout1=open(outdir+'/hla_patient.tsv','w') for k in HLA2patients: fout1.write('\t'.join([k]+HLA2patients[k])+'\n') @@ -38,13 +40,12 @@ def main(args): fout2.write(h+'\n') fout2.close() - fin_list2=glob.glob(outdir+'/*/hla_types/hla_types-ClassI.expression') + fin_list2=glob.glob(outdir+'/*/*-ClassI.expression') HLAexp2patients={} for fin2 in fin_list2: - name=fin2.split('/')[-3] - print name + name=fin2.split('/')[-2] if name in HLAexp2patients: - print 'dup' + print 'Duplicated name '+name+'. Exit!' 
exit() HLAexp2patients[name]=[] for l in open(fin2): diff --git a/IRIS/IRIS_pep2epitope.py b/IRIS/IRIS_pep2epitope.py index d756dc3..442a2fe 100644 --- a/IRIS/IRIS_pep2epitope.py +++ b/IRIS/IRIS_pep2epitope.py @@ -3,22 +3,11 @@ #import multiprocessing as mp # from Bio.Blast import NCBIXML from subprocess import Popen, PIPE -#now = datetime.datetime.now() + ID = str(uuid.uuid4()).split('-')[0] def loadSampleMHC(f_s_in): - # if f_s_in.find(',')!=-1: hla_allele_list=f_s_in.split(',') - # else: - # n=0 - # hla_allele_list=[] - # for l in open(f_s_in): - # if n==0: - # n+=1 - # continue - # ls=l.strip().split('\t') - # hla_allele_list.append('HLA-'+ls[1].strip("'")) - # hla_allele_list.append('HLA-'+ls[3].strip("'")) return hla_allele_list def parsePred(stdout): @@ -40,7 +29,7 @@ def localIEDBCommand(iedb_path, hla_allele, epitope_len, outdir, JC_pep_fasta, e response = Popen(['python',iedb_path+'/predict_binding.py',IEDB_model,hla_allele,epitope_len,outdir+'/tmp/prot.compared/'+form+'/'+form+'.'+JC_pep_fasta], stdout=fout,shell = False) #response = Popen(['/u/home/p/panyang/local/bin/python',iedb_path+'/predict_binding.py',IEDB_model,hla_allele,epitope_len,outdir+'/tmp/prot.compared/'+form+'/'+form+'.'+JC_pep_fasta], stdout=fout,shell = False) fout.close() - + return response def pep2antigen(JC_pep_fasta, enst_id, hla_allele_list,epitope_len_list, iedb_path, outdir): predicting=[] @@ -58,14 +47,13 @@ def pep2antigen(JC_pep_fasta, enst_id, hla_allele_list,epitope_len_list, iedb_pa #return parsed_dict def pep2antigen_single(JC_pep_fasta, enst_id, form, hla_allele_list,epitope_len_list, iedb_path, outdir): predicting=[] - n=0 - #file_list=[] for hla_allele in hla_allele_list: for epitope_len in epitope_len_list: - #file_list.append(outdir+'/tmp/'+JC_pep_fasta+' '+hla_allele.replace('*','_').replace(':','_')+'.'+epitope_len+'_iedb.txt') - localIEDBCommand(iedb_path, hla_allele, epitope_len, outdir, JC_pep_fasta, enst_id, form) - #parsed_dict=[parsePred(open(tmp_file)) for tmp_file in file_list] - #return parsed_dict + response = localIEDBCommand(iedb_path, hla_allele, epitope_len, outdir, JC_pep_fasta, enst_id, form) + predicting.append(response) + + for response in predicting: + response.wait() def main(args): @@ -79,9 +67,10 @@ def main(args): hla_allele_list=loadSampleMHC(args.hla_allele_list) if hla_allele_list==[]: sys.exit("# No HLA Alleles. Exit.") - epitope_len_list=args.epitope_len_list.split(',') + epitope_len_list=map(int,args.epitope_len_list.split(',')) if min(epitope_len_list)<8: sys.exit("# The request epitope length is too small. Exit.") + epitope_len_list=map(str,epitope_len_list) fs=fin.split('/')[-1].split('.') JC_pep_fasta='.'.join(fs[1:]) diff --git a/IRIS/IRIS_prediction.py b/IRIS/IRIS_prediction.py index 5daecb6..16571ef 100644 --- a/IRIS/IRIS_prediction.py +++ b/IRIS/IRIS_prediction.py @@ -1,5 +1,6 @@ import sys, argparse, os ,datetime,logging, uuid, glob from . 
import config +import numpy as np ID = str(uuid.uuid4()).split('-')[0] @@ -18,16 +19,40 @@ def loadFeatures(fin): return extracelllularDict -def selectJC(AS_coord,deltaPSI_c2n,cut_off, select_all): - if select_all: - return [(AS_coord[2], AS_coord[3]),(AS_coord[2],AS_coord[0]),(AS_coord[1],AS_coord[3])] - if float(deltaPSI_c2n)'): + continue + junction_peptide.append(l.strip()) + if os.path.exists(peptide_file_name_full_inc): + for l in open(peptide_file_name_full_inc): + if l.startswith('>'): + continue + junction_peptide.append(l.strip()) + junction_peptide=';'.join(junction_peptide) + junction_peptide='-' if junction_peptide=='' else junction_peptide + return junction_peptide + +def loadGeneExp(gene_exp_matrix_fin): + Exp={} + i=0 + for l in open(gene_exp_matrix_fin): + if i==0: + i+=1 + continue + ls=l.strip().split('\t') + name=ls[0].split('_')[0].split('.')[0] + exp_list=map(float, ls[1:]) + Q1=round(np.nanpercentile(exp_list,25),2) + Q3=round(np.nanpercentile(exp_list,75),2) + mean=round(np.nanmean(exp_list),2) + Exp[name]=map(str,[mean, Q1, Q3]) + return Exp,['meanGeneExp','Q1GeneExp','Q3GeneExp'] + +def extracellularAnnotation(screening_result_fin, splicing_event_type, extracelllularDict, deltaPSI_cut_off, select_all, gene_exp_path, pep_dir_prefix): if select_all: deltaPSI_column=0 if select_all==False: @@ -71,7 +134,7 @@ def extracellularAnnotation(screening_result_fin, outdir, extracelllularDict, de continue ls=l.strip().split('\t') des=ls[0].split(':') - JC_pep_fasta=AS2FT(des[2],des[4:8],des[3],ls[deltaPSI_column],deltaPSI_cut_off,select_all,extracelllularDict,ls[0], fout_dict) + JC_coord_list=AS2FT(des[2],des[4:],des[3],ls[deltaPSI_column],deltaPSI_cut_off,select_all,extracelllularDict,ls[0], fout_dict, splicing_event_type) for k in fout_dict.keys(): fout.write(k+'\n') fout.close() @@ -84,8 +147,15 @@ def extracellularAnnotation(screening_result_fin, outdir, extracelllularDict, de if ls[3]+':'+ls[5] not in extracellular_AS[ls[0]]: extracellular_AS[ls[0]][ls[3]+':'+ls[5]]=[] extracellular_AS[ls[0]][ls[3]+':'+ls[5]].append(ls[1]+':'+ls[2]+':'+ls[4]) + + if gene_exp_path: + gene_exp_dict, gene_exp_header=loadGeneExp(gene_exp_path) + fout2=open(screening_result_fin+'.ExtraCellularAS.txt','w') - fout2.write('{}\t{}\t{}\t{}\n'.format('as_event','protein_domain_loc','protein_domain_loc_by_as_exon','\t'.join(screening_result_dict['header']))) + if gene_exp_path: + fout2.write('{}\t{}\t{}\t{}\t{}\t{}\n'.format('as_event','protein_domain_loc','protein_domain_loc_by_as_exon','\t'.join(screening_result_dict['header']),'\t'.join(gene_exp_header), 'junction_peptide')) + else: + fout2.write('{}\t{}\t{}\t{}\t{}\n'.format('as_event','protein_domain_loc','protein_domain_loc_by_as_exon','\t'.join(screening_result_dict['header']),'junction_peptide')) for k in sorted(extracellular_AS): line='' line=';'.join(extracellular_AS[k].keys())+'\t' @@ -97,15 +167,20 @@ def extracellularAnnotation(screening_result_fin, outdir, extracelllularDict, de line+=anno_line+';' screen_print='NA' if k in screening_result_dict: + junction_peptide=retriveJunctionPeptide(k, screening_result_fin, splicing_event_type, pep_dir_prefix) screen_print='\t'.join(screening_result_dict[k]) - fout2.write(k+'\t'+line.rstrip(';')+'\t'+screen_print+'\n') + optional_annotations='' + if gene_exp_path: + gene_exp_list=gene_exp_dict[k.split(':')[0]] + optional_annotations='\t'.join(gene_exp_list) + fout2.write('\t'.join([k,line.rstrip(';'), screen_print, optional_annotations, junction_peptide])+'\n') fout2.close() os.system('rm 
'+screening_result_fin+'.CellSurfAnno.tmp') -def epitopePredictionPrep(outdir, hla_list_fin, analysis_name, iedb_path): - fin_list=glob.glob(outdir+'/tmp/prot.compared/skp/*.fa') - fin_list=fin_list+glob.glob(outdir+'/tmp/prot.compared/inc/*.fa') +def epitopePredictionPrep(outdir, hla_list_fin, analysis_name, iedb_path, epitope_len_list, task_dir, pep_dir_prefix): + fin_list=glob.glob(outdir+'/tmp/'+pep_dir_prefix+'.compared/skp/*.fa') + fin_list=fin_list+glob.glob(outdir+'/tmp/'+pep_dir_prefix+'.compared/inc/*.fa') hla_list=[] num=90 for l in open(hla_list_fin): @@ -114,67 +189,76 @@ def epitopePredictionPrep(outdir, hla_list_fin, analysis_name, iedb_path): print "[INFO] Total HLA types loaded:", len(hla_list), ". Total peptide splice junctions loaded:",len(fin_list) analysis_name=outdir.split('/')[-1] - list_name='cmdlist.pep2epitope_'+analysis_name - fout_list=open(list_name,'w') - m=0 + script_count = 0 + def write_task_script(script_contents): + task_script_base = 'pep2epitope_{}.{}.sh'.format(analysis_name, script_count) + task_script = os.path.join(task_dir, task_script_base) + + with open(task_script, 'w') as f_h: + f_h.write('#!/bin/bash\n') + f_h.write(script_contents) + for i in xrange(0,len(hla_list),3): hla_types=','.join(hla_list[i:i+3]) n=0 - line='' + script_contents = '' for fin in fin_list: if os.stat(fin).st_size != 0: n+=1 - #line+='echo run '+fin+'\npython '+IRIS_PACKAGE_PATH+'/IRIS/IRIS.pep2epitope.py '+fin+' --hla-allele-list '+hla_types+' -o '+outdir+'\n' - line+='echo run '+fin+'\nIRIS pep2epitope '+fin+' --hla-allele-list '+hla_types+' -o '+outdir+' --iedb-local '+iedb_path+'\n' - line+='sleep 70\n' + script_contents += 'echo run '+fin+'\nIRIS pep2epitope '+fin+' --hla-allele-list '+hla_types+' -o '+outdir+' --iedb-local '+iedb_path+' -e '+epitope_len_list+'\n' if n%num==0: - submission_file_name=outdir+'/tmp/submit.IRIS_pep2epitope.py.'+str(n)+'.'+hla_types+'.sh' - fout_list.write(submission_file_name+'\n') - m+=1 - fout=open(submission_file_name,'w') - fout.write(line) - fout.close() - line='' + write_task_script(script_contents) + script_count += 1 + script_contents = '' if n%num!=0: - # print line - submission_file_name=outdir+'/tmp/submit.IRIS_pep2epitope.py.'+str(n)+'.'+hla_types+'.sh' - fout_list.write(submission_file_name+'\n') - m+=1 - fout=open(submission_file_name,'w') - fout=open(outdir+'/tmp/submit.IRIS_pep2epitope.py.'+str(n)+'.'+hla_types+'.sh','w') - fout.write(line) - fout.close() - line='' - - fout_list.close() - fout_qsub=open('qsub.IRIS_pep2epitope.py.'+analysis_name+'.sh','w') - - cmd='qsub -t 1-'+str(m)+':1 qsub.IRIS_pep2epitope.py.'+analysis_name+'.sh' - - fout_qsub.write('#!/bin/bash\n#$ -N IRIS_pep2epitope\n#$ -S /bin/bash\n#$ -R y\n#$ -l '+config.QSUB_PREDICTION_CONFIG+'\n#$ -V\n#$ -cwd\n#$ -j y\n#$ -m bea\n') - fout_qsub.write('export s=`sed -n ${SGE_TASK_ID}p '+list_name+'`\necho $s\nbash $s') - fout_qsub.close() - print cmd + write_task_script(script_contents) + script_count += 1 + script_contents = '' + def main(args): #Define parameters - IRIS_screening_result=args.IRIS_screening_result_path + IRIS_screening_result=args.IRIS_screening_result_path.rstrip('/') #Modified 2021 deltaPSI_cut_off=float(args.deltaPSI_cut_off) + splicing_event_type=args.splicing_event_type select_all= True if args.extracellular_anno_by_junction==False else False analysis_name=[l.strip() for l in open(args.parameter_fin)][0] - hla_list_fin=args.mhc_list - iedb_path=args.iedb_local - 
extracelllularDict=loadFeatures(config.EXTRACELLULAR_FEATURES_UNIPROT2GTF_MAP_PATH) #IRIS_package_dir.rstrip('/')+'/IRIS/data/features.uniprot2gtf.ExtraCell.txt' - print "[INFO] Total extracellular annotated loaded:",len(extracelllularDict) - - extracellularAnnotation(IRIS_screening_result+'/'+analysis_name+'.primary.txt', IRIS_screening_result, extracelllularDict, deltaPSI_cut_off, select_all) - epitopePredictionPrep(IRIS_screening_result+'/primary',hla_list_fin, analysis_name, iedb_path) + analysis_name=analysis_name+'.'+splicing_event_type #group_name.type + prioritized_only=args.tier3_only + extracellular_only=args.extracellular_only + gene_exp_matrix=args.gene_exp_matrix + task_dir=args.task_dir + all_orf=args.all_orf + pep_dir_prefix='prot' + if all_orf: + pep_dir_prefix='prot_allorf' + if not os.path.exists(task_dir): + os.makedirs(task_dir) + if extracellular_only==False: + hla_list_fin=args.mhc_list + iedb_path=args.iedb_local + epitope_len_list=args.epitope_len_list.rstrip(',') - extracellularAnnotation(IRIS_screening_result+'/'+analysis_name+'.prioritized.txt', IRIS_screening_result, extracelllularDict, deltaPSI_cut_off, select_all) - epitopePredictionPrep(IRIS_screening_result+'/prioritized',hla_list_fin, analysis_name, iedb_path) + extracelllularDict=loadFeatures(config.EXTRACELLULAR_FEATURES_UNIPROT2GTF_MAP_PATH) + print "[INFO] Total extracellular annotation loaded:",len(extracelllularDict) + + if config.file_len(IRIS_screening_result+'/'+analysis_name+'.tier1.txt')==1 and prioritized_only==False: + prioritized_only=True + print "[INFO] No tier1 comparisons (tissue-matched normal) found. Use tier2&tier3 only mode. " + if prioritized_only: + extracellularAnnotation(IRIS_screening_result+'/'+analysis_name+'.tier2tier3.txt', splicing_event_type, extracelllularDict, deltaPSI_cut_off, select_all, gene_exp_matrix, pep_dir_prefix) + if extracellular_only==False: + epitopePredictionPrep(IRIS_screening_result+'/'+splicing_event_type+'.tier2tier3',hla_list_fin, analysis_name, iedb_path, epitope_len_list, task_dir, pep_dir_prefix) + else: + extracellularAnnotation(IRIS_screening_result+'/'+analysis_name+'.tier1.txt', splicing_event_type, extracelllularDict, deltaPSI_cut_off, select_all, gene_exp_matrix, pep_dir_prefix) + extracellularAnnotation(IRIS_screening_result+'/'+analysis_name+'.tier2tier3.txt', splicing_event_type, extracelllularDict, deltaPSI_cut_off, select_all, gene_exp_matrix, pep_dir_prefix) + if extracellular_only==False: + epitopePredictionPrep(IRIS_screening_result+'/'+splicing_event_type+'.tier1',hla_list_fin, analysis_name, iedb_path, epitope_len_list, task_dir, pep_dir_prefix) + #epitopePredictionPrep(IRIS_screening_result+'/'+splicing_event_type+'.prioritized',hla_list_fin, analysis_name, iedb_path, epitope_len_list, task_dir) #Only primary is needed as priortized can be parsed from it + if __name__ == '__main__': main() diff --git a/IRIS/IRIS_screening.py b/IRIS/IRIS_screening.py index e33b302..9214977 100644 --- a/IRIS/IRIS_screening.py +++ b/IRIS/IRIS_screening.py @@ -1,4 +1,3 @@ - import numpy as np import os, glob, pyBigWig, argparse from scipy import stats @@ -57,28 +56,29 @@ def fetch_PsiMatrix(eid, fn, outdir, delim, index=None): data = np.asarray(f.readline().strip().split(delim)) return (header, data) -def openTestingFout(outdir, out_prefix, summary_file, ref_list, test_mode): +def openTestingFout(outdir, out_prefix, splicing_event_type, summary_file, panel_list, test_mode, fin_name=''): header=['as_event','meanPSI','Q1PSI','Q3PSI'] if test_mode=='group': 
header_prefix=['_pVal','_deltaPSI','_tumorFC'] if test_mode=='personalized': header_prefix=['_modifiedPctl','_deltaPSI','_tumorFC'] - fout=open(outdir+'/'+out_prefix+'.test.all.txt','w') + fout_name=outdir+'/'+out_prefix+'.'+splicing_event_type+'.test.all_'+fin_name+'.txt' + fout=open(fout_name,'w') if summary_file==False: - header+=['\t'.join(map(lambda x:ref+x ,header_prefix)) for ref in ref_list if ref!=out_prefix] + header+=['\t'.join(map(lambda x:ref+x ,header_prefix)) for ref in panel_list if ref!=out_prefix] fout.write('\t'.join(header)+'\n') - return fout + return fout, fout_name -def openScreeningFout(outdir, out_prefix, fout_name): - fout=open(outdir+'/'+out_prefix+'.'+fout_name+'.txt','w') +def openScreeningFout(outdir, out_prefix, splicing_event_type, fout_name): + fout=open(outdir+'/'+out_prefix+'.'+splicing_event_type+'.'+fout_name+'.txt','w') fout.write('{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n'.format('as_event','meanPSI','Q1PSI','Q3PSI','deltaPSI','fc_of_tumor_isoform','tissue_matched_normal_panel','tumor_panel','normal_panel','tag','mappability','mappability_tag')) return fout -def writeSummaryFile(out_prefix, db_dir, index, fout): #no screening, preparing for MS search - fin_name=db_dir+'/'+out_prefix+'/splicing_matrix/splicing_matrix.SE.cov10.'+group_name+'.txt' +def writeSummaryFile(out_prefix, splicing_event_type, db_dir, index, fout, fetching_data_col): #no screening, preparing for MS search + fin_name=db_dir+'/'+out_prefix+'/splicing_matrix/splicing_matrix.'+splicing_event_type+'.cov10.'+group_name+'.txt' index[out_prefix]=read_PsiMatrix_index(fin_name,'/'.join(fin_name.split('/')[:-1])) for j,k in enumerate(index[out_prefix]): - psi_event=map(float,fetch_PsiMatrix(k,fin_name,'.','\t',index[out_prefix])[1][8:]) + psi_event=map(float,fetch_PsiMatrix(k,fin_name,'.','\t',index[out_prefix])[1][fetching_data_col:]) query_mean=[np.nanmean(psi_event),np.nanpercentile(psi_event,25),np.nanpercentile(psi_event,75)] result=[k]+query_mean fout.write('\t'.join(map(str,result))+'\n') @@ -107,6 +107,21 @@ def getMappability(splicing_event,bw_map,d): mappability=[str(up_mean),str(target_mean),str(down_mean)] return mappability +def getDirection(filter1_panel_list, psi, test, non_parametric, screening_type): + psi_primary=[] + deltaPSI_primary=[] + for primary_group in filter1_panel_list: + if test[primary_group]!=['-']*3: + psi_primary+=psi[primary_group] + deltaPSI_primary.append(test[primary_group][1]) + if psi_primary==[]: #in case tissue-matched normal doesn't have data + return [],[] + else: + direction='greater' if non_parametric else 'larger' + if np.median(deltaPSI_primary)<=0: + direction='less' if non_parametric else 'smaller' + return psi_primary, direction + def calcTumorFormFoc(delta_psi, mean_psi): if delta_psi>0: return mean_psi/(mean_psi - delta_psi+10**-8) @@ -137,91 +152,127 @@ def one2N(p1, g1, test_type): else: return [np.nan,delta_psi,tumor_foc] -#def groupTest(g1,g2, test_type, threshold_tost=0.05): -def groupTest(g1,g2, test_type, direction='two-sided'): +def statTest(g1,g2, direction, non_parametric): + if direction != 'equivalence': + if non_parametric: + pvalue=stats.mannwhitneyu(g1,g2, alternative=direction)[1] + else: + pvalue=smw.ttest_ind(g1,g2, alternative=direction, usevar='unequal')[1] + #pvalue=smw.ttest_ind(g1,g2, alternative=direction)[1] + else: + threshold_tost = 0.05 + pvalue=smw.ttost_ind(g1,g2,-threshold_tost,threshold_tost,usevar='unequal')[0] #equivalence test + return pvalue + +def statTest_minSampleCount(g1,g2, direction, 
non_parametric):#Only enabled when filtering by min_sample_count. With enough samples, the default setting (equal variance for both groups) is OK.
+    if direction != 'equivalence':
+        if non_parametric:
+            pvalue=stats.mannwhitneyu(g1,g2, alternative=direction)[1]
+        else:
+            pvalue=smw.ttest_ind(g1,g2, alternative=direction)[1]
+        #pvalue=smw.ttest_ind(g1,g2, alternative=direction)[1]
+    else:
+        threshold_tost = 0.05
+        pvalue=smw.ttost_ind(g1,g2,-threshold_tost,threshold_tost)[0] #equivalence test
+    return pvalue
+
+def groupTest(g1,g2, non_parametric=False, direction='two-sided', min_sample_count=False):
     g1=np.array(g1)
     g2=np.array(g2)
     g1=g1[~np.isnan(g1)]
     g2=g2[~np.isnan(g2)]
     delta_psi=np.nanmean(g1)-np.nanmean(g2)
     tumor_foc=calcTumorFormFoc(delta_psi,np.nanmean(g1))
-    if test_type=='sig':
-        t1=stats.ttest_ind(g1,g2)[1]
-    elif test_type=='equ':
-        t1=smw.ttest_ind(g1,g2,alternative='two-sided')[1]
-    #    t1=smw.ttost_ind(g1,g2,-threshold_tost,threshold_tost,usevar='unequal')[0] #equalvalence test
-    return [t1, delta_psi, tumor_foc]
-
-def getDirection(filter1, psi, test):
-    psi_primary=[]
-    deltaPSI_primary=[]
-    for primary_group in filter1:
-        if test[primary_group]!=['-']*3:
-            psi_primary+=psi[primary_group]
-            deltaPSI_primary+=test[primary_group]
-    if psi_primary==[]: #in case tissue-matched normal doesn't have data
-        return [],[]
-    else:
-        direction='larger'
-        if np.median(deltaPSI_primary)<=0:
-            direction='smaller'
-        return psi_primary, direction
-
-def summarizeTestResult(filter1_cutoff_pval, filter1_cutoff_dpsi, filter1_cutoff_foc,filter2_cutoff_pval, filter2_cutoff_dpsi, filter2_cutoff_foc, filter3_cutoff_pval, filter3_cutoff_dpsi, filter3_cutoff_foc, primary, tumor_rec, pval, deltaPSI, foc, testing_type_index):
-    differential,equal,positive,negative=[0,0,0,0]
-    testable=0
+    if min_sample_count:
+        pvalue = statTest_minSampleCount(g1, g2, direction, non_parametric)
+    else:
+        pvalue = statTest(g1, g2, direction, non_parametric)
+    return [pvalue, delta_psi, tumor_foc]
+
+def performTest(set_matched_tumor, has, j, group, screening_type_list, psi, out_prefix, non_parametric, test, filter1_panel_list, psi_primary, direction, min_sample_count): #A PSI value-based screen allows two-sided or one-sided tests, unlike the SJ count (CPM)-based screen, where only a one-sided test is needed.
+    screening_type = screening_type_list[j]
+    redirect_output = False
+    test_result = ['-']*3 #For missing values in non-essential tests/comparisons
+    if screening_type == 'association':
+        if has[group]:
+            test_result = groupTest(psi[out_prefix],psi[group], non_parametric,"two-sided", min_sample_count)
+        return test_result
+
+    has_matched_tumor = False if psi_primary == [] else True #for clarity
+
+    if screening_type == 'recurrence':
+        if set_matched_tumor and has_matched_tumor:#set_matched_tumor is redundant here; kept for future implementation of additional output types.
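
When no tissue-matched normal is available, the recurrence comparison just below falls back to an equivalence (TOST) test between the input tumor and the tumor panel instead of a difference test. A usage sketch with made-up PSI values and the same +/-0.05 margin as threshold_tost:

    import statsmodels.stats.weightstats as smw

    g1 = [0.41, 0.44, 0.40, 0.43, 0.42]  # input tumor PSI (made-up values)
    g2 = [0.42, 0.45, 0.41, 0.40, 0.44]  # tumor-panel PSI (made-up values)
    # A small p-value supports equivalence within the +/-0.05 PSI margin.
    p_equiv = smw.ttost_ind(g1, g2, -0.05, 0.05, usevar='unequal')[0]
    print(p_equiv)
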
+            if has[group]:
+                test_result = groupTest(psi[group],psi_primary, non_parametric, direction, min_sample_count)
+        else:
+            if has[group]:
+                test_result = groupTest(psi[out_prefix],psi[group], non_parametric, "equivalence", min_sample_count)#No or equivalent testing -John's use case
+        return test_result
+
+    if screening_type == 'association_high':
+        if set_matched_tumor and has_matched_tumor:
+            if has[group]:
+                test_result = groupTest(psi[out_prefix],psi[group], non_parametric, direction, min_sample_count)
+        else:
+            if has[group]:
+                test_result = groupTest(psi[out_prefix],psi[group], non_parametric, "two-sided", min_sample_count) #Two-sided testing
+        return test_result
+
+def summarizeTestResult(filter1_cutoff_pval, filter1_cutoff_dpsi, filter1_cutoff_foc,filter2_cutoff_pval, filter2_cutoff_dpsi, filter2_cutoff_foc, filter3_cutoff_pval, filter3_cutoff_dpsi, filter3_cutoff_foc, pval, deltaPSI, foc, screening_type_list):
+    association_passed,recurrence_passed,specificity_positive,specificity_negative, specificity_testable=[0,0,0,0,0]
     primary_result, primary_result_foc=[[],[]] #take care of multiple tissue-matched norm
-    deltapsi_list,foc_list=[[],[]] #if no tissue-matched norm, use median
-    for i,group_type in enumerate(testing_type_index):
-        if pval[i]=='-':
+    deltapsi_list_voted,foc_list_voted=[[],[]] #if no tissue-matched norm, use median
+    for i,group_type in enumerate(screening_type_list):
+        if pval[i]=='-': #This is important - skip all missing values; they add nothing to the summary but do not affect consistency.
             continue
-        if i<primary:
+        if screening_type_list[i]=='association':
             primary_result.append(float(deltaPSI[i]))
             primary_result_foc.append(float(foc[i]))
             if float(pval[i])<=filter1_cutoff_pval and abs(float(deltaPSI[i]))>=filter1_cutoff_dpsi and float(foc[i])>=filter1_cutoff_foc:
-                differential+=1
+                association_passed+=1
             continue
-        if testing_type_index[i]=='equ':
+        if screening_type_list[i]=='recurrence':
             if float(pval[i])<=filter2_cutoff_pval and abs(float(deltaPSI[i]))>=filter2_cutoff_dpsi:
-                equal+=1
+                recurrence_passed+=1
             continue
-        if testing_type_index[i]=='sig' and i>=(primary+tumor_rec):
-            testable+=1
+        if screening_type_list[i]=='association_high':# TODO: judge set/has, then run
+            specificity_testable+=1
             if float(pval[i])<=filter3_cutoff_pval and float(foc[i])>=filter3_cutoff_foc:
-                deltapsi_list.append(float(deltaPSI[i]))
-                foc_list.append(float(foc[i]))
+                deltapsi_list_voted.append(float(deltaPSI[i]))
+                foc_list_voted.append(float(foc[i]))
                 if float(deltaPSI[i])>=filter3_cutoff_dpsi:
-                    positive+=1
+                    specificity_positive+=1
                     continue
                 if float(deltaPSI[i])<=-filter3_cutoff_dpsi:
-                    negative+=1
+                    specificity_negative+=1
                     continue
-    return differential,equal,positive,negative,testable, np.median(primary_result), np.median(primary_result_foc), deltapsi_list,foc_list
+    return association_passed,recurrence_passed,specificity_positive,specificity_negative,specificity_testable, np.median(primary_result), np.median(primary_result_foc), deltapsi_list_voted,foc_list_voted

-def defineTumorEvents(filter1_group_cutoff,filter2_group_cutoff,filter3_group_cutoff, primary, norm_tissue, differential, equal, positive, negative, testable, primary_result, primary_result_foc, deltapsi_list,foc_list, use_ratio):
+def defineTumorEvents(filter1_group_cutoff,filter2_group_cutoff,filter3_group_cutoff, set_matched_tumor, specificity_panel_len, association_passed, recurrence_passed, specificity_positive, specificity_negative, specificity_testable, primary_result, primary_result_foc, deltapsi_list_voted, foc_list_voted, use_ratio):
     tag=[]
-    if differential>=filter1_group_cutoff:
+    if association_passed>=filter1_group_cutoff:#Improvement? 
current: 0>='' is false tag.append('associated') - if equal>=filter2_group_cutoff: + if recurrence_passed>=filter2_group_cutoff: tag.append('recurrent') - if primary==0: - tissue_specificity=max(positive,negative) - ratio=False if use_ratio==False else tissue_specificity/(testable+10**-8)>=filter3_group_cutoff/(norm_tissue+0.0) - if tissue_specificity>=filter3_group_cutoff or ratio: - tag.append('specific') - else: + if set_matched_tumor:#TODO-FUTURE: take care of set-yes has-no redirected events if primary_result>0: - tissue_specificity=positive - ratio=False if use_ratio==False else positive/(testable+10**-8)>=filter3_group_cutoff/(norm_tissue+0.0) - if positive>=filter3_group_cutoff or ratio: - tag.append('specific') + tissue_specificity=specificity_positive + ratio=False if use_ratio==False else specificity_positive/(specificity_testable+10**-8)>=filter3_group_cutoff/(specificity_panel_len+0.0) + if specificity_positive>=filter3_group_cutoff or ratio: + tag.append('high_assoc') else: - tissue_specificity=negative - ratio=False if use_ratio==False else negative/(testable+10**-8)>=filter3_group_cutoff/(norm_tissue+0.0) - if negative>=filter3_group_cutoff or ratio: - tag.append('specific') - primary_deltapsi=primary_result if primary_result!=0 else np.median(deltapsi_list) - primary_foc=primary_result_foc if primary_result!=0 else np.median(foc_list) + tissue_specificity=specificity_negative + ratio=False if use_ratio==False else specificity_negative/(specificity_testable+10**-8)>=filter3_group_cutoff/(specificity_panel_len+0.0) + if specificity_negative>=filter3_group_cutoff or ratio: + tag.append('high_assoc') + else: + tissue_specificity=max(specificity_positive,specificity_negative) + ratio=False if use_ratio==False else tissue_specificity/(specificity_testable+10**-8)>=filter3_group_cutoff/(specificity_panel_len+0.0) + if tissue_specificity>=filter3_group_cutoff or ratio: + tag.append('high_assoc') + primary_deltapsi=primary_result if (primary_result!=0 and set_matched_tumor) else np.median(deltapsi_list_voted)#TODO + primary_foc=primary_result_foc if (primary_result!=0 and set_matched_tumor) else np.median(foc_list_voted) + return primary_deltapsi, primary_foc, tissue_specificity, tag def mappability_write(k, bw_map, calc_length): @@ -244,25 +295,28 @@ def loadBlacklistEvents(fin): BlacklistEvents[des]='' return BlacklistEvents -def translationCMD(ref_genome, outdir, out_prefix, fout_name): +def translationCMD(ref_genome, gtf, outdir, out_prefix, splicing_event_type, all_orf, ignore_annotation, remove_early_stop, fout_name): #uSE name space - #Namespace(outdir='test1', parameter_fin='/u/home/p/panyang/bigdata-nobackup/Glioma_test/GBM.prioritze.par', subcommand='screening', translating=False) - cmd_translation='IRIS translation '+outdir+'/'+out_prefix+'.'+fout_name+'.txt '+' -o '+outdir+'/'+fout_name+' -g '+ref_genome + argument_line='' + if all_orf: + argument_line+=' --all-orf' + if ignore_annotation: + argument_line+=' --ignore-annotation' + if remove_early_stop: + argument_line+=' --remove-early-stop' + cmd_translation='IRIS translate '+outdir+'/'+out_prefix+'.'+splicing_event_type+'.'+fout_name+'.txt '+' -o '+outdir+'/'+splicing_event_type+'.'+fout_name+' -g '+ref_genome+' -t '+splicing_event_type+argument_line+' --gtf '+gtf print '[INFO] Working on translating: '+fout_name os.system(cmd_translation) -def loadParametersRow(filter_para, ref_list): - if len(filter_para.split(' '))==6: - filter_cutoff_pval, filter_cutoff_dpsi, filter_cutoff_foc, filter_group_cutoff, filter_list 
=filter_para.split(' ')[1:] - filter_cutoff_pval=float(filter_cutoff_pval) - filter_cutoff_dpsi=float(filter_cutoff_dpsi) - filter_cutoff_foc=float(filter_cutoff_foc) - filter_group_cutoff=int(filter_group_cutoff) - filter_list=filter_list.split(',') - ref_list+=filter_list +def loadParametersRow(filter_para, panel_list): + filter_cutoffs='' + if filter_para.strip()!='': + filter_cutoffs = map(float,filter_para.strip().split(' ')[0].split(',')) + filter_panel_list = filter_para.strip().split(' ')[1].split(',') + panel_list+=filter_panel_list else: - filter_cutoff_pval, filter_cutoff_dpsi, filter_cutoff_foc, filter_group_cutoff, filter_list =['','','','',[]] - return filter_cutoff_pval, filter_cutoff_dpsi, filter_cutoff_foc, filter_group_cutoff, filter_list, ref_list + filter_panel_list =[] + return filter_cutoffs, filter_panel_list, panel_list def main(args): @@ -270,162 +324,238 @@ def main(args): index={} fin_list={} para_fin=args.parameter_fin + splicing_event_type=args.splicing_event_type + fetching_data_col=8 if splicing_event_type == 'SE' else 10 out_prefix,db_dir,filter1_para,filter2_para,filter3_para,test_mode,use_ratio,blacklist_path,mappability_path,ref_genome=[l.strip() for l in open(para_fin)] - ref_list=[out_prefix] + panel_list=[out_prefix] + test_mode=test_mode.split(' ') use_ratio=True if use_ratio=='True' else False - if blacklist_path=='': - blacklist_path=config.BRAIN_BLACKLIST_PATH - blacklist_events=loadBlacklistEvents(blacklist_path) + blacklist_events={} + min_sample_count=args.min_sample_count + if min_sample_count: + min_sample_count=int(min_sample_count) + if blacklist_path!='': + #if blacklist_path=='BRAIN_BLACKLIST_PATH': + blacklist_events=loadBlacklistEvents(blacklist_path) bw_map,calc_length=loadMappability(mappability_path) - filter1_cutoff_pval, filter1_cutoff_dpsi, filter1_cutoff_foc, filter1_group_cutoff, filter1, ref_list =loadParametersRow(filter1_para, ref_list) - filter2_cutoff_pval, filter2_cutoff_dpsi, filter2_cutoff_foc, filter2_group_cutoff, filter2, ref_list =loadParametersRow(filter2_para, ref_list) - filter3_cutoff_pval, filter3_cutoff_dpsi, filter3_cutoff_foc, filter3_group_cutoff, filter3, ref_list =loadParametersRow(filter3_para, ref_list) - if filter1==[] and filter2==[] and filter3==[] and test_mode!='summary': + all_orf=args.all_orf + ignore_annotation=args.ignore_annotation + remove_early_stop=args.remove_early_stop + use_existing_test_result=args.use_existing_test_result + + filter1_cutoffs, filter1_panel_list, panel_list = loadParametersRow(filter1_para, panel_list) + filter2_cutoffs, filter2_panel_list, panel_list = loadParametersRow(filter2_para, panel_list) + filter3_cutoffs, filter3_panel_list, panel_list = loadParametersRow(filter3_para, panel_list) + + if filter1_cutoffs!='': + filter1_cutoff_pval, filter1_cutoff_dpsi, filter1_cutoff_foc, filter1_group_cutoff=filter1_cutoffs[:3]+[filter1_cutoffs[4]] + else: + filter1_cutoff_pval, filter1_cutoff_dpsi, filter1_cutoff_foc, filter1_group_cutoff=['','','',''] + if filter2_cutoffs!='': + filter2_cutoff_pval, filter2_cutoff_dpsi, filter2_cutoff_foc, filter2_group_cutoff=filter2_cutoffs[:3]+[filter2_cutoffs[4]] + else: + filter2_cutoff_pval, filter2_cutoff_dpsi, filter2_cutoff_foc, filter2_group_cutoff=['','','',''] + if filter3_cutoffs!='': + filter3_cutoff_pval, filter3_cutoff_dpsi, filter3_cutoff_foc, filter3_group_cutoff=filter3_cutoffs[:3]+[filter3_cutoffs[4]] + else: + filter3_cutoff_pval, filter3_cutoff_dpsi, filter3_cutoff_foc, filter3_group_cutoff=['','','',''] + if 
filter1_panel_list==[] and filter2_panel_list==[] and filter3_panel_list==[] and test_mode[0]!='summary': exit("[Error] No filtering required in parameteres file. exit!") - group_test=False if test_mode!='group' else True - individual_test=False if test_mode!='personalized' else True - summary_file=False if test_mode!='summary' else True + + non_parametric=False + if len(test_mode)>1: + non_parametric=True if test_mode[1]=='nonparametric' else False + + group_test=False if test_mode[0]!='group' else True + individual_test=False if test_mode[0]!='personalized' else True + summary_file=False if test_mode[0]!='summary' else True if [group_test,individual_test,summary_file]==[False,False,False]: exit('[Error] Need to choose one mode.exit!') - primary=len(filter1) - tumor_rec=len(filter2) - norm_tissue=len(filter3) - filter_count=sum(1 for i in [primary, tumor_rec, norm_tissue] if i!=0) - testing_type_index=['sig']*primary+['equ']*tumor_rec+['sig']*norm_tissue - - db_dir=db_dir.rstrip('/') + association_panel_len=len(filter1_panel_list) + recurrence_panel_len=len(filter2_panel_list) + specificity_panel_len=len(filter3_panel_list) + panel_count=sum(1 for i in [association_panel_len, recurrence_panel_len, specificity_panel_len] if i!=0) + screening_type_list=['association']*association_panel_len+['recurrence']*recurrence_panel_len+['association_high']*specificity_panel_len + set_matched_tumor= True if screening_type_list[0] == 'association' else False + + if args.translating: + gtf=args.gtf + if os.path.exists(gtf)==False: + exit('[Error] No gtf file provided for translation. exit!') + + ###Create Folders/Output#### outdir=args.outdir.rstrip('/') os.system('mkdir -p '+outdir) + db_dir=db_dir.rstrip('/') - ###Create Folders/Output#### - fout= openTestingFout(outdir, out_prefix, summary_file, ref_list, test_mode) - if summary_file: - writeSummaryFile(out_prefix, db_dir, index, fout) - exit() - - fout_filtered=open(outdir+'/'+out_prefix+'.notest.txt','w') - fout_primary=openScreeningFout(outdir, out_prefix, 'primary') - fout_prioritized=openScreeningFout(outdir,out_prefix, 'prioritized') + if use_existing_test_result==False: + fout_direct, fout_direct_name= openTestingFout(outdir, out_prefix, splicing_event_type, summary_file, panel_list, test_mode[0], 'guided') + fout_redirect, fout_redirect_name=openTestingFout(outdir, out_prefix, splicing_event_type, summary_file, panel_list, test_mode[0], 'voted') + if summary_file: + writeSummaryFile(out_prefix, splicing_event_type, db_dir, index, fout_direct, fetching_data_col) + exit() + fout_filtered=open(outdir+'/'+out_prefix+'.'+splicing_event_type+'.notest.txt','w') + else: + fout_direct_name=outdir+'/'+out_prefix+'.'+splicing_event_type+'.test.all_guided.txt' + fout_redirect_name=outdir+'/'+out_prefix+'.'+splicing_event_type+'.test.all_voted.txt' - for group_name in ref_list:##Load IRIS reference panels - fin_list[group_name]=db_dir+'/'+group_name+'/splicing_matrix/splicing_matrix.SE.cov10.'+group_name+'.txt' + fout_primary=openScreeningFout(outdir, out_prefix, splicing_event_type, 'tier1') + fout_prioritized=openScreeningFout(outdir,out_prefix, splicing_event_type, 'tier2tier3') + ##Load IRIS reference panels + for group_name in panel_list: + fin_list[group_name]=db_dir+'/'+group_name+'/splicing_matrix/splicing_matrix.'+splicing_event_type+'.cov10.'+group_name+'.txt' for group in fin_list.keys(): if not os.path.isfile(fin_list[group]+'.idx'): exit('[Error] Need to index '+fin_list[group]) 
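
The check above expects a byte-offset index (.idx) beside each splicing matrix so that fetch_PsiMatrix can seek() straight to an event instead of scanning the whole file. A hypothetical helper showing the index format (one event_id<TAB>byte_offset pair per data row); the pipeline's own indexing step should be used in practice:

    def build_psi_matrix_index(matrix_path):
        # Hypothetical sketch: record the byte offset of every data row,
        # keyed by the event ID in the first tab-delimited column.
        with open(matrix_path) as f, open(matrix_path + '.idx', 'w') as idx:
            f.readline()              # skip the header line
            offset = f.tell()
            line = f.readline()
            while line:
                idx.write('{}\t{}\n'.format(line.split('\t', 1)[0], offset))
                offset = f.tell()
                line = f.readline()
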
index[group]=read_PsiMatrix_index(fin_list[group],'/'.join(fin_list[group].split('/')[:-1])) - has={} - #for j,k in enumerate(['ENSG00000083520:DIS3:chr13:-:73345041:73345126:73343050:73345218','ENSG00000110075:PPP6R3:chr11:+:68350510:68350597:68343511:68355265']): - tot=len(index[out_prefix])-1 - print '[INFO] IRIS screening started. Total input events:', tot+1 - for event_idx,k in enumerate(index[out_prefix]): - config.update_progress(event_idx/(0.0+tot)) - - for group in ref_list:#Initiate - if group!=out_prefix: - has[group]=True - psi={} - has_count=0 - for group in ref_list: - if k in index[group]: - psi[group]=map(float,fetch_PsiMatrix(k,fin_list[group],'.','\t',index[group])[1][8:]) - has_count+=1 - else: - has[group]=False - cat_psi=[] - for i in psi: - cat_psi+=psi[i] - if abs(max(cat_psi)-min(cat_psi))<0.05:#if change less than 5% skipped and no comparison available - fout_filtered.write('[LowVar]{}\t{}\t{}\n'.format(k,str(abs(max(cat_psi)-min(cat_psi))),str(has_count))) - continue - if k in blacklist_events: - fout_filtered.write('[Blacklisted]{}\t{}\t{}\n'.format(k,'-',str(has_count))) - continue - if has_count<=1: - fout_filtered.write('[NoTest]{}\t{}\t{}\n'.format(k,'-',str(has_count))) - continue + ## Load and perform test by row/event + if use_existing_test_result==False: + has={} + tot=len(index[out_prefix])-1 + print '[INFO] IRIS screen - started. Total input events:', tot+1 + + for event_idx,k in enumerate(index[out_prefix]): + config.update_progress(event_idx/(0.0+tot)) - if group_test: - test={} - query_mean=[np.nanmean(psi[out_prefix]),np.nanpercentile(psi[out_prefix],25),np.nanpercentile(psi[out_prefix],75)] - for j,group in enumerate(ref_list[1:]): - test[group]=['-']*3 - if has[group]: - if testing_type_index[j]=='equ': - psi_primary, direction= getDirection(filter1, psi, test) - if psi_primary==[]: #in case tissue-matched normal doesn't have data - continue - test[group]=groupTest(psi[group],psi_primary, testing_type_index[j], direction) - else: - test[group]=groupTest(psi[out_prefix],psi[group],testing_type_index[j]) - - result=[k]+query_mean+['\t'.join(map(str,test[t])) for t in ref_list if t!=out_prefix] - fout.write('\t'.join(map(str,result))+'\n') - - ## summarize test result and prioritze - pval=[test[t][0] for t in ref_list if t!=out_prefix] - deltaPSI=[test[t][1] for t in ref_list if t!=out_prefix]# select deltapsi col of screening result - foc=[test[t][2] for t in ref_list if t!=out_prefix] - - differential,equal,positive,negative,testable,primary_result, primary_result_foc, deltapsi_list,foc_list=summarizeTestResult(filter1_cutoff_pval, filter1_cutoff_dpsi, filter1_cutoff_foc,filter2_cutoff_pval, filter2_cutoff_dpsi, filter2_cutoff_foc, filter3_cutoff_pval, filter3_cutoff_dpsi, filter3_cutoff_foc, primary, tumor_rec, pval, deltaPSI, foc, testing_type_index) + #Initiate + for group in panel_list: + if group!=out_prefix: + has[group]=True + psi={} + has_count=0 + for group in panel_list: + if k in index[group]: + psi[group]=map(float,fetch_PsiMatrix(k,fin_list[group],'.','\t',index[group])[1][fetching_data_col:]) + has_count+=1 + else: + has[group]=False + #Filtering + cat_psi=[] + for i in psi: + cat_psi+=psi[i] + if abs(max(cat_psi)-min(cat_psi))<0.05:#if change less than 5% skipped and no comparison available + fout_filtered.write('[Low Range]{}\t{}\t{}\n'.format(k,str(abs(max(cat_psi)-min(cat_psi))),str(has_count))) + continue + if k in blacklist_events: + fout_filtered.write('[Blacklisted]{}\t{}\t{}\n'.format(k,'-',str(has_count))) + continue + if 
has_count<=1: + fout_filtered.write('[Unique in Input]{}\t{}\t{}\n'.format(k,'-',str(has_count))) + continue + if min_sample_count: + sample_count=np.count_nonzero(~np.isnan(psi[out_prefix])) + if sample_count=filter1_group_cutoff or filter1_group_cutoff=='') and (significant_tumor>=filter2_group_cutoff or filter2_group_cutoff=='') and (significant_normal>=filter3_group_cutoff or filter3_group_cutoff==''): + fout_cpm_sig.write(k+'\t'+'\t'.join(map(str,write_sj_list))+'\n') + sig_junction[k]='|'.join(map(str, [write_sj_list[0],significant_normal_match,significant_tumor,significant_normal])) + fout_cpm_count.write(k+'\t'+'\t'.join(map(str,write_sj_list))+'\n') + fout_cpm_count.close() + fout_cpm_sig.close() + + else: + print 'Use existing testing result.' + fout_cpm_count_name=outdir+'/CPM.'+out_prefix+'.'+splicing_event_type+'.test_all.txt' + for i, l in enumerate(open(fout_cpm_count_name)): + if i==0: + header=l.strip().split('\t') + group_list=map(lambda x: x.split('_CPM')[0], header[2::3]) + else: + ls=l.strip().split('\t') + if ls[1]=='NA': + continue + cpm_value= map(float,ls[2::3])# don't do map because '-' + change_value = map(float,ls[3::3]) + p_value= map(float,ls[4::3]) + significant_normal_match=0 + significant_normal=0 + significant_tumor=0 + #determine difference of a junction + for j,group in enumerate(group_list): + if group in filter1_panel_list: + if p_value[j]<=pvalue_cutoff_normal: + significant_normal_match+=1 + elif group in filter2_panel_list: + if p_value[j]<=pvalue_cutoff_tumor: + significant_tumor+=1 + else: + if p_value[j]<=pvalue_cutoff_normal: + significant_normal+=1 + if (significant_normal_match>=filter1_group_cutoff or filter1_group_cutoff=='') and (significant_tumor>=filter2_group_cutoff or filter2_group_cutoff=='') and (significant_normal>=filter3_group_cutoff or filter3_group_cutoff==''): + fout_cpm_sig.write(l.strip()+'\n') + sig_junction[ls[0]]=':'.join(map(str, [round(float(ls[1]),2),significant_normal_match,significant_tumor,significant_normal])) + fout_cpm_sig.close() + + #sig_junction=loadSigJunction(fout_cpm_sig_fname) + fout_summary_fname=summarizeSJ2ASevent(event_list_fin, splicing_event_type, sig_junction, outdir, out_prefix) + + +if __name__ == '__main__': + main() diff --git a/IRIS/IRIS_screening_novelss.py b/IRIS/IRIS_screening_novelss.py new file mode 100644 index 0000000..1224e55 --- /dev/null +++ b/IRIS/IRIS_screening_novelss.py @@ -0,0 +1,538 @@ +import numpy as np +import sys +import os, glob, pyBigWig, argparse +from scipy import stats +import statsmodels.stats.weightstats as smw +from . 
import config +import warnings +warnings.filterwarnings("ignore") + +def read_PsiMatrix_index(fn,outdir): + index = {} + for line in open(outdir+'/'+fn.split('/')[-1]+'.idx', 'r'): + ele = line.strip().split() + index[ele[0]] = int(ele[1]) + return index + +def fetch_PsiMatrix(eid, fn, delim, index=None): + with open(fn, 'r') as f: + ele = f.readline().strip().split(delim) + header = np.asarray([ x.split('.aln')[0] for x in ele ]) + f.seek(index[eid], 0) + data = np.asarray(f.readline().strip().split(delim)) + return (header, data) + +def read_SJMatrix_index(fn,outdir): + index = {} + for line in open(outdir+'/'+fn.split('/')[-1]+'.idx', 'r'): + ele = line.strip().split() + index[ele[0]] = int(ele[1]) + return index + +def fetch_SJMatrix(eid, fn, delim, index, head_only): + with open(fn, 'r') as f: + if head_only: + ele = f.readline().strip().split(delim) + retrieved_text = np.asarray([ x.split('.aln')[0] for x in ele ]) + else: + f.seek(index[eid], 0) + retrieved_text = np.asarray(f.readline().strip().split(delim)) + return retrieved_text + +def loadParametersRow(filter_para, panel_list): + if filter_para.strip()!='': + para, filter_panel_list=filter_para.split(' ') + filter_cutoff_pval, filter_cutoff_dpsi, filter_cutoff_foc, filter_cutoff_pval_PT, filter_group_cutoff =para.split(',') + filter_cutoff_pval=float(filter_cutoff_pval) + filter_cutoff_dpsi=float(filter_cutoff_dpsi) + filter_cutoff_foc=float(filter_cutoff_foc) + filter_group_cutoff=int(filter_group_cutoff) + filter_panel_list=filter_panel_list.split(',') + panel_list+=filter_panel_list + else: + filter_cutoff_pval, filter_cutoff_dpsi, filter_cutoff_foc, filter_group_cutoff, filter_panel_list =['','','','',[]] + return filter_cutoff_pval, filter_cutoff_dpsi, filter_cutoff_foc, filter_group_cutoff, filter_panel_list, panel_list + +def openTestingFout(outdir, out_prefix, splicing_event_type, summary_file, panel_list, fin_name=''): + header=['as_event','meanPSI','Q1PSI','Q3PSI'] + header_prefix=['_pVal','_deltaPSI','_tumorFC'] + fout_name=outdir+'/'+out_prefix+'.'+splicing_event_type+'.test_JCRA.all_'+fin_name+'.txt' + fout=open(fout_name,'w') + if summary_file==False: + header+=['\t'.join(map(lambda x:ref+x ,header_prefix)) for ref in panel_list if ref!=out_prefix] + fout.write('\t'.join(header)+'\n') + return fout, fout_name + +def openScreeningFout(outdir, out_prefix, splicing_event_type, fout_name): + fout=open(outdir+'/'+out_prefix+'.'+splicing_event_type+'.'+fout_name+'.txt','w') + fout.write('{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n'.format('as_event','meanPSI','Q1PSI','Q3PSI','deltaPSI','fc_of_tumor_isoform','tissue_matched_normal_panel','tumor_panel','normal_panel','tag','mappability','mappability_tag','novel_ss_info')) + return fout + +def loadMappability(bigwig_fin): + bw_map=pyBigWig.open(bigwig_fin) + d=45 + return bw_map,d + +def getMappability(splicing_event,bw_map,d): + arr=splicing_event.split(':') + chrom,strand,start,end,up,down=arr[2],arr[3],int(arr[4]),int(arr[5]),int(arr[6]),int(arr[7]) + up_mean=bw_map.stats("%s"%chrom,up-d,up,type="mean")[0] + down_mean=bw_map.stats("%s"%chrom,down,down+d,type="mean")[0] + if abs(start-end)<2*d: + target_mean=bw_map.stats("%s"%chrom,start,end,type="mean")[0] + else: + target_left=bw_map.stats("%s"%chrom,start,start+d,type="mean")[0] + target_right=bw_map.stats("%s"%chrom,end-d,end,type="mean")[0] + target_mean=(target_right+target_left)/2 + if strand=='-': #switch the order + li=[up_mean,down_mean] + up_mean,down_mean=li[1],li[0] + + 
mappability=[str(up_mean),str(target_mean),str(down_mean)] + return mappability + +def loadGTF(gtf): + exon_start_dict={} + exon_end_dict={} + for l in open(gtf): + if l.startswith('#'): + continue + ls=l.strip().split('\t') + if ls[2]=='exon': + chrom=ls[0] + if chrom.startswith('chr')==False: + chrom='chr'+chrom + exon_start_dict[ls[6]+':'+chrom+':'+ls[3]]='' + exon_end_dict[ls[6]+':'+chrom+':'+ls[4]]='' + return exon_start_dict, exon_end_dict + +def selectJunction_forGTF(AS_coord, deltaPSI_c2n, cut_off, if_select_all, splicing_event_type): #For A3 and A5, inc1/2 are not considered as current sj db doesn't capture that(and it will not always generate novel sequence). May update later. + if splicing_event_type == 'SE': + # end_pos, start_pos; because rMATS is 0-based for start exon, it should +1 when compare to GTF + skp = (AS_coord[2], str(int(AS_coord[3])+1),'skp') + inc1 = (AS_coord[2], str(int(AS_coord[0])+1),'inc1') + inc2 = (AS_coord[1], str(int(AS_coord[3])+1),'inc2') + elif splicing_event_type == 'A3SS': + skp = (AS_coord[5], str(int(AS_coord[2])+1),'skp') + inc1 = (AS_coord[5],str(int(AS_coord[0])+1),'inc1') + inc2 = ''#(AS_coord[2],AS_coord[2],'inc2') + elif splicing_event_type == 'A5SS': + skp = (AS_coord[3], str(int(AS_coord[4])+1),'skp') + inc1 = ''#(AS_coord[3],AS_coord[3],'inc1') + inc2 = (AS_coord[1],str(int(AS_coord[4])+1),'inc2') + elif splicing_event_type == 'RI': + skp = ''#(AS_coord[3], AS_coord[4],'skp')#???? + inc1 = ''#(AS_coord[3],AS_coord[3],'inc1') + inc2 = ''#(AS_coord[4],AS_coord[4],'inc2') + if if_select_all: + return [skp, inc1, inc2] + if float(deltaPSI_c2n) < cut_off: # tumor skipping + return [skp] + else: + return [inc1, inc2] + +def findNovelSS(event_name, deltaPSI_c2n, cut_off, splicing_event_type, exon_start_dict, exon_end_dict):# This is a conservative def of novelSS than rMATS4.1 (0.4% events less- complex cases ) + es=event_name.split(':') + chrom=es[2] + strand=es[3] + AS_coord=es[4:] + selected=selectJunction_forGTF(AS_coord,deltaPSI_c2n, cut_off, False, splicing_event_type) + info=[] + for j in selected: + if j!='': + check1=strand+':'+chrom+':'+j[0] not in exon_end_dict + check2=strand+':'+chrom+':'+j[1] not in exon_start_dict + if check1 or check2: + info.append(j[2])## + if info==[]: + info.append('none') + return ';'.join(info) + +def readEventRow(row, header_line): + if header_line=='' or header_line==False: + rs=row.strip().split('\t') + return rs + else: + rs=row.strip().split('\t') + return dict(zip(header_line, rs)) + +def convert2SJevent(line_dict, splicing_event_type):#match to SJ db (different from gtf and rMATS - 0-based start; +1 for end exon) + if splicing_event_type=='SE': + event_row_list=[line_dict['chr']+':'+str(int(line_dict['upstreamEE'])+1)+':'+line_dict['exonStart'],line_dict['chr']+':'+str(int(line_dict['exonEnd'])+1)+':'+line_dict['downstreamES'], line_dict['chr']+':'+str(int(line_dict['upstreamEE'])+1)+':'+line_dict['downstreamES']] + as_event=line_dict['AC'].strip('"').split('.')[0]+':'+line_dict['GeneName'].strip('"')+':'+line_dict['chr']+':'+line_dict['strand']+':'+line_dict['exonStart']+':'+line_dict['exonEnd']+':'+line_dict['upstreamEE']+':'+line_dict['downstreamES'] + elif splicing_event_type=='A5SS':# Only use one junction for inc. 
Need to improve by updating db later
+	event_row_list=[line_dict['chr']+':'+str(int(line_dict['longExonEnd'])+1)+':'+line_dict['flankingES'],line_dict['chr']+':'+str(int(line_dict['shortEE'])+1)+':'+line_dict['flankingES']]
+	as_event=line_dict['AC'].strip('"').split('.')[0]+':'+line_dict['GeneName'].strip('"')+':'+line_dict['chr']+':'+line_dict['strand']+':'+line_dict['longExonStart']+':'+line_dict['longExonEnd']+':'+line_dict['shortES']+':'+line_dict['shortEE']+':'+line_dict['flankingES']+':'+line_dict['flankingEE']
+	elif splicing_event_type=='A3SS': # Only use one junction for inc. Need to improve by updating db later
+	event_row_list=[line_dict['chr']+':'+str(int(line_dict['flankingEE'])+1)+':'+line_dict['longExonStart'],line_dict['chr']+':'+str(int(line_dict['flankingEE'])+1)+':'+line_dict['shortES']]
+	as_event=line_dict['AC'].strip('"').split('.')[0]+':'+line_dict['GeneName'].strip('"')+':'+line_dict['chr']+':'+line_dict['strand']+':'+line_dict['longExonStart']+':'+line_dict['longExonEnd']+':'+line_dict['shortES']+':'+line_dict['shortEE']+':'+line_dict['flankingES']+':'+line_dict['flankingEE']
+	else:
+		exit('[Error] Splicing event type not supported. Exiting.')
+	return event_row_list, as_event
+
+def getDirection(panel_dict, psi, test, non_parametric):
+	psi_primary=[]
+	deltaPSI_primary=[]
+	for primary_group in panel_dict: #check if matching normal
+		if test[primary_group]!=['-']*3:
+			psi_primary+=psi[primary_group]
+			deltaPSI_primary.append(test[primary_group][1])
+	if psi_primary==[]: #in case the tissue-matched normal doesn't have data
+		return [],['']
+	else:
+		direction='greater' if non_parametric else 'larger'
+		if np.median(deltaPSI_primary)<=0:
+			direction='less' if non_parametric else 'smaller'
+		return psi_primary, direction
+
+
+def calcTumorFormFoc(delta_psi, mean_psi):
+	if delta_psi>0:
+		return mean_psi/(mean_psi - delta_psi+10**-8)
+	elif delta_psi<0:
+		return (1- mean_psi)/ (1- mean_psi+ delta_psi+10**-8)
+	else:
+		return 1
+
+def statTest(g1,g2, direction, non_parametric):
+	if direction != 'equivalence':
+		if non_parametric:
+			pvalue=stats.mannwhitneyu(g1,g2, alternative=direction)[1]
+		else:
+			pvalue=smw.ttest_ind(g1,g2, alternative=direction, usevar='unequal')[1]
+			#pvalue=smw.ttest_ind(g1,g2, alternative=direction)[1]
+	else:
+		threshold_tost = 0.05
+		pvalue=smw.ttost_ind(g1,g2,-threshold_tost,threshold_tost,usevar='unequal')[0] #equivalence test
+	return pvalue
+
+def statTest_minSampleCount(g1,g2, direction, non_parametric):#Only enabled when filtering by min_sample_count. With enough samples, the default setting that assumes equal variance for both groups is acceptable.
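+	# Unlike statTest above, this variant omits usevar='unequal', so
+	# statsmodels falls back to its default pooled-variance (Student)
+	# t-test; with the min_sample_count filter enabled, both groups are
+	# assumed large enough for the equal-variance setting to be reasonable.
+	# Note the two backends spell one-sided alternatives differently:
+	# scipy's mannwhitneyu takes 'greater'/'less', while statsmodels'
+	# ttest_ind takes 'larger'/'smaller'; getDirection returns the string
+	# matching whichever backend is selected.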
+	if direction != 'equivalence':
+		if non_parametric:
+			pvalue=stats.mannwhitneyu(g1,g2, alternative=direction)[1]
+		else:
+			pvalue=smw.ttest_ind(g1,g2, alternative=direction)[1]
+	else:
+		threshold_tost = 0.05
+		pvalue=smw.ttost_ind(g1,g2,-threshold_tost,threshold_tost)[0] #equivalence test
+	return pvalue
+
+def groupTest(g1,g2, non_parametric=False, direction='two-sided', min_sample_count=False):
+	g1=np.array(g1)
+	g2=np.array(g2)
+	g1=g1[~np.isnan(g1)]
+	g2=g2[~np.isnan(g2)]
+	delta_psi=np.nanmean(g1)-np.nanmean(g2)
+	tumor_foc=calcTumorFormFoc(delta_psi,np.nanmean(g1))
+	if min_sample_count:
+		pvalue = statTest_minSampleCount(g1, g2, direction, non_parametric)
+	else:
+		pvalue = statTest(g1, g2, direction, non_parametric)
+	return [pvalue, delta_psi, tumor_foc]
+
+def performTest(group, matching_norm_dict, tumor_dict, normal_dict, psi, out_prefix, non_parametric, psi_primary, direction):
+	redirect_output = False
+	test_result = ['-']*3 #For missing values in non-essential tests/comparisons
+	has_matched_tumor = False if psi_primary == [] else True #for clarity
+	if group in matching_norm_dict:
+		if psi[group]!=[]:
+			test_result = groupTest(psi[out_prefix],psi[group], non_parametric,"two-sided")
+		return test_result
+
+	elif group in tumor_dict:
+		if has_matched_tumor:#set_matched_tumor is redundant here; kept for future implementation of additional output types.
+			if psi[group]!=[]:
+				test_result = groupTest(psi[group],psi_primary, non_parametric, direction)
+		else:
+			if psi[group]!=[]:
+				test_result = groupTest(psi[out_prefix],psi[group], non_parametric, "equivalence")#No testing or equivalence testing
+		return test_result
+
+	elif group in normal_dict:
+		if has_matched_tumor:
+			if psi[group]!=[]:
+				test_result = groupTest(psi[out_prefix],psi[group], non_parametric, direction)
+		else:
+			if psi[group]!=[]:
+				test_result = groupTest(psi[out_prefix],psi[group], non_parametric, "two-sided") #Two-sided testing
+		return test_result
+	else:
+		exit('[Error] Group not found in any panel.')
+
+
+def summarizeTestResult(filter1_cutoff_pval, filter1_cutoff_dpsi, filter1_cutoff_foc,filter2_cutoff_pval, filter2_cutoff_dpsi, filter2_cutoff_foc, filter3_cutoff_pval, filter3_cutoff_dpsi, filter3_cutoff_foc, pval, deltaPSI, foc, panel_list, matching_norm_dict, tumor_dict, normal_dict):
+	association_passed,recurrence_passed,specificity_positive,specificity_negative, specificity_testable=[0,0,0,0,0]
+	primary_result, primary_result_foc=[[],[]] #take care of multiple tissue-matched normals
+	deltapsi_list_voted,foc_list_voted=[[],[]] #if no tissue-matched normal, use median
+	for i,group in enumerate(panel_list[1:]):
+		if pval[i]=='-': #This is important - skip all missing values, which are not useful for summarizing and do not affect consistency.
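+		# continue skips panels with no test result. The counters below map
+		# onto the three screening filters: filter1 panels (tissue-matched
+		# normals) feed association_passed, filter2 panels (other tumors)
+		# feed recurrence_passed, and filter3 panels (other normals) feed
+		# specificity_positive/negative, with specificity_testable counting
+		# the normal panels that produced a result (the denominator of the
+		# optional ratio cutoff in defineTumorEvents). foc is the tumor-form
+		# fold-change from calcTumorFormFoc, e.g. meanPSI 0.8 with deltaPSI
+		# 0.6 gives 0.8/(0.8-0.6)=4.0.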
+ continue + if group in matching_norm_dict: + primary_result.append(float(deltaPSI[i])) + primary_result_foc.append(float(foc[i])) + if float(pval[i])<=filter1_cutoff_pval and abs(float(deltaPSI[i]))>=filter1_cutoff_dpsi and float(foc[i])>=filter1_cutoff_foc: + association_passed+=1 + continue + elif group in tumor_dict: + if float(pval[i])<=filter2_cutoff_pval and abs(float(deltaPSI[i]))>=filter2_cutoff_dpsi: + recurrence_passed+=1 + continue + elif group in normal_dict:# TODO: judge set/has, then run + specificity_testable+=1 + if float(pval[i])<=filter3_cutoff_pval and float(foc[i])>=filter3_cutoff_foc: + deltapsi_list_voted.append(float(deltaPSI[i])) + foc_list_voted.append(float(foc[i])) + if float(deltaPSI[i])>=filter3_cutoff_dpsi: + specificity_positive+=1 + continue + if float(deltaPSI[i])<=-filter3_cutoff_dpsi: + specificity_negative+=1 + continue + return association_passed,recurrence_passed,specificity_positive,specificity_negative,specificity_testable, np.median(primary_result), np.median(primary_result_foc), deltapsi_list_voted,foc_list_voted + +def defineTumorEvents(filter1_group_cutoff,filter2_group_cutoff,filter3_group_cutoff, matching_norm_dict, specificity_panel_len, association_passed, recurrence_passed, specificity_positive, specificity_negative, specificity_testable, primary_result, primary_result_foc, deltapsi_list_voted, foc_list_voted, use_ratio): + tag=[] + if filter1_group_cutoff!='': + if association_passed>=filter1_group_cutoff:#Improvement? current: 0>='' is false + tag.append('associated') + if filter2_group_cutoff!='': + if recurrence_passed>=filter2_group_cutoff: + tag.append('recurrent') + if matching_norm_dict!={}:#TODO-FUTURE: take care of set-yes has-no redirected events + if primary_result>0: + tissue_specificity=specificity_positive + ratio=False if use_ratio==False else specificity_positive/(specificity_testable+10**-8)>=filter3_group_cutoff/(specificity_panel_len+0.0) + if specificity_positive>=filter3_group_cutoff or ratio: + tag.append('high_assoc') + else: + tissue_specificity=specificity_negative + ratio=False if use_ratio==False else specificity_negative/(specificity_testable+10**-8)>=filter3_group_cutoff/(specificity_panel_len+0.0) + if specificity_negative>=filter3_group_cutoff or ratio: + tag.append('high_assoc') + else: + tissue_specificity=max(specificity_positive,specificity_negative) + ratio=False if use_ratio==False else tissue_specificity/(specificity_testable+10**-8)>=filter3_group_cutoff/(specificity_panel_len+0.0) + if tissue_specificity>=filter3_group_cutoff or ratio: + tag.append('high_assoc') + primary_deltapsi=primary_result if primary_result!=0 else np.median(deltapsi_list_voted)#TODO + primary_foc=primary_result_foc if primary_result!=0 else np.median(foc_list_voted) + return primary_deltapsi, primary_foc, tissue_specificity, tag + +def mappability_write(k, bw_map, calc_length): + mappability_list=getMappability(k, bw_map, calc_length) + mappability_tag='PASS' + min_map_score=min(map(float, mappability_list)) + if min_map_score<0.8: + mappability_tag='MID' + if min_map_score<0.6:#from 0.7 + mappability_tag='LOW' + return mappability_tag, mappability_list + +def loadBlacklistEvents(fin): + BlacklistEvents={} + for l in open(fin): + ls=l.strip().split('\t') + des=ls[0].split(':') + ensg=des[0].split('_')[0].split('.')[0] + des=':'.join([ensg]+des[1:]).strip(':') + BlacklistEvents[des]='' + return BlacklistEvents + +def translationCMD(ref_genome, gtf, outdir, out_prefix, splicing_event_type, all_orf, ignore_annotation, 
remove_early_stop, fout_name, find_novel_ss):
+	#assemble optional flags for the translate command from the parsed arguments
+	argument_line=''
+	if all_orf:
+		argument_line+=' --all-orf'
+	if ignore_annotation:
+		argument_line+=' --ignore-annotation'
+	if remove_early_stop:
+		argument_line+=' --remove-early-stop'
+	if find_novel_ss:
+		argument_line+=' --check-novel'
+	cmd_translation='IRIS translate '+outdir+'/'+out_prefix+'.'+splicing_event_type+'.'+fout_name+'.txt '+' -o '+outdir+'/'+splicing_event_type+'.'+fout_name+' -g '+ref_genome+' -t '+splicing_event_type+argument_line+' --gtf '+gtf
+	print '[INFO] Working on translating: '+fout_name
+	os.system(cmd_translation)
+
+def main(args):
+	###Loading Parameters####
+	para_fin=args.parameter_fin
+	splicing_event_type=args.splicing_event_type
+	event_list_fin=args.event_list_fin
+	fetching_sj_col=1
+	out_prefix,db_dir,filter1_para,filter2_para,filter3_para,test_mode,use_ratio,blacklist_path,mappability_path,ref_genome=[l.strip() for l in open(para_fin)]
+	panel_list=[out_prefix]
+	test_mode=test_mode.split(' ')
+	use_ratio=True if use_ratio=='True' else False
+	blacklist_events={}
+	if blacklist_path!='':
+		blacklist_events=loadBlacklistEvents(blacklist_path)
+	bw_map,calc_length=loadMappability(mappability_path)
+	deltaPSI_cutoff=args.deltaPSI_cut_off
+	find_novel_ss=True if args.report_known_and_novelss_tumor_junction==False else False
+	if find_novel_ss:
+		gtf=args.gtf
+		exon_start_dict, exon_end_dict= loadGTF(gtf)
+
+	all_orf=args.all_orf
+	ignore_annotation=args.ignore_annotation
+	remove_early_stop=args.remove_early_stop
+	use_existing_test_result=args.use_existing_test_result
+
+	filter1_cutoff_pval, filter1_cutoff_dpsi, filter1_cutoff_foc, filter1_group_cutoff, filter1_panel_list, panel_list = loadParametersRow(filter1_para, panel_list)
+	filter2_cutoff_pval, filter2_cutoff_dpsi, filter2_cutoff_foc, filter2_group_cutoff, filter2_panel_list, panel_list = loadParametersRow(filter2_para, panel_list)
+	filter3_cutoff_pval, filter3_cutoff_dpsi, filter3_cutoff_foc, filter3_group_cutoff, filter3_panel_list, panel_list = loadParametersRow(filter3_para, panel_list)
+	if filter1_panel_list==[] and filter2_panel_list==[] and filter3_panel_list==[]:
+	# if filter1_panel_list==[] and filter2_panel_list==[] and filter3_panel_list==[] and test_mode[0]!='summary':
+		exit("[Error] No filtering required in parameters file. exit!")
+	summary_file=False
+
+	non_parametric=False
+	if len(test_mode)>1:
+		non_parametric=True if test_mode[1]=='nonparametric' else False
+
+	##this block is 2020 code; different from and improved over screening.py
+	matching_norm_dict=dict.fromkeys(filter1_panel_list,'')
+	tumor_dict=dict.fromkeys(filter2_panel_list,'')
+	normal_dict=dict.fromkeys(filter3_panel_list,'')
+	tumor_dict[out_prefix]=''
+	association_panel_len=len(matching_norm_dict)
+	recurrence_panel_len=len(tumor_dict)-1
+	specificity_panel_len=len(normal_dict)
+
+	panel_count=sum(1 for i in [association_panel_len, recurrence_panel_len, specificity_panel_len] if i!=0)
+
+	if args.translating:
+		gtf=args.gtf
+		if os.path.exists(gtf)==False:
+			exit('[Error] No gtf file provided for translation.
exit!') + ###Create Folders/Output#### + outdir=args.outdir.rstrip('/') + os.system('mkdir -p '+outdir) + db_dir=db_dir.rstrip('/') + + if use_existing_test_result==False: + fout_direct, fout_direct_name= openTestingFout(outdir, out_prefix, splicing_event_type, summary_file, panel_list, 'guided') + fout_redirect, fout_redirect_name=openTestingFout(outdir, out_prefix, splicing_event_type, summary_file, panel_list, 'voted') + else: + fout_direct_name=outdir+'/'+out_prefix+'.'+splicing_event_type+'.test_JCRA.all_guided.txt' + fout_redirect_name=outdir+'/'+out_prefix+'.'+splicing_event_type+'.test_JCRA.all_voted.txt' + + fout_primary=openScreeningFout(outdir, out_prefix, splicing_event_type, 'tier1_JCRA') + fout_prioritized=openScreeningFout(outdir,out_prefix, splicing_event_type, 'tier2tier3_JCRA') + + ##Load IRIS reference panels to 'fin_list', 'index' + index={} + fin_list={} + for group_name in panel_list: + fin_list[group_name]=db_dir+'/'+group_name.split('_')[0]+'/SJ_count.'+group_name+'.txt' + for group in fin_list.keys(): + if not os.path.isfile(fin_list[group]+'.idx'): + exit('[Error] Need to index '+fin_list[group]) + index[group]=read_SJMatrix_index(fin_list[group],'/'.join(fin_list[group].split('/')[:-1])) + + tot=config.file_len(event_list_fin)-1 + print '[INFO] IRIS screen_novelss - started. Total input events:', tot+1 + ## Load and perform test by row/event + sample_size={} + for group in panel_list: + random_key=index[group].keys()[0] + sample_names=map(str,fetch_SJMatrix(random_key,fin_list[group],'\t',index[group],True)[fetching_sj_col:]) + sample_size[group]=len(sample_names) + + if use_existing_test_result==False: + header_list=[] + junction_dict={} + for event_idx, event_row in enumerate(open(event_list_fin)): + if event_idx==0: + header_list=readEventRow(event_row,'') + continue + line_dict=readEventRow(event_row, header_list) + event_row_list, as_event=convert2SJevent(line_dict, splicing_event_type) + config.update_progress(event_idx/(0.0+tot)) + + sj={} + for eid, k in enumerate(event_row_list): + sj[eid]={} + # if k not in junction_dict: + # junction_dict[k]='' + # else: + # continue + + #Initiate psi matrix by each row to 'sj' + for group in panel_list: + if k in index[group]: + sj[eid][group]=map(int,fetch_SJMatrix(k,fin_list[group],'\t',index[group], False)[fetching_sj_col:]) + else: + sj[eid][group]=[0]*sample_size[group] + psi={} + for group in panel_list: + psi[group]=[] + for sid in xrange(0,sample_size[group]): + inc1=sj[0][group][sid] + inc2=sj[1][group][sid] + skp=sj[2][group][sid] + if inc1+ inc2+ skp>=10: + psi[group].append(((inc1+inc2)/2.0)/(((inc1+inc2)/2.0)+skp)) + + test={} + redirect=False + psi_primary='' + direction='' + query_mean=map(lambda x:round(x,2),[np.nanmean(psi[out_prefix]),np.nanpercentile(psi[out_prefix],25),np.nanpercentile(psi[out_prefix],75)]) + for group in panel_list[1:]: + if group not in matching_norm_dict and direction=='':#redirect or find one-sided test direction + psi_primary, direction= getDirection(matching_norm_dict, psi, test, non_parametric) + redirect = True if psi_primary == [] else False #redirect if tumor-matched normal is missing + test[group]=performTest(group, matching_norm_dict, tumor_dict, normal_dict, psi, out_prefix, non_parametric, psi_primary, direction) + result=[as_event]+query_mean+['\t'.join(map(str,test[t])) for t in panel_list if t!=out_prefix] + + if redirect: + fout_redirect.write('\t'.join(map(str,result))+'\n') #to redicted; summarize direction and calculate FDR differently. 
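+				# 'redirect' is True when the tissue-matched normal panel had
+				# no data for this event: the row goes to the 'voted' file and
+				# is later summarized by the median deltaPSI/FOC voted across
+				# the normal panels (see defineTumorEvents) instead of by a
+				# single matched comparison.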
+ else: + fout_direct.write('\t'.join(map(str,result))+'\n') + + fout_redirect.close() + fout_direct.close() + + testing_intermediate_file = fout_direct_name if matching_norm_dict!={} else fout_redirect_name + + tot=config.file_len(testing_intermediate_file)-1 + print '[INFO] IRIS screen_novelss - summarizing. Total events from last step:', tot + for event_idx,l in enumerate(open(testing_intermediate_file)): + config.update_progress(event_idx/(0.0+tot)) + if event_idx==0: + continue + ls=l.strip().split('\t') + pval= ls[4::3]# don't do map because '-' + deltaPSI = ls[5::3] + foc= ls[6::3] + + association_passed,recurrence_passed,specificity_positive,specificity_negative,specificity_testable,primary_result, primary_result_foc, deltapsi_list_voted,foc_list_voted=summarizeTestResult(filter1_cutoff_pval, filter1_cutoff_dpsi, filter1_cutoff_foc,filter2_cutoff_pval, filter2_cutoff_dpsi, filter2_cutoff_foc, filter3_cutoff_pval, filter3_cutoff_dpsi, filter3_cutoff_foc, pval, deltaPSI, foc, panel_list, matching_norm_dict, tumor_dict, normal_dict) + + primary_deltapsi, primary_foc, tissue_specificity, tag = defineTumorEvents(filter1_group_cutoff,filter2_group_cutoff,filter3_group_cutoff, matching_norm_dict, specificity_panel_len, association_passed, recurrence_passed, specificity_positive, specificity_negative, specificity_testable, primary_result, primary_result_foc, deltapsi_list_voted, foc_list_voted, use_ratio) + novel_ss_status='disabled' + if find_novel_ss: + novel_ss_status=findNovelSS(ls[0], primary_deltapsi, deltaPSI_cutoff, splicing_event_type, exon_start_dict, exon_end_dict) + if novel_ss_status=='none': + continue + if tag!=[]: + if tag[0]=='associated': + mappability_tag, mappability_list=mappability_write(ls[0], bw_map, calc_length) + fout_primary.write('{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n'.format(ls[0],ls[1],ls[2],ls[3],primary_deltapsi, primary_foc, str(association_passed)+'/'+str(association_panel_len),str(recurrence_passed)+'/'+str(recurrence_panel_len),str(tissue_specificity)+'/'+str(specificity_panel_len),';'.join(tag), ';'.join(mappability_list),mappability_tag, novel_ss_status)) + if panel_count==len(tag):#Modified 2021 + mappability_tag, mappability_list=mappability_write(ls[0], bw_map, calc_length) #Modified 2021 + fout_prioritized.write('{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n'.format(ls[0],ls[1],ls[2],ls[3],primary_deltapsi, primary_foc, str(association_passed)+'/'+str(association_panel_len),str(recurrence_passed)+'/'+str(recurrence_panel_len),str(tissue_specificity)+'/'+str(specificity_panel_len),';'.join(tag),';'.join(mappability_list),mappability_tag, novel_ss_status)) + mappability_tag, mappability_list=['',''] #clear + + fout_primary.close() + fout_prioritized.close() + + ##Translation + if args.translating: + translationCMD(ref_genome, gtf, outdir, out_prefix, splicing_event_type, all_orf, ignore_annotation, remove_early_stop, 'tier1_JCRA', find_novel_ss) + translationCMD(ref_genome, gtf, outdir, out_prefix, splicing_event_type, all_orf, ignore_annotation, remove_early_stop, 'tier2tier3_JCRA', find_novel_ss) + + +if __name__ == '__main__': + main() diff --git a/IRIS/IRIS_screening_plot.py b/IRIS/IRIS_screening_plot.py index 663f1e2..f18ce92 100644 --- a/IRIS/IRIS_screening_plot.py +++ b/IRIS/IRIS_screening_plot.py @@ -71,50 +71,54 @@ def fileLength(fin): i=sum(1 for l in open(fin)) return i -def indiviualPlot(psi_df, event_id, ref_list): +def indiviualPlot(psi_df, event_id, panel_list, outdir): plt.figure(figsize=(15,9)) 
sns_plot=sns.violinplot(data=psi_df, inner="box",cut=0) #sns_plot.set_yticks(np.arange(0,1,0.2)) sns_plot.set(ylim=(0, 1)) sns.despine(offset=10, trim=True) sns_plot.set_ylabel('Percent-Spliced-In',fontweight='bold',fontsize=14) - sns_plot.set_xticklabels(ref_list,rotation=20,ha='right',fontsize=12) + sns_plot.set_xticklabels(panel_list,rotation=20,ha='right',fontsize=12) sns_plot.set_title(event_id,fontsize=15,fontweight='bold') - sns_plot.figure.savefig(event_id+".pdf") + sns_plot.figure.savefig(outdir+'/'+event_id+".pdf") -def loadParametersRow(filter_para, ref_list): - if len(filter_para.split(' '))==6: - filter_cutoff_pval, filter_cutoff_dpsi, filter_cutoff_foc, filter_group_cutoff, filter_list =filter_para.split(' ')[1:] +def loadParametersRow(filter_para, panel_list): + if filter_para.strip()!='': + para, filter_panel_list=filter_para.split(' ') + filter_cutoff_pval, filter_cutoff_dpsi, filter_cutoff_foc, filter_cutoff_pval_PT, filter_group_cutoff =para.split(',') filter_cutoff_pval=float(filter_cutoff_pval) filter_cutoff_dpsi=float(filter_cutoff_dpsi) filter_cutoff_foc=float(filter_cutoff_foc) filter_group_cutoff=int(filter_group_cutoff) - filter_list=filter_list.split(',') - ref_list+=filter_list + filter_panel_list=filter_panel_list.split(',') + panel_list+=filter_panel_list else: - filter_cutoff_pval, filter_cutoff_dpsi, filter_cutoff_foc, filter_group_cutoff, filter_list =['','','','',[]] - return filter_cutoff_pval, filter_cutoff_dpsi, filter_cutoff_foc, filter_group_cutoff, filter_list, ref_list + filter_cutoff_pval, filter_cutoff_dpsi, filter_cutoff_foc, filter_group_cutoff, filter_panel_list =['','','','',[]] + return filter_cutoff_pval, filter_cutoff_dpsi, filter_cutoff_foc, filter_group_cutoff, filter_panel_list, panel_list ## screening def main(args): index={} fin_list={} - para_fin=args.parameter_fin - fin_plot_query=args.event_list #to the txt file - step=int(args.step) - has_header=args.header - + para_fin=args.parameter_file + splicing_event_type=args.splicing_event_type + fetching_data_col=8 if splicing_event_type == 'SE' else 10 + fin_plot_query=args.event_list#for plot + step=int(args.step)#for plot + has_header=args.header#for plot out_prefix,db_dir,filter1_para,filter2_para,filter3_para,test_mode,use_ratio,blacklist_path,mappability_path,ref_genome=[l.strip() for l in open(para_fin)] - ref_list=[out_prefix] - filter1_cutoff_pval, filter1_cutoff_dpsi, filter1_cutoff_foc, filter1_group_cutoff, filter1, ref_list =loadParametersRow(filter1_para, ref_list) - filter2_cutoff_pval, filter2_cutoff_dpsi, filter2_cutoff_foc, filter2_group_cutoff, filter2, ref_list =loadParametersRow(filter2_para, ref_list) - filter3_cutoff_pval, filter3_cutoff_dpsi, filter3_cutoff_foc, filter3_group_cutoff, filter3, ref_list =loadParametersRow(filter3_para, ref_list) - if filter1==[] and filter2==[] and filter3==[] and test_mode!='summary': + panel_list=[out_prefix] + test_mode=test_mode.split(' ') + filter1_cutoff_pval, filter1_cutoff_dpsi, filter1_cutoff_foc, filter1_group_cutoff, filter1, panel_list =loadParametersRow(filter1_para, panel_list) + filter2_cutoff_pval, filter2_cutoff_dpsi, filter2_cutoff_foc, filter2_group_cutoff, filter2, panel_list =loadParametersRow(filter2_para, panel_list) + filter3_cutoff_pval, filter3_cutoff_dpsi, filter3_cutoff_foc, filter3_group_cutoff, filter3, panel_list =loadParametersRow(filter3_para, panel_list) + if filter1==[] and filter2==[] and filter3==[] and test_mode[0]!='summary': exit("no filtering required in para file. 
exit!") - group_test=False if test_mode!='group' else True - individual_test=False if test_mode!='personalized' else True - summary_file=False if test_mode!='summary' else True + + group_test=False if test_mode[0]!='group' else True + individual_test=False if test_mode[0]!='personalized' else True + summary_file=False if test_mode[0]!='summary' else True if [group_test,individual_test,summary_file]==[False,False,False]: exit('Need to choose one mode.exit!') single_plot=False @@ -122,11 +126,15 @@ def main(args): if group_test==individual_test: exit('can only choose one mode') - db_dir=db_dir - ref_list=list([out_prefix]+filter1+filter2+filter3) + db_dir=db_dir.rstrip('/') + outdir=args.outdir.rstrip('/') + os.system('mkdir -p '+outdir) + + panel_list=list([out_prefix]+filter1+filter2+filter3) - for group_name in ref_list:##Load IRIS reference panels - fin_list[group_name]=db_dir+'/'+group_name+'/splicing_matrix/splicing_matrix.SE.cov10.'+group_name+'.txt' + ##Load IRIS reference panels + for group_name in panel_list: + fin_list[group_name]=db_dir+'/'+group_name+'/splicing_matrix/splicing_matrix.'+splicing_event_type+'.cov10.'+group_name+'.txt' for group in fin_list.keys(): if not os.path.isfile(fin_list[group]+'.idx'): @@ -136,7 +144,7 @@ def main(args): filered=0 has={} - for group in ref_list: + for group in panel_list: if group!=out_prefix: has[group]=True @@ -149,7 +157,7 @@ def main(args): start_point=1 for r_start in xrange(start_point,fin_plot_query_len,step): if group_plot: - pdf = PdfPages(fin_plot_query+'.'+str(r_start)+'.pdf') + pdf = PdfPages(outdir+'/'+fin_plot_query.split('/')[-1]+'.'+str(r_start)+'.psiPlot.pdf') fig = plt.figure(figsize=(8.8,12)) fig.suptitle('', fontsize=14,fontweight='bold') sns.set(style="white", color_codes=True) @@ -161,11 +169,11 @@ def main(args): continue if j > r_start+step-1: break - psi[out_prefix]=map(float,fetch_PsiMatrix(event_id,fin_list[out_prefix],'.','\t',index[out_prefix])[1][8:]) + psi[out_prefix]=map(float,fetch_PsiMatrix(event_id,fin_list[out_prefix],'.','\t',index[out_prefix])[1][fetching_data_col:]) - for group in ref_list: + for group in panel_list: try: - psi[group]=map(float,fetch_PsiMatrix(event_id,fin_list[group],'.','\t',index[group])[1][8:]) + psi[group]=map(float,fetch_PsiMatrix(event_id,fin_list[group],'.','\t',index[group])[1][fetching_data_col:]) except: has[group]=False psi[group]=[] @@ -176,9 +184,9 @@ def main(args): filered+=1 continue - psi_df = pd.DataFrame.from_dict(psi, orient='index').transpose()[ref_list] + psi_df = pd.DataFrame.from_dict(psi, orient='index').transpose()[panel_list] if single_plot: - indiviualPlot(psi_df, event_id, ref_list) + indiviualPlot(psi_df, event_id, panel_list) if group_plot: print step, j ax_i = plt.subplot2grid((step,11), (j-r_start*1,0), colspan=10, rowspan=1) @@ -187,11 +195,15 @@ def main(args): sns_plot.set_yticks(np.arange(0,1.1,0.5)) sns_plot.set(xticklabels=[]) sns.despine(offset=0, trim=False) - sns_plot.text(15.7, 0.4, event_id.split(':')[1], horizontalalignment='left', size='large', color='black',fontweight='bold') + ax_j = plt.subplot2grid((step,11), (j-r_start*1,10), colspan=1, rowspan=1) + ax_j.text(0.5, 0.5, event_id.split(':')[1], horizontalalignment='center', size='large', color='black',fontweight='bold') + #ax_j.get_xaxis().set_visible(False) + #ax_j.get_yaxis().set_visible(False) + ax_j.axis('off') if group_plot: pdf.savefig(fig) pdf.close() - plt.savefig('{}.png'.format(fin_plot_query)) + #plt.savefig('{}.png'.format(fin_plot_query)) if __name__ == '__main__': diff 
--git a/IRIS/IRIS_screening_sjc.py b/IRIS/IRIS_screening_sjc.py new file mode 100644 index 0000000..5bfc771 --- /dev/null +++ b/IRIS/IRIS_screening_sjc.py @@ -0,0 +1,276 @@ +import numpy as np +import sys +import os, glob, pyBigWig, argparse +from scipy import stats +import statsmodels.stats.weightstats as smw +from . import config +import warnings +warnings.filterwarnings("ignore") + +def read_SJMatrix_index(fn,outdir): + index = {} + for line in open(outdir+'/'+fn.split('/')[-1]+'.idx', 'r'): + ele = line.strip().split() + index[ele[0]] = int(ele[1]) + return index + +def fetch_SJMatrix(eid, fn, delim, index, head_only): + with open(fn, 'r') as f: + if head_only: + ele = f.readline().strip().split(delim) + retrieved_text = np.asarray([ x.split('.aln')[0] for x in ele ]) + else: + f.seek(index[eid], 0) + retrieved_text = np.asarray(f.readline().strip().split(delim)) + return retrieved_text + +def loadParametersRow(filter_para, panel_list): + filter_cutoffs='' + if filter_para.strip()!='': + filter_cutoffs = map(float,filter_para.strip().split(' ')[0].split(',')) + filter_panel_list = filter_para.strip().split(' ')[1].split(',') + panel_list+=filter_panel_list + else: + filter_panel_list =[] + return filter_cutoffs, filter_panel_list, panel_list + +def readEventRow(row, header_line): + if header_line=='' or header_line==False: + rs=row.strip().split('\t') + return rs + else: + rs=row.strip().split('\t') + return dict(zip(header_line, rs)) + +def convert2SJevent(line_dict, splicing_event_type): + if splicing_event_type=='SE': + event_row_list=[line_dict['chr']+':'+str(int(line_dict['upstreamEE'])+1)+':'+line_dict['exonStart'],line_dict['chr']+':'+str(int(line_dict['exonEnd'])+1)+':'+line_dict['downstreamES'], line_dict['chr']+':'+str(int(line_dict['upstreamEE'])+1)+':'+line_dict['downstreamES']] + elif splicing_event_type=='A5SS':# Only use one junction for inc. Need to improve by updating db later + event_row_list=[line_dict['chr']+':'+str(int(line_dict['longExonEnd'])+1)+':'+line_dict['flankingES'],line_dict['chr']+':'+str(int(line_dict['shortEE'])+1)+':'+line_dict['flankingES']] + elif splicing_event_type=='A3SS': # Only use one junction for inc. Need to improve by updating db later + event_row_list=[line_dict['chr']+':'+str(int(line_dict['flankingEE'])+1)+':'+line_dict['longExonStart'],line_dict['chr']+':'+str(int(line_dict['flankingEE'])+1)+':'+line_dict['shortES']] + else: + exit('splicine event type not supported. Exiting.') + return event_row_list + +def convert2SJASevent(line_dict, splicing_event_type): + if splicing_event_type=='SE': + event_row_list=[line_dict['chr']+':'+str(int(line_dict['upstreamEE'])+1)+':'+line_dict['exonStart'],line_dict['chr']+':'+str(int(line_dict['exonEnd'])+1)+':'+line_dict['downstreamES'], line_dict['chr']+':'+str(int(line_dict['upstreamEE'])+1)+':'+line_dict['downstreamES']] + as_event=line_dict['AC'].strip('"').split('.')[0]+':'+line_dict['GeneName'].strip('"')+':'+line_dict['chr']+':'+line_dict['strand']+':'+line_dict['exonStart']+':'+line_dict['exonEnd']+':'+line_dict['upstreamEE']+':'+line_dict['downstreamES'] + elif splicing_event_type=='A5SS':# Only use one junction for inc. 
Need to improve by updating db later + event_row_list=[line_dict['chr']+':'+str(int(line_dict['longExonEnd'])+1)+':'+line_dict['flankingES'],line_dict['chr']+':'+str(int(line_dict['shortEE'])+1)+':'+line_dict['flankingES']] + as_event=line_dict['AC'].strip('"').split('.')[0]+':'+line_dict['GeneName'].strip('"')+':'+line_dict['chr']+':'+line_dict['strand']+':'+line_dict['longExonStart']+':'+line_dict['longExonEnd']+':'+line_dict['shortES']+':'+line_dict['shortEE']+':'+line_dict['flankingES']+':'+line_dict['flankingEE'] + elif splicing_event_type=='A3SS': # Only use one junction for inc. Need to improve by updating db later + event_row_list=[line_dict['chr']+':'+str(int(line_dict['flankingEE'])+1)+':'+line_dict['longExonStart'],line_dict['chr']+':'+str(int(line_dict['flankingEE'])+1)+':'+line_dict['shortES']] + as_event=line_dict['AC'].strip('"').split('.')[0]+':'+line_dict['GeneName'].strip('"')+':'+line_dict['chr']+':'+line_dict['strand']+':'+line_dict['longExonStart']+':'+line_dict['longExonEnd']+':'+line_dict['shortES']+':'+line_dict['shortEE']+':'+line_dict['flankingES']+':'+line_dict['flankingEE'] + else: + exit('splicine event type not supported. Exiting.') + return event_row_list, as_event + +def loadSigJunction(fin): + sig_junction={} + for i,l in enumerate(open(fin)): + if i==0: + continue + sig_junction[l.strip().split('\t')[0]]='' + return sig_junction + +def summarizeSJ2ASevent(event_list_fin, splicing_event_type, sig_junction, outdir, out_prefix): + fout_summary_fname=outdir+'/SJ.'+out_prefix+'.'+splicing_event_type+'.summary_by_sig_event.txt' + fout_summary=open(fout_summary_fname,'w') + for event_idx, event_row in enumerate(open(event_list_fin)): + if event_idx==0: + header_list=readEventRow(event_row,'') + continue + line_dict=readEventRow(event_row, header_list) + event_row_list, as_event=convert2SJASevent(line_dict, splicing_event_type) + as_event_result=[] + as_event_result_list=[] + for k in event_row_list: + if k not in sig_junction: + as_event_result.append(False) + else: + as_event_result.append(True) + as_event_result_list.append(k) + if as_event_result[0]==as_event_result[1]==as_event_result[2]==True: + fout_summary.write(as_event+'\tAll junctions\t'+';'.join(as_event_result_list)+'\n') + elif as_event_result[0]==as_event_result[1]==as_event_result[2]==False: + continue + else: + if as_event_result[0]==as_event_result[1]!=as_event_result[2]: + fout_summary.write(as_event+'\tOnly alternative junctions\t'+';'.join(as_event_result_list)+'\n') + else: + fout_summary.write(as_event+'\tOther combination\t'+';'.join(as_event_result_list)+'\n') + fout_summary.close() + return fout_summary_fname + +def main(args): + ###Loading Parameters#### + para_fin=args.parameter_file + splicing_event_type=args.splicing_event_type + event_list_fin=args.event_list_file + use_existing_test_result=args.use_existing_test_result + + outdir=args.outdir.rstrip('/') + os.system('mkdir -p '+outdir) + fetching_sj_col=1 + out_prefix,db_dir,filter1_para,filter2_para,filter3_para=[l.strip() for l in open(para_fin)][:5] + db_dir=db_dir.rstrip('/') + if os.path.isdir(db_dir+'_sjc'): #automatically use db_sjc if in the same dir. 
Otherwise, use the user input db_dir
+		db_dir=db_dir+'_sjc'
+	panel_list=[out_prefix]
+
+	filter1_cutoffs, filter1_panel_list, panel_list = loadParametersRow(filter1_para, panel_list)
+	filter2_cutoffs, filter2_panel_list, panel_list = loadParametersRow(filter2_para, panel_list)
+	filter3_cutoffs, filter3_panel_list, panel_list = loadParametersRow(filter3_para, panel_list)
+	tumor_dict=dict.fromkeys(filter2_panel_list,'')
+	tumor_dict[out_prefix]=''
+	pvalue_cutoff_normal=''; pvalue_cutoff_tumor=''
+	filter1_group_cutoff=''; filter2_group_cutoff=''; filter3_group_cutoff=''
+	if filter1_cutoffs!='':
+		pvalue_cutoff_normal,filter1_group_cutoff=filter1_cutoffs[3:]
+	if filter2_cutoffs!='':
+		pvalue_cutoff_tumor,filter2_group_cutoff=filter2_cutoffs[3:]
+	if filter3_cutoffs!='':
+		pvalue_cutoff_normal,filter3_group_cutoff=filter3_cutoffs[3:]
+
+	tumor_read_cov_cutoff=int(args.tumor_read_cov_cutoff)#default 5
+	normal_read_cov_cutoff=int(args.normal_read_cov_cutoff)#default 2
+	# if filter1_panel_list==[] and filter2_panel_list==[] and filter3_panel_list==[] and test_mode[0]!='summary':
+	# 	exit("[Error] No filtering required in parameters file. exit!")
+
+	##Load IRIS reference panels to 'fin_list', 'index'
+	index={}
+	fin_list={}
+	for group_name in panel_list:
+		fin_list[group_name]=db_dir+'/'+group_name+'/sjc_matrix/SJ_count.'+group_name+'.txt'
+	for group in fin_list.keys():
+		if not os.path.isfile(fin_list[group]+'.idx'):
+			exit('[Error] Need to index '+fin_list[group])
+		index[group]=read_SJMatrix_index(fin_list[group],'/'.join(fin_list[group].split('/')[:-1]))
+
+
+	tot=config.file_len(event_list_fin)-1
+	if tot==0:
+		exit('[Ended] No test performed because there are no testable events. Check input or filtering parameters.') #Modified 2021
+	print '[INFO] IRIS screening - started.
Total input events:', tot+1 + if use_existing_test_result==False: + fout_sj_count=open(outdir+'/SJ.'+out_prefix+'.'+splicing_event_type+'.test_all.txt','w') + header_line=[] + sample_size={} + for group in panel_list: + random_key=index[group].keys()[0] + sample_names=map(str,fetch_SJMatrix(random_key,fin_list[group],'\t',index[group],True)[fetching_sj_col:]) + sample_size[group]=len(sample_names) + if group==out_prefix: + header_line+=[out_prefix+'_carrier_number', out_prefix+'_fraction'] + continue + header_line+=[group+'_carrier_number', group+'_fraction', group+'_pvalue'] + fout_sj_count.write('Junction\t'+'\t'.join(header_line)+'\n') + + header_line=[] + fout_sj_sig_fname=outdir+'/SJ.'+out_prefix+'.'+splicing_event_type+'.test_sig.txt' + fout_sj_sig=open(fout_sj_sig_fname,'w') + for group in panel_list: + if group==out_prefix: + header_line+=[out_prefix+'_carrier_number', out_prefix+'_fraction'] + continue + header_line+=[group+'_carrier_number', group+'_fraction', group+'_pvalue'] + fout_sj_sig.write('Junction\t'+'\t'.join(header_line)+'\n') + + if use_existing_test_result==False: + header_list=[] + junction_dict={} + for event_idx, event_row in enumerate(open(event_list_fin)): + if event_idx==0: + header_list=readEventRow(event_row,'') + continue + line_dict=readEventRow(event_row, header_list) + event_row_list=convert2SJevent(line_dict, splicing_event_type) + for k in event_row_list: + if k not in junction_dict: + junction_dict[k]='' + else: + continue + config.update_progress(event_idx/(0.0+tot)) + + #Initiate psi matrix by each row to 'sj' + sj={} + for group in panel_list: + if k in index[group]: + sj[group]=map(int,fetch_SJMatrix(k,fin_list[group],'\t',index[group], False)[fetching_sj_col:]) + else: + sj[group]=[0]*sample_size[group] + write_sj_list=[] + significant_normal_match=0 + significant_normal=0 + significant_tumor=0 + prevalence={} + prevalence_test='' + for group in panel_list: + if group in tumor_dict: + prevalence[group]=sum(v>=tumor_read_cov_cutoff for v in sj[group]) + else: + prevalence[group]=sum(v>=normal_read_cov_cutoff for v in sj[group]) + + if group==out_prefix: + write_sj_list=[prevalence[group],prevalence[group]/(0.0+sample_size[group])] + continue + else: + if group in filter2_panel_list: # filter2 always require filer1!!! 
if no filter1, all references should be defined in filter3 + prevalence_test=stats.fisher_exact([[prevalence[group],sample_size[group]-prevalence[group]],[prevalence[filter1_panel_list[0]], sample_size[filter1_panel_list[0]]-prevalence[filter1_panel_list[0]]]], alternative='greater') + else: + prevalence_test=stats.fisher_exact([[prevalence[out_prefix],sample_size[out_prefix]-prevalence[out_prefix]],[prevalence[group], sample_size[group]-prevalence[group]]], alternative='greater') + write_sj_list+=[prevalence[group], prevalence[group]/(0.0+sample_size[group]), prevalence_test[1]] + #determine difference of a junction + if group in filter1_panel_list: + if prevalence_test[1]<=pvalue_cutoff_normal: + significant_normal_match+=1 + elif group in filter2_panel_list: + if prevalence_test[1]<=pvalue_cutoff_tumor: + significant_tumor+=1 + else: + if prevalence_test[1]<=pvalue_cutoff_normal: + significant_normal+=1 + if (significant_normal_match>=filter1_group_cutoff or filter1_group_cutoff=='') and (significant_tumor>=filter2_group_cutoff or filter2_group_cutoff=='') and (significant_normal>=filter3_group_cutoff or filter3_group_cutoff==''): + fout_sj_sig.write(k+'\t'+'\t'.join(map(str,write_sj_list))+'\n') + fout_sj_count.write(k+'\t'+'\t'.join(map(str,write_sj_list))+'\n') + fout_sj_count.close() + fout_sj_sig.close() + + else: + print 'Use existing testing result.' + fout_sj_count_name=outdir+'/SJ.'+out_prefix+'.'+splicing_event_type+'.test_all.txt' + for i, l in enumerate(open(fout_sj_count_name)): + if i==0: + header=l.strip().split('\t') + group_list=map(lambda x: x.split('_carrier_number')[0], header[3::3]) + else: + ls=l.strip().split('\t') + prevalence_value= map(int,ls[3::3])# don't do map because '-' + percent_value = map(float,ls[4::3]) + p_value= map(float,ls[5::3]) + significant_normal_match=0 + significant_normal=0 + significant_tumor=0 + #determine difference of a junction + for j,group in enumerate(group_list): + if group in filter1_panel_list: + if p_value[j]<=pvalue_cutoff_normal: + significant_normal_match+=1 + elif group in filter2_panel_list: + if p_value[j]<=pvalue_cutoff_tumor: + significant_tumor+=1 + else: + if p_value[j]<=pvalue_cutoff_normal: + significant_normal+=1 + if (significant_normal_match>=filter1_group_cutoff or filter1_group_cutoff=='') and (significant_tumor>=filter2_group_cutoff or filter2_group_cutoff=='') and (significant_normal>=filter3_group_cutoff or filter3_group_cutoff==''): + fout_sj_sig.write(l.strip()+'\n') + fout_sj_sig.close() + + sig_junction=loadSigJunction(fout_sj_sig_fname) + fout_summary_fname=summarizeSJ2ASevent(event_list_fin, splicing_event_type, sig_junction, outdir, out_prefix) + + +if __name__ == '__main__': + main() diff --git a/IRIS/IRIS_screening_sjcplot.py b/IRIS/IRIS_screening_sjcplot.py new file mode 100644 index 0000000..0797b8c --- /dev/null +++ b/IRIS/IRIS_screening_sjcplot.py @@ -0,0 +1,193 @@ +import numpy as np +import os,sys,glob,argparse +from scipy import stats +import statsmodels.stats.weightstats as smw +import matplotlib +matplotlib.use('agg') +import pandas as pd +import matplotlib.pyplot as plt +import seaborn as sns +from . 
import config + +def fileLength(fin): + i=sum(1 for l in open(fin)) + return i + +# def indiviualPlot(psi_df, event_id, panel_list): +# plt.figure(figsize=(15,9)) +# sns_plot=sns.violinplot(data=psi_df, inner="box",cut=0) +# #sns_plot.set_yticks(np.arange(0,1,0.2)) +# sns_plot.set(ylim=(0, 1)) +# sns.despine(offset=10, trim=True) +# sns_plot.set_ylabel('Percent-Spliced-In',fontweight='bold',fontsize=14) +# sns_plot.set_xticklabels(panel_list,rotation=20,ha='right',fontsize=12) +# sns_plot.set_title(event_id,fontsize=15,fontweight='bold') +# sns_plot.figure.savefig(event_id+".pdf") + +def selectJC(event_id, tumor_form, tumor_form_cutoff, splicing_event_type): + coord=event_id.split(':') + if splicing_event_type=='SE': + inc1=coord[2]+':'+str(int(coord[6])+1)+':'+coord[4] + inc2=coord[2]+':'+str(int(coord[5])+1)+':'+coord[7] + skp=coord[2]+':'+str(int(coord[6])+1)+':'+coord[7] + if tumor_form>tumor_form_cutoff:#inc + return [inc1, inc2] + else: + return [skp] + else: + exit('[Error] The splicine event type is not supported currently. Exiting.') + +def loadJCvalue4events(fin_plot_query, deltaPSI_col, tumor_form_cutoff, jc_result, splicing_event_type, has_header): + JC_dict={} + for i,l in enumerate(open(fin_plot_query)): + if i==0 and has_header: + continue + ls=l.strip().split('\t') + event_id=ls[0] + tumor_form=float(ls[deltaPSI_col]) # negative means skipping, pos means inc + JC_select=selectJC(event_id, tumor_form, tumor_form_cutoff, splicing_event_type) + for k in JC_select: + JC_dict[k]='' + for i,l in enumerate(open(jc_result)): + if i==0: + continue + ls=l.strip().split('\t') + if ls[0] in JC_dict: + JC_dict[ls[0]]=l + return JC_dict + +def loadParametersRow(filter_para, panel_list): + if filter_para.strip()!='': + para, filter_panel_list=filter_para.split(' ') + filter_cutoff_pval, filter_cutoff_dpsi, filter_cutoff_foc, filter_cutoff_pval_PT, filter_group_cutoff =para.split(',') + filter_cutoff_pval_PT=float(filter_cutoff_pval_PT) + filter_cutoff_dpsi=float(filter_cutoff_dpsi) + filter_cutoff_foc=float(filter_cutoff_foc) + filter_group_cutoff=int(filter_group_cutoff) + filter_panel_list=filter_panel_list.split(',') + panel_list+=filter_panel_list + else: + filter_cutoff_pval_PT, filter_cutoff_dpsi, filter_cutoff_foc, filter_group_cutoff, filter_panel_list =['','','','',[]] + return filter_cutoff_pval_PT, filter_cutoff_dpsi, filter_cutoff_foc, filter_group_cutoff, filter_panel_list, panel_list + +## screening +def main(args): + index={} + fin_list={} + para_fin=args.parameter_fin + splicing_event_type=args.splicing_event_type + fin_plot_query=args.event_list #IRIS screening output OR any file contain event ID and deltaPSI/indicators + jc_result=args.jc_full_result #IRIS prev screening output + tumor_form_col=int(args.deltaPSI_column)-1 + tumor_form_cutoff=float(args.deltaPSI_cut_off) + step=int(args.step) + has_header=args.header + outdir=args.outdir.rstrip('/') + + out_prefix,db_dir,filter1_para,filter2_para,filter3_para=[l.strip() for l in open(para_fin)][:5] + db_dir=db_dir.rstrip('/') + if os.path.isdir(db_dir+'_sjc'): #automatically use db_sjc if in the same dir. 
Otherwise, use the user input db_dir + db_dir=db_dir+'_sjc' + panel_list=[out_prefix] + + filter1_cutoff_pval, filter1_cutoff_dpsi, filter1_cutoff_foc, filter1_group_cutoff, filter1_panel_list, panel_list = loadParametersRow(filter1_para, panel_list) + filter2_cutoff_pval, filter2_cutoff_dpsi, filter2_cutoff_foc, filter2_group_cutoff, filter2_panel_list, panel_list = loadParametersRow(filter2_para, panel_list) + filter3_cutoff_pval, filter3_cutoff_dpsi, filter3_cutoff_foc, filter3_group_cutoff, filter3_panel_list, panel_list = loadParametersRow(filter3_para, panel_list) + if filter1_panel_list==[] and filter2_panel_list==[] and filter3_panel_list==[]: + # if filter1_panel_list==[] and filter2_panel_list==[] and filter3_panel_list==[] and test_mode[0]!='summary': + exit("[Error] No filtering required in parameteres file. exit!") + + single_plot=False + group_plot=True + + #Load JC values into dict based on event file + JC_dict=loadJCvalue4events(fin_plot_query, tumor_form_col, tumor_form_cutoff, jc_result, splicing_event_type, has_header) + #Make tmp file for JC plot based events + query_fin_name=outdir+'/'+fin_plot_query.split('/')[-1]+'.tmpdata.txt' + fout_tmp=open(query_fin_name,'w') + for i,l in enumerate(open(fin_plot_query)): + if i==0 and has_header: + continue + ls=l.strip().split('\t') + event_id=ls[0] + tumor_form=float(ls[tumor_form_col]) # negative means skipping, pos means inc + JC_select=selectJC(event_id, tumor_form, tumor_form_cutoff, splicing_event_type) + if tumor_form r_start+step-1: + break + ks=k.strip().split('\t') + as_event_gene=ks[0].split(':')[1] + if ks[1]=='skp': + sj_id=ks[2] + prev={} + #percentage[out_prefix]=map(float,fetch_PsiMatrix(event_id,fin_list[out_prefix],'.','\t',index[out_prefix])[1][8:]) + prev[out_prefix]=[float(ks[4])] + for n,group in enumerate(panel_list[1:]): + prev[group]=[float(ks[6+n*3])] + + prev_df = pd.DataFrame.from_dict(prev, orient='index').transpose()[panel_list] + if single_plot: + indiviualPlot(prev_df, event_id, panel_list) + if group_plot: + print step, j + ax_i = plt.subplot2grid((step,11), (j-r_start*1,0), colspan=10, rowspan=1) + ##SHOULD SET TO area. use width just for prelim + sns_plot=sns.barplot(data=prev_df, ax=ax_i,linewidth=1.5) + sns_plot.set_yticks(np.arange(0,1.1,0.5)) + sns_plot.set(xticklabels=[]) + sns.despine(offset=0, trim=False) + sns_plot.text(15.3, 0.4, as_event_gene+'\n'+sj_id, horizontalalignment='left', size='small', color='black',fontweight='bold') + else: + sj_id=ks[2]#TODO + prev={} + inc1_list=[float(ks[4])]+[float(ks[6+n*3]) for n in range(len(panel_list[1:]))] + inc2_list=[float(ks[5+(len(panel_list)-1)*3+2])]+[float(ks[5+(len(panel_list)-1)*3+4+n*3]) for n in range(len(panel_list[1:]))] + prev_df = pd.DataFrame({'Groups': panel_list,'Inc1': inc1_list , 'Inc2': inc2_list }) + prev_df_tidy = prev_df.melt(id_vars='Groups').rename(columns=str.title) + + if single_plot: + indiviualPlot(prev_df, event_id, panel_list) + if group_plot: + print step, j + ax_i = plt.subplot2grid((step,11), (j-r_start*1,0), colspan=10, rowspan=1) + ##SHOULD SET TO area. 
use width just for prelim + sns_plot=sns.barplot(data=prev_df_tidy, x='Groups', y='Value',hue='Variable',ax=ax_i, linewidth=1, ci=None) + my_pal=sns.husl_palette(len(panel_list)*2, s=.75, l=0.7)#color_palette("husl", len(panel_list))#(n_colors=len(panel_list)) + # for i, bar in enumerate(sns_plot.patches): + # bar.set_color(my_pal[i%len(panel_list)])#(i-i%2)/2 + for i, bar in enumerate(sns_plot.patches): + if i<=len(panel_list)-1: + bar.set_color(my_pal[i*2]) + else: + bar.set_color(my_pal[1+(i-len(panel_list))*2]) + sns_plot.legend_.remove() + sns_plot.set_yticks(np.arange(0,1.1,0.5)) + sns_plot.set(xticklabels=[]) + sns.despine(offset=0, trim=False) + sns_plot.text(15.3, 0.4, as_event_gene+'\n'+sj_id, horizontalalignment='left', size='small', color='black',fontweight='bold') + + if group_plot: + pdf.savefig(fig) + pdf.close() + +if __name__ == '__main__': + main() + diff --git a/IRIS/IRIS_seq2hla.py b/IRIS/IRIS_seq2hla.py deleted file mode 100644 index 8c1b4ea..0000000 --- a/IRIS/IRIS_seq2hla.py +++ /dev/null @@ -1,61 +0,0 @@ -import sys, os, csv, argparse, logging, datetime -from . import config -#from utilities.seq2hla import seq2HLA - -def run_seq2HLA(readsFilesCaseRNA,runname,bindir): - if os.path.exists(runname+'/hla_types-ClassI.HLAgenotype4digits')==False: - readsFiles_split=readsFilesCaseRNA.split(',') - os.system('mkdir -p '+runname) - #seq2HLA.main(runname,readsFiles_split[0],readsFiles_split[1]) - cmd = 'python '+bindir+'/seq2HLA.py -1 '+readsFiles_split[0]+' -2 '+readsFiles_split[1]+' -r '+runname+'/hla_types' - os.system (cmd) - print cmd - if os.path.exists(runname+'/hla_types-ClassI.HLAgenotype4digits')==False: - sys.exit('[seq2hla] # An Error Has Occured. seq2hla Incomplete. Exit!') - else: - print '[seq2hla] # Skipped seq2HLA.' - - HLA_type=[] - for n,l in enumerate(open(runname+'/hla_types-ClassI.HLAgenotype4digits')): - if n==0: - continue - ls=l.strip().split('\t') - #print ls - if ls[2]!='NA': - if float(ls[2])<=0.05: - HLA_type.append('HLA-'+ls[1].strip('\'')) - if ls[4]!='NA': - if float(ls[4])<=0.05: - HLA_type.append('HLA-'+ls[3].strip('\'')) - continue - # for n,l in enumerate(open(runname+'-ClassII.HLAgenotype4digits')): - # if n==0: - # continue - # ls=l.strip().split('\t') - # if ls[2]!='NA': - # if float(l[2])<=0.05: - # HLA_type.append('HLA-'+l[1].strip('\'')) - # if ls[4]!='NA': - # if float(ls[4])<=0.05: - # HLA_type.append('HLA-'+l[3].strip('\'')) - # continue - - if len(HLA_type)==0: - sys.exit('# [INFO] No HLA type predicted. Exit.') - - HLA_type_str=','.join(list(set(HLA_type))) - return HLA_type_str - -def main(args): - os.system('mkdir -p '+args.sampleID_outdir) - sampleID = args.sampleID_outdir.rstrip('/') - runname = sampleID+'/hla_types' - bindir = args.seq2hla_path.rstrip('/') - print '[INFO] # Start HLA typing.' - - hla=run_seq2HLA(args.readsFilesCaseRNA, runname, bindir) - - print '[INFO] # Completed. 
HLA types: '+hla - -if __name__ == '__main__': - main() diff --git a/IRIS/IRIS_sjc_matrix.py b/IRIS/IRIS_sjc_matrix.py new file mode 100644 index 0000000..3587a2b --- /dev/null +++ b/IRIS/IRIS_sjc_matrix.py @@ -0,0 +1,126 @@ +import sys, os + +def loadFinlist(fin_list_input): + fin_list=[] + for l in open(fin_list_input): + fin_list.append(l.strip()) + return fin_list +def readSJfile_STAR(fin, SJ_dict): + for l in open(fin): + ls=l.strip().split('\t') + sj=':'.join(ls[0:3]) + SJ_dict[sj]=ls[5] #add strand info + return SJ_dict + +def readSJfile(fin, SJ_dict): + for l in open(fin): + ls=l.strip().split('\t') + sj=ls[0] + SJ_dict[sj]='' #add strand info + return SJ_dict + +def index_SJMatrix(fn, outdir, delim): + out_fp = outdir+'/'+fn.split('/')[-1]+'.idx' + line_formatter = "{id}\t{offset}\n" + offset = 0 + with open(outdir+'/'+fn, 'r') as fin: + with open(out_fp, 'w') as fout: + offset += len(fin.readline()) + for line in fin: + ele = line.strip().split(delim) + eid = ele[0] + fout.write( line_formatter.format(id=eid, offset=offset) ) + offset += len(line) + return + +def main(args): + + fname_pos=args.sample_name_field#2 + fin_list_input=args.file_list_input + fin_list=loadFinlist(fin_list_input) + data_name=args.data_name + db_dir=args.iris_db_path.rstrip('/') + os.system('mkdir -p '+db_dir+'/'+data_name+' '+db_dir+'/'+data_name+'/sjc_matrix') + + + if os.path.exists(db_dir+'/'+data_name+'/sjc_matrix/SJ_count.'+data_name+'.txt'): + print '[INFO] Output '+db_dir+'/'+data_name+'/sjc_matrix/SJ_count.'+data_name+'.txt'+' exists. Only perform indexing.' + index_SJMatrix('SJ_count.'+data_name+'.txt', db_dir+'/'+data_name+'/sjc_matrix/', '\t') + exit('[INFO] Index finished.') + + fname_dict={} + for fn in fin_list: + name=fn.split('/')[-fname_pos].split('.aln')[0] + if name in fname_dict: + print name + exit('dup name'+fn+' '+name) + fname_dict[name]='' + fname_list=fname_dict.keys() + print '[INFO] Done checking file names.' + + SJ_dict={} + for i,fin in enumerate(fin_list): + if i%100==0: + print i + SJ_dict=readSJfile(fin, SJ_dict) + + SJ_list=sorted(SJ_dict.keys()) + + fout_SJ=open(db_dir+'/'+data_name+'/sjc_matrix/SJ_coordiate.'+data_name+'.txt','w') + for s in SJ_list: + fout_SJ.write(s+'\t'+SJ_dict[s]+'\n') + fout_SJ.close() + + print '[INFO] Done summarizing SJ coordinates.' + + batch=100000 + for b in range(0, len(SJ_list), batch): + print b + batch_SJ_list=SJ_list[b:min(b+batch,len(SJ_list))] + batch_SJ_dict=dict.fromkeys(batch_SJ_list,0) # can't use this to store values. 
will only store the last input + batch_SJ_count={} + for fin in fin_list: + fname=fin.split('/')[-fname_pos].split('.aln')[0] + for l in open(fin): + ls=l.strip().split('\t') + # sj=':'.join(ls[:3])# STAR + # count=ls[6]# STAR + sj=ls[0] + count=ls[1] + if sj in batch_SJ_dict: + if sj not in batch_SJ_count: + batch_SJ_count[sj]={} + batch_SJ_count[sj][fname]=count + continue + fout_intermediate=open(db_dir+'/'+data_name+'/sjc_matrix/SJ_count.'+data_name+'.batch_'+str(b)+'.txt','w') + for k in sorted(batch_SJ_count.keys()): + sj_line=[k] + for sample in fname_list: + if sample in batch_SJ_count[k]: + sj_line.append(batch_SJ_count[k][sample]) + else: + sj_line.append('0') ##It's okay to change to 0 later + fout_intermediate.write('\t'.join(sj_line)+'\n') + fout_intermediate.close() + fout_head=open(db_dir+'/'+data_name+'/sjc_matrix/SJ_count.'+data_name+'.header.txt','w') + fout_head.write('\t'.join(['SJ']+fname_list)+'\n') + fout_head.close() + cmd='cat '+db_dir+'/'+data_name+'/sjc_matrix/SJ_count.'+data_name+'.batch_*.txt > '+db_dir+'/'+data_name+'/sjc_matrix/SJ_count.'+data_name+'.txt_tmp' + cmd_merge='cat '+db_dir+'/'+data_name+'/sjc_matrix/SJ_count.'+data_name+'.header.txt '+db_dir+'/'+data_name+'/sjc_matrix/SJ_count.'+data_name+'.txt_tmp > '+db_dir+'/'+data_name+'/sjc_matrix/SJ_count.'+data_name+'.txt' + cmd_rm='rm '+db_dir+'/'+data_name+'/sjc_matrix/SJ_count.'+data_name+'.batch_*.txt' + cmd_rm2='rm '+db_dir+'/'+data_name+'/sjc_matrix/SJ_count.'+data_name+'.header.txt' + cmd_rm3='rm '+db_dir+'/'+data_name+'/sjc_matrix/SJ_count.'+data_name+'.txt_tmp' + print cmd + os.system(cmd) + os.system(cmd_merge) + print cmd_rm + os.system(cmd_rm) + os.system(cmd_rm2) + os.system(cmd_rm3) + + print '[INFO] Matrix finished.' + index_SJMatrix('SJ_count.'+data_name+'.txt', db_dir+'/'+data_name+'/sjc_matrix/', '\t') + print '[INFO] Index finished.' + +if __name__ == '__main__': + main() diff --git a/IRIS/IRIS_translation.py b/IRIS/IRIS_translation.py index fbb777d..3e13197 100644 --- a/IRIS/IRIS_translation.py +++ b/IRIS/IRIS_translation.py @@ -10,8 +10,8 @@ def loadFrame(fin): exonFrameDict={} for l in open(fin): ls=l.strip().split('|') - if len(ls)>5: #no hits - if (float(ls[1])/3)*0.4>float(ls[11]): #identities too less + if len(ls)>5: #skip no hits + if (float(ls[1])/3)*0.4>float(ls[11]): #skip identities too less mis+=1 continue exon_start_end=ls[0].split('-') ##TODO: name two variables, exonFrameDict key can be "'5end|'+exon_start_coord", the same for end. much clearner variable name in downstream. 
@@ -25,69 +25,136 @@ def loadFrame(fin):
 		nomap+=1
 	return exonFrameDict, nomap, mis
 
-def findJCRange(AS_chrom,JC_coord,exonFrameDict):
+def loadGTFMicroexonInfo(gtf):#load GTF exon starts, ends and lengths; used to flag microexons
+    micro_exon={}
+    for l in open(gtf):
+        if l.startswith('#'):
+            continue
+        ls=l.strip().split('\t')
+        if ls[2]=='exon':
+            exon_name=ls[0]+':'+ls[3]+'-'+ls[4]
+            if exon_name in micro_exon:
+                continue
+            length=int(ls[4])-int(ls[3])
+            if length<=30:
+                micro_exon[exon_name]=length
+    microexon_start={}#keyed by junction coordinate
+    microexon_end={}
+    for e in micro_exon:
+        chrom=e.split(':')[0]
+        es, ee=e.split(':')[1].split('-')
+        es=chrom+':'+es
+        ee=chrom+':'+ee
+        if es not in microexon_start:
+            microexon_start[es]=micro_exon[e]
+        else:
+            if micro_exon[e]<microexon_start[es]:
+                microexon_start[es]=micro_exon[e]
+        if ee not in microexon_end:
+            microexon_end[ee]=micro_exon[e]
+        else:
+            if micro_exon[e]<microexon_end[ee]:
+                microexon_end[ee]=micro_exon[e]
+    return microexon_start, microexon_end
+
+def findJCRange(AS_chrom, JC_coord, AS_direction, exonFrameDict, all_orf, microexon_start, microexon_end, ignore_annotation):
+    JC_region_s=''
+    JC_region_e=''
+    JC_region_d=''
+    evidence_level=[False,False]
+    ##Step 1: use the annotated ORF frame when available
+    if AS_chrom+':'+JC_coord[0] in exonFrameDict:# check the left coord of the junction
+        for exon_info in exonFrameDict[AS_chrom+':'+JC_coord[0]]:
+            exon_len,frame,align_s,align_e=exon_info[1]
+            if int(frame)>0: # the left coord is a start; find the ideal starting position
+                JC_region_d='+'
+                JC_region_len=min(int(align_e)-int(align_s),30)
+                JC_region_s=(int(JC_coord[0])-(int(exon_len)-int(align_e))-JC_region_len,JC_coord[0])
+                evidence_level[0]=True
+                break ### Limitation: only the first annotated start is used
+            else: # the left coord is an end; the frame doesn't matter, keep a proper length (30-33)
+                JC_region_d='-'
+                if JC_region_e=='':
+                    JC_region_e=(max(int(JC_coord[0])-33,int(JC_coord[0])-int(exon_len)), JC_coord[0])# max acts as min here
+                    evidence_level[1]=True
+                    break ### letting the loop continue would give a more constrained junction region, but would be slower; disabled for now
+                else:
+                    continue
-			if int(frame)>0: # if the left coord is the start, then find the ideal starting poistion
-				JC_region_d='+'
-				JC_region_len=min(int(align_e)-int(align_s),30)
-				JC_region_s=(int(JC_coord[0])-(int(exon_len)-int(align_e))-JC_region_len,JC_coord[0])
-				evidence_level[0]=True
-				break ### TODO: allow multiple starting
-			else: # if left coord is the end coord, frame doesn't matter, keep proper length(30-33)
-				JC_region_d='-'
-				if JC_region_e=='':
-					JC_region_e=(max(int(JC_coord[0])-33,int(JC_coord[0])-int(exon_len)), JC_coord[0])# max means min here
-					evidence_level[1]=True
-					break ### allowing loop will give more restrained junc region. but will be slower. disable for now.
-			# else:
-			#	JC_region_e=(max(JC_region_e[0],int(JC_coord[0])-int(exon_len)),JC_coord[0])
-			else:
-				continue
+    if AS_chrom+':'+JC_coord[1] in exonFrameDict:# check the right coord of the junction & check that it is the right pair
+        for exon_info in exonFrameDict[AS_chrom+':'+JC_coord[1]]:
+            exon_len,frame,align_s,align_e=exon_info[1]
+            if int(frame)>0:# the right coord is an end; the frame doesn't matter, keep a proper length (30-33)
+                if JC_region_d=='-':
+                    print 'frame conflict.', JC_coord;JC_region_e='';evidence_level[1]=False;break
+                if JC_region_d=='':# when the left coord has no hit
+                    JC_region_d='+'
+                JC_region_e=(JC_coord[1],min(int(JC_coord[1])+33,int(exon_info[0])))
+                evidence_level[1]=True
+                break ### letting the loop continue would give a more constrained junction region, but would be slower; disabled for now
+            else:# the right coord is a start; find the ideal starting position
+                if JC_region_d=='+':
+                    print 'frame conflict.', JC_coord;JC_region_s='';evidence_level[0]=False;break
+                if JC_region_d=='':# when the left coord has no hit
+                    JC_region_d='-'
+                if int(align_e)<=32:
+                    JC_region_s=(JC_coord[1],int(JC_coord[1])+int(align_e))
+                elif int(align_e)>32:
+                    shift=int(align_e)%3
+                    JC_region_len=30+shift
+                    JC_region_s=(JC_coord[1],int(JC_coord[1])+JC_region_len)
+                evidence_level[0]=True
+                break ### Limitation: only the first annotated start is used
-		if AS_chrom+':'+JC_coord[1] in exonFrameDict:# check the right coord of the juction & check if it is the right pair
-			for exon_info in exonFrameDict[AS_chrom+':'+JC_coord[1]]:
-				exon_len,frame,align_s,align_e=exon_info[1]
-				if int(frame)>0:# if right coord is the end coord, frame doesn't matter, keep proper length(30-33)
-					if JC_region_d=='-':
-						print 'frame conflict.', JC_coord;JC_region_e='';evidence_level[1]=False;break
-					if JC_region_d=='':# when left coord has no hit
-						JC_region_d='+'
-					JC_region_e=(JC_coord[1],min(int(JC_coord[1])+33,int(exon_info[0])))
-					evidence_level[1]=True
-					break ### allowing loop will give more restrained junc region. but will be slower. disable for now.
-				else:# right with be the start. then find the ideal starting poistion
-					if JC_region_d=='+':
-						print 'frame conflict.', JC_coord;JC_region_s='';evidence_level[0]=False;break
-					if JC_region_d=='':# when left coord has no hit
-						JC_region_d='-'
-					if int(align_e)<=32:
-						JC_region_s=(JC_coord[1],int(JC_coord[1])+int(align_e))
-					elif int(align_e)>32:
-						shift=int(align_e)%3
-						JC_region_len=30+shift
-						JC_region_s=(JC_coord[1],int(JC_coord[1])+JC_region_len)
-					evidence_level[0]=True
-					break ### TO-DO: allow multiple starting
-		if JC_region_s!='' and JC_region_e=='':
+    if JC_region_s!='' and JC_region_e=='':#check if the JC region end runs beyond the exon end; handle microexons
+        if JC_region_d=='+':
+            range_bp=30#default
+            if AS_chrom+':'+str(int(JC_coord[1])+1) in microexon_start:#check if the downstream exon start is a microexon start
+                range_bp=min(microexon_start[AS_chrom+':'+str(int(JC_coord[1])+1)],30)
+            JC_region_e=(JC_coord[1],int(JC_coord[1])+range_bp+1)#+/-1 offset vs. GTF coordinates
+        else:
+            range_bp=30#default
+            if AS_chrom+':'+JC_coord[0] in microexon_end:#check if the upstream exon (downstream in translation) ends at a microexon end
+                range_bp=min(microexon_end[AS_chrom+':'+JC_coord[0]],30)
+            JC_region_e=(int(JC_coord[0])-range_bp-1,JC_coord[0]) # For events partially in the ORF annotation: extend the end based on the known start frame; control for microexons
+    ##Step 2: assign 30bp on each side for a 3-ORF search. Can be used when a 3-ORF search is needed/forced. TODO: check up/down exon length
+    if (all_orf and evidence_level[0]==False) or ignore_annotation: # For events not in the ORF annotation: use all 3 ORFs. Note: check 'evidence_level' instead of 'JC_region_s' to avoid microexons/wrong annotation.
+        JC_region_d = AS_direction
 		if JC_region_d=='+':
-			JC_region_e=(JC_coord[1],int(JC_coord[1])+30)
+            JC_region_s, JC_region_e= [(int(JC_coord[0])-30,JC_coord[0]),(JC_coord[1],int(JC_coord[1])+30)]
 		else:
-			JC_region_e=(int(JC_coord[0])-30,JC_coord[0])
-	return JC_region_s,JC_region_e, JC_region_d, evidence_level
+            JC_region_s, JC_region_e= [(JC_coord[1],int(JC_coord[1])+30),(int(JC_coord[0])-30,JC_coord[0])]
+    return JC_region_s, JC_region_e, JC_region_d, evidence_level
+
+#handle multiple AS types: take 4 or 6 coordinates and select the antigen-deriving junctions
+def selectJunction(AS_coord, deltaPSI_c2n, cut_off, if_select_all, splicing_event_type):
+    if splicing_event_type == 'SE':
+        skp = (AS_coord[2], AS_coord[3],'skp')
+        inc1 = (AS_coord[2],AS_coord[0],'inc1')
+        inc2 = (AS_coord[1],AS_coord[3],'inc2')
+    elif splicing_event_type == 'A3SS':
+        skp = (AS_coord[5], AS_coord[2],'skp')
+        inc1 = (AS_coord[5],AS_coord[0],'inc1')
+        inc2 = (AS_coord[2],AS_coord[2],'inc2')
+    elif splicing_event_type == 'A5SS':
+        skp = (AS_coord[3], AS_coord[4],'skp')
+        inc1 = (AS_coord[3],AS_coord[3],'inc1')
+        inc2 = (AS_coord[1],AS_coord[4],'inc2')
+    elif splicing_event_type == 'RI':
+        skp = (AS_coord[3], AS_coord[4],'skp')
+        inc1 = (AS_coord[3],AS_coord[3],'inc1')
+        inc2 = (AS_coord[4],AS_coord[4],'inc2')
-def selectJC(AS_coord,deltaPSI_c2n,cut_off, select_all):
-	if select_all:
-		return [(AS_coord[2], AS_coord[3],'skp'),(AS_coord[2],AS_coord[0],'inc1'),(AS_coord[1],AS_coord[3],'inc2')]
-	if float(deltaPSI_c2n)
@@ -158,25 +230,32 @@ def JC2pep(JC_region_bed,ref_genome,outdir,info):#getfasta of the coord and tran
 	if l.startswith('>'):
 		if JC_nuc_seq_ID!='':
 			if JC_nuc_seq!='':
 				first_half_len=len(JC_nuc_seq)
 				JC_nuc_seq+=row.strip().upper()
-				JC_prot_seq=translateNuc(JC_nuc_seq,1)[0][0]
-				if len(JC_prot_seq)<8 or len(JC_prot_seq)*3<=first_half_len:# skip seq finds PTC before junction site
-					JC_nuc_seq_ID=''
-					JC_nuc_seq=''
-					continue
-				junction_pos=first_half_len/3+1
-				JC_prot_seq_up=JC_prot_seq[:junction_pos-1]+JC_prot_seq[junction_pos-1].lower()+JC_prot_seq[junction_pos:]
-				if junction_pos>11:#long upstream junction peptides, often due to half-mapped but quality BLAST aligment(known exon but not/partially known in protein annotation)
-					diff=junction_pos-11
-					JC_nuc_seq_ID=JC_nuc_seq_ID+'|trim_'+str(diff)
-					JC_prot_seq_up=JC_prot_seq_up[diff:]
-				#print JC_nuc_seq, JC_prot_seq_up, JC_prot_seq, first_half_len, junction_pos
-				fout_JC_pep.write('>{}\n{}\n'.format(JC_nuc_seq_ID,JC_prot_seq_up))
+                JC_prot_seq_list=translateNuc(JC_nuc_seq, orf_range, remove_early_stop)
+                for JC_prot_seq_pair in JC_prot_seq_list:
+                    JC_prot_seq,JC_prot_orf_start=JC_prot_seq_pair
+                    if len(JC_prot_seq)<8 or len(JC_prot_seq)*3<=first_half_len:# skip sequences with a PTC before the junction site
+                        continue
+                    junction_pos=first_half_len/3+1
+                    JC_prot_seq_up=JC_prot_seq[:junction_pos-1]+JC_prot_seq[junction_pos-1].lower()+JC_prot_seq[junction_pos:]
+                    if junction_pos>11:#long upstream junction peptides, often due to half-mapped but good-quality BLAST alignment (exon known, but absent or partial in the protein annotation)
+                        diff=junction_pos-11
+                        JC_nuc_seq_ID=JC_nuc_seq_ID+'|trim_'+str(diff)
+                        JC_prot_seq_up=JC_prot_seq_up[diff:]
+                    if JC_prot_orf_start==0:
+                        fout_JC_pep.write('>{}\n{}\n'.format(JC_nuc_seq_ID,JC_prot_seq_up))
+                    elif JC_prot_orf_start==1:
+                        fout_JC_pep_1.write('>{}\n{}\n'.format('FrameAdd1'.join(JC_nuc_seq_ID.split('Frame')),JC_prot_seq_up))
+                    elif JC_prot_orf_start==2:
+                        fout_JC_pep_2.write('>{}\n{}\n'.format('FrameAdd2'.join(JC_nuc_seq_ID.split('Frame')),JC_prot_seq_up))
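The updated translateNuc is not included in this patch; from its call sites it now takes an ORF range and a remove_early_stop flag and returns (peptide, frame_offset) pairs, which the loop above unpacks. A hedged sketch consistent with those call sites follows; the stop-codon policy is an assumption, not the author's confirmed implementation.

def translateNuc(seq, orf_range, remove_early_stop):
    # Standard genetic code built from the canonical TCAG codon ordering.
    bases = 'TCAG'
    amino = 'FFLLSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG'
    table = dict(zip([a + b + c for a in bases for b in bases for c in bases], amino))
    results = []
    for frame in range(orf_range):  # frame 0 only, or frames 0-2 when all_orf is set
        pep = ''
        hit_stop = False
        for i in range(frame, len(seq) - 2, 3):
            aa = table.get(seq[i:i + 3], 'X')  # 'X' for codons containing N/ambiguous bases
            if aa == '*':
                hit_stop = True
                break
            pep += aa
        if remove_early_stop and hit_stop:
            continue  # assumed semantics: drop ORFs that terminate early
        results.append((pep, frame))
    return results

With orf_range=1, the first element mirrors how the old code used translateNuc's return value (peptide of frame 0).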
 				JC_nuc_seq_ID=''
 				JC_nuc_seq=''
 				continue
 		JC_nuc_seq=row.strip().upper()
 	fout_JC_pep.close()
-	return JC_pep_fasta
+    if all_orf:
+        fout_JC_pep_1.close()
+        fout_JC_pep_2.close()
+    return JC_pep_fasta_name
 
 def loadSeq(fin):
 	dic={}
@@ -186,7 +265,7 @@
 	for l in open(fin):
 		if l.startswith('>'):
 			name=l.strip()[1:]
-			form_field=name.split(':')[5]#May be different for different Bedtools#Feb19
+            form_field=name.split(':')[5]#May be different for different Bedtools. Feb19
 			if form_field.startswith('inc1'):
 				form='inc1'
 			elif form_field.startswith('inc2'):
@@ -247,7 +326,7 @@ def compSeq(seq1,seq2,seq3):
 				pass_js2=True
 				if j+1==len(seq1):
 					short_seq1=True
-					true_junction2=i+1
+                    true_junction2=j+1
 					break
 				continue
 			else:
@@ -278,9 +357,9 @@
 	seq3==seq3.upper()
 	return seq1, seq2, seq3
 
-def compPepFile(fin,outdir,select_form):
+def compPepFile(fin,outdir,pep_dir_prefix,select_form, novel_info):
 	comp_result_dic={}
-	dic,direction=loadSeq(outdir+'/tmp/prot/'+fin)
+    dic,direction=loadSeq(outdir+'/tmp/'+pep_dir_prefix+'/'+fin)
 	if len(dic)>1:
 		if 'skp' in dic: #where the comparison is needed
 			seq2=''
@@ -307,65 +386,83 @@
 			dic['inc1'][1]=comp_result[2]
 		if 'inc2' in dic:
 			dic['inc2'][1]=comp_result[1]
+        if novel_info==[]:
+            novel_info=dic
 		if select_form!=2:
-			fout_skp=open(outdir+'/tmp/prot.compared/skp/skp.'+fin.split('/')[-1],'w')
-			if 'skp' in dic:
+            fout_skp=open(outdir+'/tmp/'+pep_dir_prefix+'.compared/skp/skp.'+fin.split('/')[-1],'w')
+            if 'skp' in dic and 'skp' in novel_info:
 				fout_skp.write('>{}\n{}\n'.format(dic['skp'][0],dic['skp'][1]))
 			fout_skp.close()
 		if select_form!=1:
-			fout_inc=open(outdir+'/tmp/prot.compared/inc/inc.'+fin.split('/')[-1],'w')
-			if 'inc1' in dic:
+            fout_inc=open(outdir+'/tmp/'+pep_dir_prefix+'.compared/inc/inc.'+fin.split('/')[-1],'w')
+            if 'inc1' in dic and 'inc1' in novel_info:
 				fout_inc.write('>{}\n{}\n'.format(dic['inc1'][0],dic['inc1'][1]))
-			if 'inc2' in dic:
+            if 'inc2' in dic and 'inc2' in novel_info:
 				fout_inc.write('>{}\n{}\n'.format(dic['inc2'][0],dic['inc2'][1]))
 			fout_inc.close()
 
-def AS2pep(AS_chrom, AS_coord,AS_direction, deltaPSI_c2n,cuf_off, select_all, exonFrameDict,ref_genome,outdir,info):
-	JC_coord_list=selectJC(AS_coord,deltaPSI_c2n,cuf_off, True) #under select_all=True
-	select_form=len(selectJC(AS_coord,deltaPSI_c2n,cuf_off, select_all))
-	JC_region_bed='_'.join([AS_chrom,AS_direction]+AS_coord)+'.JCregion.bed'
-	fout_JC_region=open(outdir+'/tmp/junction/'+JC_region_bed,'w') #The JC file created. with all forms included regardless of selecting setting. Required for comparison.
+def AS2pep(AS_chrom, AS_coord, AS_direction, deltaPSI_c2n, cut_off, if_select_all, all_orf, pep_dir_prefix, microexon_start, microexon_end, ignore_annotation, remove_early_stop, splicing_event_type, exonFrameDict, ref_genome, outdir, info, novel_info):
+    JC_coord_list = selectJunction(AS_coord, deltaPSI_c2n, cut_off, True, splicing_event_type) #with if_select_all=True
+    select_form = len(selectJunction(AS_coord, deltaPSI_c2n, cut_off, if_select_all, splicing_event_type))
+    JC_region_bed = '_'.join([AS_chrom,AS_direction]+AS_coord)+'.JCregion.bed'
+    fout_JC_region = open(outdir+'/tmp/junction/'+JC_region_bed,'w') #The JC region file, with all forms included regardless of the selection setting. Required for comparison.
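Reading selectJunction's SE branch against the event-ID layout used in convertAS2SJevent, AS_coord appears to hold [exon_start, exon_end, upstream_exon_end, downstream_exon_start]; that index interpretation is inferred, not stated in the patch. With made-up coordinates, the three candidate junctions for a skipped-exon event work out as:

# Illustrative SE event; coordinate values are invented.
AS_coord = ['1000', '1200', '800', '1500']  # [exon_start, exon_end, upstream_exon_end, downstream_exon_start]

skp  = (AS_coord[2], AS_coord[3], 'skp')   # ('800', '1500'): upstream exon spliced directly to downstream exon
inc1 = (AS_coord[2], AS_coord[0], 'inc1')  # ('800', '1000'): upstream exon spliced to the alternative exon
inc2 = (AS_coord[1], AS_coord[3], 'inc2')  # ('1200', '1500'): alternative exon spliced to the downstream exon

print(skp, inc1, inc2)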
 	for JC_coord in JC_coord_list:
-		JC_region=findJCRange(AS_chrom,JC_coord[:2], exonFrameDict)
-		if JC_region[3][0]:
-			fout_JC_region.write('{}\t{}\t{}\t{}\t{}\t{}\n'.format(AS_chrom,JC_region[0][0],JC_region[0][1],AS_chrom+':'+JC_coord[0]+'|'+JC_coord[1]+':uniprotFrame:'+AS_direction+':form:'+JC_coord[2],'.',JC_region[2]))
-			fout_JC_region.write('{}\t{}\t{}\t{}\t{}\t{}\n'.format(AS_chrom,JC_region[1][0],JC_region[1][1],AS_chrom+':'+JC_coord[0]+'|'+JC_coord[1]+':uniprotFrame:'+AS_direction+':form:'+JC_coord[2],'.',JC_region[2]))
-		#else: #no known frame; implement 3 orf search latter
-			#fout_JC_region.write('{}\t{}\t{}\t{}\t{}\t{}\n'.format(AS_chrom,JC_region[0][0],JC_region[0][1],AS_chrom+':'+JC_coord[0]+'|'+JC_coord[1]+':unknownFrame:'+AS_direction,'.',JC_region[2]))
-			#fout_JC_region.write('{}\t{}\t{}\t{}\t{}\t{}\n'.format(AS_chrom,JC_region[1][0],JC_region[1][1],AS_chrom+':'+JC_coord[0]+'|'+JC_coord[1]+':unknownFrame:'+AS_direction,'.',JC_region[2]))
+        JC_region=findJCRange(AS_chrom, JC_coord[:2], AS_direction, exonFrameDict, all_orf, microexon_start, microexon_end, ignore_annotation)
+        if JC_region[3][0]:# the start-side ORF frame is known in the annotation
+            fout_JC_region.write('{}\t{}\t{}\t{}\t{}\t{}\n'.format(AS_chrom,JC_region[0][0],JC_region[0][1],AS_chrom+':'+JC_coord[0]+'|'+JC_coord[1]+':uniprotFrame:'+JC_region[2]+':form:'+JC_coord[2],'.',JC_region[2]))
+            fout_JC_region.write('{}\t{}\t{}\t{}\t{}\t{}\n'.format(AS_chrom,JC_region[1][0],JC_region[1][1],AS_chrom+':'+JC_coord[0]+'|'+JC_coord[1]+':uniprotFrame:'+JC_region[2]+':form:'+JC_coord[2],'.',JC_region[2]))
+        elif (all_orf or ignore_annotation): # frame unknown: fall back to the 3-ORF search
+            fout_JC_region.write('{}\t{}\t{}\t{}\t{}\t{}\n'.format(AS_chrom,JC_region[0][0],JC_region[0][1],AS_chrom+':'+JC_coord[0]+'|'+JC_coord[1]+':unknownFrame:'+AS_direction+':form:'+JC_coord[2],'.',JC_region[2]))
+            fout_JC_region.write('{}\t{}\t{}\t{}\t{}\t{}\n'.format(AS_chrom,JC_region[1][0],JC_region[1][1],AS_chrom+':'+JC_coord[0]+'|'+JC_coord[1]+':unknownFrame:'+AS_direction+':form:'+JC_coord[2],'.',JC_region[2]))
 	fout_JC_region.close()
-	JC_pep_fasta=JC2pep(JC_region_bed,ref_genome,outdir,info)# Junction peptides file created. With all forms included regardless of form selection.
-	compPepFile(JC_pep_fasta, outdir, select_form) # Compared Junction peptides file created. Form selection STARTS HERE.
+    JC_pep_fasta_name=JC2pep(JC_region_bed, ref_genome, outdir, info, remove_early_stop, all_orf, pep_dir_prefix)# Junction peptide file created, with all forms included regardless of form selection.
+    compPepFile(JC_pep_fasta_name+'.prot.fa', outdir, pep_dir_prefix, select_form, novel_info) # Compared junction peptide file created. Form selection STARTS HERE.
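JC2pep's body is only partially visible in this patch, but its job (per the hunk header comment) is "getfasta of the coord and translate": the BED records written above are turned into nucleotide sequence from ref_genome before translation. A representative invocation in the codebase's os.system style; the exact command, flags chosen, and output path are assumptions rather than taken from this patch:

import os

outdir = 'IRIS_out'                                    # example values; in AS2pep these come from the caller
JC_region_bed = 'chr1_+_1000_1200_800_1500.JCregion.bed'
ref_genome = 'ucsc.hg19.fasta'

bed_path = outdir + '/tmp/junction/' + JC_region_bed   # written by AS2pep above
fa_path = outdir + '/tmp/rna/' + JC_region_bed + '.fa' # tmp/rna is created in main(); exact file name assumed
# -s honors the BED strand column; -name carries the frame/form-tagged BED name into the FASTA header
os.system('bedtools getfasta -fi ' + ref_genome + ' -bed ' + bed_path + ' -s -name -fo ' + fa_path)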
+ if all_orf: + compPepFile(JC_pep_fasta_name+'_orf1.prot.fa', outdir, pep_dir_prefix, select_form, novel_info) + compPepFile(JC_pep_fasta_name+'_orf2.prot.fa', outdir, pep_dir_prefix, select_form, novel_info) def main(args): orf_mapping_file=config.ORF_MAP_PATH - ref_genome=args.ref_genome #ref_genome='~/nobackup-yxing/references.annotations/37.chr/ucsc.hg19.fasta' + ref_genome=args.ref_genome fin=args.as_input + splicing_event_type=args.splicing_event_type + all_orf=args.all_orf + gtf=args.gtf + microexon_start, microexon_end=loadGTFMicroexonInfo(gtf) + ignore_annotation=args.ignore_annotation + remove_early_stop=args.remove_early_stop outdir=args.outdir.rstrip('/') - select_all=args.no_tumor_form_selection + if_select_all=args.no_tumor_form_selection deltaPSI_cut_off=float(args.deltaPSI_cut_off) deltaPSI_column=int(args.deltaPSI_column)-1 - + check_novel=args.check_novel + pep_dir_prefix='prot' + if all_orf: + pep_dir_prefix='prot_allorf' os.system('mkdir -p '+outdir+'/tmp') - os.system('mkdir -p '+outdir+'/tmp/junction '+outdir+'/tmp/rna '+outdir+'/tmp/prot '+outdir+'/tmp/pred') - os.system('mkdir -p '+outdir+'/tmp/prot.compared '+outdir+'/tmp/prot.compared/skp '+outdir+'/tmp/prot.compared/inc') + os.system('mkdir -p '+outdir+'/tmp/junction '+outdir+'/tmp/rna '+outdir+'/tmp/'+pep_dir_prefix+' '+outdir+'/tmp/pred') + os.system('mkdir -p '+outdir+'/tmp/'+pep_dir_prefix+'.compared '+outdir+'/tmp/'+pep_dir_prefix+'.compared/skp '+outdir+'/tmp/'+pep_dir_prefix+'.compared/inc') exonFrameDict,nomap, mis =loadFrame(orf_mapping_file) print '[INFO] Total exon-orf loaded',len(exonFrameDict), nomap, mis #Select AS version, region and frame, translate to peptides. events_processed=0 tot=config.file_len(fin)-1 + header=[] for i,l in enumerate(open(fin)): if l.startswith('ENSG')==False: + header=l.strip().split('\t') continue config.update_progress(i/(0.0+tot)) - # if i%500==0: - # print i - ls=l.strip('\n').split('\t') + ls=l.strip().split('\t') + ld=dict(zip(header,ls)) events_processed+=1 des=ls[0].split(':') info='_'.join(des[:2]).strip('_').replace('/','+') - AS2pep(des[2],des[4:8],des[3],ls[deltaPSI_column],deltaPSI_cut_off,False,exonFrameDict,ref_genome,outdir,info) + novel_info=[] + if check_novel: + novel_info=ld['novel_ss_info'].split(';') + AS2pep(des[2], des[4:], des[3], ls[deltaPSI_column], deltaPSI_cut_off, False, all_orf, pep_dir_prefix, microexon_start, microexon_end, ignore_annotation, remove_early_stop, splicing_event_type, exonFrameDict, ref_genome, outdir, info, novel_info) print '[INFO] Total processed',events_processed if __name__ == '__main__': diff --git a/IRIS/IRIS_visual_summary.py b/IRIS/IRIS_visual_summary.py new file mode 100644 index 0000000..e8b4160 --- /dev/null +++ b/IRIS/IRIS_visual_summary.py @@ -0,0 +1,1154 @@ +from __future__ import print_function + +import collections +import os +import sys + +import matplotlib +matplotlib.use('agg') # sets the plotting mode to non-interactive + +import matplotlib.gridspec as gridspec +import matplotlib.patches +import matplotlib.pyplot as plt +import numpy as np +import pandas as pd +import seaborn as sns + +COLOR_BLACK = '#000000' +COLOR_BURNT_ORANGE = '#c04e01' +COLOR_CREAM = '#ffffc2' +COLOR_GREEN = '#15b01a' +COLOR_LIGHT_VIOLET = '#d6b4fc' +COLOR_OCHRE = '#bf9005' + +COLOR_BY_PANEL_TYPE = { + 'output': COLOR_BLACK, + 'tissue_matched_normal': COLOR_GREEN, + 'tumor': COLOR_BURNT_ORANGE, + 'normal': COLOR_GREEN +} + +Z_LOWEST = 0 + +# size in points +FONT_SIZE_MEDIUM = 11 +FONT_SIZE_LARGE = 13 + + +def points_to_pixels(points): 
+ # 1 point = (1/72.0) inches + # 1 inch = 96.0 pixels + inches = points / 72.0 + return inches * 96.0 + + +def get_matrix_index(event_id, index_f_name): + with open(index_f_name, 'rt') as f_h: + for line in f_h: + event, offset_str = line.strip().split('\t') + if event == event_id: + offset_int, error = parse_int(offset_str) + if error: + return None, error + + return offset_int, None + + return None, '{} not in {}'.format(event_id, index_f_name) + + +def read_event_row(event_id, matrix_f_name, index_f_name): + row = collections.OrderedDict() + offset, error = get_matrix_index(event_id, index_f_name) + fill_nan = False + if error: + print('{}; filling with NaN values'.format(error), file=sys.stderr) + fill_nan = True + + with open(matrix_f_name, 'rt') as f_h: + raw_headers = f_h.readline().strip().split('\t') + header = np.asarray([ + x.split('.')[0] if x.startswith('SRR') else x for x in raw_headers + ]) + + if fill_nan: + data = np.asarray(['NaN' for _ in header]) + else: + f_h.seek(offset, 0) + data = np.asarray(f_h.readline().strip().split('\t')) + + row = collections.OrderedDict(zip(header, data)) + + return row, None + + +def read_panel_parameters(line): + parameters = dict() + if not line.strip(): + return parameters, None + + splits = line.split(' ') + if len(splits) != 2: + return parameters, 'expected 2 fields in {} but found {}'.format( + line, len(splits)) + + cutoffs_str, groups_str = splits + cutoffs = cutoffs_str.split(',') + groups = groups_str.split(',') + floats = list() + for cutoff in cutoffs: + parsed_float, error = parse_float(cutoff) + if error: + return parameters, error + + floats.append(parsed_float) + + parameters['psi_pval_cutoff'] = floats[0] + parameters['delta_psi_cutoff'] = floats[1] + parameters['foc_cutoff'] = floats[2] + parameters['sjc_pval_cutoff'] = floats[3] + parameters['group_cutoff'] = floats[4] + parameters['groups'] = groups + return parameters, None + + +def read_parameters(parameter_f_name): + parameters = dict() + + with open(parameter_f_name, 'rt') as f_h: + lines = [line.strip() for line in f_h] + + if len(lines) != 10: + return parameters, 'expected 10 lines in {} but found {}'.format( + parameter_f_name, len(lines)) + + parameters['out_prefix'] = lines[0] + parameters['db_dir'] = lines[1] + panel_params = list() + for line in lines[2:5]: + this_panel_params, error = read_panel_parameters(line) + if error: + return parameters, error + + panel_params.append(this_panel_params) + + parameters['tissue_matched_normal_panel'] = panel_params[0] + parameters['tumor_panel'] = panel_params[1] + parameters['normal_panel'] = panel_params[2] + parameters['test_mode'] = lines[5] + parameters['use_ratio'] = lines[6] == 'True' + parameters['blacklist'] = lines[7] + parameters['mappability_path'] = lines[8] + parameters['ref_genome_path'] = lines[9] + + return parameters, None + + +def get_group_to_panel_type(parameters): + panel_tmn = parameters['tissue_matched_normal_panel'].get('groups', list()) + panel_t = parameters['tumor_panel'].get('groups', list()) + panel_n = parameters['normal_panel'].get('groups', list()) + output_group = parameters['out_prefix'] + + group_to_panel_type = collections.OrderedDict() + group_to_panel_type[output_group] = 'output' + + for group in panel_tmn: + group_to_panel_type[group] = 'tissue_matched_normal' + + for group in panel_t: + group_to_panel_type[group] = 'tumor' + + for group in panel_n: + group_to_panel_type[group] = 'normal' + + return group_to_panel_type + + +def get_matrix_file_names(groups, parameters, 
event_type): + group_to_matrix_f_name = collections.OrderedDict() + group_to_matrix_index_f_name = collections.OrderedDict() + + if not groups: + return group_to_matrix_f_name, group_to_matrix_index_f_name, 'no groups specified' + + db_dir = parameters['db_dir'] + for group_name in groups: + splicing_f_name = 'splicing_matrix.{}.cov10.{}.txt'.format( + event_type, group_name) + group_to_matrix_f_name[group_name] = os.path.join( + db_dir, group_name, 'splicing_matrix', splicing_f_name) + + for group, matrix_f_name in group_to_matrix_f_name.items(): + if not os.path.isfile(matrix_f_name): + error = 'no matrix file found for {}. expected {}'.format( + group, matrix_f_name) + return group_to_matrix_f_name, group_to_matrix_index_f_name, error + + index_f_name = '{}.idx'.format(matrix_f_name) + if not os.path.isfile(index_f_name): + error = 'no index file found for {}. expected {}'.format( + group, matrix_f_name) + return group_to_matrix_f_name, group_to_matrix_index_f_name, error + + group_to_matrix_index_f_name[group] = index_f_name + + return group_to_matrix_f_name, group_to_matrix_index_f_name, None + + +def get_events(parameters, screening_out_dir, event_type): + events = dict() + + for variant in ['tier1', 'tier2tier3']: + variant_events = list() + events[variant] = variant_events + file_name = '{}.{}.{}.txt'.format(parameters['out_prefix'], event_type, + variant) + file_path = os.path.join(screening_out_dir, file_name) + if not os.path.isfile(file_path): + return events, 'missing required file: {}'.format(file_path) + + event_id_index = None + with open(file_path, 'rt') as f_h: + for i, line in enumerate(f_h): + splits = line.strip().split('\t') + if i == 0: # header + event_id_header = 'as_event' + if event_id_header not in splits: + return events, 'required header {} not found in {}'.format( + event_id_header, file_path) + + event_id_index = splits.index(event_id_header) + continue + + if len(splits) <= event_id_index: + return events, 'no {} for line {} of {}'.format( + event_id_header, i, file_path) + + event_id = splits[event_id_index] + variant_events.append(event_id) + + return events, None + + +def get_psi_data_by_event(group_to_matrix_f_name, group_to_matrix_index_f_name, + events): + psi_data_by_event = collections.OrderedDict() + tier2tier3_events = events['tier2tier3'] + if not tier2tier3_events: + return psi_data_by_event, 'no tier2tier3 events' + + groups = group_to_matrix_f_name.keys() + for event_id in tier2tier3_events: + psi_by_group = collections.OrderedDict() + all_psi = list() + for group in groups: + row, error = read_event_row(event_id, + group_to_matrix_f_name[group], + group_to_matrix_index_f_name[group]) + if error: + return psi_data_by_event, error + + psi_strings = list(row.values())[8:] + psi_floats = list() + for psi_s in psi_strings: + psi_f, error = parse_float(psi_s) + if error: + return psi_data_by_event, error + + psi_floats.append(psi_f) + + psi_by_group[group] = psi_floats + all_psi.extend(psi_floats) + + abs_change = max(all_psi) - min(all_psi) + if abs_change < 0.05: + continue + + psi_df = pd.DataFrame.from_dict(psi_by_group, + orient='index').transpose()[groups] + psi_data_by_event[event_id] = psi_df + + return psi_data_by_event, None + + +def add_or_verify_match(key, source_dict, dest_dict, parser): + if key not in source_dict: + return 'missing {}'.format(key) + + source_v = source_dict[key] + parsed_source_v, error = parser(source_v) + if error: + return error + + if key in dest_dict: + existing_v = dest_dict[key] + if parsed_source_v == 
existing_v: + return None + + return 'differing values for {}: {}, {}'.format( + key, parsed_source_v, existing_v) + + dest_dict[key] = parsed_source_v + return None + + +def read_tsv(f_name): + rows = list() + header = list() + with open(f_name, 'rt') as f_h: + for i, line in enumerate(f_h): + tokens = line.strip().split('\t') + if i == 0: + header = tokens + continue + + if len(tokens) != len(header): + return header, rows, 'expected {} columns at line {} of {} but found {}'.format( + len(header), i, f_name, len(tokens)) + + rows.append(dict(zip(header, tokens))) + + return header, rows, None + + +def get_hlas_with_binding_affinity(hla_headers, row, row_num, f_name): + hlas_with_binding_affinity = list() + for header in hla_headers: + full_str = row[header] + if full_str == '-': + continue + + col_error_prefix = 'row {} in {} with column {}={}: '.format( + row_num, f_name, header, full_str) + semi_splits = full_str.split(';') + for semi_split in semi_splits: + pipe_splits = semi_split.split('|') + if len(pipe_splits) != 2: + return hlas_with_binding_affinity, '{}expected exactly 1 "|" in {}'.format( + col_error_prefix, semi_split) + + hla_str, binding_str = pipe_splits + hla_prefix = 'HLA-' + if not hla_str.startswith(hla_prefix): + return hlas_with_binding_affinity, '{}expected a value starting with "{}"'.format( + col_error_prefix, hla_prefix) + + hla_sub_str = hla_str[len(hla_prefix):] + binding_float, error = parse_float(binding_str) + if error: + return hlas_with_binding_affinity, '{}{}'.format( + col_error_prefix, error) + + hlas_with_binding_affinity.append((hla_sub_str, binding_float)) + + return hlas_with_binding_affinity, None + + +def parse_int_ratio(s): + splits = s.split('/') + if len(splits) != 2: + return None, 'expected exactly one "/" in {}'.format(s) + + ints = list() + for split in splits: + as_int, error = parse_int(split) + if error: + return None, error + + ints.append(as_int) + + return ints, None + + +def parse_float(s): + try: + as_float = float(s) + except ValueError as e: + return None, 'could not parse {} as float: {}'.format(s, e) + + return as_float, None + + +def parse_int(s): + try: + as_int = int(s) + except ValueError as e: + return None, 'could not parse {} as int: {}'.format(s, e) + + return as_int, None + + +def process_epitope_summary_row(row, row_num, summary, hla_headers, f_name, + has_prediction): + event_id = row['as_event'] + event_summary = summary.get(event_id) + if not event_summary: + event_summary = dict() + summary[event_id] = event_summary + + event_epitopes_hla_affinity_patients = event_summary.get('epitopes') + if not event_epitopes_hla_affinity_patients: + event_epitopes_hla_affinity_patients = list() + event_summary['epitopes'] = event_epitopes_hla_affinity_patients + + row_error_prefix = 'row {} in {}: '.format(row_num, f_name) + int_ratio_keys = [ + 'tissue_matched_normal_panel', 'tumor_panel', 'normal_panel' + ] + for key in int_ratio_keys: + error = add_or_verify_match(key, row, event_summary, parse_int_ratio) + if error: + return '{}{}'.format(row_error_prefix, error) + + error = add_or_verify_match('fc_of_tumor_isoform', row, event_summary, + parse_float) + if error: + return '{}{}'.format(row_error_prefix, error) + + if not has_prediction: + return None + + hlas_with_binding_affinity, error = get_hlas_with_binding_affinity( + hla_headers, row, row_num, f_name) + if error: + return error + + if not hlas_with_binding_affinity: + return '{}no HLAs found'.format(row_error_prefix) + + hlas_with_binding_affinity.sort(key=lambda p: 
p[1]) + hla, binding_affinity = hlas_with_binding_affinity[0] + event_epitopes_hla_affinity_patients.append({ + 'epitope': + row['epitope'], + 'hla': + hla, + 'binding_affinity': + binding_affinity, + 'num_patients': + row['num_sample'] + }) + + return None + + +def get_epitope_summary(screening_out_dir, has_prediction, parameters, + event_type): + summary = dict() + + out_prefix = parameters['out_prefix'] + if has_prediction: + summary_f_name = os.path.join(screening_out_dir, + '{}.tier2tier3'.format(event_type), + 'epitope_summary.peptide-based.txt') + else: + summary_f_name = os.path.join( + screening_out_dir, + '{}.{}.tier2tier3.txt'.format(out_prefix, event_type)) + + header, rows, error = read_tsv(summary_f_name) + if error: + return summary, error + + expected_columns = { + 'as_event', 'meanPSI', 'Q1PSI', 'Q3PSI', 'deltaPSI', + 'fc_of_tumor_isoform', 'tissue_matched_normal_panel', 'tumor_panel', + 'normal_panel', 'tag', 'mappability', 'mappability_tag' + } + if has_prediction: + expected_columns = expected_columns.union({ + 'epitope', 'junction_peptide_form', 'inclusion_form', 'num_hla', + 'num_sample', 'hla_types', 'canonical_match', 'uniqueness' + }) + + ignored_optional_columns = {'meanGeneExp', 'Q1GeneExp', 'Q3GeneExp'} + header_set = set(header) + missing_headers = expected_columns.difference(header_set) + if missing_headers: + return summary, 'missing headers in {}: {}'.format( + summary_f_name, ', '.join(sorted(missing_headers))) + + ignored_columns = expected_columns.union(ignored_optional_columns) + extra_headers = header_set.difference(ignored_columns) + for i, row in enumerate(rows): + error = process_epitope_summary_row(row, i, summary, extra_headers, + summary_f_name, has_prediction) + if error: + return summary, error + + if has_prediction: + for event_summary in summary.values(): + event_summary['epitopes'].sort(key=lambda p: p['binding_affinity']) + + return summary, None + + +def remove_spines(ax): + for spine in ax.spines.values(): + spine.set_visible(False) + + +def remove_ticks_and_labels(ax): + ax.set_xticks(list()) + ax.set_yticks(list()) + + +def remove_spines_ticks_and_labels(ax): + remove_spines(ax) + remove_ticks_and_labels(ax) + + +def hide_ax(ax): + ax.set_visible(False) + + +def hide_ax_except_color(ax, color): + ax.clear() + ax.set_zorder(Z_LOWEST) + remove_spines_ticks_and_labels(ax) + ax.set_facecolor(color) + + +def make_violin_plots(events, psi_data_by_event, groups, group_to_panel_type, + axes_by_name): + gene_header_ax, gene_axes = axes_by_name['gene_name'] + violins_header_ax, violins_axes = axes_by_name['violins'] + violins_y_ticks_header_ax, violins_y_ticks_axes = axes_by_name[ + 'violins_y_ticks'] + + hide_ax_except_color(violins_y_ticks_header_ax, COLOR_CREAM) + for y_tick_ax in violins_y_ticks_axes: + hide_ax(y_tick_ax) + + make_gene_header(gene_header_ax) + + if len(events) != len(psi_data_by_event) or len(events) != len( + gene_axes) or len(events) != len(violins_axes): + return 'all inputs must have the same length' + + sns.set(style="white", color_codes=True) + for i, event in enumerate(events): + event = events[i] + psi_data = psi_data_by_event.get(event) + if psi_data is None: + return 'no psi data for {}'.format(event) + + gene_ax = gene_axes[i] + violins_ax = violins_axes[i] + + sns.violinplot(data=psi_data, + ax=violins_ax, + inner="box", + cut=0, + scale='width', + linewidth=1.5) + + violins_ax.set_yticks(np.arange(0, 1.1, 0.5)) + violins_ax.set_xticklabels(list()) + violins_ax.xaxis.set_tick_params(which='both', length=0) + 
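The per-event violins above are plain seaborn calls on a DataFrame whose columns are the panel groups (as built by get_psi_data_by_event); the rest of make_violin_plots is axis cosmetics. A self-contained sketch of the same call pattern, with synthetic PSI values and hypothetical group names:

import matplotlib
matplotlib.use('agg')  # non-interactive backend, as in this module

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

rng = np.random.RandomState(0)
# One column per group; the names here are made up for illustration.
psi_df = pd.DataFrame({
    'GBM': rng.uniform(0.55, 0.95, 40),
    'Brain': rng.uniform(0.05, 0.35, 40),
    'Heart': rng.uniform(0.05, 0.35, 40),
})

fig, ax = plt.subplots()
sns.violinplot(data=psi_df, ax=ax, inner='box', cut=0, scale='width', linewidth=1.5)
ax.set_yticks(np.arange(0, 1.1, 0.5))
sns.despine(ax=ax, offset=0, trim=False)
fig.savefig('psi_violins.png')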
sns.despine(ax=violins_ax, offset=0, trim=False) + + remove_spines_ticks_and_labels(gene_ax) + gene_name = event.split(':')[1] + gene_ax.text(0.25, + 0.5, + gene_name, + horizontalalignment='center', + verticalalignment='center', + transform=gene_ax.transAxes, + style='italic', + size=FONT_SIZE_LARGE, + color=COLOR_BLACK, + fontweight='bold') + + top_violin_ax = violins_axes[0] + make_violins_header(violins_header_ax, top_violin_ax, groups, + group_to_panel_type) + return None + + +def make_gene_header(gene_header_ax): + gene_header_ax.text(0.25, + 0, + 'Gene of\nAS event', + horizontalalignment='center', + verticalalignment='bottom', + transform=gene_header_ax.transAxes, + size=FONT_SIZE_LARGE, + color=COLOR_BLACK, + fontweight='bold') + + remove_spines_ticks_and_labels(gene_header_ax) + + +def make_violins_header(violin_header_ax, top_violin_ax, groups, + group_to_panel_type): + violin_xticks_data_coords = [(x, 0) for x in top_violin_ax.get_xticks()] + violin_xticks_display_coords = top_violin_ax.transData.transform( + violin_xticks_data_coords) + violin_tick_label_y_display_coord = violin_header_ax.transAxes.transform_point( + (0, 0))[1] + violin_tick_label_display_coords = [(p[0], + violin_tick_label_y_display_coord) + for p in violin_xticks_display_coords] + trans_display_to_axes = violin_header_ax.transAxes.inverted( + ).transform_point + for i, coord in enumerate(violin_tick_label_display_coords): + x, y = trans_display_to_axes(coord) + group = groups[i] + color = COLOR_BY_PANEL_TYPE[group_to_panel_type[group]] + violin_header_ax.text(x, + y, + group, + horizontalalignment='center', + verticalalignment='bottom', + rotation=90, + transform=violin_header_ax.transAxes, + size=FONT_SIZE_MEDIUM, + color=color, + fontweight='bold') + + violin_header_ax.text(0.5, + 1, + 'PSI by tissue or tumor', + horizontalalignment='center', + verticalalignment='top', + transform=violin_header_ax.transAxes, + size=FONT_SIZE_LARGE, + color=COLOR_BLACK, + fontweight='bold') + text_height = points_to_pixels(FONT_SIZE_LARGE) + top_text_y_max_display = violin_header_ax.transAxes.transform_point( + (0, 1))[1] + top_text_y_min_display = top_text_y_max_display - text_height + top_text_y_min_axes = trans_display_to_axes((0, top_text_y_min_display))[1] + underline_x_min_display = violin_xticks_display_coords[0][0] + underline_x_max_display = violin_xticks_display_coords[-1][0] + underline_x_min_axes = trans_display_to_axes( + (underline_x_min_display, 0))[0] + underline_x_max_axes = trans_display_to_axes( + (underline_x_max_display, 0))[0] + violin_header_ax.plot([underline_x_min_axes, underline_x_max_axes], + [top_text_y_min_axes] * 2, + color=COLOR_BLACK, + linestyle='solid', + transform=violin_header_ax.transAxes) + + remove_spines_ticks_and_labels(violin_header_ax) + violin_header_ax.set_facecolor(COLOR_CREAM) + + +def create_grid_of_axes(fig, num_events, has_prediction): + content_rows = num_events + 1 + spacer_rows = content_rows - 1 + content_rows_to_spacer = 9 + extra_header_room = content_rows_to_spacer + grid_rows = (content_rows_to_spacer * content_rows) + spacer_rows + grid_rows += extra_header_room + + grid_col_names_and_widths = [('gene_name', 3), ('violins_y_ticks', 2), + ('violins', 13), ('tissue_matched', 1), + ('fold_change', 2), ('tumor', 1), + ('normal', 1)] + if has_prediction: + grid_col_names_and_widths.extend([('epitopes', 5), ('hlas', 3), + ('num_patients', 2), + ('affinity', 3)]) + + grid_col_intervals_by_name = dict() + grid_cols = 0 + for name, width in grid_col_names_and_widths: + 
start_cols = grid_cols + grid_cols += width + grid_col_intervals_by_name[name] = (start_cols, grid_cols) + + grid = gridspec.GridSpec(grid_rows, grid_cols, wspace=0, hspace=0) + + filler_rows = list() + axes_by_name = dict() + for name, col_interval in grid_col_intervals_by_name.items(): + start_col, end_col = col_interval + axes = list() + + row = 0 + for _ in range(0, content_rows): + is_header = row == 0 + if not is_header: + filler_rows.append( + fig.add_subplot(grid[row, start_col:end_col])) + row += 1 + + start_row = row + row += content_rows_to_spacer + if is_header: + row += extra_header_room + + axes.append(fig.add_subplot( + grid[start_row:row, start_col:end_col])) + + axes_by_name[name] = (axes[0], axes[1:]) + + return axes_by_name, filler_rows + + +def make_shaded_dots_column(header_ax, dots_axes, header_text, events, + epitope_summary, summary_ratio_key, color): + header_ax.text(0.5, + 0, + 'vs. ', + horizontalalignment='center', + verticalalignment='bottom', + rotation=90, + transform=header_ax.transAxes, + size=FONT_SIZE_MEDIUM, + color=COLOR_BLACK, + fontweight='bold') + + y_min_display = header_ax.transAxes.transform_point((0, 0))[1] + pixels_per_large_char = points_to_pixels(FONT_SIZE_MEDIUM) + # 'vs. ' is about two large characters in the default variable width font + pixels_of_text = pixels_per_large_char * 2 + vs_space_y_max_display = y_min_display + pixels_of_text + + trans_display_to_axes = header_ax.transAxes.inverted().transform_point + vs_space_y_max_axes = trans_display_to_axes((0, vs_space_y_max_display))[1] + # put the header_text after 'vs. ' in a (possibly) different color + header_ax.text(0.5, + vs_space_y_max_axes, + header_text, + horizontalalignment='center', + verticalalignment='bottom', + rotation=90, + transform=header_ax.transAxes, + size=FONT_SIZE_MEDIUM, + color=color, + fontweight='bold') + + remove_spines_ticks_and_labels(header_ax) + header_ax.set_facecolor(COLOR_CREAM) + + for i, event in enumerate(events): + event_summary = epitope_summary.get(event) + if not event_summary: + return 'no event {} in epitope_summary'.format(event) + + hits, total = event_summary[summary_ratio_key] + if total == 0: + percent = 0 + else: + percent = hits / float(total) + + dots_ax = dots_axes[i] + xmin_display, ymin_display = dots_ax.transAxes.transform_point((0, 0)) + xmax_display, ymax_display = dots_ax.transAxes.transform_point((1, 1)) + x_spread_display = xmax_display - xmin_display + y_spread_display = ymax_display - ymin_display + # dots_ax is a rectangle. 
+ # Normalize the axes scale so a circle is not distorted + x_mid_data = 0.5 + y_mid_data = 0.5 + if x_spread_display > y_spread_display: + new_scale = x_spread_display / float(y_spread_display) + half_new_scale = new_scale / 2.0 + dots_ax.set_xlim((0, new_scale)) + x_mid_data = half_new_scale + else: + new_scale = y_spread_display / float(x_spread_display) + half_new_scale = new_scale / 2.0 + dots_ax.set_ylim((0, new_scale)) + y_mid_data = half_new_scale + + circle = matplotlib.patches.Circle((x_mid_data, y_mid_data), + radius=0.25, + alpha=percent, + color=COLOR_OCHRE, + transform=dots_ax.transData) + dots_ax.add_patch(circle) + + remove_spines_ticks_and_labels(dots_ax) + + return None + + +def show_fold_change_values(header_ax, fc_axes, events, epitope_summary): + header_ax.text(0.5, + 0, + 'FC of tumor\nisoform', + horizontalalignment='center', + verticalalignment='bottom', + rotation=90, + transform=header_ax.transAxes, + size=FONT_SIZE_MEDIUM, + color=COLOR_BLACK, + fontweight='bold') + remove_spines_ticks_and_labels(header_ax) + header_ax.set_facecolor(COLOR_CREAM) + + for i, event in enumerate(events): + event_summary = epitope_summary.get(event) + if not event_summary: + return 'no event {} in epitope_summary'.format(event) + + fc = event_summary['fc_of_tumor_isoform'] + fc_ax = fc_axes[i] + fc_ax.text(0.5, + 0.5, + '{:.1f}'.format(fc), + horizontalalignment='center', + verticalalignment='center', + transform=fc_ax.transAxes, + size=FONT_SIZE_MEDIUM) + remove_spines_ticks_and_labels(fc_ax) + + return None + + +def make_top_underline_header(left_header_ax, right_header_ax, other_axes, + header_text): + # need to plot on all axes using display coordinates to avoid clipping + all_axes = [left_header_ax, right_header_ax] + other_axes + + x_min_display = left_header_ax.transAxes.transform_point((0, 0))[0] + x_max_display = right_header_ax.transAxes.transform_point((1, 0))[0] + x_mid_display = (x_min_display + x_max_display) / 2.0 + y_max_display = left_header_ax.transAxes.transform_point((0, 1))[1] + x_range_display = x_max_display - x_min_display + underline_margin_display = 0.02 * x_range_display + underline_x_min_display = x_min_display + underline_margin_display + underline_x_max_display = x_max_display - underline_margin_display + for ax in all_axes: + trans_display_to_axes = ax.transAxes.inverted().transform_point + x_mid_axes, y_max_axes = trans_display_to_axes( + (x_mid_display, y_max_display)) + ax.text(x_mid_axes, + y_max_axes, + header_text, + horizontalalignment='center', + verticalalignment='top', + transform=ax.transAxes, + size=FONT_SIZE_LARGE, + color=COLOR_BLACK, + fontweight='bold', + clip_on=True) + + text_y_min_display = y_max_display - points_to_pixels(FONT_SIZE_LARGE) + + for ax in all_axes: + trans_display_to_axes = ax.transAxes.inverted().transform_point + underline_x_min_axes, text_y_min_axes = trans_display_to_axes( + (underline_x_min_display, text_y_min_display)) + underline_x_max_axes = trans_display_to_axes( + (underline_x_max_display, 0))[0] + ax.plot([underline_x_min_axes, underline_x_max_axes], + [text_y_min_axes] * 2, + color=COLOR_BLACK, + linestyle='solid', + transform=ax.transAxes, + clip_on=True) + + +def make_shaded_dots_and_show_fold_change(events, epitope_summary, + axes_by_name): + tmn_header_ax, tmn_axes = axes_by_name['tissue_matched'] + t_header_ax, t_axes = axes_by_name['tumor'] + n_header_ax, n_axes = axes_by_name['normal'] + fc_header_ax, fc_axes = axes_by_name['fold_change'] + + error = make_shaded_dots_column( + tmn_header_ax, tmn_axes, 
'Tissue Matched', events, epitope_summary, + 'tissue_matched_normal_panel', + COLOR_BY_PANEL_TYPE['tissue_matched_normal']) + if error: + return error + + error = make_shaded_dots_column(t_header_ax, t_axes, 'Tumor', events, + epitope_summary, 'tumor_panel', + COLOR_BY_PANEL_TYPE['tumor']) + if error: + return error + + error = make_shaded_dots_column(n_header_ax, n_axes, 'Normal', events, + epitope_summary, 'normal_panel', + COLOR_BY_PANEL_TYPE['normal']) + if error: + return error + + show_fold_change_values(fc_header_ax, fc_axes, events, epitope_summary) + + make_top_underline_header(tmn_header_ax, n_header_ax, + [t_header_ax, fc_header_ax], 'Summary') + + return None + + +def show_epitope_names(header_ax, epitope_axes, events, epitope_summary): + header_ax.text(0.5, + 0, + 'Junction\nepitopes', + horizontalalignment='center', + verticalalignment='bottom', + transform=header_ax.transAxes, + size=FONT_SIZE_LARGE, + color=COLOR_BLACK, + fontweight='bold') + remove_spines_ticks_and_labels(header_ax) + header_ax.set_facecolor(COLOR_LIGHT_VIOLET) + + for i, event in enumerate(events): + event_summary = epitope_summary.get(event) + if not event_summary: + return 'no event {} in epitope_summary'.format(event) + + epitopes = event_summary['epitopes'] + epitope_names = [e['epitope'] for e in epitopes[:2]] + name_text = '\n'.join(epitope_names) + epitope_ax = epitope_axes[i] + epitope_ax.text(0.5, + 0.5, + name_text, + horizontalalignment='center', + verticalalignment='center', + transform=epitope_ax.transAxes, + size=FONT_SIZE_MEDIUM) + remove_spines_ticks_and_labels(epitope_ax) + + return None + + +def show_hla_names(header_ax, hla_axes, events, epitope_summary): + header_ax.text(0.5, + 0, + 'Best\nHLA', + horizontalalignment='center', + verticalalignment='bottom', + transform=header_ax.transAxes, + size=FONT_SIZE_LARGE, + color=COLOR_BLACK, + fontweight='bold') + remove_spines_ticks_and_labels(header_ax) + header_ax.set_facecolor(COLOR_LIGHT_VIOLET) + + for i, event in enumerate(events): + event_summary = epitope_summary.get(event) + if not event_summary: + return 'no event {} in epitope_summary'.format(event) + + epitopes = event_summary['epitopes'] + epitope_hlas = [e['hla'] for e in epitopes[:2]] + hla_text = '\n'.join(epitope_hlas) + hla_ax = hla_axes[i] + hla_ax.text(0.5, + 0.5, + hla_text, + horizontalalignment='center', + verticalalignment='center', + transform=hla_ax.transAxes, + size=FONT_SIZE_MEDIUM) + remove_spines_ticks_and_labels(hla_ax) + + return None + + +def show_num_patients(header_ax, patient_axes, events, epitope_summary): + header_ax.text(0.5, + 0, + '# Pt.\nw/HLA', + horizontalalignment='center', + verticalalignment='bottom', + transform=header_ax.transAxes, + size=FONT_SIZE_LARGE, + color=COLOR_BLACK, + fontweight='bold') + remove_spines_ticks_and_labels(header_ax) + header_ax.set_facecolor(COLOR_LIGHT_VIOLET) + + for i, event in enumerate(events): + event_summary = epitope_summary.get(event) + if not event_summary: + return 'no event {} in epitope_summary'.format(event) + + epitopes = event_summary['epitopes'] + epitope_patients = [e['num_patients'] for e in epitopes[:2]] + patient_text = '\n'.join(epitope_patients) + patient_ax = patient_axes[i] + patient_ax.text(0.5, + 0.5, + patient_text, + horizontalalignment='center', + verticalalignment='center', + transform=patient_ax.transAxes, + size=FONT_SIZE_MEDIUM) + remove_spines_ticks_and_labels(patient_ax) + + return None + + +def show_binding_affinity(header_ax, affinity_axes, events, epitope_summary): + 
header_ax.text(0.5, + 0, + 'IC$_{50}$\n(nM)', + horizontalalignment='center', + verticalalignment='bottom', + transform=header_ax.transAxes, + size=FONT_SIZE_LARGE, + color=COLOR_BLACK, + fontweight='bold') + remove_spines_ticks_and_labels(header_ax) + header_ax.set_facecolor(COLOR_LIGHT_VIOLET) + + for i, event in enumerate(events): + event_summary = epitope_summary.get(event) + if not event_summary: + return 'no event {} in epitope_summary'.format(event) + + epitopes = event_summary['epitopes'] + affinities = [ + str(int(round(e['binding_affinity']))) for e in epitopes[:2] + ] + affinity_text = '\n'.join(affinities) + affinity_ax = affinity_axes[i] + affinity_ax.text(0.5, + 0.5, + affinity_text, + horizontalalignment='center', + verticalalignment='center', + transform=affinity_ax.transAxes, + size=FONT_SIZE_MEDIUM) + remove_spines_ticks_and_labels(affinity_ax) + + return None + + +def show_epitope_bindings(events, epitope_summary, axes_by_name): + epitope_header_ax, epitope_axes = axes_by_name['epitopes'] + hla_header_ax, hla_axes = axes_by_name['hlas'] + patient_header_ax, patient_axes = axes_by_name['num_patients'] + affinity_header_ax, affinity_axes = axes_by_name['affinity'] + + error = show_epitope_names(epitope_header_ax, epitope_axes, events, + epitope_summary) + if error: + return error + + error = show_hla_names(hla_header_ax, hla_axes, events, epitope_summary) + if error: + return error + + error = show_num_patients(patient_header_ax, patient_axes, events, + epitope_summary) + if error: + return error + + error = show_binding_affinity(affinity_header_ax, affinity_axes, events, + epitope_summary) + if error: + return error + + make_top_underline_header(epitope_header_ax, affinity_header_ax, + [hla_header_ax, patient_header_ax], + 'Predicted HLA-epitope binding') + + return None + + +def make_plots(psi_data_by_event, epitope_summary, group_to_panel_type, + out_file_name, has_prediction): + num_events = len(psi_data_by_event) + if num_events == 0: + return 'no events to plot' + + fig_width = 8 + if has_prediction: + fig_width = 12 + + # 1.5 inches per row. + # The header is given 2 rows worth of height when creating the grid. 
+ fig_height = 1.5 * (num_events + 2) + fig = plt.figure(figsize=(fig_width, fig_height), constrained_layout=False) + + events = list(psi_data_by_event.keys()) + groups = psi_data_by_event[events[0]].columns + + axes_by_name, filler_rows = create_grid_of_axes(fig, num_events, + has_prediction) + for filler_row in filler_rows: + hide_ax(filler_row) + + error = make_violin_plots(events, psi_data_by_event, groups, + group_to_panel_type, axes_by_name) + if error: + return error + + error = make_shaded_dots_and_show_fold_change(events, epitope_summary, + axes_by_name) + if error: + return error + + if has_prediction: + error = show_epitope_bindings(events, epitope_summary, axes_by_name) + if error: + return error + + plt.savefig(out_file_name) + plt.close(fig) + return None + + +def filter_events(events, epitope_summary): + variants = list(events.keys()) + for variant in variants: + event_list = events[variant] + filtered = list() + for event in event_list: + # filter to events that are in epitope summary + if event in epitope_summary: + filtered.append(event) + # only keep up to 10 events + if len(filtered) == 10: + break + + events[variant] = filtered + + +def exit_with_error(error): + print(error, file=sys.stderr) + sys.exit(1) + + +def main(args): + event_type = args.splicing_event_type + has_prediction = not args.no_prediction + + parameters, error = read_parameters(args.parameter_fin) + if error: + exit_with_error(error) + + group_to_panel_type = get_group_to_panel_type(parameters) + groups = list(group_to_panel_type.keys()) + + group_to_matrix_f_name, group_to_matrix_index_f_name, error = get_matrix_file_names( + groups, parameters, event_type) + if error: + exit_with_error(error) + + events, error = get_events(parameters, args.screening_out_dir, event_type) + if error: + exit_with_error(error) + + epitope_summary, error = get_epitope_summary(args.screening_out_dir, + has_prediction, parameters, + event_type) + if error: + exit_with_error(error) + + filter_events(events, epitope_summary) + + psi_data_by_event, error = get_psi_data_by_event( + group_to_matrix_f_name, group_to_matrix_index_f_name, events) + if error: + exit_with_error(error) + + error = make_plots(psi_data_by_event, epitope_summary, group_to_panel_type, + args.out_file_name, has_prediction) + if error: + exit_with_error(error) diff --git a/IRIS/config.py b/IRIS/config.py index ee35241..945bf89 100644 --- a/IRIS/config.py +++ b/IRIS/config.py @@ -10,7 +10,7 @@ #import yaml -CURRENT_VERSION = "v1.0" +CURRENT_VERSION = "v2.0.0" def update_progress(progress): @@ -40,22 +40,12 @@ def file_len(fin): # For screening and translation -BRAIN_BLACKLIST_PATH = resource_filename('IRIS.data','brain_blacklistMay.txt') +BRAIN_BLACKLIST_PATH = resource_filename('IRIS.data','blacklist.brain_2020.txt') ORF_MAP_PATH = resource_filename('IRIS.data','uniprot2gtf.blastout.uniprotAll.txt') ## For TCR mapping EXTRACELLULAR_FEATURES_UNIPROT2GTF_MAP_PATH = resource_filename('IRIS.data','features.uniprot2gtf.ExtraCell.txt') -# ## For HLA typing -# SEQ2HLA_PATH = resource_filename('IRIS.utilities.seq2hla', 'seq2HLA.py') -# FOURDIGITS_PATH = resource_filename('IRIS.utilities.seq2hla', 'fourdigits.py') - -## For qsub -QSUB_PREDICTION_CONFIG='h_data=15G,h_rt=5:00:00' -QSUB_ALIGNMENT_CONFIG='h_data=38G,h_rt=4:30:00' -QSUB_EXPRESSION_CONFIG='h_data=8G,h_rt=14:00:00' -QSUB_RMATS_PREP_CONFIG='h_data=4G,h_rt=5:00:00' - ## For proteogenomics UNIPROT_ENSG_ID_MAP_PATH = resource_filename('IRIS.data','UniprotENSGmap.txt') diff --git 
a/IRIS/data/blacklist.brain_2020.txt b/IRIS/data/blacklist.brain_2020.txt new file mode 100644 index 0000000..0441015 --- /dev/null +++ b/IRIS/data/blacklist.brain_2020.txt @@ -0,0 +1,369 @@ +ENSG00000005810:MYCBP2:chr13:-:77692474:77692654:77673148:77695507 +ENSG00000006282:SPATA20:chr17:+:48625080:48625315:48624646:48625643 +ENSG00000008394:MGST1:chr12:+:16507164:16507204:16500644:16510538 +ENSG00000008869:HEATR5B:chr2:-:37217790:37217942:37216002:37227728 +ENSG00000011465:DCN:chr12:-:91573138:91573463:91572362:91576431 +ENSG00000013561:RNF14:chr5:+:141350268:141350442:141348732:141353147 +ENSG00000031823:RANBP3:chr19:-:5957928:5957979:5941846:5978071 +ENSG00000047579:DTNBP1:chr6:-:15651543:15651639:15638035:15652317 +ENSG00000050130:JKAMP:chr14:+:59953398:59953522:59951311:59954391 +ENSG00000050130:JKAMP:chr14:+:59953448:59953522:59951311:59954391 +ENSG00000050426:LETMD1:chr12:+:51447594:51447643:51442968:51450132 +ENSG00000055917:PUM2:chr2:-:20478343:20478580:20463221:20482707 +ENSG00000055950:MRPL43:chr10:-:102743704:102743831:102743574:102746505 +ENSG00000055950:MRPL43:chr10:-:102746846:102746953:102743574:102747069 +ENSG00000059588:TARBP1:chr1:-:234536598:234536694:234534299:234536926 +ENSG00000061676:NCKAP1:chr2:-:183889705:183889723:183888644:183902719 +ENSG00000062598:ELMO2:chr20:-:45027335:45027410:45023171:45035186 +ENSG00000063854:HAGH:chr16:-:1876507:1876603:1873038:1876712 +ENSG00000064115:TM7SF3:chr12:-:27156168:27156323:27152609:27167010 +ENSG00000064115:TM7SF3:chr12:-:27156168:27156338:27152609:27167010 +ENSG00000065029:ZNF76:chr6:+:35261527:35261692:35260821:35262232 +ENSG00000067064:IDI1:chr10:-:1089938:1090111:1089333:1094803 +ENSG00000067836:ROGDI:chr16:-:4851050:4851322:4850579:4851503 +ENSG00000071564:TCF3:chr19:-:1612205:1612432:1611848:1615283 +ENSG00000073350:LLGL2:chr17:+:73570533:73570576:73570341:73570690 +ENSG00000074266:EED:chr11:+:85963189:85963282:85961490:85966263 +ENSG00000075234:TTC38:chr22:+:46688099:46688225:46685796:46688687 +ENSG00000075711:DLG1:chr3:-:196802707:196802741:196796131:196803456 +ENSG00000078549:ADCYAP1R1:chr7:+:31120227:31120248:31117713:31123755 +ENSG00000078668:VDAC3:chr8:+:42254195:42254198:42252651:42256229 +ENSG00000079819:EPB41L2:chr6:-:131188598:131188721:131184858:131206235 +ENSG00000080822:CLDND1:chr3:-:98240496:98240547:98240281:98241692 +ENSG00000080823:MOK:chr14:-:102729882:102729953:102718332:102749814 +ENSG00000080845:DLGAP4:chr20:+:35127990:35128079:35127724:35128601 +ENSG00000085274:MYNN:chr3:+:169501264:169501348:169500431:169502409 +ENSG00000085449:WDFY1:chr2:-:224746658:224746789:224744949:224749364 +ENSG00000087206:UIMC1:chr5:-:176395555:176396292:176385155:176396601 +ENSG00000088543:C3orf18:chr3:-:50602896:50603292:50599178:50604893 +ENSG00000088833:NSFL1C:chr20:-:1436358:1436515:1435777:1438844 +ENSG00000089639:GMIP:chr19:-:19745315:19745512:19744999:19745600 +ENSG00000090006:LTBP4:chr19:+:41122794:41122926:41120352:41123005 +ENSG00000092020:PPP2R3C:chr14:-:35579024:35579053:35577442:35579730 +ENSG00000092199:HNRNPC:chr14:-:21731469:21731495:21702388:21737456 +ENSG00000092199:HNRNPC:chr14:-:21731469:21731741:21702388:21737456 +ENSG00000092330:TINF2:chr14:-:24711095:24711127:24710982:24711346 +ENSG00000092421:SEMA6A:chr5:-:115803278:115803443:115783507:115808768 +ENSG00000092841:MYL6:chr12:+:56553281:56553406:56552495:56553758 +ENSG00000099204:ABLIM1:chr10:-:116207638:116207779:116205162:116211382 +ENSG00000099330:OCEL1:chr19:+:17339611:17339724:17339118:17339817 
+ENSG00000099622:CIRBP:chr19:+:1273599:1273714:1272050:1274305 +ENSG00000099785:MARCH2:chr19:+:8483420:8483691:8478304:8486672 +ENSG00000099785:MARCH2:chr19:+:8483625:8483691:8478304:8486672 +ENSG00000099875:MKNK2:chr19:-:2039629:2039855:2037828:2040132 +ENSG00000099889:ARVCF:chr22:-:19958738:19958858:19958266:19959408 +ENSG00000099957:P2RX6:chr22:+:21376964:21377040:21372349:21377230 +ENSG00000100138:SNU13:chr22:-:42078359:42078591:42076368:42084797 +ENSG00000100209:HSCB:chr22:+:29139869:29139911:29138319:29147228 +ENSG00000100209:HSCB:chr22:+:29140602:29140697:29139966:29141851 +ENSG00000100227:POLDIP3:chr22:-:42997975:42998113:42995799:42998775 +ENSG00000100288:CHKB:chr22:-:51018485:51018511:51018231:51018618 +ENSG00000100288:CHKB:chr22:-:51020457:51020524:51020291:51020677 +ENSG00000100379:KCTD17:chr22:+:37456862:37456962:37455478:37457578 +ENSG00000100505:TRIM9:chr14:-:51449659:51449683:51448821:51464767 +ENSG00000101150:TPD52L2:chr20:+:62518916:62518958:62514173:62520542 +ENSG00000101187:SLCO4A1:chr20:+:61299506:61299536:61299262:61299828 +ENSG00000101187:SLCO4A1:chr20:+:61299509:61299536:61299262:61299828 +ENSG00000101363:MANBAL:chr20:+:35927165:35927282:35918089:35929610 +ENSG00000102878:HSF4:chr16:+:67201205:67201305:67201125:67201406 +ENSG00000102977:ACD:chr16:-:67693131:67693166:67692982:67693439 +ENSG00000103121:CMC2:chr16:-:81014374:81014484:81010076:81015410 +ENSG00000103148:NPRL3:chr16:-:167299:167374:162774:180520 +ENSG00000104231:ZFAND1:chr8:-:82629483:82629523:82627349:82630416 +ENSG00000104325:DECR1:chr8:+:91029529:91029554:91013789:91031136 +ENSG00000104723:TUSC3:chr8:+:15615299:15615364:15605974:15621711 +ENSG00000105127:AKAP8:chr19:-:15479877:15480035:15479133:15480956 +ENSG00000105223:PLD3:chr19:+:40871459:40871492:40854631:40872325 +ENSG00000105223:PLD3:chr19:+:40871459:40871492:40854675:40872290 +ENSG00000105223:PLD3:chr19:+:40871459:40871837:40854675:40872325 +ENSG00000105223:PLD3:chr19:+:40871568:40871837:40854675:40872325 +ENSG00000105223:PLD3:chr19:+:40871624:40871837:40854675:40872325 +ENSG00000105278:ZFR2:chr19:-:3855399:3855557:3852612:3868962 +ENSG00000105552:BCAT2:chr19:-:49311012:49311493:49310331:49314240 +ENSG00000106125:MINDY4:chr7:+:30921795:30921976:30915271:30922535 +ENSG00000106133:NSUN5P2:chr7:-:72422690:72422834:72420735:72425163 +ENSG00000106772:PRUNE2:chr9:-:79234255:79234303:79229516:79239938 +ENSG00000107863:ARHGAP21:chr10:-:24879134:24879408:24878231:24880153 +ENSG00000108219:TSPAN14:chr10:+:82228302:82228443:82214127:82248972 +ENSG00000108669:CYTH1:chr17:-:76692088:76692091:76688575:76694350 +ENSG00000108848:LUC7L3:chr17:+:48826584:48826705:48824063:48828107 +ENSG00000109083:IFT20:chr17:-:26659171:26659207:26659013:26662365 +ENSG00000109381:ELF2:chr4:-:139988837:139988924:139983211:139993019 +ENSG00000110074:FOXRED1:chr11:+:126141114:126141552:126139186:126142863 +ENSG00000111144:LTA4H:chr12:-:96397615:96397759:96396842:96400091 +ENSG00000111237:VPS29:chr12:-:110937261:110937351:110934008:110939853 +ENSG00000111596:CNOT2:chr12:+:70687850:70688074:70672054:70704674 +ENSG00000111907:TPD52L1:chr6:+:125578243:125578326:125574901:125583979 +ENSG00000113456:RAD1:chr5:-:34913574:34913683:34911917:34914799 +ENSG00000114698:PLSCR4:chr3:-:145918821:145918864:145914580:145924312 +ENSG00000114993:RTKN:chr2:-:74666677:74667104:74659793:74667479 +ENSG00000115325:DOK1:chr2:+:74783020:74783205:74782795:74783434 +ENSG00000115414:FN1:chr2:-:216257653:216257926:216256537:216259250 +ENSG00000115524:SF3B1:chr2:-:198283619:198283675:198283312:198285151 
+ENSG00000116473:RAP1A:chr1:+:112170091:112170185:112162556:112233955 +ENSG00000117523:PRRC2C:chr1:+:171560290:171560339:171557644:171560725 +ENSG00000117616:RSRP1:chr1:-:25570944:25570989:25570715:25571640 +ENSG00000119328:FAM206A:chr9:+:111698587:111698717:111697969:111701475 +ENSG00000119522:DENND1A:chr9:-:126150008:126150137:126146192:126165680 +ENSG00000119979:FAM45A:chr10:+:120864275:120864534:120863709:120867459 +ENSG00000121716:PILRB:chr7:+:99951517:99951635:99950893:99952765 +ENSG00000121964:GTDC1:chr2:-:144764748:144765102:144728329:144899448 +ENSG00000122085:MTERF4:chr2:-:242035807:242035853:242033847:242036657 +ENSG00000122085:MTERF4:chr2:-:242038810:242039309:242036842:242041667 +ENSG00000122203:KIAA1191:chr5:-:175786483:175786570:175782752:175788604 +ENSG00000122203:KIAA1191:chr5:-:175786813:175786921:175782752:175788604 +ENSG00000122257:RBBP6:chr16:+:24579112:24579214:24578826:24580065 +ENSG00000123159:GIPC1:chr19:-:14603668:14603724:14602555:14606848 +ENSG00000123562:MORF4L2:chrX:-:102939608:102939657:102933579:102940098 +ENSG00000123595:RAB9A:chrX:+:13721932:13722021:13707407:13726839 +ENSG00000124074:ENKD1:chr16:-:67698898:67699071:67697965:67699973 +ENSG00000124140:SLC12A5:chr20:+:44651481:44651710:44650525:44652007 +ENSG00000124140:SLC12A5:chr20:+:44651568:44651710:44650525:44652007 +ENSG00000124356:STAMBP:chr2:+:74056531:74056637:74056123:74057971 +ENSG00000125386:FAM193A:chr4:+:2691232:2691431:2674099:2692424 +ENSG00000125388:GRK4:chr4:+:3039100:3039238:3037250:3042298 +ENSG00000125462:C1orf61:chr1:-:156377633:156377767:156374393:156399167 +ENSG00000125462:C1orf61:chr1:-:156386560:156386656:156384545:156399167 +ENSG00000126214:KLC1:chr14:+:104151322:104151373:104145882:104153417 +ENSG00000126261:UBA2:chr19:+:34920997:34921091:34919475:34921480 +ENSG00000126858:RHOT1:chr17:+:30538134:30538257:30535328:30551634 +ENSG00000127419:TMEM175:chr4:+:941903:941942:926328:944208 +ENSG00000128463:EMC4:chr15:+:34520637:34520790:34520047:34521953 +ENSG00000129103:SUMF2:chr7:+:56141806:56141911:56140804:56142278 +ENSG00000129103:SUMF2:chr7:+:56141866:56141911:56140804:56144526 +ENSG00000129538:RNASE1:chr14:-:21270407:21270478:21270252:21270955 +ENSG00000129538:RNASE1:chr14:-:21270407:21270490:21270252:21270955 +ENSG00000129646:QRICH2:chr17:-:74286072:74286158:74283978:74287095 +ENSG00000129993:CBFA2T3:chr16:-:88964485:88964560:88958893:88967911 +ENSG00000130396:AFDN:chr6:+:168355140:168355173:168352867:168363112 +ENSG00000131051:RBM39:chr20:-:34328446:34328519:34326939:34328745 +ENSG00000131095:GFAP:chr17:-:42989039:42989165:42988689:42991095 +ENSG00000131473:ACLY:chr17:-:40052872:40052902:40049427:40054001 +ENSG00000132199:ENOSF1:chr18:-:677344:677444:675402:678695 +ENSG00000132199:ENOSF1:chr18:-:678695:678737:675402:683245 +ENSG00000132341:RAN:chr12:+:131357128:131357162:131356671:131357380 +ENSG00000132613:MTSS1L:chr16:-:70713217:70713226:70712312:70713532 +ENSG00000133612:AGAP3:chr7:+:150817606:150817654:150817232:150819811 +ENSG00000134042:MRO:chr18:-:48327718:48327874:48326513:48331523 +ENSG00000134574:DDB2:chr11:+:47256307:47256485:47256223:47256820 +ENSG00000134717:BTF3L4:chr1:+:52549011:52549213:52530610:52551783 +ENSG00000134769:DTNA:chr18:+:32446074:32446095:32444026:32455202 +ENSG00000134779:TPGS2:chr18:-:34385336:34385360:34380274:34387809 +ENSG00000134851:TMEM165:chr4:+:56277780:56278006:56262563:56290704 +ENSG00000134851:TMEM165:chr4:+:56283969:56284015:56283414:56284214 +ENSG00000135502:SLC26A10:chr12:+:58017785:58017881:58017696:58018648 
+ENSG00000136114:THSD1:chr13:-:52960162:52960321:52952924:52971366 +ENSG00000136270:TBRG4:chr7:-:45143697:45143855:45143042:45145039 +ENSG00000136754:ABI1:chr10:-:27047990:27048167:27040712:27052808 +ENSG00000136878:USP20:chr9:+:132627581:132627660:132625578:132630283 +ENSG00000137210:TMEM14B:chr6:+:10755374:10755465:10749931:10770309 +ENSG00000137501:SYTL2:chr11:-:85428525:85428769:85425550:85429832 +ENSG00000137776:SLTM:chr15:-:59191667:59192082:59191051:59193458 +ENSG00000137776:SLTM:chr15:-:59204761:59204809:59193486:59209133 +ENSG00000138443:ABI2:chr2:+:204259422:204259569:204255866:204260378 +ENSG00000138593:SECISBP2L:chr15:-:49311614:49311749:49309825:49319561 +ENSG00000139154:AEBP2:chr12:+:19667604:19667718:19665399:19671020 +ENSG00000139197:PEX5:chr12:+:7354836:7354947:7354437:7355207 +ENSG00000139631:CSAD:chr12:-:53566129:53566220:53565772:53567128 +ENSG00000140416:TPM1:chr15:+:63336225:63336351:63336030:63349183 +ENSG00000140464:PML:chr15:+:74324912:74325056:74317268:74325496 +ENSG00000140538:NTRK3:chr15:-:88671941:88671965:88670457:88678331 +ENSG00000140750:ARHGAP17:chr16:-:24950684:24950918:24946960:24953307 +ENSG00000141127:PRPSAP2:chr17:+:18770569:18770647:18769265:18775895 +ENSG00000141646:SMAD4:chr18:+:48584494:48584614:48581363:48586235 +ENSG00000141646:SMAD4:chr18:+:48584709:48584826:48581363:48586235 +ENSG00000142208:AKT1:chr14:-:105259463:105259547:105259059:105261820 +ENSG00000142208:AKT1:chr14:-:105259463:105259641:105259059:105261820 +ENSG00000142252:GEMIN7:chr19:+:45583164:45583287:45582537:45593364 +ENSG00000143303:RRNAD1:chr1:+:156703800:156703844:156703312:156705516 +ENSG00000143303:RRNAD1:chr1:+:156703800:156704285:156703312:156705516 +ENSG00000143353:LYPLAL1:chr1:+:219366471:219366593:219352588:219383873 +ENSG00000143537:ADAM15:chr1:+:155034379:155034451:155033308:155034720 +ENSG00000143727:ACP1:chr2:+:272036:272065:271939:272191 +ENSG00000143742:SRP9:chr1:+:225974563:225974687:225971070:225976941 +ENSG00000143774:GUK1:chr1:+:228328824:228328989:228328064:228333211 +ENSG00000143776:CDC42BPA:chr1:-:227300371:227300614:227300123:227307504 +ENSG00000144199:FAHD2B:chr2:-:97757198:97757449:97756062:97760437 +ENSG00000144741:SLC25A26:chr3:+:66396787:66396832:66313803:66419901 +ENSG00000145016:RUBCN:chr3:-:197417944:197418019:197411088:197420585 +ENSG00000145349:CAMK2D:chr4:-:114424091:114424133:114421667:114430793 +ENSG00000145740:SLC30A5:chr5:+:68423830:68423959:68419252:68425273 +ENSG00000145782:ATG12:chr5:-:115176193:115176309:115173461:115177086 +ENSG00000146267:FAXC:chr6:-:99781226:99781423:99739696:99790773 +ENSG00000148053:NTRK2:chr9:+:87284594:87284803:87284338:87285291 +ENSG00000148057:IDNK:chr9:+:86242921:86243126:86238136:86243787 +ENSG00000148180:GSN:chr9:+:124062333:124062404:124045670:124064240 +ENSG00000148341:SH3GLB2:chr9:-:131771731:131771746:131771070:131772049 +ENSG00000148399:DPH7:chr9:-:140472028:140472055:140470619:140473076 +ENSG00000148481:MINDY3:chr10:-:15828559:15828647:15824225:15831245 +ENSG00000148660:CAMK2G:chr10:-:75585078:75585105:75583842:75597225 +ENSG00000148840:PPRC1:chr10:+:103906428:103907149:103904064:103908128 +ENSG00000149294:NCAM1:chr11:+:113117089:113117092:113105886:113126598 +ENSG00000149531:FRG1BP:chr20:+:29625872:29625984:29623254:29628226 +ENSG00000150967:ABCB9:chr12:-:123425353:123425542:123424831:123428937 +ENSG00000151276:MAGI1:chr3:-:65361419:65361623:65350621:65364935 +ENSG00000152465:NMT2:chr10:-:15182985:15183028:15177417:15183420 +ENSG00000153391:INO80C:chr18:-:33069295:33069349:33060527:33077682 
+ENSG00000154134:ROBO3:chr11:+:124747056:124747205:124746828:124747414 +ENSG00000154134:ROBO3:chr11:+:124747162:124747205:124746828:124747414 +ENSG00000154845:PPP4R1:chr18:-:9562919:9563044:9562073:9563375 +ENSG00000155897:ADCY8:chr8:-:131859669:131859759:131848695:131861847 +ENSG00000156219:ART3:chr4:+:77025749:77025782:77025122:77033538 +ENSG00000156345:CDK20:chr9:-:90584710:90584834:90584264:90585690 +ENSG00000157538:DSCR3:chr21:-:38605662:38605743:38604752:38610760 +ENSG00000158856:DMTN:chr8:+:21938315:21938381:21938136:21938623 +ENSG00000159214:CCDC24:chr1:+:44457883:44458059:44457676:44458194 +ENSG00000159899:NPR2:chr9:+:35802728:35802800:35802604:35805507 +ENSG00000160072:ATAD3B:chr1:+:1425071:1425191:1424654:1425636 +ENSG00000160323:ADAMTS13:chr9:+:136303365:136303486:136303017:136304486 +ENSG00000160408:ST6GALNAC6:chr9:-:130660104:130660289:130658611:130661781 +ENSG00000160584:SIK3:chr11:-:116738661:116738805:116734534:116741046 +ENSG00000160767:FAM189B:chr1:-:155223649:155223769:155223523:155224443 +ENSG00000161203:AP2M1:chr3:+:183898432:183898529:183898039:183898636 +ENSG00000161249:DMKN:chr19:-:35996840:35996888:35996667:36001085 +ENSG00000161692:DBF4B:chr17:+:42818703:42818820:42815792:42824450 +ENSG00000161955:TNFSF13:chr17:+:7462940:7463019:7462614:7463162 +ENSG00000162065:TBC1D24:chr16:+:2547710:2547728:2547114:2548238 +ENSG00000162066:AMDHD2:chr16:+:2577561:2577616:2571124:2578060 +ENSG00000162430:SELENON:chr1:+:26128506:26128608:26127651:26131632 +ENSG00000162910:MRPL55:chr1:-:228296137:228296175:228296019:228296849 +ENSG00000162910:MRPL55:chr1:-:228296137:228296722:228296019:228296849 +ENSG00000162910:MRPL55:chr1:-:228296137:228296722:228296022:228296849 +ENSG00000162910:MRPL55:chr1:-:228296141:228296209:228296019:228296849 +ENSG00000162910:MRPL55:chr1:-:228296655:228296722:228295570:228296849 +ENSG00000162910:MRPL55:chr1:-:228296655:228296722:228296175:228296849 +ENSG00000162961:DPY30:chr2:-:32108454:32108531:32095021:32142994 +ENSG00000163170:BOLA3:chr2:-:74369398:74369487:74362785:74372315 +ENSG00000163875:MEAF6:chr1:-:37962307:37962337:37962205:37967404 +ENSG00000164615:CAMLG:chr5:+:134076752:134077213:134074482:134086448 +ENSG00000164830:OXR1:chr8:+:107749747:107749828:107738537:107751685 +ENSG00000165416:SUGT1:chr13:+:53235609:53235705:53233384:53236783 +ENSG00000165644:COMTD1:chr10:-:76995373:76995501:76995130:76995592 +ENSG00000165669:FAM204A:chr10:-:120101238:120101439:120095935:120101781 +ENSG00000165795:NDRG2:chr14:-:21492188:21492255:21491480:21493187 +ENSG00000165795:NDRG2:chr14:-:21492188:21492298:21491480:21493187 +ENSG00000165949:IFI27:chr14:+:94577970:94578119:94577143:94581196 +ENSG00000165949:IFI27:chr14:+:94578004:94578119:94577143:94581196 +ENSG00000166140:ZFYVE19:chr15:+:41104896:41105100:41102955:41105535 +ENSG00000166295:ANAPC16:chr10:+:73983645:73983814:73975830:73990123 +ENSG00000166352:C11orf74:chr11:+:36657600:36657667:36631789:36669565 +ENSG00000167508:MVD:chr16:-:88726927:88727045:88725128:88729418 +ENSG00000167515:TRAPPC2L:chr16:+:88925752:88925851:88925197:88925980 +ENSG00000167985:SDHAF2:chr11:+:61205475:61205585:61197654:61213412 +ENSG00000168000:BSCL2:chr11:-:62472772:62473030:62462183:62474580 +ENSG00000168137:SETD5:chr3:+:9478533:9478572:9477590:9482139 +ENSG00000168591:TMUB2:chr17:+:42265043:42265111:42264477:42266389 +ENSG00000168591:TMUB2:chr17:+:42265274:42265377:42265111:42266302 +ENSG00000168765:GSTM4:chr1:+:110202411:110202581:110201732:110203786 +ENSG00000168781:PPIP5K1:chr15:-:43856300:43856363:43851130:43857064 
+ENSG00000168958:MFF:chr2:+:228207460:228207535:228205096:228220392 +ENSG00000169045:HNRNPH1:chr5:-:179046269:179046361:179045324:179047892 +ENSG00000169180:XPO6:chr16:-:28164250:28164285:28164106:28167394 +ENSG00000169231:THBS3:chr1:-:155170241:155170401:155169904:155170687 +ENSG00000169764:UGP2:chr2:+:64069672:64069733:64069338:64083439 +ENSG00000170632:ARMC10:chr7:+:102732923:102733100:102727211:102737723 +ENSG00000170919:TPT1-AS1:chr13:+:45964848:45965037:45963955:45965166 +ENSG00000170919:TPT1-AS1:chr13:+:45964892:45965037:45963955:45965166 +ENSG00000170954:ZNF415:chr19:-:53618462:53618560:53613161:53618956 +ENSG00000171202:TMEM126A:chr11:+:85361355:85361385:85359133:85365106 +ENSG00000171792:RHNO1:chr12:+:2994327:2994700:2986448:2997076 +ENSG00000171792:RHNO1:chr12:+:2994428:2994700:2986448:2997076 +ENSG00000172046:USP19:chr3:-:49154491:49154791:49154376:49154869 +ENSG00000172508:CARNS1:chr11:+:67183646:67185148:67183234:67185901 +ENSG00000172508:CARNS1:chr11:+:67184877:67185148:67183234:67185901 +ENSG00000172663:TMEM134:chr11:-:67232526:67232738:67232327:67234782 +ENSG00000172785:CBWD1:chr9:-:163977:164037:162469:172080 +ENSG00000172890:NADSYN1:chr11:+:71191264:71191320:71189515:71191800 +ENSG00000173599:PC:chr11:-:66721719:66721907:66719939:66725792 +ENSG00000173660:UQCRH:chr1:+:46775568:46775716:46774799:46775826 +ENSG00000173744:AGFG1:chr2:+:228395806:228395926:228389631:228398264 +ENSG00000174446:SNAPC5:chr15:-:66787240:66787757:66786890:66789979 +ENSG00000175203:DCTN2:chr12:-:57932265:57932307:57929628:57939810 +ENSG00000175309:PHYKPL:chr5:-:177651642:177651730:177651565:177652355 +ENSG00000176261:ZBTB8OS:chr1:-:33100302:33100393:33099673:33116033 +ENSG00000176261:ZBTB8OS:chr1:-:33100368:33100393:33093145:33116033 +ENSG00000176261:ZBTB8OS:chr1:-:33100368:33100393:33099711:33116033 +ENSG00000177225:PDDC1:chr11:-:775065:775142:774113:777398 +ENSG00000177410:ZFAS1:chr20:+:47897021:47897107:47895745:47905581 +ENSG00000177479:ARIH2:chr3:+:48962150:48962404:48960244:48964894 +ENSG00000177697:CD151:chr11:+:834457:834591:833026:836062 +ENSG00000177697:CD151:chr11:+:834529:834591:833022:836062 +ENSG00000177697:CD151:chr11:+:834529:834591:833026:836062 +ENSG00000177830:CHID1:chr11:-:908541:908645:904859:910774 +ENSG00000178104:PDE4DIP:chr1:-:144859758:144859998:144857042:144863317 +ENSG00000178498:DTX3:chr12:+:57999126:57999514:57998641:57999972 +ENSG00000178498:DTX3:chr12:+:57999353:57999514:57998641:57999972 +ENSG00000178761:FAM219B:chr15:-:75198664:75198706:75197572:75198927 +ENSG00000178927:C17orf62:chr17:-:80405455:80405522:80404572:80407045 +ENSG00000179818:PCBP1-AS1:chr2:-:70310669:70310819:70278437:70312678 +ENSG00000181038:METTL23:chr17:+:74725771:74725876:74723260:74729059 +ENSG00000182179:UBA7:chr3:-:49846822:49846897:49846592:49846974 +ENSG00000182534:MXRA7:chr17:-:74679928:74680009:74676961:74681153 +ENSG00000182796:TMEM198B:chr12:+:56228634:56229399:56227325:56229909 +ENSG00000182872:RBM10:chrX:+:47034417:47034491:47032596:47035898 +ENSG00000182985:CADM1:chr11:-:115069125:115069158:115049495:115085327 +ENSG00000183780:SLC35F3:chr1:+:234444846:234445066:234367487:234452347 +ENSG00000185046:ANKS1B:chr12:-:99201621:99201693:99194903:99222951 +ENSG00000185324:CDK10:chr16:+:89755659:89755732:89753167:89756960 +ENSG00000185485:SDHAP1:chr3:-:195695089:195695256:195694861:195698193 +ENSG00000185565:LSAMP:chr3:-:115535460:115535493:115529261:115560691 +ENSG00000185565:LSAMP:chr3:-:115553408:115553444:115529261:115560691 
+ENSG00000185596:WASH3P:chr15:+:102512798:102512897:102501844:102513103 +ENSG00000186575:NF2:chr22:+:30079008:30079068:30077590:30090740 +ENSG00000186998:EMID1:chr22:+:29622478:29622540:29611619:29627008 +ENSG00000187735:TCEA1:chr8:-:54915451:54915652:54912610:54922989 +ENSG00000188338:SLC38A3:chr3:+:50244768:50244910:50242781:50251581 +ENSG00000188343:FAM92A:chr8:+:94714747:94714771:94713686:94715847 +ENSG00000189171:S100A13:chr1:-:153599958:153600074:153599009:153600596 +ENSG00000189171:S100A13:chr1:-:153602925:153603132:153600713:153603486 +ENSG00000189403:HMGB1:chr13:-:31035775:31035825:31035670:31036674 +ENSG00000196586:MYO6:chr6:+:76608089:76608128:76602407:76617321 +ENSG00000196839:ADA:chr20:-:43251228:43251293:43249788:43251469 +ENSG00000196923:PDLIM7:chr5:-:176918404:176918421:176918147:176918807 +ENSG00000196923:PDLIM7:chr5:-:176918807:176918977:176918421:176919405 +ENSG00000196923:PDLIM7:chr5:-:176918807:176918996:176918421:176919405 +ENSG00000197451:HNRNPAB:chr5:+:177637132:177637273:177636448:177637553 +ENSG00000197798:FAM118B:chr11:+:126099119:126099231:126081725:126104889 +ENSG00000197971:MBP:chr18:-:74700255:74700483:74696850:74700832 +ENSG00000197971:MBP:chr18:-:74728787:74728802:74702016:74728869 +ENSG00000198276:UCKL1:chr20:-:62573822:62573845:62572561:62575005 +ENSG00000198794:SCAMP5:chr15:+:75299928:75300069:75288014:75304132 +ENSG00000204580:DDR1:chr6:+:30863180:30863291:30862448:30864397 +ENSG00000204843:DCTN1:chr2:-:74600054:74600075:74598855:74604558 +ENSG00000205581:HMGN1:chr21:-:40717755:40717884:40717200:40720217 +ENSG00000205981:DNAJC19:chr3:-:180704785:180704810:180703784:180705810 +ENSG00000214078:CPNE1:chr20:-:34243123:34243266:34220845:34246851 +ENSG00000214078:CPNE1:chr20:-:34243123:34243266:34220845:34252681 +ENSG00000215788:TNFRSF25:chr1:-:6523131:6523187:6523016:6524434 +ENSG00000215908:CROCCP2:chr1:-:16969261:16969345:16961663:16969522 +ENSG00000217555:CKLF:chr16:+:66592092:66592251:66586696:66597024 +ENSG00000221978:CCNL2:chr1:-:1328169:1328183:1326245:1328775 +ENSG00000223482:NUTM2A-AS1:chr10:-:89067713:89067817:89048252:89086386 +ENSG00000225177:RP11-390P2.4:chr6:+:139015529:139015684:139013754:139017733 +ENSG00000228315:GUSBP11:chr22:-:24029017:24029182:24026054:24032422 +ENSG00000228315:GUSBP11:chr22:-:24042912:24043032:24037704:24047615 +ENSG00000228439:TSTD3:chr6:+:99979238:99979412:99973985:99979507 +ENSG00000231312:AC007246.3:chr2:+:39680979:39681072:39664676:39743554 +ENSG00000231312:AC007246.3:chr2:+:39681437:39681506:39664676:39743554 +ENSG00000232527:RP11-14N7.2:chr1:+:148933290:148933368:148932920:148951244 +ENSG00000234171:RNASEH1-AS1:chr2:+:3607037:3607319:3606588:3608906 +ENSG00000234608:MAPKAPK5-AS1:chr12:-:112279130:112279274:112278281:112279487 +ENSG00000237441:RGL2:chr6:-:33266231:33266428:33264892:33266646 +ENSG00000237651:C2orf74:chr2:+:61384996:61385147:61372331:61386452 +ENSG00000239382:ALKBH6:chr19:-:36504415:36504490:36504324:36505076 +ENSG00000239779:WBP1:chr2:+:74686123:74686225:74685798:74686769 +ENSG00000239779:WBP1:chr2:+:74686564:74686679:74685798:74686769 +ENSG00000239779:WBP1:chr2:+:74686604:74686689:74685798:74686769 +ENSG00000241231:RP11-275H4.1:chr3:-:181157764:181157822:181156487:181160161 +ENSG00000243147:MRPL33:chr2:+:27997290:27997397:27995559:28002299 +ENSG00000245958:RP11-33B1.1:chr4:+:120418965:120419058:120415678:120433505 +ENSG00000247828:TMEM161B-AS1:chr5:+:87577858:87577923:87566402:87732089 +ENSG00000260807:RP11-161M6.2:chr16:-:1026326:1026400:1025995:1026777 
+ENSG00000269313:MAGIX:chrX:+:49021200:49021428:49021127:49021527 +ENSG00000269313:MAGIX:chrX:+:49021245:49021428:49021127:49021527 +ENSG00000269893:SNHG8:chr4:+:119200098:119200292:119199947:119200543 +ENSG00000270249:RP11-514P8.7:chr7:-:102207028:102207183:102182109:102207443 +ENSG00000275052:PPP4R3B:chr2:-:55825551:55826175:55816092:55831113 +ENSG00000276087:RP11-507M3.1:chr2:+:24362239:24362320:24358057:24369617 +ENSG00000277363:SRCIN1:chr17:-:36724458:36724482:36720549:36728914 +ENSG00000278535:DHRS11:chr17:+:34954591:34954686:34951610:34955349 diff --git a/IRIS_functions.md b/IRIS_functions.md new file mode 100644 index 0000000..6d4181a --- /dev/null +++ b/IRIS_functions.md @@ -0,0 +1,484 @@ +# IRIS modules + +[back to IRIS quick guide](README.md) + +For questions about input file formats, see the example folder. + +For test runs with files in the example folder, users will need to modify the directories of `fin_matrices`, etc. + +## `format` + +When starting from the standard output of [rMATS](https://github.com/Xinglab/rmats-turbo), users should use this step to 1) reformat splice junction counts into a PSI (percent-spliced-in) value matrix, 2) index the PSI matrix, and 3) move it into place for IRIS screening (when -d is enabled). +``` +usage: IRIS format [-h] -t {SE,RI,A3SS,A5SS} -n DATA_NAME -s {1,2} + [-c COV_CUTOFF] [-i] [-e] [-d IRIS_DB_PATH] [--novelSS] + [--gtf GTF] + rmats_mat_path_manifest rmats_sample_order + +required arguments: + rmats_mat_path_manifest + txt manifest of path(s) to rMATS output folder(s) + rmats_sample_order TXT file manifest of corresponding rMATS input sample + order file(s). Required input for rMATS + -t {SE,RI,A3SS,A5SS}, --splicing-event-type {SE,RI,A3SS,A5SS} + String of splicing event types based on rMATS + definition (SE,RI,A3SS,A5SS). Used to name output file + -n DATA_NAME, --data-name DATA_NAME + Defines dataset name (disease state, study name, group + name etc.). Used during IRIS screening + -s {1,2}, --sample-name-field {1,2} + Specifies sample name field (1- SJ count file name, 2- + SJ count folder name). For each sample, the name should + match the name in "rmats_sample_order" + +optional arguments: + -h, --help show this help message and exit + -c COV_CUTOFF, --cov-cutoff COV_CUTOFF + Average coverage filter for merged matrix (Default is + 10) + -i, --sample-based-filter + Coverage filter by individual sample, not by entire + input group (Default is disabled) + -e, --merge-events-only + Do not perform matrix merge, only merge events list + -d IRIS_DB_PATH, --iris-db-path IRIS_DB_PATH + Path to store the formatted/indexed AS matrix. We + strongly recommend storing the AS matrix in the IRIS + db by setting the path to the directory containing + folders of the pre-indexed AS reference + ("full_path/IRIS_data.vX/db"). Default is the current + location. + --novelSS Enable formatting events with splice junctions + containing novelSS (different from, and a subset of, + the rMATS novelSS definition. Default is False) + --gtf GTF Path to the Genome annotation GTF file. Required input + when novelSS is enabled. +``` + +## `screen` + +This step takes a user-defined screening parameter file ([example/NEPC_test.para](example/NEPC_test.para)), performs comparisons against reference databases, and returns tumor-associated, tumor-recurrent, and tumor-specific AS events based on user-defined criteria. + +When the -t option is enabled, the screening step translates identified tumor AS events into peptide sequences that can be used in the prediction step.
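+As a sketch only (the output directory `screen_out` and the GTF path are hypothetical placeholders; the parameter file is the repo example), a tumor-association screen of SE events might look like the following; the full usage text is below. +``` +# Hypothetical example: screen SE events and translate tumor junctions into peptides. +# gencode.gtf is a placeholder for a genome annotation GTF file. +IRIS screen -p example/NEPC_test.para --splicing-event-type SE -o screen_out -t -g gencode.gtf +```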
+``` +usage: IRIS screen [-h] -p PARAMETER_FIN + [--splicing-event-type {SE,RI,A3SS,A5SS}] -o OUTDIR [-t] + [-g GTF] [--all-orf] [--ignore-annotation] + [--remove-early-stop] [--min-sample-count MIN_SAMPLE_COUNT] + [--use-existing-test-result] + +required arguments: + -p PARAMETER_FIN, --parameter-fin PARAMETER_FIN + File of 'IRIS screen' parameters + --splicing-event-type {SE,RI,A3SS,A5SS} + String of splicing event types based on rMATS + definition (SE,RI,A3SS,A5SS). Used to name output file. + (Default is SE event) + -o OUTDIR, --outdir OUTDIR + Directory of IRIS screening results + +optional arguments: + -h, --help show this help message and exit + -t, --translating Translates IRIS-screened tumor splice junctions into + peptides + -g GTF, --gtf GTF The Genome annotation GTF file. Required by the IRIS + translate option. + --all-orf Perform the 3 ORF translation. ORF known in the + UniProtKB will be labeled as uniprotFrame in the bed + file (Default is to use the known ORF ONLY) + --ignore-annotation Perform 3 ORF translation without annotating known ORF + from the UniProtKB (Default is disabled) + --remove-early-stop Discard the peptide if it contains an early stop codon + (Default is to keep the truncated peptide) + --min-sample-count MIN_SAMPLE_COUNT + The minimum number of non-missing samples in the input + group for an event to be considered for testing. Once + specified, removed events will be written to the "notest" + file. (Default is no minimum) + --use-existing-test-result + Skip testing and use existing testing result (Default + is run full testing steps) +``` +Additionally, `screen_sjc` can be performed as part of the 'tumor-specificity' screen. +``` +usage: IRIS screen_sjc [-h] -p PARAMETER_FIN + --splicing-event-type {SE,RI,A3SS,A5SS} + -e EVENT_LIST_FILE -o OUTDIR + [--use-existing-test-result] + [--tumor-read-cov-cutoff TUMOR_READ_COV_CUTOFF] + [--normal-read-cov-cutoff NORMAL_READ_COV_CUTOFF] +``` +Optionally, `screen_cpm` can be performed as a more stringent 'tumor-association' screen or a less stringent 'tumor-specificity' screen by using normalized splice junction counts (in CPM). +``` +usage: IRIS screen_cpm [-h] -p PARAMETER_FIN + --splicing-event-type {SE,RI,A3SS,A5SS} + -e EVENT_LIST_FILE -o OUTDIR + [--use-existing-test-result] +``` + +## `predict` + +This step takes the screening result, annotates extracellular targets, and predicts HLA-binding epitopes to discover immunotherapy targets. + +IRIS prediction of HLA-binding epitopes is a massive prediction job that can utilize a compute cluster. The `predict` step will create scripts to perform subtasks. If properly configured, those subtask scripts can be executed concurrently by Snakemake. +``` +usage: IRIS predict [-h] --task-dir TASK_DIR -p PARAMETER_FIN + [-t {SE,RI,A3SS,A5SS}] [--iedb-local IEDB_LOCAL] + [-m MHC_LIST] [--extracellular-only] [--tier3-only] + [--gene-exp-matrix GENE_EXP_MATRIX] [-c DELTAPSI_COLUMN] + [-d DELTAPSI_CUT_OFF] [-e EPITOPE_LEN_LIST] [--all-orf] + [--extracellular-anno-by-junction] + IRIS_screening_result_path + +required arguments: + IRIS_screening_result_path + Directory of IRIS screening results + --task-dir TASK_DIR Directory to write individual task scripts + -p PARAMETER_FIN, --parameter-fin PARAMETER_FIN + File of parameters used in 'IRIS screen' + -t {SE,RI,A3SS,A5SS}, --splicing-event-type {SE,RI,A3SS,A5SS} + String of splicing event types based on rMATS + definition (SE,RI,A3SS,A5SS). Used to name output file.
+ (Default is SE event) + +optional arguments: + -h, --help show this help message and exit + --iedb-local IEDB_LOCAL + Specify local IEDB location (if installed) + -m MHC_LIST, --mhc-list MHC_LIST + List of HLA/MHC types among samples. HLA type follows + seq2HLA format + --extracellular-only Only predict CAR-T targets. Will not predict HLA + binding. + --tier3-only Only run predict on events passing all screening + tiers (the tier3 output). Will be much faster + when both tier1 and tier3 were used. + --gene-exp-matrix GENE_EXP_MATRIX + Tab-delimited matrix of gene expression vs. samples + -c DELTAPSI_COLUMN, --deltaPSI-column DELTAPSI_COLUMN + Column of deltaPSI value in matrix, 1-based (Default + is 5th column) + -d DELTAPSI_CUT_OFF, --deltaPSI-cut-off DELTAPSI_CUT_OFF + Defines cutoff of deltaPSI (or other metric) to select + tumor-enriched splice form (Default is 0) + -e EPITOPE_LEN_LIST, --epitope-len-list EPITOPE_LEN_LIST + Epitope length for prediction (Default is 9,10,11) + --all-orf Perform prediction based on 3 ORF translation + peptides. Enable this if translation/screening used + this option (Default is False) + --extracellular-anno-by-junction + By default, CAR-T targets are annotated by association + of the event with an extracellular domain. This option + annotates targets based on a junction (not recommended) +``` + +## `epitope_post` + +``` +usage: IRIS epitope_post [-h] -p PARAMETER_FIN -o OUTDIR + [-t {SE,RI,A3SS,A5SS}] -m MHC_BY_SAMPLE + [-e GENE_EXP_MATRIX] [--tier3-only] [--keep-exist] + [--epitope-len-list EPITOPE_LEN_LIST] + [--no-match-to-canonical-proteome] + [--no-uniqueness-annotation] + [--ic50-cut-off IC50_CUT_OFF] + +required arguments: + -p PARAMETER_FIN, --parameter-fin PARAMETER_FIN + File of parameters used in IRIS screen + -o OUTDIR, --outdir OUTDIR + Directory of IRIS screening results + -t {SE,RI,A3SS,A5SS}, --splicing-event-type {SE,RI,A3SS,A5SS} + String of splicing event types based on rMATS + definition (SE,RI,A3SS,A5SS). Used to name output file + (Default is SE event) + -m MHC_BY_SAMPLE, --mhc-by-sample MHC_BY_SAMPLE + Tab-delimited matrix of HLA/MHC type vs. samples. HLA + type follows seq2HLA format + -e GENE_EXP_MATRIX, --gene-exp-matrix GENE_EXP_MATRIX + Tab-delimited matrix of gene expression vs. samples + +optional arguments: + -h, --help show this help message and exit + --tier3-only Only predict tier3 events. Will be much faster. + --keep-exist Do not overwrite an existing positive prediction file + (Default is False) + --epitope-len-list EPITOPE_LEN_LIST + Epitope length for prediction (Default is 9,10,11) + --no-match-to-canonical-proteome + Disables matching epitopes to UniProt canonical protein + sequences as an annotation. + --no-uniqueness-annotation + Disables matching epitopes to all IRIS translated junction + peptides in the same analysis as an annotation. + --ic50-cut-off IC50_CUT_OFF + Specifies IC50 cut-off to define HLA-binding epitopes + (Default is 500) +``` + +## `process_rnaseq` + +When starting from FASTQ files, users should use this step to perform RNA-Seq alignment and quantification. This module uses STAR and Cufflinks, and it takes only one sample (which can comprise multiple FASTQ files) per run. Users are recommended to run this module in parallel (use `makesubsh_mapping` for Snakemake).
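+As a sketch only (all paths below are hypothetical placeholders), a single-sample run might look like the following; the full usage text is below. +``` +# Hypothetical example: map and quantify one paired-end sample. +# STAR_index/, gencode.gtf, rnaseq_out/sample_1, and the FASTQ names are placeholders. +IRIS process_rnaseq --starGenomeDir STAR_index/ --gtf gencode.gtf -p rnaseq_out/sample_1 sample_1.R1.fastq.gz,sample_1.R2.fastq.gz +```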
+``` +usage: IRIS process_rnaseq [-h] --starGenomeDir STARGENOMEDIR --gtf GTF -p + SAMPLEID_OUTDIR [--db-length DB_LENGTH] [--mapping] + [--quant] [--sort] + readsFilesRNA + +required arguments: + --starGenomeDir STARGENOMEDIR + The path to the STAR indexed reference genome. Pass to + the "genomeDir" parameter in STAR + --gtf GTF Path to the Genome annotation GTF file + -p SAMPLEID_OUTDIR, --sampleID-outdir SAMPLEID_OUTDIR + Output directory where sample ID will be used as the + output folder name + --db-length DB_LENGTH + Pass to the "sjdbOverhang" parameter in STAR. Default + is 100 + readsFilesRNA Specify the path to the paired-end FASTQ files for the + sample. Files are separated by ",". + +optional arguments: + -h, --help show this help message and exit + --mapping Only perform reads mapping + --quant Only perform gene expression and AS quantification + --sort Only perform BAM file sorting +``` + +## `makesubsh_mapping` + +Run `process_rnaseq` jobs in parallel on an HPC or cloud system using Snakemake. +``` +usage: IRIS makesubsh_mapping [-h] [--fastq-folder-dir FASTQ_FOLDER_DIR] + --starGenomeDir STARGENOMEDIR --gtf GTF + --data-name DATA_NAME --outdir OUTDIR + --label-string LABEL_STRING --task-dir TASK_DIR + +required arguments: + --fastq-folder-dir FASTQ_FOLDER_DIR + Specify the path to the higher level of all folders + containing FASTQ files + --starGenomeDir STARGENOMEDIR + The path to the STAR indexed reference genome. Pass to + the "genomeDir" parameter in STAR + --gtf GTF Path to the Genome annotation GTF file + --data-name DATA_NAME + Data set name used to name submission shell script + files. + --outdir OUTDIR Output directory for folders of aligned BAM files + --label-string LABEL_STRING + String in the fastq file name between the reads pair + number and "fastq/fq". This is used to recognize + paired-end reads. e.g. For FASTQ_file_L1_R2.fastq.gz, + the label string is the "." between "2" and "fastq". + --task-dir TASK_DIR Directory to write individual task scripts + +optional arguments: + -h, --help show this help message and exit +``` + +## `makesubsh_rmats` + +After running `process_rnaseq`, this step can be used to prepare files to run rMATS-turbo in parallel. +``` +usage: IRIS makesubsh_rmats [-h] --rMATS-path RMATS_PATH --bam-dir BAM_DIR + [--bam-prefix BAM_PREFIX] --gtf GTF --data-name + DATA_NAME --task-dir TASK_DIR [--novelSS] + [--read-length READ_LENGTH] + +required arguments: + --rMATS-path RMATS_PATH + Path to the rMATS-turbo script. + --bam-dir BAM_DIR The path one level higher to folders containing BAM + files generated by "process_rnaseq". + --bam-prefix BAM_PREFIX + BAM file prefix (Default is + "Aligned.sortedByCoord.out") + --gtf GTF Path to the Genome annotation GTF file + --data-name DATA_NAME + Data set name used to name submission shell scripts + --task-dir TASK_DIR Directory to write individual task scripts + +optional arguments: + -h, --help show this help message and exit + --novelSS Enable rMATS novelSS option to include novel splice + sites detected from the RNA-seq data (Default is False) + --read-length READ_LENGTH + User-defined read length, instead of determining it + automatically from the STAR mapping log file. +``` + +## `makesubsh_rmatspost` + +After running `makesubsh_rmats`, this step can be used to merge files to generate the final rMATS-turbo results.
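+As a sketch only (paths are hypothetical placeholders), the 'post' task scripts might be generated as follows; the full usage text is below. +``` +# Hypothetical example: prepare rMATS-turbo 'post' task scripts. +# rmats-turbo/rmats.py, bam_dir/, gencode.gtf, and tasks/ are placeholders. +IRIS makesubsh_rmatspost --rMATS-path rmats-turbo/rmats.py --bam-dir bam_dir/ --gtf gencode.gtf --data-name Test --task-dir tasks/ +```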
+``` +usage: IRIS makesubsh_rmatspost [-h] --rMATS-path RMATS_PATH --bam-dir BAM_DIR + --gtf GTF --data-name DATA_NAME [--novelSS] + --task-dir TASK_DIR + +required arguments: + --rMATS-path RMATS_PATH + Path to the rMATS-turbo script + --bam-dir BAM_DIR The path one level higher to folders containing BAM + files generated by "process_rnaseq". + --gtf GTF Path to the Genome annotation GTF file + --data-name DATA_NAME + Data set name used to name submission shell scripts + --task-dir TASK_DIR Directory to write individual task scripts + +optional arguments: + -h, --help show this help message and exit + --novelSS Enable rMATS novelSS option to include novel splice + sites detected from the RNA-seq data (Default is False) +``` + +## `exp_matrix` + +After all samples of interest have been processed by `process_rnaseq`, users can use this script to generate a gene expression matrix, which will be used as annotations in downstream IRIS prediction and/or proteomics reports. +``` +usage: IRIS exp_matrix [-h] [--exp-cutoff EXP_CUTOFF] -o OUTDIR -n DATA_NAME + gene_exp_file_list + +required arguments: + gene_exp_file_list A txt manifest of path(s) of cufflinks gene expression + output(s). + -n DATA_NAME, --data-name DATA_NAME + Name of the dataset (disease state, study name, group + name etc.). + +optional arguments: + -h, --help show this help message and exit + --exp-cutoff EXP_CUTOFF + Gene expression cut-off based on FPKM (Default is 1) + -o OUTDIR, --outdir OUTDIR + Output directory for IRIS exp_matrix +``` + +## `index` + +This step is incorporated into `format`. For users who already have a matrix of AS PSI values (generated by rMATS or another tool), this command performs the indexing and other steps needed to prepare for IRIS screening. +``` +usage: IRIS index [-h] -t {SE,RI,A3SS,A5SS} -n DATA_NAME [-c COV_CUTOFF] + [-o OUTDIR] + splicing_matrix + +required arguments: + splicing_matrix Tab-delimited matrix of splicing events (row) vs. + sample IDs (col) + -t {SE,RI,A3SS,A5SS}, --splicing-event-type {SE,RI,A3SS,A5SS} + String of splicing event types based on rMATS + definition (SE,RI,A3SS,A5SS). Used to name output file + -n DATA_NAME, --data-name DATA_NAME + Name of data matrix (disease state, study name, group + name, etc.) being indexed. Used by IRIS during + screening + +optional arguments: + -h, --help show this help message and exit + -c COV_CUTOFF, --cov-cutoff COV_CUTOFF + For naming purposes, the input average coverage cutoff + used when generating the PSI matrix (Default is 10) + -o OUTDIR, --outdir OUTDIR + Output directory for IRIS database +``` + +## `translate` + +``` +usage: IRIS translate [-h] -g REF_GENOME -t {SE,RI,A3SS,A5SS} --gtf GTF -o + OUTDIR [--all-orf] [--ignore-annotation] + [--remove-early-stop] [-c DELTAPSI_COLUMN] + [-d DELTAPSI_CUT_OFF] [--no-tumor-form-selection] + [--check-novel] + as_input + +required arguments: + as_input Inputs AS event coordinates and delta PSI values + -g REF_GENOME, --ref-genome REF_GENOME + Specifies reference genome (FASTA format) location + -t {SE,RI,A3SS,A5SS}, --splicing-event-type {SE,RI,A3SS,A5SS} + String of splicing event types based on rMATS + definition (SE,RI,A3SS,A5SS). Used to name output file + --gtf GTF Path to the Genome annotation GTF file. Used to define + exon ends for microexons + -o OUTDIR, --outdir OUTDIR + Defines IRIS translation output directory + +optional arguments: + -h, --help show this help message and exit + --all-orf Perform the 3 ORF translation.
ORF known in the + UniProtKB will be labeled as uniprotFrame in the bed + file (Default is to use the known ORF ONLY) + --ignore-annotation Perform 3 ORF translation without annotating known ORF + from the UniProtKB (Default is disabled) + --remove-early-stop Discard the peptide if it contains an early stop codon + (Default is to keep the truncated peptide) + -c DELTAPSI_COLUMN, --deltaPSI-column DELTAPSI_COLUMN + Column of deltaPSI value in matrix, 1-based (Default + is 5th column) + -d DELTAPSI_CUT_OFF, --deltaPSI-cut-off DELTAPSI_CUT_OFF + Defines cutoff of deltaPSI (or other metric) used to + select tumor-enriched splice form (Default is 0) + --no-tumor-form-selection + Translates splicing junctions derived from both + skipping and inclusion forms (Default is False) + --check-novel Translates splicing junctions derived from novel + splice sites only, using information passed from + screen_novelss (Default is False) +``` + +## `makesubsh_hla` + +This step uses the RNA-Seq FASTQ file to infer the HLA type of a sample. +``` +usage: IRIS makesubsh_hla [-h] [--fastq-folder-dir FASTQ_FOLDER_DIR] + --data-name DATA_NAME -o OUTDIR --label-string + LABEL_STRING --task-dir TASK_DIR + +required arguments: + --fastq-folder-dir FASTQ_FOLDER_DIR + Specify the path to the higher level of all folders + containing FASTQ files + --data-name DATA_NAME + Data set name used to name submission shell scripts. + -o OUTDIR, --outdir OUTDIR + Output directory for folders of seq2hla results + --label-string LABEL_STRING + String in the fastq file name between the reads pair + number and "fastq/fq". This is used to recognize + paired-end reads. e.g. For FASTQ_file_L1_R2.fastq.gz, + the label string is the "." between "2" and "fastq". + --task-dir TASK_DIR Directory to write individual task scripts + +optional arguments: + -h, --help show this help message and exit +``` + +## `pep2epitope` + +This module is a wrapper around prediction tools (IEDB) for predicting peptide-HLA binding. The `predict` and `epitope_post` modules can generate scripts to run this module in parallel and summarize the results into one TCR target report. +``` +usage: IRIS pep2epitope [-h] [-e EPITOPE_LEN_LIST] [-a HLA_ALLELE_LIST] -o + OUTDIR [--iedb-local IEDB_LOCAL] + [--ic50-cut-off IC50_CUT_OFF] + junction_pep_input + +required arguments: + junction_pep_input Inputs junction peptides + -e EPITOPE_LEN_LIST, --epitope-len-list EPITOPE_LEN_LIST + Epitope length for prediction (Default is 9,10,11) + -a HLA_ALLELE_LIST, --hla-allele-list HLA_ALLELE_LIST + List of HLA types (Default is HLA-A*01:01, + HLA-B*08:01, HLA-C*07:01) + -o OUTDIR, --outdir OUTDIR + Define output directory of pep2epitope + --iedb-local IEDB_LOCAL + Specify local IEDB location (if installed) + --ic50-cut-off IC50_CUT_OFF + Cut-off based on median value of consensus-predicted + IC50 values (Default is 500) + +optional arguments: + -h, --help show this help message and exit +``` diff --git a/IRIS_modules.md b/IRIS_modules.md deleted file mode 100644 index 7f41f7e..0000000 --- a/IRIS_modules.md +++ /dev/null @@ -1,271 +0,0 @@ -[back to IRIS quick guide](README.md) - -For questions about input file format, see example folder. - -For test runs with files in example folder, users will need to modify directories of 'fin_matrices', etc.
- -##### formatting -When starting from standard output of [rMATS](http://rnaseq-mats.sourceforge.net), users should use this step to 1) reformat splice junction counts into a PSI (percent-spliced-in) value matrix, and 2) index and 3) move the PSI matrix for IRIS screening (when -d is enabled). -```bash -IRIS formatting -h -usage: IRIS formatting [-h] -t {SE,RI,A3,A5} -n DATA_NAME -s {1,2} - [-c COV_CUTOFF] [-e] [-d IRIS_DB_PATH] - rmats_mat_path_manifest rmats_sample_order - -required arguments: - rmats_mat_path_manifest - txt manifest of path(s) to rMATS output folder(s) - rmats_sample_order txt manifest of corresponding rMATS input sample order file(s) - Required input for rMATS - -t {SE,RI,A3,A5}, --splicing_event_type {SE,RI,A3,A5} - String of splicing event types based on rMATS definition (SE,RI,A3,A5) - Used to name output file - -n DATA_NAME, --data-name DATA_NAME - Defines dataset name (disease state, study name, group name etc.) - Used during IRIS screening - -s {1,2}, --sample-name-field {1,2} - Specifies sample name field for each sample in sample order file(s) - listed by "rmats_sample_order" (1- BAM file name, 2- BAM folder name) - -optional arguments: - -h, --help Shows help message and exits - -c COV_CUTOFF, --cov-cutoff COV_CUTOFF - Average coverage filter for merged matrix (default is 10) - -e, --merge-events-only - Do not perform matrix merge, only merge events list - -d IRIS_DB_PATH, --iris-db-path IRIS_DB_PATH - Path to IRIS database - Formatted/indexed AS matrices are stored here and used during IRIS screening -``` - -##### screening -This step takes a user-defined screening parameter file ([example](example/Test.para)), which performs comparisons against reference databases, and returns tumor-associated, tumor-recurrent, and tumor-specific AS events based on user-defined criteria. - -When the -t option is enabled, the screening step translates identified tumor AS events into peptide sequences that can be used in the prediction step. -```bash -IRIS screening -h -usage: IRIS screening [-h] [-o OUTDIR] [-t] parameter_fin - -required arguments: - parameter_fin File of IRIS screening parameters - -o OUTDIR, --outdir OUTDIR - Directory of IRIS screening results - -optional arguments: - -h, --help Shows help message and exits - -t, --translating Translates IRIS-screened tumor splice junctions into peptides -``` - -##### prediction -This step takes the screening result and performs annotation of extracellular and HLA-binding epitope predictions to discover immunotherapy targets. - -IRIS prediction of HLA-binding epitopes is a massive prediction job that requires access to computing clusters with the SGE system for completion. The 'prediction' step will create qsub scripts for job array submission. 
- -###### Perform extracellular (CAR-T) target annotation & prepare epitope (TCR) target prediction - ``` -IRIS prediction -h -usage: IRIS prediction [-h] [-p PARAMETER_FIN] [--iedb-local IEDB_LOCAL] - [-c DELTAPSI_COLUMN] [-d DELTAPSI_CUT_OFF] -m MHC_LIST - [--extracellular-anno-by-junction] - IRIS_screening_result_path - -required arguments: - IRIS_screening_result_path - Input AS event coordinates and PSI values - -p PARAMETER_FIN, --parameter-fin PARAMETER_FIN - File of parameters used in IRIS screening - --iedb-local IEDB_LOCAL - Specify local IEDB location (if installed) - -m MHC_LIST, --mhc-list MHC_LIST - List of HLA/MHC types among samples - HLA type follows seq2HLA format - -optional arguments: - -h, --help Shows help message and exits - -c DELTAPSI_COLUMN, --deltaPSI-column DELTAPSI_COLUMN - Column of deltaPSI value in matrix, 1-based (default is 5th column) - -d DELTAPSI_CUT_OFF, --deltaPSI-cut-off DELTAPSI_CUT_OFF - Defines cutoff of deltaPSI (or other metric) to select tumor-enriched - splice form (default is 0) - --extracellular-anno-by-junction - By default, CAR-T targets are annotated by association of event - with extracellular domain - This option annotates target based on a junction (not recommended) - ``` - -###### Epitope (TCR) target prediction (requires SGE system) - ``` -IRIS epitope_post -h -usage: IRIS epitope_post [-h] -p PARAMETER_FIN -o OUTDIR -m MHC_BY_SAMPLE - [-e GENE_EXP_MATRIX] [--ic50-cut-off IC50_CUT_OFF] - -required arguments: - -p PARAMETER_FIN, --parameter_fin PARAMETER_FIN - File of parameters used in IRIS screening - -o OUTDIR, --outdir OUTDIR - Directory of IRIS screening results - -m MHC_BY_SAMPLE, --mhc-by-sample MHC_BY_SAMPLE - Tab-delimited matrix of HLA/MHC type vs. samples - HLA type follows seq2HLA format - -e GENE_EXP_MATRIX, --gene-exp-matrix GENE_EXP_MATRIX - Tab-delimited matrix of gene expression vs. samples - -optional arguments: - -h, --help Shows help message and exits - --ic50-cut-off IC50_CUT_OFF - Specifies IC50 cut-off to define HLA-binding epitopes (default is 500) - ``` - -##### process_rnaseq -When starting from a FASTQ file, users should use this step to perform RNA-Seq alignment and quantification. This module uses STAR and cufflinks. This module only takes one sample (can be multiple FASTQ files) for each run. Users are recommended to run this module in parallel in the SGE system. -``` -IRIS process_rnaseq -h -usage: IRIS process_rnaseq [-h] --starGenomeDir STARGENOMEDIR --gtf GTF -p - SAMPLEID_OUTDIR [--db-length DB_LENGTH] [--mapping] - [--quant] [--sort] - readsFilesRNA - -required arguments: - --starGenomeDir STARGENOMEDIR - Path to STAR-indexed reference genome - Passes to "genomeDir" parameter in STAR - --gtf GTF Genome annotation file. - -p SAMPLEID_OUTDIR, --sampleID-outdir SAMPLEID_OUTDIR - Output directory, where sample ID will be used as output folder name - --db-length DB_LENGTH - Passes to "sjdbOverhang" parameter in STAR (default is 100) - readsFilesRNA Specifies path to paired-end FASTQ files for sample - Files separated by "," - -optional arguments: - -h, --help Shows help message and exits - --mapping Only perform reads mapping - --quant Only perform gene expression and AS quantification - --sort Only perform BAM file sorting - ``` - -##### makeqsub_rmats (requires SGE system) -After running 'process_rnaseq', this step can be used to prepare files to run rMATS-turbo in parallel in the SGE system. 
-``` -IRIS makeqsub_rmats -h -usage: IRIS makeqsub_rmats [-h] --rMATS-path RMATS_PATH --bam-dir BAM_DIR - --gtf GTF --read-length READ_LENGTH - -required arguments: - --rMATS-path RMATS_PATH - Path to rMATS-turbo script - --bam-dir BAM_DIR Path one level higher to folders containing BAM file generated by 'process_rnaseq' - --gtf GTF Genome annotation file - --read-length READ_LENGTH - Passes to "readLength" parameter in rMATS-turbo - -optional arguments: - -h, --help Shows help message and exits - ``` - -##### exp_matrix -After running 'process_rnaseq', if samples of interest are all processed, users can use this script to generate a gene expression matrix, which will be used as annotations in downstream IRIS prediction and/or proteomics reports. -``` -IRIS exp_matrix -h -usage: IRIS exp_matrix [-h] [--exp-cutoff EXP_CUTOFF] -o OUTDIR -n DATA_NAME - gene_exp_file_list - -required arguments: - gene_exp_file_list txt manifest of path(s) of cufflinks gene expression output(s) - -n DATA_NAME, --data-name DATA_NAME - Name of dataset (disease state, study name, group name, etc.) - -optional arguments: - -h, --help Shows help message and exits - --exp-cutoff EXP_CUTOFF - Gene expression cut-off based on FPKM (default is 1) - -o OUTDIR, --outdir OUTDIR - Output directory for IRIS exp_matrix -``` - -##### indexing -This step is incorporated by formatting. For users who already have a matrix of AS PSI values (generated by rMATS or another tool), this command could finish the indexing and other steps to prepare for IRIS screening. -```bash -IRIS indexing -h -usage: IRIS indexing [-h] -n DATA_NAME [-d DB_DIR] splicing_matrix - -required arguments: - splicing_matrix Tab-delimited matrix of splicing events (row) vs. sample IDs (col) - -n DATA_NAME, --data-name DATA_NAME - Name of data matrix (disease state, study name, group name, etc.) being - formatted & indexed - Used by IRIS during screening - -optional arguments: - -h, --help Shows help message and exits - -d DB_DIR, --db-dir DB_DIR - Directory of IRIS database - Program creates a folder in this directory for IRIS to recognize -``` - -##### translation -```bash -IRIS translation -h -usage: IRIS translation [-h] -g REF_GENOME -o OUTDIR [-c DELTAPSI_COLUMN] - [-d DELTAPSI_CUT_OFF] [--no-tumor-form-selection] - as_input - -required arguments: - as_input Inputs AS event coordinates and PSI values - -g REF_GENOME, --ref-genome REF_GENOME - Specifies reference genome (FASTA format) location - -o OUTDIR, --outdir OUTDIR - Defines IRIS translation output directory - -optional arguments: - -h, --help Show help message and exits - -c DELTAPSI_COLUMN, --deltaPSI-column DELTAPSI_COLUMN - Column of deltaPSI value in matrix, 1-based (default is 5th column) - -d DELTAPSI_CUT_OFF, --deltaPSI-cut-off DELTAPSI_CUT_OFF - Defines cutoff of deltaPSI (or other metric) used to select tumor-enriched - splice form (default is 0) - --no-tumor-form-selection - Translates splicing junctions derived from both skipping and inclusion forms - ``` - -##### seq2hla -This step uses the RNA-Seq FASTQ file to infer the HLA type of a sample. 
-``` -IRIS seq2hla -h -usage: IRIS seq2hla [-h] -b SEQ2HLA_PATH -p SAMPLEID_OUTDIR readsFilesCaseRNA - -required arguments: - -b SEQ2HLA_PATH, --seq2hla-path SEQ2HLA_PATH - Path to seq2hla folder - -p SAMPLEID_OUTDIR, --sampleID-outdir SAMPLEID_OUTDIR - Output directory, where sample ID will be used as output folder name - readsFilesCaseRNA Tumor sample paired-end fastq files, separated by "," - -optional arguments: - -h, --help Shows help message and exits - ``` - -##### pep2epitope -This module is a wrapper of prediction tools (IEDB) for predicting peptide-HLA binding. The 'prediction' and 'epitope_post' modules can make qsub submissions to run this module in parallel and summarize the result into one TCR target report. -```IRIS pep2epitope -h -usage: IRIS pep2epitope [-h] [-e EPITOPE_LEN_LIST] [-a HLA_ALLELE_LIST] -o - OUTDIR [--iedb-local IEDB_LOCAL] - [--ic50-cut-off IC50_CUT_OFF] - junction_pep_input - -required arguments: - junction_pep_input Inputs AS event coordinates and PSI values - -e EPITOPE_LEN_LIST, --epitope-len-list EPITOPE_LEN_LIST - Epitope length for prediction (default is 9,10,11) - -a HLA_ALLELE_LIST, --hla-allele-list HLA_ALLELE_LIST - List of HLA types (default is HLA-A*01:01, HLA-B*08:01, HLA-C*07:01) - -o OUTDIR, --outdir OUTDIR - Define output directory of pep2epitope - --iedb-local IEDB_LOCAL - Specify local IEDB location (if installed) - --ic50-cut-off IC50_CUT_OFF - Cut-off based on median value of consensus-predicted IC50 values (default is 500) - ``` - -optional arguments: - -h, --help Shows help message and exits diff --git a/README.md b/README.md index e01c3ef..89d00a4 100644 --- a/README.md +++ b/README.md @@ -1,94 +1,201 @@ # IRIS: Isoform peptides from RNA splicing for Immunotherapy target Screening +## Quick guide -### Quick guide -- [Dependencies](#dependencies) -- [Installation](#installation) -- [Usage](#usage) - - [Usage - individual modules (for customized pipelines)](#individual-modules) - - [Usage - streamlined major modules (for common use)](#streamlined-major-modules) -- [Example](#example) -- [Output](#example-output) -- [Contact](#contact) -- [Publication](#publication) +* [Dependencies](#dependencies) +* [Installation](#installation) +* [Usage](#usage) + + [Usage - individual functions (for customized pipelines)](#individual-functions) + + [Usage - streamlined major functions (for common use)](#streamlined-major-functions) + + [Snakemake](#snakemake) +* [Example](#example) +* [Output](#example-output) +* [Contact](#contact) +* [Publication](#publication) +## Dependencies +### Core dependencies (required for major IRIS functions/steps - format, screen, and predict) -### Dependencies +* python 2.7.x (numpy, scipy, seaborn, pyBigWig, statsmodels, pysam) +* [IEDB stand-alone 20130222 2.15.5](http://tools.iedb.org/main/download/) + + IEDB additionally depends on: + - [tcsh](http://www.tcsh.org) + - [gawk](http://www.gnu.org/software/gawk/) +* [bedtools 2.29.0](https://bedtools.readthedocs.io/en/latest/) -#### Core dependencies (required for major IRIS modules/steps - formatting, screening, and prediction): -- python 2.7.x (numpy, scipy, seaborn, pyBigWig, etc.) 
-- [IEDB stand-alone 20130222 2.15.5 (2.22.1 is not fully tested)](http://tools.iedb.org/main/download/) -- [bedtools 2.29.0](https://bedtools.readthedocs.io/en/latest/) +### Other dependencies (required for processing raw RNA-Seq and MS data) -#### Other dependencies (required for processing raw RNA-Seq and MS data) -- [STAR 2.5.3](https://github.com/alexdobin/STAR/releases/tag/2.5.3a): required for IRIS RNA-seq processing -- [samtools 1.3](https://sourceforge.net/projects/samtools/files/samtools/): required for IRIS RNA-seq processing -- [rMATS-turbo](http://rnaseq-mats.sourceforge.net): required for IRIS RNA-seq processing -- [Cufflinks 2.2.1](http://cole-trapnell-lab.github.io/cufflinks/install/): required for IRIS RNA-seq processing -- [seq2HLA](https://bitbucket.org/sebastian_boegel/seq2hla/src/default/): required for HLA typing; requires [bowtie](http://bowtie-bio.sourceforge.net/index.shtml) -- [MS GF+ (v2018.07.17)](https://github.com/MSGFPlus/msgfplus): required for MS search; requiring [Java](https://www.java.com/en/download/) +* [STAR 2.5.3](https://github.com/alexdobin/STAR/releases/tag/2.5.3a): required for IRIS RNA-seq processing +* [samtools 1.3](https://sourceforge.net/projects/samtools/files/samtools/): required for IRIS RNA-seq processing +* [rMATS-turbo](https://github.com/Xinglab/rmats-turbo): required for IRIS RNA-seq processing +* [Cufflinks 2.2.1](http://cole-trapnell-lab.github.io/cufflinks/install/): required for IRIS RNA-seq processing +* [seq2HLA](https://bitbucket.org/sebastian_boegel/seq2hla/src/default/): required for HLA typing; requires [bowtie](http://bowtie-bio.sourceforge.net/index.shtml) +* [MS GF+ (v2018.07.17)](https://github.com/MSGFPlus/msgfplus): required for MS search; requiring [Java](https://www.java.com/en/download/) +* [R](https://www.r-project.org/): used by seq2HLA +## Installation +### 1. Download + +#### 1.1 Download IRIS program -### Installation -Two steps to set up IRIS: -#### 1. Download -##### 1.1 Download IRIS program The IRIS program can be downloaded directly from the repository, as shown below: ``` git clone https://github.com/Xinglab/IRIS.git cd IRIS ``` -__For full functionality, IRIS requires use of the SGE system. For users who want to use functions involving SGE (see [Usage](#usage) for details), please check IRIS/config.py to ensure qsub parameters are correct before moving to the next step.__ -##### 1.2 Download IRIS db -IRIS loads a big-data reference database of splicing events and other genomic annotations. \ -These data are included in [IRIS_data.tgz](https://drive.google.com/file/d/1TaswpWPnEd4TXst46jsa9XSMzLsbzjOQ/view?usp=sharing) (a Google Drive link; size ~10 GB). Users need to move this file to the IRIS folder for streamlined installation. -##### 1.3 Download IEDB MHC I prediction tools -Download IEDB_MHC_I-X.XX.X.tar.gz from IEDB website (see [Dependencies](#dependencies)). Create a folder named 'IEDB' in the IRIS folder, then move the downloaded gz file to the 'IEDB' folder. -#### 2. Install and configure -Under the IRIS folder, do: +__IRIS is designed to make use of a computing cluster to improve performance. For users who want to enable cluster execution for functions that support it (see [Configure](#3-configure-for-compute-cluster) for details), please update the contents of [snakemake_profile/](snakemake_profile/) to ensure compatibility with the available compute environment.__ + +#### 1.2 Download IRIS db + +IRIS loads a big-data reference database of splicing events and other genomic annotations. 
These data are included in [IRIS_data.v2.0.0](https://drive.google.com/drive/folders/1zhmXoajD5RyjxVTYbGZ-ebic1VPfEYKz?usp=sharing) (a Google Drive link; size of entire folder is ~400 GB; users can select reference groups to download). The files need to be placed under `./IRIS_data/` + +The files can be automatically downloaded with [google_drive_download.py](google_drive_download.py). Downloading a large amount of data with the API requires authentication: +* https://cloud.google.com/docs/authentication/production +* https://cloud.google.com/bigquery/docs/authentication/service-account-file + +To use the script, first create a service account: +* Go to google cloud console -> IAM & Admin -> Service Accounts -> create service account +* Give the new account: role=owner +* Click the new service account email on the service account page +* Download a .json key by clicking: keys -> add key -> create new key -> json + +That .json key is passed to [google_drive_download.py](google_drive_download.py) + +#### 1.3 Download IEDB MHC I prediction tools + +Download `IEDB_MHC_I-2.15.5.tar.gz` from the IEDB website (see [Dependencies](#dependencies)). Create a folder named `IEDB/` in the IRIS folder, then move the downloaded gz file to `IEDB/`. From http://tools.iedb.org/main/download/ +* click "MHC Class I" +* click "previous version" +* find and download version 2.15.5 + +The manual download is needed because there is a license that must be accepted. + +### 2. Install + +[./install](./install) can automatically install most dependencies to conda environments: +* conda must already be installed for the script to work + + https://docs.conda.io/en/latest/miniconda.html +* The install script will check if `IRIS_data/` has been downloaded + + To download see [1.2 Download IRIS db](#12-download-iris-db) +* The install script will check if IEDB tools has been downloaded + + To download see [1.3 Download IEDB MHC I prediction tools](#13-download-iedb-mhc-i-prediction-tools) + +Under the IRIS folder, to install IRIS [core dependencies](#core-dependencies-required-for-major-iris-functionssteps---format-screen-and-predict), do: ``` ./install core ``` -Follow instructions to finish the installation of conda, python and its dependencies, bedtools, the downloaded IEDB package, and the IRIS data and packages. To install optional dependencies not needed for the most common IRIS usage: + +To install [optional dependencies](#other-dependencies-required-for-processing-raw-rna-seq-and-ms-data) not needed for the most common IRIS usage: ``` ./install all ``` -### Usage -- For streamlined AS-derived target discovery, please follow [major modules](#streamlined-major-modules) and run the corresponding toy example. -- For customized pipeline development, please check [all modules](#individual-modules) of IRIS. +### 3. Configure for compute cluster -#### Individual modules -IRIS provides individual modules/steps, allowing users to build pipelines for their customized needs.\ -For a description of each [module/step](IRIS_modules.md), including RNA-seq preprocessing, HLA typing, proteo-transcriptomic MS searching, visualization, etc., please click [here](IRIS_modules.md) or the subheader above. -``` -usage: IRIS [-h] [--version] - {formatting,screening,prediction,epitope_post,process_rnaseq,makeqsub_rmats,exp_matrix,indexing,translation,pep2epitope,screening_plot,seq2hla,parse_hla,ms_makedb,ms_search,ms_parse} - ... +[Snakefile](Snakefile) describes the IRIS pipeline. 
The configuration for running jobs can be set by editing [snakemake_profile/](snakemake_profile/). The provided configuration adapts IRIS to use Slurm. Other compute environments can be supported by updating this directory: +* [snakemake_profile/config.yaml](snakemake_profile/config.yaml): Sets various Snakemake parameters, including whether to submit jobs to a cluster. +* [snakemake_profile/cluster_submit.py](snakemake_profile/cluster_submit.py): Script to submit jobs. +* [snakemake_profile/cluster_status.py](snakemake_profile/cluster_status.py): Script to check job status. +* [snakemake_profile/cluster_commands.py](snakemake_profile/cluster_commands.py): Commands specific to the cluster management system being used. The default implementation is for Slurm. Other cluster environments can be used by changing this file. For example, [snakemake_profile/cluster_commands_sge.py](snakemake_profile/cluster_commands_sge.py) can be used to overwrite `cluster_commands.py` to support an SGE cluster. +* To force Snakemake to execute on the local machine, modify [snakemake_profile/config.yaml](snakemake_profile/config.yaml): + + comment out `cluster` + + set `jobs: {local cores to use}` + + uncomment the `resources` section and set `mem_mb: {MB of RAM to use}` + +### 4. Known issues + +* The conda install of Python 2 may give an error like `ImportError: No module named _sysconfigdata_x86_64_conda_linux_gnu` + + Check for the error by activating `conda_env_2` and running `python` + + Resolve with commands similar to + - `cd conda_env_2/lib/python2.7/` + - `cp _sysconfigdata_x86_64_conda_cos6_linux_gnu.py _sysconfigdata_x86_64_conda_linux_gnu.py` +* The installed version of R may depend on an old version of libreadline that is not available in conda + + Check for the error by activating `conda_env_2` and running `R` + + Resolve by activating `conda_env_2` and manually following the steps in the `install_readline()` function of [./install](./install) +* IRIS uses `--label-string` to determine which fastq files are for read 1 and read 2 + + To avoid any issues, name your fastq files so that they end with `1.fastq` and `2.fastq` to indicate which read of the pair each file contains + +## Usage + +* For streamlined AS-derived target discovery, please follow [major functions](#streamlined-major-functions) and run the corresponding toy example. +* For customized pipeline development, please check [all functions](#individual-functions) of IRIS. + +This flowchart shows how the IRIS functions are organized: +![iris_diagram](docs/iris_diagram.png) + +### Individual functions + +IRIS provides individual functions/steps, allowing users to build pipelines for their customized needs. [IRIS_functions.md](IRIS_functions.md) describes each module/step, including RNA-seq preprocessing, HLA typing, proteo-transcriptomic MS searching, visualization, etc.
+```
 IRIS -- IRIS
 
 positional arguments:
-  {formatting,screening,prediction,epitope_post,process_rnaseq,makeqsub_rmats,exp_matrix,indexing,translation,pep2epitope,screening_plot,seq2hla,parse_hla,ms_makedb,ms_search,ms_parse}
-    formatting          Formats AS matrices from rMATS, followed by indexing for IRIS
-    screening           Screens AS-derived tumor antigens using big-data reference
-    prediction          Predicts and annotates AS-derived TCR (pre-prediction) and CAR-T targets
-    epitope_post        Post-prediction step to summarize predicted TCR targets
-    process_rnaseq      Processes RNA-Seq FASTQ files to quantify gene expression and AS
-    makeqsub_rmats      Makes qsub files for running rMATS-turbo 'prep' step
-    exp_matrix          Makes a merged gene expression matrix from multiple cufflinks results
-    indexing            Indexes AS matrices for IRIS
-    translation         Translates AS junctions into junction peptides
+    format              Formats AS matrices from rMATS, followed by indexing
+                        for IRIS
+    screen              Screens AS-derived tumor antigens using big-data
+                        reference
+    predict             Predicts and annotates AS-derived TCR (pre-prediction)
+                        and CAR-T targets
+    epitope_post        Post-prediction step to summarize predicted TCR
+                        targets
+    process_rnaseq      Processes RNA-Seq FASTQ files to quantify gene
+                        expression and AS
+    makesubsh_mapping   Makes submission shell scripts for running
+                        'process_rnaseq'
+    makesubsh_rmats     Makes submission shell scripts for running rMATS-turbo
+                        'prep' step
+    makesubsh_rmatspost
+                        Makes submission shell scripts for running rMATS-turbo
+                        'post' step
+    exp_matrix          Makes a merged gene expression matrix from multiple
+                        cufflinks results
+    makesubsh_extract_sjc
+                        Makes submission shell scripts for running
+                        'extract_sjc'
+    extract_sjc         Extracts SJ counts from STAR-aligned BAM file and
+                        annotates SJs with number of uniquely mapped reads
+                        that support the splice junction.
+    sjc_matrix          Makes SJ count matrix by merging SJ count files from a
+                        specified list of samples. Performs indexing of the
+                        merged file.
+    index               Indexes AS matrices for IRIS
+    translate           Translates AS junctions into junction peptides
     pep2epitope         Wrapper to run IEDB for peptide-HLA binding prediction
-    screening_plot      Makes stacked/individual violin plots for list of AS events
-    seq2hla             Wrapper to run seq2HLA for HLA typing using RNA-Seq
-    parse_hla           Summarizes seq2HLA results of all input samples into matrices for IRIS use
+    screen_plot         Makes stacked/individual violin plots for list of AS
+                        events
+    screen_sjc          Screens AS-derived tumor antigens by comparing number
+                        of samples expressing a splice junction using big-data
+                        reference of SJ counts
+    append_sjc          Appends SJC result as an annotation to PSI-based
+                        screening results and epitope prediction results in a
+                        specified screening output folder.
+    annotate_ijc        Annotates inclusion junction count info to PSI-based
+                        screening results or epitope prediction results in a
+                        specified screening output folder. Can be called from
+                        append_sjc to save time.
+    screen_cpm          Screens AS-derived tumor antigens by comparing splice
+                        junction CPM using big-data reference of SJ counts
+    append_cpm          Appends CPM result as an annotation to PSI-based
+                        screening results and epitope prediction results in a
+                        specified screening output folder.
+    screen_novelss      Screens AS-derived tumor antigens for unannotated
+                        events using big-data reference of SJ counts
+    screen_sjc_plot     Makes stacked/individual barplots of percentage of
+                        samples expressing a splice junction for list of AS
+                        events
+    makesubsh_hla       Makes submission shell scripts for running seq2HLA for
+                        HLA typing using RNA-Seq
+    parse_hla           Summarizes seq2HLA results of all input samples into
+                        matrices for IRIS use
     ms_makedb           Generates proteo-transcriptomic database for MS search
     ms_search           Wrapper to run MSGF+ for MS search
-    ms_parse            Parses MS search results to generate tables of identified peptides
+    ms_parse            Parses MS search results to generate tables of
+                        identified peptides
+    visual_summary      Makes a graphic summary of IRIS results
 
 optional arguments:
   -h, --help            show this help message and exit
@@ -97,98 +204,218 @@ optional arguments:
 
 For command line options of each sub-command, type: IRIS COMMAND -h
 ```
+
+### Streamlined major functions
+
+The common use of IRIS immunotherapy target discovery comprises three major steps. For a quick test, see [Example](#example), which uses Snakemake to run a small data set.
+* Step 1. Generate the PSI-based AS matrix from rMATS output (& index)
+  + The IRIS format option -d should be used to save the generated PSI-based AS matrix to the downloaded IRIS db.
+  + Example files for `rmats_mat_path_manifest` and `rmats_sample_order` can be found under the 'example' folder for the test run.
+  + IRIS index will create an index for the PSI-based AS matrix generated by IRIS format; -o should be the path to the folder containing the AS matrix.
+```
+usage: IRIS format [-h] -t {SE,RI,A3SS,A5SS} -n DATA_NAME -s {1,2}
+                   [-c COV_CUTOFF] [-i] [-e] [-d IRIS_DB_PATH] [--novelSS]
+                   [--gtf GTF]
+                   rmats_mat_path_manifest rmats_sample_order
+
+usage: IRIS index [-h] -t {SE,RI,A3SS,A5SS} -n DATA_NAME
+                  [-c COV_CUTOFF] [-o OUTDIR]
+                  splicing_matrix
+```
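+For example, a sketch of Step 1 (file paths are illustrative placeholders; the matrix file name follows the `splicing_matrix.[AS_TYPE].cov10.[DATA_NAME].txt` pattern produced by IRIS format):
+```
+IRIS format rmats_matrix_list.txt rmats_sample_order.txt -t SE -n my_run -s 1 \
+    -d IRIS_data/db
+IRIS index IRIS_data/db/my_run/splicing_matrix/splicing_matrix.SE.cov10.my_run.txt \
+    -t SE -n my_run -o IRIS_data/db/my_run/splicing_matrix
+```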
-#### Streamlined major modules
-The common use of IRIS immunotherapy target discovery comprises three major steps. For a quick test, see [Example](#example), in which a shell script is provided for a streamlined example run:
-- Step 1. IRIS formatting (& indexing)
+* Step 2. IRIS screen (& translation) (the 'tumor-association' screen)
+  + [example/parameter_file_description.txt](example/parameter_file_description.txt) describes `PARAMETER_FIN`, and [example/Test.para](example/Test.para) is an example.
+  + Option -t is required for TCR target prediction
 ```
-usage: IRIS formatting [-h] -t {SE,RI,A3,A5} -n DATA_NAME -s {1,2}
-                       [-c COV_CUTOFF] [-e] [-d IRIS_DB_PATH]
-                       rmats_mat_path_manifest rmats_sample_order
+usage: IRIS screen [-h] -p PARAMETER_FIN
+                   --splicing-event-type {SE,RI,A3SS,A5SS} -o OUTDIR [-t]
+                   [-g GTF] [--all-orf] [--ignore-annotation]
+                   [--remove-early-stop] [--min-sample-count MIN_SAMPLE_COUNT]
+                   [--use-existing-test-result]
 ```
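+For example, a sketch of Step 2 using the provided example parameter file (the output directory is an illustrative placeholder):
+```
+IRIS screen -p example/Test.para --splicing-event-type SE -o results/my_run/screen -t
+```
+The -t flag runs IRIS translate on the screening results, which is required for the later TCR target prediction step.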
-- Step 2. IRIS screening (& translation)
-Here is a [description of the parameter file](example/parameter_file_description.txt) and an [example file](example/Test.para).
+* Step 3. IRIS predict (predicts both extracellular targets and epitopes; __designed for cluster execution__)
+  + IRIS predict can generate CAR-T annotation results and prepare a job array submission for TCR epitope prediction. TCR prediction preparation is optional and can be disabled by using --extracellular-only.
+  + `IRIS epitope_post` will summarize the TCR epitope prediction results after the TCR epitope prediction jobs from IRIS predict are submitted and finished (the job array submission step can be done manually or using Snakemake)
+  + `MHC_LIST` and `MHC_BY_SAMPLE` can be generated by running `HLA_typing` (within or outside IRIS). Note that predictions need not be restricted to the HLA types detected from the input RNA samples: users can instead supply dummy files containing only HLA types of interest or common HLA types, as long as the HLA types in the dummy `hla_types.list` and `hla_patient.tsv` are consistent with each other. Example files for `hla_types.list` and `hla_patient.tsv` can be found under [example/HLA_types/](example/HLA_types/).
 ```
-usage: IRIS screening [-h] [-o OUTDIR] [-t] parameter_fin
+usage: IRIS predict [-h] --task-dir TASK_DIR -p PARAMETER_FIN
+                    -t {SE,RI,A3SS,A5SS} [--iedb-local IEDB_LOCAL]
+                    [-m MHC_LIST] [--extracellular-only] [--tier3-only]
+                    [--gene-exp-matrix GENE_EXP_MATRIX] [-c DELTAPSI_COLUMN]
+                    [-d DELTAPSI_CUT_OFF] [-e EPITOPE_LEN_LIST] [--all-orf]
+                    [--extracellular-anno-by-junction]
+                    IRIS_screening_result_path
+
+usage: IRIS epitope_post [-h] -p PARAMETER_FIN -o OUTDIR
+                         -t {SE,RI,A3SS,A5SS} -m MHC_BY_SAMPLE
+                         -e GENE_EXP_MATRIX [--tier3-only] [--keep-exist]
+                         [--epitope-len-list EPITOPE_LEN_LIST]
+                         [--no-match-to-canonical-proteome]
+                         [--no-uniqueness-annotation]
+                         [--ic50-cut-off IC50_CUT_OFF]
 ```
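+For example, a sketch of Step 3 (paths are illustrative placeholders; `IRIS epitope_post` is run only after the prediction job array has finished):
+```
+IRIS predict results/my_run/screen --task-dir results/my_run/predict_tasks \
+    -p example/Test.para -t SE -m example/HLA_types/hla_types.list
+IRIS epitope_post -p example/Test.para -o results/my_run/screen -t SE \
+    -m example/HLA_types/hla_patient.tsv -e my_gene_exp_matrix.txt
+```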
-- Step 3. IRIS prediction (predicts both extracellular targets and epitopes; __requires SGE system__)
+* Step 4. IRIS screen of the presence/absence of splice junctions (required for the 'tumor-specificity' screen)
+  + IRIS append_sjc combines `screen` and `screen_sjc` results (by appending `screen_sjc` outputs to `screen` outputs). The 'integrated' output contains annotations for tumor-specific targets.
+  + The IRIS append_sjc -i option can be used to execute both the IRIS append_sjc and IRIS annotate_ijc functions. If the -i option is used, the -p and -e arguments are required.
 ```
-usage: IRIS prediction [-h] [-p PARAMETER_FIN] [--iedb-local IEDB_LOCAL]
-                       [-c DELTAPSI_COLUMN] [-d DELTAPSI_CUT_OFF] -m MHC_LIST
-                       [--extracellular-anno-by-junction]
-                       IRIS_screening_result_path
-
-usage: IRIS epitope_post [-h] -p PARAMETER_FIN -o OUTDIR -m MHC_BY_SAMPLE
-                         [-e GENE_EXP_MATRIX] [--ic50-cut-off IC50_CUT_OFF]
+usage: IRIS screen_sjc [-h] -p PARAMETER_FIN
+                       --splicing-event-type {SE,RI,A3SS,A5SS}
+                       -e EVENT_LIST_FILE -o OUTDIR
+                       [--use-existing-test-result]
+                       [--tumor-read-cov-cutoff TUMOR_READ_COV_CUTOFF]
+                       [--normal-read-cov-cutoff NORMAL_READ_COV_CUTOFF]
+
+usage: IRIS append_sjc [-h] --sjc-summary SJC_SUMMARY
+                       --splicing-event-type {SE,RI,A3SS,A5SS} -o OUTDIR
+                       [-i] [-u] [-p PARAMETER_FILE]
+                       [-e SCREENING_RESULT_EVENT_LIST]
+                       [--inc-read-cov-cutoff INC_READ_COV_CUTOFF]
+                       [--event-read-cov-cutoff EVENT_READ_COV_CUTOFF]
+
+usage: IRIS annotate_ijc [-h] -p PARAMETER_FILE
+                         --splicing-event-type {SE,RI,A3SS,A5SS}
+                         -e SCREENING_RESULT_EVENT_LIST -o OUTDIR
+                         [--inc-read-cov-cutoff INC_READ_COV_CUTOFF]
+                         [--event-read-cov-cutoff EVENT_READ_COV_CUTOFF]
+```
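+For example, a sketch of Step 4 (paths, including the `--sjc-summary` file name, are illustrative placeholders):
+```
+IRIS screen_sjc -p example/Test.para --splicing-event-type SE \
+    -e my_event_list.txt -o results/my_run/screen_sjc
+IRIS append_sjc --sjc-summary results/my_run/screen_sjc/SJ.summary.txt \
+    --splicing-event-type SE -o results/my_run/screen \
+    -i -p example/Test.para -e my_event_list.txt
+```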
-### Example
-We provide a wrapper ([run_example](run_example)) to run the above [IRIS streamlined major modules](#streamlined-major-modules) using [example files](example), included in the IRIS package. For customized pipeline development, we recommend that users use this script and [run_iris](run_iris) as a reference. Under the IRIS folder, do:
+### Snakemake
+
+The Snakemake workflow can be run with [./run](./run). First set the configuration values in [snakemake_config.yaml](snakemake_config.yaml):
+* Set the resources to allocate for each job:
+  + `{job_name}_{threads}`
+  + `{job_name}_{mem_gb}`
+  + `{job_name}_{time_hr}`
+* Set the reference files
+  + Provide the file names as `gtf_name:` and `fasta_name:`
+  + Either place the files in `./references/`
+  + Or provide a url under `reference_files:` to download the (potentially gzipped) files:
+```
+gtf_name: 'some_filename.gtf'
+fasta_name: 'other_filename.fasta'
+reference_files:
+  some_filename.gtf.gz:
+    url: 'protocol://url/for/some_filename.gtf.gz'
+  other_filename.fasta.gz:
+    url: 'protocol://url/for/other_filename.fasta.gz'
+```
+* Set the input files
+  + `sample_fastqs:` Set the read 1 and read 2 fastq files for each sample. For example:
+```
+sample_fastqs:
+  sample_name_1:
+    - '/path/to/sample_1_read_1.fq'
+    - '/path/to/sample_1_read_2.fq'
+  sample_name_2:
+    - '/path/to/sample_2_read_1.fq'
+    - '/path/to/sample_2_read_2.fq'
+```
+  + `blacklist`: an optional blacklist of AS events similar to [IRIS/data/blacklist.brain_2020.txt](IRIS/data/blacklist.brain_2020.txt)
+  + `mapability_bigwig`: an optional file for evaluating splice region mappability similar to `IRIS_data/resources/mappability/wgEncodeCrgMapabilityAlign24mer.bigWig`
+  + `mhc_list`: required if not starting with fastq files; similar to [example/HLA_types/hla_types.list](example/HLA_types/hla_types.list)
+  + `mhc_by_sample`: required if not starting with fastq files; similar to [example/HLA_types/hla_patient.tsv](example/HLA_types/hla_patient.tsv)
+  + `gene_exp_matrix`: optional tsv file with geneName as the first column and the expression for each sample in the remaining columns
+  + `splice_matrix_txt`: optional output file from IRIS index that can be used as a starting point
+  + `splice_matrix_idx`: the index file for `splice_matrix_txt`
+  + `sjc_count_txt`: optional output file from IRIS sjc_matrix that can be used as a starting point. Only relevant if `should_run_sjc_steps`
+  + `sjc_count_idx`: the index file for `sjc_count_txt`
+* Set other options
+  + `run_core_modules`: set to `true` to start with existing `IRIS format` output and HLA lists
+  + `run_all_modules`: set to `true` to start with fastq files
+  + `should_run_sjc_steps`: set to `true` to enable the splice junction based evaluation steps
+  + `star_sjdb_overhang`: used by STAR alignment. Ideally it should be `read_length - 1`, but the STAR manual says that 100 should work well as a default
+  + `run_name`: used to name output files that will be written to `IRIS_data/`
+  + `splice_event_type`: one of `[SE, RI, A3SS, A5SS]`
+  + `comparison_mode`: one of `[group, individual]`
+  + `stat_test_type`: one of `[parametric, nonparametric]`
+  + `use_ratio`: set to `true` to require a ratio of reference groups to pass the checks rather than a fixed count
+  + `tissue_matched_normal_..._{cutoff}`: set the cutoffs for the tissue-matched normal reference group (tier 1)
+  + `tissue_matched_normal_reference_group_names`: a comma-separated list of directory names under `IRIS_data/db`
+  + `tumor_..._{cutoff}`: set the cutoffs for the tumor reference group (tier 2)
+  + `tumor_reference_group_names`: a comma-separated list of directory names under `IRIS_data/db`
+  + `normal_..._{cutoff}`: set the cutoffs for the normal reference group (tier 3)
+  + `normal_reference_group_names`: a comma-separated list of directory names under `IRIS_data/db`
+
+## Example
+
+The Snakemake workflow is configured to run the above [IRIS streamlined major functions](#streamlined-major-functions) using [example/](example/). For customized pipeline development, we recommend referring to the [Snakefile](Snakefile), which defines the steps of the pipeline. Update the `/path/to/` values with full paths in [snakemake_config.yaml](snakemake_config.yaml) and make any adjustments to [snakemake_profile/](snakemake_profile/). Then
 ```
-  0 _example_Glioma_test.notest.txt
-  13 _example_Glioma_test.primary.txt
-  3 _example_Glioma_test.primary.txt.ExtraCellularAS.txt
-  11 _example_Glioma_test.prioritized.txt
-  3 _example_Glioma_test.prioritized.txt.ExtraCellularAS.txt
-  13 _example_Glioma_test.test.all.txt
-  13 primary/epitope_summary.junction-based.txt
-  74 primary/epitope_summary.peptide-based.txt
-  148 primary/pred_filtered.score500.txt
-  11 prioritized/epitope_summary.junction-based.txt
-  45 prioritized/epitope_summary.peptide-based.txt
-  84 prioritized/pred_filtered.score500.txt
+./run
 ```
-__Users can refer to relative paths in the parameter file Test.para, the file manifest matrice.txt, and the file samples.txt. These relative paths were made for the example run. Users will need to change the path for their own analyses.__ The run_iris script takes as input a [simplified parameter file](example/Test_simplified.para) and a .tar.gz of the [SJ_matrices](example/SJ_matrices.tar.gz) which are preprocessed before calling the IRIS modules. The preprocessing adds absolute paths based on the input relative paths.
+__As mentioned in [Usage](#usage), the full example is designed to be run with a compute cluster.__ It will take < 5 min for the formatting and screening steps and usually < 15 min for the prediction step (depending on available cluster resources).
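+
+Before launching the full run, the planned jobs can be previewed with a Snakemake dry run (a sketch; this invokes snakemake directly with the provided profile instead of going through [./run](./run)):
+```
+snakemake -n --profile snakemake_profile
+```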
+
+A successful test run will generate the following result files in `./results/NEPC_test/screen/` (row numbers are displayed before each file name):
+```
+   0 NEPC_test.SE.notest.txt
+   1 NEPC_test.SE.test.all_guided.txt
+   1 NEPC_test.SE.tier1.txt
+   1 NEPC_test.SE.tier1.txt.integratedSJC.txt
+   4 NEPC_test.SE.tier2tier3.txt.ExtraCellularAS.txt
+   4 NEPC_test.SE.tier2tier3.txt.ExtraCellularAS.txt.integratedSJC.txt
+   6 NEPC_test.SE.tier2tier3.txt
+   6 NEPC_test.SE.tier2tier3.txt.ijc_info.txt
+   6 NEPC_test.SE.tier2tier3.txt.integratedSJC.txt
+  11 NEPC_test.SE.test.all_voted.txt
+   4 SE.tier2tier3/epitope_summary.junction-based.txt
+   4 SE.tier2tier3/epitope_summary.junction-based.txt.integratedSJC.txt
+   9 SE.tier2tier3/epitope_summary.peptide-based.txt
+   9 SE.tier2tier3/epitope_summary.peptide-based.txt.integratedSJC.txt
+  11 SE.tier2tier3/pred_filtered.score500.txt
+```
+A summary graphic is generated at `./results/NEPC_test/visualization/summary.png`
+
+## Example output
-### Example output
 
 Final reports are shown in __bold__ font.
 
-#### Screening results
+### Screening results
 
-[TASK/DATA_NAME].test.all.txt: All AS events tested by IRIS screening
+`[TASK/DATA_NAME].[AS_TYPE].test.all_guided.txt`: All AS events tested by IRIS screening when a tissue-matched normal reference panel is available; a one-sided test is used to generate p-values.
 
-[TASK/DATA_NAME].notest.txt: During screening, AS events skipped due to no variance or no available comparisons
+`[TASK/DATA_NAME].[AS_TYPE].test.all_voted.txt`: All AS events tested by IRIS screening when no tissue-matched normal reference panel is available; a two-sided test is used to generate p-values for comparisons to normal panels.
 
-[TASK/DATA_NAME].primary.txt: Tumor AS events after comparison to tissue-matched normal panel ('primary' events)
+`[TASK/DATA_NAME].[AS_TYPE].notest.txt`: AS events skipped during screening due to no variance or no available comparisons
 
-[TASK/DATA_NAME].prioritized.txt: Tumor AS events after comparison to tissue-matched normal panel, tumor panel, and normal tissue panel ('prioritized' AS events)
+`[TASK/DATA_NAME].[AS_TYPE].tier1.txt`: Tumor AS events after comparison to the tissue-matched normal panel ('tier1' events)
+
+`[TASK/DATA_NAME].[AS_TYPE].tier2tier3.txt`: Tumor AS events after comparison to the tissue-matched normal panel, tumor panel, and normal tissue panel ('tier3' AS events)
 
-#### CAR-T annotation reports
+### CAR-T annotation reports
 
-__[TASK/DATA_NAME].primary.txt.ExtraCellularAS.txt__: Tumor AS events in 'primary' set that are associated with protein extracellular annotation and may be used for CAR-T targets
+__`[TASK/DATA_NAME].[AS_TYPE].tier1.txt.ExtraCellularAS.txt`__: Tumor AS events in the 'tier1' set that are associated with protein extracellular annotation and may be used for CAR-T targets
 
-__[TASK/DATA_NAME].prioritized.txt.ExtraCellularAS.txt__: Tumor AS events in 'prioritized' set that are associated with protein extracellular annotation and may be used for CAR-T targets
+__`[TASK/DATA_NAME].[AS_TYPE].tier2tier3.txt.ExtraCellularAS.txt`__: Tumor AS events in the 'tier3' set that are associated with protein extracellular annotation and may be used for CAR-T targets
 
-#### TCR prediction reports
+### TCR prediction reports
 
-primary/pred_filtered.score500.txt: IEDB prediction outputs for AS junction peptides from 'primary' set with HLA-peptide binding IC50 values passing user-defined cut-off
+`[AS_TYPE].tier1/pred_filtered.score500.txt`: IEDB prediction outputs for AS junction peptides from the 'tier1' set with HLA-peptide binding IC50 values passing the user-defined cut-off
 
-__primary/epitope_summary.peptide-based.txt__: AS-derived epitopes from 'primary' set that are predicted to bind user-defined HLA type
+__`[AS_TYPE].tier1/epitope_summary.peptide-based.txt`__: AS-derived epitopes from the 'tier1' set that are predicted to bind the user-defined HLA types
 
-__primary/epitope_summary.junction-based.txt__: Epitope-producing AS junctions from 'primary' set that are predicted to bind user-defined HLA type
+__`[AS_TYPE].tier1/epitope_summary.junction-based.txt`__: Epitope-producing AS junctions from the 'tier1' set that are predicted to bind the user-defined HLA types
 
-prioritized/pred_filtered.score500.txt: IEDB prediction outputs for AS junction peptides from 'prioritized' set with HLA-peptide binding IC50 value passing user-defined cut-off
+`[AS_TYPE].tier2tier3/pred_filtered.score500.txt`: IEDB prediction outputs for AS junction peptides from the 'tier3' set with HLA-peptide binding IC50 values passing the user-defined cut-off
 
-__prioritized/epitope_summary.peptide-based.txt__: AS-derived epitopes from 'prioritized' set that are predicted to bind user-defined HLA type
+__`[AS_TYPE].tier2tier3/epitope_summary.peptide-based.txt`__: AS-derived epitopes from the 'tier3' set that are predicted to bind the user-defined HLA types
 
-__prioritized/epitope_summary.junction-based.txt__: Epitope-producing AS junctions from 'prioritized' set that are predicted to bind user-defined HLA type
+__`[AS_TYPE].tier2tier3/epitope_summary.junction-based.txt`__: Epitope-producing AS junctions from the 'tier3' set that are predicted to bind the user-defined HLA types
+
+### Tumor-specific screen reports
+
+Screening or prediction outputs that integrate `screen` and `screen_sjc` results contain annotations for tumor-specific targets.
These output files are indicated by `.integratedSJC.txt`, such as `[TASK/DATA_NAME].[AS_TYPE].tier2tier3.txt.integratedSJC.txt` and __`[AS_TYPE].tier2tier3/epitope_summary.peptide-based.txt.integratedSJC.txt`__.
+
+## Contact
-### Contact
 
 Yang Pan
+Eric Kutschera
 
 Yi Xing
 
+## Publication
-### Publication
 
 Manuscript in submission
diff --git a/Snakefile b/Snakefile
new file mode 100644
index 0000000..af3cf8f
--- /dev/null
+++ b/Snakefile
@@ -0,0 +1,1766 @@
+import snakemake.utils
+
+snakemake.utils.min_version('6.5.0')
+
+configfile: 'snakemake_config.yaml'
+
+onsuccess:
+    print('workflow success')
+
+onerror:
+    print('workflow error')
+
+DEFAULT_MEM_MB=4 * 1024  # 4 GB
+DEFAULT_TIME_HOURS=12
+
+# Specifying this as an input to a rule will disable that rule.
+# This can be used in combination with "ruleorder:" to determine what
+# rule should be used to create a particular output file.
+UNSATISFIABLE_INPUT='unsatisfiable_input_file_path'
+
+
+def all_input(wildcards):
+    inputs = dict()
+    run_all_modules = bool(config.get('run_all_modules'))
+    run_core_modules = bool(config.get('run_core_modules'))
+    should_run_sjc = bool(config.get('should_run_sjc_steps'))
+    if run_core_modules or run_all_modules:
+        # core modules
+        inputs.update(iris_epitope_post_out_files())
+        inputs['visualization'] = os.path.join(result_dir(), 'visualization',
+                                               'summary.png')
+
+        if should_run_sjc:
+            if has_tier_1():
+                inputs['sjc_tier1'] = iris_append_sjc_out_file_name_for_tier('tier1')
+            if has_tier_3():
+                inputs['sjc_tier2tier3'] = iris_append_sjc_out_file_name_for_tier('tier2tier3')
+
+    return inputs
+
+
+localrules: all
+rule all:
+    input:
+        unpack(all_input),
+
+
+def result_dir():
+    return os.path.join('results', config['run_name'])
+
+
+def iris_db_path():
+    return os.path.join(config['iris_data'], 'db')
+
+
+def iris_db_sjc_path():
+    return os.path.join(config['iris_data'], 'db_sjc')
+
+
+def iris_exp_matrix_out_matrix():
+    run_name = config['run_name']
+    basename = 'exp.merged_matrix.{}.txt'.format(run_name)
+    return os.path.join(result_dir(), 'exp_matrix', basename)
+
+
+def gene_exp_matrix_path_for_run():
+    from_config = config.get('gene_exp_matrix')
+    if from_config:
+        return from_config
+
+    if config.get('run_all_modules'):
+        return iris_exp_matrix_out_matrix()
+
+    return None
+
+
+def hla_types_list_for_run():
+    from_config = config.get('mhc_list')
+    if from_config:
+        return from_config
+
+    return os.path.join(result_dir(), 'hla_typing', 'hla_types.list')
+
+
+def hla_from_patients_for_run():
+    from_config = config.get('mhc_by_sample')
+    if from_config:
+        return from_config
+
+    return os.path.join(result_dir(), 'hla_typing', 'hla_patient.tsv')
+
+def splicing_matrix_path_for_run():
+    db_path = iris_db_path()
+
+    return os.path.join(db_path, config['run_name'], 'splicing_matrix')
+
+
+def sjc_count_path_for_run():
+    db_path = iris_db_sjc_path()
+
+    return os.path.join(db_path, config['run_name'], 'sjc_matrix')
+
+
+def splicing_matrix_txt_path_for_run():
+    matrix_path = splicing_matrix_path_for_run()
+    file_name = ('splicing_matrix.{}.cov10.{}.txt'
+                 .format(config['splice_event_type'], config['run_name']))
+    return os.path.join(matrix_path, file_name)
+
+
+def splicing_matrix_idx_path_for_run():
+    return '{}.idx'.format(splicing_matrix_txt_path_for_run())
+
+
+def sjc_count_txt_path_for_run():
+    matrix_path = sjc_count_path_for_run()
+    file_name = 'SJ_count.{}.txt'.format(config['run_name'])
+    return os.path.join(matrix_path, file_name)
+
+
+def sjc_count_idx_path_for_run():
+    return
'{}.idx'.format(sjc_count_txt_path_for_run()) + + +def format_ref_names(config_key): + configured = config.get(config_key, '') + # if no ref names -> provide a quoted empty string on the command line + if not configured.strip(): + return "''" + + return configured + + +# must have either tier 1 or tier 3 +def has_tier_1(): + return len(tier_1_group_names()) > 0 + + +def has_tier_3(): + return len(tier_3_group_names()) > 0 + + +def tier_1_group_names(): + return group_names_from_config_key('tissue_matched_normal_reference_group_names') + + +def tier_3_group_names(): + return group_names_from_config_key('normal_reference_group_names') + + +def group_names_from_config_key(key): + names_str = config.get(key) + split = names_str.split(',') + return [x.strip() for x in split if x] + + +def reference_file_wildcard_constraints(): + reference_files = config.get('reference_files') + if reference_files: + file_names = '|'.join([re.escape(file_name) + for file_name in reference_files]) + without_gz = '|'.join([re.escape(file_name[:-3]) + for file_name in reference_files + if file_name.endswith('.gz')]) + else: + no_match = '^$' # only matches empty string + file_names = no_match + without_gz = no_match + + return {'file_names': file_names, 'without_gz': without_gz} + + +def get_url_for_download_reference_file(wildcards): + file_name = wildcards.file_name + return config['reference_files'][file_name]['url'] + + +rule download_reference_file: + output: + ref_file=os.path.join('references', '{file_name}'), + log: + out=os.path.join('references', + 'download_reference_file_{file_name}_log.out'), + err=os.path.join('references', + 'download_reference_file_{file_name}_log.err'), + wildcard_constraints: + file_name=reference_file_wildcard_constraints()['file_names'] + params: + url=get_url_for_download_reference_file, + resources: + mem_mb=DEFAULT_MEM_MB, + time_hours=DEFAULT_TIME_HOURS, + shell: + 'curl -L \'{params.url}\'' + ' -o {output.ref_file}' + ' 1> {log.out}' + ' 2> {log.err}' + +rule unzip_reference_file: + input: + gz=os.path.join('references', '{file_name}.gz'), + output: + un_gz=os.path.join('references', '{file_name}'), + log: + out=os.path.join('references', + 'unzip_reference_file_{file_name}_log.out'), + err=os.path.join('references', + 'unzip_reference_file_{file_name}_log.err'), + wildcard_constraints: + file_name=reference_file_wildcard_constraints()['without_gz'] + resources: + mem_mb=DEFAULT_MEM_MB, + time_hours=DEFAULT_TIME_HOURS, + shell: + ' gunzip -c {input.gz}' + ' 1> {output.un_gz}' + ' 2> {log.err}' + + +def write_param_file_blacklist_param(): + value = config.get('blacklist') + if value: + return '--blacklist-file {}'.format(value) + + return '' + + +def write_param_file_bigwig_param(): + value = config.get('mapability_bigwig') + if value: + return '--mapability-bigwig {}'.format(value) + + return '' + + +def write_param_file_genome_param(): + value = config.get('fasta_name') + if value: + reference_path = os.path.join('references', value) + return '--reference-genome {}'.format(reference_path) + + return '' + + +def write_param_file_input(wildcards): + inputs = dict() + fasta = config.get('fasta_name') + if fasta: + inputs['fasta'] = os.path.join('references', fasta) + + return inputs + + +rule write_param_file: + input: + unpack(write_param_file_input), + output: + param_file=os.path.join(result_dir(), 'screen.para'), + log: + out=os.path.join(result_dir(), 'write_param_file_log.out'), + err=os.path.join(result_dir(), 'write_param_file_log.err'), + params: + 
conda_wrapper=config['conda_wrapper'], + conda_env_3=config['conda_env_3'], + script=os.path.join('scripts', 'write_param_file.py'), + group_name=config['run_name'], + iris_db=iris_db_path(), + matched_psi_cut=config.get('tissue_matched_normal_psi_p_value_cutoff', ''), + matched_sjc_cut=config.get('tissue_matched_normal_sjc_p_value_cutoff', ''), + matched_delta_psi_cut=config.get('tissue_matched_normal_delta_psi_p_value_cutoff', ''), + matched_fc_cut=config.get('tissue_matched_normal_fold_change_cutoff', ''), + matched_group_cut=config.get('tissue_matched_normal_group_count_cutoff', ''), + matched_ref_names=format_ref_names('tissue_matched_normal_reference_group_names'), + tumor_psi_cut=config.get('tumor_psi_p_value_cutoff', ''), + tumor_sjc_cut=config.get('tumor_sjc_p_value_cutoff', ''), + tumor_delta_psi_cut=config.get('tumor_delta_psi_p_value_cutoff', ''), + tumor_fc_cut=config.get('tumor_fold_change_cutoff', ''), + tumor_group_cut=config.get('tumor_group_count_cutoff', ''), + tumor_ref_names=format_ref_names('tumor_reference_group_names'), + normal_psi_cut=config.get('normal_psi_p_value_cutoff', ''), + normal_sjc_cut=config.get('normal_sjc_p_value_cutoff', ''), + normal_delta_psi_cut=config.get('normal_delta_psi_p_value_cutoff', ''), + normal_fc_cut=config.get('normal_fold_change_cutoff', ''), + normal_group_cut=config.get('normal_group_count_cutoff', ''), + normal_ref_names=format_ref_names('normal_reference_group_names'), + comparison_mode=config['comparison_mode'], + stat_test_type=config['stat_test_type'], + use_ratio='--use-ratio' if config.get('use_ratio') else '', + blacklist=write_param_file_blacklist_param(), + bigwig=write_param_file_bigwig_param(), + genome=write_param_file_genome_param(), + resources: + mem_mb=DEFAULT_MEM_MB, + time_hours=DEFAULT_TIME_HOURS, + shell: + '{params.conda_wrapper} {params.conda_env_3} python {params.script}' + ' --out-path {output.param_file}' + ' --group-name {params.group_name}' + ' --iris-db {params.iris_db}' + ' --psi-p-value-cutoffs' + ' {params.matched_psi_cut},{params.tumor_psi_cut},{params.normal_psi_cut}' + ' --sjc-p-value-cutoffs' + ' {params.matched_sjc_cut},{params.tumor_sjc_cut},{params.normal_sjc_cut}' + ' --delta-psi-cutoffs' + ' {params.matched_delta_psi_cut},{params.tumor_delta_psi_cut},{params.normal_delta_psi_cut}' + ' --fold-change-cutoffs' + ' {params.matched_fc_cut},{params.tumor_fc_cut},{params.normal_fc_cut}' + ' --group-count-cutoffs' + ' {params.matched_group_cut},{params.tumor_group_cut},{params.normal_group_cut}' + ' --reference-names-tissue-matched-normal {params.matched_ref_names}' + ' --reference-names-tumor {params.tumor_ref_names}' + ' --reference-names-normal {params.normal_ref_names}' + ' --comparison-mode {params.comparison_mode}' + ' --statistical-test-type {params.stat_test_type}' + ' {params.use_ratio}' + ' {params.blacklist}' + ' {params.bigwig}' + ' {params.genome}' + ' 1> {log.out}' + ' 2> {log.err}' + + +# if the necessary files are specified in the config, then +# use them rather than run IRIS format +def copy_splice_matrix_files_input(wildcards): + inputs = dict() + inputs['splice_txt'] = config.get('splice_matrix_txt', UNSATISFIABLE_INPUT) + inputs['splice_idx'] = config.get('splice_matrix_idx', UNSATISFIABLE_INPUT) + if config['run_all_modules']: + inputs['run_all_modules'] = UNSATISFIABLE_INPUT + + return inputs + +ruleorder: copy_splice_matrix_files > iris_format +localrules: copy_splice_matrix_files +rule copy_splice_matrix_files: + input: + unpack(copy_splice_matrix_files_input), + output: + 
splice_txt=splicing_matrix_txt_path_for_run(), + splice_idx=splicing_matrix_idx_path_for_run(), + shell: + 'cp {input.splice_txt} {output.splice_txt}' + ' && cp {input.splice_idx} {output.splice_idx}' + + +def copy_sjc_count_files_input(wildcards): + inputs = dict() + inputs['count_txt'] = config.get('sjc_count_txt', UNSATISFIABLE_INPUT) + inputs['count_idx'] = config.get('sjc_count_idx', UNSATISFIABLE_INPUT) + if config['run_all_modules']: + inputs['run_all_modules'] = UNSATISFIABLE_INPUT + + return inputs + +ruleorder: copy_sjc_count_files > iris_sjc_matrix +localrules: copy_sjc_count_files +rule copy_sjc_count_files: + input: + unpack(copy_sjc_count_files_input), + output: + count_txt=sjc_count_txt_path_for_run(), + count_idx=sjc_count_idx_path_for_run(), + shell: + 'cp {input.count_txt} {output.count_txt}' + ' && cp {input.count_idx} {output.count_idx}' + + +def create_star_index_out_dir_param(wildcards, output): + return os.path.dirname(output.index) + + +def create_star_index_input(wildcards): + inputs = dict() + inputs['gtf'] = os.path.join('references', config['gtf_name']) + inputs['fasta'] = os.path.join('references', config['fasta_name']) + if not config['run_all_modules']: + inputs['run_all_modules'] = UNSATISFIABLE_INPUT + + return inputs + + +rule create_star_index: + input: + unpack(create_star_index_input), + output: + index=os.path.join('references', 'star_index', 'SA'), + log: + out=os.path.join('references', 'create_star_index_log.out'), + err=os.path.join('references', 'create_star_index_log.err'), + params: + conda_wrapper=config['conda_wrapper'], + conda_env_2=config['conda_env_2'], + out_dir=create_star_index_out_dir_param, + overhang=config['star_sjdb_overhang'], + threads: config['create_star_index_threads'] + resources: + mem_mb=config['create_star_index_mem_gb'] * 1024, + time_hours=config['create_star_index_time_hr'], + shell: + '{params.conda_wrapper} {params.conda_env_2} STAR' + ' --runMode genomeGenerate' + ' --runThreadN {threads}' + ' --genomeDir {params.out_dir}' + ' --genomeFastaFiles {input.fasta}' + ' --sjdbGTFfile {input.gtf}' + ' --sjdbOverhang {params.overhang}' + ' 1> {log.out}' + ' 2> {log.err}' + + +def organize_fastqs_sample_details(): + details = dict() + fastq_dict = config.get('sample_fastqs') + if not fastq_dict: + return details + + sample_names = list() + all_fastqs = list() + for name, fastqs in fastq_dict.items(): + for fastq in fastqs: + sample_names.append(name) + all_fastqs.append(fastq) + + details['sample_names'] = sample_names + details['fastqs'] = all_fastqs + return details + + +def unique_sample_names(): + fastq_dict = config.get('sample_fastqs') + if not fastq_dict: + return list() + + return list(fastq_dict.keys()) + + +def organize_fastqs_input(wildcards): + details = organize_fastqs_sample_details() + if not details: + return {'unsatisfiable': UNSATISFIABLE_INPUT} + + return {'fastqs': details['fastqs']} + + +def organize_fastqs_sample_names_param(): + sample_names = organize_fastqs_sample_details().get('sample_names', list()) + return sample_names + + +localrules: organize_fastqs +rule organize_fastqs: + input: + unpack(organize_fastqs_input), + output: + done=touch(os.path.join(result_dir(), 'fastq_dir', 'organize_fastqs.done')), + params: + sample_names=organize_fastqs_sample_names_param(), + out_dir=os.path.join(result_dir(), 'fastq_dir'), + run: + import os + import os.path + + out_dir = params.out_dir + if os.path.isdir(out_dir): + files = os.listdir(out_dir) + if files: + raise Exception('organize_fastqs: {} already 
contains files'
+                            .format(out_dir))
+
+        for i, sample_name in enumerate(params.sample_names):
+            sample_dir = os.path.join(out_dir, sample_name)
+            orig_fastq_path = input.fastqs[i]
+            fastq_basename = os.path.basename(orig_fastq_path)
+            new_fastq_path = os.path.join(sample_dir, fastq_basename)
+            os.makedirs(sample_dir, exist_ok=True)
+            os.symlink(orig_fastq_path, new_fastq_path)
+
+
+def iris_makesubsh_mapping_task_out_file_names():
+    task_dir = os.path.join(result_dir(), 'mapping_tasks')
+    sample_names = unique_sample_names()
+    star_tasks = list()
+    cuff_tasks = list()
+    for sample_name in sample_names:
+        star_name = 'STARmap.{}.sh'.format(sample_name)
+        cuff_name = 'Cuffquant.{}.sh'.format(sample_name)
+        star_tasks.append(os.path.join(task_dir, star_name))
+        cuff_tasks.append(os.path.join(task_dir, cuff_name))
+
+    return {'star_tasks': star_tasks, 'cuff_tasks': cuff_tasks}
+
+
+def iris_makesubsh_mapping_star_done_file_names():
+    out_dir = os.path.join(result_dir(), 'process_rnaseq')
+    sample_names = unique_sample_names()
+    final_bams = list()
+    for sample in sample_names:
+        align_dir = os.path.join(out_dir, '{}.aln'.format(sample))
+        final_bam = os.path.join(align_dir, 'Aligned.sortedByCoord.out.bam')
+        final_bams.append(final_bam)
+
+    return final_bams
+
+
+def iris_makesubsh_mapping_cuff_done_file_names():
+    cuff_tasks = iris_makesubsh_mapping_task_out_file_names()['cuff_tasks']
+    done_names = list()
+    for task in cuff_tasks:
+        done_names.append('{}.done'.format(task))
+
+    return done_names
+
+
+def iris_makesubsh_mapping_star_dir_param(wildcards):
+    input = iris_makesubsh_mapping_input(wildcards)
+    return os.path.dirname(input['index'])
+
+
+def iris_makesubsh_mapping_task_dir_param(wildcards, output):
+    return os.path.dirname(output.star_tasks[0])
+
+
+def label_string_param():
+    # IRIS uses this value to tell which files are for read 1 or read 2.
+    # Specifically it looks for '1{label_string}f' and '2{label_string}f'
+    return '.'
+ + +def iris_makesubsh_mapping_input(wildcards): + inputs = dict() + inputs['organize_fastqs_done'] = os.path.join(result_dir(), 'fastq_dir', + 'organize_fastqs.done') + inputs['index'] = os.path.join('references', 'star_index', 'SA') + inputs['gtf'] = os.path.join('references', config['gtf_name']) + if not config['run_all_modules']: + inputs['run_all_modules'] = UNSATISFIABLE_INPUT + + return inputs + + +rule iris_makesubsh_mapping: + input: + unpack(iris_makesubsh_mapping_input), + output: + **iris_makesubsh_mapping_task_out_file_names() + log: + out=os.path.join(result_dir(), 'iris_makesubsh_mapping_log.out'), + err=os.path.join(result_dir(), 'iris_makesubsh_mapping_log.err'), + params: + conda_wrapper=config['conda_wrapper'], + conda_env_2=config['conda_env_2'], + fastq_dir=os.path.join(result_dir(), 'fastq_dir'), + star_dir=iris_makesubsh_mapping_star_dir_param, + run_name=config['run_name'], + out_dir=os.path.join(result_dir(), 'process_rnaseq'), + label_string=label_string_param(), + task_dir=iris_makesubsh_mapping_task_dir_param, + resources: + mem_mb=DEFAULT_MEM_MB, + time_hours=DEFAULT_TIME_HOURS, + shell: + '{params.conda_wrapper} {params.conda_env_2} IRIS makesubsh_mapping' + ' --fastq-folder-dir {params.fastq_dir}' + ' --starGenomeDir {params.star_dir}' + ' --gtf {input.gtf}' + ' --data-name {params.run_name}' + ' --outdir {params.out_dir}' + ' --label-string {params.label_string}' + ' --task-dir {params.task_dir}' + ' 1> {log.out}' + ' 2> {log.err}' + + +def iris_star_task_input(wildcards): + inputs = dict() + inputs['star_task'] = os.path.join(result_dir(), 'mapping_tasks', + 'STARmap.{sample}.sh') + if not config['run_all_modules']: + inputs['run_all_modules'] = UNSATISFIABLE_INPUT + + return inputs + + +rule iris_star_task: + input: + unpack(iris_star_task_input), + output: + unsorted_bam=os.path.join(result_dir(), 'process_rnaseq', + '{sample}.aln', 'Aligned.out.bam'), + sorted_bam=os.path.join(result_dir(), 'process_rnaseq', '{sample}.aln', + 'Aligned.sortedByCoord.out.bam'), + log: + out=os.path.join(result_dir(), 'mapping_tasks', + 'iris_star_task_{sample}_log.out'), + err=os.path.join(result_dir(), 'mapping_tasks', + 'iris_star_task_{sample}_log.err'), + params: + conda_wrapper=config['conda_wrapper'], + conda_env_2=config['conda_env_2'], + threads: config['iris_star_task_threads'] + resources: + mem_mb=config['iris_star_task_mem_gb'] * 1024, + time_hours=config['iris_star_task_time_hr'], + shell: + '{params.conda_wrapper} {params.conda_env_2} bash' + ' {input.star_task}' + ' 1> {log.out}' + ' 2> {log.err}' + + +def iris_cuff_task_input(wildcards): + inputs = dict() + inputs['cuff_task'] = os.path.join(result_dir(), 'mapping_tasks', + 'Cuffquant.{sample}.sh') + inputs['star_task_done'] = os.path.join(result_dir(), 'process_rnaseq', + '{sample}.aln', + 'Aligned.sortedByCoord.out.bam') + if not config['run_all_modules']: + inputs['run_all_modules'] = UNSATISFIABLE_INPUT + + return inputs + + +rule iris_cuff_task: + input: + unpack(iris_cuff_task_input), + output: + cuff_task_done=touch(os.path.join(result_dir(), 'process_rnaseq', + '{sample}.aln', 'cufflinks', + 'genes.fpkm_tracking')), + log: + out=os.path.join(result_dir(), 'mapping_tasks', + 'iris_cuff_task_{sample}_log.out'), + err=os.path.join(result_dir(), 'mapping_tasks', + 'iris_cuff_task_{sample}_log.err'), + params: + conda_wrapper=config['conda_wrapper'], + conda_env_2=config['conda_env_2'], + threads: config['iris_cuff_task_threads'] + resources: + mem_mb=config['iris_cuff_task_mem_gb'] * 1024, + 
time_hours=config['iris_cuff_task_time_hr'], + shell: + '{params.conda_wrapper} {params.conda_env_2} bash' + ' {input.cuff_task}' + ' 1> {log.out}' + ' 2> {log.err}' + + +def iris_makesubsh_hla_task_out_file_names(): + task_dir = os.path.join(result_dir(), 'hla_tasks') + sample_names = unique_sample_names() + hla_tasks = list() + for sample_name in sample_names: + hla_name = 'seq2hla.{}.sh'.format(sample_name) + hla_tasks.append(os.path.join(task_dir, hla_name)) + + return {'hla_tasks': hla_tasks} + + +def iris_hla_task_done_file_names(): + sample_names = unique_sample_names() + done_file_names = list() + hla_dir = os.path.join(result_dir(), 'hla_typing') + for sample in sample_names: + out_dir = os.path.join(hla_dir, sample) + expression = os.path.join(out_dir, + '{}-ClassI.expression'.format(sample)) + genotype = os.path.join(out_dir, + '{}-ClassI.HLAgenotype4digits'.format(sample)) + done_file_names.append(expression) + done_file_names.append(genotype) + + return done_file_names + + +def iris_makesubsh_hla_task_dir_param(wildcards, output): + return os.path.dirname(output.hla_tasks[0]) + + +def iris_makesubsh_hla_input(wildcards): + inputs = dict() + inputs['organize_fastqs_done'] = os.path.join(result_dir(), 'fastq_dir', + 'organize_fastqs.done') + if not config['run_all_modules']: + inputs['run_all_modules'] = UNSATISFIABLE_INPUT + + return inputs + + +rule iris_makesubsh_hla: + input: + unpack(iris_makesubsh_hla_input), + + output: + **iris_makesubsh_hla_task_out_file_names() + log: + out=os.path.join(result_dir(), 'iris_makesubsh_hla_log.out'), + err=os.path.join(result_dir(), 'iris_makesubsh_hla_log.err'), + params: + conda_wrapper=config['conda_wrapper'], + conda_env_2=config['conda_env_2'], + fastq_dir=os.path.join(result_dir(), 'fastq_dir'), + run_name=config['run_name'], + out_dir=os.path.join(result_dir(), 'hla_typing'), + label_string=label_string_param(), + task_dir=iris_makesubsh_hla_task_dir_param, + resources: + mem_mb=DEFAULT_MEM_MB, + time_hours=DEFAULT_TIME_HOURS, + shell: + '{params.conda_wrapper} {params.conda_env_2} IRIS makesubsh_hla' + ' --fastq-folder-dir {params.fastq_dir}' + ' --data-name {params.run_name}' + ' --outdir {params.out_dir}' + ' --label-string {params.label_string}' + ' --task-dir {params.task_dir}' + ' 1> {log.out}' + ' 2> {log.err}' + + +def iris_hla_task_input(wildcards): + inputs = dict() + inputs['hla_task'] = os.path.join(result_dir(), 'hla_tasks', + 'seq2hla.{sample}.sh') + if not config['run_all_modules']: + inputs['run_all_modules'] = UNSATISFIABLE_INPUT + + return inputs + + +rule iris_hla_task: + input: + unpack(iris_hla_task_input), + output: + expression=os.path.join(result_dir(), 'hla_typing', '{sample}', + '{sample}-ClassI.expression'), + genotype=os.path.join(result_dir(), 'hla_typing', '{sample}', + '{sample}-ClassI.HLAgenotype4digits'), + log: + out=os.path.join(result_dir(), 'hla_tasks', + 'iris_hla_task_{sample}_log.out'), + err=os.path.join(result_dir(), 'hla_tasks', + 'iris_hla_task_{sample}_log.err'), + params: + conda_wrapper=config['conda_wrapper'], + conda_env_2=config['conda_env_2'], + threads: config['iris_hla_task_threads'] + resources: + mem_mb=config['iris_hla_task_mem_gb'] * 1024, + time_hours=config['iris_hla_task_time_hr'], + shell: + '{params.conda_wrapper} {params.conda_env_2} bash' + ' {input.hla_task}' + ' 1> {log.out}' + ' 2> {log.err}' + + +def iris_parse_hla_input(wildcards): + inputs = dict() + inputs['hla_tasks_done'] = iris_hla_task_done_file_names() + if not config['run_all_modules']: + 
inputs['run_all_modules'] = UNSATISFIABLE_INPUT + + return inputs + + +rule iris_parse_hla: + input: + unpack(iris_parse_hla_input), + output: + patient=os.path.join(result_dir(), 'hla_typing', 'hla_patient.tsv'), + types=os.path.join(result_dir(), 'hla_typing', 'hla_types.list'), + exp=os.path.join(result_dir(), 'hla_typing', 'hla_exp.list'), + log: + out=os.path.join(result_dir(), 'iris_parse_hla_log.out'), + err=os.path.join(result_dir(), 'iris_parse_hla_log.err'), + params: + conda_wrapper=config['conda_wrapper'], + conda_env_2=config['conda_env_2'], + out_dir=os.path.join(result_dir(), 'hla_typing'), + resources: + mem_mb=config['iris_parse_hla_mem_gb'] * 1024, + time_hours=config['iris_parse_hla_time_hr'], + shell: + '{params.conda_wrapper} {params.conda_env_2} IRIS parse_hla' + ' --outdir {params.out_dir}' + ' 1> {log.out}' + ' 2> {log.err}' + + +def iris_makesubsh_rmats_task_out_file_names(): + task_dir = os.path.join(result_dir(), 'rmats_tasks') + sample_names = unique_sample_names() + rmats_tasks = list() + for sample_name in sample_names: + rmats_name = 'rMATS_prep.{}.sh'.format(sample_name) + rmats_tasks.append(os.path.join(task_dir, rmats_name)) + + return {'rmats_tasks': rmats_tasks} + + +def iris_makesubsh_rmats_done_file_names(): + rmats_tasks = iris_makesubsh_rmats_task_out_file_names()['rmats_tasks'] + done_names = list() + for task in rmats_tasks: + done_names.append('{}.done'.format(task)) + + return done_names + + +def iris_makesubsh_rmats_task_dir_param(wildcards, output): + return os.path.dirname(output.rmats_tasks[0]) + + +def iris_makesubsh_rmats_input(wildcards): + inputs = dict() + inputs['star_done'] = iris_makesubsh_mapping_star_done_file_names() + inputs['gtf'] = os.path.join('references', config['gtf_name']) + if not config['run_all_modules']: + inputs['run_all_modules'] = UNSATISFIABLE_INPUT + + return inputs + + +rule iris_makesubsh_rmats: + input: + unpack(iris_makesubsh_rmats_input), + output: + **iris_makesubsh_rmats_task_out_file_names() + log: + out=os.path.join(result_dir(), 'iris_makesubsh_rmats_log.out'), + err=os.path.join(result_dir(), 'iris_makesubsh_rmats_log.err'), + params: + conda_wrapper=config['conda_wrapper'], + conda_env_2=config['conda_env_2'], + rmats_path=config['rmats_path'], + bam_dir=os.path.join(result_dir(), 'process_rnaseq'), + run_name=config['run_name'], + task_dir=iris_makesubsh_rmats_task_dir_param, + resources: + mem_mb=DEFAULT_MEM_MB, + time_hours=DEFAULT_TIME_HOURS, + shell: + '{params.conda_wrapper} {params.conda_env_2} IRIS makesubsh_rmats' + ' --rMATS-path {params.rmats_path}' + ' --bam-dir {params.bam_dir}' + ' --gtf {input.gtf}' + ' --data-name {params.run_name}' + ' --task-dir {params.task_dir}' + ' 1> {log.out}' + ' 2> {log.err}' + + +def iris_rmats_task_input(wildcards): + inputs = dict() + inputs['rmats_task'] = os.path.join(result_dir(), 'rmats_tasks', + 'rMATS_prep.{sample}.sh') + if not config['run_all_modules']: + inputs['run_all_modules'] = UNSATISFIABLE_INPUT + + return inputs + + +rule iris_rmats_task: + input: + unpack(iris_rmats_task_input), + output: + # The output files have the format: + # result_dir()/process_rnaseq/{run}.RL{readLength}/{sample}.tmp/{datetime}_{n}.rmats + # Just using a .done file instead. 
+        rmats_task_done=touch(os.path.join(result_dir(), 'rmats_tasks',
+                                           'rMATS_prep.{sample}.sh.done')),
+    log:
+        out=os.path.join(result_dir(), 'rmats_tasks',
+                         'iris_rmats_task_{sample}_log.out'),
+        err=os.path.join(result_dir(), 'rmats_tasks',
+                         'iris_rmats_task_{sample}_log.err'),
+    params:
+        conda_wrapper=config['conda_wrapper'],
+        conda_env_2=config['conda_env_2'],
+    threads: config['iris_rmats_task_threads']
+    resources:
+        mem_mb=config['iris_rmats_task_mem_gb'] * 1024,
+        time_hours=config['iris_rmats_task_time_hr'],
+    shell:
+        '{params.conda_wrapper} {params.conda_env_2} bash'
+        ' {input.rmats_task}'
+        ' 1> {log.out}'
+        ' 2> {log.err}'
+
+checkpoint check_read_lengths:
+    input:
+        rmats_done=iris_makesubsh_rmats_done_file_names(),
+    output:
+        read_lengths=os.path.join(result_dir(), 'process_rnaseq',
+                                  'read_lengths.txt'),
+    log:
+        out=os.path.join(result_dir(), 'process_rnaseq', 'check_read_lengths_log.out'),
+        err=os.path.join(result_dir(), 'process_rnaseq', 'check_read_lengths_log.err'),
+    params:
+        conda_wrapper=config['conda_wrapper'],
+        conda_env_3=config['conda_env_3'],
+        script=os.path.join('scripts', 'check_read_lengths.py'),
+        parent_dir=os.path.join(result_dir(), 'process_rnaseq'),
+        run_name=config['run_name'],
+    resources:
+        mem_mb=DEFAULT_MEM_MB,
+        time_hours=DEFAULT_TIME_HOURS,
+    shell:
+        '{params.conda_wrapper} {params.conda_env_3} python {params.script}'
+        ' --parent-dir {params.parent_dir}'
+        ' --run-name {params.run_name}'
+        ' --out {output.read_lengths}'
+        ' 1> {log.out}'
+        ' 2> {log.err}'
+
+
+def iris_makesubsh_rmatspost_input(wildcards):
+    inputs = dict()
+    inputs['rmats_done'] = iris_makesubsh_rmats_done_file_names()
+    inputs['gtf'] = os.path.join('references', config['gtf_name'])
+    if not config['run_all_modules']:
+        inputs['run_all_modules'] = UNSATISFIABLE_INPUT
+
+    return inputs
+
+
+rule iris_makesubsh_rmatspost:
+    input:
+        unpack(iris_makesubsh_rmatspost_input),
+    output:
+        makesubsh_done=touch(os.path.join(result_dir(), 'iris_makesubsh_rmats_post.done')),
+    log:
+        out=os.path.join(result_dir(), 'iris_makesubsh_rmatspost_log.out'),
+        err=os.path.join(result_dir(), 'iris_makesubsh_rmatspost_log.err'),
+    params:
+        conda_wrapper=config['conda_wrapper'],
+        conda_env_2=config['conda_env_2'],
+        rmats_path=config['rmats_path'],
+        bam_dir=os.path.join(result_dir(), 'process_rnaseq'),
+        run_name=config['run_name'],
+        task_dir=os.path.join(result_dir(), 'rmats_post_tasks'),
+    resources:
+        mem_mb=DEFAULT_MEM_MB,
+        time_hours=DEFAULT_TIME_HOURS,
+    shell:
+        '{params.conda_wrapper} {params.conda_env_2} IRIS makesubsh_rmatspost'
+        ' --rMATS-path {params.rmats_path}'
+        ' --bam-dir {params.bam_dir}'
+        ' --gtf {input.gtf}'
+        ' --data-name {params.run_name}'
+        ' --task-dir {params.task_dir}'
+        ' 1> {log.out}'
+        ' 2> {log.err}'
+
+
+def iris_rmatspost_task_input(wildcards):
+    inputs = dict()
+    inputs['makesubsh_done'] = os.path.join(
+        result_dir(), 'iris_makesubsh_rmats_post.done')
+    if not config['run_all_modules']:
+        inputs['run_all_modules'] = UNSATISFIABLE_INPUT
+
+    return inputs
+
+
+rule iris_rmatspost_task:
+    input:
+        unpack(iris_rmatspost_task_input),
+    output:
+        summary=os.path.join(result_dir(), 'process_rnaseq',
+                             '{run_name}.RL{read_length}',
+                             '{run_name}_RL{read_length}.matrix',
+                             'summary.txt'),
+    log:
+        out=os.path.join(result_dir(), 'rmats_post_tasks',
+                         'iris_rmatspost_task_{run_name}_{read_length}_log.out'),
+        err=os.path.join(result_dir(), 'rmats_post_tasks',
'iris_rmatspost_task_{run_name}_{read_length}_log.err'), + wildcard_constraints: + run_name=config['run_name'], + params: + conda_wrapper=config['conda_wrapper'], + conda_env_2=config['conda_env_2'], + # post_task is generated by iris_makesubsh_rmatspost, but since the number of + # read_lengths is not known until the check_read_lengths checkpoint, + # that .sh file is not used in the "input" or "output" sections of the snakemake + post_task=os.path.join(result_dir(), 'rmats_post_tasks', + 'rMATS_post.{run_name}_RL{read_length}.sh'), + threads: config['iris_rmatspost_task_threads'] + resources: + mem_mb=config['iris_rmatspost_task_mem_gb'] * 1024, + time_hours=config['iris_rmatspost_task_time_hr'], + shell: + '{params.conda_wrapper} {params.conda_env_2} bash' + ' {params.post_task}' + ' 1> {log.out}' + ' 2> {log.err}' + + +def prepare_iris_format_input(wildcards): + read_lengths_file = checkpoints.check_read_lengths.get().output[0] + summaries = list() + run_name = config['run_name'] + input_prefix = os.path.join(result_dir(), 'process_rnaseq') + with open(read_lengths_file, 'rt') as in_handle: + for line in in_handle: + read_length = line.strip() + run_with_read_length_dot = '{}.RL{}'.format(run_name, read_length) + run_with_read_length_underscore = '{}_RL{}'.format(run_name, read_length) + summary = os.path.join( + input_prefix, run_with_read_length_dot, + '{}.matrix'.format(run_with_read_length_underscore), + 'summary.txt') + summaries.append(summary) + + return {'summaries': summaries} + + +rule prepare_iris_format: + input: + unpack(prepare_iris_format_input), + output: + matrix=os.path.join(result_dir(), 'process_rnaseq', 'matrix_list.txt'), + sample=os.path.join(result_dir(), 'process_rnaseq', 'sample_list.txt'), + log: + out=os.path.join(result_dir(), 'process_rnaseq', + 'prepare_iris_format_log.out'), + err=os.path.join(result_dir(), 'process_rnaseq', + 'prepare_iris_format_log.err'), + params: + conda_wrapper=config['conda_wrapper'], + conda_env_3=config['conda_env_3'], + script=os.path.join('scripts', 'prepare_iris_format.py'), + resources: + mem_mb=DEFAULT_MEM_MB, + time_hours=DEFAULT_TIME_HOURS, + shell: + '{params.conda_wrapper} {params.conda_env_3} python {params.script}' + ' --matrix-out {output.matrix}' + ' --sample-out {output.sample}' + ' --summaries {input.summaries}' + ' 1> {log.out}' + ' 2> {log.err}' + + +def iris_format_input(wildcards): + inputs = dict() + inputs['matrix_list'] = os.path.join(result_dir(), 'process_rnaseq', + 'matrix_list.txt') + inputs['sample_list'] = os.path.join(result_dir(), 'process_rnaseq', + 'sample_list.txt') + if not config['run_all_modules']: + inputs['run_all_modules'] = UNSATISFIABLE_INPUT + + return inputs + + +# after IRIS format, IRIS index does not need to be run +rule iris_format: + input: + unpack(iris_format_input), + output: + matrix=splicing_matrix_txt_path_for_run(), + idx=splicing_matrix_idx_path_for_run(), + log: + out=os.path.join(result_dir(), 'iris_format_log.out'), + err=os.path.join(result_dir(), 'iris_format_log.err'), + params: + conda_wrapper=config['conda_wrapper'], + conda_env_2=config['conda_env_2'], + splice_type=config['splice_event_type'], + run_name=config['run_name'], + iris_db=iris_db_path(), + sample_name_field='2', + resources: + mem_mb=config['iris_format_mem_gb'] * 1024, + time_hours=config['iris_format_time_hr'], + shell: + '{params.conda_wrapper} {params.conda_env_2} IRIS format' + ' {input.matrix_list}' + ' {input.sample_list}' + ' --splicing-event-type {params.splice_type}' + ' --data-name 
{params.run_name}' + ' --sample-name-field {params.sample_name_field}' + ' --sample-based-filter' + ' --iris-db-path {params.iris_db}' + ' 1> {log.out}' + ' 2> {log.err}' + + +def prepare_iris_exp_matrix_input(wildcards): + fpkm_files = list() + sample_names = unique_sample_names() + for name in sample_names: + fpkm_path = os.path.join( + result_dir(), 'process_rnaseq', '{}.aln'.format(name), 'cufflinks', + 'genes.fpkm_tracking') + fpkm_files.append(fpkm_path) + + return {'fpkm_files': fpkm_files} + + +rule prepare_iris_exp_matrix: + input: + unpack(prepare_iris_exp_matrix_input), + output: + manifest=os.path.join(result_dir(), 'process_rnaseq', + 'cufflinks_manifest.txt'), + log: + out=os.path.join(result_dir(), 'process_rnaseq', + 'prepare_iris_exp_matrix_log.out'), + err=os.path.join(result_dir(), 'process_rnaseq', + 'prepare_iris_exp_matrix_log.err'), + params: + conda_wrapper=config['conda_wrapper'], + conda_env_3=config['conda_env_3'], + script=os.path.join('scripts', 'prepare_iris_exp_matrix.py'), + resources: + mem_mb=DEFAULT_MEM_MB, + time_hours=DEFAULT_TIME_HOURS, + shell: + '{params.conda_wrapper} {params.conda_env_3} python {params.script}' + ' --out-manifest {output.manifest}' + ' --fpkm-files {input.fpkm_files}' + ' 1> {log.out}' + ' 2> {log.err}' + + +def iris_exp_matrix_out_dir_param(wildcards, output): + return os.path.dirname(output.matrix) + + +def iris_exp_matrix_input(wildcards): + inputs = dict() + inputs['manifest'] = os.path.join(result_dir(), 'process_rnaseq', + 'cufflinks_manifest.txt') + if not config['run_all_modules']: + inputs['run_all_modules'] = UNSATISFIABLE_INPUT + + return inputs + + +rule iris_exp_matrix: + input: + unpack(iris_exp_matrix_input), + output: + matrix=iris_exp_matrix_out_matrix(), + log: + out=os.path.join(result_dir(), 'iris_exp_matrix_log.out'), + err=os.path.join(result_dir(), 'iris_exp_matrix_log.err'), + params: + conda_wrapper=config['conda_wrapper'], + conda_env_2=config['conda_env_2'], + run_name=config['run_name'], + out_dir=iris_exp_matrix_out_dir_param, + resources: + mem_mb=config['iris_exp_matrix_mem_gb'] * 1024, + time_hours=config['iris_exp_matrix_time_hr'], + shell: + '{params.conda_wrapper} {params.conda_env_2} IRIS exp_matrix' + ' --outdir {params.out_dir}' + ' --data-name {params.run_name}' + ' {input.manifest}' + ' 1> {log.out}' + ' 2> {log.err}' + + +def iris_makesubsh_extract_sjc_bam_dir_param(wildcards): + input = iris_makesubsh_extract_sjc_input(wildcards) + return os.path.dirname(input['bam']) + + +def iris_makesubsh_extract_sjc_task_dir_param(wildcards, output): + return os.path.dirname(output.extract_task) + + +def iris_makesubsh_extract_sjc_input(wildcards): + inputs = dict() + inputs['bam'] = os.path.join(result_dir(), 'process_rnaseq', '{sample}.aln', + 'Aligned.sortedByCoord.out.bam') + inputs['gtf'] = os.path.join('references', config['gtf_name']) + inputs['fasta'] = os.path.join('references', config['fasta_name']) + if not config['run_all_modules']: + inputs['run_all_modules'] = UNSATISFIABLE_INPUT + + return inputs + + +rule iris_makesubsh_extract_sjc: + input: + unpack(iris_makesubsh_extract_sjc_input), + output: + extract_task=os.path.join(result_dir(), 'extract_sjc_tasks', + 'cmdlist.extract_sjc.{sample}'), + bam_list=os.path.join(result_dir(), 'extract_sjc_tasks', + 'bam_folder_list_{sample}.txt'), + log: + out=os.path.join(result_dir(), 'extract_sjc_tasks', + 'iris_makesubsh_extract_sjc_{sample}_log.out'), + err=os.path.join(result_dir(), 'extract_sjc_tasks', + 
+ + +rule iris_makesubsh_extract_sjc: + input: + unpack(iris_makesubsh_extract_sjc_input), + output: + extract_task=os.path.join(result_dir(), 'extract_sjc_tasks', + 'cmdlist.extract_sjc.{sample}'), + bam_list=os.path.join(result_dir(), 'extract_sjc_tasks', + 'bam_folder_list_{sample}.txt'), + log: + out=os.path.join(result_dir(), 'extract_sjc_tasks', + 'iris_makesubsh_extract_sjc_{sample}_log.out'), + err=os.path.join(result_dir(), 'extract_sjc_tasks', + 'iris_makesubsh_extract_sjc_{sample}_log.err'), + params: + conda_wrapper=config['conda_wrapper'], + conda_env_2=config['conda_env_2'], + bam_dir=iris_makesubsh_extract_sjc_bam_dir_param, + task_name='{sample}', + bam_prefix='Aligned.sortedByCoord.out', + task_dir=iris_makesubsh_extract_sjc_task_dir_param, + resources: + mem_mb=DEFAULT_MEM_MB, + time_hours=DEFAULT_TIME_HOURS, + shell: + 'echo {params.bam_dir} > {output.bam_list}' + ' && {params.conda_wrapper} {params.conda_env_2} IRIS' + ' makesubsh_extract_sjc' + ' --bam-folder-list {output.bam_list}' + ' --task-name {params.task_name}' + ' --gtf {input.gtf}' + ' --genome-fasta {input.fasta}' + ' --BAM-prefix {params.bam_prefix}' + ' --task-dir {params.task_dir}' + ' 1> {log.out}' + ' 2> {log.err}' + + +def iris_extract_sjc_task_input(wildcards): + inputs = dict() + inputs['extract_task'] = os.path.join(result_dir(), 'extract_sjc_tasks', + 'cmdlist.extract_sjc.{sample}') + if not config['run_all_modules']: + inputs['run_all_modules'] = UNSATISFIABLE_INPUT + + return inputs + + +rule iris_extract_sjc_task: + input: + unpack(iris_extract_sjc_task_input), + output: + sj_count=os.path.join(result_dir(), 'process_rnaseq', '{sample}.aln', + 'SJcount.txt'), + log: + out=os.path.join(result_dir(), 'extract_sjc_tasks', + 'iris_extract_sjc_task_{sample}_log.out'), + err=os.path.join(result_dir(), 'extract_sjc_tasks', + 'iris_extract_sjc_task_{sample}_log.err'), + params: + conda_wrapper=config['conda_wrapper'], + conda_env_2=config['conda_env_2'], + resources: + mem_mb=config['iris_extract_sjc_task_mem_gb'] * 1024, + time_hours=config['iris_extract_sjc_task_time_hr'], + shell: + '{params.conda_wrapper} {params.conda_env_2} bash' + ' {input.extract_task}' + ' 1> {log.out}' + ' 2> {log.err}' + + +def prepare_iris_sjc_matrix_input(wildcards): + sample_names = unique_sample_names() + sj_files = list() + for name in sample_names: + sample_dir = '{}.aln'.format(name) + sj_file = os.path.join(result_dir(), 'process_rnaseq', sample_dir, + 'SJcount.txt') + sj_files.append(sj_file) + + return {'sj_files': sj_files} + + +rule prepare_iris_sjc_matrix: + input: + unpack(prepare_iris_sjc_matrix_input), + output: + sj_list=os.path.join(result_dir(), 'process_rnaseq', 'sjc_file_list.txt'), + log: + out=os.path.join(result_dir(), 'process_rnaseq', + 'prepare_iris_sjc_matrix_log.out'), + err=os.path.join(result_dir(), 'process_rnaseq', + 'prepare_iris_sjc_matrix_log.err'), + params: + conda_wrapper=config['conda_wrapper'], + conda_env_3=config['conda_env_3'], + script=os.path.join('scripts', 'prepare_iris_sjc_matrix.py'), + resources: + mem_mb=DEFAULT_MEM_MB, + time_hours=DEFAULT_TIME_HOURS, + shell: + '{params.conda_wrapper} {params.conda_env_3} python {params.script}' + ' --sj-out {output.sj_list}' + ' --sj-files {input.sj_files}' + ' 1> {log.out}' + ' 2> {log.err}' + + +def iris_sjc_matrix_input(wildcards): + inputs = dict() + inputs['sj_list'] = os.path.join(result_dir(), 'process_rnaseq', + 'sjc_file_list.txt') + if not config['run_all_modules']: + inputs['run_all_modules'] = UNSATISFIABLE_INPUT + + return inputs + + +rule iris_sjc_matrix: + input: + unpack(iris_sjc_matrix_input), + output: + count_txt=sjc_count_txt_path_for_run(), + count_idx=sjc_count_idx_path_for_run(), + log: + out=os.path.join(result_dir(), 'iris_sjc_matrix_log.out'), + err=os.path.join(result_dir(), 'iris_sjc_matrix_log.err'), + params: + conda_wrapper=config['conda_wrapper'], + conda_env_2=config['conda_env_2'], + run_name=config['run_name'], + sample_name_field='2', + db_sjc=iris_db_sjc_path(), + resources: +
mem_mb=config['iris_sjc_matrix_mem_gb'] * 1024, + time_hours=config['iris_sjc_matrix_time_hr'], + shell: + '{params.conda_wrapper} {params.conda_env_2} IRIS sjc_matrix' + ' --file-list-input {input.sj_list}' + ' --data-name {params.run_name}' + ' --sample-name-field {params.sample_name_field}' + ' --iris-db-path {params.db_sjc}' + ' 1> {log.out}' + ' 2> {log.err}' + + +def iris_screen_out_dir_param(wildcards, output): + return os.path.dirname(output.guided) + + +def iris_screen_out_files(): + out_files = dict() + out_dir = os.path.join(result_dir(), 'screen') + out_prefix = '{}.{}'.format(config['run_name'], config['splice_event_type']) + out_files['guided'] = os.path.join( + out_dir, '{}.test.all_guided.txt'.format(out_prefix)) + out_files['voted'] = os.path.join( + out_dir, '{}.test.all_voted.txt'.format(out_prefix)) + out_files['notest'] = os.path.join( + out_dir, '{}.notest.txt'.format(out_prefix)) + out_files['tier1'] = os.path.join( + out_dir, '{}.tier1.txt'.format(out_prefix)) + out_files['tier2tier3'] = os.path.join( + out_dir, '{}.tier2tier3.txt'.format(out_prefix)) + + return out_files + + +rule iris_screen: + input: + parameter_file=os.path.join(result_dir(), 'screen.para'), + gtf=os.path.join('references', config['gtf_name']), + splice_txt=splicing_matrix_txt_path_for_run(), + splice_idx=splicing_matrix_idx_path_for_run(), + output: + **iris_screen_out_files() + log: + out=os.path.join(result_dir(), 'iris_screen_log.out'), + err=os.path.join(result_dir(), 'iris_screen_log.err'), + params: + conda_wrapper=config['conda_wrapper'], + conda_env_2=config['conda_env_2'], + splice_event_type=config['splice_event_type'], + out_dir=iris_screen_out_dir_param, + resources: + mem_mb=config['iris_screen_mem_gb'] * 1024, + time_hours=config['iris_screen_time_hr'], + shell: + '{params.conda_wrapper} {params.conda_env_2} IRIS screen' + ' --parameter-fin {input.parameter_file}' + ' --splicing-event-type {params.splice_event_type}' + ' --outdir {params.out_dir}' + ' --translating' # runs IRIS translate within IRIS screen + ' --gtf {input.gtf}' + ' 1> {log.out}' + ' 2> {log.err}' + + +def iris_predict_out_dir_param(wildcards, output): + return os.path.dirname(output.predict_out[0]) + + +def iris_predict_out_file_names(): + tier_names = list() + if has_tier_1(): + tier_names.append('tier1') + if has_tier_3(): + tier_names.append('tier2tier3') + + names = list() + for tier_name in tier_names: + basename = '{}.{}.{}.txt.ExtraCellularAS.txt'.format( + config['run_name'], config['splice_event_type'], tier_name) + name = os.path.join(result_dir(), 'screen', basename) + names.append(name) + + return names + + +def iris_predict_input(wildcards): + inputs = dict() + inputs['parameter_file'] = os.path.join(result_dir(), 'screen.para') + inputs['screen_out'] = iris_screen_out_files()['guided'] + inputs['mhc_list'] = hla_types_list_for_run() + gene_path = gene_exp_matrix_path_for_run() + if gene_path: + inputs['gene_exp_matrix'] = gene_path + + return inputs + + +def iris_predict_gene_exp_param(): + gene_exp_path = gene_exp_matrix_path_for_run() + if not gene_exp_path: + return '' + + return ' --gene-exp-matrix {}'.format(gene_exp_path)
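iris_predict_gene_exp_param() above feeds the shell template through params so the --gene-exp-matrix flag disappears entirely when no matrix is configured. A minimal sketch of that idiom, with a hypothetical rule and config key:

    def optional_gene_exp_flag():
        # assumed config key; empty string means 'flag omitted'
        path = config.get('gene_exp_matrix', '')
        return ' --gene-exp-matrix {}'.format(path) if path else ''

    rule predict_like:
        output:
            'results/predict_cmd.txt'
        params:
            gene_exp=optional_gene_exp_flag()
        shell:
            'echo IRIS predict{params.gene_exp} > {output}'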
+ + +rule iris_predict: + input: + unpack(iris_predict_input), + output: + predict_out=iris_predict_out_file_names(), + log: + out=os.path.join(result_dir(), 'iris_predict_log.out'), + err=os.path.join(result_dir(), 'iris_predict_log.err'), + params: + conda_wrapper=config['conda_wrapper'], + conda_env_2=config['conda_env_2'], + out_dir=iris_predict_out_dir_param, + task_dir=os.path.join(result_dir(), 'predict_tasks'), + splice_event_type=config['splice_event_type'], + iedb_path=config['iedb_path'], + task_wildcard_string=os.path.join(result_dir(), 'predict_tasks', + 'pep2epitope_{}.tier*.*.sh'.format( + config['splice_event_type'])), + gene_exp=iris_predict_gene_exp_param(), + resources: + mem_mb=config['iris_predict_mem_gb'] * 1024, + time_hours=config['iris_predict_time_hr'], + shell: + # Remove any existing task scripts. + # Usually snakemake automatically removes output before running a job, but + # in this case the number of output files is not known in advance. + 'if [[ -n "$(ls {params.task_wildcard_string})" ]];' + ' then rm {params.task_wildcard_string};' + ' fi;' + ' {params.conda_wrapper} {params.conda_env_2} IRIS predict' + ' {params.out_dir}' + ' --task-dir {params.task_dir}' + ' --parameter-fin {input.parameter_file}' + ' --splicing-event-type {params.splice_event_type}' + ' --iedb-local {params.iedb_path}' + ' --mhc-list {input.mhc_list}' + ' {params.gene_exp}' + ' 1> {log.out}' + ' 2> {log.err}' + + +def count_iris_predict_tasks_task_dir_param(wildcards, output): + return os.path.dirname(output.predict_task_list) + + +checkpoint count_iris_predict_tasks: + input: + predict_out=iris_predict_out_file_names(), + output: + predict_task_list=os.path.join(result_dir(), 'predict_tasks', + 'predict_tasks_list.txt'), + log: + out=os.path.join(result_dir(), 'predict_tasks', + 'count_iris_predict_tasks_log.out'), + err=os.path.join(result_dir(), 'predict_tasks', + 'count_iris_predict_tasks_log.err'), + params: + conda_wrapper=config['conda_wrapper'], + conda_env_3=config['conda_env_3'], + script=os.path.join('scripts', 'count_iris_predict_tasks.py'), + task_dir=count_iris_predict_tasks_task_dir_param, + splice_type=config['splice_event_type'], + resources: + mem_mb=DEFAULT_MEM_MB, + time_hours=DEFAULT_TIME_HOURS, + shell: + '{params.conda_wrapper} {params.conda_env_3} python {params.script}' + ' --out-list {output.predict_task_list}' + ' --task-dir {params.task_dir}' + ' --splice-type {params.splice_type}' + ' 1> {log.out}' + ' 2> {log.err}' + + +rule iris_predict_task: + input: + predict_task=os.path.join(result_dir(), 'predict_tasks', '{task_name}.sh'), + output: + predict_task_done=touch(os.path.join(result_dir(), 'predict_tasks', + '{task_name}.sh.done')), + log: + out=os.path.join(result_dir(), 'predict_tasks', '{task_name}_log.out'), + err=os.path.join(result_dir(), 'predict_tasks', '{task_name}_log.err'), + params: + conda_wrapper=config['conda_wrapper'], + conda_env_2=config['conda_env_2'], + resources: + mem_mb=config['iris_predict_task_mem_gb'] * 1024, + time_hours=config['iris_predict_task_time_hr'], + shell: + '{params.conda_wrapper} {params.conda_env_2} bash' + ' {input.predict_task}' + ' 1> {log.out}' + ' 2> {log.err}'
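iris_predict_task above runs one generated script per {task_name} and records completion with a touch() sentinel, since the script's real outputs are not known to snakemake. The core of the pattern, reduced to a sketch with illustrative paths:

    rule run_task:
        input:
            script='tasks/{task_name}.sh'
        output:
            # created only after the shell command succeeds
            done=touch('tasks/{task_name}.sh.done')
        shell:
            'bash {input.script}'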
tier2tier3_dir, 'epitope_summary.junction-based.txt') + files['tier2tier3_peptide'] = os.path.join( + tier2tier3_dir, 'epitope_summary.peptide-based.txt') + files['tier2tier3_filtered'] = os.path.join( + tier2tier3_dir, 'pred_filtered.score500.txt') + + return files + + +def iris_predict_task_done_file_names(): + tasks_list_file = checkpoints.count_iris_predict_tasks.get().output[0] + predict_tasks_done = list() + with open(tasks_list_file, 'rt') as handle: + for line in handle: + task_file = line.strip() + task_done_file = '{}.done'.format(task_file) + predict_tasks_done.append(task_done_file) + + return predict_tasks_done + + +def iris_epitope_post_input(wildcards): + inputs = dict() + predict_tasks_done = iris_predict_task_done_file_names() + inputs['predict_tasks_done'] = predict_tasks_done + inputs['parameter_file'] = os.path.join(result_dir(), 'screen.para') + inputs['mhc_by_sample'] = hla_from_patients_for_run() + gene_exp = gene_exp_matrix_path_for_run() + if gene_exp: + inputs['gene_exp_matrix'] = gene_exp + + return inputs + + +def iris_epitope_post_gene_exp_param(): + gene_exp_path = gene_exp_matrix_path_for_run() + if not gene_exp_path: + return '' + + return ' --gene-exp-matrix {}'.format(gene_exp_path) + + +rule iris_epitope_post: + input: + unpack(iris_epitope_post_input), + output: + **iris_epitope_post_out_files() + log: + out=os.path.join(result_dir(), 'iris_epitope_post_log.out'), + err=os.path.join(result_dir(), 'iris_epitope_post_log.err'), + params: + conda_wrapper=config['conda_wrapper'], + conda_env_2=config['conda_env_2'], + out_dir=iris_epitope_post_out_dir_param(), + splice_event_type=config['splice_event_type'], + gene_exp=iris_epitope_post_gene_exp_param(), + resources: + mem_mb=config['iris_epitope_post_mem_gb'] * 1024, + time_hours=config['iris_epitope_post_time_hr'], + shell: + '{params.conda_wrapper} {params.conda_env_2} IRIS epitope_post' + ' --parameter-fin {input.parameter_file}' + ' --outdir {params.out_dir}' + ' --splicing-event-type {params.splice_event_type}' + ' --mhc-by-sample {input.mhc_by_sample}' + ' {params.gene_exp}' + ' 1> {log.out}' + ' 2> {log.err}' + + +def iris_screen_sjc_out_dir_param(wildcards, output): + return os.path.dirname(output.screen_sjc_out) + + +def iris_screen_sjc_out_file_name(): + run_name = config['run_name'] + splice_type = config['splice_event_type'] + name = 'SJ.{}.{}.summary_by_sig_event.txt'.format(run_name, splice_type) + return os.path.join(result_dir(), 'screen_sjc', name) + + +rule iris_screen_sjc: + input: + parameter_file=os.path.join(result_dir(), 'screen.para'), + splice_txt=splicing_matrix_txt_path_for_run(), + splice_idx=splicing_matrix_idx_path_for_run(), + sjc_count_txt=sjc_count_txt_path_for_run(), + sjc_count_idx=sjc_count_idx_path_for_run(), + output: + screen_sjc_out=iris_screen_sjc_out_file_name(), + log: + out=os.path.join(result_dir(), 'iris_screen_sjc_log.out'), + err=os.path.join(result_dir(), 'iris_screen_sjc_log.err'), + params: + conda_wrapper=config['conda_wrapper'], + conda_env_2=config['conda_env_2'], + splice_event_type=config['splice_event_type'], + out_dir=iris_screen_sjc_out_dir_param, + resources: + mem_mb=config['iris_screen_sjc_mem_gb'] * 1024, + time_hours=config['iris_screen_sjc_time_hr'], + shell: + '{params.conda_wrapper} {params.conda_env_2} IRIS screen_sjc' + ' --parameter-file {input.parameter_file}' + ' --splicing-event-type {params.splice_event_type}' + ' --event-list-file {input.splice_txt}' + ' --outdir {params.out_dir}' + ' 1> {log.out}' + ' 2> {log.err}' + + +def 
+ + +def iris_append_sjc_out_dir_param(wildcards): + input = iris_append_sjc_input(wildcards) + return os.path.dirname(input['event_list']) + + +def iris_append_sjc_event_list_for_tier(tier): + screen_out_files = iris_screen_out_files() + if tier not in ['tier1', 'tier2tier3']: + raise Exception('iris_append_sjc_event_list_for_tier({}): unexpected tier' + .format(tier)) + + return screen_out_files.get(tier) + + +def iris_append_sjc_out_file_name_for_tier(tier): + run_name = config['run_name'] + splice_type = config['splice_event_type'] + name = '{}.{}.{}.txt.ijc_info.txt'.format(run_name, splice_type, tier) + return os.path.join(result_dir(), 'screen', name) + + +def iris_append_sjc_out_file_name_with_tier_wildcard(): + run_name = config['run_name'] + splice_type = config['splice_event_type'] + name = '{}.{}.{{tier}}.txt.ijc_info.txt'.format(run_name, splice_type) + return os.path.join(result_dir(), 'screen', name) + + +def iris_append_sjc_input(wildcards): + inputs = dict() + predict_tasks_done = iris_predict_task_done_file_names() + inputs['predict_tasks_done'] = predict_tasks_done + inputs['parameter_file'] = os.path.join(result_dir(), 'screen.para') + inputs['screen_sjc_out'] = iris_screen_sjc_out_file_name() + inputs['event_list'] = iris_append_sjc_event_list_for_tier(wildcards.tier) + + epitope_post_out_files = iris_epitope_post_out_files() + junction_key = '{}_junction'.format(wildcards.tier) + inputs['epitope_post_junction'] = epitope_post_out_files[junction_key] + return inputs + + +rule iris_append_sjc: + input: + unpack(iris_append_sjc_input), + output: + append_sjc_out=iris_append_sjc_out_file_name_with_tier_wildcard(), + log: + out=os.path.join(result_dir(), 'iris_append_sjc_{tier}_log.out'), + err=os.path.join(result_dir(), 'iris_append_sjc_{tier}_log.err'), + params: + conda_wrapper=config['conda_wrapper'], + conda_env_2=config['conda_env_2'], + splice_event_type=config['splice_event_type'], + out_dir=iris_append_sjc_out_dir_param, + resources: + mem_mb=config['iris_append_sjc_mem_gb'] * 1024, + time_hours=config['iris_append_sjc_time_hr'], + shell: + '{params.conda_wrapper} {params.conda_env_2} IRIS append_sjc' + ' --sjc-summary {input.screen_sjc_out}' + ' --splicing-event-type {params.splice_event_type}' + ' --outdir {params.out_dir}' + ' --add-ijc-info' # runs IRIS annotate_ijc within IRIS append_sjc + ' --parameter-file {input.parameter_file}' + ' --screening-result-event-list {input.event_list}' + ' 1> {log.out}' + ' 2> {log.err}'
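iris_append_sjc_out_file_name_with_tier_wildcard() above depends on str.format brace escaping: doubled braces survive formatting, so {tier} reaches snakemake as a wildcard. In isolation:

    # '{{tier}}' is reduced to '{tier}' by str.format
    # ('my_run' and 'SE' are example values)
    name = '{}.{}.{{tier}}.txt.ijc_info.txt'.format('my_run', 'SE')
    print(name)  # my_run.SE.{tier}.txt.ijc_info.txt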
+ + +def iris_visual_summary_input(wildcards): + inputs = dict() + inputs['parameter_file'] = os.path.join(result_dir(), 'screen.para') + splice_type = config['splice_event_type'] + tier1_dir = os.path.join(result_dir(), 'screen', + '{}.tier1'.format(splice_type)) + if has_tier_1(): + inputs['tier1_peptide'] = os.path.join( + tier1_dir, 'epitope_summary.peptide-based.txt') + + tier2tier3_dir = os.path.join(result_dir(), 'screen', + '{}.tier2tier3'.format(splice_type)) + if has_tier_3(): + inputs['tier2tier3_peptide'] = os.path.join( + tier2tier3_dir, 'epitope_summary.peptide-based.txt') + + return inputs + + +rule iris_visual_summary: + input: + unpack(iris_visual_summary_input), + output: + summary=os.path.join(result_dir(), 'visualization', 'summary.png'), + log: + out=os.path.join(result_dir(), 'visualization', + 'iris_visual_summary_log.out'), + err=os.path.join(result_dir(), 'visualization', + 'iris_visual_summary_log.err'), + params: + conda_wrapper=config['conda_wrapper'], + conda_env_2=config['conda_env_2'], + splice_event_type=config['splice_event_type'], + screen_dir=os.path.join(result_dir(), 'screen'), + resources: + mem_mb=config['iris_visual_summary_mem_gb'] * 1024, + time_hours=config['iris_visual_summary_time_hr'], + shell: + '{params.conda_wrapper} {params.conda_env_2} IRIS visual_summary' + ' --parameter-fin {input.parameter_file}' + ' --screening-out-dir {params.screen_dir}' + ' --out-file-name {output.summary}' + ' --splicing-event-type {params.splice_event_type}' + ' 1> {log.out}' + ' 2> {log.err}' diff --git a/bin/IRIS b/bin/IRIS index 799cc64..af2c1d5 100644 --- a/bin/IRIS +++ b/bin/IRIS @@ -19,15 +19,15 @@ def main(): subcommand = args.subcommand - if subcommand == 'formatting': + if subcommand == 'format': from IRIS import IRIS_formatting IRIS_formatting.main(args) - elif subcommand == 'screening': + elif subcommand == 'screen': from IRIS import IRIS_screening IRIS_screening.main(args) - elif subcommand == 'prediction': + elif subcommand == 'predict': from IRIS import IRIS_prediction IRIS_prediction.main( args ) @@ -39,19 +39,39 @@ def main(): from IRIS import IRIS_process_rnaseq IRIS_process_rnaseq.main(args) - elif subcommand == 'makeqsub_rmats': - from IRIS import IRIS_makeqsub_rmats - IRIS_makeqsub_rmats.main(args) + elif subcommand == 'makesubsh_mapping': + from IRIS import IRIS_makesubsh_mapping + IRIS_makesubsh_mapping.main(args) + + elif subcommand == 'makesubsh_rmats': + from IRIS import IRIS_makesubsh_rmats + IRIS_makesubsh_rmats.main(args) + + elif subcommand == 'makesubsh_rmatspost': + from IRIS import IRIS_makesubsh_rmatspost + IRIS_makesubsh_rmatspost.main(args) elif subcommand == 'exp_matrix': from IRIS import IRIS_exp_matrix IRIS_exp_matrix.main(args) + + elif subcommand == 'makesubsh_extract_sjc': + from IRIS import IRIS_makesubsh_extractsj + IRIS_makesubsh_extractsj.main(args) + + elif subcommand == 'extract_sjc': + from IRIS import IRIS_extract_sjc + IRIS_extract_sjc.main(args) + + elif subcommand == 'sjc_matrix': + from IRIS import IRIS_sjc_matrix + IRIS_sjc_matrix.main(args) - elif subcommand == 'indexing': + elif subcommand == 'index': from IRIS import IRIS_indexing IRIS_indexing.main(args) - elif subcommand == 'translation': + elif subcommand == 'translate': from IRIS import IRIS_translation IRIS_translation.main(args) @@ -59,13 +79,41 @@ def main(): from IRIS import IRIS_pep2epitope IRIS_pep2epitope.main(args) - elif subcommand == 'screening_plot': + elif subcommand == 'screen_plot': from IRIS import IRIS_screening_plot IRIS_screening_plot.main(args) - elif subcommand == 'seq2hla': - from IRIS import IRIS_seq2hla - IRIS_seq2hla.main(args) + elif subcommand == 'screen_sjc': + from IRIS import IRIS_screening_sjc + IRIS_screening_sjc.main(args) + + elif subcommand == 'append_sjc': + from IRIS import IRIS_append_sjc + IRIS_append_sjc.main(args) + + elif subcommand == 'annotate_ijc': + from IRIS import IRIS_annotate_ijc + IRIS_annotate_ijc.main(args) + + elif subcommand == 'screen_cpm': + from IRIS import IRIS_screening_cpm + IRIS_screening_cpm.main(args) + + elif subcommand == 'append_cpm': + from IRIS import IRIS_append_cpm + IRIS_append_cpm.main(args) + + elif subcommand == 'screen_novelss': + from IRIS import IRIS_screening_novelss + IRIS_screening_novelss.main(args) + + elif subcommand == 'screen_sjc_plot': + from IRIS import IRIS_screening_sjcplot + IRIS_screening_sjcplot.main(args) + + elif subcommand == 'makesubsh_hla': + from IRIS import IRIS_makesubsh_hla + IRIS_makesubsh_hla.main(args) elif subcommand == 'parse_hla': from IRIS import IRIS_parse_hla
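The dispatch chain in bin/IRIS imports each subcommand's module only after the subcommand has been selected, so startup stays fast and one subcommand's optional dependencies cannot break another. The same idea as a table-driven sketch (hypothetical; the script itself uses explicit elif branches):

    import importlib

    SUBCOMMANDS = {
        'screen': 'IRIS.IRIS_screening',
        'predict': 'IRIS.IRIS_prediction',
        'visual_summary': 'IRIS.IRIS_visual_summary',
    }

    def dispatch(subcommand, args):
        # the import happens here, not at startup
        module = importlib.import_module(SUBCOMMANDS[subcommand])
        module.main(args)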
@@ -83,6 +131,10 @@ def main(): from IRIS import IRIS_ms_parse IRIS_ms_parse.main(args) + elif subcommand == 'visual_summary': + from IRIS import IRIS_visual_summary + IRIS_visual_summary.main(args) + def get_arg_parser(): """DOCSTRING Args @@ -102,56 +154,86 @@ def get_arg_parser(): add_prediction_parser(subparsers) add_epitope_post_parser(subparsers) add_process_rnaseq_parser(subparsers) - add_rmats_prep_parser(subparsers) + add_makesubsh_mapping_parser(subparsers) + add_makesubsh_rmats_parser(subparsers) + add_makesubsh_rmatspost_parser(subparsers) add_exp_matrix_parser(subparsers) + add_makesubsh_extractsj(subparsers) + add_extract_sjc(subparsers) + add_sjc_matrix(subparsers) add_indexing_parser(subparsers) add_translation_parser(subparsers) add_pep2epitope_parser(subparsers) add_screening_plot_parser(subparsers) - add_seq2hla_parser(subparsers) + add_screening_sjc_parser(subparsers) + add_append_sjc_parser(subparsers) + add_annotate_ijc_parser(subparsers) + add_screening_cpm_parser(subparsers) + add_append_cpm_parser(subparsers) + add_screening_novelss_parser(subparsers) + add_screening_sjcplot_parser(subparsers) + add_makesubsh_hla_parser(subparsers) add_parse_hla_parser(subparsers) add_ms_makedb_parser(subparsers) add_ms_search_parser(subparsers) add_ms_parse_parser(subparsers) + add_visual_summary_parser(subparsers) return argparser def add_formatting_parser( subparsers ): - arg_formatting = subparsers.add_parser("formatting", help="Formats AS matrices from rMATS, followed by indexing for IRIS") + arg_formatting = subparsers.add_parser("format", help="Formats AS matrices from rMATS, followed by indexing for IRIS") optional_args = arg_formatting._action_groups.pop() required_args = arg_formatting.add_argument_group('required arguments') - required_args.add_argument('rmats_mat_path_manifest', help='A txt manifest of path(s) to rMATS output folder(s).') - required_args.add_argument('rmats_sample_order', help='A txt manifest of corresponding rMATS input sample order file(s), which is an required input for rMATS.') - required_args.add_argument('-t','--splicing_event_type', choices=['SE','RI','A3','A5'],help='A string of splicing event type based on rMATS defination (SE,RI,A3,A5). Will be used to name the output file name.', required=True) - required_args.add_argument('-n', '--data-name', help='Name of the dataset (disease state, study name, group name etc.). This will be also used during IRIS screening.', required=True) - required_args.add_argument('-s', '--sample-name-field',type=int, choices=[1, 2], help='Specify a field as the sample name field for each sample in the sample order file(s) listed by "rmats_sample_order". 1- use the BAM file name,2- use the BAM folder name. ', required=True) - optional_args.add_argument('-c', '--cov-cutoff', default=10, type=float, help='Average coverage filter for the merged matrix. Defualt is 10.') - optional_args.add_argument('-e', '--merge-events-only', default=False, action="store_true" ,help='Will not perform the matrix merge, only merge the events list.') - optional_args.add_argument('-d', '--iris-db-path', default='.', help='The path to the IRIS database. The formatted/indexed AS matrices will be added to db and used for IRIS screening. Output to "." when the path is not specified.') + required_args.add_argument('rmats_mat_path_manifest', help='txt manifest of path(s) to rMATS output folder(s)') + required_args.add_argument('rmats_sample_order', help='TXT file manifest of corresponding rMATS input sample order file(s). Required input for rMATS')
+ required_args.add_argument('-t','--splicing-event-type', choices=['SE','RI','A3SS','A5SS'],help='String of splicing event types based on rMATS definition (SE,RI,A3SS,A5SS). Used to name output file', required=True) + required_args.add_argument('-n', '--data-name', help='Defines dataset name (disease state, study name, group name etc.). Used during IRIS screening', required=True) + required_args.add_argument('-s', '--sample-name-field',type=int, choices=[1, 2], help='Specifies the sample name field (1- SJ count file name, 2- SJ count folder name); each sample name should match its name in "rmats_sample_order"', required=True) + optional_args.add_argument('-c', '--cov-cutoff', default=10, type=float, help='Average coverage filter for merged matrix (Default is 10)') + optional_args.add_argument('-i', '--sample-based-filter', default=False, action="store_true" ,help='Apply the coverage filter per individual sample rather than to the entire input group (Default is disabled)') + optional_args.add_argument('-e', '--merge-events-only', default=False, action="store_true" ,help='Do not perform matrix merge, only merge events list') + optional_args.add_argument('-d', '--iris-db-path', default='.', help='Path to store the formatted/indexed AS matrix. Strongly recommended: store the AS matrix in the IRIS db by setting the path to the directory containing folders of pre-indexed AS references ("full_path/IRIS_data.vX/db"). Default is the current location.') + optional_args.add_argument('--novelSS', default=False, action="store_true", help='Enable formatting of events with splice junctions containing novelSS (different from, and a subset of, the rMATS novelSS definition. Default is False)') + optional_args.add_argument('--gtf', help='Path to the Genome annotation GTF file. Required input when novelSS is enabled.') arg_formatting._action_groups.append(optional_args) return
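Every add_*_parser function here uses the same argparse trick: pop the auto-created 'optional arguments' group, add a 'required arguments' group, then re-append the optional group so --help lists required options before optional ones. In isolation:

    import argparse

    parser = argparse.ArgumentParser(prog='demo')
    optional_args = parser._action_groups.pop()   # the default optional group
    required_args = parser.add_argument_group('required arguments')
    required_args.add_argument('-n', '--data-name', required=True)
    optional_args.add_argument('-c', '--cov-cutoff', default=10, type=float)
    parser._action_groups.append(optional_args)   # optional listed after required
    args = parser.parse_args(['-n', 'demo_run'])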
def add_screening_parser( subparsers ): - arg_screening = subparsers.add_parser("screening", help="Screens AS-derived tumor antigens using big-data reference") + arg_screening = subparsers.add_parser("screen", help="Screens AS-derived tumor antigens using big-data reference") optional_args = arg_screening._action_groups.pop() required_args=arg_screening.add_argument_group('required arguments') - required_args.add_argument('parameter_fin', help='A file of IRIS screening parameters.') - required_args.add_argument('-o', '--outdir', help='Output directory for IRIS screening.') - optional_args.add_argument('-t', '--translating', action= "store_true", help='Translating IRIS screened tumor splice junction into peptides.') + required_args.add_argument('-p', '--parameter-fin', help="File of 'IRIS screen' parameters",required=True) + required_args.add_argument('--splicing-event-type', default='SE', choices=['SE','RI','A3SS','A5SS'],help='String of splicing event types based on rMATS definition (SE,RI,A3SS,A5SS). Used to name output file. (Default is SE event)') + required_args.add_argument('-o', '--outdir', help='Directory of IRIS screening results', required= True) + optional_args.add_argument('-t', '--translating', action= "store_true", help='Translates IRIS-screened tumor splice junctions into peptides') + optional_args.add_argument('-g', '--gtf', help='The Genome annotation GTF file. Required by the IRIS translate option.') + optional_args.add_argument('--all-orf', default=False, action= "store_true", help='Perform the 3 ORF translation. ORFs known in the UniProtKB will be labeled as uniprotFrame in the bed file (Default is to use the known ORF ONLY)') + optional_args.add_argument('--ignore-annotation', default=False, action= "store_true", help='Perform 3 ORF translation without annotating known ORFs from the UniProtKB (Default is disabled)') + optional_args.add_argument('--remove-early-stop', default=False, action= "store_true", help='Discard the peptide if it contains an early stop codon (Default is to keep the truncated peptide)') + optional_args.add_argument('--min-sample-count', default=False, help='The minimum number of non-missing samples in the input group for an event to be considered for testing. Once specified, removed events will be written to the "notest" file. (Default is no minimum)') + optional_args.add_argument('--use-existing-test-result', default=False, action= "store_true", help='Skip testing and use the existing testing result (Default is to run the full testing steps)') arg_screening._action_groups.append(optional_args) return def add_prediction_parser( subparsers ): - arg_prediction = subparsers.add_parser("prediction", help="Predicts and annotates AS-derived TCR (pre-prediction) and CAR-T targets") + arg_prediction = subparsers.add_parser("predict", help="Predicts and annotates AS-derived TCR (pre-prediction) and CAR-T targets") optional_args = arg_prediction._action_groups.pop() required_args = arg_prediction.add_argument_group('required arguments') - required_args.add_argument('IRIS_screening_result_path', help='The same output directory of IRIS screening.') - required_args.add_argument('-p','--parameter-fin', help='The parameter file used in IRIS screening.') - required_args.add_argument('--iedb-local', help='Specify local IEDB location (Needs to be installed).') - optional_args.add_argument('-c','--deltaPSI-column', default=5, help='Column of deltaPSI value in the matrix, 1-based. Default is the 5th column.') - optional_args.add_argument('-d','--deltaPSI-cut-off', default=0, help='Define the cutoff of deltaPSI (or other metric) to be used to select tumor-enriched splice form. Default is 0.') - required_args.add_argument('-m','--mhc-list', help='A list of HLA/MHC types among samples. HLA type follows seq2HLA format.',required=True) - optional_args.add_argument('--extracellular-anno-by-junction', action="store_true", help='The default is to annotate CAR-T target based on if an event is associated with extracellular domain. This option is to annotate target based on a junction (Not recommanded).' ) + required_args.add_argument('IRIS_screening_result_path', help='Directory of IRIS screening results') + required_args.add_argument('--task-dir', help='Directory to write individual task scripts', required=True) + required_args.add_argument('-p','--parameter-fin', help="File of parameters used in 'IRIS screen'",required=True) + required_args.add_argument('-t','--splicing-event-type', default='SE', choices=['SE','RI','A3SS','A5SS'],help='String of splicing event types based on rMATS definition (SE,RI,A3SS,A5SS). Used to name output file. (Default is SE event)') + optional_args.add_argument('--iedb-local', help='Specify local IEDB location (if installed)') + optional_args.add_argument('-m','--mhc-list', help='List of HLA/MHC types among samples. HLA type follows seq2HLA format') + optional_args.add_argument('--extracellular-only', default=False, action="store_true", help='Only predict CAR-T targets. Will not predict HLA binding.')
+ optional_args.add_argument('--tier3-only', default=False, action="store_true", help='Only run prediction on events that pass all screening tiers (the tier3 output). Much faster when both tier1 and tier3 were used.') + optional_args.add_argument('--gene-exp-matrix', default=False, help='Tab-delimited matrix of gene expression vs. samples') + optional_args.add_argument('-c','--deltaPSI-column', default=5, help='Column of deltaPSI value in matrix, 1-based (Default is 5th column)') + optional_args.add_argument('-d','--deltaPSI-cut-off', default=0, help='Defines cutoff of deltaPSI (or other metric) to select tumor-enriched splice form (Default is 0)') + optional_args.add_argument('-e', '--epitope-len-list', default='9,10,11', help='Epitope length for prediction (Default is 9,10,11)') + optional_args.add_argument('--all-orf', default=False, action= "store_true", help='Perform prediction based on 3 ORF translation peptides. Enable this if translation/screening used this option (Default is False)') + optional_args.add_argument('--extracellular-anno-by-junction', action="store_true", help='By default, CAR-T targets are annotated by association of the event with an extracellular domain. This option annotates targets based on a junction (not recommended)' ) arg_prediction._action_groups.append(optional_args) return @@ -159,11 +241,17 @@ def add_epitope_post_parser( subparsers ): arg_epitope_post = subparsers.add_parser("epitope_post", help="Post-prediction step to summarize predicted TCR targets") optional_args = arg_epitope_post._action_groups.pop() required_args = arg_epitope_post.add_argument_group('required arguments') - required_args.add_argument('-p','--parameter_fin', help='The parameter file used in IRIS screening.', required=True) - required_args.add_argument('-o','--outdir', help='The same output directory of IRIS screening.', required=True) - required_args.add_argument('-m','--mhc-by-sample', help='A tsv file of HLA/MHC type vs. samples. HLA type follows seq2HLA format.', required=True) - required_args.add_argument('-e','--gene-exp-matrix', default=False, help='A tsv file of gene expression vs. samples.') - optional_args.add_argument('--ic50-cut-off', default=500, type=float, help='The IC50 cut-off to define HLA-binding epitopes. default is 500.') + required_args.add_argument('-p','--parameter-fin', help='File of parameters used in IRIS screen', required=True) + required_args.add_argument('-o','--outdir', help='Directory of IRIS screening results', required=True) + required_args.add_argument('-t','--splicing-event-type', default='SE', choices=['SE','RI','A3SS','A5SS'],help='String of splicing event types based on rMATS definition (SE,RI,A3SS,A5SS). Used to name output file (Default is SE event)') + required_args.add_argument('-m','--mhc-by-sample', help='Tab-delimited matrix of HLA/MHC type vs. samples. HLA type follows seq2HLA format', required=True) + optional_args.add_argument('-e','--gene-exp-matrix', default=False, help='Tab-delimited matrix of gene expression vs. samples') + optional_args.add_argument('--tier3-only', default=False, action="store_true", help='Only predict tier3 events. Will be much faster.') + optional_args.add_argument('--keep-exist', default=False, action="store_true", help='Do not overwrite an existing positive prediction file (Default is False)')
+ optional_args.add_argument('--epitope-len-list', default='9,10,11', help='Epitope length for prediction (Default is 9,10,11)') + optional_args.add_argument('--no-match-to-canonical-proteome', default=False, action="store_true", help='Disables matching epitopes to UniProt canonical protein sequences as an annotation.') + optional_args.add_argument('--no-uniqueness-annotation', default=False, action="store_true", help='Disables matching epitopes to all IRIS translated junction peptides in the same analysis as an annotation.') + optional_args.add_argument('--ic50-cut-off', default=500, type=float, help='Specifies IC50 cut-off to define HLA-binding epitopes (Default is 500)') arg_epitope_post._action_groups.append(optional_args) return @@ -172,58 +260,143 @@ def add_process_rnaseq_parser( subparsers ): optional_args = arg_process_rnaseq._action_groups.pop() required_args = arg_process_rnaseq.add_argument_group('required arguments') required_args.add_argument('--starGenomeDir',help='The path to the STAR indexed reference genome. Pass to the "genomeDir" parameter in STAR', required=True) - required_args.add_argument('--gtf',help='Genome annotation file.', required=True) - required_args.add_argument('-p','--sampleID-outdir', help='Output directory where sample ID will be used as the output folder name.', required=True) - required_args.add_argument('--db-length',default=100, help='Pass to the "sjdbOverhang" parameter in STAR. Default is 100.') + required_args.add_argument('--gtf',help='Path to the Genome annotation GTF file', required=True) + required_args.add_argument('-p','--sampleID-outdir', help='Output directory where sample ID will be used as the output folder name', required=True) + required_args.add_argument('--db-length',default=100, help='Pass to the "sjdbOverhang" parameter in STAR. Default is 100') required_args.add_argument('readsFilesRNA',help='Specify the path to the paired-end FASTQ files for the sample. Files are separated by ",".') - optional_args.add_argument('--mapping',help= 'Only perform reads mapping.', action='store_true') + optional_args.add_argument('--mapping',help= 'Only perform reads mapping', action='store_true') optional_args.add_argument('--quant',help='Only perform gene expression and AS quantification', action='store_true') - optional_args.add_argument('--sort',help='Only perform BAM file sorting.',action='store_true') + optional_args.add_argument('--sort',help='Only perform BAM file sorting',action='store_true') arg_process_rnaseq._action_groups.append(optional_args) return -def add_rmats_prep_parser(subparsers): - arg_rmats_prep = subparsers.add_parser("makeqsub_rmats", help="Makes qsub files for running rMATS-turbo 'prep' step") - optional_args = arg_rmats_prep._action_groups.pop() - required_args = arg_rmats_prep.add_argument_group('required arguments') - required_args.add_argument('--rMATS-path',help= 'Path to rMATS-turbo script.', required=True) +def add_makesubsh_mapping_parser(subparsers): + arg_makesubsh_mapping = subparsers.add_parser("makesubsh_mapping", help="Makes submission shell scripts for running 'process_rnaseq'") + optional_args = arg_makesubsh_mapping._action_groups.pop() + required_args = arg_makesubsh_mapping.add_argument_group('required arguments') + required_args.add_argument('--fastq-folder-dir',help='Path to the directory one level above all folders containing FASTQ files') + required_args.add_argument('--starGenomeDir',help='The path to the STAR indexed reference genome. Pass to the "genomeDir" parameter in STAR', required=True)
Pass to the "genomeDir" parameter in STAR', required=True) + required_args.add_argument('--gtf',help='Path to the Genome annotation GTF file', required=True) + required_args.add_argument('--data-name',help='Data set name used to name submission shell scripts files.', required=True) + required_args.add_argument('--outdir',help='Output directory for folders of aligned BAM files', required=True) + required_args.add_argument('--label-string', help='String in the fastq file name between the reads pair number and "fastq/fq". This is used to recognize paired-end reads. e.g. For FASTQ_file_L1_R2.fastq.gz, the label string is the "." between "2" and "fastq".', required=True) + required_args.add_argument('--task-dir', help='Directory to write individual task scripts', required=True) + arg_makesubsh_mapping._action_groups.append(optional_args) + +def add_makesubsh_rmats_parser(subparsers): + arg_makesubsh_rmats = subparsers.add_parser("makesubsh_rmats", help="Makes submission shell scripts for running rMATS-turbo 'prep' step") + optional_args = arg_makesubsh_rmats._action_groups.pop() + required_args = arg_makesubsh_rmats.add_argument_group('required arguments') + required_args.add_argument('--rMATS-path',help= 'Path to the rMATS-turbo script.', required=True) + required_args.add_argument('--bam-dir',help='The path one level higher to folders containing BAM file generated by "process_rnaseq".', required=True) + required_args.add_argument('--bam-prefix', default='Aligned.sortedByCoord.out', help='BAM file prefix (Default is "Aligned.sortedByCoord.out")') + required_args.add_argument('--gtf',help='Path to the Genome annotation GTF file', required=True) + required_args.add_argument('--data-name',help='Data set name used to name submission shell scripts', required=True) + required_args.add_argument('--task-dir', help='Directory to write individual task scripts', required=True) + optional_args.add_argument('--novelSS',default=False, action= "store_true", help='Enable rMATS novelSS option to include novel splice site detected from the RNA-seq data (Default is False)') + optional_args.add_argument('--read-length',default=False, help='User defined read length instead of using STAR maaping log file to define automatically.') + arg_makesubsh_rmats._action_groups.append(optional_args) + return + +def add_makesubsh_rmatspost_parser(subparsers): + arg_makesubsh_rmatspost = subparsers.add_parser("makesubsh_rmatspost", help="Makes submission shell scripts for running rMATS-turbo 'post' step") + optional_args = arg_makesubsh_rmatspost._action_groups.pop() + required_args = arg_makesubsh_rmatspost.add_argument_group('required arguments') + required_args.add_argument('--rMATS-path',help= 'Path to the rMATS-turbo scripte', required=True) required_args.add_argument('--bam-dir',help='The path one level higher to folders containing BAM file generated by "process_rnaseq".', required=True) - required_args.add_argument('--gtf',help='Genome annotation file.', required=True) - required_args.add_argument('--read-length',help='Pass to the "readLength" parameter in rMATS-turbo.', required=True) - arg_rmats_prep._action_groups.append(optional_args) + required_args.add_argument('--gtf',help='Path to the Genome annotation GTF file', required=True) + required_args.add_argument('--data-name',help='Data set name used to name submission shell scripts', required=True) + optional_args.add_argument('--novelSS',default=False, action= "store_true", help='Enable rMATS novelSS option to include novel splice site detected from the RNA-seq data 
+ +def add_makesubsh_rmatspost_parser(subparsers): + arg_makesubsh_rmatspost = subparsers.add_parser("makesubsh_rmatspost", help="Makes submission shell scripts for running rMATS-turbo 'post' step") + optional_args = arg_makesubsh_rmatspost._action_groups.pop() + required_args = arg_makesubsh_rmatspost.add_argument_group('required arguments') + required_args.add_argument('--rMATS-path',help= 'Path to the rMATS-turbo script', required=True) required_args.add_argument('--bam-dir',help='The path one level higher to folders containing BAM file generated by "process_rnaseq".', required=True) - required_args.add_argument('--gtf',help='Genome annotation file.', required=True) - required_args.add_argument('--read-length',help='Pass to the "readLength" parameter in rMATS-turbo.', required=True) - arg_rmats_prep._action_groups.append(optional_args) + required_args.add_argument('--gtf',help='Path to the Genome annotation GTF file', required=True) + required_args.add_argument('--data-name',help='Data set name used to name submission shell scripts', required=True) + optional_args.add_argument('--novelSS',default=False, action= "store_true", help='Enable the rMATS novelSS option to include novel splice sites detected from the RNA-seq data (Default is False)') + required_args.add_argument('--task-dir', help='Directory to write individual task scripts', required=True) + arg_makesubsh_rmatspost._action_groups.append(optional_args) + return -def add_exp_matrix_parser(subparsers): +def add_exp_matrix_parser(subparsers): arg_exp_matrix = subparsers.add_parser("exp_matrix", help="Makes a merged gene expression matrix from multiple cufflinks results") optional_args = arg_exp_matrix._action_groups.pop() required_args = arg_exp_matrix.add_argument_group('required arguments') required_args.add_argument('gene_exp_file_list', help='A txt manifest of path(s) of cufflinks gene expression output(s).') - optional_args.add_argument('--exp-cutoff', default=1, help='Gene expression cut-off based on FPKM. Default is 1.') + optional_args.add_argument('--exp-cutoff', default=1, help='Gene expression cut-off based on FPKM (Default is 1)') optional_args.add_argument('-o','--outdir', default='.',help='Output directory for IRIS exp_matrix', required=True) required_args.add_argument('-n', '--data-name', help='Name of the dataset (disease state, study name, group name etc.).', required=True) arg_exp_matrix._action_groups.append(optional_args) return + +def add_makesubsh_extractsj( subparsers ): + arg_makesubsh_extractsj = subparsers.add_parser('makesubsh_extract_sjc',help="Makes submission shell scripts for running 'extract_sjc'") + optional_args = arg_makesubsh_extractsj._action_groups.pop() + required_args = arg_makesubsh_extractsj.add_argument_group('required arguments') + required_args.add_argument('-b','--bam-folder-list', help='Path to a file listing all paths to BAM folders', required=True) + required_args.add_argument('-g', '--gtf', help='Path to the Genome annotation GTF file', required=True) + required_args.add_argument('-f', '--genome-fasta', help='Path to the reference genome FASTA file', required=True) + required_args.add_argument('-n', '--task-name', help='The task name. Used to name the command file and the bash file', required=True) + required_args.add_argument('--BAM-prefix', default='Aligned.sortedByCoord.out', help='BAM file prefix', required=True) + optional_args.add_argument( + '-r','--rmats-used-read-length', default='', + help=('if not given, the read length will be parsed from' + ' Log.final.out in the bam folder')) + optional_args.add_argument( + '--task-dir', + help=('the directory to write the command and bash file.' + ' Defaults to the working directory'))
+ arg_makesubsh_extractsj._action_groups.append(optional_args) + return + +def add_extract_sjc( subparsers ): + arg_extract_sjc = subparsers.add_parser('extract_sjc',help='Extracts SJ counts from a STAR-aligned BAM file and annotates SJs with the number of uniquely mapped reads that support the splice junction.') + optional_args = arg_extract_sjc._action_groups.pop() + required_args = arg_extract_sjc.add_argument_group('required arguments') + required_args.add_argument('-i','--bam-path', help='Path to BAM files', required=True) + required_args.add_argument('-f', '--genome-fasta', help='Path to the reference genome FASTA file', required=True) + required_args.add_argument('-g', '--gtf', help='Path to the Genome annotation GTF file', required=True) + required_args.add_argument('-a','--minimum-overhang-length-annotated', default=1) + required_args.add_argument('-c','--minimum-overhang-length-unannotated-canonical', default=8) + required_args.add_argument('-u','--minimum-overhang-length-unannotated-noncanonical', default=10) + required_args.add_argument('-o', '--outdir', help='Output directory', required=True) + optional_args.add_argument('-r','--read-length', help='length of reads to keep when counting junction reads') + arg_extract_sjc._action_groups.append(optional_args) + return + +def add_sjc_matrix ( subparsers ): + arg_sjc_matrix = subparsers.add_parser('sjc_matrix',help='Makes an SJ count matrix by merging SJ count files from a specified list of samples. Performs indexing of the merged file.') + optional_args = arg_sjc_matrix._action_groups.pop() + required_args = arg_sjc_matrix.add_argument_group('required arguments') + required_args.add_argument('-i','--file-list-input', help='Path to the file containing a list of SJ count files.', required=True) + required_args.add_argument('-n', '--data-name', help='Defines dataset name (disease state, study name, group name etc.). Used during IRIS screening', required=True) + required_args.add_argument('-s', '--sample-name-field',type=int, choices=[1, 2], help='Specifies the sample name field (1- SJ count file name, 2- SJ count folder name); each sample name should match its name in "rmats_sample_order"', required=True) + optional_args.add_argument('-d', '--iris-db-path', default='.', help='Path to IRIS database. Formatted/indexed AS matrices are stored here and used during IRIS screening') + arg_sjc_matrix._action_groups.append(optional_args) + return + def add_indexing_parser( subparsers ): - arg_indexing = subparsers.add_parser("indexing", help="Indexes AS matrices for IRIS") + arg_indexing = subparsers.add_parser("index", help="Indexes AS matrices for IRIS") optional_args = arg_indexing._action_groups.pop() required_args = arg_indexing.add_argument_group('required arguments') - required_args.add_argument('splicing_matrix', help='A tab-delimited matrix of splicing events (row) vs. sample IDs (col).') - required_args.add_argument('-n', '--data-name', help='Name of the dataset (disease state, study name, group name etc.). This will be also used during IRIS screening.', required=True) - optional_args.add_argument('-d', '--db-dir', default='.', help='The directory of the IRIS database. The program will create folder inside this directory in order to make IRIS recognize.') + required_args.add_argument('splicing_matrix', help='Tab-delimited matrix of splicing events (row) vs. sample IDs (col)')
+ required_args.add_argument('-t','--splicing-event-type', choices=['SE','RI','A3SS','A5SS'],help='String of splicing event types based on rMATS definition (SE,RI,A3SS,A5SS). Used to name output file', required=True) + required_args.add_argument('-n', '--data-name', help='Name of data matrix (disease state, study name, group name, etc.) being indexed. Used by IRIS during screening', required=True) + optional_args.add_argument('-c', '--cov-cutoff', default=10, type=float, help='Used for naming purposes: the average coverage cutoff applied when generating the PSI matrix (Default is 10)') + optional_args.add_argument('-o', '--outdir', default='.', help='Output directory for IRIS database') arg_indexing._action_groups.append(optional_args) return def add_translation_parser( subparsers ): - arg_translation = subparsers.add_parser("translation", help="Translates AS junctions into junction peptides") + arg_translation = subparsers.add_parser("translate", help="Translates AS junctions into junction peptides") optional_args = arg_translation._action_groups.pop() required_args = arg_translation.add_argument_group('required arguments') - required_args.add_argument('as_input', help='A tsv file generated by IRIS screening, containing AS events and deltaPSI value.') - required_args.add_argument('-g','--ref-genome', help='The path to the reference genome file (FASTA).', required=True) - required_args.add_argument('-o','--outdir', help='Output directory for IRIS translation.',required=True ) - optional_args.add_argument('-c','--deltaPSI-column', default=5, help='Column of deltaPSI value in the matrix, 1-based. Default is the 5th column.') - optional_args.add_argument('-d','--deltaPSI-cut-off', default=0, help='Cutoff of deltaPSI (or other metric) to be used to select tumor-enriched splice form. Default is 0.') - optional_args.add_argument('--no-tumor-form-selection', action= "store_true", help='Splicing junctions derived from both skipping and inclusion forms are translated.') + required_args.add_argument('as_input', help='Inputs AS event coordinates and delta PSI values') + required_args.add_argument('-g','--ref-genome', help='Specifies reference genome (FASTA format) location', required=True) + required_args.add_argument('-t','--splicing-event-type', choices=['SE','RI','A3SS','A5SS'],help='String of splicing event types based on rMATS definition (SE,RI,A3SS,A5SS). Used to name output file', required=True) + required_args.add_argument('--gtf',help='Path to the Genome annotation GTF file. Used to define exon ends for microexons', required=True) + required_args.add_argument('-o','--outdir', help='Defines IRIS translation output directory',required=True ) + optional_args.add_argument('--all-orf', default=False, action= "store_true", help='Perform the 3 ORF translation. ORFs known in the UniProtKB will be labeled as uniprotFrame in the bed file (Default is to use the known ORF ONLY)')
+ optional_args.add_argument('--ignore-annotation', default=False, action= "store_true", help='Perform 3 ORF translation without annotating known ORFs from the UniProtKB (Default is disabled)') + optional_args.add_argument('--remove-early-stop', default=False, action= "store_true", help='Discard the peptide if it contains an early stop codon (Default is to keep the truncated peptide)') + optional_args.add_argument('-c','--deltaPSI-column', default=5, help='Column of deltaPSI value in matrix, 1-based (Default is 5th column)') + optional_args.add_argument('-d','--deltaPSI-cut-off', default=0, help='Defines cutoff of deltaPSI (or other metric) used to select tumor-enriched splice form (Default is 0)') + optional_args.add_argument('--no-tumor-form-selection', action= "store_true", help='Translates splicing junctions derived from both skipping and inclusion forms (Default is False)') + optional_args.add_argument('--check-novel', action= "store_true", help='Translates splicing junctions derived from novel splice sites only, using information passed from screen_novelss (Default is False)', default=False) arg_translation._action_groups.append(optional_args) return @@ -231,23 +404,25 @@ def add_pep2epitope_parser( subparsers ): arg_pep2epitope = subparsers.add_parser("pep2epitope", help="Wrapper to run IEDB for peptide-HLA binding prediction") optional_args = arg_pep2epitope._action_groups.pop() required_args = arg_pep2epitope.add_argument_group('required arguments') - required_args.add_argument('junction_pep_input', help='input alternative splicing events coordinates and PSI value.') - required_args.add_argument('-e', '--epitope-len-list', default='9,10,11', help='epitope length for prediction. Default is 9,10,11.') - required_args.add_argument('-a', '--hla-allele-list', default='HLA-A*01:01,HLA-B*08:01,HLA-C*07:01', help='a list of HLA types. Default is HLA-A*01:01,HLA-B*08:01,HLA-C*07:01.') - required_args.add_argument('-o', '--outdir', help='Define the output directory of pep2epitope.', required=True) - required_args.add_argument('--iedb-local', help='Specify local IEDB location if it is installed.') - required_args.add_argument('--ic50-cut-off', default=500, help='Cut-off based on median value of concensus predicted IC50 values. Default is 500.')
+ required_args.add_argument('junction_pep_input', help='Inputs junction peptides') + required_args.add_argument('-e', '--epitope-len-list', default='9,10,11', help='Epitope length for prediction (Default is 9,10,11)') + required_args.add_argument('-a', '--hla-allele-list', default='HLA-A*01:01,HLA-B*08:01,HLA-C*07:01', help='List of HLA types (Default is HLA-A*01:01, HLA-B*08:01, HLA-C*07:01)') + required_args.add_argument('-o', '--outdir', help='Define output directory of pep2epitope', required=True) + required_args.add_argument('--iedb-local', help='Specify local IEDB location (if installed)') + required_args.add_argument('--ic50-cut-off', default=500, help='Cut-off based on median value of consensus-predicted IC50 values (Default is 500)') arg_pep2epitope._action_groups.append(optional_args) return
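For context on the IC50 cutoff of 500: an epitope is usually treated as an HLA binder when the median of the consensus-predicted IC50 values falls at or below the cutoff. A hypothetical sketch, not IRIS's code (the input layout is an assumption):

    import statistics

    def filter_binders(predictions, ic50_cutoff=500.0):
        # predictions: (epitope, IC50 values across prediction methods)
        kept = []
        for epitope, ic50_values in predictions:
            if statistics.median(ic50_values) <= ic50_cutoff:
                kept.append(epitope)
        return kept

    print(filter_binders([('LLDVTAAV', [120.0, 300.0, 800.0]),
                          ('KTWGQYWQV', [900.0, 1500.0, 600.0])]))
    # ['LLDVTAAV']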
between "2" and "fastq".', required=True) + required_args.add_argument('--task-dir', help='Directory to write individual task scripts', required=True) + arg_makesubsh_hla._action_groups.append(optional_args) return def add_parse_hla_parser(subparsers): @@ -259,20 +434,124 @@ def add_parse_hla_parser(subparsers): return def add_screening_plot_parser(subparsers): - arg_screening_plot = subparsers.add_parser("screening_plot",help='Makes stacked/individual violin plots for list of AS events') + arg_screening_plot = subparsers.add_parser("screen_plot",help='Makes stacked/individual violin plots for list of AS events') optional_args = arg_screening_plot._action_groups.pop() required_args = arg_screening_plot.add_argument_group('required arguments') - required_args.add_argument('event_list', help='input alternative splicing events coordinates for visualization.') - required_args.add_argument('-p','--parameter-fin', help='The file of parameters used in IRIS screening.', required=True) - required_args.add_argument('--step','-s', default=10, help='number of events in each plot.') - optional_args.add_argument('--header', action="store_true", help='Skipping the header line in the input event list.') + required_args.add_argument('event_list', help='Inputs AS event coordinates for visualization') + required_args.add_argument('-p','--parameter-file', help="Parameter file for 'IRIS screen'", required=True) + required_args.add_argument('-t', '--splicing-event-type', default='SE', choices=['SE','RI','A3SS','A5SS'],help='String of splicing event types based on rMATS definition (SE,RI,A3SS,A5SS).Used to name output file (Default is SE event)') + required_args.add_argument('--step','-s', default=10, help='Number of events in each plot (Default is 10)') + required_args.add_argument('-o', '--outdir', help='Define the output directory of the plot.', required=True) + optional_args.add_argument('--header', action="store_true", default=False, help='Skipping the header line of the input (Default is False)') arg_screening_plot._action_groups.append(optional_args) return +def add_screening_sjc_parser( subparsers ): + arg_screening_sjc = subparsers.add_parser("screen_sjc", help="Screens AS-derived tumor antigens by comparing number of samples expressing a splice junction using big-data reference of SJ counts") + optional_args = arg_screening_sjc._action_groups.pop() + required_args= arg_screening_sjc.add_argument_group('required arguments') + required_args.add_argument('-p','--parameter-file', help='Parameter file containing SJ db directory, selected data sets, etc.', required=True) + required_args.add_argument('--splicing-event-type', default='SE', choices=['SE','RI','A3SS','A5SS'],help='String of splicing event types based on rMATS definition (SE,RI,A3SS,A5SS).Used to name output file. 
(Default is SE event)') + required_args.add_argument('-e','--event-list-file', help='AS event list in the format of PSI value matrices (see the output format of IRIS format module or IRIS_db PSI matrices)') + required_args.add_argument('-o', '--outdir', help='Directory of IRIS screening results', required= True) + optional_args.add_argument('--use-existing-test-result', default=False, action= "store_true", help='Skip testing and use existing testing result (Default is run full testing steps)') + optional_args.add_argument('--tumor-read-cov-cutoff', default=5, type=int, help='Minimum read coverage for a tumor sample to be considered as expressing the junction (Default is 5)') + optional_args.add_argument('--normal-read-cov-cutoff', default=2, type=int, help='Minimum read coverage for a normal sample to be considered as expressing the junction (Default is 2)') + arg_screening_sjc._action_groups.append(optional_args) + return + +def add_append_sjc_parser( subparsers ): + arg_append_sjc = subparsers.add_parser("append_sjc", help="Appends SJC result as an annotation to PSI-based screening results and epitope prediction results in a specified screening output folder.") + optional_args = arg_append_sjc._action_groups.pop() + required_args= arg_append_sjc.add_argument_group('required arguments') + required_args.add_argument('--sjc-summary',help='Full path to the \"summary\" file from the SJC screening output',required=True) + required_args.add_argument('--splicing-event-type', default='SE', choices=['SE','RI','A3SS','A5SS'],help='String of splicing event types based on rMATS definition (SE,RI,A3SS,A5SS). Used to name output file. (Default is SE event)') + required_args.add_argument('-o', '--outdir', help='Directory of IRIS screening results', required= True) + optional_args.add_argument('-i', '--add-ijc-info', action='store_true', default=False, help='Add inclusion junction-related annotation to the PSI-based screening results and epitope prediction results. This can be slow when \'--screening-result-event-list\' is large. (Default is False)') + optional_args.add_argument('-u','--use-existing-result', default=False, action= "store_true", help='Skip retrieving and use existing ijc result (Default is False)') + optional_args.add_argument('-p','--parameter-file', help='Parameter file. This is required when \'--add-ijc-info\' is enabled', default='') + optional_args.add_argument('-e','--screening-result-event-list', help='A list of AS events of interest in the same format as the \'as_event\' column in the IRIS screen output. This is required when \'--add-ijc-info\' is enabled', default='') + optional_args.add_argument('--inc-read-cov-cutoff', default=2, type=int, help='Minimum read coverage for the two inclusion junctions combined to be considered as expressing. This is a parameter for annotate_ijc (Default is 2)') + optional_args.add_argument('--event-read-cov-cutoff', default=10, type=int, help='Minimum read coverage for an event to be considered in the analysis. This is a parameter for annotate_ijc (Default is 10)') + arg_append_sjc._action_groups.append(optional_args) + return + +def add_annotate_ijc_parser( subparsers ): + arg_annotate_ijc = subparsers.add_parser("annotate_ijc", help="Annotates inclusion junction count info to PSI-based screening results or epitope prediction results in a specified screening output folder.
Can be called from append_sjc to save time.") + optional_args = arg_annotate_ijc._action_groups.pop() + required_args= arg_annotate_ijc.add_argument_group('required arguments') + required_args.add_argument('-p','--parameter-file', help='Parameter file containing SJ db directory, selected data sets, etc.', required=True) + required_args.add_argument('--splicing-event-type', default='SE', choices=['SE','RI','A3SS','A5SS'],help='String of splicing event types based on rMATS definition (SE,RI,A3SS,A5SS). Used to name output file. (Default is SE event)') + required_args.add_argument('-e','--screening-result-event-list', help='A list of AS events of interest in the same format as the \'as_event\' column in the IRIS screen output', required=True) + optional_args.add_argument('--inc-read-cov-cutoff', default=2, type=int, help='Minimum read coverage for the two inclusion junctions combined to be considered as expressing (Default is 2)') + optional_args.add_argument('--event-read-cov-cutoff', default=10, type=int, help='Minimum read coverage for an event to be considered in the analysis (Default is 10)') + required_args.add_argument('-o', '--outdir', help='Directory of IRIS screening results', required= True) + arg_annotate_ijc._action_groups.append(optional_args) + return + +def add_screening_cpm_parser( subparsers ): + arg_screening_cpm = subparsers.add_parser("screen_cpm", help="Screens AS-derived tumor antigens by comparing splice junction CPM using big-data reference of SJ counts") + optional_args = arg_screening_cpm._action_groups.pop() + required_args= arg_screening_cpm.add_argument_group('required arguments') + required_args.add_argument('-p','--parameter-file', help='Parameter file containing SJ db directory, selected data sets, etc.', required=True) + required_args.add_argument('--splicing-event-type', default='SE', choices=['SE','RI','A3SS','A5SS'],help='String of splicing event types based on rMATS definition (SE,RI,A3SS,A5SS). Used to name output file. (Default is SE event)') + required_args.add_argument('-e','--event-list-file', help='AS event list in the format of PSI value matrices (see the output format of IRIS format module or IRIS_db PSI matrices)') + required_args.add_argument('-o', '--outdir', help='Directory of IRIS screening results', required= True) + optional_args.add_argument('--use-existing-test-result', default=False, action= "store_true", help='Skip testing and use existing testing result (Default is run full testing steps)') + arg_screening_cpm._action_groups.append(optional_args) + return + +def add_append_cpm_parser( subparsers ): + arg_append_cpm = subparsers.add_parser("append_cpm", help="Appends CPM result as an annotation to PSI-based screening results and epitope prediction results in a specified screening output folder.") + optional_args = arg_append_cpm._action_groups.pop() + required_args= arg_append_cpm.add_argument_group('required arguments') + required_args.add_argument('--cpm-summary',help='Full path to the \"summary\" file from the CPM screening output',required=True) + required_args.add_argument('--splicing-event-type', default='SE', choices=['SE','RI','A3SS','A5SS'],help='String of splicing event types based on rMATS definition (SE,RI,A3SS,A5SS). Used to name output file.
(Default is SE event)') + required_args.add_argument('-o', '--outdir', help='Directory of IRIS screening results', required= True) + arg_append_cpm._action_groups.append(optional_args) + return + +def add_screening_novelss_parser( subparsers ): + arg_screening_novelss = subparsers.add_parser("screen_novelss", help="Screens AS-derived tumor antigens for unannotated events using big-data reference of SJ counts") + optional_args = arg_screening_novelss._action_groups.pop() + required_args= arg_screening_novelss.add_argument_group('required arguments') + required_args.add_argument('-p','--parameter-fin', help='Parameter file containing SJ db directory, selected data sets, etc.', required=True) + required_args.add_argument('--splicing-event-type', default='SE', choices=['SE','RI','A3SS','A5SS'],help='String of splicing event types based on rMATS definition (SE,RI,A3SS,A5SS). Used to name output file. (Default is SE event)') + required_args.add_argument('-e','--event-list-fin', help='AS event list in the format of PSI value matrices (modified rMATS format)') + required_args.add_argument('-o', '--outdir', help='Directory of IRIS screening results', required= True) + optional_args.add_argument('--gtf', help='Path to the genome annotation GTF file. Required input when checking novelSS for tumor junctions and for the IRIS translate option.') + optional_args.add_argument('-d','--deltaPSI-cut-off', default=0, help='Defines cutoff of deltaPSI (or other metric) used to select tumor-enriched splice form (Default is 0)') + optional_args.add_argument('--use-existing-test-result', default=False, action= "store_true", help='Skip testing and use existing testing result (Default is run full testing steps)') + optional_args.add_argument('-t', '--translating', action= "store_true", help='Translates IRIS-screened tumor splice junctions into peptides') + optional_args.add_argument('--report-known-and-novelss-tumor-junction', default=False, action= "store_true", help='Report both known and novel splice site-derived tumor junctions. Although the input consists of rMATS AS events detected with novel splice sites, not every junction in such an event contains a novel splice site. By default, an event is reported only when its tumor-form junction is derived from novel splice site(s). (Default is False)') + optional_args.add_argument('--all-orf', default=False, action= "store_true", help='Perform the 3 ORF translation. An ORF known in the UniProtKB will be labeled as uniprotFrame in the bed file (Default is to use the known ORF ONLY)') + optional_args.add_argument('--ignore-annotation', default=False, action= "store_true", help='Perform 3 ORF translation without annotating known ORF from the UniProtKB (Default is disabled)') + optional_args.add_argument('--remove-early-stop', default=False, action= "store_true", help='Discard the peptide if it contains an early stop codon (Default is to keep the truncated peptide)') + arg_screening_novelss._action_groups.append(optional_args) + return + +def add_screening_sjcplot_parser( subparsers ): + arg_screening_sjcplot = subparsers.add_parser("screen_sjc_plot",help='Makes stacked/individual barplots of percentage of samples expressing a splice junction for list of AS events') + optional_args = arg_screening_sjcplot._action_groups.pop() + required_args = arg_screening_sjcplot.add_argument_group('required arguments') + required_args.add_argument('event_list', help='Inputs a list of AS events and directions for visualization.
IRIS screening result format is preferred (Default deltaPSI column and cutoff are based on IRIS screening format)') + required_args.add_argument('-j','--jc-full-result', help='File containing information about the percentage of samples expressing a SJ from the output of IRIS SJC screening.') + required_args.add_argument('-p','--parameter-fin', help="Parameter file used in 'IRIS screen_sjc' (using SJ db)", required=True) + required_args.add_argument('-t', '--splicing-event-type', default='SE', choices=['SE','RI','A3SS','A5SS'],help='String of splicing event types based on rMATS definition (SE,RI,A3SS,A5SS). Used to name output file (Default is SE event)') + required_args.add_argument('--step','-s', default=10, help='Number of events in each plot (Default is 10)') + required_args.add_argument('-o', '--outdir', help='Define the output directory of the plot.', required=True) + optional_args.add_argument('-c','--deltaPSI-column', default=5, help='Column of deltaPSI value in matrix, 1-based (Default is 5th column)') + optional_args.add_argument('-d','--deltaPSI-cut-off', default=0, help='Defines cutoff of deltaPSI (or other metric) to select tumor-enriched splice form (Default is 0)') + optional_args.add_argument('--header', action="store_true", default=False, help='Skipping the header line of the input (Default is False)') + arg_screening_sjcplot._action_groups.append(optional_args) + return + def add_ms_makedb_parser(subparsers): arg_ms_makedb = subparsers.add_parser("ms_makedb",help='Generates proteo-transcriptomic database for MS search') optional_args = arg_ms_makedb._action_groups.pop() required_args = arg_ms_makedb.add_argument_group('required arguments') + required_args.add_argument('--java-path', help='The path of Java.') + required_args.add_argument('--MSGF-path', help='The path of MSGF+.') required_args.add_argument('-o', '--outdir', help='The path to IRIS translation output directory.', required=True) required_args.add_argument('--uniprot-fasta',help='Specify the path of the UniProt proteome FASTA file.', required= True) required_args.add_argument('--exp-fin-list',help='Specify a file containing paths of gene expression files (by rows) that should be considered to form the proteogenomic db.', required= True) @@ -301,11 +580,23 @@ def add_ms_parse_parser(subparsers): required_args.add_argument('-o', '--outdir', help='Specify a directory to output parsed MS result.', required=True) optional_args.add_argument('--dump-all', action= "store_true", help='') arg_ms_parse._action_groups.append(optional_args) - return + return + +def add_visual_summary_parser(subparsers): + arg_visual_summary = subparsers.add_parser("visual_summary",help='Makes a graphic summary of IRIS results') + optional_args = arg_visual_summary._action_groups.pop() + required_args = arg_visual_summary.add_argument_group('required arguments') + required_args.add_argument('-p','--parameter-fin', help="Parameter file used in 'IRIS screen'", required=True) + required_args.add_argument('-s', '--screening-out-dir', help='The directory where IRIS screening output was written', required=True) + required_args.add_argument('-o', '--out-file-name', help='The .png file name to write', required=True) + required_args.add_argument('-t', '--splicing-event-type', default='SE', choices=['SE','RI','A3SS','A5SS'], help='String of splicing event types based on rMATS definition (SE,RI,A3SS,A5SS). Used to name output file (Default is SE event)') + optional_args.add_argument('--no-prediction', action='store_true', required=False) + 
arg_visual_summary._action_groups.append(optional_args) + return if __name__ == '__main__': try: main() except KeyboardInterrupt: sys.stderr.write("[INFO] User interrupted; program terminated.") - sys.exit(0) \ No newline at end of file + sys.exit(0) diff --git a/conda.sh b/conda.sh deleted file mode 100644 index 1bae2fa..0000000 --- a/conda.sh +++ /dev/null @@ -1,57 +0,0 @@ -#!/bin/bash -# -# Provide functions for working with conda -# Usage: -# * `source conda.sh` -# * `conda::create_env_with_name_and_python_version {env_name} {python_version}` -# + example: `conda::create_env_with_name_and_python_version my-conda-env 3.6` -# * `conda::activate_env {env_name}` -# + example: `conda::activate_env my-conda-env` -# * `conda::deactivate_env` -# -# Assumes that conda is installed -# https://docs.conda.io/en/latest/miniconda.html -# https://repo.anaconda.com/miniconda/Miniconda2-latest-Linux-x86_64.sh - -function conda::create_env_with_name_and_python_version() { - local ERROR_PREFIX="error in conda::create_env_with_name_and_python_version()" - - local ENV_NAME="$1" - local PYTHON_VERSION="$2" - - local FOUND_COUNT=$(conda info --envs | grep "^${ENV_NAME} .*/${ENV_NAME}$" | wc -l) - if [[ "$?" -ne 0 ]]; then - echo "${ERROR_PREFIX}: checking conda envs" >&2 - return 1 - fi - - if [[ "${FOUND_COUNT}" -eq 1 ]]; then - echo "using existing ${ENV_NAME} conda environment" - return 0 - fi - - echo "creating new conda environment: ${ENV_NAME} python=${PYTHON_VERSION}" - conda create --name "${ENV_NAME}" python="${PYTHON_VERSION}" - if [[ "$?" -ne 0 ]]; then - echo "${ERROR_PREFIX}: creating env" >&2 - return 1 - fi -} -export -f conda::create_env_with_name_and_python_version - -function conda::activate_env() { - conda activate "$1" -} -export -f conda::activate_env - -function conda::deactivate_env() { - conda deactivate -} -export -f conda::deactivate_env - -function main() { - # need to use the setup that conda init writes to .bashrc - source ${HOME}/.bashrc || return 1 -} - -main "$@" diff --git a/conda_requirements_py2.txt b/conda_requirements_py2.txt new file mode 100644 index 0000000..5d650c9 --- /dev/null +++ b/conda_requirements_py2.txt @@ -0,0 +1,7 @@ +bedtools=2.29.0 +numpy=1.16.5 +pybigwig=0.3.13 +python=2.7.* +scipy=1.2.0 +seaborn=0.9.0 +statsmodels=0.10.2 diff --git a/conda_requirements_py2_optional.txt b/conda_requirements_py2_optional.txt new file mode 100644 index 0000000..e9afb14 --- /dev/null +++ b/conda_requirements_py2_optional.txt @@ -0,0 +1,10 @@ +cufflinks=2.2.1 +# openssl 1.0 needs to be explicitly installed because pysam depends on it, +# but openssl 1.1 would get installed otherwise +openssl=1.0.2u +pysam=0.14.1 +r-base=3.2.2 +rmats=4.1.2 +samtools=1.3 +seq2hla=2.2 +star=2.5.3a diff --git a/conda_requirements_py3.txt b/conda_requirements_py3.txt new file mode 100644 index 0000000..eab4d43 --- /dev/null +++ b/conda_requirements_py3.txt @@ -0,0 +1,5 @@ +beautifulsoup4=4.11.* +google-api-python-client=2.* +python=3.9.* +snakemake=7.17.1 +tqdm=4.64.* diff --git a/conda_wrapper b/conda_wrapper new file mode 100755 index 0000000..e64f439 --- /dev/null +++ b/conda_wrapper @@ -0,0 +1,30 @@ +#!/bin/bash +# conda_wrapper activates the conda environment and then +# executes its arguments in that environment. 
+# +function set_script_dir() { + local ORIG_DIR="$(pwd)" || return 1 + + local REL_SCRIPT_DIR="$(dirname ${BASH_SOURCE[0]})" || return 1 + cd "${REL_SCRIPT_DIR}" || return 1 + SCRIPT_DIR="$(pwd)" || return 1 + cd "${ORIG_DIR}" || return 1 +} + +function main() { + local CONDA_ENV_PREFIX="$1" + shift || return 1 + + set_script_dir || return 1 + source "${SCRIPT_DIR}/set_env_vars.sh" || return 1 + + conda activate "${CONDA_ENV_PREFIX}" || return 1 + + "$@" + local RETURN_VALUE="$?" + + conda deactivate || return 1 + return "${RETURN_VALUE}" +} + +main "$@" diff --git a/docs/iris_diagram.png b/docs/iris_diagram.png new file mode 100644 index 0000000..8148bf8 Binary files /dev/null and b/docs/iris_diagram.png differ diff --git a/example/HLA_types/hla_exp.list b/example/HLA_types/hla_exp.list deleted file mode 100644 index a06dbc4..0000000 --- a/example/HLA_types/hla_exp.list +++ /dev/null @@ -1,22 +0,0 @@ -1142 A: 31.53 RPKM C: 33.37 RPKM B: 30.34 RPKM -2964 A: 88.71 RPKM C: 120.84 RPKM B: 58.44 RPKM -2907 A: 41.98 RPKM C: 31.27 RPKM B: 28.21 RPKM -1325 A: 29.97 RPKM C: 21.13 RPKM B: 44.53 RPKM -2924 A: 49.77 RPKM C: 36.86 RPKM B: 39.57 RPKM -2870 A: 87.05 RPKM C: 94.23 RPKM B: 57.69 RPKM -LB3336 A: 14.37 RPKM C: 15.28 RPKM B: 13.24 RPKM -2867 A: 112.63 RPKM C: 76.33 RPKM B: 126.88 RPKM -1989 A: 71.62 RPKM C: 46.55 RPKM B: 78.67 RPKM -2158 A: 174.12 RPKM C: 163.67 RPKM B: 184.92 RPKM -LB3367 A: 201.63 RPKM C: 131.86 RPKM B: 157.43 RPKM -LB3054 A: 63.74 RPKM C: 35.71 RPKM B: 60.7 RPKM -LB3001 A: 50.7 RPKM C: 44.78 RPKM B: 44.23 RPKM -3404 A: 77.16 RPKM C: 47.01 RPKM B: 29.01 RPKM -LB3120 A: 78.84 RPKM C: 19.98 RPKM B: 25.05 RPKM -2899 A: 11.02 RPKM C: 10.53 RPKM B: 7.77 RPKM -3374 A: 23.12 RPKM C: 20.09 RPKM B: 21.59 RPKM -3372 A: 37.2 RPKM C: 34.45 RPKM B: 42.28 RPKM -3244 A: 93.22 RPKM C: 45.49 RPKM B: 46.79 RPKM -2675 A: 97.63 RPKM C: 84.07 RPKM B: 98.47 RPKM -LB2938 A: 454.31 RPKM C: 229.66 RPKM B: 280.1 RPKM -803 A: 22.04 RPKM C: 10.46 RPKM B: 15.62 RPKM diff --git a/example/HLA_types/hla_patient.tsv b/example/HLA_types/hla_patient.tsv deleted file mode 100644 index d6419f7..0000000 --- a/example/HLA_types/hla_patient.tsv +++ /dev/null @@ -1,22 +0,0 @@ -1142 HLA-A*23:01 HLA-A*02:01 HLA-B*13:02 HLA-B*44:02 HLA-C*06:02 -2964 HLA-A*03:01 HLA-A*74:03 HLA-B*37:01 HLA-B*35:08 HLA-C*04:01 HLA-C*04:01 -2907 HLA-A*02:05 HLA-A*03:01 HLA-B*50:01 HLA-B*35:03 -1325 HLA-A*03:02 HLA-A*24:02 HLA-B*55:01 HLA-B*35:02 HLA-C*03:03 -2924 HLA-A*23:01 HLA-B*50:01 HLA-C*06:02 HLA-C*14:02 -2870 HLA-A*01:01 HLA-B*38:01 HLA-B*57:01 HLA-C*06:02 -LB3336 HLA-A*11:01 HLA-B*13:02 HLA-B*48:01 HLA-C*06:02 HLA-C*08:01 -2867 HLA-A*03:01 HLA-B*07:02 HLA-B*46:01 HLA-C*08:01 HLA-C*07:02 -1989 HLA-A*01:01 HLA-A*32:03 HLA-B*40:01 HLA-B*07:02 HLA-C*03:04 HLA-C*07:02 -2158 HLA-A*26:01 HLA-A*01:01 HLA-B*38:01 HLA-B*35:02 HLA-C*12:03 HLA-C*04:01 -LB3367 HLA-A*24:02 HLA-B*13:01 HLA-C*07:02 -LB3054 HLA-A*01:01 HLA-A*24:02 HLA-B*14:02 HLA-B*55:01 -LB3001 HLA-A*30:02 HLA-A*02:01 HLA-B*44:03 HLA-B*18:01 HLA-C*04:01 -3404 HLA-A*68:02 HLA-B*57:01 HLA-B*53:01 HLA-C*04:01 HLA-C*06:02 -LB3120 HLA-A*02:01 HLA-B*18:01 HLA-B*15:01 HLA-C*12:03 HLA-C*03:04 -2899 HLA-A*01:01 HLA-A*02:01 HLA-B*57:01 HLA-B*15:01 HLA-C*06:02 HLA-C*03:04 -3374 HLA-A*01:01 HLA-A*02:01 HLA-B*08:01 HLA-B*35:02 HLA-C*04:01 HLA-C*07:01 -3372 HLA-A*32:01 HLA-B*44:02 HLA-C*14:02 -3244 HLA-B*38:01 HLA-B*51:01 HLA-C*15:02 HLA-C*12:03 -2675 HLA-A*30:02 HLA-A*29:01 HLA-B*07:05 HLA-B*53:01 HLA-C*15:05 HLA-C*04:01 -LB2938 HLA-A*03:01 HLA-A*03:01 HLA-B*07:02 HLA-B*35:03 HLA-C*04:01 
HLA-C*07:02 -803 HLA-A*30:02 HLA-A*03:01 HLA-B*18:01 HLA-B*40:01 HLA-C*03:04 diff --git a/example/NEPC_test.para b/example/NEPC_test.para new file mode 100644 index 0000000..88c561d --- /dev/null +++ b/example/NEPC_test.para @@ -0,0 +1,10 @@ +NEPC_test +IRIS_data.v2.0.0/db/ + + +0.01,0.05,1,0.000001,8 GTEx_Heart,GTEx_Blood,GTEx_Lung,GTEx_Liver,GTEx_Brain,GTEx_Nerve,GTEx_Muscle,GTEx_Spleen,GTEx_Thyroid,GTEx_Skin,GTEx_Kidney +group parametric +False + +IRIS_data.v2.0.0/resources/mappability/wgEncodeCrgMapabilityAlign24mer.bigWig +IRIS_data.v2.0.0/resources/reference/ucsc.hg19.fasta diff --git a/example/SJ_matrices.tar.gz b/example/SJ_matrices.tar.gz deleted file mode 100644 index 45bd6b5..0000000 Binary files a/example/SJ_matrices.tar.gz and /dev/null differ diff --git a/example/SJ_matrices/RL_100.matrix/JC.raw.input.SE.txt b/example/SJ_matrices/RL_100.matrix/JC.raw.input.SE.txt deleted file mode 100644 index c008df7..0000000 --- a/example/SJ_matrices/RL_100.matrix/JC.raw.input.SE.txt +++ /dev/null @@ -1,13 +0,0 @@ -ID IJC_SAMPLE_1 SJC_SAMPLE_1 IJC_SAMPLE_2 SJC_SAMPLE_2 IncFormLen SkipFormLen -15225 20,22,22,32,27,50 42,24,44,39,47,52 187 99 -34517 54,84,63,84,88,57 1,11,1,2,0,0 198 99 -51809 157,56,239,139,132,105 4,4,2,6,1,6 188 99 -58989 205,202,213,230,423,75 13,11,19,13,9,17 198 99 -64939 36,51,21,51,73,8 1,2,3,0,16,0 183 99 -64969 128,48,109,60,37,139 19,5,22,18,6,29 131 99 -68704 65,31,98,48,37,68 14,5,10,18,8,10 198 99 -75710 27,60,84,42,46,40 0,3,1,2,1,3 198 99 -76672 21,16,43,11,27,34 42,25,71,28,16,59 156 99 -77359 2,1,0,1,0,4 90,68,71,70,153,47 198 99 -86641 189,623,470,565,684,386 110,143,153,249,428,189 146 99 -91362 154,44,114,79,45,135 27,11,17,12,38,18 198 99 diff --git a/example/SJ_matrices/RL_100.matrix/fromGTF.SE.txt b/example/SJ_matrices/RL_100.matrix/fromGTF.SE.txt deleted file mode 100644 index 5f19a0b..0000000 --- a/example/SJ_matrices/RL_100.matrix/fromGTF.SE.txt +++ /dev/null @@ -1,13 +0,0 @@ -ID GeneID geneSymbol chr strand exonStart_0base exonEnd upstreamES upstreamEE downstreamES downstreamEE -34517 "ENSG00000137842.6_3" "TMEM62" chr15 + 43470804 43470909 43461790 43461875 43473378 43473497 -51809 "ENSG00000163681.14_3" "SLMAP" chr3 + 57911571 57911661 57908615 57908750 57913022 57913213 -64939 "ENSG00000090554.12_3" "FLT3LG" chr19 + 49982219 49982304 49979679 49979823 49983554 49983733 -64969 "ENSG00000116001.15_3" "TIA1" chr2 - 70456190 70456223 70454866 70454954 70456395 70456450 -68704 "ENSG00000184381.18_3" "PLA2G6" chr22 - 38524275 38524437 38522377 38522456 38525460 38525569 -75710 "ENSG00000154370.15_3" "TRIM11" chr1 - 228588664 228588895 228584843 228584866 228589766 228589862 -76672 "ENSG00000077458.12_2" "FAM76B" chr11 - 95512241 95512299 95511985 95512121 95512770 95512851 -77359 "ENSG00000089159.16_3" "PXN" chr12 - 120657009 120657894 120652905 120653076 120659425 120659561 -86641 "ENSG00000125970.11_3" "RALY" chr20 + 32661624 32661672 32661368 32661441 32663679 32663845 -91362 "ENSG00000117625.13_3" "RCOR3" chr1 + 211486061 211486303 211474802 211477482 211486765 211487181 -15225 "ENSG00000137814.10_2" "HAUS2" chr15 + 42852979 42853068 42851536 42851606 42853467 42853497 -58989 "ENSG00000169919.16_2" "GUSB" chr7 - 65444713 65444898 65444385 65444528 65445210 65445396 diff --git a/example/SJ_matrices/RL_100_rmatspost_list.txt b/example/SJ_matrices/RL_100_rmatspost_list.txt deleted file mode 100644 index 0a23d56..0000000 --- a/example/SJ_matrices/RL_100_rmatspost_list.txt +++ /dev/null @@ -1 +0,0 @@ 
-BAMs/RL_100/1325.aln/Aligned.sortedByCoord.out.bam,BAMs/RL_100/2158.aln/Aligned.sortedByCoord.out.bam,BAMs/RL_100/2675.aln/Aligned.sortedByCoord.out.bam,BAMs/RL_100/2870.aln/Aligned.sortedByCoord.out.bam,BAMs/RL_100/2867.aln/Aligned.sortedByCoord.out.bam,BAMs/RL_100/803.aln/Aligned.sortedByCoord.out.bam diff --git a/example/SJ_matrices/RL_150.matrix/JC.raw.input.SE.txt b/example/SJ_matrices/RL_150.matrix/JC.raw.input.SE.txt deleted file mode 100644 index fe3c47b..0000000 --- a/example/SJ_matrices/RL_150.matrix/JC.raw.input.SE.txt +++ /dev/null @@ -1,13 +0,0 @@ -ID IJC_SAMPLE_1 SJC_SAMPLE_1 IJC_SAMPLE_2 SJC_SAMPLE_2 IncFormLen SkipFormLen -8580 243,637,161,347,833,665,1184,587,248,1326,995,988,339,355,2083,361 31,73,17,34,70,59,85,91,63,213,55,129,59,8,78,55 298 149 -13714 293,330,119,259,308,320,379,262,198,68,271,244,28,110,529,207 135,3,1,13,5,13,5,8,3,9,2,0,5,12,12,2 238 149 -43344 34,64,93,88,159,64,73,66,47,162,54,132,209,92,81,42 12,11,10,18,5,17,5,10,7,45,13,16,12,18,12,11 233 149 -48104 61,65,15,117,82,130,117,75,115,168,125,82,10,36,182,133 108,87,28,179,234,131,198,101,71,160,97,91,6,38,153,126 237 149 -66005 358,133,63,205,158,159,251,185,397,318,225,229,44,118,390,379 66,40,6,42,64,90,52,39,23,24,50,15,8,22,51,52 298 149 -97372 42,98,18,48,111,43,83,53,39,76,60,53,17,17,65,30 58,70,17,87,116,167,145,119,280,89,100,77,30,25,118,89 206 149 -119182 202,49,20,123,61,271,85,98,341,23,276,134,25,210,233,290 160,19,3,39,10,42,24,10,33,2,37,33,0,14,15,48 298 149 -120773 236,126,48,158,115,155,167,161,277,317,176,260,82,83,185,315 5,5,1,4,6,10,9,16,33,16,10,13,3,5,3,30 298 149 -131604 2,0,0,3,0,4,4,0,0,0,1,0,0,0,2,0 152,435,67,216,463,453,580,177,98,19,648,74,9,43,1609,219 298 149 -194597 139,100,37,300,232,350,230,198,478,170,167,273,41,61,339,588 60,21,7,10,33,80,16,24,113,77,43,39,9,5,72,94 181 149 -195109 1104,1173,527,1369,1902,1401,1194,1128,1848,4572,1535,1524,410,1767,1576,1852 429,561,209,303,560,760,543,543,1427,1191,602,790,158,603,732,1259 196 149 -224828 225,306,37,75,275,158,246,150,207,135,95,223,163,103,352,184 29,3,2,12,8,38,6,7,26,9,2,6,3,1,8,14 253 149 diff --git a/example/SJ_matrices/RL_150.matrix/fromGTF.SE.txt b/example/SJ_matrices/RL_150.matrix/fromGTF.SE.txt deleted file mode 100644 index 3577123..0000000 --- a/example/SJ_matrices/RL_150.matrix/fromGTF.SE.txt +++ /dev/null @@ -1,13 +0,0 @@ -ID GeneID geneSymbol chr strand exonStart_0base exonEnd upstreamES upstreamEE downstreamES downstreamEE -8580 "ENSG00000169919.16_2" "GUSB" chr7 - 65444713 65444898 65444385 65444528 65445210 65445396 -13714 "ENSG00000163681.14_3" "SLMAP" chr3 + 57911571 57911661 57908615 57908750 57913022 57913213 -43344 "ENSG00000090554.12_3" "FLT3LG" chr19 + 49982219 49982304 49979679 49979823 49983554 49983733 -48104 "ENSG00000137814.10_2" "HAUS2" chr15 + 42852979 42853068 42851536 42851606 42853467 42853497 -66005 "ENSG00000117625.13_3" "RCOR3" chr1 + 211486061 211486303 211474802 211477482 211486765 211487181 -97372 "ENSG00000077458.12_2" "FAM76B" chr11 - 95512241 95512299 95511985 95512121 95512770 95512851 -119182 "ENSG00000184381.18_3" "PLA2G6" chr22 - 38524275 38524437 38522377 38522456 38525460 38525569 -120773 "ENSG00000154370.15_3" "TRIM11" chr1 - 228588664 228588895 228584843 228584866 228589766 228589862 -131604 "ENSG00000089159.16_3" "PXN" chr12 - 120657009 120657894 120652905 120653076 120659425 120659561 -194597 "ENSG00000116001.15_3" "TIA1" chr2 - 70456190 70456223 70454866 70454954 70456395 70456450 -195109 "ENSG00000125970.11_3" "RALY" chr20 + 32661624 32661672 32661368 
32661441 32663679 32663845 -224828 "ENSG00000137842.6_3" "TMEM62" chr15 + 43470804 43470909 43461790 43461875 43473378 43473497 diff --git a/example/SJ_matrices/RL_150_rmatspost_list.txt b/example/SJ_matrices/RL_150_rmatspost_list.txt deleted file mode 100644 index 4157290..0000000 --- a/example/SJ_matrices/RL_150_rmatspost_list.txt +++ /dev/null @@ -1 +0,0 @@ -BAMs/RL_150/1142.aln/Aligned.sortedByCoord.out.bam,BAMs/RL_150/1989.aln/Aligned.sortedByCoord.out.bam,BAMs/RL_150/2899.aln/Aligned.sortedByCoord.out.bam,BAMs/RL_150/2907.aln/Aligned.sortedByCoord.out.bam,BAMs/RL_150/LB2924.aln/Aligned.sortedByCoord.out.bam,BAMs/RL_150/LB2938.aln/Aligned.sortedByCoord.out.bam,BAMs/RL_150/LB2964.aln/Aligned.sortedByCoord.out.bam,BAMs/RL_150/LB3001.aln/Aligned.sortedByCoord.out.bam,BAMs/RL_150/LB3054.aln/Aligned.sortedByCoord.out.bam,BAMs/RL_150/LB3336.aln/Aligned.sortedByCoord.out.bam,BAMs/RL_150/LB3244.aln/Aligned.sortedByCoord.out.bam,BAMs/RL_150/LB3404.aln/Aligned.sortedByCoord.out.bam,BAMs/RL_150/LB3374.aln/Aligned.sortedByCoord.out.bam,BAMs/RL_150/LB3372.aln/Aligned.sortedByCoord.out.bam,BAMs/RL_150/LB3367.aln/Aligned.sortedByCoord.out.bam,BAMs/RL_150/LB3120.aln/Aligned.sortedByCoord.out.bam diff --git a/example/SJ_matrices/matrices.txt b/example/SJ_matrices/matrices.txt deleted file mode 100644 index 18c40ba..0000000 --- a/example/SJ_matrices/matrices.txt +++ /dev/null @@ -1,2 +0,0 @@ -RL_100.matrix -RL_150.matrix diff --git a/example/SJ_matrices/samples.txt b/example/SJ_matrices/samples.txt deleted file mode 100644 index 35fcbb1..0000000 --- a/example/SJ_matrices/samples.txt +++ /dev/null @@ -1,2 +0,0 @@ -RL_100_rmatspost_list.txt -RL_150_rmatspost_list.txt diff --git a/example/Test.para b/example/Test.para deleted file mode 100644 index cca8f48..0000000 --- a/example/Test.para +++ /dev/null @@ -1,10 +0,0 @@ -Glioma_test -IRIS_data/db/ -filter1 0.01 0.05 1 1 GTEx_Brain -filter2 0.000001593371574 0.05 1 1 TCGA_GBM,TCGA_LGG -filter3 0.01 0.05 1 2 GTEx_Heart,GTEx_Skin,GTEx_Blood,GTEx_Lung,GTEx_Liver,GTEx_Nerve,GTEx_Muscle,GTEx_Spleen,GTEx_Thyroid,GTEx_Kidney,GTEx_Stomach -group -False - -IRIS_data/resources/mappability/wgEncodeCrgMapabilityAlign24mer.bigWig -IRIS_data/resources/reference/ucsc.hg19.fasta diff --git a/example/Test_simplified.para b/example/Test_simplified.para deleted file mode 100644 index db02cc4..0000000 --- a/example/Test_simplified.para +++ /dev/null @@ -1,6 +0,0 @@ -Glioma_test -filter1 0.01 0.05 1 1 GTEx_Brain -filter2 0.000001593371574 0.05 1 1 TCGA_GBM,TCGA_LGG -filter3 0.01 0.05 1 2 GTEx_Heart,GTEx_Skin,GTEx_Blood,GTEx_Lung,GTEx_Liver,GTEx_Nerve,GTEx_Muscle,GTEx_Spleen,GTEx_Thyroid,GTEx_Kidney,GTEx_Stomach -group -False diff --git a/example/exp_matrix_test.txt b/example/exp_matrix_test.txt new file mode 100644 index 0000000..fd818e3 --- /dev/null +++ b/example/exp_matrix_test.txt @@ -0,0 +1,11 @@ +geneName Sample1 Sample2 Sample3 Sample4 Sample5 Sample6 Sample7 Sample8 Sample9 Sample10 +ENSG00000008282.8_2 13.8136 30.6688 28.7282 26.4562 32.2042 3.24332 29.1389 35.7019 31.3576 13.7008 +ENSG00000008710.19_3 25.6837 200.984 77.3921 92.3767 116.713 15.8125 117.737 128.881 36.1041 7.51057 +ENSG00000090674.15_3 15.863 24.4872 9.84709 18.8131 12.9838 32.5762 3.32612 4.70605 12.2316 37.9599 +ENSG00000105939.12_3 5.60172 5.12532 6.07503 6.76975 11.3333 7.30481 6.73759 15.3272 14.5119 6.05959 +ENSG00000110367.11_2 16.4841 17.6602 6.62134 21.7654 17.4255 12.3803 30.7404 18.8044 36.8079 23.5613 +ENSG00000125814.17_2 8.50579 10.2384 7.186 5.78771 6.18153 3.52837 3.78008 3.41479 
9.14832 15.5857 +ENSG00000142192.20_3 176.655 556.907 251.233 347.058 312.269 94.2619 278.205 310.817 348.916 24.6239 +ENSG00000171603.16_2 36.7182 83.7313 6.87977 6.28606 120.779 51.4822 59.4516 111.927 91.1514 7.91309 +ENSG00000183773.15_3 0.846555 2.37515 0.302885 5.09117 5.33169 7.62877 0.69174 2.74661 0.834716 2.1072 +ENSG00000184220.11_3 35.252 32.0691 31.1302 6.09923 6.49715 7.98065 6.13434 6.71738 52.5053 11.1754 diff --git a/example/gene_exp_file_list.txt b/example/gene_exp_file_list.txt deleted file mode 100644 index e719ed4..0000000 --- a/example/gene_exp_file_list.txt +++ /dev/null @@ -1,22 +0,0 @@ -glioma/BAMs/RL_100/1325.aln/cufflinks/genes.fpkm_tracking -glioma/BAMs/RL_100/2158.aln/cufflinks/genes.fpkm_tracking -glioma/BAMs/RL_100/2675.aln/cufflinks/genes.fpkm_tracking -glioma/BAMs/RL_100/2867.aln/cufflinks/genes.fpkm_tracking -glioma/BAMs/RL_100/2870.aln/cufflinks/genes.fpkm_tracking -glioma/BAMs/RL_100/803.aln/cufflinks/genes.fpkm_tracking -glioma/BAMs/RL_150/1142.aln/cufflinks/genes.fpkm_tracking -glioma/BAMs/RL_150/1989.aln/cufflinks/genes.fpkm_tracking -glioma/BAMs/RL_150/2899.aln/cufflinks/genes.fpkm_tracking -glioma/BAMs/RL_150/2907.aln/cufflinks/genes.fpkm_tracking -glioma/BAMs/RL_150/LB2924.aln/cufflinks/genes.fpkm_tracking -glioma/BAMs/RL_150/LB2938.aln/cufflinks/genes.fpkm_tracking -glioma/BAMs/RL_150/LB2964.aln/cufflinks/genes.fpkm_tracking -glioma/BAMs/RL_150/LB3001.aln/cufflinks/genes.fpkm_tracking -glioma/BAMs/RL_150/LB3054.aln/cufflinks/genes.fpkm_tracking -glioma/BAMs/RL_150/LB3120.aln/cufflinks/genes.fpkm_tracking -glioma/BAMs/RL_150/LB3244.aln/cufflinks/genes.fpkm_tracking -glioma/BAMs/RL_150/LB3336.aln/cufflinks/genes.fpkm_tracking -glioma/BAMs/RL_150/LB3367.aln/cufflinks/genes.fpkm_tracking -glioma/BAMs/RL_150/LB3372.aln/cufflinks/genes.fpkm_tracking -glioma/BAMs/RL_150/LB3374.aln/cufflinks/genes.fpkm_tracking -glioma/BAMs/RL_150/LB3404.aln/cufflinks/genes.fpkm_tracking \ No newline at end of file diff --git a/example/hla_patient_test.tsv b/example/hla_patient_test.tsv new file mode 100644 index 0000000..e037797 --- /dev/null +++ b/example/hla_patient_test.tsv @@ -0,0 +1,10 @@ +Sample1 HLA-A*23:01 HLA-A*02:01 HLA-B*13:02 HLA-B*44:02 HLA-C*06:02 +Sample2 HLA-A*03:02 HLA-A*24:02 HLA-B*55:01 HLA-B*35:02 HLA-C*03:03 +Sample3 HLA-A*01:01 HLA-B*38:01 HLA-B*57:01 HLA-C*06:02 +Sample4 HLA-A*01:01 HLA-A*32:03 HLA-B*40:01 HLA-B*07:02 HLA-C*03:04 HLA-C*07:02 +Sample5 HLA-A*26:01 HLA-A*01:01 HLA-B*38:01 HLA-B*35:02 HLA-C*12:03 HLA-C*04:01 +Sample6 HLA-A*24:02 HLA-B*13:01 HLA-C*07:02 +Sample7 HLA-A*02:01 HLA-B*18:01 HLA-B*15:01 HLA-C*12:03 HLA-C*03:04 +Sample8 HLA-A*01:01 HLA-A*02:01 HLA-B*57:01 HLA-B*15:01 HLA-C*06:02 HLA-C*03:04 +Sample9 HLA-A*30:02 HLA-A*29:01 HLA-B*07:05 HLA-B*53:01 HLA-C*15:05 HLA-C*04:01 +Sample10 HLA-A*03:01 HLA-A*03:01 HLA-B*07:02 HLA-B*35:03 HLA-C*04:01 HLA-C*07:02 \ No newline at end of file diff --git a/example/HLA_types/hla_types.list b/example/hla_types_test.list similarity index 63% rename from example/HLA_types/hla_types.list rename to example/hla_types_test.list index 55a60ca..8095ed6 100644 --- a/example/HLA_types/hla_types.list +++ b/example/hla_types_test.list @@ -1,49 +1,31 @@ -HLA-A*01:01 -HLA-A*02:01 -HLA-A*02:05 -HLA-A*03:01 -HLA-A*03:02 -HLA-A*11:01 -HLA-A*23:01 -HLA-A*24:02 -HLA-A*26:01 -HLA-A*29:01 -HLA-A*30:02 -HLA-A*32:01 -HLA-A*32:03 -HLA-A*68:02 -HLA-A*74:03 -HLA-B*07:02 -HLA-B*07:05 -HLA-B*08:01 -HLA-B*13:01 -HLA-B*13:02 -HLA-B*14:02 -HLA-B*15:01 -HLA-B*18:01 -HLA-B*35:02 -HLA-B*35:03 -HLA-B*35:08 -HLA-B*37:01 
-HLA-B*38:01 -HLA-B*40:01 -HLA-B*44:02 -HLA-B*44:03 -HLA-B*46:01 -HLA-B*48:01 -HLA-B*50:01 -HLA-B*51:01 -HLA-B*53:01 -HLA-B*55:01 -HLA-B*57:01 -HLA-C*03:03 -HLA-C*03:04 -HLA-C*04:01 -HLA-C*06:02 -HLA-C*07:01 -HLA-C*07:02 -HLA-C*08:01 -HLA-C*12:03 -HLA-C*14:02 -HLA-C*15:02 HLA-C*15:05 +HLA-C*12:03 +HLA-C*07:02 +HLA-C*06:02 +HLA-C*04:01 +HLA-C*03:04 +HLA-C*03:03 +HLA-B*57:01 +HLA-B*55:01 +HLA-B*53:01 +HLA-B*44:02 +HLA-B*40:01 +HLA-B*38:01 +HLA-B*35:03 +HLA-B*35:02 +HLA-B*18:01 +HLA-B*15:01 +HLA-B*13:02 +HLA-B*13:01 +HLA-B*07:05 +HLA-B*07:02 +HLA-A*32:03 +HLA-A*30:02 +HLA-A*29:01 +HLA-A*26:01 +HLA-A*24:02 +HLA-A*23:01 +HLA-A*03:02 +HLA-A*03:01 +HLA-A*02:01 +HLA-A*01:01 diff --git a/example/parameter_file_description.txt b/example/parameter_file_description.txt index a2d79ce..e9a348c 100644 --- a/example/parameter_file_description.txt +++ b/example/parameter_file_description.txt @@ -1,34 +1,45 @@ ------------------------------------------------------------------------------------------------------------------------- -Parameter file format (take the test file for example): ------------------------------------------------------------------------------------------------------------------------- -Glioma_test -IRIS_data/db/ -filter1 0.01 0.05 1 1 GTEx_Brain -filter2 0.000001593371574 0.05 1 1 TCGA_GBM,TCGA_LGG -filter3 0.01 0.05 1 2 GTEx_Heart,GTEx_Skin,GTEx_Blood,GTEx_Lung,GTEx_Liver,GTEx_Nerve,GTEx_Muscle,GTEx_Spleen,GTEx_Thyroid,GTEx_Kidney,GTEx_Stomach -group -False - -IRIS_data/resources/mappability/wgEncodeCrgMapabilityAlign24mer.bigWig -IRIS_data/resources/reference/ucsc.hg19.fasta ------------------------------------------------------------------------------------------------------------------------- -Row1: Input group's name in the IRIS db (see 'formatting' step) -Row2: Directory of IRIS db -Row3: Parameters for 'Tissue-matched normal panel'; screening tumor-associated events. -Fields are separated by ' ': -filter_name p-value_cutoff deltaPSI_cutoff FC_cutoff filter1_group_cutoff filter1_reference_list - # filter_name: Row name (No space; Required) - # p-value_cutoff: Cutoff of p-value for statistical tests being used (Optional) - # deltaPSI_cutoff: Difference of PSI values between input sample and normal control should be larger than this threshold, which ensures the effect size of splicing change (Optional) - # FC_cutoff: Fold Change cutoff of PSI value of input sample compared to normal control (Optional) - # filter1_group_cutoff: Minimum number of tumor/tissue reference panels satisfying above requirements (Optional) - # filter1_reference_list: A list of selected reference panels (separated by ',';Optional) - -Row4: Parameters for 'Tumor panel'; screening for tumor-recurrent events (See Row3) -Row5: Parameters for other 'Normal panel'; screening tumor-specific events (See Row3) -Row3-5: As described in Row3, all fields are optional (except Row name). At least one row has to have values to perform screening. Note that 'Tumor panel' along will not function if 'Tissue-matched normal panel' is missing. 
-Row6: Comparison mode, 'group' mode (number of input samples >=2) and 'individual mode' (number of input sample =1) are provided -Row7: Use ratio or not for group_cutoff (described in Row3) -Row8: Blacklist file, which can remove the AS events that are error-prone due to mappability or sequencing limitations (See methods for detail) -Row9: Mappability file, which can be used to evaluate the AS events ------------------------------------------------------------------------------------------------------------------------- +------------------------------------------------------------------------------------------------------------------------ +Screening parameter file format (take a screening task for 'Glioma_test' as an example): +------------------------------------------------------------------------------------------------------------------------ +Glioma_test +IRIS_data/db/ +0.01,0.05,1,0.000001,1 GTEx_Brain +0.000001,0.05,1,0.000001,1 TCGA_GBM,TCGA_LGG +0.01,0.05,1,0.000001,5 GTEx_Heart,GTEx_Skin,GTEx_Blood,GTEx_Lung,GTEx_Liver,GTEx_Nerve,GTEx_Muscle,GTEx_Spleen,GTEx_Thyroid,GTEx_Kidney,GTEx_Stomach +group parametric +False + +IRIS_data/resources/mappability/wgEncodeCrgMapabilityAlign24mer.bigWig +IRIS_data/resources/reference/ucsc.hg19.fasta +------------------------------------------------------------------------------------------------------------------------ +Row1 (required): Input group name. This should be identical to the folder name in the IRIS db (see 'format' step) + +Row2 (required): Path to the directory of IRIS db + +Row3 (see below): Parameters for the tier 1 screening (i.e. 'tissue-matched normal reference') +Fields are separated by ' ': tier1_cutoffs and tier1_reference_list +Within each field, parameters are separated by ',': +- tier1_cutoffs: PSI_p-value_cutoff,deltaPSI_cutoff,FC_cutoff,SJC_p-value_cutoff,tier1_group_cutoff + # PSI_p-value_cutoff: Cutoff of p-value for statistical tests in PSI-based tests. + # deltaPSI_cutoff: Minimum difference of PSI values (i.e. deltaPSI) between tumor and normal groups + # FC_cutoff: Fold Change cutoff of PSI value of input sample compared to normal control + # SJC_p-value_cutoff: Cutoff of p-value for statistical tests in SJC-based tests. + # tier1_group_cutoff: Minimum number of tumor/tissue reference groups satisfying above requirements. This number should be no larger than the number of reference normal groups selected in the next field ('tier1_reference_list') +- tier1_reference_list: A list of selected reference panels (separated by ','). Names should be identical to the folder names in the IRIS db (see 'format' step) + +Row4 (see below): Parameters for the tier 2 screening (i.e. 'tumor reference'). Same format as Row3 + +Row5 (see below): Parameters for the tier 3 screening (i.e. 'normal tissue reference'). Same format as Row3 + +Row3-5 (at least one NORMAL reference row is required): For example, tier 1 alone, tier 3 alone, or tier 1 + tier 2 + tier 3 are valid settings for screening. Note that tier 2 alone will not function if tier 1 is missing (tumor recurrence is established by comparing to a user-specified tissue-matched normal reference in tier 1) + +Row6 (required for PSI-based tests): Comparison mode & statistical test type: 'group' mode (number of input samples >=2) and 'individual' mode (number of input samples =1) are provided. 'group' mode is default and recommended; for PSI-based tests, 'parametric' and 'nonparametric' tests are supported.
'parametric' is default + +Row7 (required for PSI-based tests): Use ratio instead of number of groups for the tierX_group_cutoff. Default is False + +Row8 (optional): Blacklist file. Removes the AS events that are error-prone due to artifacts. Optional + +Row9 (optional): Mappability annotation bigWig file. Required for evaluating splice region mappability. + +Row10 (optional): Reference genome file. Required for IRIS translate. +------------------------------------------------------------------------------------------------------------------------ diff --git a/example/sjc_matrix/SJ_count.NEPC_example.txt b/example/sjc_matrix/SJ_count.NEPC_example.txt new file mode 100644 index 0000000..78305c9 --- /dev/null +++ b/example/sjc_matrix/SJ_count.NEPC_example.txt @@ -0,0 +1,31 @@ +SJ Sample1 Sample2 Sample3 Sample4 Sample5 Sample6 Sample7 Sample8 Sample9 Sample10 +chr19:7593857:7593986 213 285 11 149 139 103 185 220 3 119 +chr19:7594089:7594475 203 231 13 110 115 142 184 149 7 97 +chr19:7593857:7594475 0 0 0 0 0 0 0 0 0 0 +chr22:21331228:21331320 40 30 36 0 3 0 23 8 2 10 +chr22:21331385:21331988 40 12 32 1 5 0 20 3 1 5 +chr22:21331228:21331988 0 0 0 0 0 0 0 0 0 0 +chr7:105733584:105736738 19 11 3 0 28 0 116 22 132 53 +chr7:105736749:105738135 11 8 1 0 21 0 115 13 118 45 +chr7:105733584:105738135 11 36 182 958 41 38 70 413 42 58 +chr3:99536888:99770075 0 0 1 0 0 0 0 0 0 0 +chr3:99770151:99865816 0 0 0 0 0 1 0 0 0 0 +chr3:99536888:99865816 9 131 193 340 18 140 29 0 7 23 +chr20:23375823:23377708 8 13 18 63 52 49 88 35 36 10 +chr20:23377826:23383629 3 4 26 46 38 46 67 18 42 3 +chr20:23375823:23383629 7 9 9 16 16 17 9 3 3 15 +chr7:138763400:138763849 2 2 3 20 2 17 0 1 5 20 +chr7:138764990:138768525 17 8 8 135 25 80 0 10 32 114 +chr7:138763400:138768525 0 0 0 0 0 0 0 0 0 1 +chr1:9796101:9797555 0 10 13 14 11 39 9 50 22 27 +chr1:9797613:9801151 0 7 8 12 5 23 5 36 15 19 +chr1:9796101:9801151 70 960 78 452 552 881 323 1308 1050 1373 +chr21:27354791:27369674 237 813 173 140 164 16 161 151 1867 458 +chr21:27369732:27394155 1 1 2 0 5 1 3 7 12 9 +chr21:27354791:27394155 61 43 367 5 181 7 34 1193 266 417 +chr16:2162965:2163041 135 109 76 29 154 114 117 63 24 144 +chr16:2163061:2163161 8 6 7 2 5 7 1 1 1 4 +chr16:2162965:2163161 11 11 8 6 8 15 18 19 7 12 +chr11:118651894:118656760 34 140 196 100 63 105 58 320 26 74 +chr11:118657228:118661806 13 53 44 20 21 31 24 47 10 27 +chr11:118651894:118661806 0 0 0 0 0 0 0 0 0 0 diff --git a/example/sjc_matrix/SJ_count.NEPC_example.txt.idx b/example/sjc_matrix/SJ_count.NEPC_example.txt.idx new file mode 100644 index 0000000..f9710e7 --- /dev/null +++ b/example/sjc_matrix/SJ_count.NEPC_example.txt.idx @@ -0,0 +1,30 @@ +chr19:7593857:7593986 84 +chr19:7594089:7594475 143 +chr19:7593857:7594475 201 +chr22:21331228:21331320 243 +chr22:21331385:21331988 292 +chr22:21331228:21331988 340 +chr7:105733584:105736738 384 +chr7:105736749:105738135 438 +chr7:105733584:105738135 491 +chr3:99536888:99770075 549 +chr3:99770151:99865816 592 +chr3:99536888:99865816 635 +chr20:23375823:23377708 689 +chr20:23377826:23383629 742 +chr20:23375823:23383629 793 +chr7:138763400:138763849 841 +chr7:138764990:138768525 889 +chr7:138763400:138768525 943 +chr1:9796101:9797555 988 +chr1:9797613:9801151 1037 +chr1:9796101:9801151 1083 +chr21:27354791:27369674 1145 +chr21:27369732:27394155 1209 +chr21:27354791:27394155 1254 +chr16:2162965:2163041 1312 +chr16:2163061:2163161 1370 +chr16:2162965:2163161 1412 +chr11:118651894:118656760 1460 +chr11:118657228:118661806 1521 +chr11:118651894:118661806 1577 
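The .idx file above pairs each splice junction ID with the byte offset of that junction's row in the matching matrix file, so a single row can be fetched with one seek instead of a scan of the whole matrix. A minimal sketch of that offset-based lookup, assuming whitespace-delimited columns and using the example paths above (the helper names are illustrative, not part of the IRIS API):

def load_index(idx_path):
    # Each .idx line: "<junction_id> <byte_offset_of_that_row_in_the_matrix_file>"
    index = {}
    with open(idx_path) as fh:
        for line in fh:
            junction_id, offset = line.split()
            index[junction_id] = int(offset)
    return index

def fetch_counts(matrix_path, index, junction_id):
    # Seek straight to the stored offset and read a single row.
    with open(matrix_path) as fh:
        fh.seek(index[junction_id])
        fields = fh.readline().split()  # assumes whitespace-delimited columns
    return fields[0], [int(c) for c in fields[1:]]

index = load_index('example/sjc_matrix/SJ_count.NEPC_example.txt.idx')
sj, counts = fetch_counts('example/sjc_matrix/SJ_count.NEPC_example.txt',
                          index, 'chr19:7593857:7593986')
# counts now holds the per-sample read counts for that junction.

The splicing_matrix .idx file below uses the same offset scheme, keyed by the colon-joined event coordinate string instead of a single junction.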
diff --git a/example/splicing_matrix/splicing_matrix.SE.cov10.NEPC_example.txt b/example/splicing_matrix/splicing_matrix.SE.cov10.NEPC_example.txt new file mode 100644 index 0000000..fcec6b6 --- /dev/null +++ b/example/splicing_matrix/splicing_matrix.SE.cov10.NEPC_example.txt @@ -0,0 +1,11 @@ +AC GeneName chr strand exonStart exonEnd upstreamEE downstreamES Sample1 Sample2 Sample3 Sample4 Sample5 Sample6 Sample7 Sample8 Sample9 Sample10 +ENSG00000090674 MCOLN1 chr19 + 7593986 7594088 7593856 7594475 1.0 NaN NaN NaN NaN NaN 1.0 NaN NaN 1.0 +ENSG00000183773 AIFM3 chr22 + 21331320 21331384 21331227 21331988 1.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN +ENSG00000008282 SYPL1 chr7 - 105736738 105736748 105733583 105738135 0.5792 0.1807 0.0099 0.0 0.381 0.0 0.5996 0.0401 0.7376 0.4413 +ENSG00000184220 CMSS1 chr3 + 99770075 99770150 99536887 99865816 NaN 0.0 0.003 0.0 0.0 0.0044 0.0 NaN NaN 0.0 +ENSG00000125814 NAPB chr20 - 23377708 23377825 23375822 23383629 0.44 0.4857 0.7097 0.773 0.7377 0.742 0.896 0.8983 0.9286 0.3023 +ENSG00000105939 ZC3HAV1 chr7 - 138763849 138764989 138763399 138768525 NaN NaN NaN 1.0 NaN 1.0 NaN NaN NaN 0.9853 +ENSG00000171603 CLSTN1 chr1 - 9797555 9797612 9796100 9801151 0.0 0.0082 0.1272 0.028 0.0131 0.0341 0.0206 0.0354 0.0167 0.0172 +ENSG00000142192 APP chr21 - 27369674 27369731 27354790 27394155 0.582 0.8617 0.1689 0.9306 0.3113 0.549 0.7069 0.0516 0.7034 0.3077 +ENSG00000008710 PKD1 chr16 - 2163041 2163060 2162964 2163161 0.728 0.7432 0.7465 0.594 0.8069 0.685 0.5441 0.3865 0.5327 0.7358 +ENSG00000110367 DDX6 chr11 - 118656760 118657227 118651893 118661806 NaN 1.0 1.0 NaN NaN NaN NaN 1.0 NaN NaN diff --git a/example/splicing_matrix/splicing_matrix.SE.cov10.NEPC_example.txt.idx b/example/splicing_matrix/splicing_matrix.SE.cov10.NEPC_example.txt.idx new file mode 100644 index 0000000..7349326 --- /dev/null +++ b/example/splicing_matrix/splicing_matrix.SE.cov10.NEPC_example.txt.idx @@ -0,0 +1,10 @@ +ENSG00000090674:MCOLN1:chr19:+:7593986:7594088:7593856:7594475 146 +ENSG00000183773:AIFM3:chr22:+:21331320:21331384:21331227:21331988 249 +ENSG00000008282:SYPL1:chr7:-:105736738:105736748:105733583:105738135 355 +ENSG00000184220:CMSS1:chr3:+:99770075:99770150:99536887:99865816 487 +ENSG00000125814:NAPB:chr20:-:23377708:23377825:23375822:23383629 597 +ENSG00000105939:ZC3HAV1:chr7:-:138763849:138764989:138763399:138768525 727 +ENSG00000171603:CLSTN1:chr1:-:9797555:9797612:9796100:9801151 841 +ENSG00000142192:APP:chr21:-:27369674:27369731:27354790:27394155 969 +ENSG00000008710:PKD1:chr16:-:2163041:2163060:2162964:2163161 1101 +ENSG00000110367:DDX6:chr11:-:118656760:118657227:118651893:118661806 1229 diff --git a/google_drive_download.py b/google_drive_download.py new file mode 100644 index 0000000..0e4033f --- /dev/null +++ b/google_drive_download.py @@ -0,0 +1,187 @@ +import argparse +import os +import os.path +import tempfile + +import tqdm + +from apiclient.http import MediaIoBaseDownload +from google.oauth2 import service_account +from googleapiclient.discovery import build + +TOP_DIR_NAME = 'IRIS_data' +CHUNK_SIZE = 1024 * 1024 * 8 # 8 MB +SCOPES = ['https://www.googleapis.com/auth/drive'] + + +def parse_args(): + parser = argparse.ArgumentParser( + description=('download IRIS files from google drive')) + parser.add_argument('--iris-folder-id', + help=('ID of IRIS_data folder on google drive' + ' (can be found in download url)')) + parser.add_argument( + '--dest-dir', + help='path to directory where IRIS_data/ will be written') + parser.add_argument('--download-all', + 
action='store_true', + help='download all data') + parser.add_argument('--list-files', + action='store_true', + help='write available files to --selected-tsv') + parser.add_argument('--selected-tsv', + help='path to a tsv with selected files to download') + parser.add_argument( + '--api-key-json-path', + required=True, + help='path to the .json file that is the service account key') + + args = parser.parse_args() + if args.download_all: + if not (args.iris_folder_id and args.dest_dir): + parser.error( + '--download-all requires --iris-folder-id and --dest-dir') + elif args.list_files: + if not (args.iris_folder_id and args.selected_tsv): + parser.error( + '--list-files requires --iris-folder-id and --selected-tsv') + elif not (args.dest_dir and args.selected_tsv): + parser.error( + 'download specific files with --dest-dir and --selected-tsv.' + ' Otherwise use --download-all or --list-files') + + return args + + +def main(): + args = parse_args() + + credentials = service_account.Credentials.from_service_account_file( + args.api_key_json_path, scopes=SCOPES) + with build('drive', 'v3', credentials=credentials) as drive_service: + if args.download_all: + download_all_files(args.iris_folder_id, args.dest_dir, + drive_service) + return + + if args.list_files: + list_all_files(args.iris_folder_id, args.selected_tsv, + drive_service) + return + + download_selected_files(args.dest_dir, args.selected_tsv, + drive_service) + + +def list_files_recursive(parent_id, drive_service): + results = list() + files_c = drive_service.files() + request = files_c.list(q="'{}' in parents".format(parent_id)) + + response = request.execute() + found = response.get('files', list()) + for file_dict in found: + file_id = file_dict['id'] + file_name = file_dict['name'] + is_folder = 'folder' in file_dict['mimeType'] + if is_folder: + sub_results = list_files_recursive(file_id, drive_service) + results.append({ + 'folder': file_name, + 'id': file_id, + 'files': sub_results + }) + else: + results.append({'name': file_name, 'id': file_id}) + + return results + + +def write_tsv_line(columns, tsv_handle): + tsv_handle.write('{}\n'.format('\t'.join(columns))) + + +def write_file_tsv(all_files, parent_dir_path, tsv_handle): + files = list() + folders = list() + for file_dict in all_files: + if 'folder' in file_dict: + folders.append(file_dict) + else: + files.append(file_dict) + + files.sort(key=lambda d: d['name']) + folders.sort(key=lambda d: d['folder']) + for file_dict in files: + full_path = os.path.join(parent_dir_path, file_dict['name']) + write_tsv_line([full_path, file_dict['id']], tsv_handle) + + for folder_dict in folders: + full_path = os.path.join(parent_dir_path, folder_dict['folder']) + write_file_tsv(folder_dict['files'], full_path, tsv_handle) + + +def download_all_files(iris_folder_id, dest_dir, drive_service): + all_files = list_files_recursive(iris_folder_id, drive_service) + + temp_name = None + try: + with tempfile.NamedTemporaryFile(delete=False) as temp_handle: + temp_name = temp_handle.name + + write_file_tsv(all_files, TOP_DIR_NAME, temp_handle) + + download_selected_files(dest_dir, temp_name, drive_service) + finally: + if temp_name: + os.remove(temp_name) + + +def list_all_files(iris_folder_id, selected_tsv, drive_service): + all_files = list_files_recursive(iris_folder_id, drive_service) + with open(selected_tsv, 'wt') as tsv_handle: + write_file_tsv(all_files, TOP_DIR_NAME, tsv_handle) + + +def download_file(file_id, dest_path, drive_service): + files_c = drive_service.files() + request = 
files_c.get_media(fileId=file_id) + dir_path = os.path.dirname(dest_path) + if not os.path.exists(dir_path): + os.makedirs(dir_path) + + with open(dest_path, 'wb') as out_handle: + downloader = MediaIoBaseDownload(out_handle, + request, + chunksize=CHUNK_SIZE) + progress = tqdm.tqdm(desc=dest_path, total=1.0, unit='file') + progress_so_far = 0 + done = False + while done is False: + status, done = downloader.next_chunk() + if status: + new_progress = status.progress() + additional = new_progress - progress_so_far + progress_so_far = new_progress + progress.update(additional) + + progress.close() + + +def download_selected_files(dest_dir, selected_tsv, drive_service): + with open(selected_tsv, 'rt') as tsv_handle: + for line in tsv_handle: + columns = line.strip().split('\t') + if len(columns) != 2: + raise Exception( + 'expected 2 columns in {}'.format(selected_tsv)) + + drive_file_path = columns[0] + file_id = columns[1] + + local_file_path = os.path.join(dest_dir, drive_file_path) + download_file(file_id, local_file_path, drive_service) + + +if __name__ == '__main__': + main() diff --git a/install b/install index bd316b9..dfdf5c0 100755 --- a/install +++ b/install @@ -2,24 +2,22 @@ # # Install dependencies # -# ** Must manually install conda first. ** -# This script will create two conda environments (Python 2 and 3) -# and install dependencies to them. -# https://docs.conda.io/en/latest/miniconda.html -# https://repo.anaconda.com/miniconda/Miniconda2-latest-Linux-x86_64.sh -# -# ** Must manually download the IRIS_data set ** -# https://drive.google.com/file/d/1TaswpWPnEd4TXst46jsa9XSMzLsbzjOQ/view?usp=sharing -# -# ** Must manually download IEDB tools version 2.15.5 to the IEDB directory. ** -# download file to IEDB/IEDB_MHC_I-2.15.5.tar.gz -# This script will unpack and install the IEDB tools. -# http://tools.iedb.org/main/download/ -> MHC Class I -> previous version -> 2.15.5 - function install_iedb() { echo echo "checking IEDB dependency" + # From IEDB/mhc_i/README: tcsh and gawk are required + which tcsh + if [[ "$?" -ne 0 ]]; then + echo "IEDB requires tcsh to be installed" >&2 + return 1 + fi + which gawk + if [[ "$?" -ne 0 ]]; then + echo "IEDB requires gawk to be installed" >&2 + return 1 + fi + cd "${SCRIPT_DIR}/IEDB" || return 1 if [[ ! -d mhc_i ]] @@ -29,156 +27,17 @@ function install_iedb() { cd mhc_i || return 1 - ./configure || return 1 -} - -# This script will automatically download and build bedtools -# https://github.com/arq5x/bedtools2/releases -function install_bedtools() { - echo - echo "checking bedtools dependency" - - cd "${SCRIPT_DIR}" || return 1 - - if [[ ! -d bedtools ]] - then - mkdir bedtools || return 1 - fi - - cd bedtools || return 1 - - if [[ ! -f bedtools-2.29.0.tar.gz ]] - then - local BEDTOOLS_URI="https://github.com/arq5x/bedtools2/releases/download/v2.29.0/bedtools-2.29.0.tar.gz" - curl -L "${BEDTOOLS_URI}" -o bedtools-2.29.0.tar.gz || return 1 - fi - - if [[ ! -d bedtools2 ]] - then - tar -xvf bedtools-2.29.0.tar.gz || return 1 - fi - - cd bedtools2 || return 1 - make || return 1 -} - -function install_star() { - echo - echo "checking STAR dependency" - - cd "${SCRIPT_DIR}" || return 1 - - if [[ ! -d STAR ]] - then - mkdir STAR || return 1 - fi - - cd STAR || return 1 - - if [[ ! -f 2.5.3a.tar.gz ]] - then - local STAR_URI="https://github.com/alexdobin/STAR/archive/2.5.3a.tar.gz" - curl -L "${STAR_URI}" -o 2.5.3a.tar.gz || return 1 - fi - - if [[ ! 
-d STAR-2.5.3a ]] - then - tar -xvf 2.5.3a.tar.gz || return 1 - fi - - cd STAR-2.5.3a/source || return 1 - make || return 1 -} - -function install_samtools() { - echo - echo "checking SAM tools dependency" - - cd "${SCRIPT_DIR}" || return 1 - - if [[ ! -d samtools ]] - then - mkdir samtools || return 1 - fi - - cd samtools || return 1 - - if [[ ! -f samtools-1.3.tar.bz2 ]] - then - local SAMTOOLS_URI="https://sourceforge.net/projects/samtools/files/samtools/1.3/samtools-1.3.tar.bz2/download" - curl -L "${SAMTOOLS_URI}" -o samtools-1.3.tar.bz2 || return 1 - fi - - if [[ ! -d samtools-1.3 ]] - then - tar -xvf samtools-1.3.tar.bz2 || return 1 - fi - - cd samtools-1.3 || return 1 - ./configure --enable-plugins --enable-libcurl --with-plugin-path="$(pwd)"/htslib-1.3 || return 1 - make all plugins-htslib || return 1 -} - -function install_rmats() { - echo - echo "checking rMATS dependency" - - cd "${SCRIPT_DIR}" || return 1 - - if [[ ! -d rMATS ]] - then - mkdir rMATS || return 1 - fi - - cd rMATS || return 1 - - if [[ ! -f rMATS.4.0.2.tgz ]] - then - local RMATS_URI="https://sourceforge.net/projects/rnaseq-mats/files/MATS/rMATS.4.0.2.tgz/download" - curl -L "${RMATS_URI}" -o rMATS.4.0.2.tgz || return 1 - fi - - if [[ ! -d rMATS.4.0.2 ]] - then - tar -xvf rMATS.4.0.2.tgz || return 1 - fi -} - -function install_cufflinks() { - echo - echo "checking Cufflinks dependency" - - cd "${SCRIPT_DIR}" || return 1 - - if [[ ! -d cufflinks ]] - then - mkdir cufflinks || return 1 - fi - - cd cufflinks || return 1 - - if [[ ! -f cufflinks-2.2.1.Linux_x86_64.tar.gz ]] - then - local CUFFLINKS_URI="http://cole-trapnell-lab.github.io/cufflinks/assets/downloads/cufflinks-2.2.1.Linux_x86_64.tar.gz" - curl "${CUFFLINKS_URI}" -o cufflinks-2.2.1.Linux_x86_64.tar.gz || return 1 - fi - - if [[ ! -d cufflinks-2.2.1.Linux_x86_64 ]] - then - tar -xvf cufflinks-2.2.1.Linux_x86_64.tar.gz || return 1 - fi -} - -function install_seq2hla() { - echo - echo "checking seq2HLA dependency" - - cd "${SCRIPT_DIR}" || return 1 - - if [[ ! -d seq2hla ]] - then - local SEQ_2_HLA_URI="https://bitbucket.org/sebastian_boegel/seq2hla" - hg clone "${SEQ_2_HLA_URI}" || return 1 + # The IEDB ./configure script unpacks .tar files which can take a long time. + # Create a .done file to indicate that ./configure has already been run. + # Only run ./configure if there's no .done file + local DONE_FILE='configure.done' + if [[ ! -f "${DONE_FILE}" ]]; then + # Need to activate Python 2 environment so that '/usr/bin/env python' used in + # ./configure finds python 2. Otherwise there is a syntax error. + conda activate "${CONDA_ENV_PREFIX_2}" || return 1 + ./configure || return 1 + conda deactivate || return 1 + touch "${DONE_FILE}" || return 1 fi } @@ -208,24 +67,74 @@ function install_ms_gf() { } function install_python_packages() { + local INSTALL_OPTIONAL="$1" echo echo "checking python dependencies" cd "${SCRIPT_DIR}" || return 1 + # rmats is an optional dependency. + # Record the path if it is installed. 
+ local RMATS_PATH='' + # Python 2 - conda::activate_env "${CONDA_ENV_NAME_2}" || return 1 - pip install -r requirements.txt || return 1 - conda install -c bioconda bowtie # has an error return even when successful - conda::deactivate_env || return 1 + conda activate "${CONDA_ENV_PREFIX_2}" || return 1 + if [[ "${INSTALL_OPTIONAL}" -ne 0 ]]; then + conda install -c conda-forge -c bioconda --file conda_requirements_py2.txt \ + --file conda_requirements_py2_optional.txt || return 1 + # r-base is an optional dependency which requires an old version of libreadline. + # The version of libreadline is not available on conda, but it can be installed manually. + install_readline || return 1 + # Find the rmats path needed to pass as --rMATS-path to IRIS + RMATS_PATH="$(which rmats.py)" + if [[ "$?" -ne 0 ]]; then + echo "could not find path to rmats.py" >&2 + exit 1 + fi + else + conda install -c conda-forge -c bioconda --file conda_requirements_py2.txt || return 1 + fi + + # rmats_path will be '' if optional dependencies were not installed + echo "rmats_path: '${RMATS_PATH}'" \ + >> "${SCRIPT_DIR}/snakemake_config.yaml" || return 1 + + conda deactivate || return 1 # Python 3 - conda::activate_env "${CONDA_ENV_NAME_3}" || return 1 - pip install -r qsub/requirements.txt || return 1 - conda::deactivate_env || return 1 + conda activate "${CONDA_ENV_PREFIX_3}" || return 1 + conda install -c conda-forge -c bioconda --file conda_requirements_py3.txt || return 1 + conda deactivate || return 1 } -function install_iris_data() { +function install_readline() { + local ORIG_DIR="$(pwd)" || return 1 + cd "${SCRIPT_DIR}" || return 1 + mkdir -p readline || return 1 + cd readline || return 1 + local TAR_PATH='readline-6.3.tar.gz' + local TAR_URL='ftp://ftp.gnu.org/gnu/readline/readline-6.3.tar.gz' + if [[ ! -f "${TAR_PATH}" ]]; then + curl "${TAR_URL}" -o "${TAR_PATH}" || return 1 + fi + + if [[ ! -f 'readline-6.3/configure' ]]; then + tar -xvf "${TAR_PATH}" || return 1 + fi + + cd readline-6.3 || return 1 + local SO_PATH='shlib/libreadline.so.6.3' + if [[ ! -f "${SO_PATH}" ]]; then + ./configure || return 1 + make || return 1 + fi + + local DEST="${CONDA_ENV_PREFIX_2}/lib/libreadline.so.6" + cp "${SO_PATH}" "${DEST}" || return 1 + cd "${ORIG_DIR}" +} + +function check_iris_data() { echo echo "checking IRIS data dependency" @@ -233,13 +142,7 @@ function install_iris_data() { if [[ ! -d IRIS_data ]] then - if [[ ! -f IRIS_data.tgz ]]; then - echo "Need to download IRIS_data.tgz from:" - echo "https://drive.google.com/file/d/1TaswpWPnEd4TXst46jsa9XSMzLsbzjOQ/view?usp=sharing" - return 1 - fi - - tar -xvf IRIS_data.tgz || return 1 + echo "Need to download IRIS_data/" >&2 fi } @@ -249,48 +152,20 @@ function install_iris_package() { cd "${SCRIPT_DIR}" || return 1 - conda::activate_env "${CONDA_ENV_NAME_2}" || return 1 - + conda activate "${CONDA_ENV_PREFIX_2}" || return 1 python setup.py install || return 1 - - cd "${SCRIPT_DIR}" || return 1 - - conda::deactivate_env || return 1 + conda deactivate || return 1 } - function ensure_conda_envs() { echo echo "checking conda" - conda::create_env_with_name_and_python_version "${CONDA_ENV_NAME_2}"\ - "${CONDA_PYTHON_VERSION_2}" || return 1 - - conda::create_env_with_name_and_python_version "${CONDA_ENV_NAME_3}"\ - "${CONDA_PYTHON_VERSION_3}" || return 1 + conda create --prefix "${CONDA_ENV_PREFIX_2}" || return 1 + conda create --prefix "${CONDA_ENV_PREFIX_3}" || return 1 } function install_optional() { - install_star - if [[ "$?" 
-ne 0 ]]; then - echo "Error installing optional dependency: star" >&2 - fi - install_samtools - if [[ "$?" -ne 0 ]]; then - echo "Error installing optional dependency: samtools" >&2 - fi - install_rmats - if [[ "$?" -ne 0 ]]; then - echo "Error installing optional dependency: rmats" >&2 - fi - install_cufflinks - if [[ "$?" -ne 0 ]]; then - echo "Error installing optional dependency: cufflinks" >&2 - fi - install_seq2hla - if [[ "$?" -ne 0 ]]; then - echo "Error installing optional dependency: seq2hla" >&2 - fi install_ms_gf if [[ "$?" -ne 0 ]]; then echo "Error installing optional dependency: ms gf" >&2 @@ -301,17 +176,26 @@ function install() { local INSTALL_OPTIONAL="$1" ensure_conda_envs || return 1 - + install_python_packages "${INSTALL_OPTIONAL}" || return 1 install_iedb || return 1 - install_bedtools || return 1 if [[ "${INSTALL_OPTIONAL}" -ne 0 ]]; then install_optional || return 1 fi - install_python_packages || return 1 - install_iris_data || return 1 install_iris_package || return 1 + check_iris_data || return 1 + + echo "conda_wrapper: '${SCRIPT_DIR}/conda_wrapper'" \ + >> "${SCRIPT_DIR}/snakemake_config.yaml" || return 1 + echo "conda_env_2: '${SCRIPT_DIR}/conda_env_2'" \ + >> "${SCRIPT_DIR}/snakemake_config.yaml" || return 1 + echo "conda_env_3: '${SCRIPT_DIR}/conda_env_3'" \ + >> "${SCRIPT_DIR}/snakemake_config.yaml" || return 1 + echo "iris_data: '${SCRIPT_DIR}/IRIS_data'" \ + >> "${SCRIPT_DIR}/snakemake_config.yaml" || return 1 + echo "iedb_path: '${SCRIPT_DIR}/IEDB/mhc_i/src'" \ + >> "${SCRIPT_DIR}/snakemake_config.yaml" || return 1 } function display_usage() { @@ -341,10 +225,6 @@ function main() { fi source set_env_vars.sh || return 1 - source conda.sh || return 1 - - SCRIPT_DIR="$(pwd)" || return 1 - install "${INSTALL_OPTIONAL}" || return 1 } diff --git a/qsub/qsub.py b/qsub/qsub.py deleted file mode 100644 index b806f4f..0000000 --- a/qsub/qsub.py +++ /dev/null @@ -1,163 +0,0 @@ -import os - -import bs4 - - -class CommandResult(object): - def __init__(self): - self.status_code = None - self.out = None - self.err = None - - def from_completed_process(self, completed_process): - self.status_code = completed_process.returncode - self.out = completed_process.stdout - self.err = completed_process.stderr - - -class QsubJob(object): - """ - cmd_execute_func(cmd_tokens, exec_func_ref_data) -> CommandResult - - cmd_execute_func and log_error_func must be defined at the top level of a module so that - QsubJob can be pickled. - - The execute function allows either running a local qsub or qsub on a remote host. 
- """ - def __init__(self, - command_result, - job_name, - cmd_execute_func, - exec_func_ref_data=None, - out_dir=None, - log_error_func=print): - self.job_name = job_name - self._cmd_execute_func = cmd_execute_func - self._exec_func_ref_data = exec_func_ref_data - self._log_error_func = log_error_func - - self._status = 'running' - - self.j_id = _extract_qsub_j_id(command_result.out.decode()) - self._set_output_file_names(out_dir) - - def _set_output_file_names(self, out_dir): - if out_dir is None: - self.qsub_out = None - self.qsub_err = None - return - - out_f_name = '{}.o{}'.format(self.job_name, self.j_id) - err_f_name = '{}.e{}'.format(self.job_name, self.j_id) - self.qsub_out = os.path.join(out_dir, out_f_name) - self.qsub_err = os.path.join(out_dir, err_f_name) - - def _execute_cmd(self, cmd_tokens): - return self._cmd_execute_func(cmd_tokens, self._exec_func_ref_data) - - def get_status(self): - if self._status == 'finished': - return self._status - - if self.is_finished(): - self._status = 'finished' - return self._status - - return self._status - - def is_finished(self): - if not self.j_id: - self._log_error_func('no j_id when checking if qsub job is finished') - return False - - qstat_command_tokens = ['qstat', '-j', self.j_id, '-xml'] - qstat_process = self._execute_cmd(qstat_command_tokens) - if not qstat_process: - return None - - qstat_soup = _make_soup(qstat_process.out.decode()) - if _qstat_output_has_job_details(qstat_soup, self.j_id): - return False - - if _qstat_output_lists_job_as_unknown(qstat_soup, self.j_id): - return True - - self._log_error_func('unexpected output from: {}\n{}'.format(qstat_command_tokens, - qstat_soup)) - return True - - def get_stdout(self): - if self.qsub_out is None: - return None - - return self._cat_file(self.qsub_out) - - def get_stderr(self): - if self.qsub_err is None: - return None - - return self._cat_file(self.qsub_err) - - def _cat_file(self, f_name): - cat_command_tokens = ['cat', f_name] - process = self._execute_cmd(cat_command_tokens) - if not process: - return None - - return process.out - - -def _make_soup(output): - return bs4.BeautifulSoup(output, 'html.parser') - - -def _extract_qsub_j_id(out): - """ - Find j_id in out that looks like: - Your job {j_id}.{array_details} ("{job_name}") has been submitted - """ - tokens = out.split(' ') - for token in tokens: - if token and token[0].isdigit(): - return token.split('.')[0] - - return None - - -def _qstat_output_has_job_details(soup, j_id): - """ - Return True if j_id is a JB_job_number in soup that looks like: - - - - 11693229 - """ - djobs = soup.find_all('djob_info') - if len(djobs) != 1: - return False - - numbers = djobs[0].find_all('jb_job_number') - for number in numbers: - if number.string == j_id: - return True - - return False - - -def _qstat_output_lists_job_as_unknown(soup, j_id): - """ - Return True if j_id is an st_name in soup that looks like: - - - 123 - """ - unks = soup.find_all('unknown_jobs') - if len(unks) != 1: - return False - - names = unks[0].find_all('st_name') - for name in names: - if name.string == j_id: - return True - - return False diff --git a/qsub/requirements.txt b/qsub/requirements.txt deleted file mode 100644 index c1f5f71..0000000 --- a/qsub/requirements.txt +++ /dev/null @@ -1 +0,0 @@ -beautifulsoup4 diff --git a/qsub/submit_qsub_and_wait.py b/qsub/submit_qsub_and_wait.py deleted file mode 100644 index fe88c20..0000000 --- a/qsub/submit_qsub_and_wait.py +++ /dev/null @@ -1,63 +0,0 @@ -import argparse -import subprocess -import time - -import 
qsub - - -def local_subprocess_cmd_exec_func(tokens, _): - process = subprocess.run(tokens, stdout=subprocess.PIPE, stderr=subprocess.PIPE, check=False) - if process.returncode != 0: - print('returncode: {}, command: {}, stdout: {}, stderr: {}'.format( - process.returncode, tokens, process.stdout, process.stderr)) - return None - - command_result = qsub.CommandResult() - command_result.from_completed_process(process) - return command_result - - -def submit_job(cmd): - cmd = cmd.rstrip() - print('executing: {}'.format(cmd)) - - tokens = cmd.split(' ') - command_result = local_subprocess_cmd_exec_func(tokens, None) - if not command_result: - print('error submitting job') - return None - - job_name = '' - return qsub.QsubJob(command_result, job_name, local_subprocess_cmd_exec_func) - - -def main(): - parser = argparse.ArgumentParser( - description='execute qsub commands and wait for those jobs to complete') - parser.add_argument('command_file', type=str, help='a file with 1 qsub command per line') - parser.add_argument( - '--poll-interval-seconds', type=int, default=30, help='how frequently to check job status') - args = parser.parse_args() - - jobs = list() - with open(args.command_file, 'rt') as f_handle: - for cmd in f_handle: - job = submit_job(cmd) - if job: - jobs.append(job) - - while jobs: - time.sleep(args.poll_interval_seconds) - print('checking {} job(s)'.format(len(jobs))) - new_jobs = list() - for job in jobs: - if not job.is_finished(): - new_jobs.append(job) - else: - print('finished j_id: {}'.format(job.j_id)) - - jobs = new_jobs - - -if __name__ == '__main__': - main() diff --git a/qsub/test b/qsub/test deleted file mode 100755 index 83b0653..0000000 --- a/qsub/test +++ /dev/null @@ -1,21 +0,0 @@ -#!/bin/bash -# -# run tests - -function main() { - source ../set_env_vars.sh || return 1 - source ../conda.sh || return 1 - - conda::activate_env "${CONDA_ENV_NAME_3}" || return 1 - - if [[ "$#" -ne 0 ]]; then - echo "arguments given, but none expected" >&2 - return 1 - fi - - python test_submit_qsub_and_wait.py || return 1 - - conda::deactivate_env || return 1 -} - -main "$@" diff --git a/qsub/test_submit_qsub_and_wait.py b/qsub/test_submit_qsub_and_wait.py deleted file mode 100644 index 95d7b5d..0000000 --- a/qsub/test_submit_qsub_and_wait.py +++ /dev/null @@ -1,60 +0,0 @@ -import os -import sys -import tempfile -import time -import unittest - -import submit_qsub_and_wait - - -def _write_lines(f_name, lines): - with open(f_name, 'wt') as f_h: - for line in lines: - f_h.write('{}\n'.format(line)) - - -class TestSubmitQsubAndWait(unittest.TestCase): - def test(self): - temp_f_name_1 = None - temp_f_name_2 = None - try: - with tempfile.NamedTemporaryFile(delete=False) as temp_f_handle: - temp_f_name_1 = temp_f_handle.name - - with tempfile.NamedTemporaryFile(delete=False) as temp_f_handle: - temp_f_name_2 = temp_f_handle.name - - self._test(temp_f_name_1, temp_f_name_2, 1, 60) - self._test(temp_f_name_1, temp_f_name_2, 30, 90) - finally: - if temp_f_name_1 is not None: - os.remove(temp_f_name_1) - - if temp_f_name_2 is not None: - os.remove(temp_f_name_2) - - def _test(self, f_name_1, f_name_2, sleep_seconds, max_seconds): - lines_1 = [ - '#!/bin/bash', - 'sleep {}'.format(sleep_seconds), - ] - _write_lines(f_name_1, lines_1) - lines_2 = [ - 'qsub {}'.format(f_name_1), - 'qsub {}'.format(f_name_1), - ] - _write_lines(f_name_2, lines_2) - - sys.argv = ['submit_qsub_and_wait.py', f_name_2, '--poll-interval-seconds', '5'] - - begin = time.time() - submit_qsub_and_wait.main() - end = 
time.time() - - elapsed_seconds = end - begin - self.assertTrue(elapsed_seconds >= sleep_seconds) - self.assertTrue(elapsed_seconds <= max_seconds) - - -if __name__ == '__main__': - unittest.main(verbosity=2) diff --git a/requirements.txt b/requirements.txt deleted file mode 100644 index 0d7a59c..0000000 --- a/requirements.txt +++ /dev/null @@ -1,5 +0,0 @@ -numpy -scipy -seaborn -pyBigWig -statsmodels diff --git a/run b/run new file mode 100755 index 0000000..9496d76 --- /dev/null +++ b/run @@ -0,0 +1,3 @@ +#!/bin/bash + +./conda_wrapper ./conda_env_3 snakemake --profile ./snakemake_profile diff --git a/run_example b/run_example deleted file mode 100755 index 5a930a1..0000000 --- a/run_example +++ /dev/null @@ -1,23 +0,0 @@ -#!/bin/bash -# -# Run the IRIS pipeline for the inputs in example/ - -function main() { - if [[ "$#" -ne 0 ]]; then - echo "arguments given, but none expected" >&2 - return 1 - fi - - SCRIPT_DIR="$(pwd)" || return 1 - - USERNAME="example" - RMATS_MATRICES_TAR="${SCRIPT_DIR}/example/SJ_matrices.tar.gz" - RAW_SCREENING_PARAMS="${SCRIPT_DIR}/example/Test_simplified.para" - MHC_BY_SAMPLE="${SCRIPT_DIR}/example/HLA_types/hla_patient.tsv" - MHC_LIST="${SCRIPT_DIR}/example/HLA_types/hla_types.list" - - ./run_iris "${USERNAME}" "${RMATS_MATRICES_TAR}" "${RAW_SCREENING_PARAMS}"\ - "${MHC_BY_SAMPLE}" "${MHC_LIST}" || return 1 -} - -main "$@" diff --git a/run_iris b/run_iris deleted file mode 100755 index 139dbd3..0000000 --- a/run_iris +++ /dev/null @@ -1,238 +0,0 @@ -#!/bin/bash -# -# run the iris pipeline - -function parse_arguments() { - if [[ "$#" -ne 5 ]]; then - echo " -usage: - ./run_iris USERNAME RMATS_MATRICES_TAR SCREENING_PARAMS MHC_BY_SAMPLE MHC_LIST - -example: - ./run_iris user matrices.tar.gz job.para hla_patient.tsv hla_types.list -" - return 1 - fi - - USERNAME="$1" - RMATS_MATRICES_TAR="$2" - RAW_SCREENING_PARAMS="$3" - MHC_BY_SAMPLE="$4" - MHC_LIST="$5" -} - -function get_line_of_file() { - local LINE_NUM="$1" - local FILE_NAME="$2" - local LINE="$(head -n ${LINE_NUM} ${FILE_NAME} | tail -n 1)" || return 1 - echo "${LINE}" -} - -function pre_process_inputs() { - local RAW_DATA_SET_NAME="$(get_line_of_file 1 ${RAW_SCREENING_PARAMS})" || return 1 - local FILTER_1="$(get_line_of_file 2 ${RAW_SCREENING_PARAMS})" || return 1 - local FILTER_2="$(get_line_of_file 3 ${RAW_SCREENING_PARAMS})" || return 1 - local FILTER_3="$(get_line_of_file 4 ${RAW_SCREENING_PARAMS})" || return 1 - local TEST_MODE="$(get_line_of_file 5 ${RAW_SCREENING_PARAMS})" || return 1 - local USE_RATIO="$(get_line_of_file 6 ${RAW_SCREENING_PARAMS})" || return 1 - - # The leading underscore is used to distinguish user data sets from reference data sets - DATA_SET_NAME="_${USERNAME}_${RAW_DATA_SET_NAME}" - - RESULT_DIR="${SCRIPT_DIR}/results/${USERNAME}/${RAW_DATA_SET_NAME}" - mkdir -p "${RESULT_DIR}" || return 1 - SCREENING_OUT_DIR="${RESULT_DIR}/screening" - mkdir -p "${SCREENING_OUT_DIR}" || return 1 - RUN_DIR="${RESULT_DIR}/temp" - mkdir -p "${RUN_DIR}" || return 1 - - local IRIS_DATA="${SCRIPT_DIR}/IRIS_data" - IRIS_DB="${IRIS_DATA}/db/" - local IRIS_RESOURCES="${IRIS_DATA}/resources" - MAPPABILITY_PATH="${IRIS_RESOURCES}/mappability/wgEncodeCrgMapabilityAlign24mer.bigWig" - REF_GENOME="${IRIS_RESOURCES}/reference/ucsc.hg19.fasta" - - # create new param file - SCREENING_PARAMS="${RESULT_DIR}/job.para" - echo "${DATA_SET_NAME}" > "${SCREENING_PARAMS}" || return 1 - echo "${IRIS_DB}" >> "${SCREENING_PARAMS}" || return 1 - echo "${FILTER_1}" >> "${SCREENING_PARAMS}" || return 1 - echo 
"${FILTER_2}" >> "${SCREENING_PARAMS}" || return 1 - echo "${FILTER_3}" >> "${SCREENING_PARAMS}" || return 1 - echo "${TEST_MODE}" >> "${SCREENING_PARAMS}" || return 1 - echo "${USE_RATIO}" >> "${SCREENING_PARAMS}" || return 1 - local BLACKLIST_PATH="" - echo "${BLACKLIST_PATH}" >> "${SCREENING_PARAMS}" || return 1 - echo "${MAPPABILITY_PATH}" >> "${SCREENING_PARAMS}" || return 1 - echo "${REF_GENOME}" >> "${SCREENING_PARAMS}" || return 1 - - # update matrices files to use absolute paths - cd "${RESULT_DIR}" || return 1 - tar -xf "${RMATS_MATRICES_TAR}" || return 1 - - local MATRICES_PATH="${RESULT_DIR}/SJ_matrices" - cd "${MATRICES_PATH}" || return 1 - - local TEMP_F_NAME="$(mktemp)" || return 1 - local FILE_NAME - for FILE_NAME in matrices.txt samples.txt; do - mv "${FILE_NAME}" "${TEMP_F_NAME}" || return 1 - local LINE - while read LINE; do - echo "${MATRICES_PATH}/${LINE}" >> "${FILE_NAME}" || return 1 - done < "${TEMP_F_NAME}" - done - rm "${TEMP_F_NAME}" || return 1 - - cd "${SCRIPT_DIR}" || return 1 - - RMATS_MAT_PATH_MANIFEST="${MATRICES_PATH}/matrices.txt" - RMATS_SAMPLE_ORDER="${MATRICES_PATH}/samples.txt" -} - -function formatting_step() { - echo - echo "formatting" - - cd "${RUN_DIR}" || return 1 - - local SAMPLE_NAME_FIELD='2' - local SPLICING_EVENT_TYPE='SE' - - IRIS formatting "${RMATS_MAT_PATH_MANIFEST}" "${RMATS_SAMPLE_ORDER}" -s "${SAMPLE_NAME_FIELD}"\ - -d "${IRIS_DB}" -t "${SPLICING_EVENT_TYPE}" -n "${DATA_SET_NAME}" || return 1 -} - -function screening_step() { - echo - echo "screening" - - cd "${RUN_DIR}" || return 1 - - IRIS screening "${SCREENING_PARAMS}" -t -o "${SCREENING_OUT_DIR}" || return 1 -} - -function find_and_execute_qsub_commands_in_file() { - local IN_FILE="$1" - - local QSUB_CMDS=() - local LINE - while read LINE; do - local GREP_RES="$(echo ${LINE} | grep '^qsub.*\.sh$')" - if [[ -n "${GREP_RES}" ]]; then - QSUB_CMDS+=("${LINE}") - fi - done < "${IN_FILE}" - - if [[ "${#QSUB_CMDS[@]}" == 0 ]]; then - echo "could not find any qsub commands" - return 1 - fi - - echo - echo "executing qsub commands" - - local SUBMIT_AND_WAIT_PY="${SCRIPT_DIR}/qsub/submit_qsub_and_wait.py" - local TEMP_F_NAME="$(mktemp)" || return 1 - - for QSUB_CMD in "${QSUB_CMDS[@]}"; do - echo "${QSUB_CMD}" >> "${TEMP_F_NAME}" - done - - echo "execute: ${PYTHON_3_EXECUTABLE} ${SUBMIT_AND_WAIT_PY}" - echo "with qsub commands:" - cat "${TEMP_F_NAME}" - - "${PYTHON_3_EXECUTABLE}" "${SUBMIT_AND_WAIT_PY}" "${TEMP_F_NAME}" || return 1 - - rm "${TEMP_F_NAME}" || return 1 -} - -function prediction_step() { - echo - echo "prediction" - - cd "${RUN_DIR}" || return 1 - - local TEMP_F_NAME="$(mktemp)" || return 1 - - local DELTA_PSI_COLUMN='5' - local IEDB_DIR="${SCRIPT_DIR}/IEDB/mhc_i/src" - - # TODO --iedb-local should be required=True - IRIS prediction "${SCREENING_OUT_DIR}" -c "${DELTA_PSI_COLUMN}" -m "${MHC_LIST}"\ - -p "${SCREENING_PARAMS}" --iedb-local "${IEDB_DIR}"\ - > "${TEMP_F_NAME}" || return 1 - - cat "${TEMP_F_NAME}" || return 1 - - find_and_execute_qsub_commands_in_file "${TEMP_F_NAME}" || return 1 - rm "${TEMP_F_NAME}" || return 1 -} - -function epitope_post_step() { - echo - echo "epitope_post" - - cd "${RUN_DIR}" || return 1 - - # TODO -e is actually not required? 
- IRIS epitope_post -p "${SCREENING_PARAMS}" -o "${SCREENING_OUT_DIR}"\ - -m "${MHC_BY_SAMPLE}" || return 1 -} - -function screening_plot_step() { - echo - echo "screening_plot" - - cd "${RUN_DIR}" || return 1 - - local IN_PREFIX="${SCREENING_OUT_DIR}/${DATA_SET_NAME}" - local PRIMARY_IN="${IN_PREFIX}.primary.txt" - local PRIORITIZED_IN="${IN_PREFIX}.prioritized.txt" - - local OUT_PREFIX="${RESULT_DIR}/violin" - local PRIMARY_OUT="${OUT_PREFIX}_primary" - local PRIORITIZED_OUT="${OUT_PREFIX}_prioritized" - - cut -f 1 "${PRIMARY_IN}" | tail -n +2 > "${PRIMARY_OUT}" || return 1 - cut -f 1 "${PRIORITIZED_IN}" | tail -n +2 > "${PRIORITIZED_OUT}" || return 1 - - IRIS screening_plot "${PRIMARY_OUT}" -p "${SCREENING_PARAMS}" || return 1 - IRIS screening_plot "${PRIORITIZED_OUT}" -p "${SCREENING_PARAMS}" || return 1 -} - -function set_python3_executable() { - # Need to use Python3 for submit_qsub_and_wait.py. - # Also need to have the Python2 conda environment to run IRIS. - # Get the Python3 path and then go back to the Python2 environment - conda::activate_env "${CONDA_ENV_NAME_3}" || return 1 - - PYTHON_3_EXECUTABLE="$(which python)" || return 1 - - conda::deactivate_env || return 1 -} - -function main() { - source set_env_vars.sh || return 1 - source conda.sh || return 1 - - SCRIPT_DIR="$(pwd)" - export PATH="${PATH}:${SCRIPT_DIR}/bedtools/bedtools2/bin" - - set_python3_executable || return 1 - parse_arguments "$@" || return 1 - pre_process_inputs || return 1 - - conda::activate_env "${CONDA_ENV_NAME_2}" || return 1 - - formatting_step || return 1 - screening_step || return 1 - prediction_step || return 1 - epitope_post_step || return 1 - screening_plot_step || return 1 - - conda::deactivate_env || return 1 -} - -main "$@" diff --git a/scripts/check_read_lengths.py b/scripts/check_read_lengths.py new file mode 100644 index 0000000..254c168 --- /dev/null +++ b/scripts/check_read_lengths.py @@ -0,0 +1,57 @@ +import argparse +import os +import os.path + + +def parse_args(): + parser = argparse.ArgumentParser( + description=('determine the set of read lengths based on' + ' the rmats output file names')) + parser.add_argument( + '--parent-dir', + required=True, + help='path of directory which contains 1 directory per read length') + parser.add_argument('--run-name', + required=True, + help='prefix used to name output files') + parser.add_argument('--out', + required=True, + help='path to write read lengths') + + args = parser.parse_args() + return args + + +def check_read_lengths(parent_dir, run_name, out): + file_names = os.listdir(parent_dir) + prefix = '{}.RL'.format(run_name) + read_lengths = list() + for file_name in file_names: + file_path = os.path.join(parent_dir, file_name) + if not (os.path.isdir(file_path) and file_name.startswith(prefix)): + continue + + suffix = file_name[len(prefix):] + try: + read_length = int(suffix) + except ValueError: + print('ignoring: {}'.format(file_path)) + continue + + read_lengths.append(suffix) + + if not read_lengths: + raise Exception('no read lengths found in {}'.format(parent_dir)) + + with open(out, 'wt') as out_handle: + for read_length in read_lengths: + out_handle.write('{}\n'.format(read_length)) + + +def main(): + args = parse_args() + check_read_lengths(args.parent_dir, args.run_name, args.out) + + +if __name__ == '__main__': + main() diff --git a/scripts/count_iris_predict_tasks.py b/scripts/count_iris_predict_tasks.py new file mode 100644 index 0000000..caf34f5 --- /dev/null +++ b/scripts/count_iris_predict_tasks.py @@ -0,0 +1,62 @@ +import 
argparse +import os +import os.path + + +def parse_args(): + parser = argparse.ArgumentParser( + description=('find which tasks were created by IRIS predict')) + parser.add_argument('--out-list', + required=True, + help='path to write a file listing all created tasks') + parser.add_argument( + '--task-dir', + required=True, + help='directory where task files are expected to be found') + parser.add_argument( + '--splice-type', + required=True, + help='alternative splicing event type (expected in task file name)') + + args = parser.parse_args() + return args + + +def count_iris_predict_tasks(out_list, task_dir, splice_type): + file_names = os.listdir(task_dir) + task_paths = list() + base_prefix = 'pep2epitope_{}.'.format(splice_type) + tiers = ['tier1', 'tier2tier3'] + prefixes = ['{}{}.'.format(base_prefix, tier) for tier in tiers] + suffix = '.sh' + for name in file_names: + file_path = os.path.join(task_dir, name) + if not name.endswith(suffix): + continue + + for prefix in prefixes: + if name.startswith(prefix): + number_string = name[len(prefix):-len(suffix)] + try: + int(number_string) + except ValueError: + raise Exception('unexpected file: {}'.format(file_path)) + + task_paths.append(file_path) + continue + + if not task_paths: + raise Exception('could not find any predict tasks') + + with open(out_list, 'wt') as out_handle: + for task_path in task_paths: + out_handle.write('{}\n'.format(task_path)) + + +def main(): + args = parse_args() + count_iris_predict_tasks(args.out_list, args.task_dir, args.splice_type) + + +if __name__ == '__main__': + main() diff --git a/scripts/prepare_iris_exp_matrix.py b/scripts/prepare_iris_exp_matrix.py new file mode 100644 index 0000000..bfc0b93 --- /dev/null +++ b/scripts/prepare_iris_exp_matrix.py @@ -0,0 +1,31 @@ +import argparse + + +def parse_args(): + parser = argparse.ArgumentParser( + description=('write input file for IRIS exp_matrix')) + parser.add_argument('--out-manifest', + required=True, + help='path to write the list of gene expression files') + parser.add_argument('--fpkm-files', + required=True, + nargs='+', + help='the fpkm files from cufflinks') + + args = parser.parse_args() + return args + + +def prepare_iris_exp_matrix(out_manifest, fpkm_files): + with open(out_manifest, 'wt') as out_handle: + for file_name in fpkm_files: + out_handle.write('{}\n'.format(file_name)) + + +def main(): + args = parse_args() + prepare_iris_exp_matrix(args.out_manifest, args.fpkm_files) + + +if __name__ == '__main__': + main() diff --git a/scripts/prepare_iris_format.py b/scripts/prepare_iris_format.py new file mode 100644 index 0000000..5c2c4bc --- /dev/null +++ b/scripts/prepare_iris_format.py @@ -0,0 +1,57 @@ +import argparse +import os +import os.path + + +def parse_args(): + parser = argparse.ArgumentParser( + description=('write input files for IRIS format')) + parser.add_argument('--matrix-out', + required=True, + help='path to write the list of matrix directories') + parser.add_argument('--sample-out', + required=True, + help='path to write the list of BAM lists') + parser.add_argument('--summaries', + required=True, + nargs='+', + help='the summary files from the matrix directories') + + args = parser.parse_args() + return args + + +def prepare_iris_format(matrix_out, sample_out, summaries): + with open(matrix_out, 'wt') as matrix_out_handle: + with open(sample_out, 'wt') as sample_out_handle: + prepare_iris_format_with_handles(matrix_out_handle, + sample_out_handle, summaries) + + +def prepare_iris_format_with_handles(matrix_out_handle, 
sample_out_handle, + summaries): + for summary in summaries: + matrix_dir_path = os.path.dirname(summary) + matrix_dir_name = os.path.basename(matrix_dir_path) + matrix_dir_name_suffix = '.matrix' + if not matrix_dir_name.endswith(matrix_dir_name_suffix): + raise Exception('unexpected directory name for {}'.format(summary)) + + matrix_dir_name_prefix = matrix_dir_name[:-len(matrix_dir_name_suffix)] + matrix_dir_parent_dir_path = os.path.dirname(matrix_dir_path) + sample_list_name = '{}_rmatspost_list.txt'.format( + matrix_dir_name_prefix) + sample_path = os.path.join(matrix_dir_parent_dir_path, + sample_list_name) + + matrix_out_handle.write('{}\n'.format(matrix_dir_path)) + sample_out_handle.write('{}\n'.format(sample_path)) + + +def main(): + args = parse_args() + prepare_iris_format(args.matrix_out, args.sample_out, args.summaries) + + +if __name__ == '__main__': + main() diff --git a/scripts/prepare_iris_sjc_matrix.py b/scripts/prepare_iris_sjc_matrix.py new file mode 100644 index 0000000..73eec09 --- /dev/null +++ b/scripts/prepare_iris_sjc_matrix.py @@ -0,0 +1,31 @@ +import argparse + + +def parse_args(): + parser = argparse.ArgumentParser( + description=('write input file for IRIS sjc_matrix')) + parser.add_argument('--sj-out', + required=True, + help='path to write the list of SJ count files') + parser.add_argument('--sj-files', + required=True, + nargs='+', + help='the SJ count files from extract_sjc') + + args = parser.parse_args() + return args + + +def prepare_iris_sjc_matrix(sj_out, sj_files): + with open(sj_out, 'wt') as out_handle: + for file_name in sj_files: + out_handle.write('{}\n'.format(file_name)) + + +def main(): + args = parse_args() + prepare_iris_sjc_matrix(args.sj_out, args.sj_files) + + +if __name__ == '__main__': + main() diff --git a/scripts/write_param_file.py b/scripts/write_param_file.py new file mode 100644 index 0000000..9c237ba --- /dev/null +++ b/scripts/write_param_file.py @@ -0,0 +1,231 @@ +import argparse +import os +import os.path + + +def parse_args(): + parser = argparse.ArgumentParser( + description=('write the parameter file for IRIS screen')) + parser.add_argument('--out-path', + required=True, + help='path to write parameter file') + parser.add_argument('--group-name', + required=True, + help='name to use for sub directory in IRIS_data/db/') + parser.add_argument('--iris-db', + required=True, + help='/path/to/IRIS_data/db') + parser.add_argument( + '--psi-p-value-cutoffs', + required=True, + help=('comma separated p-value cutoffs for PSI-based statistical tests' + ' (tissue-matched normal, tumor, normal)')) + parser.add_argument( + '--sjc-p-value-cutoffs', + required=True, + help=('comma separated p-value cutoffs for SJC-based statistical tests' + ' (tissue-matched normal, tumor, normal)')) + parser.add_argument('--delta-psi-cutoffs', + required=True, + help=('comma separated minimum required delta PSIs' + ' (tissue-matched normal, tumor, normal)')) + parser.add_argument('--fold-change-cutoffs', + required=True, + help=('comma separated minimum required fold changes' + ' (tissue-matched normal, tumor, normal)')) + parser.add_argument( + '--group-count-cutoffs', + required=True, + help=('comma separated minimum counts of reference groups that' + ' need to meet other requirements' + ' (tissue-matched normal, tumor, normal)')) + parser.add_argument( + '--reference-names-tissue-matched-normal', + required=True, + help='comma separated reference groups for tissue-matched normal') + parser.add_argument('--reference-names-tumor', + required=True, + 
help='comma separated reference groups for tumor')
+    parser.add_argument('--reference-names-normal',
+                        required=True,
+                        help='comma separated reference groups for normal')
+    parser.add_argument('--comparison-mode',
+                        required=True,
+                        choices=['group', 'individual'],
+                        help=('mode for statistical test'
+                              ' (group requires at least 2 input samples)'))
+    parser.add_argument('--statistical-test-type',
+                        required=True,
+                        choices=['parametric', 'nonparametric'],
+                        help='type of statistical test')
+    parser.add_argument('--use-ratio',
+                        action='store_true',
+                        help='use ratio instead of count for group cutoffs')
+    parser.add_argument('--blacklist-file', help='list of AS events to remove')
+    parser.add_argument('--mapability-bigwig',
+                        help='allows evaluation of splice region mappability')
+    parser.add_argument('--reference-genome',
+                        help='required for IRIS translate')
+
+    args = parser.parse_args()
+    check_file_exists(args.blacklist_file, parser)
+    check_file_exists(args.mapability_bigwig, parser)
+    check_file_exists(args.reference_genome, parser)
+
+    args.psi_p_value_cutoffs = parse_floats(args.psi_p_value_cutoffs)
+    args.sjc_p_value_cutoffs = parse_floats(args.sjc_p_value_cutoffs)
+    args.delta_psi_cutoffs = parse_floats(args.delta_psi_cutoffs)
+    args.fold_change_cutoffs = parse_floats(args.fold_change_cutoffs)
+    args.group_count_cutoffs = parse_floats(args.group_count_cutoffs)
+
+    if not (1 <= len(args.psi_p_value_cutoffs) <= 3):
+        parser.error('must give 1 to 3 cutoffs')
+
+    expected_len = len(args.psi_p_value_cutoffs)
+    expected_non_none = [x is not None for x in args.psi_p_value_cutoffs]
+    if not (expected_non_none[0] or
+            ((expected_len == 3) and expected_non_none[2])):
+        parser.error('must provide values for at least one of'
+                     ' tissue-matched-normal or normal')
+
+    for name, values in [('sjc_p_value_cutoffs', args.sjc_p_value_cutoffs),
+                         ('delta_psi_cutoffs', args.delta_psi_cutoffs),
+                         ('fold_change_cutoffs', args.fold_change_cutoffs),
+                         ('group_count_cutoffs', args.group_count_cutoffs)]:
+        if len(values) != expected_len:
+            parser.error('{} has len {}, but expected {}'.format(
+                name, len(values), expected_len))
+
+        for i, value in enumerate(values):
+            is_non_none = value is not None
+            if is_non_none != expected_non_none[i]:
+                expected = 'non-None' if expected_non_none[i] else 'None'
+                parser.error('{} value {} was {} when {} was expected'.format(
+                    name, i, value, expected))
+
+    db_names = get_db_names(args.iris_db, parser)
+    args.reference_names_tissue_matched_normal = parse_reference_names(
+        args.reference_names_tissue_matched_normal, args.group_count_cutoffs,
+        0, 'tissue-matched-normal', db_names, args.use_ratio, parser)
+    args.reference_names_tumor = parse_reference_names(
+        args.reference_names_tumor, args.group_count_cutoffs, 1, 'tumor',
+        db_names, args.use_ratio, parser)
+    args.reference_names_normal = parse_reference_names(
+        args.reference_names_normal, args.group_count_cutoffs, 2, 'normal',
+        db_names, args.use_ratio, parser)
+
+    return args
+
+
+def get_db_names(db_path, parser):
+    if not os.path.exists(db_path):
+        parser.error('{} does not exist'.format(db_path))
+    if not os.path.isdir(db_path):
+        parser.error('{} is not a directory'.format(db_path))
+
+    db_names = list()
+    dir_entries = os.listdir(db_path)
+    for db_name in dir_entries:
+        full_path = os.path.join(db_path, db_name)
+        if os.path.isdir(full_path):
+            db_names.append(db_name)
+
+    return db_names
+
+
+def parse_floats(floats_str):
+    parts = floats_str.split(',')
+    stripped = [x.strip() for x in parts]
+    results = 
list() + for string in stripped: + if len(string) != 0: + float_value = float(string) + results.append(float_value) + else: + results.append(None) + + return results + + +def parse_reference_names(names_str, group_cutoffs, cutoff_index, group_name, + db_names, use_ratio, parser): + if len(group_cutoffs) <= cutoff_index: + parser.error( + 'missing list of reference groups for {}'.format(group_name)) + + cutoff = group_cutoffs[cutoff_index] + parts = names_str.split(',') + stripped = [x.strip() for x in parts] + if cutoff is None: + return list() # this group is skipped + + if use_ratio: + if not (0 <= cutoff <= 1): + parser.error('cutoff for {} was {} with use_ratio'.format( + group_name, cutoff)) + elif cutoff > len(stripped): + parser.error('{} cutoff is {}, but only {} references'.format( + group_name, cutoff, len(stripped))) + + for name in stripped: + if name not in db_names: + parser.error('reference {} in {} not found in db/'.format( + name, group_name)) + + return stripped + + +def check_file_exists(file_path, parser): + if file_path is None: + return + + if not os.path.isfile(file_path): + parser.error('{} does not exist'.format(file_path)) + + +def write_file_line_or_empty_line(out_handle, maybe_file): + if maybe_file: + out_handle.write('{}\n'.format(maybe_file)) + else: + out_handle.write('\n') + + +def write_param_file(args): + with open(args.out_path, 'wt') as out_handle: + out_handle.write('{}\n'.format(args.group_name)) + abs_db_path = os.path.abspath(args.iris_db) + out_handle.write('{}\n'.format(abs_db_path)) + references_by_i = [ + args.reference_names_tissue_matched_normal, + args.reference_names_tumor, args.reference_names_normal + ] + for i, psi_cutoff in enumerate(args.psi_p_value_cutoffs): + if psi_cutoff is None: + out_handle.write('\n') + continue + + cutoffs = [ + psi_cutoff, args.delta_psi_cutoffs[i], + args.fold_change_cutoffs[i], args.sjc_p_value_cutoffs[i], + args.group_count_cutoffs[i] + ] + cutoffs = [str(x) for x in cutoffs] + references = references_by_i[i] + out_handle.write('{} {}\n'.format(','.join(cutoffs), + ','.join(references))) + + out_handle.write('{} {}\n'.format(args.comparison_mode, + args.statistical_test_type)) + out_handle.write('{}\n'.format('True' if args.use_ratio else 'False')) + write_file_line_or_empty_line(out_handle, args.blacklist_file) + write_file_line_or_empty_line(out_handle, args.mapability_bigwig) + write_file_line_or_empty_line(out_handle, args.reference_genome) + + +def main(): + args = parse_args() + write_param_file(args) + + +if __name__ == '__main__': + main() diff --git a/set_env_vars.sh b/set_env_vars.sh index 197d0e4..097032e 100644 --- a/set_env_vars.sh +++ b/set_env_vars.sh @@ -1,9 +1,24 @@ #!/bin/bash # # Set environment variables used by other scripts +# +function set_conda_env_prefixes() { + local ORIG_DIR="$(pwd)" || return 1 + + local REL_SCRIPT_DIR="$(dirname ${BASH_SOURCE[0]})" || return 1 + cd "${REL_SCRIPT_DIR}" || return 1 + SCRIPT_DIR="$(pwd)" || return 1 + + cd "${ORIG_DIR}" || return 1 + + CONDA_ENV_PREFIX_2="${SCRIPT_DIR}/conda_env_2" + CONDA_ENV_PREFIX_3="${SCRIPT_DIR}/conda_env_3" +} -CONDA_ENV_NAME_2="iris-2" -CONDA_PYTHON_VERSION_2="2.7" +function main() { + # need to use the setup that conda init writes to .bashrc + source "${HOME}/.bashrc" || return 1 + set_conda_env_prefixes || return 1 +} -CONDA_ENV_NAME_3="iris-3" -CONDA_PYTHON_VERSION_3="3.6" +main "$@" diff --git a/setup.py b/setup.py index e08f81a..ab8f1c0 100644 --- a/setup.py +++ b/setup.py @@ -7,7 +7,7 @@ def main(): setup( name='IRIS', 
- version='1.0.0', + version='2.0.0', description='Isoform peptides from RNA splicing for Immunotherapy target Screening', @@ -24,7 +24,7 @@ def main(): include_package_data=True, package_data={'IRIS.data':[ - 'brain_blacklistMay.txt', + 'blacklist.brain_2020.txt', 'features.uniprot2gtf.ExtraCell.txt', 'UniprotENSGmap.txt', 'uniprot2gtf.blastout.uniprotAll.txt', diff --git a/snakemake_config.yaml b/snakemake_config.yaml new file mode 100644 index 0000000..faa1b2e --- /dev/null +++ b/snakemake_config.yaml @@ -0,0 +1,111 @@ +# Resource allocation +create_star_index_threads: 4 +create_star_index_mem_gb: 40 +create_star_index_time_hr: 12 +iris_append_sjc_mem_gb: 8 +iris_append_sjc_time_hr: 24 +# TODO 16 threads hardcoded in iris process_rnaseq +iris_cuff_task_threads: 8 +iris_cuff_task_mem_gb: 8 +iris_cuff_task_time_hr: 12 +iris_epitope_post_mem_gb: 8 +iris_epitope_post_time_hr: 12 +iris_exp_matrix_mem_gb: 8 +iris_exp_matrix_time_hr: 12 +iris_extract_sjc_task_mem_gb: 8 +iris_extract_sjc_task_time_hr: 12 +iris_format_mem_gb: 8 +iris_format_time_hr: 12 +# TODO seq2HLA defaults to 6 threads since IRIS does not supply the -p argument +iris_hla_task_threads: 6 +iris_hla_task_mem_gb: 8 +iris_hla_task_time_hr: 12 +iris_parse_hla_mem_gb: 8 +iris_parse_hla_time_hr: 12 +iris_predict_mem_gb: 8 +iris_predict_time_hr: 12 +iris_predict_task_mem_gb: 8 +iris_predict_task_time_hr: 12 +# TODO 8 hardcoded in makesubsh_rmats +iris_rmats_task_threads: 8 +iris_rmats_task_mem_gb: 8 +iris_rmats_task_time_hr: 12 +# TODO 8 hardcoded in makesubsh_rmatspost +iris_rmatspost_task_threads: 8 +iris_rmatspost_task_mem_gb: 8 +iris_rmatspost_task_time_hr: 12 +iris_screen_mem_gb: 8 +iris_screen_time_hr: 12 +iris_screen_sjc_mem_gb: 8 +iris_screen_sjc_time_hr: 12 +iris_sjc_matrix_mem_gb: 8 +iris_sjc_matrix_time_hr: 12 +# TODO 6 threads hardcoded in iris process_rnaseq +iris_star_task_threads: 6 +iris_star_task_mem_gb: 40 +iris_star_task_time_hr: 12 +iris_visual_summary_mem_gb: 8 +iris_visual_summary_time_hr: 12 +# Command options +run_core_modules: true +# run_all_modules toggles which rules can be run by +# conditionally adding UNSATISFIABLE_INPUT to certain rules. 
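+# (a rule whose declared input includes a file that never exists can never
+# be scheduled, so listing UNSATISFIABLE_INPUT effectively disables the rule)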
+run_all_modules: false
+should_run_sjc_steps: true
+star_sjdb_overhang: 100
+run_name: 'NEPC_test' # used to name output files
+splice_event_type: 'SE' # one of [SE, RI, A3SS, A5SS]
+comparison_mode: 'group' # group or individual
+stat_test_type: 'parametric' # parametric or nonparametric
+use_ratio: false
+tissue_matched_normal_psi_p_value_cutoff: ''
+tissue_matched_normal_sjc_p_value_cutoff: ''
+tissue_matched_normal_delta_psi_p_value_cutoff: ''
+tissue_matched_normal_fold_change_cutoff: ''
+tissue_matched_normal_group_count_cutoff: ''
+tissue_matched_normal_reference_group_names: ''
+tumor_psi_p_value_cutoff: ''
+tumor_sjc_p_value_cutoff: ''
+tumor_delta_psi_p_value_cutoff: ''
+tumor_fold_change_cutoff: ''
+tumor_group_count_cutoff: ''
+tumor_reference_group_names: ''
+normal_psi_p_value_cutoff: '0.01'
+normal_sjc_p_value_cutoff: '0.000001'
+normal_delta_psi_p_value_cutoff: '0.05'
+normal_fold_change_cutoff: '1'
+normal_group_count_cutoff: '8'
+normal_reference_group_names: 'GTEx_Heart,GTEx_Blood,GTEx_Lung,GTEx_Liver,GTEx_Brain,GTEx_Nerve,GTEx_Muscle,GTEx_Spleen,GTEx_Thyroid,GTEx_Skin,GTEx_Kidney'
+# Input files
+# sample_fastqs are not needed when just running the core modules
+# sample_fastqs:
+#   sample_name_1:
+#     - '/path/to/sample_1_read_1.fq'
+#     - '/path/to/sample_1_read_2.fq'
+#   sample_name_2:
+#     - '/path/to/sample_2_read_1.fq'
+#     - '/path/to/sample_2_read_2.fq'
+blacklist: ''
+mapability_bigwig: '/path/to/IRIS_data/resources/mappability/wgEncodeCrgMapabilityAlign24mer.bigWig'
+mhc_list: '/path/to/example/hla_types_test.list'
+mhc_by_sample: '/path/to/example/hla_patient_test.tsv'
+gene_exp_matrix: ''
+splice_matrix_txt: '/path/to/example/splicing_matrix/splicing_matrix.SE.cov10.NEPC_example.txt'
+splice_matrix_idx: '/path/to/example/splicing_matrix/splicing_matrix.SE.cov10.NEPC_example.txt.idx'
+sjc_count_txt: '/path/to/example/sjc_matrix/SJ_count.NEPC_example.txt'
+sjc_count_idx: '/path/to/example/sjc_matrix/SJ_count.NEPC_example.txt.idx'
+# Reference files
+gtf_name: 'gencode.v26lift37.annotation.gtf'
+fasta_name: 'ucsc.hg19.fasta'
+reference_files:
+  gencode.v26lift37.annotation.gtf.gz:
+    url: 'ftp://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_26/GRCh37_mapping/gencode.v26lift37.annotation.gtf.gz'
+  ucsc.hg19.fasta.gz:
+    url: 'http://hgdownload.soe.ucsc.edu/goldenPath/hg19/bigZips/hg19.fa.gz'
+# Additional configuration
+# rmats_path: '/path/to/conda_env_2/bin/rmats.py' # should be written by ./install
+# conda_wrapper: '/path/to/conda_wrapper' # should be written by ./install
+# conda_env_2: '/path/to/conda_env_2' # should be written by ./install
+# conda_env_3: '/path/to/conda_env_3' # should be written by ./install
+# iris_data: '/path/to/IRIS_data' # should be written by ./install
+# iedb_path: '/path/to/IEDB/mhc_i/src' # should be written by ./install
diff --git a/snakemake_profile/.gitignore b/snakemake_profile/.gitignore
new file mode 100644
index 0000000..6fdf14e
--- /dev/null
+++ b/snakemake_profile/.gitignore
@@ -0,0 +1 @@
+/job_resource_usage/
diff --git a/snakemake_profile/cluster_commands.py b/snakemake_profile/cluster_commands.py
new file mode 100644
index 0000000..f89f34e
--- /dev/null
+++ b/snakemake_profile/cluster_commands.py
@@ -0,0 +1,384 @@
+import os
+import os.path
+
+
+def submit_command(log_out, log_err, threads, time_hours, mem_mb,
+                   mem_mb_per_thread, gpus, gpu_name, jobscript):
+    # sbatch requires that the directories for the log files already exist
+    for log_path in [log_out, log_err]:
+        log_dir = os.path.dirname(log_path)
+        if log_dir 
!= '': + os.makedirs(log_dir, exist_ok=True) + + command = ['sbatch', '-o', log_out, '-e', log_err] + if threads: + command.extend(['-c', str(threads)]) + + if time_hours: + days, hours_float = divmod(time_hours, 24) + hours_whole, hours_part = divmod(hours_float, 1) + minutes = hours_part * 60 + time_str = '{}-{}:{}'.format(int(days), int(hours_whole), int(minutes)) + command.extend(['--time', time_str]) + + if mem_mb: + command.append('--mem={}M'.format(mem_mb)) + + if gpus: + gres_argument_base = '--gres=gpu' + if gpu_name: + gres_argument = '{}:{}:{}'.format(gres_argument_base, gpu_name, + gpus) + else: + gres_argument = '{}:{}'.format(gres_argument_base, gpus) + + command.extend(['-p', 'gpuq', gres_argument]) + + command.append(jobscript) + return command + + +def try_extract_job_id_from_submit_output(stdout): + tokens = stdout.split() + if len(tokens) < 4: + return None, 'expected at least 4 tokens' + + if (((tokens[0] != 'Submitted') or (tokens[1] != 'batch') + or (tokens[2] != 'job'))): + return None, 'expected output to look like "Submitted batch job ..."' + + try: + job_id = int(tokens[3]) + except ValueError as e: + return None, 'could not parse {} as an int: {}'.format(tokens[3], e) + + return job_id, None + + +def status_command(job_id): + status_fields = [ + 'ElapsedRaw', + 'End', + 'ExitCode', + 'JobIDRaw', + 'MaxDiskRead', + 'MaxDiskWrite', + 'MaxRss', + 'MaxVMSize', + 'Start', + 'State', + 'Submit', + 'TotalCPU', + ] + return [ + 'sacct', '--parsable', '-j', job_id, + '--format={}'.format(','.join(status_fields)) + ] + + +def try_extract_job_info_from_status_output(stdout, job_id): + rows, error = _parse_rows(stdout) + if error: + return None, error + + if not rows: + # the output may be empty if the job was submitted very recently + return {'status': 'running', 'resource_usage': None}, None + + parent_rows = list() + batch_rows = list() + other_rows = list() + for row in rows: + row_job_id, row_job_step = _get_job_id_and_step(row.get('JobIDRaw')) + if row_job_id != job_id: + continue + + if row_job_step is None: + parent_rows.append(row) + elif row_job_step == 'batch': + batch_rows.append(row) + else: + other_rows.append(row) + + usage = { + 'cpu': None, + 'end_time': None, + 'exit_code': None, + 'exit_signal': None, + 'max_disk_read': None, + 'max_disk_write': None, + 'max_rss': None, + 'max_vmem': None, + 'start_time': None, + 'state': None, + 'submit_time': None, + 'wallclock': None, + } + status, error = _update_from_parent_rows(parent_rows, usage) + if error: + return None, error + + status, error = _update_from_batch_rows(batch_rows, status, usage) + if error: + return None, error + + status, error = _update_from_other_rows(other_rows, status, usage) + if error: + return None, error + + if status is None: + return None, 'no status found' + + resource_usage = ('cpu: {cpu},' + ' end_time: {end_time},' + ' exit_code: {exit_code},' + ' exit_signal: {exit_signal},' + ' max_disk_read: {max_disk_read},' + ' max_disk_write: {max_disk_write},' + ' max_rss: {max_rss},' + ' max_vmem: {max_vmem},' + ' start_time: {start_time},' + ' state: {state},' + ' submit_time: {submit_time},' + ' wallclock: {wallclock}'.format(**usage)) + return {'status': status, 'resource_usage': resource_usage}, None + + +def _parse_rows(stdout): + lines = stdout.splitlines() + if not lines: + return list(), None + + header = lines[0] + header_cols = header.split('|') + rows = list() + for i, line in enumerate(lines[1:]): + row_cols = line.split('|') + if len(header_cols) != len(row_cols): + return 
None, 'row {} had {} columns but expected {}'.format( + i, len(row_cols), len(header_cols)) + + row = dict(zip(header_cols, row_cols)) + rows.append(row) + + return rows, None + + +def _get_job_id_and_step(job_id_raw): + job_id_sep_index = job_id_raw.find('.') + if job_id_sep_index <= 0: + return job_id_raw, None + + job_id_base = job_id_raw[:job_id_sep_index] + job_id_step = job_id_raw[job_id_sep_index + 1:] + return job_id_base, job_id_step + + +def _update_from_parent_rows(rows, usage): + if not rows: + return None, None + + if len(rows) > 1: + return None, 'expected at most 1 parent row' + + row = rows[0] + parsed_values = _parse_values(row) + # parent row is handled first. + # Add starting values which may be overwritten later + _add_if_not_none_keys(parsed_values, usage, [ + 'cpu', 'exit_code', 'exit_signal', 'wallclock', 'submit_time', + 'start_time', 'end_time', 'state', 'max_disk_read', 'max_disk_write', + 'max_rss', 'max_vmem' + ]) + + parsed_status = parsed_values.get('state_for_snakemake') + return parsed_status, None + + +def _update_from_batch_rows(rows, status, usage): + if not rows: + return status, None + + if len(rows) > 1: + return None, 'expected at most 1 batch row' + + row = rows[0] + parsed_values = _parse_values(row) + # the batch row seems to have more details for these fields + _overwrite_if_not_none_keys(parsed_values, usage, [ + 'cpu', 'exit_code', 'exit_signal', 'max_disk_read', 'max_disk_write', + 'max_rss', 'max_vmem' + ]) + # prefer the info from the parent row for these fields + _add_if_not_none_keys( + parsed_values, usage, + ['wallclock', 'submit_time', 'start_time', 'end_time', 'state']) + + # prefer the parent status + if status is None: + status = parsed_values.get('state_for_snakemake') + + return status, None + + +def _update_from_other_rows(rows, status, usage): + for row in rows: + parsed_values = _parse_values(row) + # use the "other" rows to fill in missing information + _add_if_not_none_keys(parsed_values, usage, [ + 'cpu', 'exit_code', 'exit_signal', 'wallclock', 'submit_time', + 'start_time', 'end_time', 'state', 'max_disk_read', + 'max_disk_write', 'max_rss', 'max_vmem' + ]) + + if status is None: + status = parsed_values.get('state_for_snakemake') + + return status, None + + +def _parse_values(row): + values = dict() + values['cpu'] = _parse_cpu_time(row) + exit_code, exit_signal = _parse_exit_code(row) + values['exit_code'] = exit_code + values['exit_signal'] = exit_signal + values['wallclock'] = _parse_wallclock(row) + values['submit_time'] = _parse_submit_time(row) + values['start_time'] = _parse_start_time(row) + values['end_time'] = _parse_end_time(row) + raw_state, state_for_snakemake = _parse_state(row) + values['state'] = raw_state + values['state_for_snakemake'] = state_for_snakemake + values['max_disk_read'] = _parse_max_disk_read(row) + values['max_disk_write'] = _parse_max_disk_write(row) + values['max_rss'] = _parse_max_rss(row) + values['max_vmem'] = _parse_max_vmem(row) + return values + + +def _parse_state(row): + raw = row.get('State') + if not raw: + return None, None + + # translate the slurm state into snakemake terms + for_snakemake = None + if ((raw.startswith('RUNNING') or raw.startswith('PENDING') + or raw.startswith('REQUEUED') or raw.startswith('RESIZING') + or raw.startswith('SUSPENDED'))): + for_snakemake = 'running' + elif raw.startswith('COMPLETED'): + for_snakemake = 'success' + else: + for_snakemake = 'failed' + + return raw, for_snakemake + + +def _parse_submit_time(row): + return _parse_datetime_col(row, 
'Submit') + + +def _parse_start_time(row): + return _parse_datetime_col(row, 'Start') + + +def _parse_end_time(row): + return _parse_datetime_col(row, 'End') + + +def _parse_datetime_col(row, col): + # yyyy-mm-ddThh:mm:ss + raw = row.get(col) + if not raw: + return None + + return raw + + +def _parse_cpu_time(row): + # 'mm:ss.millis' + raw = row.get('TotalCPU') + if not raw: + return None + + return raw + + +def _parse_wallclock(row): + # 'num_seconds' + raw = row.get('ElapsedRaw') + if not raw: + return None + + return raw + + +def _parse_max_disk_read(row): + return _parse_disk(row, 'MaxDiskRead') + + +def _parse_max_disk_write(row): + return _parse_disk(row, 'MaxDiskWrite') + + +def _parse_disk(row, col): + # '{float}M' + raw = row.get(col) + if not raw: + return None + + return raw + + +def _parse_max_rss(row): + return _parse_mem(row, 'MaxRss') + + +def _parse_max_vmem(row): + return _parse_mem(row, 'MaxVMSize') + + +def _parse_mem(row, col): + # '{int}K' + raw = row.get(col) + if not raw: + return None + + return raw + + +def _parse_exit_code(row): + # 'exitcode:signal_num' + raw = row.get('ExitCode') + if not raw: + return None, None + + splits = raw.split(':') + if len(splits) != 2: + return raw, None + + return splits[0], splits[1] + + +def _overwrite_if_not_none_keys(source, dest, keys): + for key in keys: + value = source.get(key) + _overwrite_if_not_none(key, value, dest) + + +def _overwrite_if_not_none(key, value, dest): + if value is not None: + dest[key] = value + + +def _add_if_not_none_keys(source, dest, keys): + for key in keys: + value = source.get(key) + _add_if_not_none(key, value, dest) + + +def _add_if_not_none(key, value, dest): + if value is not None and dest.get(key) is None: + dest[key] = value diff --git a/snakemake_profile/cluster_commands_sge.py b/snakemake_profile/cluster_commands_sge.py new file mode 100644 index 0000000..5ce35ac --- /dev/null +++ b/snakemake_profile/cluster_commands_sge.py @@ -0,0 +1,132 @@ +import bs4 + + +def submit_command(log_out, log_err, threads, time_hours, mem_mb, + mem_mb_per_thread, gpus, gpu_name, jobscript): + command = ['qsub', '-o', log_out, '-e', log_err] + if threads: + command.extend(['-pe', 'smp', str(threads)]) + + if mem_mb_per_thread: + command.extend(['-l', 'h_vmem={}M'.format(mem_mb_per_thread)]) + + command.append(jobscript) + return command + + +def try_extract_job_id_from_submit_output(stdout): + tokens = stdout.split() + if len(tokens) < 3: + return None, 'expected at least 3 tokens' + + if (tokens[0] != 'Your') or (tokens[1] != 'job'): + return None, 'expected output to look like "Your job ..."' + + try: + job_id = int(tokens[2]) + except ValueError as e: + return None, 'could not parse {} as an int: {}'.format(tokens[2], e) + + return job_id, None + + +def status_command(job_id): + return ['qstat', '-j', job_id, '-xml'] + + +def try_extract_job_info_from_status_output(stdout, job_id): + info = {'status': None, 'resource_usage': None} + soup = bs4.BeautifulSoup(stdout, 'html.parser') + + unks = soup.find_all('unknown_jobs') + if unks: + names = unks[0].find_all('st_name') + if names: + if names[0].string == job_id: + info['status'] = 'success' + return info, None + + djobs = soup.find_all('djob_info') + if djobs: + numbers = djobs[0].find_all('jb_job_number') + if numbers: + if numbers[0].string == job_id: + info['status'] = 'running' + info['resource_usage'] = _extract_resource_usage(djobs[0]) + return info, None + + return None, 'unexpected output' + + +def _extract_resource_usage(soup): + resources = 
_extract_resource_usage_components(soup) + return ('wallclock: {wallclock}, cpu: {cpu}, io_wait: {io_wait},' + ' max_vmem: {max_vmem}, max_rss: {max_rss}, vmem: {vmem},' + ' rss: {rss}'.format(**resources)) + + +def _extract_resource_usage_components(soup): + resources = { + 'wallclock': None, + 'cpu': None, + 'io_wait': None, + 'vmem': None, + 'max_vmem': None, + 'rss': None, + 'max_rss': None, + } + usage_list = soup.find_all('jat_scaled_usage_list') + if not usage_list: + return resources + + wallclock_float = _extract_ua_by_name(usage_list[0], 'wallclock') + if wallclock_float: + resources['wallclock'] = '{:.2f}s'.format(wallclock_float) + + cpu_float = _extract_ua_by_name(usage_list[0], 'cpu') + if cpu_float: + resources['cpu'] = '{:.2f}s'.format(cpu_float) + + io_wait_float = _extract_ua_by_name(usage_list[0], 'iow') + if io_wait_float: + resources['io_wait'] = '{:.2f}s'.format(io_wait_float) + + bytes_per_gb = 1024**3 + vmem_float = _extract_ua_by_name(usage_list[0], 'vmem') + if vmem_float: + resources['vmem'] = '{:.2f}GB'.format(vmem_float / bytes_per_gb) + + max_vmem_float = _extract_ua_by_name(usage_list[0], 'maxvmem') + if max_vmem_float: + resources['max_vmem'] = '{:.2f}GB'.format(max_vmem_float / + bytes_per_gb) + + rss_float = _extract_ua_by_name(usage_list[0], 'rss') + if rss_float: + resources['rss'] = '{:.2f}GB'.format(rss_float / bytes_per_gb) + + max_rss_float = _extract_ua_by_name(usage_list[0], 'maxrss') + if max_rss_float: + resources['max_rss'] = '{:.2f}GB'.format(max_rss_float / bytes_per_gb) + + return resources + + +def _extract_ua_by_name(soup, ua_name): + name_node = soup.find_all('ua_name', string=ua_name) + if not name_node: + return None + + value_node = name_node[0].find_next_siblings('ua_value') + if not value_node: + return None + + value_str = value_node[0].string + return _try_parse_float(value_str) + + +def _try_parse_float(s): + try: + return float(s) + except ValueError: + return None diff --git a/snakemake_profile/cluster_status.py b/snakemake_profile/cluster_status.py new file mode 100644 index 0000000..1af63b5 --- /dev/null +++ b/snakemake_profile/cluster_status.py @@ -0,0 +1,127 @@ +import argparse +import datetime +import os.path +import sys +import time + +import cluster_commands +import try_command + + +def parse_args(): + parser = argparse.ArgumentParser() + parser.add_argument( + 'job_id', help='the cluster id of the job to check the status of') + parser.add_argument( + '--retry-status-interval-seconds', + default='', + help='a "," separated list of integers representing' + ' the number of seconds to wait after sequential failed' + ' job status commands before retrying') + parser.add_argument( + '--resource-usage-dir', + help='a directory for storing the file paths where the resource usage' + ' of each job should be logged') + parser.add_argument( + '--resource-usage-min-interval', + type=float, + default=120, + help='only log the resource usage if it has been at least this many' + ' seconds since the last log') + args = parser.parse_args() + + retry_status_interval_seconds = list() + for int_str in args.retry_status_interval_seconds.split(','): + retry_status_interval_seconds.append(int(int_str)) + + return { + 'job_id': args.job_id, + 'retry_status_interval_seconds': retry_status_interval_seconds, + 'resource_usage_dir': args.resource_usage_dir, + 'resource_usage_min_interval': args.resource_usage_min_interval, + } + + +def extract_job_info(stdout, job_id): + info, error = cluster_commands.try_extract_job_info_from_status_output( + stdout, 
diff --git a/snakemake_profile/cluster_submit.py b/snakemake_profile/cluster_submit.py
new file mode 100644
index 0000000..8f0c1e5
--- /dev/null
+++ b/snakemake_profile/cluster_submit.py
@@ -0,0 +1,149 @@
+import argparse
+import math
+import os
+import os.path
+import sys
+
+from snakemake.utils import read_job_properties
+
+import cluster_commands
+import try_command
+
+
+def parse_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument('jobscript',
+                        help='the script to be executed on the cluster')
+    parser.add_argument(
+        '--retry-submit-interval-seconds',
+        default='',
+        help='a "," separated list of integers representing'
+        ' the number of seconds to wait after sequential failed'
+        ' job submission commands before retrying')
+    parser.add_argument(
+        '--resource-usage-dir',
+        help='a directory for storing the file paths where cluster_status.py'
+        ' should log the resource usage of each job')
+    args = parser.parse_args()
+    jobscript = args.jobscript
+
+    job_properties = read_job_properties(jobscript)
+
+    # The empty-string default would otherwise crash on int(''): only
+    # parse the intervals if the flag was actually given.
+    retry_submit_interval_seconds = list()
+    if args.retry_submit_interval_seconds:
+        for int_str in args.retry_submit_interval_seconds.split(','):
+            retry_submit_interval_seconds.append(int(int_str))
+
+    resource_usage_dir = args.resource_usage_dir
+    if resource_usage_dir:
+        os.makedirs(resource_usage_dir, exist_ok=True)
+
+    return {
+        'jobscript': jobscript,
+        'job_properties': job_properties,
+        'retry_submit_interval_seconds': retry_submit_interval_seconds,
+        'resource_usage_dir': resource_usage_dir,
+    }
+
+
+def get_base_path_from_jobscript(jobscript):
+    # Recover the working directory from the 'cd <path> && \' line that
+    # snakemake writes into the generated jobscript.
+    with open(jobscript) as f_handle:
+        for line in f_handle:
+            tokens = line.split()
+            if len(tokens) == 4:
+                if (tokens[0] == 'cd' and tokens[2] == '&&'
+                        and tokens[3] == '\\'):
+                    base_path = tokens[1]
+                    if os.path.isdir(base_path):
+                        return os.path.abspath(base_path)
+
+    return None
+
+
+def get_cluster_log_paths(jobscript, job_properties):
+    cluster_logs = {'out': os.devnull, 'err': os.devnull, 'usage': os.devnull}
+    base_path = get_base_path_from_jobscript(jobscript)
+    orig_logs = job_properties.get('log')
+    if (not base_path) or (not orig_logs):
+        return cluster_logs
+
+    if len(orig_logs) == 1:
+        orig_log_out = orig_logs[0]
+        orig_log_err = orig_log_out
+    else:
+        orig_log_out = orig_logs[0]
+        orig_log_err = orig_logs[1]
+
+    cluster_logs['out'] = os.path.join(base_path,
+                                       '{}.cluster.out'.format(orig_log_out))
+    cluster_logs['err'] = os.path.join(base_path,
+                                       '{}.cluster.err'.format(orig_log_err))
+    cluster_logs['usage'] = os.path.join(
+        base_path, '{}.cluster.usage'.format(orig_log_out))
+    return cluster_logs
+
+
+def build_submit_command(jobscript, job_properties, cluster_log_out,
+                         cluster_log_err):
+    threads = job_properties.get('threads')
+    resources = job_properties.get('resources')
+    # Initialize all resource values so they are defined even when the
+    # job declares no resources.
+    time_hours = None
+    mem_mb = None
+    mem_mb_per_thread = None
+    gpus = None
+    gpu_name = None
+    if resources:
+        time_hours = resources.get('time_hours')
+        gpus = resources.get('gpus')
+        gpu_name = resources.get('gpu_name')
+        mem_mb = resources.get('mem_mb')
+        mem_mb_per_thread = mem_mb
+        if mem_mb and threads:
+            mem_mb_per_thread /= float(threads)
+            mem_mb_per_thread = math.ceil(mem_mb_per_thread)
+
+    return cluster_commands.submit_command(cluster_log_out, cluster_log_err,
+                                           threads, time_hours, mem_mb,
+                                           mem_mb_per_thread, gpus, gpu_name,
+                                           jobscript)
+
+
+def run_submit_command(command, retry_submit_interval_seconds):
+    stdout, error = try_command.try_command(command,
+                                            retry_submit_interval_seconds)
+    if error:
+        sys.exit(error)
+
+    return stdout
+
+
+def extract_job_id(stdout):
+    job_id, error = cluster_commands.try_extract_job_id_from_submit_output(
+        stdout)
+    if error:
+        print('error: {}\n{}'.format(error, stdout), file=sys.stderr)
+        sys.exit(1)
+
+    return job_id
+
+
+def record_usage_file(job_id, cluster_log_usage, resource_usage_dir):
+    # Skip recording when no --resource-usage-dir was given; otherwise
+    # os.path.join(None, ...) would raise.
+    if not resource_usage_dir:
+        return
+
+    # Tell cluster_status.py where to append this job's resource usage;
+    # the file is keyed by cluster job id.
+    job_file_path = os.path.join(resource_usage_dir, '{}.txt'.format(job_id))
+    with open(job_file_path, 'wt') as f_handle:
+        f_handle.write('{}\n'.format(cluster_log_usage))
+
+
+def main():
+    parsed_args = parse_args()
+    jobscript = parsed_args['jobscript']
+    job_properties = parsed_args['job_properties']
+    cluster_logs = get_cluster_log_paths(jobscript, job_properties)
+    command = build_submit_command(jobscript, job_properties,
+                                   cluster_logs['out'], cluster_logs['err'])
+    stdout = run_submit_command(command,
+                                parsed_args['retry_submit_interval_seconds'])
+    job_id = extract_job_id(stdout)
+    record_usage_file(job_id, cluster_logs['usage'],
+                      parsed_args['resource_usage_dir'])
+    print(job_id)
+
+
+if __name__ == '__main__':
+    main()
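The base-path detection in get_base_path_from_jobscript() assumes the snakemake-generated jobscript contains a 'cd <workdir> && \' line, which splits into exactly four tokens. A quick sketch of that parse, using a hypothetical path that is not taken from this diff:

# Hypothetical jobscript line as snakemake emits it; the exact layout
# varies by snakemake version.
line = 'cd /projects/iris_workdir && \\\n'

tokens = line.split()
# tokens == ['cd', '/projects/iris_workdir', '&&', '\\'], the four-token
# shape that get_base_path_from_jobscript() looks for.
assert tokens == ['cd', '/projects/iris_workdir', '&&', '\\']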
diff --git a/snakemake_profile/config.yaml b/snakemake_profile/config.yaml
new file mode 100644
index 0000000..a0ce6df
--- /dev/null
+++ b/snakemake_profile/config.yaml
@@ -0,0 +1,36 @@
+# Commenting out 'cluster' will force jobs to be run locally.
+# '>-' is yaml block-scalar syntax that folds the following lines into a
+# single string.
+cluster: >-
+  python ./snakemake_profile/cluster_submit.py
+  --retry-submit-interval-seconds 30,120,300
+  --resource-usage-dir ./snakemake_profile/job_resource_usage
+cluster-status: >-
+  python ./snakemake_profile/cluster_status.py
+  --retry-status-interval-seconds 30,120,300
+  --resource-usage-dir ./snakemake_profile/job_resource_usage
+  --resource-usage-min-interval 300
+
+# 'jobs' has two interpretations:
+# * if running with 'cluster':
+#   + the max number of jobs concurrently submitted to the cluster
+# * else:
+#   + the max number of local cores to use
+jobs: 100
+
+# 'resources' defines limits that apply to both local and 'cluster' jobs.
+# 'resources' is commented out when using 'cluster' to allow the cluster
+# scheduler to determine the available memory.
+# resources:
+#   - 'mem_mb=16384'
+
+# Wait up to a minute for result files to become visible through the
+# shared filesystem.
+latency-wait: 60
+
+# Allow a failed job to be restarted once.
+restart-times: 1
+
+# Output settings.
+verbose: false
+printshellcmds: true
+show-failed-logs: false
+reason: true
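For context on how this profile is driven: snakemake appends the generated jobscript path to the 'cluster' command and the cluster job id to the 'cluster-status' command, and the profile is selected on the command line with `snakemake --profile snakemake_profile`. With the intervals configured above, a failed submit or status call is retried after 30, 120, and then 300 seconds before giving up, and each job's resource usage is appended to its .cluster.usage log at most once every 300 seconds while the job is running.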
diff --git a/snakemake_profile/try_command.py b/snakemake_profile/try_command.py
new file mode 100644
index 0000000..05d9ae9
--- /dev/null
+++ b/snakemake_profile/try_command.py
@@ -0,0 +1,40 @@
+import subprocess
+import time
+
+
+def try_command_once(command):
+    completed_process = subprocess.run(command,
+                                       check=False,
+                                       stdout=subprocess.PIPE,
+                                       stderr=subprocess.PIPE)
+    decoded_stdout = completed_process.stdout.decode()
+    decoded_stderr = completed_process.stderr.decode()
+    if completed_process.returncode != 0:
+        return None, 'command: {}\nstdout:\n{}\n\nstderr:\n{}'.format(
+            command, decoded_stdout, decoded_stderr)
+
+    return decoded_stdout, None
+
+
+def try_command(command, retry_interval_seconds):
+    errors = list()
+
+    # The final (retry_seconds: None) allows running the command one last
+    # time, but without the ability to wait and retry afterwards.
+    retry_interval_seconds = retry_interval_seconds + [None]
+    for retry_seconds in retry_interval_seconds:
+        stdout, error = try_command_once(command)
+        if not error:
+            return stdout, None
+
+        errors.append(error)
+        if retry_seconds is None:
+            break
+
+        time.sleep(retry_seconds)
+
+    # Number the attempts from 1 so the error report reads naturally.
+    formatted_errors = list()
+    for i, error in enumerate(errors):
+        formatted_errors.append('attempt {}\n{}'.format(i + 1, error))
+
+    return None, '\n'.join(formatted_errors)
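try_command() is the retry primitive shared by both cluster_submit.py and cluster_status.py. A small usage sketch under stated assumptions: the qstat-style command here is illustrative only, since the real command comes from cluster_commands.

from try_command import try_command

# Illustrative only: query a scheduler for a hypothetical job id,
# retrying after 30s and then 120s if the command exits nonzero.
stdout, error = try_command(['qstat', '-j', '123456'], [30, 120])
if error:
    # error concatenates the stdout/stderr of every failed attempt.
    print(error)
else:
    print(stdout)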