diff --git a/bin/map_dssp.py b/bin/map_dssp.py index afbfcc0..59baebf 100644 --- a/bin/map_dssp.py +++ b/bin/map_dssp.py @@ -44,15 +44,15 @@ def handle_unmappable(df): # Iterate over rows where df.same == False for index, row in df[df['same'] == False].iterrows(): if type(row['model']) == str : - print(row) + #print(row) fixed=False model_seq = row['model'] unali_seq = row['unali'] dssp = row['dssp'] - print(unali_seq) - print(model_seq) - print(dssp) + #print(unali_seq) + #print(model_seq) + #print(dssp) if len(model_seq)==len(unali_seq): #unmatching is because of point mutations, we ignore point mutations counter = 0 @@ -66,7 +66,7 @@ def handle_unmappable(df): else: #either the sequence of entry of the sequence of the 3D structure/model have different length # align alignments = pairwise2.align.globalms(model_seq, unali_seq,2, -1, -10, -0.1) - print(alignments) + #print(alignments) aligned_model_seq = alignments[0][0] aligned_unali_seq = alignments[0][1] @@ -207,13 +207,6 @@ def write_fasta_from_df(df,outname): line = df.iloc[i,0]+"_dssp" + "\n"+ str(df.iloc[i,3]) lines.append(line) - #line = df.iloc[i,0]+"_seq_inputted" + "\n"+ df.iloc[i,1] - #lines.append(line) - #line = df.iloc[i,0]+"_seq_model" + "\n"+ df.iloc[i,2] - #lines.append(line) - #line = df.iloc[i,0]+"_dssp" + "\n"+ df.iloc[i,3] - #lines.append(line) - with open (out_name, 'w') as m: for line in lines: line = line.replace('\\n','\n').replace(' ','') diff --git a/bin/shannons_entropy.py b/bin/shannons_entropy.py deleted file mode 100644 index 4b3a062..0000000 --- a/bin/shannons_entropy.py +++ /dev/null @@ -1,65 +0,0 @@ - -import math -from Bio import AlignIO -import sys -#Shannon_entropy calculates the conservation of the amino acids at each position. It takes into account -#the gaps in the columns. Check https://doi.org/10.1002/prot.10146 for info formula - -msa_filename_noDesigned = sys.argv[1] - -def shannon_entropy(list_input): - tot = len(list_input) #total number of AA at particular position in MSA - gaps = list_input.count("-") / tot #count frequency of gaps in that column of MSA - unique_base = set(list_input) #remove duplicates, "-" is seen as an amino acid type - unique_base_len = len(unique_base) #total number of AA at particular position in MSA with no duplicates - entropy_list = [] # entropy of AA at particular position - for base in unique_base: - n_i = list_input.count(base) - P_i = n_i / tot - entropy_i = P_i * (math.log(P_i, 2)) - entropy_list.append(entropy_i) - #sum entropy of every residue at a position and normalize it so it is between 0 and 1 - entropy_sum = math.fsum(entropy_list) - if unique_base_len == 1: # log(1, 2) = 0; n/0 throws ZeroDivisionError - shannon_entropy = entropy_sum - else: - unique_base_len_log = math.log(unique_base_len, 2) - shannon_entropy = (-1 / unique_base_len_log) * entropy_sum - #Return entropy AA and entropy gaps at 1 position - #If entropy is high than there are many possible arrangements (high variability) - return shannon_entropy, gaps -def conservation(alignment_file): - conservation_AA_list = [] - for col_no in range(len(list(alignment_file[0]))): - list_input = list(alignment_file[:, col_no]) - sh_entropy_AA, sh_entropy_gaps = shannon_entropy(list_input) - #Translate entropy into conservation. 0 means no conservation, 1 highly conserved - #As e took into account the gaps: if only gaps: entropy=0 freq_gaps=1 conservation = 0 - conservation_AA = (1 - sh_entropy_AA) * (1 - sh_entropy_gaps) - conservation_AA_list.append(conservation_AA) - return conservation_AA_list -#Retrieve sequence conservation (Shannon entropy) -msa_type = "fasta" -alignment_file_natural = AlignIO.read(msa_filename_noDesigned, msa_type) -conservation_AA_list = conservation(alignment_file_natural) - - -average = (sum(conservation_AA_list) / len(conservation_AA_list))*100 - - -sys.stdout.write(str(round(average,4))) -sys.exit(0) - - - -""" -Alternative:totalCount is number of seqs, aaCount the count for one amino acid type. - -value = 0.0 -totalCount = sum(aaList) * 1.0 -for aaCount in aaList: - if aaCount: - pj = aaCount / totalCount - value += pj * math.log(pj) # Is natural logarithm ln by default! - -""" \ No newline at end of file diff --git a/magic_align.sh b/magic_align.sh index 2884539..a45828b 100755 --- a/magic_align.sh +++ b/magic_align.sh @@ -5,16 +5,18 @@ data=toy_example house=$(pwd) now=`date +"%Y_%m_%d_%H_%M_%S"` output_name=${data}_${now}_test -output_folder=$house/results/$output_name +output_folder=$house/$data/results/$output_name + mkdir -p $house/results/ mkdir -p $output_folder echo 'Starting nextflow' + nextflow run simsapiper.nf \ - -profile server,withsingularity \ + -profile standard,withdocker \ --data $house/$data/data \ --magic \ --outFolder $output_folder \ - |& tee $output_folder/run_report_$output_name.nflog + | tee $output_folder/run_report_$output_name.nflog sessionName=$(sed -n '2s/.*\[\(.*\)\].*/\1/p' $output_folder/run_report_$output_name.nflog) nextflow log | grep $sessionName >> $output_folder/run_report_$output_name.nflog diff --git a/magic_hydra.sh b/magic_hydra.sh index 62aeb6c..b218f47 100755 --- a/magic_hydra.sh +++ b/magic_hydra.sh @@ -3,7 +3,7 @@ data=toy_example -module load Nextflow/23.04.2 +module load Nextflow/23.10.0 house=$VSC_SCRATCH_VO_USER/simsapiper now=`date +"%Y_%m_%d_%H_%M_%S"` output_name=${data}_${now}_test diff --git a/modules/msas.nf b/modules/msas.nf index a90bde6..aab91ba 100644 --- a/modules/msas.nf +++ b/modules/msas.nf @@ -167,8 +167,6 @@ process squeeze{ """ python3 $projectDir/bin/squeeze_msa.py $msa $dssp "$squeeze" $squeezePerc squeezed_${msa.baseName} """ - //INFO: - //choose conserved secondary structure elements according to dssp across your dataset as it is does elements that TCOFFEE will use to align your proteins } process reorder{ diff --git a/modules/structures.nf b/modules/structures.nf index bb15901..cb5c780 100644 --- a/modules/structures.nf +++ b/modules/structures.nf @@ -65,20 +65,11 @@ process runDssp{ echo Gate is open $gate mkdssp -i $model -o ${model.baseName}.dssp """ - //INFO: secondary structure elements according to dssp - //H = alpha-helix - //B = beta-bridge residue - //E = extended strand (in beta ladder) - //G = 3/10-helix - //I = 5-helix - //T = H-bonded turn - //S = beta-bend or beta-turn } process esmFolds{ publishDir "$params.structures", mode: "copy" - //errorStrategy { task.attempt > 3 ? 'retry' : 'complete' } input: path structureless diff --git a/modules/utils.nf b/modules/utils.nf index 615b027..aa83022 100644 --- a/modules/utils.nf +++ b/modules/utils.nf @@ -43,10 +43,6 @@ process attendance{ echo 'No. of sequences in final alignment: ' \$fin >> "sequence_report.txt" - av_conservation=`python3 $projectDir/bin/shannons_entropy.py $finalMsa` - echo 'Average sequence conservation (Shannons Entropy): ' \$av_conservation - - if (( \$fin !=$collapsedSequencesCount + $structurelessCount )) then echo "ERROR: Not all valid sequences are found in the output file, please check $finalMsa in in the output directory!" @@ -315,25 +311,5 @@ process createSummary{ cp .command.out \$outfile - """ - - // add new dependency - //python3 $projectDir/bin/sequence_sim.py $finalMSA - - - //av_conservation=`python3 $projectDir/bin/shannons_entropy.py $finalMsa` - // echo 'Average sequence conservation (Shannons Entropy): ' \$av_conservation - - //md improvemends - //echo \$(readlink -f $seqsInvalidFile) - //inputSeqFilePath=\$(readlink -f $inputSeqFiles ) - //echo '[$inputSeqFiles]('\$inputSeqFilePath')' - - //[link](file:///Users/matb/Desktop/cat.gif) - //echo 'link' - - //inputSeqFilesPath=\$(readlink -f $inputSeqFiles) - //echo '\n SIMSApiper found these files: ![$inputSeqFiles](file://'\$inputSeqFilesPath')' - - + """ } \ No newline at end of file diff --git a/nextflow.config b/nextflow.config index 9b094b7..32fdec8 100644 --- a/nextflow.config +++ b/nextflow.config @@ -66,9 +66,6 @@ if (params.squeeze){params.dssp = true} //minimal parameter value should be 1 //if (params.localModel){params.localModel = 1} -//type test localModel -//if (params.localModel) {assert Number.isCase(params.useSubsets), " localModel can only be 'false' or a number, please check your launch file or command line"} - report { enabled = true diff --git a/simsapiper.nf b/simsapiper.nf index 5b3799a..80600a0 100644 --- a/simsapiper.nf +++ b/simsapiper.nf @@ -250,11 +250,6 @@ workflow { seqs_to_model = writeFastaFromMissing.out.found esmFolds(seqs_to_model) - //if seqs_to_model is empty, the pipeline does not complete, but if it is not empty, strucQC needs to wait for esm? - //esmStructuresCounter= Channel.fromPath("$params.structures/*.pdb").count() - //this does not work as a gate - - foundSequencesCount = finalModelFound.mix(esmFolds.out.esmFoldsStructures).count() structureless_seqs=Channel.empty()