git clean up

Bio2Byte · Mar 21, 2024 · 6d9d028 · 6d9d028
1 parent 98cd53a
commit 6d9d028
Show file tree

Hide file tree

Showing 9 changed files with 12 additions and 125 deletions.
diff --git a/bin/map_dssp.py b/bin/map_dssp.py
@@ -44,15 +44,15 @@ def handle_unmappable(df):
     # Iterate over rows where df.same == False
     for index, row in df[df['same'] == False].iterrows():
         if type(row['model']) == str :
-            print(row)
+            #print(row)
             fixed=False
             model_seq = row['model']
             unali_seq = row['unali']
             dssp = row['dssp']
 
-            print(unali_seq)
-            print(model_seq)
-            print(dssp)
+            #print(unali_seq)
+            #print(model_seq)
+            #print(dssp)
 
             if len(model_seq)==len(unali_seq): #mismatch is due to point mutations, which we ignore
                 counter = 0
@@ -66,7 +66,7 @@ def handle_unmappable(df):
             else: #the entry sequence and the sequence of the 3D structure/model have different lengths
                 # align
                 alignments = pairwise2.align.globalms(model_seq, unali_seq,2, -1, -10, -0.1)
-                print(alignments)
+                #print(alignments)
                 aligned_model_seq = alignments[0][0]
                 aligned_unali_seq = alignments[0][1]
 
@@ -207,13 +207,6 @@ def write_fasta_from_df(df,outname):
                 line = df.iloc[i,0]+"_dssp" + "\n"+ str(df.iloc[i,3])
                 lines.append(line)
 
-                #line = df.iloc[i,0]+"_seq_inputted" + "\n"+ df.iloc[i,1]
-                #lines.append(line)
-                #line = df.iloc[i,0]+"_seq_model" + "\n"+ df.iloc[i,2]
-                #lines.append(line)
-                #line = df.iloc[i,0]+"_dssp" + "\n"+ df.iloc[i,3]
-                #lines.append(line)
-
         with open (out_name, 'w') as m:
                 for line in lines:
                     line = line.replace('\\n','\n').replace(' ','')

diff --git a/bin/shannons_entropy.py b/bin/shannons_entropy.py
diff --git a/magic_align.sh b/magic_align.sh
@@ -5,16 +5,18 @@ data=toy_example
 house=$(pwd)
 now=`date +"%Y_%m_%d_%H_%M_%S"`
 output_name=${data}_${now}_test
-output_folder=$house/results/$output_name
+output_folder=$house/$data/results/$output_name
+
 mkdir -p $house/results/
 mkdir -p $output_folder
 echo 'Starting nextflow'
+
 nextflow run simsapiper.nf \
-    -profile server,withsingularity \
+    -profile standard,withdocker \
     --data $house/$data/data \
     --magic \
     --outFolder $output_folder \
-    |& tee  $output_folder/run_report_$output_name.nflog
+    | tee  $output_folder/run_report_$output_name.nflog
 sessionName=$(sed -n '2s/.*\[\(.*\)\].*/\1/p' $output_folder/run_report_$output_name.nflog)
 nextflow log | grep $sessionName >> $output_folder/run_report_$output_name.nflog
 
diff --git a/magic_hydra.sh b/magic_hydra.sh
@@ -3,7 +3,7 @@
 
 data=toy_example
 
-module load Nextflow/23.04.2
+module load Nextflow/23.10.0
 house=$VSC_SCRATCH_VO_USER/simsapiper
 now=`date +"%Y_%m_%d_%H_%M_%S"`
 output_name=${data}_${now}_test

diff --git a/modules/msas.nf b/modules/msas.nf
@@ -167,8 +167,6 @@ process squeeze{
     """
     python3 $projectDir/bin/squeeze_msa.py $msa $dssp "$squeeze" $squeezePerc squeezed_${msa.baseName} 
     """
-    //INFO: 
-    //choose conserved secondary structure elements according to dssp across your dataset as it is does elements that TCOFFEE will use to align your proteins
 }
 
 process reorder{

diff --git a/modules/structures.nf b/modules/structures.nf
@@ -65,20 +65,11 @@ process runDssp{
     echo Gate is open $gate
     mkdssp -i $model -o ${model.baseName}.dssp   
     """
-    //INFO: secondary structure elements according to dssp  
-    //H = alpha-helix
-    //B = beta-bridge residue
-    //E = extended strand (in beta ladder)
-    //G = 3/10-helix
-    //I = 5-helix
-    //T = H-bonded turn
-    //S = beta-bend or beta-turn
 }
 
 
 process esmFolds{
     publishDir "$params.structures", mode: "copy"
-    //errorStrategy { task.attempt > 3 ? 'retry' : 'complete' }
 
     input:
     path structureless

diff --git a/modules/utils.nf b/modules/utils.nf
@@ -43,10 +43,6 @@ process attendance{
 
     echo 'No. of sequences in final alignment: ' \$fin >> "sequence_report.txt"
 
-    av_conservation=`python3 $projectDir/bin/shannons_entropy.py $finalMsa`
-    echo 'Average sequence conservation (Shannons Entropy): ' \$av_conservation 
-
-
     if (( \$fin !=$collapsedSequencesCount + $structurelessCount ))
     then
         echo "ERROR: Not all valid sequences are found in the output file, please check $finalMsa in in the output directory!"
@@ -315,25 +311,5 @@ process createSummary{
 
     cp .command.out \$outfile
     
-    """    
-
-    // add new dependency
-    //python3  $projectDir/bin/sequence_sim.py $finalMSA
-
-
-    //av_conservation=`python3 $projectDir/bin/shannons_entropy.py $finalMsa`
-   // echo 'Average sequence conservation (Shannons Entropy): ' \$av_conservation 
-
-    //md improvemends
-    //echo  \$(readlink -f $seqsInvalidFile)
-    //inputSeqFilePath=\$(readlink -f $inputSeqFiles )
-    //echo '[$inputSeqFiles]('\$inputSeqFilePath')'
-
-    //[link](file:///Users/matb/Desktop/cat.gif) 
-    //echo '<a href="file://'\$inputSeqFilesPath'">link</a>'
-
-    //inputSeqFilesPath=\$(readlink -f $inputSeqFiles) 
-    //echo '\n SIMSApiper found these files: ![$inputSeqFiles](file://'\$inputSeqFilesPath')'
-
-
+    """   
 }
diff --git a/nextflow.config b/nextflow.config
@@ -66,9 +66,6 @@ if (params.squeeze){params.dssp = true}
 
 //minimal parameter value should be 1
 //if (params.localModel){params.localModel = 1}
-//type test localModel
-//if (params.localModel) {assert Number.isCase(params.useSubsets), " localModel can only be 'false' or a number, please check your launch file or command line"}
-
 
 report {
     enabled = true

diff --git a/simsapiper.nf b/simsapiper.nf
@@ -250,11 +250,6 @@ workflow {
         seqs_to_model = writeFastaFromMissing.out.found
 
         esmFolds(seqs_to_model)
-        //if seqs_to_model is empty, the pipeline does not complete, but if it is not empty, strucQC needs to wait for esm?
-        //esmStructuresCounter= Channel.fromPath("$params.structures/*.pdb").count()
-        //this does not work as a gate
-
-
         foundSequencesCount = finalModelFound.mix(esmFolds.out.esmFoldsStructures).count()
 
         structureless_seqs=Channel.empty()
-Original file line number
+Diff line change
@@ Expand Up / @@ -167,8 +167,6 @@ process squeeze{ @@
         """
         python3 $projectDir/bin/squeeze_msa.py $msa $dssp "$squeeze" $squeezePerc squeezed_${msa.baseName}
         """
-        //INFO:
-        //choose conserved secondary structure elements according to dssp across your dataset as it is does elements that TCOFFEE will use to align your proteins
     }
     process reorder{
@@ Expand Down @@