Skip tests not working with pytest pooling.
ypriverol committed Apr 19, 2024
1 parent 1c38d7b commit 4b4c8ab
Showing 1 changed file with 91 additions and 85 deletions.
176 changes: 91 additions & 85 deletions pypgatk/proteogenomics/blast_get_position.py
@@ -1,5 +1,5 @@
import pandas as pd
from Bio import pairwise2,SeqIO
from Bio import pairwise2, SeqIO
import datetime
from pathos.multiprocessing import ProcessingPool as Pool
from multiprocessing import Manager
@@ -8,6 +8,61 @@

from pypgatk.toolbox.general import ParameterConfiguration


def _blast_set(fasta_set, peptide):
length = len(peptide)
position_set = set()
for fasta in fasta_set:
if len(fasta) >= length:
alignments_score = pairwise2.align.localms(sequenceA=fasta, sequenceB=peptide, match=1, mismatch=0,
open=-1,
extend=0, score_only=True)
if alignments_score == length:
return "canonical"
elif alignments_score == length - 1:
alignments_local = pairwise2.align.localms(sequenceA=fasta, sequenceB=peptide, match=1, mismatch=0,
open=-1, extend=0)
for alignment in alignments_local:
# insertion e.g., ABCDMEFGH<----ABCDEFGH
if alignment.end - alignment.start == length + 1:
s = fasta[alignment.start:alignment.end]
for i in range(length):
if peptide[i] != s[i]:
position_set.add(i + 1)
break
# substitution e.g., ABCDMFGH<----ABCDEFGH
elif alignment.end - alignment.start == length:
s = fasta[alignment.start:alignment.end]
for i in range(length):
if peptide[i] != s[i]:
position_set.add(i + 1)
break
# substitution e.g., ABCDEFGM<----ABCDEFGH
elif alignment.end - alignment.start == length - 1:
s = fasta[alignment.start:alignment.end]
if peptide[0] != s[0]:
position_set.add(1)
elif peptide[-1] != s[-1]:
position_set.add(length)
elif alignments_score == length - 2:
alignments_local = pairwise2.align.localms(sequenceA=fasta, sequenceB=peptide, match=1, mismatch=-1,
open=-1, extend=0)
for alignment in alignments_local:
# deletion e.g., ABCEFGH<----ABCDEFGH
if alignment.end - alignment.start == length and alignment.score == length - 2:
s = fasta[alignment.start:alignment.end - 1]
if pairwise2.align.localms(sequenceA=s, sequenceB=peptide, match=1, mismatch=0, open=0,
extend=0, score_only=True) == length - 1:
for i in range(length - 1):
if peptide[i] != s[i]:
position_set.add(i + 1)
break
if position_set:
return position_set
else:
return "non-canonical"


class BlastGetPositionService(ParameterConfiguration):
CONFIG_KEY_BlastGetPosition = 'blast_get_position'
# CONFIG_CANONICAL_PEPTIDE_PREFIX = 'canonical_peptide_prefix'
@@ -16,16 +71,18 @@ class BlastGetPositionService(ParameterConfiguration):

def __init__(self, config_data, pipeline_arguments):
"""
Init the class with the specific parameters.
:param config_data configuration file
:param pipeline_arguments pipelines arguments
"""
init the class with the specific parameters.
:param config_data configuration file
:param pipeline_arguments pipelines arguments
"""

super(BlastGetPositionService, self).__init__(self.CONFIG_KEY_BlastGetPosition, config_data, pipeline_arguments)
self._input_reference_database = self.get_blast_parameters(variable=self.CONFIG_INPUT_REFERENCE_DATABASE, default_value='')
self._number_of_processes = self.get_blast_parameters(variable=self.CONFIG_NUMBER_OF_PROCESSES, default_value='40')
self._input_reference_database = self.get_blast_parameters(variable=self.CONFIG_INPUT_REFERENCE_DATABASE,
default_value='')
self._number_of_processes = self.get_blast_parameters(variable=self.CONFIG_NUMBER_OF_PROCESSES,
default_value='40')

self.fa_set =set()
self.fa_set = set()
for j in SeqIO.parse(self._input_reference_database, "fasta"):
self.fa_set.add(str(j.seq))
self.blast_dict = Manager().dict()
@@ -38,7 +95,7 @@ def get_blast_parameters(self, variable: str, default_value):
variable in self.get_default_parameters()[self.CONFIG_KEY_BlastGetPosition]:
value_return = self.get_default_parameters()[self.CONFIG_KEY_BlastGetPosition][variable]
return value_return

def _blast_canonical(self, df):
seq_set = set(df["sequence"].to_list())

@@ -52,116 +109,65 @@ def _blast_canonical(self, df):

for protein_seq in self.fa_set:
for end_ind, found in auto.iter(protein_seq):

seq_dict[found]= "canonical"
seq_dict[found] = "canonical"

df["position"] = df["sequence"].map(seq_dict)
return df

def _blast_set(self, fasta_set, peptide):
length = len(peptide)
position_set = set()
for fasta in fasta_set:
if len(fasta)>=length:
alignments_score = pairwise2.align.localms(sequenceA = fasta,sequenceB = peptide,match = 1,mismatch = 0,open = -1,extend = 0,score_only = True)
if alignments_score == length:
return "canonical"
elif alignments_score == length-1:
alignments_local = pairwise2.align.localms(sequenceA = fasta,sequenceB = peptide,match = 1,mismatch = 0,open = -1,extend = 0)
for alignment in alignments_local:
#insertion e.g., ABCDMEFGH<----ABCDEFGH
if alignment.end - alignment.start == length+1:
s = fasta[alignment.start:alignment.end]
for i in range(length):
if peptide[i] != s[i]:
position_set.add(i+1)
break
#substitution e.g., ABCDMFGH<----ABCDEFGH
elif alignment.end - alignment.start == length:
s = fasta[alignment.start:alignment.end]
for i in range(length):
if peptide[i] != s[i]:
position_set.add(i+1)
break
# substitution e.g., ABCDEFGM<----ABCDEFGH
elif alignment.end - alignment.start == length-1:
s = fasta[alignment.start:alignment.end]
if peptide[0] != s[0]:
position_set.add(1)
elif peptide[-1] != s[-1]:
position_set.add(length)
elif alignments_score == length-2:
alignments_local = pairwise2.align.localms(sequenceA=fasta, sequenceB=peptide, match=1, mismatch=-1,
open=-1, extend=0)
for alignment in alignments_local:
# deletion e.g., ABCEFGH<----ABCDEFGH
if alignment.end - alignment.start == length and alignment.score == length - 2:
s = fasta[alignment.start:alignment.end - 1]
if pairwise2.align.localms(sequenceA=s, sequenceB=peptide, match=1, mismatch=0, open=0,
extend=0, score_only=True) == length - 1:
for i in range(length - 1):
if peptide[i] != s[i]:
position_set.add(i+1)
break
if position_set:
return position_set
else:
return "non-canonical"
return df

def _result(self, sequence):
self.blast_dict[sequence] = self._blast_set(self.fa_set ,sequence)
self.blast_dict[sequence] = _blast_set(self.fa_set, sequence)

def blast(self, input_psm_to_blast, output_psm):
start_time = datetime.datetime.now()
print("Start time :", start_time)

PSM = pd.read_table(input_psm_to_blast, header=0,sep="\t")
PSM = self._blast_canonical(PSM)
psm = pd.read_table(input_psm_to_blast, header=0, sep="\t")
psm = self._blast_canonical(psm)

first_filter = PSM[PSM.position=="canonical"]
psm_to_blast = PSM[PSM.position=="waiting for blast"]
first_filter = psm[psm.position == "canonical"]
psm_to_blast = psm[psm.position == "waiting for blast"]
psm_to_blast = psm_to_blast.copy()

#Remove duplicate sequences
# Remove duplicate sequences
seq_set = set(psm_to_blast["sequence"].to_list())
seq_list = list(seq_set)

pool = Pool(int(self._number_of_processes))
list(tqdm(pool.imap(self._result, seq_list) , total=len(seq_list), desc="Blast", unit="peptide"))
list(tqdm(pool.imap(self._result, seq_list), total=len(seq_list), desc="Blast", unit="peptide"))

pool.close()
pool.join()

psm_to_blast["position"] = PSM.apply(lambda x: self.blast_dict.get(x["sequence"]),axis=1)

second_filter = psm_to_blast[psm_to_blast.position=="canonical"]
non_filter = psm_to_blast[psm_to_blast.position=="non-canonical"]
psm_to_blast["position"] = psm.apply(lambda x: self.blast_dict.get(x["sequence"]), axis=1)

psm_to_findpos = psm_to_blast[psm_to_blast.position!="canonical"]
psm_to_findpos = psm_to_findpos[psm_to_findpos.position!="non-canonical"]
second_filter = psm_to_blast[psm_to_blast.position == "canonical"]
non_filter = psm_to_blast[psm_to_blast.position == "non-canonical"]

if len(psm_to_findpos)>0:
psm_to_findpos["var_num"] = psm_to_findpos.apply(lambda x: len(x["position"]),axis=1)
psm_to_findpos = psm_to_blast[psm_to_blast.position != "canonical"]
psm_to_findpos = psm_to_findpos[psm_to_findpos.position != "non-canonical"]

if len(psm_to_findpos) > 0:
psm_to_findpos["var_num"] = psm_to_findpos.apply(lambda x: len(x["position"]), axis=1)
psm_to_findpos = psm_to_findpos.loc[psm_to_findpos.index.repeat(psm_to_findpos["var_num"])]
psm_to_findpos["var_num"].iloc[0] = 0
psm_id = psm_to_findpos["PSM_ID"].iloc[0]
for i in range(1,psm_to_findpos.shape[0]):
for i in range(1, psm_to_findpos.shape[0]):
if psm_to_findpos["PSM_ID"].iloc[i] == psm_id:
psm_to_findpos["var_num"].iloc[i] = psm_to_findpos["var_num"].iloc[i-1]+1
psm_to_findpos["var_num"].iloc[i] = psm_to_findpos["var_num"].iloc[i - 1] + 1
else:
psm_id = psm_to_findpos["PSM_ID"].iloc[i]
psm_to_findpos["var_num"].iloc[i] = 0
psm_to_findpos["position"] = psm_to_findpos.apply(lambda x: str(x["position"])[1:-1].split(",")[x["var_num"]],axis=1)
psm_to_findpos.drop(columns="var_num", axis = 1, inplace=True)
psm_to_findpos["position"] = psm_to_findpos.apply(
lambda x: str(x["position"])[1:-1].split(",")[x["var_num"]],
axis=1)
psm_to_findpos.drop(columns="var_num", axis=1, inplace=True)
psm_to_findpos["position"] = psm_to_findpos.apply(lambda x: x["position"].replace(' ', ''), axis=1)

all_psm_out = pd.concat([first_filter, second_filter, non_filter, psm_to_findpos], axis=0, join='outer')
all_psm_out = all_psm_out.sort_values("PSM_ID")
all_psm_out.to_csv(output_psm, header=1,sep="\t", index=None)
all_psm_out.to_csv(output_psm, header=1, sep="\t", index=None)

end_time = datetime.datetime.now()
print("End time :", end_time)
set_time_taken = end_time - start_time
print("Time consumption :", set_time_taken)


