Skip to content

Commit

Permalink
update new blast and spectrumAI
Browse files Browse the repository at this point in the history
  • Loading branch information
DongdongdongW committed Apr 24, 2024
1 parent a0193ec commit 114de48
Show file tree
Hide file tree
Showing 8 changed files with 135 additions and 584 deletions.
10 changes: 5 additions & 5 deletions pypgatk/commands/validate_peptides.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,12 +21,12 @@
@click.option('-n', '--number_of_processes', help='Used to specify the number of processes. Default is 40.')
@click.option('-r', '--relative', help='When using ppm as ions_tolerance (not Da), it needs to be turned on',
is_flag=True)
@click.option('-msgf', '--msgf',
help='If it is the standard format of MSGF output, please turn on this switch, otherwise it defaults to mzTab format',
@click.option('-m', '--mztab',
help='If the tsv was obtained from mzTab, please enable this option. Default to tsv obtained from parquet',
is_flag=True)
@click.pass_context
def validate_peptides(ctx, config_file, mzml_path, mzml_files, infile_name, outfile_name, ions_tolerance,
number_of_processes, relative, msgf):
number_of_processes, relative, mztab):
config_data = None
if config_file is not None:
config_data = read_yaml_from_file(config_file)
Expand All @@ -47,8 +47,8 @@ def validate_peptides(ctx, config_file, mzml_path, mzml_files, infile_name, outf
pipeline_arguments[ValidatePeptidesService.CONFIG_NUMBER_OF_PROCESSES] = number_of_processes
if relative is not None:
pipeline_arguments[ValidatePeptidesService.CONFIG_RELATIVE] = relative
if msgf is not None:
pipeline_arguments[ValidatePeptidesService.CONFIG_MSGF] = msgf
if mztab is not None:
pipeline_arguments[ValidatePeptidesService.CONFIG_MZTAB] = mztab

validate_peptides_service = ValidatePeptidesService(config_data, pipeline_arguments)
if validate_flag:
Expand Down
152 changes: 82 additions & 70 deletions pypgatk/proteogenomics/blast_get_position.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,61 +8,69 @@

from pypgatk.toolbox.general import ParameterConfiguration

def get_details(fasta, peptide):
res = []
i = 0
j = 0
for AA1, AA2 in zip(fasta, peptide):
i += 1
j += 1
if AA1 == AA2:
continue
else:
res.append(str(i) + "|" + AA1 + ">" + AA2)
return res

def _blast_set(fasta_set, peptide):
def peptide_blast_protein(fasta, peptide):
length = len(peptide)
position_set = set()
for fasta in fasta_set:
if len(fasta) >= length:
alignments_score = pairwise2.align.localms(sequenceA=fasta, sequenceB=peptide, match=1, mismatch=0, open=-1, extend=0, score_only=True)
if alignments_score == length:
return "canonical"
elif alignments_score == length - 1:
alignments_local = pairwise2.align.localms(sequenceA=fasta, sequenceB=peptide, match=1, mismatch=0, open=-1, extend=0)
for alignment in alignments_local:
# insertion e.g., ABCDMEFGH<----ABCDEFGH
if alignment.end - alignment.start == length + 1:
s = fasta[alignment.start:alignment.end]
for i in range(length):
if peptide[i] != s[i]:
position_set.add(i + 1)
break
# substitution e.g., ABCDMFGH<----ABCDEFGH
elif alignment.end - alignment.start == length:
s = fasta[alignment.start:alignment.end]
for i in range(length):
if peptide[i] != s[i]:
position_set.add(i + 1)
break
# substitution e.g., ABCDEFGM<----ABCDEFGH
elif alignment.end - alignment.start == length - 1:
s = fasta[alignment.start:alignment.end]
if peptide[0] != s[0]:
position_set.add(1)
elif peptide[-1] != s[-1]:
position_set.add(length)
elif alignments_score == length - 2:
alignments_local = pairwise2.align.localms(sequenceA=fasta, sequenceB=peptide, match=1, mismatch=-1,
open=-1, extend=0)
for alignment in alignments_local:
# deletion e.g., ABCEFGH<----ABCDEFGH
if alignment.end - alignment.start == length and alignment.score == length - 2:
s = fasta[alignment.start:alignment.end - 1]
if pairwise2.align.localms(sequenceA=s, sequenceB=peptide, match=1, mismatch=0, open=0,
extend=0, score_only=True) == length - 1:
for i in range(length - 1):
if peptide[i] != s[i]:
position_set.add(i + 1)
break
if position_set:
return position_set
mismatch = []
if len(fasta) >= length:

Check notice on line 27 in pypgatk/proteogenomics/blast_get_position.py

View check run for this annotation

Codacy Production / Codacy Static Code Analysis

pypgatk/proteogenomics/blast_get_position.py#L27

Trailing whitespace
score = pairwise2.align.localms(sequenceA=fasta, sequenceB=peptide,

Check notice on line 28 in pypgatk/proteogenomics/blast_get_position.py

View check run for this annotation

Codacy Production / Codacy Static Code Analysis

pypgatk/proteogenomics/blast_get_position.py#L28

Trailing whitespace
match=1, mismatch=0, open=-2, extend=-2, score_only=True)
if score == length-1:
alignment = pairwise2.align.localms(sequenceA=fasta, sequenceB=peptide,

Check notice on line 31 in pypgatk/proteogenomics/blast_get_position.py

View check run for this annotation

Codacy Production / Codacy Static Code Analysis

pypgatk/proteogenomics/blast_get_position.py#L31

Trailing whitespace
match=1, mismatch=0, open=-2, extend=-2)[0]
if alignment.end - alignment.start == length:
mismatch = get_details(alignment.seqA[alignment.start:alignment.end], alignment.seqB[alignment.start:alignment.end])
elif alignment.end - alignment.start == length-1:
res = get_details(alignment.seqA[alignment.start:alignment.end+1], alignment.seqB[alignment.start:alignment.end+1])
if len(res) == 1:
if res[0].split(">")[1]!="-":
mismatch = res
else:
mismatch = get_details(alignment.seqA[alignment.start-1:alignment.end], alignment.seqB[alignment.start-1:alignment.end])
elif len(res) == 0:
mismatch = get_details(alignment.seqA[alignment.start-1:alignment.end], alignment.seqB[alignment.start-1:alignment.end])
else:
print("Number of mismatch Error")
return mismatch

def _blast_set(fasta_dict, peptide):
positions = dict()
for fasta in fasta_dict.keys():
mismatch = peptide_blast_protein(fasta, peptide)
if len(mismatch) == 1:
if positions.get(mismatch[0]):
positions[mismatch[0]].update(fasta_dict[fasta])
else:
positions[mismatch[0]] = fasta_dict[fasta]
elif len(mismatch) > 1:
print("Number of mismatch > 1")
print(peptide)
print(fasta)
print(mismatch)
if positions:
res = []
for key,value in positions.items():
splits = key.split("|")
splits.append(",".join(value))
res.append(splits)

Check notice on line 67 in pypgatk/proteogenomics/blast_get_position.py

View check run for this annotation

Codacy Production / Codacy Static Code Analysis

pypgatk/proteogenomics/blast_get_position.py#L67

Trailing whitespace
return res
else:
return "non-canonical"


class BlastGetPositionService(ParameterConfiguration):
CONFIG_KEY_BlastGetPosition = 'blast_get_position'
# CONFIG_CANONICAL_PEPTIDE_PREFIX = 'canonical_peptide_prefix'
CONFIG_INPUT_REFERENCE_DATABASE = 'input_reference_database'
CONFIG_NUMBER_OF_PROCESSES = 'number_of_processes'

Expand All @@ -79,9 +87,14 @@ def __init__(self, config_data, pipeline_arguments):
self._number_of_processes = self.get_blast_parameters(variable=self.CONFIG_NUMBER_OF_PROCESSES,
default_value='40')

self.fa_set = set()

self.fasta_dict = dict()
for j in SeqIO.parse(self._input_reference_database, "fasta"):
self.fa_set.add(str(j.seq))
if self.fasta_dict.get(str(j.seq)):
self.fasta_dict[str(j.seq)].add(j.id)
else:
self.fasta_dict[str(j.seq)] = {j.id}

Check notice on line 97 in pypgatk/proteogenomics/blast_get_position.py

View check run for this annotation

Codacy Production / Codacy Static Code Analysis

pypgatk/proteogenomics/blast_get_position.py#L97

Trailing whitespace
self.blast_dict = Manager().dict()

def get_blast_parameters(self, variable: str, default_value):
Expand All @@ -92,7 +105,7 @@ def get_blast_parameters(self, variable: str, default_value):
variable in self.get_default_parameters()[self.CONFIG_KEY_BlastGetPosition]:
value_return = self.get_default_parameters()[self.CONFIG_KEY_BlastGetPosition][variable]
return value_return

def _blast_canonical(self, df):
seq_set = set(df["sequence"].to_list())

Expand All @@ -104,7 +117,7 @@ def _blast_canonical(self, df):

auto.make_automaton()

for protein_seq in self.fa_set:
for protein_seq in self.fasta_dict.keys():
for end_ind, found in auto.iter(protein_seq):
seq_dict[found] = "canonical"
print("Found", found, "at position", end_ind, "in protein sequence")
Expand All @@ -113,7 +126,7 @@ def _blast_canonical(self, df):
return df

def _result(self, sequence):
self.blast_dict[sequence] = _blast_set(self.fa_set, sequence)
self.blast_dict[sequence] = _blast_set(self.fasta_dict, sequence)

def blast(self, input_psm_to_blast, output_psm):
"""
Expand All @@ -137,7 +150,7 @@ def blast(self, input_psm_to_blast, output_psm):
else:
raise ValueError("The input file format is not supported.")

psm = psm.head(4)

psm = self._blast_canonical(psm)

first_filter = psm[psm.position == "canonical"]
Expand All @@ -163,25 +176,24 @@ def blast(self, input_psm_to_blast, output_psm):
psm_to_findpos = psm_to_findpos[psm_to_findpos.position != "non-canonical"]

if len(psm_to_findpos) > 0:
psm_to_findpos["var_num"] = psm_to_findpos.apply(lambda x: len(x["position"]), axis=1)
psm_to_findpos = psm_to_findpos.loc[psm_to_findpos.index.repeat(psm_to_findpos["var_num"])]
psm_to_findpos["var_num"].iloc[0] = 0
psm_id = psm_to_findpos["usi"].iloc[0]
for i in range(1, psm_to_findpos.shape[0]):
if psm_to_findpos["usi"].iloc[i] == psm_id:
psm_to_findpos["var_num"].iloc[i] = psm_to_findpos["var_num"].iloc[i - 1] + 1
else:
psm_id = psm_to_findpos["usi"].iloc[i]
psm_to_findpos["var_num"].iloc[i] = 0
psm_to_findpos["position"] = psm_to_findpos.apply(
lambda x: str(x["position"])[1:-1].split(",")[x["var_num"]],
axis=1)
psm_to_findpos.drop(columns="var_num", axis=1, inplace=True)
psm_to_findpos["position"] = psm_to_findpos.apply(lambda x: x["position"].replace(' ', ''), axis=1)
psm_to_findpos = psm_to_findpos.explode("position", ignore_index=True)
psm_to_findpos["variant"] = psm_to_findpos["position"].apply(lambda x : x[1])

Check notice on line 180 in pypgatk/proteogenomics/blast_get_position.py

View check run for this annotation

Codacy Production / Codacy Static Code Analysis

pypgatk/proteogenomics/blast_get_position.py#L180

Trailing whitespace
psm_to_findpos["protein"] = psm_to_findpos["position"].apply(lambda x : x[2])

Check notice on line 181 in pypgatk/proteogenomics/blast_get_position.py

View check run for this annotation

Codacy Production / Codacy Static Code Analysis

pypgatk/proteogenomics/blast_get_position.py#L181

Trailing whitespace
psm_to_findpos["position"] = psm_to_findpos["position"].apply(lambda x : x[0])

Check notice on line 182 in pypgatk/proteogenomics/blast_get_position.py

View check run for this annotation

Codacy Production / Codacy Static Code Analysis

pypgatk/proteogenomics/blast_get_position.py#L182

Trailing whitespace

all_psm_out = pd.concat([first_filter, second_filter, non_filter, psm_to_findpos], axis=0, join='outer')
all_psm_out = all_psm_out.sort_values("usi")
all_psm_out.to_csv(output_psm, header=1, sep=",", index=None)

if output_psm.endswith(".csv.gz"):
all_psm_out.to_csv(output_psm, header=True, sep=",", index=None, compression="gzip")
elif output_psm.endswith(".csv"):
all_psm_out.to_csv(output_psm, header=True, sep=",", index=None)
elif output_psm.endswith(".tsv.gz"):
all_psm_out.to_csv(output_psm, header=True, sep="\t", index=None, compression="gzip")
elif output_psm.endswith(".tsv"):
all_psm_out.to_csv(output_psm, header=True, sep="\t", index=None)
else:
all_psm_out.to_csv(output_psm, header=True, sep="\t", index=None)

end_time = datetime.datetime.now()
print("End time :", end_time)
Expand Down
Loading

0 comments on commit 114de48

Please sign in to comment.