Skip to content

Commit

Permalink
- Fix: the colnames of OTF may be not correct when Annotator with cus…
Browse files Browse the repository at this point in the history
…tom data.
  • Loading branch information
byemaxx committed Dec 16, 2024
1 parent 8b0ddab commit fb81ea2
Show file tree
Hide file tree
Showing 5 changed files with 46 additions and 14 deletions.
5 changes: 5 additions & 0 deletions Docs/ChangeLog.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,8 @@
# Version: 1.120.2
## Date: 2024-12-16
### Changes:
- Fix: the column names of the OTF table could be incorrect when the Annotator was run with custom data.

# Version: 1.120.1
## Date: 2024-12-16
### Changes:
Expand Down
27 changes: 20 additions & 7 deletions metax/peptide_annotator/peptable_annotator.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
class PeptideAnnotator:
def __init__(self, db_path:str, peptide_path: str, output_path: str,
threshold=1.0, genome_mode=True, protein_separator=';', protein_genome_separator = '_',
protein_col='Proteins', peptide_col='Sequence', sample_col_prefix='Intensity_',
protein_col='Proteins', peptide_col='Sequence', sample_col_prefix='Intensity',
distinct_genome_threshold:int=0, exclude_protein_contains:str='REV_'):

self.db_path = db_path
Expand All @@ -30,7 +30,7 @@ def __init__(self, db_path:str, peptide_path: str, output_path: str,
self.protein_genome_separator = protein_genome_separator # the separator between protein and genome in each protein ID
self.protein_col = protein_col
self.peptide_col = peptide_col
self.sample_col_prefix = sample_col_prefix
self.sample_col_prefix = sample_col_prefix.strip()
self.distinct_genome_threshold = distinct_genome_threshold
self.exclude_protein_contains = exclude_protein_contains

Expand Down Expand Up @@ -198,9 +198,17 @@ def filter_genome_with_distinct_pep_num(self, df):
print(f'Peptides number: from [{original_num}] -> [{df.shape[0]}] after filtering genomes with distinct peptides')
return df




def rename_columns(self, df):
    """Standardize the column names of the annotated table.

    The user-configured peptide, protein and sample columns are renamed to
    the fixed names 'Sequence', 'Proteins' and 'Intensity_<sample>' so that
    custom input column names do not cause errors in the OTF Analyzer.

    Matching is anchored instead of substring based:

    * the peptide / protein columns are renamed only on an exact name match;
    * a sample column is one that starts with ``self.sample_col_prefix`` and
      is not equal to the prefix itself; only that leading prefix is
      replaced, and one separator underscore directly after it is dropped
      so the result never contains 'Intensity__'.

    Substring replacement would corrupt names whenever one configured name
    occurs inside another one (e.g. a sample prefix 'S' turning the freshly
    written 'Sequence' into 'Intensity_equence').

    Args:
        df: DataFrame whose columns are renamed in place.

    Returns:
        The same DataFrame object, with the standardized column names.
    """
    sample_prefix = self.sample_col_prefix

    def standardize(col):
        # Identity columns: exact match only, checked before the sample rule
        # so a short sample prefix can never capture them.
        if col == self.peptide_col:
            return 'Sequence'
        if col == self.protein_col:
            return 'Proteins'
        # An empty prefix matches every name, so it cannot identify samples.
        if sample_prefix and col != sample_prefix and col.startswith(sample_prefix):
            sample_name = col[len(sample_prefix):]
            # Prefix given without its trailing '_' (e.g. 'Intensity' for
            # 'Intensity_A'): drop the separator to avoid 'Intensity__A'.
            if sample_name.startswith('_'):
                sample_name = sample_name[1:]
            return f'Intensity_{sample_name}'
        return col

    df.columns = [standardize(col) for col in df.columns.tolist()]
    return df

def run_annotate(self):
print('Start running Peptide Annotator...')
Expand All @@ -218,8 +226,11 @@ def run_annotate(self):
df.columns = [col.replace(' ', '_') for col in df.columns]

print(f'Original shape: {df.shape}')
# exxtract the peptide sequence, protein accessions and sample columns from the dataframe
df = df.loc[:, [self.peptide_col, self.protein_col] + [col for col in df.columns if col.startswith(self.sample_col_prefix)]]
# extract the peptide sequence, protein accessions and sample columns from the dataframe
intensity_cols = [col for col in df.columns if col.startswith(self.sample_col_prefix)]
# drop any column whose name is exactly self.sample_col_prefix: it only equals the prefix and is not a per-sample column
intensity_cols = [col for col in intensity_cols if col != self.sample_col_prefix]
df = df.loc[:, [self.peptide_col, self.protein_col] + intensity_cols]

print(f'After filtering Intensity 0 in all samples and removing other columns: {df.shape}')

Expand All @@ -229,6 +240,8 @@ def run_annotate(self):

df_res = self.run_2_result(df)

df_res = self.rename_columns(df_res)

self.save_result(df_res)

return df_res
Expand Down
24 changes: 19 additions & 5 deletions metax/taxafunc_analyzer/analyzer.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@ def __init__(
meta_path=None,
peptide_col_name="Sequence",
protein_col_name="Proteins",
sample_col_prefix='Intensity',
any_df_mode=False,
custom_col_name="Custom",
):
Expand All @@ -54,6 +55,7 @@ def __init__(

self.peptide_col_name = peptide_col_name
self.protein_col_name = protein_col_name
self.sample_col_prefix = sample_col_prefix.strip() #remove the space
self.protein_separator = ';'
self.custom_col_name = custom_col_name
self.sample_list: Optional[List[str]] = None
Expand Down Expand Up @@ -130,15 +132,27 @@ def _set_original_df(self, df_path: str) -> None:
col_names = self.original_df.columns.tolist()
# replace space with _
col_names = [i.replace(' ', '_') for i in col_names]
intensity_col_names = [i for i in col_names if i.startswith('Intensity_')]
intensity_col_names = [i for i in col_names if i.startswith(self.sample_col_prefix)]
# drop a column named exactly like the prefix, e.g. a bare "Intensity" column holding the total intensity of all samples
intensity_col_names = [i for i in intensity_col_names if i != self.sample_col_prefix]

if len(intensity_col_names) > 0:
intensity_col_names = [i.replace('Intensity_', '') for i in intensity_col_names]
# append "_" to the prefix when it does not end with "_" but the intensity columns are named "<prefix>_<sample>"
if f'{self.sample_col_prefix}_' in intensity_col_names[0]:
self.sample_col_prefix = f'{self.sample_col_prefix}_'

intensity_col_names = [i.replace(self.sample_col_prefix, '') for i in intensity_col_names]
self.sample_list = intensity_col_names
else:
raise ValueError(f"The OTF data must have Intensity columns: with prefix [{self.sample_col_prefix}]")
####

# replace space with _ and remove Intensity_
self.original_df.columns = self.original_df.columns.str.replace(
' ', '_').str.replace('Intensity_', '')
# replace space with _ and remove Intensity_ of original_df columns
original_col_names = self.original_df.columns.tolist()
original_col_names = [i.replace(' ', '_') for i in original_col_names]
original_col_names = [i.replace(self.sample_col_prefix, '') for i in original_col_names]
self.original_df.columns = original_col_names


def _set_meta(self, meta_path=None) -> None:
if meta_path is None:
Expand Down
2 changes: 1 addition & 1 deletion metax/utils/version.py
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
__version__ = '1.120.1'
__version__ = '1.120.2'
API_version = '4'
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

[project]
name = "MetaXTools"
version = "1.120.1"
version = "1.120.2"
description = "MetaXTools is a novel tool for linking peptide sequences with taxonomic and functional information in Metaproteomics."
readme = "README_PyPi.md"
license = { text = "NorthOmics" }
Expand Down

0 comments on commit fb81ea2

Please sign in to comment.