From fb81ea205737966b4108cad9859424e0e53389f4 Mon Sep 17 00:00:00 2001 From: Qing Date: Mon, 16 Dec 2024 15:50:35 -0500 Subject: [PATCH] - Fix: the column names of the OTF may be incorrect when the Annotator is used with custom data. --- Docs/ChangeLog.md | 5 ++++ metax/peptide_annotator/peptable_annotator.py | 27 ++++++++++++++----- metax/taxafunc_analyzer/analyzer.py | 24 +++++++++++++---- metax/utils/version.py | 2 +- pyproject.toml | 2 +- 5 files changed, 46 insertions(+), 14 deletions(-) diff --git a/Docs/ChangeLog.md b/Docs/ChangeLog.md index 274cf37..bdd9494 100644 --- a/Docs/ChangeLog.md +++ b/Docs/ChangeLog.md @@ -1,3 +1,8 @@ +# Version: 1.120.2 +## Date: 2024-12-16 +### Changes: +- Fix: the column names of the OTF may be incorrect when the Annotator is used with custom data. + # Version: 1.120.1 ## Date: 2024-12-16 ### Changes: diff --git a/metax/peptide_annotator/peptable_annotator.py b/metax/peptide_annotator/peptable_annotator.py index 74aa93b..b00d190 100644 --- a/metax/peptide_annotator/peptable_annotator.py +++ b/metax/peptide_annotator/peptable_annotator.py @@ -17,7 +17,7 @@ class PeptideAnnotator: def __init__(self, db_path:str, peptide_path: str, output_path: str, threshold=1.0, genome_mode=True, protein_separator=';', protein_genome_separator = '_', - protein_col='Proteins', peptide_col='Sequence', sample_col_prefix='Intensity_', + protein_col='Proteins', peptide_col='Sequence', sample_col_prefix='Intensity', distinct_genome_threshold:int=0, exclude_protein_contains:str='REV_'): self.db_path = db_path @@ -30,7 +30,7 @@ def __init__(self, db_path:str, peptide_path: str, output_path: str, self.protein_genome_separator = protein_genome_separator # the separator between protein and genome in each protein ID self.protein_col = protein_col self.peptide_col = peptide_col - self.sample_col_prefix = sample_col_prefix + self.sample_col_prefix = sample_col_prefix.strip() self.distinct_genome_threshold = distinct_genome_threshold self.exclude_protein_contains = exclude_protein_contains @@
-198,9 +198,17 @@ def filter_genome_with_distinct_pep_num(self, df): print(f'Peptides number: from [{original_num}] -> [{df.shape[0]}] after filtering genomes with distinct peptides') return df - - - + def rename_columns(self, df): + # rename the custom peptide, protein and sample-prefix column names to the standard ones + # ('Sequence', 'Proteins', 'Intensity_') to avoid errors in the OTF Analyzer + cols = df.columns.tolist() + cols = [col.replace(self.peptide_col, 'Sequence') for col in cols] + cols = [col.replace(self.protein_col, 'Proteins') for col in cols] + cols = [col.replace(self.sample_col_prefix, 'Intensity_') for col in cols] + # collapse "Intensity__" into "Intensity_" (it appears when the prefix lacks a trailing "_" but the column names have one) + cols = [col.replace('Intensity__', 'Intensity_') for col in cols] + df.columns = cols + return df def run_annotate(self): print('Start running Peptide Annotator...') @@ -218,8 +226,11 @@ def run_annotate(self): df.columns = [col.replace(' ', '_') for col in df.columns] print(f'Original shape: {df.shape}') - # exxtract the peptide sequence, protein accessions and sample columns from the dataframe - df = df.loc[:, [self.peptide_col, self.protein_col] + [col for col in df.columns if col.startswith(self.sample_col_prefix)]] + # extract the peptide sequence, protein accessions and sample columns from the dataframe + intensity_cols = [col for col in df.columns if col.startswith(self.sample_col_prefix)] + # drop any column whose name is exactly self.sample_col_prefix (nothing follows the prefix, so it is not a sample column) + intensity_cols = [col for col in intensity_cols if col != self.sample_col_prefix] + df = df.loc[:, [self.peptide_col, self.protein_col] + intensity_cols] print(f'After filtering Intensity 0 in all samples and removing other columns: {df.shape}') @@ -229,6 +240,8 @@ def run_annotate(self): df_res = self.run_2_result(df) + df_res = self.rename_columns(df_res) + self.save_result(df_res) return df_res diff --git a/metax/taxafunc_analyzer/analyzer.py b/metax/taxafunc_analyzer/analyzer.py
index 10cbe13..b7e80ac 100644 --- a/metax/taxafunc_analyzer/analyzer.py +++ b/metax/taxafunc_analyzer/analyzer.py @@ -44,6 +44,7 @@ def __init__( meta_path=None, peptide_col_name="Sequence", protein_col_name="Proteins", + sample_col_prefix='Intensity', any_df_mode=False, custom_col_name="Custom", ): @@ -54,6 +55,7 @@ def __init__( self.peptide_col_name = peptide_col_name self.protein_col_name = protein_col_name + self.sample_col_prefix = sample_col_prefix.strip() # strip leading/trailing whitespace self.protein_separator = ';' self.custom_col_name = custom_col_name self.sample_list: Optional[List[str]] = None @@ -130,15 +132,27 @@ def _set_original_df(self, df_path: str) -> None: col_names = self.original_df.columns.tolist() # replace space with _ col_names = [i.replace(' ', '_') for i in col_names] - intensity_col_names = [i for i in col_names if i.startswith('Intensity_')] + intensity_col_names = [i for i in col_names if i.startswith(self.sample_col_prefix)] + # drop a column named exactly like the prefix, e.g. an "Intensity" column showing the total intensity of all samples + intensity_col_names = [i for i in intensity_col_names if i != self.sample_col_prefix] + if len(intensity_col_names) > 0: - intensity_col_names = [i.replace('Intensity_', '') for i in intensity_col_names] + # append "_" to the prefix when the first intensity column contains "prefix_" (i.e. the prefix was given without its trailing "_") + if f'{self.sample_col_prefix}_' in intensity_col_names[0]: + self.sample_col_prefix = f'{self.sample_col_prefix}_' + + intensity_col_names = [i.replace(self.sample_col_prefix, '') for i in intensity_col_names] self.sample_list = intensity_col_names + else: + raise ValueError(f"The OTF data must have Intensity columns: with prefix [{self.sample_col_prefix}]") #### - # replace space with _ and remove Intensity_ - self.original_df.columns = self.original_df.columns.str.replace( - ' ', '_').str.replace('Intensity_', '') + # replace spaces with _ and remove the sample prefix from the original_df columns +
original_col_names = self.original_df.columns.tolist() + original_col_names = [i.replace(' ', '_') for i in original_col_names] + original_col_names = [i.replace(self.sample_col_prefix, '') for i in original_col_names] + self.original_df.columns = original_col_names + def _set_meta(self, meta_path=None) -> None: if meta_path is None: diff --git a/metax/utils/version.py b/metax/utils/version.py index dd68a74..c768c7c 100644 --- a/metax/utils/version.py +++ b/metax/utils/version.py @@ -1,2 +1,2 @@ -__version__ = '1.120.1' +__version__ = '1.120.2' API_version = '4' \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml index 48287e9..f9d8674 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "MetaXTools" -version = "1.120.1" +version = "1.120.2" description = "MetaXTools is a novel tool for linking peptide sequences with taxonomic and functional information in Metaproteomics." readme = "README_PyPi.md" license = { text = "NorthOmics" }