- Fix: the column names of the OTF may be incorrect when running the Annotator with custom data.
byemaxx · Dec 16, 2024 · fb81ea2 · fb81ea2
1 parent 8b0ddab
commit fb81ea2
Show file tree

Hide file tree

Showing 5 changed files with 46 additions and 14 deletions.
diff --git a/Docs/ChangeLog.md b/Docs/ChangeLog.md
@@ -1,3 +1,8 @@
+# Version: 1.120.2
+## Date: 2024-12-16
+### Changes:
+- Fix: the column names of the OTF may be incorrect when running the Annotator with custom data.
+
 # Version: 1.120.1
 ## Date: 2024-12-16
 ### Changes:

diff --git a/metax/peptide_annotator/peptable_annotator.py b/metax/peptide_annotator/peptable_annotator.py
@@ -17,7 +17,7 @@
 class PeptideAnnotator:
     def __init__(self, db_path:str, peptide_path: str, output_path: str,
                  threshold=1.0, genome_mode=True, protein_separator=';', protein_genome_separator = '_',
-                 protein_col='Proteins', peptide_col='Sequence', sample_col_prefix='Intensity_',
+                 protein_col='Proteins', peptide_col='Sequence', sample_col_prefix='Intensity',
                  distinct_genome_threshold:int=0, exclude_protein_contains:str='REV_'):
 
         self.db_path = db_path
@@ -30,7 +30,7 @@ def __init__(self, db_path:str, peptide_path: str, output_path: str,
         self.protein_genome_separator = protein_genome_separator # the separator between protein and genome in each protein ID
         self.protein_col = protein_col
         self.peptide_col = peptide_col
-        self.sample_col_prefix = sample_col_prefix
+        self.sample_col_prefix = sample_col_prefix.strip()
         self.distinct_genome_threshold = distinct_genome_threshold
         self.exclude_protein_contains = exclude_protein_contains
 
@@ -198,9 +198,17 @@ def filter_genome_with_distinct_pep_num(self, df):
         print(f'Peptides number: from [{original_num}] -> [{df.shape[0]}] after filtering genomes with distinct peptides')
         return df
 
-
-
-
+    def rename_columns(self, df):
+        # rename the custom peptide, protein and sample-prefix column names to the standard
+        # 'Sequence', 'Proteins' and 'Intensity_' names to avoid errors in the OTF Analyzer
+        cols = df.columns.tolist()
+        cols = [col.replace(self.peptide_col, 'Sequence') for col in cols]
+        cols = [col.replace(self.protein_col, 'Proteins') for col in cols]
+        cols = [col.replace(self.sample_col_prefix, 'Intensity_') for col in cols]
+        # collapse any doubled "Intensity__" produced by the replacement above back to "Intensity_"
+        cols = [col.replace('Intensity__', 'Intensity_') for col in cols]
+        df.columns = cols
+        return df
 
     def run_annotate(self):
         print('Start running Peptide Annotator...')
@@ -218,8 +226,11 @@ def run_annotate(self):
         df.columns = [col.replace(' ', '_') for col in df.columns]
 
         print(f'Original shape: {df.shape}')
-        # exxtract the peptide sequence, protein accessions and sample columns from the dataframe
-        df = df.loc[:, [self.peptide_col, self.protein_col] + [col for col in df.columns if col.startswith(self.sample_col_prefix)]]
+        # extract the peptide sequence, protein accessions and sample columns from the dataframe
+        intensity_cols = [col for col in df.columns if col.startswith(self.sample_col_prefix)]
+        # drop any column whose name is exactly self.sample_col_prefix (it only matches the prefix, with no sample name after it)
+        intensity_cols = [col for col in intensity_cols if col != self.sample_col_prefix]
+        df = df.loc[:, [self.peptide_col, self.protein_col] + intensity_cols]
 
         print(f'After filtering Intensity 0 in all samples and removing other columns: {df.shape}')
 
@@ -229,6 +240,8 @@ def run_annotate(self):
 
         df_res = self.run_2_result(df)
 
+        df_res = self.rename_columns(df_res)
+
         self.save_result(df_res)
 
         return df_res

diff --git a/metax/taxafunc_analyzer/analyzer.py b/metax/taxafunc_analyzer/analyzer.py
@@ -44,6 +44,7 @@ def __init__(
         meta_path=None,
         peptide_col_name="Sequence",
         protein_col_name="Proteins",
+        sample_col_prefix='Intensity',
         any_df_mode=False,
         custom_col_name="Custom",
     ):
@@ -54,6 +55,7 @@ def __init__(
 
         self.peptide_col_name = peptide_col_name
         self.protein_col_name = protein_col_name
+        self.sample_col_prefix = sample_col_prefix.strip() #remove the space
         self.protein_separator = ';'
         self.custom_col_name = custom_col_name
         self.sample_list: Optional[List[str]] = None
@@ -130,15 +132,27 @@ def _set_original_df(self, df_path: str) -> None:
         col_names = self.original_df.columns.tolist()
         # replace space with _
         col_names = [i.replace(' ', '_') for i in col_names]
-        intensity_col_names = [i for i in col_names if i.startswith('Intensity_')]
+        intensity_col_names = [i for i in col_names if i.startswith(self.sample_col_prefix)]
+        # drop a column named exactly like the prefix, e.g. a bare "Intensity" column showing the total intensity of all samples
+        intensity_col_names = [i for i in intensity_col_names if i != self.sample_col_prefix] 
+
         if len(intensity_col_names) > 0:
-            intensity_col_names = [i.replace('Intensity_', '') for i in intensity_col_names]
+            # append "_" to the prefix when the first intensity column contains "<prefix>_", so the separator is stripped together with the prefix
+            if f'{self.sample_col_prefix}_' in intensity_col_names[0]:
+                self.sample_col_prefix = f'{self.sample_col_prefix}_'
+
+            intensity_col_names = [i.replace(self.sample_col_prefix, '') for i in intensity_col_names]
             self.sample_list = intensity_col_names
+        else:
+            raise ValueError(f"The OTF data must have Intensity columns: with prefix [{self.sample_col_prefix}]")
         ####
 
-        # replace space with _ and remove Intensity_
-        self.original_df.columns = self.original_df.columns.str.replace(
-            ' ', '_').str.replace('Intensity_', '')
+        # replace spaces with "_" and remove the sample prefix from the original_df column names
+        original_col_names = self.original_df.columns.tolist()
+        original_col_names = [i.replace(' ', '_') for i in original_col_names]
+        original_col_names = [i.replace(self.sample_col_prefix, '') for i in original_col_names]
+        self.original_df.columns = original_col_names
+
 
     def _set_meta(self, meta_path=None) -> None:
         if meta_path is None:

diff --git a/metax/utils/version.py b/metax/utils/version.py
@@ -1,2 +1,2 @@
-__version__ = '1.120.1'
+__version__ = '1.120.2'
 API_version = '4'
diff --git a/pyproject.toml b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "MetaXTools"
-version = "1.120.1"
+version = "1.120.2"
 description = "MetaXTools is a novel tool for linking peptide sequences with taxonomic and functional information in Metaproteomics."
 readme = "README_PyPi.md"
 license = { text = "NorthOmics" }