Add todos

Proteobench · Sep 26, 2023 · a5a5e8c · a5a5e8c
1 parent 60d8ae6
commit a5a5e8c
Show file tree

Hide file tree

Showing 3 changed files with 23 additions and 5 deletions.
diff --git a/proteobench/modules/dda_quant/datapoint.py b/proteobench/modules/dda_quant/datapoint.py
@@ -7,7 +7,7 @@
 @dataclass
 class Datapoint:
     """Data used to stored the"""
-
+    # TODO add threshold value used for presence ion/peptidoform
     id: str = None
     search_engine: str = None
     software_version: int = 0
@@ -51,7 +51,8 @@ def calculate_plot_data(self, df):
             prop_ratios.append(prop_ratio)
             sum_ratios += prop_ratio
             nr_missing_0 += f
-
+
+        # TODO rename/document code
         self.weighted_sum = round(sum_ratios, ndigits=3)
         self.nr_prec = len(df)
 
@@ -65,6 +66,7 @@ def generate_id(self):
         )
         print(self.id)
 
+    # TODO, not used? Can be removed?
     def dump_json_object(self, file_name):
         f = open(file_name, "a")
         f.write(json.dumps(asdict(self)))

diff --git a/proteobench/modules/dda_quant/module.py b/proteobench/modules/dda_quant/module.py
@@ -31,11 +31,12 @@ def generate_intermediate(
         """Take the generic format of data search output and convert it to get the quantification data (a tuple, the quantification measure and the reliability of it)."""
 
         # Summarize values of the same peptide using mean
+        # TODO should we take the mean or the sum when the same peptidoform/peptide ion has multiple intensities in the same raw file?
         quant_raw_df = filtered_df.groupby(["peptidoform", "Raw file"]).Intensity.mean()
         quant_df = quant_raw_df.unstack(level=1)
 
         # Count number of values per peptidoform and Raw file
-
+        # TODO calculate this on the log2 transformed values
         for replicate, replicate_runs in replicate_to_raw.items():
             selected_replicate_df = quant_raw_df.index.get_level_values(
                 "Raw file"
@@ -47,11 +48,13 @@ def generate_intermediate(
             quant_df["mean_of_" + str(replicate)] = mean_series
 
             ## Add number of missing values per row of replicate
+            # TODO keep missing values, filter later for calculation of ratios
             missing_series = replicate_quant_df.isna().groupby(["peptidoform"]).sum()
             quant_df["missing_values_" + str(replicate)] = missing_series
 
         species_peptidoform = list(parse_settings.species_dict.values())
         species_peptidoform.append("peptidoform")
+        # TODO check, do we need to drop_duplicates? When?
         peptidoform_to_species = filtered_df[species_peptidoform].drop_duplicates()
         peptidoform_to_species.index = peptidoform_to_species["peptidoform"]
         peptidoform_to_species_dict = peptidoform_to_species.T.to_dict()
@@ -70,6 +73,8 @@ def generate_intermediate(
             species_df_slice = cv_replicate_quant_species_df[
                 cv_replicate_quant_species_df[species] == True
             ]
+            # TODO add cutoffs for different presence thresholds of the peptide ion
+            # TODO do subtraction for log2 transformed values
             for conditions in itertools.combinations(
                 set(parse_settings.replicate_mapper.values()), 2
             ):
@@ -89,6 +94,9 @@ def generate_intermediate(
                     * 100
                 )
 
+                # Ratios are accumulated across loop iterations: if an entry
+                # already exists, then concat to the existing DF; otherwise
+                # a KeyError is raised and a new DF is made
                 try:
                     ratio_dict[condition_comp_id + "_ratio"] = pd.concat(
                         [ratio, ratio_dict[condition_comp_id + "_ratio"]]

diff --git a/proteobench/modules/dda_quant/parse.py b/proteobench/modules/dda_quant/parse.py
@@ -13,6 +13,7 @@ def convert_to_standard_format(
         self, df: pd.DataFrame, parse_settings: ParseSettings
     ) -> tuple[pd.DataFrame, Dict[int, List[str]]]:
         """Convert a search engine output into a generic format supported by the module."""
+        # TODO add functionality/steps in docstring
 
         for k, v in parse_settings.mapper.items():
             if k not in df.columns:
@@ -40,7 +41,10 @@ def convert_to_standard_format(
             > parse_settings.min_count_multispec
         )
 
+        df = df[df["MULTI_SPEC"] == False]
+
         # If there is "Raw file" then it is a long format, otherwise short format
+        # TODO we might need to generalize this with toml
         if "Raw file" not in parse_settings.mapper.values():
             meltvars = parse_settings.replicate_mapper.keys()
             df = df.melt(
@@ -49,13 +53,17 @@ def convert_to_standard_format(
                 var_name="Raw file",
                 value_name="Intensity",
             )
+
+        # TODO replace with condition_mapper
         df["replicate"] = df["Raw file"].map(parse_settings.replicate_mapper)
         df = pd.concat([df, pd.get_dummies(df["Raw file"])], axis=1)
 
-        df = df[df["MULTI_SPEC"] == False]
-
         # TODO, if "Charge" is not available return a sensible error
+        # TODO, include modifications for ion
         df.loc[df.index, "peptidoform"] = df.loc[df.index, "Sequence"]+"|Z="+df.loc[df.index, "Charge"].map(str)
+
+        # TODO use peptide_ion or peptidoform here
+        # TODO move this to datapoint, keep a count here of quantified AA
         count_non_zero = (
             df.groupby(["Sequence", "Raw file"])["Intensity"].sum() > 0.0
         ).groupby(level=[0]).sum() == 6