Skip to content

Commit

Permalink
Add todos
Browse files Browse the repository at this point in the history
  • Loading branch information
RobbinBouwmeester committed Sep 26, 2023
1 parent 60d8ae6 commit a5a5e8c
Show file tree
Hide file tree
Showing 3 changed files with 23 additions and 5 deletions.
6 changes: 4 additions & 2 deletions proteobench/modules/dda_quant/datapoint.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
@dataclass
class Datapoint:
"""Data used to store the metadata and calculated metrics of one datapoint."""

# TODO add the threshold value used for the presence of an ion/peptidoform
id: str = None
search_engine: str = None
software_version: int = 0
Expand Down Expand Up @@ -51,7 +51,8 @@ def calculate_plot_data(self, df):
prop_ratios.append(prop_ratio)
sum_ratios += prop_ratio
nr_missing_0 += f


# TODO rename/document code
self.weighted_sum = round(sum_ratios, ndigits=3)
self.nr_prec = len(df)

Expand All @@ -65,6 +66,7 @@ def generate_id(self):
)
print(self.id)

# TODO, not used? Can be removed?
def dump_json_object(self, file_name):
f = open(file_name, "a")
f.write(json.dumps(asdict(self)))
Expand Down
10 changes: 9 additions & 1 deletion proteobench/modules/dda_quant/module.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,11 +31,12 @@ def generate_intermediate(
"""Take the generic format of data search output and convert it to get the quantification data (a tuple, the quantification measure and the reliability of it)."""

# Summarize values of the same peptide using mean
# TODO should we take the mean or the sum when the same peptidoform/peptide ion has multiple intensities in the same raw file?
quant_raw_df = filtered_df.groupby(["peptidoform", "Raw file"]).Intensity.mean()
quant_df = quant_raw_df.unstack(level=1)

# Count number of values per peptidoform and Raw file

# TODO calculate this on the log2 transformed values
for replicate, replicate_runs in replicate_to_raw.items():
selected_replicate_df = quant_raw_df.index.get_level_values(
"Raw file"
Expand All @@ -47,11 +48,13 @@ def generate_intermediate(
quant_df["mean_of_" + str(replicate)] = mean_series

## Add number of missing values per row of replicate
# TODO keep missing values, filter later for calculation of ratios
missing_series = replicate_quant_df.isna().groupby(["peptidoform"]).sum()
quant_df["missing_values_" + str(replicate)] = missing_series

species_peptidoform = list(parse_settings.species_dict.values())
species_peptidoform.append("peptidoform")
# TODO check, do we need to drop_duplicates? When?
peptidoform_to_species = filtered_df[species_peptidoform].drop_duplicates()
peptidoform_to_species.index = peptidoform_to_species["peptidoform"]
peptidoform_to_species_dict = peptidoform_to_species.T.to_dict()
Expand All @@ -70,6 +73,8 @@ def generate_intermediate(
species_df_slice = cv_replicate_quant_species_df[
cv_replicate_quant_species_df[species] == True
]
# TODO add cutoffs for different thresholds presence of peptide ion
# TODO do subtraction for log2-transformed values
for conditions in itertools.combinations(
set(parse_settings.replicate_mapper.values()), 2
):
Expand All @@ -89,6 +94,9 @@ def generate_intermediate(
* 100
)

# This loop accumulates the resulting ratios: if an entry already
# exists, then concat to the existing DF; otherwise a KeyError
# is raised and a new DF is made
try:
ratio_dict[condition_comp_id + "_ratio"] = pd.concat(
[ratio, ratio_dict[condition_comp_id + "_ratio"]]
Expand Down
12 changes: 10 additions & 2 deletions proteobench/modules/dda_quant/parse.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ def convert_to_standard_format(
self, df: pd.DataFrame, parse_settings: ParseSettings
) -> tuple[pd.DataFrame, Dict[int, List[str]]]:
"""Convert a search engine output into a generic format supported by the module."""
# TODO add functionality/steps to the docstring

for k, v in parse_settings.mapper.items():
if k not in df.columns:
Expand Down Expand Up @@ -40,7 +41,10 @@ def convert_to_standard_format(
> parse_settings.min_count_multispec
)

df = df[df["MULTI_SPEC"] == False]

# If "Raw file" is among the mapped columns the input is already in long format; otherwise it is in wide format and is melted into long format
# TODO we might need to generalize this with toml
if "Raw file" not in parse_settings.mapper.values():
meltvars = parse_settings.replicate_mapper.keys()
df = df.melt(
Expand All @@ -49,13 +53,17 @@ def convert_to_standard_format(
var_name="Raw file",
value_name="Intensity",
)

# TODO replace with condition_mapper
df["replicate"] = df["Raw file"].map(parse_settings.replicate_mapper)
df = pd.concat([df, pd.get_dummies(df["Raw file"])], axis=1)

df = df[df["MULTI_SPEC"] == False]

# TODO, if "Charge" is not available raise a sensible error
# TODO, include modifications for ion
df.loc[df.index, "peptidoform"] = df.loc[df.index, "Sequence"]+"|Z="+df.loc[df.index, "Charge"].map(str)

# TODO use peptide_ion or peptidoform here
# TODO move this to datapoint, keep a count here of quantified AA
count_non_zero = (
df.groupby(["Sequence", "Raw file"])["Intensity"].sum() > 0.0
).groupby(level=[0]).sum() == 6
Expand Down

0 comments on commit a5a5e8c

Please sign in to comment.