diff --git a/python/quantmsio/quantms_io/core/convert.py b/python/quantmsio/quantms_io/core/convert.py index 5526b75..5fcb561 100644 --- a/python/quantmsio/quantms_io/core/convert.py +++ b/python/quantmsio/quantms_io/core/convert.py @@ -11,9 +11,9 @@ tmt = { 'TMT10': ['TMT126', 'TMT127C', 'TMT127N', 'TMT128C', 'TMT128N', 'TMT129C', 'TMT129N', 'TMT130C', 'TMT130N', 'TMT131'], - 'TMT11': ["TMT126", "TMT127N", "TMT127C", "TMT128N", "TMT128C", "TMT129N", "TMT129C", "TMT130N", "TMT130C", "TMT131N", "TMT131C"], - 'TMT16': ["TMT126", "TMT127N", "TMT127C", "TMT128N", "TMT128C", "TMT129N", "TMT129C", "TMT130N", "TMT130C", "TMT131N", "TMT131C", "TMT132N", "TMT132C", "TMT133N", "TMT133C", "TMT134N"], - 'TMT6': ["TMT126", "TMT127", "TMT128", "TMT129", "TMT130", "TMT131"] + 'TMT11': ["TMT126", "TMT127N", "TMT127C", "TMT128N", "TMT128C", "TMT129N", "TMT129C", "TMT130N", "TMT130C", "TMT131N", "TMT131C"], + 'TMT16': ["TMT126", "TMT127N", "TMT127C", "TMT128N", "TMT128C", "TMT129N", "TMT129C", "TMT130N", "TMT130C", "TMT131N", "TMT131C", "TMT132N", "TMT132C", "TMT133N", "TMT133C", "TMT134N"], + 'TMT6': ["TMT126", "TMT127", "TMT128", "TMT129", "TMT130", "TMT131"] } diff --git a/python/quantmsio/quantms_io/core/feature.py b/python/quantmsio/quantms_io/core/feature.py index 3e22c16..239e1bb 100644 --- a/python/quantmsio/quantms_io/core/feature.py +++ b/python/quantmsio/quantms_io/core/feature.py @@ -16,11 +16,13 @@ import pyarrow as pa import pyarrow.parquet as pq import pandas as pd +from pandas import DataFrame from scipy.linalg._solve_toeplitz import float64 from quantms_io.core.mztab import MztabHandler from quantms_io.core.parquet_handler import ParquetHandler from quantms_io.core.sdrf import SDRFHandler +from quantms_io.utils.constants import TMT_CHANNELS, ITRAQ_CHANNEL from quantms_io.utils.pride_utils import clean_peptidoform_sequence, compare_protein_lists, \ standardize_protein_list_accession, standardize_protein_string_accession, get_modifications_object_from_mztab_line @@ -38,6 +40,30 @@ def get_quantmsio_modifications(modifications_string: str, modification_definiti modifications_definition=modification_definition) +def get_additional_properties_from_sdrf(feature_dict: dict, experiment_type: str, sdrf_samples: dict) -> dict: + + if 'FragmentIon' not in feature_dict: # FragmentIon is not in the MSstats file object if the experiment is label free + feature_dict["FragmentIon"] = None + + if 'IsotopeLabelType' not in feature_dict: + feature_dict["IsotopeLabelType"] = "L" + + if "TMT" in experiment_type: # TMT experiment + feature_dict["Channel"] = TMT_CHANNELS[experiment_type][feature_dict["Channel"] - 1] + elif "ITRAQ" in experiment_type: # ITRAQ experiment + feature_dict["Channel"] = ITRAQ_CHANNEL[experiment_type][feature_dict["Channel"] - 1] + else: # Label free experiment + feature_dict["Channel"] = "LABEL FREE SAMPLE" + + # Get the sample accession from the SDRF file. + sample_id = sdrf_samples[feature_dict["Reference"] + ":_:" + feature_dict["Channel"]] + feature_dict["SampleName"] = sample_id + + + + return feature_dict + + class FeatureHandler(ParquetHandler): """ This class handle protein tables in column format. The main serialization format is Apache Parquet. @@ -148,6 +174,8 @@ def convert_mztab_msstats_to_feature(self, msstats_file: str, sdrf_file: str, mz :return: none """ sdrf_handler = SDRFHandler(sdrf_file) + experiment_type = sdrf_handler.get_experiment_type_from_sdrf() + sdrf_samples = sdrf_handler.get_sample_map() mztab_handler = MztabHandler(mztab_file, use_cache=use_cache) mztab_handler.load_mztab_file(use_cache=use_cache) feature_list = [] @@ -166,7 +194,7 @@ def convert_mztab_msstats_to_feature(self, msstats_file: str, sdrf_file: str, mz msstats_values = line.split(",") # Create a dictionary with the values feature_dict = dict(zip(msstats_columns, msstats_values)) - msstats_feature = self._fetch_msstats_feature(feature_dict, sdrf_handler, mztab_handler) + msstats_feature = self._fetch_msstats_feature(feature_dict, experiment_type, sdrf_samples, mztab_handler) if msstats_feature is not None: feature_list.append(msstats_feature) #feature_table = self.create_feature_table([msstats_feature]) @@ -192,21 +220,28 @@ def describe_schema(self): schema_description.append(field_description) return schema_description - def _fetch_msstats_feature(self, feature_dict: dict, sdrf_handler: SDRFHandler, mztab_handler: MztabHandler): + def _fetch_msstats_feature(self, feature_dict: dict, experiment_type: str, sdrf_samples: dict, mztab_handler: MztabHandler): """ Fetch a feature from a MSstats dictionary and convert to a quantms.io format row :param feature_dict: MSstats feature dictionary - :param sdrf_handler: SDRF handler + :param experiment_type: experiment type + :param sdrf_samples: SDRF samples :param mztab_handler: mztab handler :return: quantms.io format row """ + + feature_dict["Reference"] = feature_dict["Reference"].replace("\"", "").split(".")[ + 0] # Reference is the Msstats form .e.g. HeLa_1to1_01 + + + feature_dict = get_additional_properties_from_sdrf(feature_dict, experiment_type, sdrf_samples) + protein_accessions_list = standardize_protein_list_accession(feature_dict["ProteinName"]) protein_accessions_string = standardize_protein_string_accession(feature_dict["ProteinName"]) peptidoform = feature_dict["PeptideSequence"] # Peptidoform is the Msstats form .e.g. EM(Oxidation)QDLGGGER peptide_sequence = clean_peptidoform_sequence(peptidoform) # Get sequence .e.g. EMQDLGGGER charge = feature_dict["PrecursorCharge"] # Charge is the Msstats form .e.g. 2 - reference_file = feature_dict["Reference"].replace("\"", "").split(".")[0] # Reference is the Msstats form .e.g. HeLa_1to1_01 peptide_mztab_qvalue = mztab_handler.get_peptide_qvalue_from_index(msstats_peptidoform=peptidoform, charge=charge) @@ -227,8 +262,9 @@ def _fetch_msstats_feature(self, feature_dict: dict, sdrf_handler: SDRFHandler, modifications_string = None if len(modifications_string)==0 else modifications_string[:-1] # Remove last comma try: peptide_count = mztab_handler.get_number_psm_from_index(msstats_peptidoform=peptidoform, - charge=charge, reference_file=reference_file) + charge=charge, reference_file=feature_dict["Reference"]) except: + reference_file = feature_dict["Reference"] print(f"MBR peptide: {peptidoform}, {charge}, {reference_file}") return None @@ -292,17 +328,17 @@ def _fetch_msstats_feature(self, feature_dict: dict, sdrf_handler: SDRFHandler, "best_id_score": f"{peptide_score_name}: {peptide_qvalue}", "intensity": float64(feature_dict["Intensity"]), "spectral_count": spectral_count, - "sample_accession": "S1", - "condition": "ConditionA", - "fraction": "F1", - "biological_replicate": "BioRep1", - "fragment_ion": "b2", - "isotope_label_type": "LabelA", - "run": "Run1", - "channel": "ChannelA", - "id_scores": [f"{peptide_score_name}: {peptide_mztab_qvalue}"], + "sample_accession": feature_dict["SampleName"], + "condition": feature_dict["Condition"], + "fraction": feature_dict["Fraction"], + "biological_replicate": feature_dict["BioReplicate"], + "fragment_ion": feature_dict["FragmentIon"], + "isotope_label_type": feature_dict["IsotopeLabelType"], + "run": feature_dict["Run"], + "channel": feature_dict["Channel"], + "id_scores": [f"{peptide_score_name}: {peptide_qvalue}"], "consensus_support": None, - "reference_file_name": reference_file, + "reference_file_name": feature_dict["Reference"], "scan_number": scan_number, "exp_mass_to_charge": float64(experimental_mass), "mz": None, diff --git a/python/quantmsio/quantms_io/core/sdrf.py b/python/quantmsio/quantms_io/core/sdrf.py index acd2fa6..cce0072 100644 --- a/python/quantmsio/quantms_io/core/sdrf.py +++ b/python/quantmsio/quantms_io/core/sdrf.py @@ -88,6 +88,9 @@ class SDRFHandler: PRECURSOR_MASS_TOLERANCE = "comment[precursor mass tolerance]" LABELING = "comment[label]" + # The supported labeling methods + SUPOORTED_LABELING = ["LABEL FREE", "TMT10", "TMT11", "TMT16", "TMT6", "ITRAQ4", "ITRAQ8"] + def __init__(self, sdrf_file: str): self.sdrf_file = sdrf_file self.sdrf_table = None @@ -153,11 +156,13 @@ def get_factor_value(self) -> str: return None return values[0] - def extract_feature_properties(self, experiment_type: str): + def extract_feature_properties(self): """ Extract the feature properties from the SDRF file. These properties are needed by the feature file and FeatureHandler class. The experiment type can be: "lfq", "tmt" or "itraq". """ + + experiment_type = self.get_experiment_type_from_sdrf() sdrf_pd = self.sdrf_table.copy() # type: DataFrame sdrf_pd['comment[data file]'] = sdrf_pd['comment[data file]'].apply(lambda x: x.split(".")[0]) @@ -188,7 +193,7 @@ def extract_feature_properties(self, experiment_type: str): sdrf["channel"] = None # Channel will be needed in the LFQ as empty. return sdrf - def extra_experiment_type_from_sdrf(self): + def get_experiment_type_from_sdrf(self): """ Using the SDRF file label column, we will try to extract the experiment type of an SDRF. The three possible values supported in SDRF are lfq, tmt and itraq. @@ -200,13 +205,61 @@ def extra_experiment_type_from_sdrf(self): if len(labeling_values) == 0: raise ValueError("The SDRF file provided does not contain any comment[label] value") - if len([i for i in labeling_values if "label free" in i]) > 0: - return "lfq" - elif len([i for i in labeling_values if "tmt" in i]) > 0: - return "tmt" - elif len([i for i in labeling_values if "itraq" in i]) > 0: - return "itraq" + labeling_values = [i.upper() for i in labeling_values] + + if len([i for i in labeling_values if "LABEL FREE" in i]) > 0: + return "LFQ" + elif len([i for i in labeling_values if "TMT" in i]) > 0: + if len(labeling_values) == 10: + return "TMT10" + elif len(labeling_values) == 11: + return "TMT11" + elif len(labeling_values) == 16: + return "TMT16" + elif len(labeling_values) == 6: + return "TMT6" + else: + raise ValueError("The SDRF file provided does not contain a supported TMT comment[label] value") + elif len([i for i in labeling_values if "ITRAQ" in i]) > 0: + if len(labeling_values) == 4: + return "ITRAQ4" + elif len(labeling_values) == 8: + return "ITRAQ8" + else: + raise ValueError("The SDRF file provided does not contain a supported iTRAQ comment[label] value") else: raise ValueError("The SDRF file provided does not contain any supported comment[label] value") + def get_sample_labels(self): + """ + Get the sample labels from the SDRF file. The sample labels are the values of the column "comment[label]". + :return: set of sample labels + """ + labels = self.sdrf_table['comment[label]'].unique() + return set(labels) + + def get_sample_map(self): + """ + Get the sample accession map from the sdrf file. The key of the sample map is: + - data file + :_: + sample label + The value of the sample map is the sample accession. + :return: sample map + """ + sample_map = {} + sdrf_pd = self.sdrf_table.copy() # type: DataFrame + sdrf_pd['comment[data file]'] = sdrf_pd['comment[data file]'].apply(lambda x: x.split(".")[0]) + for index, row in sdrf_pd.iterrows(): + channel = "LABEL FREE SAMPLE" if "LABEL FREE" in row['comment[label]'].upper() else row['comment[label]'] + if row['comment[data file]'] + ":_:" + channel in sample_map: + if sample_map[row['comment[data file]'] + ":_:" + channel] != row['source name']: + raise ValueError("The sample map is not unique") + else: + print("channel {} for sample {} already in the sample map".format(channel, row['source name'])) + sample_map[row['comment[data file]'] + ":_:" + channel] = row['source name'] + return sample_map + + + + + diff --git a/python/quantmsio/quantms_io/tests/test_sdrf.py b/python/quantmsio/quantms_io/tests/test_sdrf.py index 9cfc6d9..cc8f9c5 100644 --- a/python/quantmsio/quantms_io/tests/test_sdrf.py +++ b/python/quantmsio/quantms_io/tests/test_sdrf.py @@ -26,4 +26,11 @@ def test__load_sdrf_info(self): print(sdrf_handler.extra_experiment_type_from_sdrf()) + def test_get_labels(self): + file = "data/PXD016999-first-instrument.sdrf.tsv" + sdrf_handler=SDRFHandler(file) + self.assertEqual(len(sdrf_handler.get_sample_labels()), 10) + + experiment_type = sdrf_handler.get_experiment_type_from_sdrf() + diff --git a/python/quantmsio/quantms_io/utils/constants.py b/python/quantmsio/quantms_io/utils/constants.py index c7fa2d2..da13364 100644 --- a/python/quantmsio/quantms_io/utils/constants.py +++ b/python/quantmsio/quantms_io/utils/constants.py @@ -10,7 +10,12 @@ 'TMT16': ["TMT126", "TMT127N", "TMT127C", "TMT128N", "TMT128C", "TMT129N", "TMT129C", "TMT130N", "TMT130C", "TMT131N", "TMT131C", "TMT132N", "TMT132C", "TMT133N", "TMT133C", "TMT134N"], 'TMT6': ["TMT126", "TMT127", "TMT128", "TMT129", "TMT130", "TMT131"] -} # TMT channels used in different experiments +} + +ITRAQ_CHANNEL = { + 'ITRAQ4': ['ITRAQ114', 'ITRAQ115', 'ITRAQ116', 'ITRAQ117'], + 'ITRAQ8': ['ITRAQ113', 'ITRAQ114', 'ITRAQ115', 'ITRAQ116', 'ITRAQ117', 'ITRAQ118', 'ITRAQ119', 'ITRAQ121'] # NO EXAMPLES. +} SINGLE_PROTEIN = "single_protein" GROUP_PROTEIN = "indistinguishable_protein_group"