Skip to content

Commit

Permalink
first iteration of cache-based conversion for feature file.
Browse files Browse the repository at this point in the history
  • Loading branch information
ypriverol committed Aug 23, 2023
1 parent 2c2b933 commit 81b4419
Show file tree
Hide file tree
Showing 5 changed files with 128 additions and 27 deletions.
6 changes: 3 additions & 3 deletions python/quantmsio/quantms_io/core/convert.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,9 +11,9 @@

tmt = {
'TMT10': ['TMT126', 'TMT127C', 'TMT127N', 'TMT128C', 'TMT128N', 'TMT129C', 'TMT129N', 'TMT130C', 'TMT130N', 'TMT131'],
'TMT11': ["TMT126", "TMT127N", "TMT127C", "TMT128N", "TMT128C", "TMT129N", "TMT129C", "TMT130N", "TMT130C", "TMT131N", "TMT131C"],
'TMT16': ["TMT126", "TMT127N", "TMT127C", "TMT128N", "TMT128C", "TMT129N", "TMT129C", "TMT130N", "TMT130C", "TMT131N", "TMT131C", "TMT132N", "TMT132C", "TMT133N", "TMT133C", "TMT134N"],
'TMT6': ["TMT126", "TMT127", "TMT128", "TMT129", "TMT130", "TMT131"]
'TMT11': ["TMT126", "TMT127N", "TMT127C", "TMT128N", "TMT128C", "TMT129N", "TMT129C", "TMT130N", "TMT130C", "TMT131N", "TMT131C"],
'TMT16': ["TMT126", "TMT127N", "TMT127C", "TMT128N", "TMT128C", "TMT129N", "TMT129C", "TMT130N", "TMT130C", "TMT131N", "TMT131C", "TMT132N", "TMT132C", "TMT133N", "TMT133C", "TMT134N"],
'TMT6': ["TMT126", "TMT127", "TMT128", "TMT129", "TMT130", "TMT131"]
}


Expand Down
66 changes: 51 additions & 15 deletions python/quantmsio/quantms_io/core/feature.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,11 +16,13 @@
import pyarrow as pa
import pyarrow.parquet as pq
import pandas as pd
from pandas import DataFrame
from scipy.linalg._solve_toeplitz import float64

from quantms_io.core.mztab import MztabHandler
from quantms_io.core.parquet_handler import ParquetHandler
from quantms_io.core.sdrf import SDRFHandler
from quantms_io.utils.constants import TMT_CHANNELS, ITRAQ_CHANNEL
from quantms_io.utils.pride_utils import clean_peptidoform_sequence, compare_protein_lists, \
standardize_protein_list_accession, standardize_protein_string_accession, get_modifications_object_from_mztab_line

Expand All @@ -38,6 +40,30 @@ def get_quantmsio_modifications(modifications_string: str, modification_definiti
modifications_definition=modification_definition)


def get_additional_properties_from_sdrf(feature_dict: dict, experiment_type: str, sdrf_samples: dict) -> dict:

if 'FragmentIon' not in feature_dict: # FragmentIon is not in the MSstats file object if the experiment is label free
feature_dict["FragmentIon"] = None

if 'IsotopeLabelType' not in feature_dict:
feature_dict["IsotopeLabelType"] = "L"

if "TMT" in experiment_type: # TMT experiment
feature_dict["Channel"] = TMT_CHANNELS[experiment_type][feature_dict["Channel"] - 1]
elif "ITRAQ" in experiment_type: # ITRAQ experiment
feature_dict["Channel"] = ITRAQ_CHANNEL[experiment_type][feature_dict["Channel"] - 1]
else: # Label free experiment
feature_dict["Channel"] = "LABEL FREE SAMPLE"

# Get the sample accession from the SDRF file.
sample_id = sdrf_samples[feature_dict["Reference"] + ":_:" + feature_dict["Channel"]]
feature_dict["SampleName"] = sample_id



return feature_dict


class FeatureHandler(ParquetHandler):
"""
This class handle protein tables in column format. The main serialization format is Apache Parquet.
Expand Down Expand Up @@ -148,6 +174,8 @@ def convert_mztab_msstats_to_feature(self, msstats_file: str, sdrf_file: str, mz
:return: none
"""
sdrf_handler = SDRFHandler(sdrf_file)
experiment_type = sdrf_handler.get_experiment_type_from_sdrf()
sdrf_samples = sdrf_handler.get_sample_map()
mztab_handler = MztabHandler(mztab_file, use_cache=use_cache)
mztab_handler.load_mztab_file(use_cache=use_cache)
feature_list = []
Expand All @@ -166,7 +194,7 @@ def convert_mztab_msstats_to_feature(self, msstats_file: str, sdrf_file: str, mz
msstats_values = line.split(",")
# Create a dictionary with the values
feature_dict = dict(zip(msstats_columns, msstats_values))
msstats_feature = self._fetch_msstats_feature(feature_dict, sdrf_handler, mztab_handler)
msstats_feature = self._fetch_msstats_feature(feature_dict, experiment_type, sdrf_samples, mztab_handler)
if msstats_feature is not None:
feature_list.append(msstats_feature)
#feature_table = self.create_feature_table([msstats_feature])
Expand All @@ -192,21 +220,28 @@ def describe_schema(self):
schema_description.append(field_description)
return schema_description

def _fetch_msstats_feature(self, feature_dict: dict, sdrf_handler: SDRFHandler, mztab_handler: MztabHandler):
def _fetch_msstats_feature(self, feature_dict: dict, experiment_type: str, sdrf_samples: dict, mztab_handler: MztabHandler):
"""
Fetch a feature from a MSstats dictionary and convert to a quantms.io format row
:param feature_dict: MSstats feature dictionary
:param sdrf_handler: SDRF handler
:param experiment_type: experiment type
:param sdrf_samples: SDRF samples
:param mztab_handler: mztab handler
:return: quantms.io format row
"""

feature_dict["Reference"] = feature_dict["Reference"].replace("\"", "").split(".")[
0] # Reference is the Msstats form .e.g. HeLa_1to1_01


feature_dict = get_additional_properties_from_sdrf(feature_dict, experiment_type, sdrf_samples)

protein_accessions_list = standardize_protein_list_accession(feature_dict["ProteinName"])
protein_accessions_string = standardize_protein_string_accession(feature_dict["ProteinName"])

peptidoform = feature_dict["PeptideSequence"] # Peptidoform is the Msstats form .e.g. EM(Oxidation)QDLGGGER
peptide_sequence = clean_peptidoform_sequence(peptidoform) # Get sequence .e.g. EMQDLGGGER
charge = feature_dict["PrecursorCharge"] # Charge is the Msstats form .e.g. 2
reference_file = feature_dict["Reference"].replace("\"", "").split(".")[0] # Reference is the Msstats form .e.g. HeLa_1to1_01

peptide_mztab_qvalue = mztab_handler.get_peptide_qvalue_from_index(msstats_peptidoform=peptidoform,
charge=charge)
Expand All @@ -227,8 +262,9 @@ def _fetch_msstats_feature(self, feature_dict: dict, sdrf_handler: SDRFHandler,
modifications_string = None if len(modifications_string)==0 else modifications_string[:-1] # Remove last comma
try:
peptide_count = mztab_handler.get_number_psm_from_index(msstats_peptidoform=peptidoform,
charge=charge, reference_file=reference_file)
charge=charge, reference_file=feature_dict["Reference"])
except:
reference_file = feature_dict["Reference"]
print(f"MBR peptide: {peptidoform}, {charge}, {reference_file}")
return None

Expand Down Expand Up @@ -292,17 +328,17 @@ def _fetch_msstats_feature(self, feature_dict: dict, sdrf_handler: SDRFHandler,
"best_id_score": f"{peptide_score_name}: {peptide_qvalue}",
"intensity": float64(feature_dict["Intensity"]),
"spectral_count": spectral_count,
"sample_accession": "S1",
"condition": "ConditionA",
"fraction": "F1",
"biological_replicate": "BioRep1",
"fragment_ion": "b2",
"isotope_label_type": "LabelA",
"run": "Run1",
"channel": "ChannelA",
"id_scores": [f"{peptide_score_name}: {peptide_mztab_qvalue}"],
"sample_accession": feature_dict["SampleName"],
"condition": feature_dict["Condition"],
"fraction": feature_dict["Fraction"],
"biological_replicate": feature_dict["BioReplicate"],
"fragment_ion": feature_dict["FragmentIon"],
"isotope_label_type": feature_dict["IsotopeLabelType"],
"run": feature_dict["Run"],
"channel": feature_dict["Channel"],
"id_scores": [f"{peptide_score_name}: {peptide_qvalue}"],
"consensus_support": None,
"reference_file_name": reference_file,
"reference_file_name": feature_dict["Reference"],
"scan_number": scan_number,
"exp_mass_to_charge": float64(experimental_mass),
"mz": None,
Expand Down
69 changes: 61 additions & 8 deletions python/quantmsio/quantms_io/core/sdrf.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,9 @@ class SDRFHandler:
PRECURSOR_MASS_TOLERANCE = "comment[precursor mass tolerance]"
LABELING = "comment[label]"

# The supported labeling methods
SUPOORTED_LABELING = ["LABEL FREE", "TMT10", "TMT11", "TMT16", "TMT6", "ITRAQ4", "ITRAQ8"]

def __init__(self, sdrf_file: str):
self.sdrf_file = sdrf_file
self.sdrf_table = None
Expand Down Expand Up @@ -153,11 +156,13 @@ def get_factor_value(self) -> str:
return None
return values[0]

def extract_feature_properties(self, experiment_type: str):
def extract_feature_properties(self):
"""
Extract the feature properties from the SDRF file. These properties are needed by the feature file and
FeatureHandler class. The experiment type can be: "lfq", "tmt" or "itraq".
"""

experiment_type = self.get_experiment_type_from_sdrf()
sdrf_pd = self.sdrf_table.copy() # type: DataFrame

sdrf_pd['comment[data file]'] = sdrf_pd['comment[data file]'].apply(lambda x: x.split(".")[0])
Expand Down Expand Up @@ -188,7 +193,7 @@ def extract_feature_properties(self, experiment_type: str):
sdrf["channel"] = None # Channel will be needed in the LFQ as empty.
return sdrf

def extra_experiment_type_from_sdrf(self):
def get_experiment_type_from_sdrf(self):
"""
Using the SDRF file label column, we will try to extract the experiment type of an SDRF.
The three possible values supported in SDRF are lfq, tmt and itraq.
Expand All @@ -200,13 +205,61 @@ def extra_experiment_type_from_sdrf(self):
if len(labeling_values) == 0:
raise ValueError("The SDRF file provided does not contain any comment[label] value")

if len([i for i in labeling_values if "label free" in i]) > 0:
return "lfq"
elif len([i for i in labeling_values if "tmt" in i]) > 0:
return "tmt"
elif len([i for i in labeling_values if "itraq" in i]) > 0:
return "itraq"
labeling_values = [i.upper() for i in labeling_values]

if len([i for i in labeling_values if "LABEL FREE" in i]) > 0:
return "LFQ"
elif len([i for i in labeling_values if "TMT" in i]) > 0:
if len(labeling_values) == 10:
return "TMT10"
elif len(labeling_values) == 11:
return "TMT11"
elif len(labeling_values) == 16:
return "TMT16"
elif len(labeling_values) == 6:
return "TMT6"
else:
raise ValueError("The SDRF file provided does not contain a supported TMT comment[label] value")
elif len([i for i in labeling_values if "ITRAQ" in i]) > 0:
if len(labeling_values) == 4:
return "ITRAQ4"
elif len(labeling_values) == 8:
return "ITRAQ8"
else:
raise ValueError("The SDRF file provided does not contain a supported iTRAQ comment[label] value")
else:
raise ValueError("The SDRF file provided does not contain any supported comment[label] value")


def get_sample_labels(self):
"""
Get the sample labels from the SDRF file. The sample labels are the values of the column "comment[label]".
:return: set of sample labels
"""
labels = self.sdrf_table['comment[label]'].unique()
return set(labels)

def get_sample_map(self):
"""
Get the sample accession map from the sdrf file. The key of the sample map is:
- data file + :_: + sample label
The value of the sample map is the sample accession.
:return: sample map
"""
sample_map = {}
sdrf_pd = self.sdrf_table.copy() # type: DataFrame
sdrf_pd['comment[data file]'] = sdrf_pd['comment[data file]'].apply(lambda x: x.split(".")[0])
for index, row in sdrf_pd.iterrows():
channel = "LABEL FREE SAMPLE" if "LABEL FREE" in row['comment[label]'].upper() else row['comment[label]']
if row['comment[data file]'] + ":_:" + channel in sample_map:
if sample_map[row['comment[data file]'] + ":_:" + channel] != row['source name']:
raise ValueError("The sample map is not unique")
else:
print("channel {} for sample {} already in the sample map".format(channel, row['source name']))
sample_map[row['comment[data file]'] + ":_:" + channel] = row['source name']
return sample_map





7 changes: 7 additions & 0 deletions python/quantmsio/quantms_io/tests/test_sdrf.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,4 +26,11 @@ def test__load_sdrf_info(self):

print(sdrf_handler.extra_experiment_type_from_sdrf())

def test_get_labels(self):
file = "data/PXD016999-first-instrument.sdrf.tsv"
sdrf_handler=SDRFHandler(file)
self.assertEqual(len(sdrf_handler.get_sample_labels()), 10)

experiment_type = sdrf_handler.get_experiment_type_from_sdrf()


7 changes: 6 additions & 1 deletion python/quantmsio/quantms_io/utils/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,12 @@
'TMT16': ["TMT126", "TMT127N", "TMT127C", "TMT128N", "TMT128C", "TMT129N", "TMT129C", "TMT130N", "TMT130C",
"TMT131N", "TMT131C", "TMT132N", "TMT132C", "TMT133N", "TMT133C", "TMT134N"],
'TMT6': ["TMT126", "TMT127", "TMT128", "TMT129", "TMT130", "TMT131"]
} # TMT channels used in different experiments
}

ITRAQ_CHANNEL = {
'ITRAQ4': ['ITRAQ114', 'ITRAQ115', 'ITRAQ116', 'ITRAQ117'],
'ITRAQ8': ['ITRAQ113', 'ITRAQ114', 'ITRAQ115', 'ITRAQ116', 'ITRAQ117', 'ITRAQ118', 'ITRAQ119', 'ITRAQ121'] # NO EXAMPLES.
}

SINGLE_PROTEIN = "single_protein"
GROUP_PROTEIN = "indistinguishable_protein_group"
Expand Down

0 comments on commit 81b4419

Please sign in to comment.