Skip to content

Commit

Permalink
update: example
Browse files Browse the repository at this point in the history
  • Loading branch information
zprobot committed Nov 12, 2024
1 parent 7b3fe9d commit d08aaf9
Show file tree
Hide file tree
Showing 6 changed files with 33 additions and 20 deletions.
Binary file added docs/include/example.feature.parquet
Binary file not shown.
9 changes: 6 additions & 3 deletions docs/score_map_msg.tsv
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
type field desc map accession
maxquant Score Andromeda:score andromeda_score MS:1002338
maxquant Delta score Andromeda: delta score andromeda_delta_score MS:1003433
software field desc map accession
andromeda Score Andromeda:score andromeda_score MS:1002338
andromeda Delta score Andromeda: delta score andromeda_delta_score MS:1003433
percolator percolator:score percolator_score MS:1001492
openMS OpenMS:Best PSM Score openms_score MS:1003114
MS-GF MS-GF:RawScore msgf_score MS:1002049
5 changes: 5 additions & 0 deletions quantmsio/core/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,11 +58,16 @@
"Q.Value": "qvalue",
"PG.Q.Value": "pg_qvalue",
"Precursor.Normalised": "normalize_intensity",
"PG.MaxLFQ": "lfq",
"Quantity.Quality": "quantitative_score",
"Precursor.Charge": "precursor_charge",
"Stripped.Sequence": "sequence",
"Modified.Sequence": "peptidoform",
"Genes": "gg_names",
"Run": "run"
}
DIANN_USECOLS = list(DIANN_MAP.keys())

MAXQUANT_PSM_MAP = {
"Sequence": "sequence",
"Proteins": "mp_accessions",
Expand Down
31 changes: 17 additions & 14 deletions quantmsio/core/diann.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,9 +15,9 @@
from quantmsio.core.feature import Feature
from quantmsio.core.duckdb import DuckDB
from quantmsio.utils.pride_utils import generate_scan_number
from quantmsio.core.common import DIANN_MAP

from quantmsio.core.common import DIANN_MAP, DIANN_USECOLS

DIANN_SQL = ', '.join([f'"{name}"' for name in DIANN_USECOLS])
class DiaNNConvert(DuckDB):

def __init__(self, diann_report, sdrf_path, duckdb_max_memory="16GB", duckdb_threads=4):
Expand All @@ -42,11 +42,11 @@ def get_report_from_database(self, runs: list) -> pd.DataFrame:
s = time.time()
database = self._duckdb.query(
"""
select "File.Name", "Run", "Protein.Group", "Protein.Ids", "Genes", "RT", "Predicted.RT", "RT.Start", "RT.Stop", "Precursor.Id", "Q.Value", "Global.Q.Value", "PEP",
"PG.Q.Value", "Global.PG.Q.Value", "Modified.Sequence", "Stripped.Sequence", "Precursor.Charge", "Precursor.Quantity", "Precursor.Normalised"
from report
select {}
from report
where Run IN {}
""".format(
DIANN_SQL,
tuple(runs)
)
)
Expand Down Expand Up @@ -101,7 +101,7 @@ def intergrate_msg(n):
sep="\t",
usecols=["Retention_Time", "SpectrumID", "Exp_Mass_To_Charge"],
)
group = report[report["Run"] == n].copy()
group = report[report["run"] == n].copy()
group.sort_values(by="rt_start", inplace=True)
target.rename(
columns={
Expand All @@ -115,8 +115,6 @@ def intergrate_msg(n):
res = pd.merge_asof(group, target, on="rt_start", direction="nearest")
return res

# query duckdb
# best_ref_map = self.get_peptide_map_from_database()
masses_map, modifications_map = self.get_masses_and_modifications_map()

info_list = [
Expand Down Expand Up @@ -183,31 +181,36 @@ def add_additional_msg(self, report: pd.DataFrame):
report["pg_accessions"] = report["pg_accessions"].str.split(";")
report.loc[:, "anchor_protein"] = report["pg_accessions"].str[0]
report.loc[:, "gg_names"] = report["gg_names"].str.split(",")

report.loc[:, "additional_intensities"] = report[
["reference_file_name", "channel", "normalize_intensity"]
["reference_file_name", "channel", "normalize_intensity", "lfq"]
].apply(
lambda rows: [
{
"sample_accession": self._sample_map[rows["reference_file_name"] + "-" + rows["channel"]],
"channel": rows["channel"],
"additional_intensity": [
{"intensity_name": "normalize_intensity", "intensity_value": rows["normalize_intensity"]}
{"intensity_name": "normalize_intensity", "intensity_value": rows["normalize_intensity"]},
{"intensity_name": "lfq", "intensity_value": rows["lfq"]}
],
}
],
axis=1,
)
report.loc[:, "additional_scores"] = report[["qvalue", "pg_qvalue"]].apply(
report.loc[:, "additional_scores"] = report[["qvalue", "pg_qvalue", "global_qvalue"]].apply(
lambda row: [
{"score_name": "qvalue", "score_value": row["qvalue"]},
{"score_name": "pg_qvalue", "score_value": row["pg_qvalue"]},
{"score_name": "global_qvalue", "score_value": row["global_qvalue"]}
],
axis=1,
)
report.loc[:, "cv_params"] = report[["quantitative_score"]].apply(
lambda rows: [
{"cv_name": "quantitative_score", "cv_value": str(rows["quantitative_score"])}
],
axis=1
)
report.loc[:, "scan_reference_file_name"] = None
report.loc[:, "scan"] = None
report.loc[:, "cv_params"] = None
report.loc[:, "gg_accessions"] = None
report.loc[:, "ion_mobility"] = None
report.loc[:, "start_ion_mobility"] = None
Expand Down
6 changes: 4 additions & 2 deletions quantmsio/core/psm.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import re
import pyarrow as pa
import pyarrow.parquet as pq
from quantmsio.utils.file_utils import extract_protein_list
Expand Down Expand Up @@ -87,15 +88,16 @@ def transform_parquet(df):
def _genarate_additional_scores(self, cols):
struct_list = []
for software, score in self._score_names.items():
struct = {"score_name": f"search_engine_score({software})", "score_value": cols[score]}
software = re.sub(r"[^a-zA-Z0-9\s]", "", software)
software = software.lower()
struct = {"score_name": f"{software}_score", "score_value": cols[score]}
struct_list.append(struct)
if cols["global_qvalue"]:
struct = {"score_name": "global_qvalue", "score_value": cols["global_qvalue"]}
struct_list.append(struct)
return struct_list

def add_addition_msg(self, df):
df.loc[:, "cv_params"] = None
df.loc[:, "predicted_rt"] = None
df.loc[:, "ion_mobility"] = None
df.loc[:, "number_peaks"] = None
Expand Down
2 changes: 1 addition & 1 deletion quantmsio/operate/tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,7 @@ def generate_feature_of_gene(
pqwriters, pqwriter_no_part = save_parquet_file(
partitions, table, output_folder, filename, pqwriters, pqwriter_no_part
)
close_file(partitions, pqwriters, pqwriter_no_part)
close_file(pqwriters, pqwriter_no_part)


def map_protein_for_tsv(path: str, fasta: str, output_path: str, map_parameter: str):
Expand Down

0 comments on commit d08aaf9

Please sign in to comment.