Skip to content

Commit

Permalink
Merge pull request #1 from statisticalbiotechnology/master
Browse files Browse the repository at this point in the history
merge upstream into enetz
  • Loading branch information
enetz authored Jan 16, 2020
2 parents d331e92 + dac71b2 commit ae78695
Show file tree
Hide file tree
Showing 17 changed files with 2,487 additions and 132 deletions.
5 changes: 5 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
data/

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
Expand Down Expand Up @@ -127,3 +129,6 @@ dmypy.json

# Pyre type checker
.pyre/
.idea/
*.nf
data/
15 changes: 15 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,3 +9,18 @@ The document is populated with pages for
- [Presentation](https://docs.google.com/presentation/d/1f9gMnzccAfw_EnLuwh-cbEAngYUHMVzfp19Fa_9URrc/edit?usp=sharing)

The html version of this page can be reached [here](https://statisticalbiotechnology.github.io/specpride/)

## Core contributors and Collaborators (Please add your name here):

- [Wout Bittremieux](mailto:[email protected]), UCSD, USA
- [Michael Turewicz](mailto:[email protected]), Ruhr-University Bochum, Germany
- [Ralf Gabriels](mailto:[email protected]), VIB-UGent Center for Medical Biotechnology, Belgium
- [Timo Sachsenberg](mailto:[email protected]), Univ. of Tübingen, Germany
- [Eric Deutsch](mailto:[email protected]), ISB, USA
- [Yasset Perez-Riverol](mailto:[email protected]), EBI, UK
- [Lukas Käll](mailto:[email protected]), KTH, Sweden
- [Julia Bubis](mailto:[email protected]), INEPCP RAS, Russia
- [Mark Ivanov](mailto:[email protected]), INEPCP RAS, Russia
- [Lev Levitsky](mailto:[email protected]), INEPCP RAS, Russia
- [Sander Willems](mailto:[email protected]), Laboratory of Pharmaceutical Biotechnology, Ghent University, Belgium
- [Henry Webel](mailto:[email protected]), Novo Nordisk Center for Protein Research, Copenhagen University, Denmark
45 changes: 45 additions & 0 deletions id/MSGFPlus.ini

Large diffs are not rendered by default.

63 changes: 63 additions & 0 deletions id/comet.ini

Large diffs are not rendered by default.

1,148 changes: 1,148 additions & 0 deletions id/final_concatenated_target_decoy.fasta

Large diffs are not rendered by default.

39 changes: 39 additions & 0 deletions id/xtandem.ini

Large diffs are not rendered by default.

8 changes: 8 additions & 0 deletions literature.md
Original file line number Diff line number Diff line change
Expand Up @@ -18,3 +18,11 @@ Different strategies have been suggested for forming representative spectra. [Fr

Proteome tool's publication [Zolg *et al.*](https://www.nature.com/articles/nmeth.4153)
We have extracted a subset of this set to a [google drive](https://drive.google.com/drive/u/1/folders/1VO9VXTsfacZB7yna_3yw77a7AegRu34G).

## References
- [Frank et al. (JPR 2008)](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2533155/)
- [Wang et al. (bioRxiv_2018)](https://www.biorxiv.org/content/10.1101/308627v2.full.pdf)
- [Saeed et al. (Trans Comput Biol Bioinform 2014)](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6143137/)
- [Griss et al. (Nat Methods 2013), see supplementary material](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3667236/)
- [Perez‐Riverol et al.(Proteomics 2018)](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6099476/)
- [Denoising Peptide Tandem Mass Spectra for Spectral Libraries: A Bayesian Approach](https://pubs.acs.org/doi/10.1021/pr400080b)
2 changes: 2 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
pyopenms
click
alabaster
altair
altgraph
Expand Down
214 changes: 214 additions & 0 deletions src/average_spectrum_clustering.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,214 @@
import argparse
import numpy as np
from itertools import groupby
from pyteomics import mgf, mass

H = mass.nist_mass['H+'][0][0]

# The following constants represent arbitrarily set default values for several parameters
# Each of these parameters will affect benchmarking and should be carefully chosen.
# See also the list of optional command-line arguments, as they will affect output.

# Here is an incomplete list of things that can be implemented differently:
# - how to set the precursor mass for representative spectrum
# - whether to filter the representative spectrum for low-intensity peaks and how
# (e.g. "dynamic range") filter currently implemented
# - whether to use peak intensities as weights for averaging within MS/MS-level clusters
# - filter peak clusters based on the number of scans they are representing
# - spectrum preprocessing (deisotoping / denoising / "demixing")
# before clustering AND in subsequent benchmarks

DIFF_THRESH = 0.01
DYN_RANGE = 1000
MIN_FRACTION = 0.5


def average_spectrum(spectra, title='', pepmass='', rtinseconds='', charge='', **kwargs):
'''
Produces an average spectrum for a cluster.
Parameters
----------
spectra : Iterable
An iterable of spectra, each spectrum is expected to be in pyteomics format.
title : str, optional
Title of the output spectrum
mz_accuracy : float, keyword only, optional
m/z accuracy used for MS/MS clustering. Default is `DIFF_THRESH`
dyn_range : float, keyword only, optional
Dynamic range to apply to output (peaks less than max_intensity / dyn_range
are discarded)
Returns
-------
out : average spectrum in pyteomics format
'''
mz_accuracy = kwargs.get('mz_accuracy', DIFF_THRESH)
dyn_range = kwargs.get('dyn_range', DYN_RANGE)
min_fraction = kwargs.get('min_fraction', MIN_FRACTION)
mz_arrays, int_arrays = [], []
n = 0 # number of spectra
for s in spectra:
mz_arrays.append(s['m/z array'])
int_arrays.append(s['intensity array'])
n += 1
if n > 1:
mz_array_all = np.concatenate(mz_arrays)
intensity_array_all = np.concatenate(int_arrays)

idx = np.argsort(mz_array_all)
mz_array_all = mz_array_all[idx]
intensity_array_all = intensity_array_all[idx]
diff_array = np.diff(mz_array_all)

new_mz_array = []
new_intensity_array = []

ind_list = list(np.where(diff_array >= mz_accuracy)[0]+1)

i_prev = ind_list[0]

mz_array_sum = np.cumsum(mz_array_all)
intensity_array_sum = np.cumsum(intensity_array_all)

min_l = min_fraction * n
if i_prev >= min_l:
new_mz_array.append(mz_array_sum[i_prev-1]/(i_prev))
new_intensity_array.append(intensity_array_sum[i_prev-1]/n)

for i in ind_list[1:-1]:
if i - i_prev >= min_l:
new_mz_array.append((mz_array_sum[i-1]-mz_array_sum[i_prev-1])/(i-i_prev))
new_intensity_array.append((intensity_array_sum[i-1]-intensity_array_sum[i_prev-1])/n)
i_prev = i

if (len(mz_array_sum) - i_prev) >= min_l:
new_mz_array.append((mz_array_sum[-1] - mz_array_sum[i_prev-1])/(len(mz_array_sum) - i_prev))
new_intensity_array.append((intensity_array_sum[-1] - intensity_array_sum[i_prev-1])/n)
else:
new_mz_array = mz_arrays[0]
new_intensity_array = int_arrays[0]

new_mz_array = np.array(new_mz_array)
new_intensity_array = np.array(new_intensity_array)

min_i = new_intensity_array.max()/dyn_range
idx = new_intensity_array >= min_i
new_intensity_array = new_intensity_array[idx]
new_mz_array = new_mz_array[idx]

return {'params': {'title': title, 'pepmass': pepmass,
'rtinseconds': rtinseconds, 'charge': charge},
'm/z array': new_mz_array,
'intensity array': new_intensity_array}


def _lower_median_mass_index(masses):
i = np.argsort(masses)
k = (len(masses) - 1) // 2
idx = i[k]
return idx, masses[idx]

def lower_median_mass(spectra):
masses, charges = _neutral_masses(spectra)
i, m = _lower_median_mass_index(masses)
z = charges[i]
return (m + z*H) / z, z

def lower_median_mass_rt(spectra):
masses, charges = _neutral_masses(spectra)
rts = [s['params']['rtinseconds'] for s in spectra]
i, m = _lower_median_mass_index(masses)
return rts[i]

def get_cluster_id(title):
return title.split(';', 1)[0]

def naive_average_mass_and_charge(spectra):
mzs = [s['params']['pepmass'][0] for s in spectra]
charges = {tuple(s['params']['charge']) for s in spectra}
if len(charges) > 1:
raise ValueError('There are different charge states in the cluster. Cannot average precursor m/z.')
return sum(mzs) / len(mzs), charges.pop()[0]

def _neutral_masses(spectra):
mzs = [s['params']['pepmass'][0] for s in spectra]
charges = [s['params']['charge'][0] for s in spectra if len(s['params']['charge']) == 1]
masses = [(m*c-c*H) for m, c in zip(mzs, charges)]
return masses, charges

def neutral_average_mass_and_charge(spectra):
masses, charges = _neutral_masses(spectra)
z = int(round(sum(charges)/len(charges)))
avg_mass = sum(masses) / len(masses)
return (avg_mass + z*H) / z, z

def median_rt(spectra):
rts = [s['params']['rtinseconds'] for s in spectra]
return np.median(rts)


def process_maracluster_mgf(fname, get_cluster=get_cluster_id,
get_pepmass=naive_average_mass_and_charge,
get_rt=median_rt,
**kwargs):
outputs = []
with mgf.IndexedMGF(fname, read_charges=False) as fin:
titles = fin.index.keys()
for cluster_id, cluster_group in groupby(titles, get_cluster):
ids = list(cluster_group)
spectra = fin[ids]
mz, c = get_pepmass(spectra)
rt = get_rt(spectra)
outputs.append(average_spectrum(
spectra, cluster_id, pepmass=mz, charge=c, rtinseconds=rt, **kwargs))
return outputs


def main():
pars = argparse.ArgumentParser()
pars.add_argument('input', help='MGF file with clustered spectra.')
pars.add_argument('output', nargs='?', help='Output file (default is stdout).')
mode = pars.add_mutually_exclusive_group(required=True)
mode.add_argument('--single', action='store_true',
help='If specified, input is interpreted as containing a single cluster.')
mode.add_argument('--encodedclusters', action='store_true',
help='Process an MGF with cluster IDs encoded in titles.')
pars.add_argument('--dyn-range', type=float, default=DYN_RANGE,
help='Dynamic range to apply to output spectra')
pars.add_argument('--min-fraction', type=float, default=MIN_FRACTION,
help='Minimum fraction of cluster spectra where MS/MS peak is present.')
pars.add_argument('--mz-accuracy', type=float, default=DIFF_THRESH,
help='Minimum distance between MS/MS peak clusters.')
pars.add_argument('--append', action='store_true',
help='Append to output file instead of replacing it.')
pars.add_argument('--rt', choices=['median', 'mass_lower_median'], default='median')
pars.add_argument('--pepmass', choices=['naive_average', 'neutral_average',
'lower_median'], default='lower_median')

args = pars.parse_args()
if args.pepmass == 'lower_median':
args.rt = 'mass_lower_median'
get_rt = {'median': median_rt, 'mass_lower_median': lower_median_mass_rt}[args.rt]
get_pepmass = {'naive_average': naive_average_mass_and_charge,
'neutral_average': neutral_average_mass_and_charge,
'lower_median': lower_median_mass}[args.pepmass]

kwargs = {'mz_accuracy': args.mz_accuracy, 'dyn_range': args.dyn_range, 'min_fraction': args.min_fraction}
mode = 'wa'[args.append]
if args.single:
spectra = list(mgf.read(args.input))
mz, c = get_pepmass(spectra)
rt = get_rt(spectra)
mgf.write([average_spectrum(spectra,
title=args.output, pepmass=mz, charge=c, rtinseconds=rt, **kwargs)],
args.output, file_mode=mode)
elif args.encodedclusters:
mgf.write(process_maracluster_mgf(args.input,
get_pepmass=get_pepmass, get_rt=get_rt, **kwargs), args.output, file_mode=mode)
else:
raise NotImplementedError('This mode is not implemented yet.')


if __name__ == '__main__':
main()
24 changes: 24 additions & 0 deletions src/benchmark.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
import numpy as np
import sys
import spectrum_utils.spectrum as sus
from pyteomics import mgf, parser
from scipy.stats import binned_statistic
import metrics as mx
import ms_io

def test_method_metric(member_file,representative_file,metric=mx.cos_dist):
# with mgf.mgf(member_file) as memb_reader, mgf.mgf(representative_file) as rep_reader:
# m_spec = memb_reader.next()
# m_specs = []
# # for r_spec in rep_reader:
pass

if __name__ == "__main__":
CLUSTER_FILE = "data/clusters_maracluster.mgf"
# REPRASENTATIVE_FILE = "data/representativespectra/most_similar_spectra__binned_dot_product.mgf"
REPRASENTATIVE_FILE = 'data/best_sp'
# clusters = ms_io.read_cluster_spectra(CLUSTER_FILE)
representatives = ms_io.read_cluster_spectra(REPRASENTATIVE_FILE, usi_present=False)
# test_method_metric(
# mx.cos_dist
# )
Loading

0 comments on commit ae78695

Please sign in to comment.