Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Added support for MSP output #99

Closed
wants to merge 15 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 12 additions & 1 deletion pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -15,10 +15,11 @@
<common-collections.version>3.2.2</common-collections.version>
<commons-math3.version>3.5</commons-math3.version>
<commons-cli.version>1.2</commons-cli.version>
<pgatk-io.version>1.0.0-SNAPSHOT</pgatk-io.version>
<pgatk-io.version>1.0.1-SNAPSHOT</pgatk-io.version>
<slf4j.version>1.7.25</slf4j.version>
<logback.version>1.2.3</logback.version>
<guava.version>29.0-jre</guava.version>
<pride-mod.version>2.1.12</pride-mod.version>
</properties>

<dependencies>
Expand Down Expand Up @@ -99,6 +100,12 @@
<version>${commons-math3.version}</version>
</dependency>

<dependency>
<groupId>uk.ac.ebi.pride.utilities</groupId>
<artifactId>pride-mod</artifactId>
<version>${pride-mod.version}</version>
</dependency>

<!-- Performance libraries -->
<dependency>
<groupId>org.ehcache</groupId>
Expand Down Expand Up @@ -220,6 +227,10 @@
<id>sonatype-snapshopt</id>
<url>https://oss.sonatype.org/content/repositories/snapshots</url>
</repository>
<repository>
<id>nuiton</id>
<url>http://maven.nuiton.org/release/</url>
</repository>
</repositories>

</project>
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,143 @@
package org.spectra.cluster.consensus;

import io.github.bigbio.pgatk.io.common.PgatkIOException;
import io.github.bigbio.pgatk.io.common.spectra.Spectrum;
import io.github.bigbio.pgatk.io.properties.IPropertyStorage;
import io.github.bigbio.pgatk.io.properties.StoredProperties;
import org.spectra.cluster.model.cluster.ICluster;
import org.spectra.cluster.normalizer.IIntensityNormalizer;
import org.spectra.cluster.normalizer.MaxPeakNormalizer;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Comparator;
import java.util.List;
import java.util.stream.Collectors;

/**
* Collection of commonly required functions for all
* consensus spectrum builders.
*/
public abstract class AbstractConsensusSpectrumBuilder implements IConsensusSpectrumBuilder {
private static final IIntensityNormalizer intensityNormalizer = new MaxPeakNormalizer();

@Override
public abstract Spectrum createConsensusSpectrum(ICluster cluster, IPropertyStorage spectrumPropertyStorage);

// TODO: Load original scores based on https://github.com/bigbio/pia/blob/master/src/main/java/de/mpc/pia/modeller/score/ScoreModelEnum.javay

/**
* Merge consensus peaks that are within the specified m/z tolerance.
*
* Similar peaks are merged using the ConsensusPeak::mergePeak function where
* the weighted average m/z and weighted average intensity is used. No incremental
* threshold is applied.
*
* Warning: This function alters the originally passed objects.
*
* @param peaks The peaks to process.
* @param mzTolerance The tolerance within which to merge peaks.
* @return A list of merged peaks.
*/
static protected List<ConsensusPeak> mergeConsensusPeaks(List<ConsensusPeak> peaks, double mzTolerance) {
// sort the peaks according to m/z
peaks.sort(Comparator.comparingDouble(ConsensusPeak::getMz));

// iterate over the peaks
List<ConsensusPeak> sortedPeaks = new ArrayList<>(peaks.size());
ConsensusPeak lastPeak = null;

for (ConsensusPeak currentPeak : peaks) {
// if within the tolerance, merge into the last peak
if (lastPeak != null && currentPeak.getMz() - lastPeak.getMz() <= mzTolerance) {
lastPeak.mergePeak(currentPeak);
} else {
// add as new peak
sortedPeaks.add(currentPeak);
lastPeak = currentPeak;
}
}

return sortedPeaks;
}

/**
* Loads all original spectra from the property storage and returns them as one
* crowded peak list.
*
* @param cluster The cluster to load the spectra for.
* @param propertyStorage The spectra's property storage.
* @param normalizeIntensity If set, the intensity is automatically normalized based on the hightest peak intensity.
* @return A list of ConsensusPeakS
* @throws PgatkIOException If the loading of properties failed.
*/
static protected List<ConsensusPeak> loadOriginalPeaks(ICluster cluster, IPropertyStorage propertyStorage,
boolean normalizeIntensity) throws PgatkIOException {
// load a max of 50 peaks per spectrum
List<ConsensusPeak> peaks = new ArrayList<>(cluster.getClusteredSpectraCount() * 50);

for (String spectrumId : cluster.getClusteredSpectraIds()) {
// load the m/z and intensity values
String mzString = propertyStorage.get(spectrumId, StoredProperties.ORIGINAL_PEAKS_MZ);
String intensityString = propertyStorage.get(spectrumId, StoredProperties.ORIGINAL_PEAKS_INTENS);

double[] mz = convertDoubleString(mzString);
double[] intens = convertDoubleString(intensityString);

// sanity check
if (mz.length != intens.length) {
throw new PgatkIOException("Number of m/z and intensity values differs for spectrum " + spectrumId);
}

// normalize the intensity
if (normalizeIntensity) {
// convert to double list
List<Double> intensList = Arrays.stream(intens).boxed().collect(Collectors.toList());

// normalize
intens = Arrays.stream(intensityNormalizer.binDoubles(intensList))
.mapToDouble(Double::new).toArray();
}

for (int i = 0; i < mz.length; i++) {
peaks.add(new ConsensusPeak(mz[i], intens[i]));
}
}

return peaks;
}

/**
* Convert a string of "," delimited double values into an
* array of doubles.
* @param valueString The string to convert.
* @return An array of doubles.
*/
static private double[] convertDoubleString(String valueString) {
return Arrays.stream(valueString.split(","))
.mapToDouble(Double::valueOf)
.toArray();
}

/**
* Get the average precursor m/z of all contained spectra
* within the cluster based on the original spectra's precursor m/z.
*
* @param cluster The cluster
* @param propertyStorage Property storage of the spectra's properties
* @return The precursor m/z as double
* @throws IllegalStateException In case the property storage is not accessible.
*/
static protected Double getAveragePrecursorMz(ICluster cluster, IPropertyStorage propertyStorage) {
return cluster.getClusteredSpectraIds().stream()
.mapToDouble(id -> {
try {
return Double.parseDouble(propertyStorage.get(id, StoredProperties.PRECURSOR_MZ));
}
catch (Exception e) {
throw new IllegalStateException(e);
}
})
.average().getAsDouble();
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,124 @@
package org.spectra.cluster.consensus;

import io.github.bigbio.pgatk.io.common.DefaultSpectrum;
import io.github.bigbio.pgatk.io.common.spectra.Spectrum;
import io.github.bigbio.pgatk.io.properties.IPropertyStorage;
import lombok.Data;
import lombok.extern.slf4j.Slf4j;
import org.spectra.cluster.model.cluster.ICluster;
import org.spectra.cluster.util.ClusteringParameters;

import java.util.*;
import java.util.stream.Collectors;

/**
* ConsensusSpectrumBuilder that is similar to the consensus
* spectrum builder used by the original spectra-cluster 1.0
* algorithm.
*
* @author jg
*/
@Slf4j
@Data
public class AverageConsensusSpectrumBuilder extends AbstractConsensusSpectrumBuilder {
private final ClusteringParameters clusteringParameters;

@Override
public Spectrum createConsensusSpectrum(ICluster cluster, IPropertyStorage spectrumPropertyStorage) {
try {
// load all peaks
List<ConsensusPeak> peaks = loadOriginalPeaks(cluster, spectrumPropertyStorage, true);

// merge the peaks
double fragmentTolerance = (clusteringParameters.getFragmentIonPrecision().equalsIgnoreCase("high") ? 0.01 : 0.5);
double mzThresholdStep = fragmentTolerance / 5;

// merge similar peaks with increasing tolerance
for (double currentTolerance = mzThresholdStep; currentTolerance < fragmentTolerance; currentTolerance += mzThresholdStep) {
peaks = mergeConsensusPeaks(peaks, currentTolerance);
}

// adapt peak intensities based on the probability that they are observed
List<ConsensusPeak> adaptedIntensityPeaks = adaptPeakIntensities(peaks, cluster.getClusteredSpectraCount());

// apply the noise filter
List<ConsensusPeak> filteredPeaks = filterNoise(adaptedIntensityPeaks, 100.0, 5, 20);

// create the peak list as Map<Double, Double>
Map<Double, Double> consensusPeaks = new HashMap<>(filteredPeaks.size());

for (ConsensusPeak p : filteredPeaks)
consensusPeaks.put(p.getMz(), p.getIntensity());

// create the spectrum object
return new DefaultSpectrum(
UUID.randomUUID().toString(), // create a new unique id
1L,
cluster.getPrecursorCharge(),
getAveragePrecursorMz(cluster, spectrumPropertyStorage),
1.0,
consensusPeaks,
2,
Collections.emptyList());
} catch (Exception e) {
log.error(("Failed to build consensus spectrum: " + e.toString()));
return null;
}
}

/**
* Adapt the peak intensities in consensusPeaks using the following formula:
* I = I * (0.95 + 0.05 * (1 + pi))^5
* where pi is the peaks probability
*/
protected static List<ConsensusPeak> adaptPeakIntensities(List<ConsensusPeak> peaks, int nSpectra) {
List<ConsensusPeak> adaptedPeaks = new ArrayList<ConsensusPeak>(peaks);

for (int i = 0; i < adaptedPeaks.size(); i++) {
ConsensusPeak peak = adaptedPeaks.get(i);
double peakProbability = (double) peak.getCount() / (double) nSpectra;
double newIntensity = peak.getIntensity() * (0.95 + 0.05 * Math.pow(1 + peakProbability, 5));

adaptedPeaks.set(i, new ConsensusPeak(peak.getMz(), newIntensity, peak.getCount()));
}

return adaptedPeaks;
}

/**
* Filters the consensus spectrum keeping only the top N peaks per M m/z
*
* @param peaks The peaks to filter
* @param mzWindowSize The window size to use to filter peaks in m/z
* @param maxPeaksPerWindow Number of peaks per m/z window
* @param minTotalPeaks Minimum number of total peaks before filtering is done
*/
protected static List<ConsensusPeak> filterNoise(List<ConsensusPeak> peaks, double mzWindowSize, int maxPeaksPerWindow, int minTotalPeaks) {
if (peaks.size() <= minTotalPeaks)
return peaks;

// under certain conditions (averaging m/z values) the order of peaks can be disrupted
peaks.sort(Comparator.comparingDouble(ConsensusPeak::getMz));

List<ConsensusPeak> filteredPeaks = new ArrayList<>(peaks);
double maxMz = peaks.stream().mapToDouble(ConsensusPeak::getMz).max().getAsDouble();

for (double minMz = 0; minMz <= maxMz; minMz += mzWindowSize) {
final double lowerWindowMz = minMz;

// extract all peaks
List<ConsensusPeak> filteredWindowPeaks = peaks.stream()
// keep peaks within the window
.filter(p -> p.getMz() >= lowerWindowMz && p.getMz() < lowerWindowMz + mzWindowSize)
// sort based on intensity
.sorted(Comparator.comparingDouble(ConsensusPeak::getIntensity).reversed())
// retain max N peaks
.limit(maxPeaksPerWindow)
.collect(Collectors.toList());

filteredPeaks.addAll(filteredWindowPeaks);
}

return filteredPeaks;
}
}
64 changes: 64 additions & 0 deletions src/main/java/org/spectra/cluster/consensus/ConsensusPeak.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
package org.spectra.cluster.consensus;

import lombok.Data;

/**
* A peak in a consensus spectrum (in Double space) that
* keeps track of its count and supports merging of peaks.
*/
@Data
public class ConsensusPeak {
private Double mz;
private Double intensity;
private int count;

/**
* Initializes a new ConsensusPeak
*/
public ConsensusPeak(Double mz, Double intensity) {
this.mz = mz;
this.intensity = intensity;
this.count = 1;
}

/**
* Initializes a new ConsensusPeak setting the count
*/
public ConsensusPeak(Double mz, Double intensity, int count) {
this.mz = mz;
this.intensity = intensity;
this.count = count;
}

public Double getMz() {
return mz;
}

public Double getIntensity() {
return intensity;
}

/**
* Merge a peak. Both, m/z and intensity are
* averaged weighted based on the respective
* counts.
* @param peak The peak to merge.
*/
public void mergePeak(ConsensusPeak peak) {
// calculate the weight first
int totalCount = this.count + peak.count;
double thisWeight = this.count / (double) totalCount;
double peakWeight = peak.count / (double) totalCount;

// calculate the new m/z and intensities
double newMz = this.mz * thisWeight + peak.mz * peakWeight;
double newIntensity = this.intensity * thisWeight + peak.intensity * peakWeight;

// update this variables
this.mz = newMz;
this.intensity = newIntensity;

// update the count
this.count = totalCount;
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
package org.spectra.cluster.consensus;

import io.github.bigbio.pgatk.io.common.spectra.Spectrum;
import io.github.bigbio.pgatk.io.properties.IPropertyStorage;
import org.spectra.cluster.model.cluster.ICluster;

/**
* Describes classes that create consensus spectra
* based on a list of clusters.
*
* @author jg
*/
public interface IConsensusSpectrumBuilder {
/**
* Creates a consensus spectrum for the defined cluster. Some consensus
* spectrum building approaches require additional information that is
* retrieved from the IPropertyStorage.
*
* @param cluster The cluster to build the consensus spectrum for.
* @param spectrumPropertyStorage The property storage holding the spectra's additional properties.
* @return The consensus spectrum.
*/
public Spectrum createConsensusSpectrum(ICluster cluster, IPropertyStorage spectrumPropertyStorage);
}
Loading