diff --git a/pom.xml b/pom.xml index 33fc55e..42c6d2f 100644 --- a/pom.xml +++ b/pom.xml @@ -15,10 +15,11 @@ 3.2.2 3.5 1.2 - 1.0.0-SNAPSHOT + 1.0.1-SNAPSHOT 1.7.25 1.2.3 29.0-jre + 2.1.12 @@ -99,6 +100,12 @@ ${commons-math3.version} + + uk.ac.ebi.pride.utilities + pride-mod + ${pride-mod.version} + + org.ehcache @@ -220,6 +227,10 @@ sonatype-snapshopt https://oss.sonatype.org/content/repositories/snapshots + + nuiton + http://maven.nuiton.org/release/ + diff --git a/src/main/java/org/spectra/cluster/consensus/AbstractConsensusSpectrumBuilder.java b/src/main/java/org/spectra/cluster/consensus/AbstractConsensusSpectrumBuilder.java new file mode 100644 index 0000000..c0d34e2 --- /dev/null +++ b/src/main/java/org/spectra/cluster/consensus/AbstractConsensusSpectrumBuilder.java @@ -0,0 +1,143 @@ +package org.spectra.cluster.consensus; + +import io.github.bigbio.pgatk.io.common.PgatkIOException; +import io.github.bigbio.pgatk.io.common.spectra.Spectrum; +import io.github.bigbio.pgatk.io.properties.IPropertyStorage; +import io.github.bigbio.pgatk.io.properties.StoredProperties; +import org.spectra.cluster.model.cluster.ICluster; +import org.spectra.cluster.normalizer.IIntensityNormalizer; +import org.spectra.cluster.normalizer.MaxPeakNormalizer; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Comparator; +import java.util.List; +import java.util.stream.Collectors; + +/** + * Collection of commonly required functions for all + * consensus spectrum builders. + */ +public abstract class AbstractConsensusSpectrumBuilder implements IConsensusSpectrumBuilder { + private static final IIntensityNormalizer intensityNormalizer = new MaxPeakNormalizer(); + + @Override + public abstract Spectrum createConsensusSpectrum(ICluster cluster, IPropertyStorage spectrumPropertyStorage); + + // TODO: Load original scores based on https://github.com/bigbio/pia/blob/master/src/main/java/de/mpc/pia/modeller/score/ScoreModelEnum.javay + + /** + * Merge consensus peaks that are within the specified m/z tolerance. + * + * Similar peaks are merged using the ConsensusPeak::mergePeak function where + * the weighted average m/z and weighted average intensity is used. No incremental + * threshold is applied. + * + * Warning: This function alters the originally passed objects. + * + * @param peaks The peaks to process. + * @param mzTolerance The tolerance within which to merge peaks. + * @return A list of merged peaks. + */ + static protected List mergeConsensusPeaks(List peaks, double mzTolerance) { + // sort the peaks according to m/z + peaks.sort(Comparator.comparingDouble(ConsensusPeak::getMz)); + + // iterate over the peaks + List sortedPeaks = new ArrayList<>(peaks.size()); + ConsensusPeak lastPeak = null; + + for (ConsensusPeak currentPeak : peaks) { + // if within the tolerance, merge into the last peak + if (lastPeak != null && currentPeak.getMz() - lastPeak.getMz() <= mzTolerance) { + lastPeak.mergePeak(currentPeak); + } else { + // add as new peak + sortedPeaks.add(currentPeak); + lastPeak = currentPeak; + } + } + + return sortedPeaks; + } + + /** + * Loads all original spectra from the property storage and returns them as one + * crowded peak list. + * + * @param cluster The cluster to load the spectra for. + * @param propertyStorage The spectra's property storage. + * @param normalizeIntensity If set, the intensity is automatically normalized based on the hightest peak intensity. + * @return A list of ConsensusPeakS + * @throws PgatkIOException If the loading of properties failed. + */ + static protected List loadOriginalPeaks(ICluster cluster, IPropertyStorage propertyStorage, + boolean normalizeIntensity) throws PgatkIOException { + // load a max of 50 peaks per spectrum + List peaks = new ArrayList<>(cluster.getClusteredSpectraCount() * 50); + + for (String spectrumId : cluster.getClusteredSpectraIds()) { + // load the m/z and intensity values + String mzString = propertyStorage.get(spectrumId, StoredProperties.ORIGINAL_PEAKS_MZ); + String intensityString = propertyStorage.get(spectrumId, StoredProperties.ORIGINAL_PEAKS_INTENS); + + double[] mz = convertDoubleString(mzString); + double[] intens = convertDoubleString(intensityString); + + // sanity check + if (mz.length != intens.length) { + throw new PgatkIOException("Number of m/z and intensity values differs for spectrum " + spectrumId); + } + + // normalize the intensity + if (normalizeIntensity) { + // convert to double list + List intensList = Arrays.stream(intens).boxed().collect(Collectors.toList()); + + // normalize + intens = Arrays.stream(intensityNormalizer.binDoubles(intensList)) + .mapToDouble(Double::new).toArray(); + } + + for (int i = 0; i < mz.length; i++) { + peaks.add(new ConsensusPeak(mz[i], intens[i])); + } + } + + return peaks; + } + + /** + * Convert a string of "," delimited double values into an + * array of doubles. + * @param valueString The string to convert. + * @return An array of doubles. + */ + static private double[] convertDoubleString(String valueString) { + return Arrays.stream(valueString.split(",")) + .mapToDouble(Double::valueOf) + .toArray(); + } + + /** + * Get the average precursor m/z of all contained spectra + * within the cluster based on the original spectra's precursor m/z. + * + * @param cluster The cluster + * @param propertyStorage Property storage of the spectra's properties + * @return The precursor m/z as double + * @throws IllegalStateException In case the property storage is not accessible. + */ + static protected Double getAveragePrecursorMz(ICluster cluster, IPropertyStorage propertyStorage) { + return cluster.getClusteredSpectraIds().stream() + .mapToDouble(id -> { + try { + return Double.parseDouble(propertyStorage.get(id, StoredProperties.PRECURSOR_MZ)); + } + catch (Exception e) { + throw new IllegalStateException(e); + } + }) + .average().getAsDouble(); + } +} diff --git a/src/main/java/org/spectra/cluster/consensus/AverageConsensusSpectrumBuilder.java b/src/main/java/org/spectra/cluster/consensus/AverageConsensusSpectrumBuilder.java new file mode 100644 index 0000000..f223a82 --- /dev/null +++ b/src/main/java/org/spectra/cluster/consensus/AverageConsensusSpectrumBuilder.java @@ -0,0 +1,124 @@ +package org.spectra.cluster.consensus; + +import io.github.bigbio.pgatk.io.common.DefaultSpectrum; +import io.github.bigbio.pgatk.io.common.spectra.Spectrum; +import io.github.bigbio.pgatk.io.properties.IPropertyStorage; +import lombok.Data; +import lombok.extern.slf4j.Slf4j; +import org.spectra.cluster.model.cluster.ICluster; +import org.spectra.cluster.util.ClusteringParameters; + +import java.util.*; +import java.util.stream.Collectors; + +/** + * ConsensusSpectrumBuilder that is similar to the consensus + * spectrum builder used by the original spectra-cluster 1.0 + * algorithm. + * + * @author jg + */ +@Slf4j +@Data +public class AverageConsensusSpectrumBuilder extends AbstractConsensusSpectrumBuilder { + private final ClusteringParameters clusteringParameters; + + @Override + public Spectrum createConsensusSpectrum(ICluster cluster, IPropertyStorage spectrumPropertyStorage) { + try { + // load all peaks + List peaks = loadOriginalPeaks(cluster, spectrumPropertyStorage, true); + + // merge the peaks + double fragmentTolerance = (clusteringParameters.getFragmentIonPrecision().equalsIgnoreCase("high") ? 0.01 : 0.5); + double mzThresholdStep = fragmentTolerance / 5; + + // merge similar peaks with increasing tolerance + for (double currentTolerance = mzThresholdStep; currentTolerance < fragmentTolerance; currentTolerance += mzThresholdStep) { + peaks = mergeConsensusPeaks(peaks, currentTolerance); + } + + // adapt peak intensities based on the probability that they are observed + List adaptedIntensityPeaks = adaptPeakIntensities(peaks, cluster.getClusteredSpectraCount()); + + // apply the noise filter + List filteredPeaks = filterNoise(adaptedIntensityPeaks, 100.0, 5, 20); + + // create the peak list as Map + Map consensusPeaks = new HashMap<>(filteredPeaks.size()); + + for (ConsensusPeak p : filteredPeaks) + consensusPeaks.put(p.getMz(), p.getIntensity()); + + // create the spectrum object + return new DefaultSpectrum( + UUID.randomUUID().toString(), // create a new unique id + 1L, + cluster.getPrecursorCharge(), + getAveragePrecursorMz(cluster, spectrumPropertyStorage), + 1.0, + consensusPeaks, + 2, + Collections.emptyList()); + } catch (Exception e) { + log.error(("Failed to build consensus spectrum: " + e.toString())); + return null; + } + } + + /** + * Adapt the peak intensities in consensusPeaks using the following formula: + * I = I * (0.95 + 0.05 * (1 + pi))^5 + * where pi is the peaks probability + */ + protected static List adaptPeakIntensities(List peaks, int nSpectra) { + List adaptedPeaks = new ArrayList(peaks); + + for (int i = 0; i < adaptedPeaks.size(); i++) { + ConsensusPeak peak = adaptedPeaks.get(i); + double peakProbability = (double) peak.getCount() / (double) nSpectra; + double newIntensity = peak.getIntensity() * (0.95 + 0.05 * Math.pow(1 + peakProbability, 5)); + + adaptedPeaks.set(i, new ConsensusPeak(peak.getMz(), newIntensity, peak.getCount())); + } + + return adaptedPeaks; + } + + /** + * Filters the consensus spectrum keeping only the top N peaks per M m/z + * + * @param peaks The peaks to filter + * @param mzWindowSize The window size to use to filter peaks in m/z + * @param maxPeaksPerWindow Number of peaks per m/z window + * @param minTotalPeaks Minimum number of total peaks before filtering is done + */ + protected static List filterNoise(List peaks, double mzWindowSize, int maxPeaksPerWindow, int minTotalPeaks) { + if (peaks.size() <= minTotalPeaks) + return peaks; + + // under certain conditions (averaging m/z values) the order of peaks can be disrupted + peaks.sort(Comparator.comparingDouble(ConsensusPeak::getMz)); + + List filteredPeaks = new ArrayList<>(peaks); + double maxMz = peaks.stream().mapToDouble(ConsensusPeak::getMz).max().getAsDouble(); + + for (double minMz = 0; minMz <= maxMz; minMz += mzWindowSize) { + final double lowerWindowMz = minMz; + + // extract all peaks + List filteredWindowPeaks = peaks.stream() + // keep peaks within the window + .filter(p -> p.getMz() >= lowerWindowMz && p.getMz() < lowerWindowMz + mzWindowSize) + // sort based on intensity + .sorted(Comparator.comparingDouble(ConsensusPeak::getIntensity).reversed()) + // retain max N peaks + .limit(maxPeaksPerWindow) + .collect(Collectors.toList()); + + filteredPeaks.addAll(filteredWindowPeaks); + } + + return filteredPeaks; + } +} diff --git a/src/main/java/org/spectra/cluster/consensus/ConsensusPeak.java b/src/main/java/org/spectra/cluster/consensus/ConsensusPeak.java new file mode 100644 index 0000000..7995c68 --- /dev/null +++ b/src/main/java/org/spectra/cluster/consensus/ConsensusPeak.java @@ -0,0 +1,64 @@ +package org.spectra.cluster.consensus; + +import lombok.Data; + +/** + * A peak in a consensus spectrum (in Double space) that + * keeps track of its count and supports merging of peaks. + */ +@Data +public class ConsensusPeak { + private Double mz; + private Double intensity; + private int count; + + /** + * Initializes a new ConsensusPeak + */ + public ConsensusPeak(Double mz, Double intensity) { + this.mz = mz; + this.intensity = intensity; + this.count = 1; + } + + /** + * Initializes a new ConsensusPeak setting the count + */ + public ConsensusPeak(Double mz, Double intensity, int count) { + this.mz = mz; + this.intensity = intensity; + this.count = count; + } + + public Double getMz() { + return mz; + } + + public Double getIntensity() { + return intensity; + } + + /** + * Merge a peak. Both, m/z and intensity are + * averaged weighted based on the respective + * counts. + * @param peak The peak to merge. + */ + public void mergePeak(ConsensusPeak peak) { + // calculate the weight first + int totalCount = this.count + peak.count; + double thisWeight = this.count / (double) totalCount; + double peakWeight = peak.count / (double) totalCount; + + // calculate the new m/z and intensities + double newMz = this.mz * thisWeight + peak.mz * peakWeight; + double newIntensity = this.intensity * thisWeight + peak.intensity * peakWeight; + + // update this variables + this.mz = newMz; + this.intensity = newIntensity; + + // update the count + this.count = totalCount; + } +} diff --git a/src/main/java/org/spectra/cluster/consensus/IConsensusSpectrumBuilder.java b/src/main/java/org/spectra/cluster/consensus/IConsensusSpectrumBuilder.java new file mode 100644 index 0000000..662aa9d --- /dev/null +++ b/src/main/java/org/spectra/cluster/consensus/IConsensusSpectrumBuilder.java @@ -0,0 +1,24 @@ +package org.spectra.cluster.consensus; + +import io.github.bigbio.pgatk.io.common.spectra.Spectrum; +import io.github.bigbio.pgatk.io.properties.IPropertyStorage; +import org.spectra.cluster.model.cluster.ICluster; + +/** + * Describes classes that create consensus spectra + * based on a list of clusters. + * + * @author jg + */ +public interface IConsensusSpectrumBuilder { + /** + * Creates a consensus spectrum for the defined cluster. Some consensus + * spectrum building approaches require additional information that is + * retrieved from the IPropertyStorage. + * + * @param cluster The cluster to build the consensus spectrum for. + * @param spectrumPropertyStorage The property storage holding the spectra's additional properties. + * @return The consensus spectrum. + */ + public Spectrum createConsensusSpectrum(ICluster cluster, IPropertyStorage spectrumPropertyStorage); +} diff --git a/src/main/java/org/spectra/cluster/io/cluster/ChronicleMapClusterStorage.java b/src/main/java/org/spectra/cluster/io/cluster/ChronicleMapClusterStorage.java index 9515e69..18fee9e 100644 --- a/src/main/java/org/spectra/cluster/io/cluster/ChronicleMapClusterStorage.java +++ b/src/main/java/org/spectra/cluster/io/cluster/ChronicleMapClusterStorage.java @@ -1,10 +1,10 @@ package org.spectra.cluster.io.cluster; +import io.github.bigbio.pgatk.io.common.PgatkIOException; +import io.github.bigbio.pgatk.io.mapcache.IMapStorage; import lombok.extern.slf4j.Slf4j; import net.openhft.chronicle.map.ChronicleMap; import net.openhft.chronicle.map.ChronicleMapBuilder; -import io.github.bigbio.pgatk.io.common.PgatkIOException; -import io.github.bigbio.pgatk.io.mapcache.IMapStorage; import org.spectra.cluster.model.cluster.ICluster; import java.io.File; @@ -120,7 +120,7 @@ public void put(String key, ICluster cluster) { } @Override - public ICluster get(String key) throws PgatkIOException { + public ICluster get(String key) { return this.clusterStorage.get(key); } diff --git a/src/main/java/org/spectra/cluster/io/cluster/SparkKeyClusterStorage.java b/src/main/java/org/spectra/cluster/io/cluster/SparkKeyClusterStorage.java index 499210f..512624d 100644 --- a/src/main/java/org/spectra/cluster/io/cluster/SparkKeyClusterStorage.java +++ b/src/main/java/org/spectra/cluster/io/cluster/SparkKeyClusterStorage.java @@ -6,6 +6,7 @@ import com.spotify.sparkey.SparkeyWriter; import io.github.bigbio.pgatk.io.common.PgatkIOException; import io.github.bigbio.pgatk.io.mapcache.IMapStorage; +import lombok.extern.slf4j.Slf4j; import org.spectra.cluster.exceptions.SpectraClusterException; import org.spectra.cluster.model.cluster.GreedySpectralCluster; import org.spectra.cluster.model.cluster.ICluster; @@ -18,6 +19,7 @@ import java.util.HashSet; import java.util.concurrent.atomic.AtomicLong; +@Slf4j public class SparkKeyClusterStorage implements IMapStorage { private final boolean deleteOnClose; @@ -96,17 +98,17 @@ private ICluster deserialize(byte[] bytes) throws PgatkIOException { } @Override - public synchronized void put(String key, ICluster cluster) throws PgatkIOException{ + public synchronized void put(String key, ICluster cluster) { try { writer.put( serialize(key), cluster.toBytes()); }catch (IOException | SpectraClusterException ex){ - throw new PgatkIOException("Error wiring the following property - " + key + " " + cluster.getId() + "error " + ex.getMessage()); + throw new IllegalStateException("Error wiring the following property - " + key + " " + cluster.getId() + "error " + ex.getMessage()); } entryCounter.incrementAndGet(); } @Override - public ICluster get(String key) throws PgatkIOException { + public ICluster get(String key) { try { byte[] byteObject = readers.get().getAsByteArray(serialize(key)); @@ -115,8 +117,9 @@ public ICluster get(String key) throws PgatkIOException { } return deserialize(byteObject); - } catch (IOException ex) { - throw new PgatkIOException("Error retrieving the value for key -- " + key ); + } catch (PgatkIOException | IOException ex) { + log.error("Error retrieving the value for key -- " + key ); + return null; } } diff --git a/src/main/java/org/spectra/cluster/io/result/IClusteringResultWriter.java b/src/main/java/org/spectra/cluster/io/result/IClusteringResultWriter.java new file mode 100644 index 0000000..18835ee --- /dev/null +++ b/src/main/java/org/spectra/cluster/io/result/IClusteringResultWriter.java @@ -0,0 +1,24 @@ +package org.spectra.cluster.io.result; + +import io.github.bigbio.pgatk.io.properties.IPropertyStorage; +import org.spectra.cluster.io.cluster.ObjectDBGreedyClusterStorage; + +import java.nio.file.Path; + +/** + * Classes that write the clustering result into + * a specific format. + * + * @author jg + */ +public interface IClusteringResultWriter { + /** + * Convert a clustering result into a specific format. + * + * @param resultFile The final result file that will be created. This file must not exist. + * @param clusterStorage The final clustering result as an ObjectDBGreedyClusterStorage. + * @param spectraPropertyStorage The spectrum property storage. + * @throws Exception + */ + public void writeResult(Path resultFile, ObjectDBGreedyClusterStorage clusterStorage, IPropertyStorage spectraPropertyStorage) throws Exception; +} diff --git a/src/main/java/org/spectra/cluster/io/result/MspWriter.java b/src/main/java/org/spectra/cluster/io/result/MspWriter.java new file mode 100644 index 0000000..af554b3 --- /dev/null +++ b/src/main/java/org/spectra/cluster/io/result/MspWriter.java @@ -0,0 +1,221 @@ +package org.spectra.cluster.io.result; + +import io.github.bigbio.pgatk.io.common.spectra.Spectrum; +import io.github.bigbio.pgatk.io.properties.IPropertyStorage; +import io.github.bigbio.pgatk.io.properties.StoredProperties; +import lombok.Data; +import lombok.extern.slf4j.Slf4j; +import org.spectra.cluster.consensus.IConsensusSpectrumBuilder; +import org.spectra.cluster.io.cluster.ObjectDBGreedyClusterStorage; +import org.spectra.cluster.model.cluster.GreedySpectralCluster; +import uk.ac.ebi.pride.utilities.pridemod.ModReader; +import uk.ac.ebi.pride.utilities.pridemod.model.PTM; +import uk.ac.ebi.pride.utilities.pridemod.model.UniModPTM; + +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.StandardOpenOption; +import java.util.*; +import java.util.function.Function; +import java.util.regex.Matcher; +import java.util.regex.Pattern; +import java.util.stream.Collectors; + +/** + * Writes the clustering result into an MSP file. + * + * @author jg + */ +@Slf4j +public class MspWriter implements IClusteringResultWriter { + private final IConsensusSpectrumBuilder consensusSpectrumBuilder; + private static final Pattern MOD_PATTERN = Pattern.compile("[+-][0-9.]+"); + + /** + * A simple helper function to describe a PTM in + * the style used in MSP files. + */ + @Data + protected class MspMod { + private final int position; + private final String name; + private final String aminoAcid; + } + + /** + * Initializes a new MspWriter + * + * @param consensusSpectrumBuilder The consensus spectrum builder to use for the + * actual spectra + */ + public MspWriter(IConsensusSpectrumBuilder consensusSpectrumBuilder) { + this.consensusSpectrumBuilder = consensusSpectrumBuilder; + } + + @Override + public void writeResult(Path resultFile, ObjectDBGreedyClusterStorage clusterStorage, IPropertyStorage spectraPropertyStorage) throws Exception { + // ensure that the file does not exist + if (Files.exists(resultFile)) + throw new Exception(String.format("Error: File '%s' already exists.", resultFile.toString())); + + log.info(String.format("Saving clustering result as MSP file '%s'", resultFile.toString())); + + // iterate over all clusters and convert them to strings + while (clusterStorage.hasNext()) { + GreedySpectralCluster cluster = (GreedySpectralCluster) clusterStorage.next(); + + String clusterMspString = convertCluster(cluster, spectraPropertyStorage); + + Files.write(resultFile, clusterMspString.getBytes(), StandardOpenOption.CREATE, StandardOpenOption.APPEND); + } + } + + private String convertCluster(GreedySpectralCluster cluster, IPropertyStorage propertyStorage) { + StringBuilder clusterString = new StringBuilder(); + + // create the consensus spectrum + Spectrum consensus = consensusSpectrumBuilder.createConsensusSpectrum(cluster, propertyStorage); + + // get the most common PSM + Map sequenceCounts = cluster.getClusteredSpectraIds().stream() + .map(id -> getProperty(propertyStorage, id, StoredProperties.SEQUENCE)) + .filter(Objects::nonNull) + .collect(Collectors.groupingBy(Function.identity(), Collectors.counting())); + + // get the number of identified spectra + long nIdentified = sequenceCounts.entrySet().stream() + .mapToLong(Map.Entry::getValue) + .sum(); + + String maxSequence = ""; + double maxRatio = 0.0; + if (sequenceCounts.size() < 1) { + maxSequence = "UNIDENTIFIED"; + } else { + maxSequence = sequenceCounts.entrySet().stream() + .max(Map.Entry.comparingByValue()) + .map(Map.Entry::getKey).orElse(null); + + // get the max ratio + maxRatio = sequenceCounts.get(maxSequence) / (double) nIdentified; + } + + // create the name based on this information + clusterString.append(String.format("Name: %s/%d\n", maxSequence, consensus.getPrecursorCharge())); + // TODO: Handle Naa for unidentified cluster + clusterString.append(String.format("Comment: Spec=Consensus Parent=%.4f Mods=%s Nreps=%d Naa=%d MaxRatio=%.3f\n", + consensus.getPrecursorMZ(), getModString(maxSequence), + cluster.getClusteredSpectraCount(), maxSequence.length(), maxRatio)); + clusterString.append(String.format("Num peaks: %d\n", consensus.getPeakList().size())); + + // add the peaks + consensus.getPeakList().entrySet().stream() + .sorted(Map.Entry.comparingByKey()) + .forEach(peak -> clusterString.append(String.format("%f %f\n", peak.getKey(), peak.getValue()))); + + return clusterString.toString(); + } + + /** + * Creates an MSP Mod: String. + * + * @param sequence The sequence + * @return The mod string + */ + protected String getModString(String sequence) { + List mods = extractModsFromSequence(sequence); + + if (mods.size() < 1) + return "0"; + + StringBuilder modString = new StringBuilder(String.valueOf(mods.size())); + + for (MspMod mod : mods) { + modString.append(String.format("(%d,%s,%s)", mod.getPosition(), mod.getAminoAcid(), mod.getName())); + } + + return modString.toString(); + } + + /** + * Extract the PTMs from the peptide sequence with the PTM deltas. + * + * @param sequence The original sequence + * @return A List of MspModS + */ + protected List extractModsFromSequence(String sequence) { + Matcher matcher = MOD_PATTERN.matcher(sequence); + List mods = new ArrayList<>(3); + int modOffset = 0; + + while (matcher.find()) { + String modString = sequence.substring(matcher.start(), matcher.end()); + String position = ""; + + if (matcher.start() == 0) { + position = "c-term"; + } else if (matcher.end() == sequence.length()) { + position = "n-term"; + } else { + position = sequence.substring(matcher.start() - 1, matcher.start()); + } + + String modName = getModNameForDelta(Double.parseDouble(modString.substring(1)), position); + + if (matcher.start() == 0) { + mods.add(new MspMod(matcher.start() - modOffset, modName, "[")); + } else if (matcher.end() == sequence.length()) { + mods.add(new MspMod(matcher.start() - modOffset, modName, "]")); + } else { + mods.add(new MspMod(matcher.start() - modOffset, modName, sequence.substring(matcher.start() - 1, matcher.start()))); + } + + // keep track of the length of the mod strings + modOffset += modString.length(); + } + // create the pattern to extract the PTMs + return mods; + } + + /** + * Return the modification name for the passed delta. + * + * @param delta The delta mass + * @param position The position of the PTM + * @return The mod string + */ + private String getModNameForDelta(double delta, String position) { + ModReader modReader = ModReader.getInstance(); + List ptms = modReader.getAnchorMassModification(delta, 0.001, ""); + + // only use UNIMOD + ptms = ptms.stream() + .filter(p -> p instanceof UniModPTM) + .sorted(Comparator.comparing(PTM::getAccession)) + .collect(Collectors.toList()); + + if (ptms.size() < 1) { + return String.valueOf(delta); + } else { + // simply return the first + return ptms.get(0).getShortName(); + } + } + + /** + * Retrieves a property from the property storage but converts any + * exception to an IllegalStateExcpetion + * + * @param storage + * @param id + * @param property + * @return + */ + private String getProperty(IPropertyStorage storage, String id, String property) { + try { + return storage.get(id, property); + } catch (Exception e) { + throw new IllegalStateException(e); + } + } +} diff --git a/src/main/java/org/spectra/cluster/io/spectra/MzSpectraReader.java b/src/main/java/org/spectra/cluster/io/spectra/MzSpectraReader.java index 09ee80b..5db649f 100644 --- a/src/main/java/org/spectra/cluster/io/spectra/MzSpectraReader.java +++ b/src/main/java/org/spectra/cluster/io/spectra/MzSpectraReader.java @@ -1,6 +1,5 @@ package org.spectra.cluster.io.spectra; -import lombok.extern.slf4j.Slf4j; import io.github.bigbio.pgatk.io.clustering.ClusteringFileReader; import io.github.bigbio.pgatk.io.common.MzIterableReader; import io.github.bigbio.pgatk.io.common.Param; @@ -11,6 +10,7 @@ import io.github.bigbio.pgatk.io.objectdb.ObjectsDB; import io.github.bigbio.pgatk.io.properties.IPropertyStorage; import io.github.bigbio.pgatk.io.properties.StoredProperties; +import lombok.extern.slf4j.Slf4j; import org.spectra.cluster.engine.IClusteringEngine; import org.spectra.cluster.exceptions.SpectraClusterException; import org.spectra.cluster.filter.binaryspectrum.HighestPeakPerBinFunction; @@ -52,6 +52,7 @@ */ @Slf4j public class MzSpectraReader { + private final static IRawPeakFunction top50PeaksFilter = new KeepNHighestRawPeaks(50); /** Pattern for validating mzML format */ private static final Pattern mzMLHeaderPattern = Pattern.compile("^[^<]*(<\\?xml [^>]*>\\s*(\\s*)*)?<(mzML)|(indexedmzML) xmlns=.*", Pattern.MULTILINE); @@ -303,9 +304,11 @@ public ClusterIteratorConverter, ICluster> readClusterIterator(IP "Clusters"); return new ClusterIteratorConverter<>(iteratorStream, tupleSpectrum -> { + // ignore clusters if (tupleSpectrum.getValue() instanceof io.github.bigbio.pgatk.io.common.cluster.ICluster) { return storeCluster(propertyStorage, tupleSpectrum); } + // create the single spectrum cluster return clusteringEngine.createSingleSpectrumCluster( peaksPerMzWindowFilter.apply(storeIBinarySpectrum(propertyStorage, tupleSpectrum))); }); @@ -319,30 +322,11 @@ private ICluster storeCluster(IPropertyStorage propertyStorage, ITuple tupleSpec (io.github.bigbio.pgatk.io.common.cluster.ICluster) tupleSpectrum.getValue(); ICluster s = transformIOClusterToCluster(spectrum, clusteringEngine); -// // save spectrum properties -// if (propertyStorage != null) { -// for (Param param: spectrum.getAdditional()) { -// propertyStorage.put(s.getUUI(), param.getName(), param.getValue()); -// -// // TODO: map the title and retention time from existing cvParams -// // current implementation might only work for MGF files. -// -// // TODO: put support for PTMs -// } -// // always store the original filename -// propertyStorage.put(s.getUUI(), StoredProperties.ORG_FILENAME, inputFile.getName()); -// -// String spectrumId = spectrum.getObjectId(); -// -// // make spectrum id PSI format compatible -// if (spectrum instanceof Ms2Query) { -// spectrumId = "index=" + spectrumId; -// } -// -// propertyStorage.put(s.getUUI(), StoredProperties.FILE_INDEX, spectrumId); -// propertyStorage.put(s.getUUI(), StoredProperties.PRECURSOR_MZ, String.valueOf(spectrum.getPrecursorMZ())); -// propertyStorage.put(s.getUUI(), StoredProperties.CHARGE, String.valueOf(spectrum.getPrecursorCharge())); -// } +// // save additional properties + if (propertyStorage != null) { + // TODO: store additional cluster properties + log.warn("Loaded cluster properties are currently not stored."); + } return s; @@ -375,6 +359,11 @@ public SpectrumIteratorConverter, IBinarySpectrum> readBinarySpec } /** + * Stores the spectrum's properties in the property storage and returns + * the binary spectrum with defined filters already applied. + * + * Currently, the precursor m/z is stored in integer space, the peaks are + * normalized and the comparison filter is being applied. * * @param propertyStorage Property Storage * @param tupleSpectrum Spectrum Tuple @@ -385,6 +374,9 @@ private IBinarySpectrum storeIBinarySpectrum(IPropertyStorage propertyStorage, I File inputFile = (File) tupleSpectrum.getKey(); Spectrum spectrum = (Spectrum) tupleSpectrum.getValue(); + // retain the top 50 peaks for later + Map top50Peaks = top50PeaksFilter.apply(spectrum.getPeakList()); + // apply the initial loading filter if (loadingFilter != null) { spectrum = loadingFilter.apply(spectrum); @@ -419,6 +411,28 @@ private IBinarySpectrum storeIBinarySpectrum(IPropertyStorage propertyStorage, I propertyStorage.put(s.getUUI(), StoredProperties.FILE_INDEX, spectrumId); propertyStorage.put(s.getUUI(), StoredProperties.PRECURSOR_MZ, String.valueOf(spectrum.getPrecursorMZ())); propertyStorage.put(s.getUUI(), StoredProperties.CHARGE, String.valueOf(spectrum.getPrecursorCharge())); + + // save the original peaklist + StringBuilder mzValues = new StringBuilder(50); + StringBuilder intensValues = new StringBuilder(50); + + boolean isFirst = true; + + for (Double mz : top50Peaks.keySet()) { + // add the delimiter + if (!isFirst) { + mzValues.append(","); + intensValues.append(","); + } else { + isFirst = false; + } + + mzValues.append(String.format("%.4f", mz)); + intensValues.append(String.format("%.4f", top50Peaks.get(mz))); + } + + propertyStorage.put(s.getUUI(), StoredProperties.ORIGINAL_PEAKS_MZ, mzValues.toString()); + propertyStorage.put(s.getUUI(), StoredProperties.ORIGINAL_PEAKS_INTENS, intensValues.toString()); } // call the listeners diff --git a/src/main/java/org/spectra/cluster/tools/CliOptions.java b/src/main/java/org/spectra/cluster/tools/CliOptions.java index 535b365..01bc777 100644 --- a/src/main/java/org/spectra/cluster/tools/CliOptions.java +++ b/src/main/java/org/spectra/cluster/tools/CliOptions.java @@ -16,11 +16,12 @@ public enum OPTIONS { CONFIG_FILE("config", "c"), OUTPUT_PATH("output.path", "o"), + OUTPUT_MSP("output.msp", "om"), PRECURSOR_TOLERANCE("precursor.tolerance", "p"), FRAGMENT_PRECISION("fragment.precision", "f"), - MAJOR_PEAK_JOBS("major.peak.jobs", "m"), + N_THREADS("n.threads", "t"), START_THRESHOLD("threshold.start", "s"), END_THRESHOLD("threshold.end", "e"), @@ -47,7 +48,8 @@ public enum OPTIONS { ADVANCED_NUMBER_PREFILTERED_PEAKS("xn.prefiltered.peaks", "pp"), ADVANCED_LEARN_CDF("x.learn.cdf", "lcdf"), ADVANCED_LOAD_CDF_FILE("x.load.cdf", "lcdf"), - ADVANCED_DISABLE_MGF_COMMENTS("disable.mgf.comments", "dmc"); + ADVANCED_DISABLE_MGF_COMMENTS("disable.mgf.comments", "dmc"), + ADVANCED_MIN_INITIAL_PEAKS("x.min.initial.shared.peaks", "mip"); private String value; private String longValue; @@ -103,6 +105,12 @@ public String getLongValue() { .create(OPTIONS.OUTPUT_PATH.getValue()); options.addOption(outputPath); + Option outputMsp = OptionBuilder + .withDescription("If set, an MSP file containing the consensus spectra is written to the same location as the outputfile.") + .withLongOpt(OPTIONS.OUTPUT_MSP.getLongValue()) + .create(OPTIONS.OUTPUT_MSP.getValue()); + options.addOption(outputMsp); + Option startThreshold = OptionBuilder .hasArg() .withDescription("(Highest) starting threshold") @@ -126,9 +134,9 @@ public String getLongValue() { Option majorPeakJobs = OptionBuilder .hasArg() - .withDescription("Number of threads to use for major peak clustering.") - .withLongOpt(OPTIONS.MAJOR_PEAK_JOBS.getLongValue()) - .create(OPTIONS.MAJOR_PEAK_JOBS.getValue()); + .withDescription("Number of threads to use during the clustering process.") + .withLongOpt(OPTIONS.N_THREADS.getLongValue()) + .create(OPTIONS.N_THREADS.getValue()); options.addOption(majorPeakJobs); Option binaryDirectory = OptionBuilder @@ -224,6 +232,12 @@ public String getLongValue() { .withLongOpt(OPTIONS.ADVANCED_DISABLE_MGF_COMMENTS.getLongValue()) .create(OPTIONS.ADVANCED_DISABLE_MGF_COMMENTS.getValue()); options.addOption(xDisableMgfComments); + + Option xMinInitialSharedPeaks = OptionBuilder + .withDescription("(Advanced option) The number two spectra must share to be compared using the probabilistic comparison method.") + .withLongOpt(OPTIONS.ADVANCED_MIN_INITIAL_PEAKS.getLongValue()) + .create(OPTIONS.ADVANCED_MIN_INITIAL_PEAKS.getValue()); + options.addOption(xMinInitialSharedPeaks); } public static Options getOptions() { diff --git a/src/main/java/org/spectra/cluster/tools/LocalParallelBinnedClusteringTool.java b/src/main/java/org/spectra/cluster/tools/LocalParallelBinnedClusteringTool.java index ab1b823..5ec8552 100644 --- a/src/main/java/org/spectra/cluster/tools/LocalParallelBinnedClusteringTool.java +++ b/src/main/java/org/spectra/cluster/tools/LocalParallelBinnedClusteringTool.java @@ -1,14 +1,12 @@ package org.spectra.cluster.tools; -import lombok.Data; -import lombok.extern.slf4j.Slf4j; import io.github.bigbio.pgatk.io.common.PgatkIOException; import io.github.bigbio.pgatk.io.mapcache.IMapStorage; import io.github.bigbio.pgatk.io.objectdb.LongObject; import io.github.bigbio.pgatk.io.objectdb.ObjectsDB; +import lombok.Data; +import lombok.extern.slf4j.Slf4j; import org.spectra.cluster.binning.IClusterBinner; -import org.spectra.cluster.cdf.INumberOfComparisonAssessor; -import org.spectra.cluster.engine.GreedyClusteringEngine; import org.spectra.cluster.engine.IClusteringEngine; import org.spectra.cluster.exceptions.SpectraClusterException; import org.spectra.cluster.io.cluster.ClusterStorageFactory; @@ -16,8 +14,7 @@ import org.spectra.cluster.model.cluster.GreedySpectralCluster; import org.spectra.cluster.model.cluster.ICluster; import org.spectra.cluster.model.cluster.IClusterProperties; -import org.spectra.cluster.predicates.IComparisonPredicate; -import org.spectra.cluster.similarity.IBinarySpectrumSimilarity; +import org.spectra.cluster.util.ClusteringParameters; import java.io.File; import java.util.Arrays; @@ -37,12 +34,8 @@ public class LocalParallelBinnedClusteringTool { private final IClusterBinner binner; private final Class clusterClass; - public void runClustering(IClusterProperties[] clusters, IMapStorage clusterStorage, File resultFile, - int precursorTolerance, float thresholdStart, float thresholdEnd, - int clusteringRounds, IBinarySpectrumSimilarity similarityMeasure, - INumberOfComparisonAssessor numberOfComparisonAssessor, - IComparisonPredicate firstRoundPredicate, - int consensusSpectrumNoiseFilterIncrement) throws SpectraClusterException { + public void runClustering(IClusterProperties[] clusters, IMapStorage clusterStorage, + ClusteringParameters clusteringParameters) throws SpectraClusterException { log.debug("------ Local parallel binned clustering -----"); try { @@ -64,8 +57,7 @@ public void runClustering(IClusterProperties[] clusters, IMapStorage c // cluster the initially binned clusters IClusterProperties[][] firstRoundResult = clusterMapped(binnedClusterIds, clusterStorage, firstRoundStorage, - precursorTolerance, thresholdStart, thresholdEnd, clusteringRounds, similarityMeasure, - numberOfComparisonAssessor, firstRoundPredicate, consensusSpectrumNoiseFilterIncrement); + clusteringParameters); // close the initial storage clusterStorage.close(); @@ -87,15 +79,14 @@ public void runClustering(IClusterProperties[] clusters, IMapStorage c // run the clustering again on the re-binned ids IClusterProperties[][] secondRoundResult = clusterMapped(rebinnedClusterIds, firstRoundStorage, secondRoundStorage, - precursorTolerance, thresholdStart, thresholdEnd, clusteringRounds, similarityMeasure, - numberOfComparisonAssessor, firstRoundPredicate, consensusSpectrumNoiseFilterIncrement); + clusteringParameters); // close the first round storage - thereby deleting the temporary data firstRoundStorage.close(); // write the final clusters to file ObjectDBGreedyClusterStorage writer = new ObjectDBGreedyClusterStorage( - new ObjectsDB(resultFile.getAbsolutePath(), true)); + new ObjectsDB(clusteringParameters.getOutputFile().getAbsolutePath(), true)); // write all clusters to storage Arrays.stream(secondRoundResult).flatMap(Arrays::stream) @@ -149,20 +140,12 @@ private synchronized void writeClusters(IMapStorage storage, ICluster[ } private IClusterProperties[][] clusterMapped(String[][] binnedClusterIds, IMapStorage clusterStorage, - IMapStorage resultStorage, int precursorTolerance, - float thresholdStart, float thresholdEnd, - int clusteringRounds, IBinarySpectrumSimilarity similarityMeasure, - INumberOfComparisonAssessor numberOfComparisonAssessor, - IComparisonPredicate firstRoundPredicate, - int consensusSpectrumNoiseFilterIncrement) throws Exception { + IMapStorage resultStorage, ClusteringParameters clusteringParameters) throws Exception { // start the clustering ForkJoinPool clusteringPool = new ForkJoinPool(parallelJobs, ForkJoinPool.defaultForkJoinWorkerThreadFactory, null, true); return clusteringPool.submit(() -> Arrays.stream(binnedClusterIds).parallel().map((String[] clusterIds) -> { try { - // TODO: replace with factory for clustering engine - IClusteringEngine engine = new GreedyClusteringEngine(precursorTolerance, thresholdStart, thresholdEnd, - clusteringRounds, similarityMeasure, numberOfComparisonAssessor, firstRoundPredicate, - consensusSpectrumNoiseFilterIncrement); + IClusteringEngine engine = clusteringParameters.createGreedyClusteringEngine(); // load the clusters - parallel reads are not a problem ICluster[] loadedClusters = new ICluster[clusterIds.length]; diff --git a/src/main/java/org/spectra/cluster/tools/SpectraClusterTool.java b/src/main/java/org/spectra/cluster/tools/SpectraClusterTool.java index 66e1d44..ef13668 100644 --- a/src/main/java/org/spectra/cluster/tools/SpectraClusterTool.java +++ b/src/main/java/org/spectra/cluster/tools/SpectraClusterTool.java @@ -1,37 +1,44 @@ package org.spectra.cluster.tools; +import io.github.bigbio.pgatk.io.mapcache.IMapStorage; +import io.github.bigbio.pgatk.io.objectdb.ObjectsDB; +import io.github.bigbio.pgatk.io.properties.IPropertyStorage; +import io.github.bigbio.pgatk.io.properties.PropertyStorageFactory; import lombok.extern.slf4j.Slf4j; import org.apache.commons.cli.CommandLine; import org.apache.commons.cli.CommandLineParser; import org.apache.commons.cli.HelpFormatter; import org.apache.commons.cli.PosixParser; -import io.github.bigbio.pgatk.io.objectdb.ObjectsDB; -import io.github.bigbio.pgatk.io.properties.IPropertyStorage; -import io.github.bigbio.pgatk.io.properties.PropertyStorageFactory; -import org.spectra.cluster.cdf.SpectraPerBinNumberComparisonAssessor; +import org.spectra.cluster.binning.IClusterBinner; +import org.spectra.cluster.binning.SimilarSizedClusterBinner; +import org.spectra.cluster.consensus.AverageConsensusSpectrumBuilder; import org.spectra.cluster.engine.GreedyClusteringEngine; -import org.spectra.cluster.engine.IClusteringEngine; import org.spectra.cluster.exceptions.MissingParameterException; import org.spectra.cluster.filter.binaryspectrum.HighestPeakPerBinFunction; -import org.spectra.cluster.filter.rawpeaks.*; +import org.spectra.cluster.io.cluster.ClusterStorageFactory; import org.spectra.cluster.io.cluster.ObjectDBGreedyClusterStorage; +import org.spectra.cluster.io.result.IClusteringResultWriter; +import org.spectra.cluster.io.result.MspWriter; import org.spectra.cluster.io.spectra.MzSpectraReader; +import org.spectra.cluster.model.cluster.GreedySpectralCluster; import org.spectra.cluster.model.cluster.ICluster; -import org.spectra.cluster.normalizer.*; -import org.spectra.cluster.predicates.IComparisonPredicate; -import org.spectra.cluster.predicates.SameChargePredicate; -import org.spectra.cluster.predicates.ShareNComparisonPeaksPredicate; -import org.spectra.cluster.similarity.CombinedFisherIntensityTest; +import org.spectra.cluster.model.cluster.IClusterProperties; +import org.spectra.cluster.normalizer.BasicIntegerNormalizer; +import org.spectra.cluster.normalizer.MaxPeakNormalizer; import org.spectra.cluster.tools.utils.IProgressListener; import org.spectra.cluster.tools.utils.ProgressUpdate; -import org.spectra.cluster.util.DefaultParameters; +import org.spectra.cluster.util.ClusteringParameters; import java.io.File; +import java.io.IOException; import java.nio.file.Path; import java.nio.file.Paths; import java.time.Duration; import java.time.LocalDateTime; -import java.util.*; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Iterator; +import java.util.List; /** * This code is licensed under the Apache License, Version 2.0 (the @@ -47,12 +54,6 @@ @Slf4j public class SpectraClusterTool implements IProgressListener { - public static final boolean DELETE_TEMPORARY_CLUSTERING_RESULTS = true; - - public DefaultParameters defaultParameters = new DefaultParameters(); - - private boolean verbose; - public static void main(String[] args) { SpectraClusterTool instance = new SpectraClusterTool(); instance.run(args); @@ -62,6 +63,7 @@ private void run(String[] args) { CommandLineParser parser = new PosixParser(); try { + // parse the command line CommandLine commandLine = parser.parse(CliOptions.getOptions(), args); // HELP @@ -70,196 +72,213 @@ private void run(String[] args) { return; } - // File input - String[] peakFiles = commandLine.getArgs(); - - if (peakFiles.length < 1) { - printUsage(); - throw new MissingParameterException("Missing input files"); - } - - // RESULT FILE PATH - if (!commandLine.hasOption(CliOptions.OPTIONS.OUTPUT_PATH.getValue())) - throw new MissingParameterException("Missing required option " + - CliOptions.OPTIONS.OUTPUT_PATH.getValue()); + // create the object to hold the parameters + ClusteringParameters clusteringParameters = new ClusteringParameters(); - File finalResultFile = new File(commandLine.getOptionValue(CliOptions.OPTIONS.OUTPUT_PATH.getValue())); + // if a config file is set, merge the config file parameters first + if (commandLine.hasOption(CliOptions.OPTIONS.CONFIG_FILE.getValue())) { + String configFilePath = commandLine.getOptionValue(CliOptions.OPTIONS.CONFIG_FILE.getValue()); - if(commandLine.hasOption(CliOptions.OPTIONS.CONFIG_FILE.getValue())) - defaultParameters.mergeParameters(commandLine.getOptionValue(CliOptions.OPTIONS.CONFIG_FILE.getValue())); - - if (finalResultFile.exists()) - throw new Exception("Result file " + finalResultFile + " already exists"); - - // NUMBER OF ROUNDS - int rounds = defaultParameters.getClusterRounds(); - if (commandLine.hasOption(CliOptions.OPTIONS.ROUNDS.getValue())) - rounds = Integer.parseInt(commandLine.getOptionValue(CliOptions.OPTIONS.ROUNDS.getValue())); - - // START THRESHOLD - float startThreshold = defaultParameters.getThresholdStart(); - if (commandLine.hasOption(CliOptions.OPTIONS.START_THRESHOLD.getValue())) - startThreshold = Float.parseFloat(commandLine.getOptionValue(CliOptions.OPTIONS.START_THRESHOLD.getValue())); - - // END THRESHOLD - float endThreshold = defaultParameters.getThresholdEnd(); - if (commandLine.hasOption(CliOptions.OPTIONS.END_THRESHOLD.getValue())) - endThreshold = Float.parseFloat(commandLine.getOptionValue(CliOptions.OPTIONS.END_THRESHOLD.getValue())); - - // PRECURSOR TOLERANCE - // TODO: Add support for precursor tolerance - double precursorTolerance = defaultParameters.getPrecursorIonTolerance(); - if (commandLine.hasOption(CliOptions.OPTIONS.PRECURSOR_TOLERANCE.getValue())) { - precursorTolerance = Float.parseFloat(commandLine.getOptionValue(CliOptions.OPTIONS.PRECURSOR_TOLERANCE.getValue())); - } - // convert the precursor tolerance to int space - int binnedPrecursorTolerance = (int) Math.round(precursorTolerance * (double) BasicIntegerNormalizer.MZ_CONSTANT); - - // FRAGMENT TOLERANCE - // TODO: Add support for fragment tolerance - String fragmentPrecision = defaultParameters.getFragmentIonPrecision(); - if (commandLine.hasOption(CliOptions.OPTIONS.FRAGMENT_PRECISION.getValue())) { - fragmentPrecision = commandLine.getOptionValue(CliOptions.OPTIONS.FRAGMENT_PRECISION.getValue()); - } - if (!"high".equalsIgnoreCase(fragmentPrecision) && !"low".equalsIgnoreCase(fragmentPrecision)) { - throw new Exception("Invalid fragment precision set. Allowed values are 'low' and 'high'"); - } - - // IGNORE CHARGE STATE - boolean ignoreCharge = defaultParameters.isIgnoreCharge(); - if (commandLine.hasOption(CliOptions.OPTIONS.IGNORE_CHARGE.getValue())) { - ignoreCharge = true; - } - - String tempFolder = defaultParameters.getBinaryDirectory(); - if(commandLine.hasOption(CliOptions.OPTIONS.TEMP_DIRECTORY.getValue())){ - tempFolder = commandLine.getOptionValue(CliOptions.OPTIONS.TEMP_DIRECTORY.getValue()); - }else{ - tempFolder = createTempFolderPath(finalResultFile, tempFolder); + clusteringParameters.mergeParameters(configFilePath); + log.info("Configuration loaded from " + configFilePath); } - /** Advanced options */ - // MIN NUMBER OF COMPARISONS - int minNumberComparisons = defaultParameters.getMinNumberOfComparisons(); - if (commandLine.hasOption(CliOptions.OPTIONS.ADVANCED_MIN_NUMBER_COMPARISONS.getValue())) { - minNumberComparisons = Integer.parseInt(commandLine.getOptionValue(CliOptions.OPTIONS.ADVANCED_MIN_NUMBER_COMPARISONS.getValue())); - } + // merge all other command line parameters overwriting the config file values + clusteringParameters.mergeCommandLineArgs(commandLine); - SpectraPerBinNumberComparisonAssessor numberOfComparisonAssessor = new SpectraPerBinNumberComparisonAssessor( - binnedPrecursorTolerance * 2, minNumberComparisons, BasicIntegerNormalizer.MZ_CONSTANT * 2500 - ); + // make sure that the set parameters are valid + checkParameterValidity(clusteringParameters); - int nInitiallySharedPeaks = defaultParameters.getNInitiallySharedPeaks(); + // all remaining parameters are treated as input files + String[] peakFiles = commandLine.getArgs(); - /** Perform clustering **/ - LocalDateTime startTime = LocalDateTime.now(); + if (peakFiles.length < 1) { + printUsage(); + throw new MissingParameterException("Missing input files"); + } - // set an approximate fragment tolerance for the filters - double fragmentTolerance = (fragmentPrecision.equalsIgnoreCase("high")) ? 0.01 : 0.5; + // create the temporary directory in case it wasn't set + if (clusteringParameters.getBinaryDirectory() == null) + clusteringParameters.setBinaryDirectory(createTempFolderPath(clusteringParameters.getOutputFile(), "binary-clustering-files")); - // set the window size for the noise filter - int windowSizeNoiseFilter = (fragmentPrecision.equalsIgnoreCase("high")) ? 3000 : 100; + // start the clustering + runClustering(peakFiles, clusteringParameters); - // set the appropriate m/z binner - IMzBinner mzBinner = (fragmentPrecision.equalsIgnoreCase("high")) ? - new HighResolutionMzBinner() : new TideBinner(); + // exit nicely + System.exit(0); + } catch (MissingParameterException e) { + System.out.println("Error: " + e.getMessage() + "\n\n"); + printUsage(); - IRawSpectrumFunction loadingFilter = new RemoveImpossiblyHighPeaksFunction() - .specAndThen(new RemovePrecursorPeaksFunction(fragmentTolerance)) - .specAndThen(new RawPeaksWrapperFunction(new KeepNHighestRawPeaks(defaultParameters - .getNumberHigherPeaks()))); + System.exit(1); + } catch (Exception e) { + e.printStackTrace(); + System.out.println("Error: " + e.getMessage()); - IPropertyStorage localStorage = PropertyStorageFactory - .buildDynamicLevelDBPropertyStorage(new File(tempFolder)); + System.exit(1); + } + } - File[] inputFiles = Arrays.stream(peakFiles) - .map(File::new) - .toArray(File[]::new); + /** + * Run the clustering process using the defined parameters. + * + * @param clusteringParameters + * @throws Exception + */ + private void runClustering(String[] peakFiles, ClusteringParameters clusteringParameters) throws Exception { + // create the storage to load the spectra + IPropertyStorage propertyStorage = PropertyStorageFactory + .buildDynamicLevelDBPropertyStorage(new File(clusteringParameters.getBinaryDirectory())); - // create the comparison predicate for the first round - IComparisonPredicate firstRoundPredicate = new ShareNComparisonPeaksPredicate( - nInitiallySharedPeaks); + File clusterStorageDir = createUniqueDirectory(new File(clusteringParameters.getBinaryDirectory(), "loaded-clusters")); + IMapStorage clusterStorage = ClusterStorageFactory.buildTemporaryDynamicStorage(clusterStorageDir, GreedySpectralCluster.class); - if (!ignoreCharge) { - firstRoundPredicate = new SameChargePredicate().and(firstRoundPredicate); - } + // load the spectra + IClusterProperties[] loadedClusters = loadInputFiles(peakFiles, clusteringParameters, propertyStorage, clusterStorage); - IClusteringEngine engine = new GreedyClusteringEngine( - binnedPrecursorTolerance, - startThreshold, endThreshold, rounds, new CombinedFisherIntensityTest(), - numberOfComparisonAssessor, firstRoundPredicate, - windowSizeNoiseFilter); + // run the clustering in parallel + File clusteringTmpDir = createUniqueDirectory(new File(clusteringParameters.getBinaryDirectory(), "clustering-files")); - MzSpectraReader reader = new MzSpectraReader( mzBinner, new MaxPeakNormalizer(), - new BasicIntegerNormalizer(), new HighestPeakPerBinFunction(), loadingFilter, - GreedyClusteringEngine.COMPARISON_FILTER, engine, inputFiles); + IClusterBinner clusterBinner = new SimilarSizedClusterBinner( + 2 * clusteringParameters.getIntPrecursorTolerance(), + 1_000, !clusteringParameters.isIgnoreCharge()); - // set the comparison assessor as listener to count the spectra per bin - reader.addSpectrumListener(numberOfComparisonAssessor); + LocalParallelBinnedClusteringTool clusteringTool = new LocalParallelBinnedClusteringTool( + clusteringParameters.getNThreads(), clusteringTmpDir, clusterBinner, GreedySpectralCluster.class); - Iterator iterator = reader.readClusterIterator(localStorage); + LocalDateTime startTime = LocalDateTime.now(); + log.debug("Starting clustering..."); - /** - * Todo: We need to review the current implementation because it loads in memory - * all the clusters. If we want to the Ehcache we will need to use Maps and - * all the methods - **/ - List spectra = new ArrayList<>(1_000); + clusteringTool.runClustering(loadedClusters, clusterStorage, clusteringParameters); - log.debug(String.format("Loading spectra from %d file(s)...", inputFiles.length)); + // some nice output + LocalDateTime clusteringCompleteTime = LocalDateTime.now(); + log.debug(String.format("Clustering completed in in %d seconds", + Duration.between(startTime, clusteringCompleteTime).getSeconds())); - int count = 0; - while (iterator.hasNext()) { - ICluster cluster = iterator.next(); - spectra.add(cluster); - count++; - } + log.info("Result file written to " + clusteringParameters.getOutputFile()); - LocalDateTime loadingCompleteTime = LocalDateTime.now(); - log.debug(String.format("Loaded %d spectra in %d seconds", count, - Duration.between(startTime, loadingCompleteTime).getSeconds())); + // create the MSP file + if (clusteringParameters.isOutputMsp()) { + Path mspFile = Paths.get(clusteringParameters.getOutputFile().toString() + ".msp"); + IClusteringResultWriter writer = new MspWriter(new AverageConsensusSpectrumBuilder(clusteringParameters)); - // sort according to m/z - spectra.sort(Comparator.comparingInt(ICluster::getPrecursorMz)); + // open the result file again + ObjectDBGreedyClusterStorage resultReader = new ObjectDBGreedyClusterStorage( + new ObjectsDB(clusteringParameters.getOutputFile().getAbsolutePath(), false)); - Path thisResult = Paths.get(finalResultFile.getAbsolutePath()); + log.info("Saving clustering results as MSP file at " + mspFile.toString()); + writer.writeResult(mspFile, resultReader, propertyStorage); + } - log.debug("Clustering files..."); - ICluster[] clusters = engine.clusterSpectra(spectra.toArray(new ICluster[spectra.size()])); + // TODO: Create .clustering output file - LocalDateTime clusteringCompleteTime = LocalDateTime.now(); - log.debug(String.format("Clustering completed in %d seconds", Duration.between(loadingCompleteTime, clusteringCompleteTime).getSeconds())); + // close the storage + clusterStorage.close(); + propertyStorage.close(); + } - ObjectDBGreedyClusterStorage writer = new ObjectDBGreedyClusterStorage( - new ObjectsDB(finalResultFile.getAbsolutePath(), true)); - writer.addGreedySpectralClusters(clusters); - writer.writeDBMode(); - writer.flush(); + /** + * Loads all spectra from the defined peak list files as IClusters. The clusters are stored in the defined clusterStorage. + * The cluster's properties are stored in the propertyStorage. The cluster's basic properties are returned as an array. + * + * @param peakFiles Files to load. + * @param clusteringParameters Clustering parameters to use. + * @param propertyStorage The property storage. + * @param clusterStorage Storage to store the clusters in. + * @return The IClusterProperties of the loaded clusters. + */ + private IClusterProperties[] loadInputFiles(String[] peakFiles, ClusteringParameters clusteringParameters, + IPropertyStorage propertyStorage, IMapStorage clusterStorage) + throws Exception { + log.debug(String.format("Loading spectra from %d input files", peakFiles.length)); + LocalDateTime startTime = LocalDateTime.now(); + + // convert the input files to file objects + File[] inputFiles = Arrays.stream(peakFiles) + .map(File::new) + .toArray(File[]::new); + + // load the spectra using an MzSpectraReader + MzSpectraReader reader = new MzSpectraReader( clusteringParameters.createMzBinner(), new MaxPeakNormalizer(), + new BasicIntegerNormalizer(), new HighestPeakPerBinFunction(), clusteringParameters.createLoadingFilter(), + GreedyClusteringEngine.COMPARISON_FILTER, clusteringParameters.createGreedyClusteringEngine(), inputFiles); + + // create the iterator to load the clusters + Iterator iterator = reader.readClusterIterator(propertyStorage); + + List loadedClusters = new ArrayList<>(1000); + + while (iterator.hasNext()) { + ICluster cluster = iterator.next(); + + // all clusters are primarily stored in the cluster storage + clusterStorage.put(cluster.getId(), cluster); + + // only retain the basic properties + loadedClusters.add(cluster.getProperties()); + } - log.debug(String.format("Results written in %d seconds", - Duration.between(clusteringCompleteTime, LocalDateTime.now()).getSeconds())); - System.out.println("Results written to " + thisResult.toString()); + // some nice output + LocalDateTime loadingCompleteTime = LocalDateTime.now(); + log.debug(String.format("Loaded %d spectra in %d seconds", loadedClusters.size(), + Duration.between(startTime, loadingCompleteTime).getSeconds())); - log.debug(String.format("Process completed in %d seconds", Duration.between(startTime, - LocalDateTime.now()).getSeconds())); + return loadedClusters.toArray(new IClusterProperties[0]); + } - // Close the property storage and delete folders and property files. - localStorage.close(); + /** + * Create a new unique directory name. In case the target name already exists, a new directory with the name + * {target name}-{N} is created. + * + * @param targetDirectoryName The target directory name / path. + * @return The File object representing the finally created unique directory. + * @throws IOException Thrown if creating the directory failed. + */ + private File createUniqueDirectory(File targetDirectoryName) throws IOException { + int iteration = 1; + String orgPath = targetDirectoryName.getAbsolutePath(); + + while (targetDirectoryName.exists()) { + targetDirectoryName = new File(orgPath + "-" + String.valueOf(iteration)); + iteration++; + } - System.exit(0); - } catch (MissingParameterException e) { - System.out.println("Error: " + e.getMessage() + "\n\n"); - printUsage(); + // now that the directory does not yet exist, create it + if (!targetDirectoryName.mkdir()) { + throw new IOException("Failed to create directory " + targetDirectoryName.getAbsolutePath()); + } - System.exit(1); - } catch (Exception e) { - e.printStackTrace(); - System.out.println("Error: " + e.getMessage()); + return targetDirectoryName; + } - System.exit(1); + /** + * Ensures that the set user parameters are valid. In case they are not, an Exception is thrown. + * + * @param clusteringParameters The set parameters + * @throws Exception In case something is invalid. + * @throws MissingParameterException + */ + private void checkParameterValidity(ClusteringParameters clusteringParameters) throws Exception, MissingParameterException { + // RESULT FILE PATH + if (clusteringParameters.getOutputFile() == null) + throw new MissingParameterException("Missing required option " + + CliOptions.OPTIONS.OUTPUT_PATH.getValue()); + + // ensure that the output file does not exist + if (clusteringParameters.getOutputFile().exists()) + throw new Exception("Result file " + clusteringParameters.getOutputFile().getAbsolutePath() + " already exists"); + + // check whether the fragment tolerance is valid + if (!"high".equalsIgnoreCase(clusteringParameters.getFragmentIonPrecision()) && + !"low".equalsIgnoreCase(clusteringParameters.getFragmentIonPrecision())) { + throw new Exception("Invalid fragment precision set. Allowed values are 'low' and 'high'"); } } + /** + + TODO: update print function private void printSettings(File finalResultFile, int nMajorPeakJobs, float startThreshold, float endThreshold, int rounds, boolean keepBinaryFiles, File binaryTmpDirectory, String[] peaklistFilenames, boolean reUseBinaryFiles, boolean fastMode, @@ -278,7 +297,7 @@ private void printSettings(File finalResultFile, int nMajorPeakJobs, float start System.out.println("Using fast mode: " + (fastMode ? "yes" : "no")); System.out.println("\nOther settings:"); - System.out.println("Precursor tolerance: " + defaultParameters.getPrecursorIonTolerance()); + System.out.println("Precursor tolerance: " + clusteringParameters.getPrecursorIonTolerance()); System.out.println("Fragment ion precision: " + fragmentPrecision); // used filters @@ -296,7 +315,7 @@ private void printSettings(File finalResultFile, int nMajorPeakJobs, float start // System.out.println("Minimum number of comparisons: " + Defaults.getMinNumberComparisons()); System.out.println(); - } + }*/ private void printUsage() { HelpFormatter formatter = new HelpFormatter(); diff --git a/src/main/java/org/spectra/cluster/util/ClusteringParameters.java b/src/main/java/org/spectra/cluster/util/ClusteringParameters.java new file mode 100644 index 0000000..5d7ce6a --- /dev/null +++ b/src/main/java/org/spectra/cluster/util/ClusteringParameters.java @@ -0,0 +1,246 @@ +package org.spectra.cluster.util; + +import lombok.Data; +import org.apache.commons.cli.CommandLine; +import org.spectra.cluster.cdf.SpectraPerBinNumberComparisonAssessor; +import org.spectra.cluster.engine.GreedyClusteringEngine; +import org.spectra.cluster.filter.rawpeaks.*; +import org.spectra.cluster.model.cluster.ICluster; +import org.spectra.cluster.normalizer.BasicIntegerNormalizer; +import org.spectra.cluster.normalizer.HighResolutionMzBinner; +import org.spectra.cluster.normalizer.IMzBinner; +import org.spectra.cluster.normalizer.TideBinner; +import org.spectra.cluster.predicates.IComparisonPredicate; +import org.spectra.cluster.predicates.SameChargePredicate; +import org.spectra.cluster.predicates.ShareNComparisonPeaksPredicate; +import org.spectra.cluster.similarity.CombinedFisherIntensityTest; +import org.spectra.cluster.tools.CliOptions; + +import java.io.File; +import java.io.FileInputStream; +import java.io.IOException; +import java.io.InputStream; +import java.net.URISyntaxException; +import java.util.Properties; + +/** + * This code is licensed under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + *

+ * http://www.apache.org/licenses/LICENSE-2.0 + *

+ * ==Overview== + * + * @author ypriverol on 18/10/2018. + */ +@Data +public class ClusteringParameters { + + private String binaryDirectory; + private boolean reuseBinary; + private boolean fastMode; + private Integer clusterRounds; + private boolean filterReportPeaks; + private Integer numberHigherPeaks; + private Double precursorIonTolerance; + private String fragmentIonPrecision; + private boolean ignoreCharge; + + private Float thresholdStart; + private Float thresholdEnd; + private int nInitiallySharedPeaks; + private int minNumberOfComparisons; + + private File outputFile; + private boolean outputMsp; + + private int nThreads; + + + public ClusteringParameters(){ + + try { + Properties properties = readProperties(); + setProperties(properties); + + } catch (URISyntaxException e) { + e.printStackTrace(); + } + + } + + private void setProperties(Properties properties) { + if(properties.containsKey("precursor.tolerance")) + this.precursorIonTolerance = Double.parseDouble(properties.getProperty("precursor.tolerance").trim()); + if(properties.containsKey("fragment.precision")) + this.fragmentIonPrecision = properties.getProperty("fragment.precision"); + if(properties.containsKey("n.threads")) + this.nThreads = Integer.parseInt(properties.getProperty("n.threads")); + if(properties.containsKey("threshold.start")) + this.thresholdStart = Float.parseFloat(properties.getProperty("threshold.start")); + if(properties.containsKey("threshold.end")) + this.thresholdEnd = Float.parseFloat(properties.getProperty("threshold.end")); + if(properties.containsKey("number.higher.peaks")) + this.numberHigherPeaks = Integer.parseInt(properties.getProperty("number.higher.peaks")); + if(properties.containsKey("cluster.rounds")) + this.clusterRounds = Integer.parseInt(properties.getProperty("cluster.rounds")); + if(properties.containsKey("binary.temp.directory")) + this.binaryDirectory = properties.getProperty("binary.temp.directory"); + if(properties.containsKey("reuse.binary.files")) + this.reuseBinary = Boolean.parseBoolean(properties.getProperty("reuse.binary.files")); + if(properties.containsKey("ignore.charge")) + this.ignoreCharge = Boolean.parseBoolean(properties.getProperty("ignore.charge")); + if(properties.containsKey("cluster.fast.mode")) + this.fastMode = Boolean.parseBoolean(properties.getProperty("cluster.fast.mode")); + if(properties.containsKey("filters.remove.reporter.peaks")) + this.filterReportPeaks = Boolean.parseBoolean(properties.getProperty("filters.remove.reporter.peaks")); + if(properties.containsKey("initially.shared.peaks")) + this.nInitiallySharedPeaks = Integer.parseInt(properties.getProperty("initially.shared.peaks")); + if(properties.containsKey("x.min.comparisons")) + this.minNumberOfComparisons = Integer.parseInt(properties.getProperty("x.min.comparisons")); + if(properties.contains("output.msp")) + this.outputMsp = Boolean.parseBoolean(properties.getProperty("output.msp")); + } + + public Properties readProperties() throws URISyntaxException { + Properties properties = new Properties(); + InputStream output; + + try { + output = getClass().getClassLoader().getResourceAsStream("application.properties"); + properties.load(output); + } catch (IOException e) { + e.printStackTrace(); + } + return properties; + } + + /** + * Adapt the default parameters based on the set command line arguments. + * + * @param commandLine A command line object. + */ + public void mergeCommandLineArgs(CommandLine commandLine) { + if (commandLine.hasOption(CliOptions.OPTIONS.OUTPUT_PATH.getValue())) + outputFile = new File(commandLine.getOptionValue(CliOptions.OPTIONS.OUTPUT_PATH.getValue())); + + // NUMBER OF ROUNDS + if (commandLine.hasOption(CliOptions.OPTIONS.ROUNDS.getValue())) + clusterRounds = Integer.parseInt(commandLine.getOptionValue(CliOptions.OPTIONS.ROUNDS.getValue())); + + if (commandLine.hasOption(CliOptions.OPTIONS.START_THRESHOLD.getValue())) + thresholdStart = Float.parseFloat(commandLine.getOptionValue(CliOptions.OPTIONS.START_THRESHOLD.getValue())); + + if (commandLine.hasOption(CliOptions.OPTIONS.END_THRESHOLD.getValue())) + thresholdEnd = Float.parseFloat(commandLine.getOptionValue(CliOptions.OPTIONS.END_THRESHOLD.getValue())); + + if (commandLine.hasOption(CliOptions.OPTIONS.PRECURSOR_TOLERANCE.getValue())) + precursorIonTolerance = Double.parseDouble(commandLine.getOptionValue(CliOptions.OPTIONS.PRECURSOR_TOLERANCE.getValue())); + + if (commandLine.hasOption(CliOptions.OPTIONS.FRAGMENT_PRECISION.getValue())) + fragmentIonPrecision = commandLine.getOptionValue(CliOptions.OPTIONS.FRAGMENT_PRECISION.getValue()); + + ignoreCharge = commandLine.hasOption(CliOptions.OPTIONS.IGNORE_CHARGE.getValue()); + + if (commandLine.hasOption(CliOptions.OPTIONS.TEMP_DIRECTORY.getValue())) + binaryDirectory = commandLine.getOptionValue(CliOptions.OPTIONS.TEMP_DIRECTORY.getValue()); + + if (commandLine.hasOption(CliOptions.OPTIONS.ADVANCED_MIN_NUMBER_COMPARISONS.getValue())) + minNumberOfComparisons = Integer.parseInt(commandLine.getOptionValue(CliOptions.OPTIONS.ADVANCED_MIN_NUMBER_COMPARISONS.getValue())); + + if (commandLine.hasOption(CliOptions.OPTIONS.ADVANCED_MIN_INITIAL_PEAKS.getValue())) + nInitiallySharedPeaks = Integer.parseInt(commandLine.getOptionValue(CliOptions.OPTIONS.ADVANCED_MIN_INITIAL_PEAKS.getValue())); + + if (commandLine.hasOption(CliOptions.OPTIONS.N_THREADS.getValue())) + nThreads = Integer.parseInt(commandLine.getOptionValue(CliOptions.OPTIONS.N_THREADS.getValue())); + + outputMsp = commandLine.hasOption(CliOptions.OPTIONS.OUTPUT_MSP.getValue()); + } + + public void mergeParameters(String configFile) throws IOException { + File propertiesFactoryBean = new File(configFile); + Properties newProperties = new Properties(); + InputStream output = new FileInputStream(propertiesFactoryBean); + newProperties.load(output); + setProperties(newProperties); + } + + // ---------- + // A collection of functions to return clustering objects matching the + // parameters + + /** + * Get the precursor tolerance as an integer. + * + * @return The precursor tolerance + */ + public int getIntPrecursorTolerance() { + return (int) Math.round(precursorIonTolerance * (double) BasicIntegerNormalizer.MZ_CONSTANT); + } + + /** + * Creates a new GreedyClusteringEngine based on the currently set parameters. + * + * Note: The validity of these parameters is not checked in this function but + * must be checked before calling it. + * + * @return A new instance of a GreedyClusteringEngine + * @throws Exception + */ + public GreedyClusteringEngine createGreedyClusteringEngine() throws Exception { + int precursorTolerance = getIntPrecursorTolerance(); + + SpectraPerBinNumberComparisonAssessor numberOfComparisonAssessor = new SpectraPerBinNumberComparisonAssessor( + precursorTolerance * 2, minNumberOfComparisons, BasicIntegerNormalizer.MZ_CONSTANT * 2500 + ); + + IComparisonPredicate firstRoundPredicate = new ShareNComparisonPeaksPredicate( + nInitiallySharedPeaks); + + if (!ignoreCharge) { + firstRoundPredicate = new SameChargePredicate().and(firstRoundPredicate); + } + + int windowSizeNoiseFilter = (fragmentIonPrecision.equalsIgnoreCase("high")) ? 3000 : 100; + + GreedyClusteringEngine engine = new GreedyClusteringEngine( + precursorTolerance, + thresholdStart, thresholdEnd, clusterRounds, new CombinedFisherIntensityTest(), + numberOfComparisonAssessor, firstRoundPredicate, + windowSizeNoiseFilter); + + return engine; + } + + /** + * Create a new loading filter based on the currently set parameters. + * + * Note: The validity of these parameters is not checked in this function but + * must be checked before calling it. + * + * @return A new instance of an IRawSpectrumFunction + */ + public IRawSpectrumFunction createLoadingFilter() { + // set an approximate fragment tolerance for the filters + double fragmentTolerance = (fragmentIonPrecision.equalsIgnoreCase("high")) ? 0.01 : 0.5; + + return new RemoveImpossiblyHighPeaksFunction() + .specAndThen(new RemovePrecursorPeaksFunction(fragmentTolerance)) + .specAndThen(new RawPeaksWrapperFunction(new KeepNHighestRawPeaks(numberHigherPeaks))); + + } + + /** + * Creates a new instance of the matching m/z binner. + * + * Note: The validity of these parameters is not checked in this function but + * must be checked before calling it. + * + * @return A new IMzBinner object. + */ + public IMzBinner createMzBinner() { + return (fragmentIonPrecision.equalsIgnoreCase("high")) ? + new HighResolutionMzBinner() : new TideBinner(); + } +} diff --git a/src/main/java/org/spectra/cluster/util/DefaultParameters.java b/src/main/java/org/spectra/cluster/util/DefaultParameters.java deleted file mode 100644 index 2adc120..0000000 --- a/src/main/java/org/spectra/cluster/util/DefaultParameters.java +++ /dev/null @@ -1,103 +0,0 @@ -package org.spectra.cluster.util; - -import lombok.Data; - -import java.io.File; -import java.io.FileInputStream; -import java.io.IOException; -import java.io.InputStream; -import java.net.URISyntaxException; -import java.util.Properties; - -/** - * This code is licensed under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - *

- * http://www.apache.org/licenses/LICENSE-2.0 - *

- * ==Overview== - * - * @author ypriverol on 18/10/2018. - */ -@Data -public class DefaultParameters { - - private String binaryDirectory; - private boolean reuseBinary; - private boolean fastMode; - private Integer clusterRounds; - private boolean filterReportPeaks; - private Integer numberHigherPeaks; - private Double precursorIonTolerance; - private String fragmentIonPrecision; - private boolean ignoreCharge; - - private Float thresholdStart; - private Float thresholdEnd; - private int nInitiallySharedPeaks; - private int minNumberOfComparisons; - - - public DefaultParameters(){ - - try { - Properties properties = readProperties(); - setProperties(properties); - - } catch (URISyntaxException e) { - e.printStackTrace(); - } - - } - - private void setProperties(Properties properties) { - if(properties.containsKey("precursor.tolerance")) - this.precursorIonTolerance = Double.parseDouble(properties.getProperty("precursor.tolerance").trim()); - if(properties.containsKey("fragment.precision")) - this.fragmentIonPrecision = properties.getProperty("fragment.precision"); - if(properties.containsKey("threshold.start")) - this.thresholdStart = Float.parseFloat(properties.getProperty("threshold.start")); - if(properties.containsKey("threshold.end")) - this.thresholdEnd = Float.parseFloat(properties.getProperty("threshold.end")); - if(properties.containsKey("number.higher.peaks")) - this.numberHigherPeaks = Integer.parseInt(properties.getProperty("number.higher.peaks")); - if(properties.containsKey("cluster.rounds")) - this.clusterRounds = Integer.parseInt(properties.getProperty("cluster.rounds")); - if(properties.containsKey("binary.temp.directory")) - this.binaryDirectory = properties.getProperty("binary.temp.directory"); - if(properties.containsKey("reuse.binary.files")) - this.reuseBinary = Boolean.parseBoolean(properties.getProperty("reuse.binary.files")); - if(properties.containsKey("ignore.charge")) - this.ignoreCharge = Boolean.parseBoolean(properties.getProperty("ignore.charge")); - if(properties.containsKey("cluster.fast.mode")) - this.fastMode = Boolean.parseBoolean(properties.getProperty("cluster.fast.mode")); - if(properties.containsKey("filters.remove.reporter.peaks")) - this.filterReportPeaks = Boolean.parseBoolean(properties.getProperty("filters.remove.reporter.peaks")); - if(properties.containsKey("initially.shared.peaks")) - this.nInitiallySharedPeaks = Integer.parseInt(properties.getProperty("initially.shared.peaks")); - if(properties.containsKey("x.min.comparisons")) - this.minNumberOfComparisons = Integer.parseInt(properties.getProperty("x.min.comparisons")); - } - - public Properties readProperties() throws URISyntaxException { - Properties properties = new Properties(); - InputStream output; - - try { - output = getClass().getClassLoader().getResourceAsStream("application.properties"); - properties.load(output); - } catch (IOException e) { - e.printStackTrace(); - } - return properties; - } - - public void mergeParameters(String configFile) throws IOException { - File propertiesFactoryBean = new File(configFile); - Properties newProperties = new Properties(); - InputStream output = new FileInputStream(propertiesFactoryBean); - newProperties.load(output); - setProperties(newProperties); - } -} diff --git a/src/main/resources/application.properties b/src/main/resources/application.properties index cb28b27..3149508 100644 --- a/src/main/resources/application.properties +++ b/src/main/resources/application.properties @@ -1,7 +1,6 @@ precursor.tolerance=1 fragment.precision=low - # Clustering execution parameters threshold.start=1.0F threshold.end=0.99F @@ -14,11 +13,17 @@ reuse.binary.files=false cluster.fast.mode=false ignore.charge=false +# also write an MSP file containing all consensus spectra +output.msp=true + # Number of peaks that will be used to perform the comparison number.higher.peaks=40 initially.shared.peaks=5 filters.remove.reporter.peaks=true +# number of parallel jobs +n.threads = 2 + # The minimum number of comparisons is derived from the data. The set number # is used as an additional minimum x.min.comparisons=0 \ No newline at end of file diff --git a/src/test/java/org/spectra/cluster/consensus/AverageConsensusSpectrumBuilderTest.java b/src/test/java/org/spectra/cluster/consensus/AverageConsensusSpectrumBuilderTest.java new file mode 100644 index 0000000..6887231 --- /dev/null +++ b/src/test/java/org/spectra/cluster/consensus/AverageConsensusSpectrumBuilderTest.java @@ -0,0 +1,70 @@ +package org.spectra.cluster.consensus; + +import io.github.bigbio.pgatk.io.common.spectra.Spectrum; +import io.github.bigbio.pgatk.io.properties.IPropertyStorage; +import io.github.bigbio.pgatk.io.properties.InMemoryPropertyStorage; +import org.junit.Assert; +import org.junit.Test; +import org.spectra.cluster.cdf.MinNumberComparisonsAssessor; +import org.spectra.cluster.engine.GreedyClusteringEngine; +import org.spectra.cluster.engine.IClusteringEngine; +import org.spectra.cluster.filter.binaryspectrum.HighestPeakPerBinFunction; +import org.spectra.cluster.io.spectra.MzSpectraReader; +import org.spectra.cluster.model.cluster.ICluster; +import org.spectra.cluster.model.consensus.GreedyConsensusSpectrum; +import org.spectra.cluster.normalizer.BasicIntegerNormalizer; +import org.spectra.cluster.normalizer.MaxPeakNormalizer; +import org.spectra.cluster.normalizer.TideBinner; +import org.spectra.cluster.predicates.ShareHighestPeaksClusterPredicate; +import org.spectra.cluster.similarity.CombinedFisherIntensityTest; +import org.spectra.cluster.util.ClusteringParameters; + +import java.io.File; +import java.util.ArrayList; +import java.util.Comparator; +import java.util.Iterator; +import java.util.List; + +public class AverageConsensusSpectrumBuilderTest { + @Test + public void testAverageSpectrum() throws Exception { + ClusteringParameters params = new ClusteringParameters(); + IConsensusSpectrumBuilder builder = new AverageConsensusSpectrumBuilder(params); + + // load the spectra + File mgfFile = new File(getClass().getClassLoader().getResource("imp_single_cluster.mgf").toURI()); + IPropertyStorage localStorage = new InMemoryPropertyStorage(); + MzSpectraReader reader = new MzSpectraReader(mgfFile, new TideBinner(), new MaxPeakNormalizer(), + new BasicIntegerNormalizer(), new HighestPeakPerBinFunction(), params.createLoadingFilter(), + GreedyClusteringEngine.COMPARISON_FILTER, params.createGreedyClusteringEngine()); + + Iterator iterator = reader.readClusterIterator(localStorage); + List spectra = new ArrayList<>(1_000); + + while (iterator.hasNext()) { + spectra.add(iterator.next()); + } + + // sort according to m/z + spectra.sort(Comparator.comparingInt(ICluster::getPrecursorMz)); + + // cluster everything + IClusteringEngine engine = new GreedyClusteringEngine(BasicIntegerNormalizer.MZ_CONSTANT, + 1, 0.99f, 5, new CombinedFisherIntensityTest(), + new MinNumberComparisonsAssessor(10_000), new ShareHighestPeaksClusterPredicate(5), + GreedyConsensusSpectrum.NOISE_FILTER_INCREMENT); + + ICluster[] clusters = engine.clusterSpectra(spectra.toArray(new ICluster[spectra.size()])); + + Assert.assertEquals(1, clusters.length); + + // create the consensus spectrum + Spectrum consensusSpectrum = builder.createConsensusSpectrum(clusters[0], localStorage); + + Assert.assertNotNull(consensusSpectrum); + Assert.assertEquals(2, (int) consensusSpectrum.getPrecursorCharge()); + Assert.assertEquals(2, (int) consensusSpectrum.getMsLevel()); + Assert.assertEquals(69, consensusSpectrum.getPeakList().size()); + Assert.assertEquals(402.717, consensusSpectrum.getPrecursorMZ(), 0.001); + } +} diff --git a/src/test/java/org/spectra/cluster/consensus/TestAbstractConsensusBuilder.java b/src/test/java/org/spectra/cluster/consensus/TestAbstractConsensusBuilder.java new file mode 100644 index 0000000..82fe678 --- /dev/null +++ b/src/test/java/org/spectra/cluster/consensus/TestAbstractConsensusBuilder.java @@ -0,0 +1,97 @@ +package org.spectra.cluster.consensus; + +import io.github.bigbio.pgatk.io.properties.IPropertyStorage; +import io.github.bigbio.pgatk.io.properties.InMemoryPropertyStorage; +import org.junit.Assert; +import org.junit.Test; +import org.spectra.cluster.cdf.MinNumberComparisonsAssessor; +import org.spectra.cluster.engine.GreedyClusteringEngine; +import org.spectra.cluster.engine.IClusteringEngine; +import org.spectra.cluster.io.spectra.MzSpectraReader; +import org.spectra.cluster.model.cluster.ICluster; +import org.spectra.cluster.model.consensus.GreedyConsensusSpectrum; +import org.spectra.cluster.normalizer.BasicIntegerNormalizer; +import org.spectra.cluster.normalizer.MaxPeakNormalizer; +import org.spectra.cluster.predicates.ShareHighestPeaksClusterPredicate; +import org.spectra.cluster.similarity.CombinedFisherIntensityTest; + +import java.io.File; +import java.net.URI; +import java.util.*; +import java.util.stream.Collectors; + +public class TestAbstractConsensusBuilder { + private List loadClusters(IPropertyStorage propertyStorage) throws Exception { + // open the file + URI uri = Objects.requireNonNull(getClass().getClassLoader().getResource("single-spectra.mgf")).toURI(); + File mgfFile = new File(uri); + + IClusteringEngine engine = new GreedyClusteringEngine(BasicIntegerNormalizer.MZ_CONSTANT, + 1, 0.99f, 5, new CombinedFisherIntensityTest(), + new MinNumberComparisonsAssessor(10000), new ShareHighestPeaksClusterPredicate(5), + GreedyConsensusSpectrum.NOISE_FILTER_INCREMENT); + + MzSpectraReader spectraReader = new MzSpectraReader(mgfFile, GreedyClusteringEngine.COMPARISON_FILTER, engine); + + + // read the spectra + Iterator clusterIterator = spectraReader.readClusterIterator(propertyStorage); + List clusters = new ArrayList<>(20); + + while (clusterIterator.hasNext()) { + clusters.add(clusterIterator.next()); + } + + Assert.assertEquals(2, clusters.size()); + + return clusters; + } + + @Test + public void testLoadOriginalPeaks() throws Exception { + IPropertyStorage propertyStorage = new InMemoryPropertyStorage(); + List clusters = loadClusters(propertyStorage); + + // get the original peaks + for (ICluster cluster : clusters) { + List rawPeaks = AbstractConsensusSpectrumBuilder.loadOriginalPeaks(cluster, propertyStorage, true); + + Assert.assertEquals(50, rawPeaks.size()); + + // max peak must always be the same + double maxValue = rawPeaks.stream().mapToDouble(ConsensusPeak::getIntensity).max().getAsDouble(); + + Assert.assertEquals(MaxPeakNormalizer.MAX_INTENSITY, maxValue, 0); + } + } + + @Test + public void testAveragePrecursorMz () throws Exception { + IPropertyStorage propertyStorage = new InMemoryPropertyStorage(); + List clusters = loadClusters(propertyStorage); + + // get the average precuror m/z for every cluster + for (ICluster cluster : clusters) { + Double averageMz = AbstractConsensusSpectrumBuilder.getAveragePrecursorMz(cluster, propertyStorage); + + Assert.assertEquals(400.29, averageMz, 0.01); + } + } + + @Test + public void testMergePeaks() throws Exception { + ConsensusPeak p1 = new ConsensusPeak(10.0, 1.0); + ConsensusPeak p2 = new ConsensusPeak(10.1, 2.0); + ConsensusPeak p3 = new ConsensusPeak(11.0, 1.0); + + List peaks = Arrays.stream(new ConsensusPeak[]{p1, p2, p3}).collect(Collectors.toList()); + + List mergedPeaks = AbstractConsensusSpectrumBuilder.mergeConsensusPeaks(peaks, 0.5); + + Assert.assertEquals(2, mergedPeaks.size()); + Assert.assertEquals(10.05, mergedPeaks.get(0).getMz(), 0); + Assert.assertEquals(2, mergedPeaks.get(0).getCount()); + Assert.assertEquals(1.5, mergedPeaks.get(0).getIntensity(), 0); + Assert.assertEquals(1, mergedPeaks.get(1).getCount()); + } +} diff --git a/src/test/java/org/spectra/cluster/consensus/TestConsensusPeak.java b/src/test/java/org/spectra/cluster/consensus/TestConsensusPeak.java new file mode 100644 index 0000000..12b500f --- /dev/null +++ b/src/test/java/org/spectra/cluster/consensus/TestConsensusPeak.java @@ -0,0 +1,26 @@ +package org.spectra.cluster.consensus; + +import org.junit.Assert; +import org.junit.Test; + +public class TestConsensusPeak { + @Test + public void basicConsensusPeakTest() { + ConsensusPeak p1 = new ConsensusPeak(1.0, 1.0); + ConsensusPeak p2 = new ConsensusPeak(2.0, 0.5); + + p1.mergePeak(p2); + + Assert.assertEquals(1.5, p1.getMz(), 0); + Assert.assertEquals(0.75, p1.getIntensity(), 0); + Assert.assertEquals(2, p1.getCount()); + + ConsensusPeak p3 = new ConsensusPeak(3.0, 4.0); + + p1.mergePeak(p3); + + Assert.assertEquals(2.0, p1.getMz(), 0); + Assert.assertEquals(1.833, p1.getIntensity(), 0.001); + Assert.assertEquals(3, p1.getCount()); + } +} diff --git a/src/test/java/org/spectra/cluster/io/MapClusterStorageBenchmarkTest.java b/src/test/java/org/spectra/cluster/io/MapClusterStorageBenchmarkTest.java index bf6bb9c..02f14df 100644 --- a/src/test/java/org/spectra/cluster/io/MapClusterStorageBenchmarkTest.java +++ b/src/test/java/org/spectra/cluster/io/MapClusterStorageBenchmarkTest.java @@ -112,12 +112,8 @@ public void storeBigClusterStatic() throws IOException, SpectraClusterException, time = System.currentTimeMillis(); IntStream.range(0, MAX_READING_VALUE).forEach(x -> { - try { - int key = random.nextInt(clusters.length); - ICluster value = clusterStorage.get(clusters[key].getId() + String.valueOf(x)); - }catch (PgatkIOException ex){ - log.error("Error reading entry -- " + x); - } + int key = random.nextInt(clusters.length); + ICluster value = clusterStorage.get(clusters[key].getId() + String.valueOf(x)); }); System.out.println("ChronicleMap: Reading 200'000 Clusters -- " + (System.currentTimeMillis() - time) / 1000); @@ -152,11 +148,7 @@ public void storeBigClusterDynamic() throws IOException, SpectraClusterException time = System.currentTimeMillis(); IntStream.range(0, MAX_READING_VALUE).forEach(x -> { - try { - ICluster value = clusterStorage.get(clusters[0].getId() + "-" + String.valueOf(x)); - }catch (PgatkIOException ex){ - log.error("Error reading entry -- " + x); - } + ICluster value = clusterStorage.get(clusters[0].getId() + "-" + String.valueOf(x)); }); System.out.println("Sparkey: Reading 200'000 Clusters -- " + (System.currentTimeMillis() - time) / 1000); diff --git a/src/test/java/org/spectra/cluster/io/MzSpectraReaderTest.java b/src/test/java/org/spectra/cluster/io/MzSpectraReaderTest.java index f51253a..e58c4fa 100644 --- a/src/test/java/org/spectra/cluster/io/MzSpectraReaderTest.java +++ b/src/test/java/org/spectra/cluster/io/MzSpectraReaderTest.java @@ -117,7 +117,7 @@ public void testPropertyLoading() throws Exception { Assert.assertEquals(136, nIdentified); - Assert.assertEquals(7, storage.getAvailableProperties().size()); + Assert.assertEquals(9, storage.getAvailableProperties().size()); } @Test diff --git a/src/test/java/org/spectra/cluster/io/result/MspWriterTest.java b/src/test/java/org/spectra/cluster/io/result/MspWriterTest.java new file mode 100644 index 0000000..b338439 --- /dev/null +++ b/src/test/java/org/spectra/cluster/io/result/MspWriterTest.java @@ -0,0 +1,127 @@ +package org.spectra.cluster.io.result; + +import io.github.bigbio.pgatk.io.objectdb.LongObject; +import io.github.bigbio.pgatk.io.objectdb.ObjectsDB; +import io.github.bigbio.pgatk.io.properties.IPropertyStorage; +import io.github.bigbio.pgatk.io.properties.InMemoryPropertyStorage; +import org.junit.Assert; +import org.junit.Before; +import org.junit.Test; +import org.spectra.cluster.cdf.MinNumberComparisonsAssessor; +import org.spectra.cluster.consensus.AverageConsensusSpectrumBuilder; +import org.spectra.cluster.engine.GreedyClusteringEngine; +import org.spectra.cluster.filter.binaryspectrum.HighestPeakPerBinFunction; +import org.spectra.cluster.filter.rawpeaks.*; +import org.spectra.cluster.io.cluster.ObjectDBGreedyClusterStorage; +import org.spectra.cluster.io.spectra.MzSpectraReader; +import org.spectra.cluster.model.cluster.GreedySpectralCluster; +import org.spectra.cluster.model.cluster.ICluster; +import org.spectra.cluster.model.cluster.IClusterProperties; +import org.spectra.cluster.model.consensus.GreedyConsensusSpectrum; +import org.spectra.cluster.normalizer.BasicIntegerNormalizer; +import org.spectra.cluster.normalizer.MaxPeakNormalizer; +import org.spectra.cluster.normalizer.TideBinner; +import org.spectra.cluster.predicates.ShareHighestPeaksClusterPredicate; +import org.spectra.cluster.similarity.CombinedFisherIntensityTest; +import org.spectra.cluster.util.ClusteringParameters; + +import java.io.File; +import java.net.URI; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.Paths; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Iterator; +import java.util.List; + +public class MspWriterTest { + Path testDir; + + @Before + public void setUp() throws Exception { + testDir = Files.createTempDirectory("clusters-"); + } + + @Test + public void testMspWriting() throws Exception { + // ignore the property storage for now + IPropertyStorage propertyStorage = new InMemoryPropertyStorage(); + + IRawSpectrumFunction loadingFilter = new RemoveImpossiblyHighPeaksFunction() + .specAndThen(new RemovePrecursorPeaksFunction(0.5)) + .specAndThen(new RawPeaksWrapperFunction(new KeepNHighestRawPeaks(40))); + + // create a basic clustering engine for testing + GreedyClusteringEngine engine = new GreedyClusteringEngine(BasicIntegerNormalizer.MZ_CONSTANT, + 1, 0.99f, 5, new CombinedFisherIntensityTest(), + new MinNumberComparisonsAssessor(10000), new ShareHighestPeaksClusterPredicate(5), + GreedyConsensusSpectrum.NOISE_FILTER_INCREMENT); + + URI[] mgfFiles = new URI[] { + getClass().getClassLoader().getResource("same_sequence_cluster.mgf").toURI(), + getClass().getClassLoader().getResource("synthetic_mixed_runs.mgf").toURI()}; + File[] inFiles = Arrays.stream(mgfFiles).map(File::new).toArray(File[]::new); + + // read all files at once + MzSpectraReader reader = new MzSpectraReader(new TideBinner(), new MaxPeakNormalizer(), + new BasicIntegerNormalizer(), new HighestPeakPerBinFunction(), loadingFilter, + GreedyClusteringEngine.COMPARISON_FILTER, engine, inFiles); + + // create the iterator + Iterator iterator = reader.readClusterIterator(propertyStorage); + + // keep track of the cluster ids + List clusterProperties = new ArrayList<>(10_000); + + // create the output file + Path clusteringResult = Paths.get(testDir.toString(), "clustering_result.cls"); + ObjectDBGreedyClusterStorage clusterStorage = new ObjectDBGreedyClusterStorage(new ObjectsDB(clusteringResult.toString(), true)); + + while (iterator.hasNext()) { + GreedySpectralCluster c = (GreedySpectralCluster) iterator.next(); + clusterStorage.addGreedySpectralCluster(LongObject.asLongHash(c.getId()), c); + } + + clusterStorage.writeDBMode(); + clusterStorage.flush(); + + // convert + Path mspFile = Paths.get(testDir.toString(), "clusters.msp"); + + MspWriter resultWriter = new MspWriter(new AverageConsensusSpectrumBuilder(new ClusteringParameters())); + resultWriter.writeResult(mspFile, clusterStorage, propertyStorage); + + // check that everything worked + Assert.assertTrue(Files.exists(mspFile)); + + List lines = Files.readAllLines(mspFile); + Assert.assertEquals("Name: +42.011EVQLVETGGGLIQPGGSLR/2", lines.get(0)); + Assert.assertEquals("Comment: Spec=Consensus Parent=977.0230 Mods=1(0,[,Acetyl) Nreps=1 Naa=26 MaxRatio=1.000", lines.get(1)); + Assert.assertEquals("Num peaks: 50", lines.get(2)); + } + + @Test + public void testExtractPtms() { + MspWriter writer = new MspWriter(new AverageConsensusSpectrumBuilder(new ClusteringParameters())); + String sequence = "+42.011EVQLVET+42.011GGGLIQPGGSLR+42.011"; + + List mods = writer.extractModsFromSequence(sequence); + + Assert.assertEquals(3, mods.size()); + + Assert.assertEquals(0, mods.get(0).getPosition()); + Assert.assertEquals("Acetyl", mods.get(0).getName()); + Assert.assertEquals("[", mods.get(0).getAminoAcid()); + } + + @Test + public void testGetModString() { + MspWriter writer = new MspWriter(new AverageConsensusSpectrumBuilder(new ClusteringParameters())); + String sequence = "+42.011EVQLVET+42.011GGGLIQPGGSLR+42.011"; + + String modString = writer.getModString(sequence); + + Assert.assertEquals("3(0,[,Acetyl)(7,T,Acetyl)(19,],Acetyl)", modString); + } +} diff --git a/src/test/java/org/spectra/cluster/tools/LocalParallelBinnedClusteringToolTest.java b/src/test/java/org/spectra/cluster/tools/LocalParallelBinnedClusteringToolTest.java index 5a98978..247d00e 100644 --- a/src/test/java/org/spectra/cluster/tools/LocalParallelBinnedClusteringToolTest.java +++ b/src/test/java/org/spectra/cluster/tools/LocalParallelBinnedClusteringToolTest.java @@ -24,6 +24,7 @@ import org.spectra.cluster.normalizer.TideBinner; import org.spectra.cluster.predicates.ShareHighestPeaksClusterPredicate; import org.spectra.cluster.similarity.CombinedFisherIntensityTest; +import org.spectra.cluster.util.ClusteringParameters; import java.io.File; import java.net.URI; @@ -108,11 +109,18 @@ public void testParallelClustering() throws Exception { File finalResultFile = new File(testDir.toFile(), "result.bcs"); - clusterer.runClustering(testClusters, clusterStorage, finalResultFile, - BasicIntegerNormalizer.MZ_CONSTANT, - 1, 0.99f, 5, new CombinedFisherIntensityTest(), - new MinNumberComparisonsAssessor(10000), new ShareHighestPeaksClusterPredicate(5), - GreedyConsensusSpectrum.NOISE_FILTER_INCREMENT); + ClusteringParameters clusteringParameters = new ClusteringParameters(); + clusteringParameters.setThresholdStart(1f); + clusteringParameters.setThresholdEnd(0.99f); + clusteringParameters.setClusterRounds(5); + clusteringParameters.setFragmentIonPrecision("high"); + clusteringParameters.setPrecursorIonTolerance((double) 1); + clusteringParameters.setIgnoreCharge(false); + clusteringParameters.setNInitiallySharedPeaks(5); + clusteringParameters.setNThreads(1); + clusteringParameters.setOutputFile(finalResultFile); + + clusterer.runClustering(testClusters, clusterStorage, clusteringParameters); // make sure the final result file exists Assert.assertTrue(finalResultFile.exists()); @@ -131,7 +139,7 @@ public void testParallelClustering() throws Exception { totalSpectra += cluster.getClusteredSpectraCount(); } - Assert.assertEquals(192, totalClusters); + Assert.assertEquals(95, totalClusters); Assert.assertEquals(testClusters.length, totalSpectra); } }