OpenMS
Biosaur2Algorithm.h
Go to the documentation of this file.
1 // Copyright (c) 2002-present, OpenMS Inc. -- EKU Tuebingen, ETH Zurich, and FU Berlin
2 // SPDX-License-Identifier: BSD-3-Clause
3 //
4 // --------------------------------------------------------------------------
5 // $Maintainer: Timo Sachsenberg $
6 // $Authors: Timo Sachsenberg $
7 // --------------------------------------------------------------------------
8 
9 #pragma once
10 
14 
15 #include <map>
16 #include <vector>
17 
18 namespace OpenMS
19 {
20 
50 class OPENMS_DLLAPI Biosaur2Algorithm :
51  public DefaultParamHandler
52 {
53 public:
62  struct Hill
63  {
64  std::vector<Size> scan_indices;
65  std::vector<Size> peak_indices;
66  std::vector<double> mz_values;
67  std::vector<double> intensities;
68  std::vector<double> rt_values;
69  std::vector<double> drift_times;
70  std::vector<double> ion_mobilities;
71  double mz_weighted_mean = 0.0;
72  double rt_start = 0.0;
73  double rt_end = 0.0;
74  double rt_apex = 0.0;
75  double intensity_apex = 0.0;
76  double intensity_sum = 0.0;
77  double drift_time_median = -1.0;
78  double ion_mobility_median = -1.0;
79  Size length = 0;
80  Size hill_idx = 0;
81  };
82 
91  {
92  Size hill_idx = 0;
93  Size isotope_number = 0;
94  double mass_diff_ppm = 0.0;
95  double cos_corr = 0.0;
96  };
97 
107  {
108  double mz = 0.0;
109  double rt_start = 0.0;
110  double rt_end = 0.0;
111  double rt_apex = 0.0;
112  double intensity_apex = 0.0;
113  double intensity_sum = 0.0;
114  int charge = 0;
115  Size n_isotopes = 0;
116  Size n_scans = 0;
117  double mass_calib = 0.0;
118  double drift_time = -1.0;
119  double ion_mobility = -1.0;
120  std::vector<IsotopeCandidate> isotopes;
121  Size mono_hill_idx = 0;
122  };
123 
130 
136  void setMSData(const MSExperiment& ms_data);
137 
143  void setMSData(MSExperiment&& ms_data);
144 
151 
157  const MSExperiment& getMSData() const;
158 
169  void run(FeatureMap& feature_map);
170 
192  void run(FeatureMap& feature_map,
193  std::vector<Hill>& hills,
194  std::vector<PeptideFeature>& peptide_features);
195 
205  void writeTSV(const std::vector<PeptideFeature>& features, const String& filename) const;
206 
216  void writeHills(const std::vector<Hill>& hills, const String& filename) const;
217 
218 protected:
220  void updateMembers_() override;
221 
222 private:
224 
225 
233  {
234  Size hill_index = 0;
235  Size first_scan = 0;
236  Size last_scan = 0;
237  };
238 
246  {
247  Size mono_index = 0;
248  double mono_mz = 0.0;
249  int charge = 0;
250  double cos_cor_isotopes = 0.0;
251  std::vector<IsotopeCandidate> isotopes;
252  Size n_scans = 0;
253  };
254 
256 
258 
259 
267  double calculatePPM_(double mz1, double mz2) const;
268 
275  double calculateMedian_(const std::vector<double>& values) const;
276 
282  double cosineCorrelation1D_(const std::vector<double>& v1,
283  const std::vector<double>& v2) const;
284 
292  std::pair<double, Size> checkingCosCorrelationForCarbon_(const std::vector<double>& theor_full,
293  const std::vector<double>& exp_full,
294  double thresh) const;
295 
304  std::pair<std::vector<double>, Size> computeAveragine_(double neutral_mass,
305  double apex_intensity) const;
306 
316  std::vector<double> meanFilter_(const std::vector<double>& data, Size window) const;
317 
327  std::pair<double, double> calibrateMass_(const std::vector<double>& mass_errors, double bin_width = 0.05) const;
328 
336  double computeHillMzStep_(const MSExperiment& exp,
337  double htol_ppm,
338  double min_intensity,
339  double min_mz,
340  double max_mz) const;
341 
350  void processTOF_(MSExperiment& exp) const;
351 
358 
369  void centroidPASEFData_(MSExperiment& exp, double mz_step, double pasef_tolerance) const;
370 
386  std::vector<Hill> detectHills_(const MSExperiment& exp, double htol_ppm, double min_intensity, double min_mz, double max_mz, bool use_im, std::vector<double>* hill_mass_diffs = nullptr) const;
387 
396  void linkScanToHills_(const MSSpectrum& spectrum,
397  Size scan_idx,
398  double htol_ppm,
399  double min_intensity,
400  double min_mz,
401  double max_mz,
402  double mz_step,
403  bool use_im_global,
404  std::vector<Hill>& hills,
405  Size& hill_idx_counter,
406  std::vector<Size>& prev_peak_to_hill,
407  const MSSpectrum*& prev_spectrum_ptr,
408  std::map<int, std::vector<int>>& prev_fast_dict,
409  std::vector<int>& prev_im_bins,
410  std::vector<double>* hill_mass_diffs) const;
411 
419  std::vector<Hill> processHills_(const std::vector<Hill>& hills, Size min_length) const;
420 
432  std::vector<Hill> splitHills_(const std::vector<Hill>& hills, double hvf, Size min_length) const;
433 
445  Size checkIsotopeValleySplit_(const std::vector<IsotopeCandidate>& isotopes, const std::vector<Hill>& hills, double ivf) const;
446 
461  std::map<int, std::pair<double, double>> performInitialIsotopeCalibration_(const std::vector<Hill>& hills,
462  double itol_ppm,
463  int min_charge,
464  int max_charge,
465  bool enable_isotope_calib) const;
466 
479  double buildFastMzLookup_(const std::vector<Hill>& hills,
480  bool use_im,
481  std::map<int, std::vector<FastHillEntry>>& hills_mz_fast,
482  std::vector<int>& hill_im_bins) const;
483 
503  std::vector<PatternCandidate> generateIsotopeCandidates_(const std::vector<Hill>& hills,
504  double itol_ppm,
505  int min_charge,
506  int max_charge,
507  double ivf,
508  double mz_step,
509  const std::map<int, std::vector<FastHillEntry>>& hills_mz_fast,
510  const std::map<Size, Size>& hill_idx_to_index,
511  const std::vector<int>& hill_im_bins,
512  bool use_im) const;
513 
525  std::vector<PatternCandidate> applyRtFiltering_(const std::vector<PatternCandidate>& candidates,
526  const std::vector<Hill>& hills,
527  const std::map<Size, Size>& hill_idx_to_index) const;
528 
540  std::map<int, std::pair<double, double>> refineIsotopeCalibration_(const std::vector<PatternCandidate>& candidates,
541  double itol_ppm,
542  bool enable_isotope_calib) const;
543 
557  std::vector<PatternCandidate> filterByCalibration_(const std::vector<PatternCandidate>& candidates,
558  const std::vector<Hill>& hills,
559  const std::map<Size, Size>& hill_idx_to_index,
560  const std::map<int, std::pair<double, double>>& isotope_calib_map_ready,
561  bool enable_isotope_calib) const;
562 
577  std::vector<PeptideFeature> selectNonOverlappingPatterns_(const std::vector<PatternCandidate>& filtered_ready,
578  const std::vector<Hill>& hills,
579  bool negative_mode,
580  int iuse,
581  double itol_ppm) const;
582 
601  std::vector<PeptideFeature> detectIsotopePatterns_(std::vector<Hill>& hills, double itol_ppm, int min_charge, int max_charge, bool negative_mode, double ivf, int iuse, bool enable_isotope_calib, bool use_im) const;
602 
615  FeatureMap convertToFeatureMap_(const std::vector<PeptideFeature>& features,
616  const std::vector<Hill>& hills) const;
617 
634  void debugCheckIsotopeConsistency_(const char* stage_label,
635  double mono_mz_center,
636  double mono_rt_apex,
637  Size mono_hill_idx,
638  int charge,
639  double itol_ppm,
640  const Hill& iso_hill,
641  Size isotope_number) const;
642 
655  double cosineCorrelation_(const std::vector<double>& intensities1, const std::vector<Size>& scans1,
656  const std::vector<double>& intensities2, const std::vector<Size>& scans2) const;
657 
667  bool shouldThrowForMissingIM_(const MSSpectrum& spectrum) const;
668 
681  void processFAIMSGroup_(double faims_cv,
682  MSExperiment& group_exp,
683  double original_paseftol,
684  std::vector<Hill>& hills_out,
685  std::vector<PeptideFeature>& features_out);
687 
689 
691 
692  // Algorithm parameters (cached from Param for performance)
693  double mini_;
694  double minmz_;
695  double maxmz_;
696  double htol_;
697  double itol_;
698  double hvf_;
699  double ivf_;
701  int cmin_;
702  int cmax_;
703  double pasefmini_;
705  int iuse_;
707  bool tof_mode_;
711  double paseftol_;
712  double hrttol_;
716 };
717 
718 } // namespace OpenMS
Implementation of the Biosaur2 feature detection workflow for LC-MS1 data.
Definition: Biosaur2Algorithm.h:52
int cmin_
Minimum charge state to consider.
Definition: Biosaur2Algorithm.h:701
std::vector< PatternCandidate > generateIsotopeCandidates_(const std::vector< Hill > &hills, double itol_ppm, int min_charge, int max_charge, double ivf, double mz_step, const std::map< int, std::vector< FastHillEntry >> &hills_mz_fast, const std::map< Size, Size > &hill_idx_to_index, const std::vector< int > &hill_im_bins, bool use_im) const
Generate initial isotope pattern candidates for all monoisotopic hills.
double pasefmini_
Minimum intensity for PASEF/TIMS clusters after centroiding.
Definition: Biosaur2Algorithm.h:703
std::vector< double > drift_times
Drift time values (TIMS data), empty if not available.
Definition: Biosaur2Algorithm.h:69
void centroidProfileSpectra_(MSExperiment &exp) const
Centroid profile spectra using PeakPickerHiRes.
std::vector< PatternCandidate > filterByCalibration_(const std::vector< PatternCandidate > &candidates, const std::vector< Hill > &hills, const std::map< Size, Size > &hill_idx_to_index, const std::map< int, std::pair< double, double >> &isotope_calib_map_ready, bool enable_isotope_calib) const
Filter isotope pattern candidates using refined calibration and cosine checks.
String convex_hull_mode_
Representation of feature convex hulls ("mass_traces" vs. "bounding_box")
Definition: Biosaur2Algorithm.h:713
double minmz_
Minimum m/z value.
Definition: Biosaur2Algorithm.h:694
std::vector< PeptideFeature > detectIsotopePatterns_(std::vector< Hill > &hills, double itol_ppm, int min_charge, int max_charge, bool negative_mode, double ivf, int iuse, bool enable_isotope_calib, bool use_im) const
Detect isotope patterns and assemble peptide features.
double mini_
Minimum intensity threshold.
Definition: Biosaur2Algorithm.h:693
void processFAIMSGroup_(double faims_cv, MSExperiment &group_exp, double original_paseftol, std::vector< Hill > &hills_out, std::vector< PeptideFeature > &features_out)
Process a single FAIMS compensation voltage group.
std::vector< double > mz_values
m/z values of peaks in this hill
Definition: Biosaur2Algorithm.h:66
std::pair< double, Size > checkingCosCorrelationForCarbon_(const std::vector< double > &theor_full, const std::vector< double > &exp_full, double thresh) const
Check cosine correlation for averagine-based isotope intensities.
std::map< int, std::pair< double, double > > performInitialIsotopeCalibration_(const std::vector< Hill > &hills, double itol_ppm, int min_charge, int max_charge, bool enable_isotope_calib) const
Perform an initial mass calibration for isotope spacings based on raw hills.
const MSExperiment & getMSData() const
Get const reference to MS data.
std::vector< double > rt_values
Retention time values corresponding to each peak.
Definition: Biosaur2Algorithm.h:68
Size checkIsotopeValleySplit_(const std::vector< IsotopeCandidate > &isotopes, const std::vector< Hill > &hills, double ivf) const
Evaluate whether isotope pattern should be truncated at valley positions.
void run(FeatureMap &feature_map, std::vector< Hill > &hills, std::vector< PeptideFeature > &peptide_features)
Execute the Biosaur2 workflow on the stored MS1 experiment.
bool faims_merge_features_
Whether to merge features at different FAIMS CV values representing the same analyte.
Definition: Biosaur2Algorithm.h:714
bool profile_mode_
Whether to centroid profile data using PeakPickerHiRes.
Definition: Biosaur2Algorithm.h:708
double htol_
Mass tolerance in ppm for hill detection.
Definition: Biosaur2Algorithm.h:696
bool shouldThrowForMissingIM_(const MSSpectrum &spectrum) const
Check if missing ion mobility data should be treated as an error.
double hrttol_
Maximum RT difference between monoisotopic and isotope apex (0=disable)
Definition: Biosaur2Algorithm.h:712
Size minlh_
Minimum number of scans required for a hill.
Definition: Biosaur2Algorithm.h:700
std::pair< std::vector< double >, Size > computeAveragine_(double neutral_mass, double apex_intensity) const
Compute averagine-based theoretical isotope intensities.
std::vector< double > ion_mobilities
Ion mobility values, empty if not available.
Definition: Biosaur2Algorithm.h:70
std::vector< Hill > splitHills_(const std::vector< Hill > &hills, double hvf, Size min_length) const
Split hills at valley positions to separate co-eluting species.
Biosaur2Algorithm()
Default constructor.
bool ignore_iso_calib_
Whether to disable automatic isotope mass calibration.
Definition: Biosaur2Algorithm.h:710
double itol_
Mass tolerance in ppm for isotope pattern detection.
Definition: Biosaur2Algorithm.h:697
std::vector< Size > scan_indices
Indices of spectra containing peaks of this hill.
Definition: Biosaur2Algorithm.h:64
std::vector< Hill > processHills_(const std::vector< Hill > &hills, Size min_length) const
Filter and process hills by applying length constraints and computing summary statistics.
double buildFastMzLookup_(const std::vector< Hill > &hills, bool use_im, std::map< int, std::vector< FastHillEntry >> &hills_mz_fast, std::vector< int > &hill_im_bins) const
Build fast m/z and optional ion-mobility lookup structures for hills.
bool tof_mode_
Whether to enable TOF-specific intensity filtering.
Definition: Biosaur2Algorithm.h:707
void setMSData(const MSExperiment &ms_data)
Set the MS data used for feature detection (copy version)
MSExperiment & getMSData()
Get non-const reference to MS data.
void run(FeatureMap &feature_map)
Execute the Biosaur2 workflow on the stored MS1 experiment (simplified interface)
int cmax_
Maximum charge state to consider.
Definition: Biosaur2Algorithm.h:702
int iuse_
Number of isotopes for intensity calculation (0=mono, -1=all, N=mono+N)
Definition: Biosaur2Algorithm.h:705
std::vector< double > intensities
Intensity values of peaks in this hill.
Definition: Biosaur2Algorithm.h:67
void writeHills(const std::vector< Hill > &hills, const String &filename) const
Export the detected hills as TSV for diagnostic purposes.
FeatureMap convertToFeatureMap_(const std::vector< PeptideFeature > &features, const std::vector< Hill > &hills) const
Convert peptide features to OpenMS FeatureMap format.
std::vector< PeptideFeature > selectNonOverlappingPatterns_(const std::vector< PatternCandidate > &filtered_ready, const std::vector< Hill > &hills, bool negative_mode, int iuse, double itol_ppm) const
Greedily select non-overlapping isotope patterns and assemble peptide features.
std::vector< Hill > detectHills_(const MSExperiment &exp, double htol_ppm, double min_intensity, double min_mz, double max_mz, bool use_im, std::vector< double > *hill_mass_diffs=nullptr) const
Detect hills (continuous m/z traces) in the MS experiment.
double maxmz_
Maximum m/z value.
Definition: Biosaur2Algorithm.h:695
double paseftol_
Ion mobility tolerance for PASEF/TIMS data (0=disable)
Definition: Biosaur2Algorithm.h:711
bool use_hill_calib_
Whether to use automatic hill mass tolerance calibration.
Definition: Biosaur2Algorithm.h:709
void debugCheckIsotopeConsistency_(const char *stage_label, double mono_mz_center, double mono_rt_apex, Size mono_hill_idx, int charge, double itol_ppm, const Hill &iso_hill, Size isotope_number) const
Debug helper to log obviously inconsistent isotope assignments.
std::vector< Size > peak_indices
Indices of peaks within each spectrum.
Definition: Biosaur2Algorithm.h:65
Size pasefminlh_
Minimum number of points per PASEF/TIMS cluster.
Definition: Biosaur2Algorithm.h:704
double ivf_
Isotope valley factor for splitting isotope patterns.
Definition: Biosaur2Algorithm.h:699
double cosineCorrelation1D_(const std::vector< double > &v1, const std::vector< double > &v2) const
Compute a cosine correlation between two 1D intensity vectors.
void updateMembers_() override
Update internal member variables from parameters (called automatically when parameters change)
bool negative_mode_
Whether to use negative ion mode.
Definition: Biosaur2Algorithm.h:706
void linkScanToHills_(const MSSpectrum &spectrum, Size scan_idx, double htol_ppm, double min_intensity, double min_mz, double max_mz, double mz_step, bool use_im_global, std::vector< Hill > &hills, Size &hill_idx_counter, std::vector< Size > &prev_peak_to_hill, const MSSpectrum *&prev_spectrum_ptr, std::map< int, std::vector< int >> &prev_fast_dict, std::vector< int > &prev_im_bins, std::vector< double > *hill_mass_diffs) const
Link peaks in a single scan to existing hills or start new hills.
double calculatePPM_(double mz1, double mz2) const
Calculate the mass accuracy (ppm) between two m/z values.
MSExperiment ms_data_
Input LC-MS data.
Definition: Biosaur2Algorithm.h:690
void centroidPASEFData_(MSExperiment &exp, double mz_step, double pasef_tolerance) const
Centroid PASEF/TIMS spectra in joint m/z-ion mobility space.
double calculateMedian_(const std::vector< double > &values) const
Calculate the median of a vector of values.
void writeTSV(const std::vector< PeptideFeature > &features, const String &filename) const
Export detected peptide features to a Biosaur2-compatible TSV file.
double cosineCorrelation_(const std::vector< double > &intensities1, const std::vector< Size > &scans1, const std::vector< double > &intensities2, const std::vector< Size > &scans2) const
Compute cosine correlation between two intensity traces.
void setMSData(MSExperiment &&ms_data)
Set the MS data used for feature detection (move version)
double computeHillMzStep_(const MSExperiment &exp, double htol_ppm, double min_intensity, double min_mz, double max_mz) const
Determine m/z binning step for hill detection.
std::vector< PatternCandidate > applyRtFiltering_(const std::vector< PatternCandidate > &candidates, const std::vector< Hill > &hills, const std::map< Size, Size > &hill_idx_to_index) const
Apply RT-apex based filtering to isotope pattern candidates.
std::map< int, std::pair< double, double > > refineIsotopeCalibration_(const std::vector< PatternCandidate > &candidates, double itol_ppm, bool enable_isotope_calib) const
Refine isotope mass calibration based on initial pattern candidates.
void processTOF_(MSExperiment &exp) const
Apply TOF-specific intensity filtering.
std::vector< IsotopeCandidate > isotopes
List of associated isotope peaks.
Definition: Biosaur2Algorithm.h:120
std::vector< double > meanFilter_(const std::vector< double > &data, Size window) const
Apply a mean filter (moving average) to smooth data.
double hvf_
Hill valley factor for splitting hills at valleys.
Definition: Biosaur2Algorithm.h:698
std::pair< double, double > calibrateMass_(const std::vector< double > &mass_errors, double bin_width=0.05) const
Estimate mass calibration parameters from a distribution of mass errors.
Lightweight index entry for fast m/z-based hill lookup.
Definition: Biosaur2Algorithm.h:233
Representation of a single hill (continuous m/z trace across adjacent scans).
Definition: Biosaur2Algorithm.h:63
Candidate isotope peak that can be associated with a monoisotopic hill.
Definition: Biosaur2Algorithm.h:91
Internal representation of a candidate isotope pattern.
Definition: Biosaur2Algorithm.h:246
Aggregated properties of a detected peptide feature.
Definition: Biosaur2Algorithm.h:107
A base class for all classes handling default parameters.
Definition: DefaultParamHandler.h:66
A container for features.
Definition: FeatureMap.h:82
In-Memory representation of a mass spectrometry run.
Definition: MSExperiment.h:49
The representation of a 1D spectrum.
Definition: MSSpectrum.h:44
A more convenient string class.
Definition: String.h:34
size_t Size
Size type e.g. used as variable which can hold result of size()
Definition: Types.h:97
Main OpenMS namespace.
Definition: openswathalgo/include/OpenMS/OPENSWATHALGO/DATAACCESS/ISpectrumAccess.h:19