OpenMS  2.8.0
SignalToNoiseEstimatorMedian.h
Go to the documentation of this file.
1 // --------------------------------------------------------------------------
2 // OpenMS -- Open-Source Mass Spectrometry
3 // --------------------------------------------------------------------------
4 // Copyright The OpenMS Team -- Eberhard Karls University Tuebingen,
5 // ETH Zurich, and Freie Universitaet Berlin 2002-2021.
6 //
7 // This software is released under a three-clause BSD license:
8 // * Redistributions of source code must retain the above copyright
9 // notice, this list of conditions and the following disclaimer.
10 // * Redistributions in binary form must reproduce the above copyright
11 // notice, this list of conditions and the following disclaimer in the
12 // documentation and/or other materials provided with the distribution.
13 // * Neither the name of any author or any participating institution
14 // may be used to endorse or promote products derived from this software
15 // without specific prior written permission.
16 // For a full list of authors, refer to the file AUTHORS.
17 // --------------------------------------------------------------------------
18 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
19 // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21 // ARE DISCLAIMED. IN NO EVENT SHALL ANY OF THE AUTHORS OR THE CONTRIBUTING
22 // INSTITUTIONS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
23 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
24 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
25 // OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
26 // WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
27 // OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
28 // ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 //
30 // --------------------------------------------------------------------------
31 // $Maintainer: Chris Bielow $
32 // $Authors: $
33 // --------------------------------------------------------------------------
34 //
35 
36 #pragma once
37 
38 
43 #include <vector>
44 #include <algorithm> //for std::max_element
45 
46 namespace OpenMS
47 {
80  template <typename Container = MSSpectrum>
82  public SignalToNoiseEstimator<Container>
83  {
84 
85 public:
86 
89 
93 
96 
98 
101  {
102  //set the name for DefaultParamHandler error messages
103  this->setName("SignalToNoiseEstimatorMedian");
104 
105  defaults_.setValue("max_intensity", -1, "maximal intensity considered for histogram construction. By default, it will be calculated automatically (see auto_mode)." \
106  " Only provide this parameter if you know what you are doing (and change 'auto_mode' to '-1')!" \
107  " All intensities EQUAL/ABOVE 'max_intensity' will be added to the LAST histogram bin." \
108  " If you choose 'max_intensity' too small, the noise estimate might be too small as well. " \
109  " If chosen too big, the bins become quite large (which you could counter by increasing 'bin_count', which increases runtime)." \
110  " In general, the Median-S/N estimator is more robust to a manual max_intensity than the MeanIterative-S/N.", {"advanced"});
111  defaults_.setMinInt("max_intensity", -1);
112 
113  defaults_.setValue("auto_max_stdev_factor", 3.0, "parameter for 'max_intensity' estimation (if 'auto_mode' == 0): mean + 'auto_max_stdev_factor' * stdev", {"advanced"});
114  defaults_.setMinFloat("auto_max_stdev_factor", 0.0);
115  defaults_.setMaxFloat("auto_max_stdev_factor", 999.0);
116 
117  defaults_.setValue("auto_max_percentile", 95, "parameter for 'max_intensity' estimation (if 'auto_mode' == 1): auto_max_percentile th percentile", {"advanced"});
118  defaults_.setMinInt("auto_max_percentile", 0);
119  defaults_.setMaxInt("auto_max_percentile", 100);
120 
121  defaults_.setValue("auto_mode", 0, "method to use to determine maximal intensity: -1 --> use 'max_intensity'; 0 --> 'auto_max_stdev_factor' method (default); 1 --> 'auto_max_percentile' method", {"advanced"});
122  defaults_.setMinInt("auto_mode", -1);
123  defaults_.setMaxInt("auto_mode", 1);
124 
125  defaults_.setValue("win_len", 200.0, "window length in Thomson");
126  defaults_.setMinFloat("win_len", 1.0);
127 
128  defaults_.setValue("bin_count", 30, "number of bins for intensity values");
129  defaults_.setMinInt("bin_count", 3);
130 
131  defaults_.setValue("min_required_elements", 10, "minimum number of elements required in a window (otherwise it is considered sparse)");
132  defaults_.setMinInt("min_required_elements", 1);
133 
134  defaults_.setValue("noise_for_empty_window", std::pow(10.0, 20), "noise value used for sparse windows", {"advanced"});
135 
136  defaults_.setValue("write_log_messages", "true", "Write out log messages in case of sparse windows or median in rightmost histogram bin");
137  defaults_.setValidStrings("write_log_messages", {"true","false"});
138 
140  }
141 
144  SignalToNoiseEstimator<Container>(source)
145  {
146  updateMembers_();
147  }
148 
154  {
155  if (&source == this) return *this;
156 
158  updateMembers_();
159  return *this;
160  }
161 
163 
166  {}
167 
169  double getSparseWindowPercent() const
170  {
171  return sparse_window_percent_;
172  }
173 
176  {
177  return histogram_oob_percent_;
178  }
179 
180 protected:
181 
182 
188  void computeSTN_(const Container& c) override
189  {
190  //first element in the scan
191  PeakIterator scan_first_ = c.begin();
192  //last element in the scan
193  PeakIterator scan_last_ = c.end();
194 
195  // reset counter for sparse windows
197  // reset counter for histogram overflow
199 
200  // reset the results
201  stn_estimates_.clear();
202  stn_estimates_.resize(c.size());
203 
204  // maximal range of histogram needs to be calculated first
205  if (auto_mode_ == AUTOMAXBYSTDEV)
206  {
207  // use MEAN+auto_max_intensity_*STDEV as threshold
208  GaussianEstimate gauss_global = SignalToNoiseEstimator<Container>::estimate_(scan_first_, scan_last_);
209  max_intensity_ = gauss_global.mean + std::sqrt(gauss_global.variance) * auto_max_stdev_Factor_;
210  }
211  else if (auto_mode_ == AUTOMAXBYPERCENT)
212  {
213  // get value at "auto_max_percentile_"th percentile
214  // we use a histogram approach here as well.
215  if ((auto_max_percentile_ < 0) || (auto_max_percentile_ > 100))
216  {
218  throw Exception::InvalidValue(__FILE__,
219  __LINE__,
220  OPENMS_PRETTY_FUNCTION,
221  "auto_mode is on AUTOMAXBYPERCENT! auto_max_percentile is not in [0,100]. Use setAutoMaxPercentile(<value>) to change it!",
222  s);
223  }
224 
225  std::vector<int> histogram_auto(100, 0);
226 
227  // find maximum of current scan
228  auto maxIt = std::max_element(c.begin(), c.end() ,[](const PeakType& a, const PeakType& b){ return a.getIntensity() > b.getIntensity();});
229  typename PeakType::IntensityType maxInt = maxIt->getIntensity();
230 
231  double bin_size = maxInt / 100;
232 
233  // fill histogram
234  for(const auto& peak : c)
235  {
236  ++histogram_auto[(int) ((peak.getIntensity() - 1) / bin_size)];
237  }
238 
239  // add up element counts in histogram until ?th percentile is reached
240  int elements_below_percentile = (int) (auto_max_percentile_ * c.size() / 100);
241  int elements_seen = 0;
242  int i = -1;
243  PeakIterator run = scan_first_;
244 
245  while (run != scan_last_ && elements_seen < elements_below_percentile)
246  {
247  ++i;
248  elements_seen += histogram_auto[i];
249  ++run;
250  }
251 
252  max_intensity_ = (((double)i) + 0.5) * bin_size;
253  }
254  else //if (auto_mode_ == MANUAL)
255  {
256  if (max_intensity_ <= 0)
257  {
259  throw Exception::InvalidValue(__FILE__,
260  __LINE__,
261  OPENMS_PRETTY_FUNCTION,
262  "auto_mode is on MANUAL! max_intensity is <=0. Needs to be positive! Use setMaxIntensity(<value>) or enable auto_mode!",
263  s);
264  }
265  }
266 
267  if (max_intensity_ < 0)
268  {
269  std::cerr << "TODO SignalToNoiseEstimatorMedian: the max_intensity_ value should be positive! " << max_intensity_ << std::endl;
270  return;
271  }
272 
273  PeakIterator window_pos_center = scan_first_;
274  PeakIterator window_pos_borderleft = scan_first_;
275  PeakIterator window_pos_borderright = scan_first_;
276 
277  double window_half_size = win_len_ / 2;
278  double bin_size = std::max(1.0, max_intensity_ / bin_count_); // at least size of 1 for intensity bins
279  int bin_count_minus_1 = bin_count_ - 1;
280 
281  std::vector<int> histogram(bin_count_, 0);
282  std::vector<double> bin_value(bin_count_, 0);
283  // calculate average intensity that is represented by a bin
284  for (int bin = 0; bin < bin_count_; bin++)
285  {
286  histogram[bin] = 0;
287  bin_value[bin] = (bin + 0.5) * bin_size;
288  }
289  // bin in which a datapoint would fall
290  int to_bin = 0;
291 
292  // index of bin where the median is located
293  int median_bin = 0;
294  // additive number of elements from left to x in histogram
295  int element_inc_count = 0;
296 
297  // tracks elements in current window, which may vary because of unevenly spaced data
298  int elements_in_window = 0;
299  // number of windows
300  int window_count = 0;
301 
302  // number of elements where we find the median
303  int element_in_window_half = 0;
304 
305  double noise; // noise value of a datapoint
306 
308  SignalToNoiseEstimator<Container>::startProgress(0, c.size(), "noise estimation of data");
309 
310  // MAIN LOOP
311  while (window_pos_center != scan_last_)
312  {
313 
314  // erase all elements from histogram that will leave the window on the LEFT side
315  while ((*window_pos_borderleft).getMZ() < (*window_pos_center).getMZ() - window_half_size)
316  {
317  to_bin = std::max(std::min<int>((int)((*window_pos_borderleft).getIntensity() / bin_size), bin_count_minus_1), 0);
318  --histogram[to_bin];
319  --elements_in_window;
320  ++window_pos_borderleft;
321  }
322 
323  // add all elements to histogram that will enter the window on the RIGHT side
324  while ((window_pos_borderright != scan_last_)
325  && ((*window_pos_borderright).getMZ() <= (*window_pos_center).getMZ() + window_half_size))
326  {
327  //std::cerr << (*window_pos_borderright).getIntensity() << " " << bin_size << " " << bin_count_minus_1 << std::endl;
328  to_bin = std::max(std::min<int>((int)((*window_pos_borderright).getIntensity() / bin_size), bin_count_minus_1), 0);
329  ++histogram[to_bin];
330  ++elements_in_window;
331  ++window_pos_borderright;
332  }
333 
334  if (elements_in_window < min_required_elements_)
335  {
336  noise = noise_for_empty_window_;
338  }
339  else
340  {
341  // find bin i where ceil[elements_in_window/2] <= sum_c(0..i){ histogram[c] }
342  median_bin = -1;
343  element_inc_count = 0;
344  element_in_window_half = (elements_in_window + 1) / 2;
345  while (median_bin < bin_count_minus_1 && element_inc_count < element_in_window_half)
346  {
347  ++median_bin;
348  element_inc_count += histogram[median_bin];
349  }
350 
351  // increase the error count
352  if (median_bin == bin_count_minus_1) {++histogram_oob_percent_; }
353 
354  // just avoid division by 0
355  noise = std::max(1.0, bin_value[median_bin]);
356  }
357 
358  // store result
359  stn_estimates_[window_count] = (*window_pos_center).getIntensity() / noise;
360 
361 
362  // advance the window center by one datapoint
363  ++window_pos_center;
364  ++window_count;
365  // update progress
367 
368  } // end while
369 
371 
372  sparse_window_percent_ = sparse_window_percent_ * 100 / window_count;
373  histogram_oob_percent_ = histogram_oob_percent_ * 100 / window_count;
374 
375  // warn if percentage of sparse windows is above 20%
377  {
378  OPENMS_LOG_WARN << "WARNING in SignalToNoiseEstimatorMedian: "
380  << "% of all windows were sparse. You should consider increasing 'win_len' or decreasing 'min_required_elements'"
381  << std::endl;
382  }
383 
384  // warn if percentage of possibly wrong median estimates is above 1%
386  {
387  OPENMS_LOG_WARN << "WARNING in SignalToNoiseEstimatorMedian: "
389  << "% of all Signal-to-Noise estimates are too high, because the median was found in the rightmost histogram-bin. "
390  << "You should consider increasing 'max_intensity' (and maybe 'bin_count' with it, to keep bin width reasonable)"
391  << std::endl;
392  }
393 
394  } // end of shiftWindow_
395 
397  void updateMembers_() override
398  {
399  max_intensity_ = (double)param_.getValue("max_intensity");
400  auto_max_stdev_Factor_ = (double)param_.getValue("auto_max_stdev_factor");
401  auto_max_percentile_ = param_.getValue("auto_max_percentile");
402  auto_mode_ = param_.getValue("auto_mode");
403  win_len_ = (double)param_.getValue("win_len");
404  bin_count_ = param_.getValue("bin_count");
405  min_required_elements_ = param_.getValue("min_required_elements");
406  noise_for_empty_window_ = (double)param_.getValue("noise_for_empty_window");
407  write_log_messages_ = (bool)param_.getValue("write_log_messages").toBool();
408  stn_estimates_.clear();
409  }
410 
420  double win_len_;
428 
429  // whether to write out log messages in the case of failure
431 
432  // counter for sparse windows
434  // counter for histogram overflow
436 
437 
438  };
439 
440 } // namespace OpenMS
441 
#define OPENMS_LOG_WARN
Macro if a warning, a piece of information which should be read by the user, should be logged.
Definition: LogStream.h:460
void defaultsToParam_()
Updates the parameters after the defaults have been set in the constructor.
Param param_
Container for current parameters.
Definition: DefaultParamHandler.h:166
Param defaults_
Container for default parameters. This member should be filled in the constructor of derived classes!
Definition: DefaultParamHandler.h:173
void setName(const String &name)
Mutable access to the name.
Invalid value exception.
Definition: Exception.h:329
bool toBool() const
Conversion to bool.
void setValidStrings(const std::string &key, const std::vector< std::string > &strings)
Sets the valid strings for the parameter key.
void setMaxFloat(const std::string &key, double max)
Sets the maximum value for the floating point or floating point list parameter key.
void setMaxInt(const std::string &key, int max)
Sets the maximum value for the integer or integer list parameter key.
const ParamValue & getValue(const std::string &key) const
Returns a value of a parameter.
void setMinInt(const std::string &key, int min)
Sets the minimum value for the integer or integer list parameter key.
void setValue(const std::string &key, const ParamValue &value, const std::string &description="", const std::vector< std::string > &tags=std::vector< std::string >())
Sets a value.
void setMinFloat(const std::string &key, double min)
Sets the minimum value for the floating point or floating point list parameter key.
float IntensityType
Intensity type.
Definition: Peak2D.h:62
void setProgress(SignedSize value) const
Sets the current progress.
void startProgress(SignedSize begin, SignedSize end, const String &label) const
Initializes the progress display.
void endProgress() const
Ends the progress display.
Estimates the signal/noise (S/N) ratio of each data point in a scan by using the median (histogram ba...
Definition: SignalToNoiseEstimatorMedian.h:83
SignalToNoiseEstimator< Container >::PeakIterator PeakIterator
Definition: SignalToNoiseEstimatorMedian.h:94
SignalToNoiseEstimatorMedian & operator=(const SignalToNoiseEstimatorMedian &source)
Definition: SignalToNoiseEstimatorMedian.h:153
double win_len_
range of data points which belong to a window in Thomson
Definition: SignalToNoiseEstimatorMedian.h:420
double noise_for_empty_window_
Definition: SignalToNoiseEstimatorMedian.h:427
~SignalToNoiseEstimatorMedian() override
Destructor.
Definition: SignalToNoiseEstimatorMedian.h:165
SignalToNoiseEstimatorMedian()
default constructor
Definition: SignalToNoiseEstimatorMedian.h:100
double max_intensity_
maximal intensity considered during binning (values above get discarded)
Definition: SignalToNoiseEstimatorMedian.h:412
double auto_max_percentile_
parameter for initial automatic estimation of "max_intensity_" percentile or a stdev
Definition: SignalToNoiseEstimatorMedian.h:416
double histogram_oob_percent_
Definition: SignalToNoiseEstimatorMedian.h:435
void computeSTN_(const Container &c) override
Definition: SignalToNoiseEstimatorMedian.h:188
bool write_log_messages_
Definition: SignalToNoiseEstimatorMedian.h:430
void updateMembers_() override
overridden function from DefaultParamHandler to keep members up to date, when a parameter is changed
Definition: SignalToNoiseEstimatorMedian.h:397
int min_required_elements_
minimal number of elements a window needs to cover to be used
Definition: SignalToNoiseEstimatorMedian.h:424
SignalToNoiseEstimatorMedian(const SignalToNoiseEstimatorMedian &source)
Copy Constructor.
Definition: SignalToNoiseEstimatorMedian.h:143
double getSparseWindowPercent() const
Returns how many percent of the windows were sparse.
Definition: SignalToNoiseEstimatorMedian.h:169
double sparse_window_percent_
Definition: SignalToNoiseEstimatorMedian.h:433
double getHistogramRightmostPercent() const
Returns the percentage where the median was found in the rightmost bin.
Definition: SignalToNoiseEstimatorMedian.h:175
SignalToNoiseEstimator< Container >::PeakType PeakType
Definition: SignalToNoiseEstimatorMedian.h:95
SignalToNoiseEstimator< Container >::GaussianEstimate GaussianEstimate
Definition: SignalToNoiseEstimatorMedian.h:97
int auto_mode_
determines which method shall be used for estimating "max_intensity_". valid are MANUAL=-1,...
Definition: SignalToNoiseEstimatorMedian.h:418
IntensityThresholdCalculation
method to use for estimating the maximal intensity that is used for histogram calculation
Definition: SignalToNoiseEstimatorMedian.h:88
@ MANUAL
Definition: SignalToNoiseEstimatorMedian.h:88
@ AUTOMAXBYSTDEV
Definition: SignalToNoiseEstimatorMedian.h:88
@ AUTOMAXBYPERCENT
Definition: SignalToNoiseEstimatorMedian.h:88
int bin_count_
number of bins in the histogram
Definition: SignalToNoiseEstimatorMedian.h:422
double auto_max_stdev_Factor_
parameter for initial automatic estimation of "max_intensity_": a stdev multiplier
Definition: SignalToNoiseEstimatorMedian.h:414
This class represents the abstract base class of a signal to noise estimator.
Definition: SignalToNoiseEstimator.h:59
double variance
variance of estimated Gaussian
Definition: SignalToNoiseEstimator.h:134
SignalToNoiseEstimator & operator=(const SignalToNoiseEstimator &source)
Assignment operator.
Definition: SignalToNoiseEstimator.h:86
PeakIterator::value_type PeakType
Definition: SignalToNoiseEstimator.h:66
GaussianEstimate estimate_(const PeakIterator &scan_first_, const PeakIterator &scan_last_) const
calculate mean & stdev of intensities of a spectrum
Definition: SignalToNoiseEstimator.h:139
double mean
mean of estimated Gaussian
Definition: SignalToNoiseEstimator.h:133
std::vector< double > stn_estimates_
stores the noise estimate for each peak
Definition: SignalToNoiseEstimator.h:172
Container::const_iterator PeakIterator
Definition: SignalToNoiseEstimator.h:65
protected struct to store parameters my, sigma for a Gaussian distribution
Definition: SignalToNoiseEstimator.h:132
A more convenient string class.
Definition: String.h:60
const double c
Definition: Constants.h:209
Main OpenMS namespace.
Definition: FeatureDeconvolution.h:47