OpenMS  2.4.0
SignalToNoiseEstimatorMedian.h
Go to the documentation of this file.
1 // --------------------------------------------------------------------------
2 // OpenMS -- Open-Source Mass Spectrometry
3 // --------------------------------------------------------------------------
4 // Copyright The OpenMS Team -- Eberhard Karls University Tuebingen,
5 // ETH Zurich, and Freie Universitaet Berlin 2002-2018.
6 //
7 // This software is released under a three-clause BSD license:
8 // * Redistributions of source code must retain the above copyright
9 // notice, this list of conditions and the following disclaimer.
10 // * Redistributions in binary form must reproduce the above copyright
11 // notice, this list of conditions and the following disclaimer in the
12 // documentation and/or other materials provided with the distribution.
13 // * Neither the name of any author or any participating institution
14 // may be used to endorse or promote products derived from this software
15 // without specific prior written permission.
16 // For a full list of authors, refer to the file AUTHORS.
17 // --------------------------------------------------------------------------
18 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
19 // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21 // ARE DISCLAIMED. IN NO EVENT SHALL ANY OF THE AUTHORS OR THE CONTRIBUTING
22 // INSTITUTIONS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
23 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
24 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
25 // OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
26 // WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
27 // OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
28 // ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 //
30 // --------------------------------------------------------------------------
31 // $Maintainer: Chris Bielow $
32 // $Authors: $
33 // --------------------------------------------------------------------------
34 //
35 
36 #pragma once
37 
38 
43 #include <vector>
44 
45 namespace OpenMS
46 {
79  template <typename Container = MSSpectrum>
81  public SignalToNoiseEstimator<Container>
82  {
83 
84 public:
85 
88 
95 
98 
100 
103  {
104  //set the name for DefaultParamHandler error messages
105  this->setName("SignalToNoiseEstimatorMedian");
106 
107  defaults_.setValue("max_intensity", -1, "maximal intensity considered for histogram construction. By default, it will be calculated automatically (see auto_mode)." \
108  " Only provide this parameter if you know what you are doing (and change 'auto_mode' to '-1')!" \
109  " All intensities EQUAL/ABOVE 'max_intensity' will be added to the LAST histogram bin." \
110  " If you choose 'max_intensity' too small, the noise estimate might be too small as well. " \
111  " If chosen too big, the bins become quite large (which you could counter by increasing 'bin_count', which increases runtime)." \
112  " In general, the Median-S/N estimator is more robust to a manual max_intensity than the MeanIterative-S/N.", ListUtils::create<String>("advanced"));
113  defaults_.setMinInt("max_intensity", -1);
114 
115  defaults_.setValue("auto_max_stdev_factor", 3.0, "parameter for 'max_intensity' estimation (if 'auto_mode' == 0): mean + 'auto_max_stdev_factor' * stdev", ListUtils::create<String>("advanced"));
116  defaults_.setMinFloat("auto_max_stdev_factor", 0.0);
117  defaults_.setMaxFloat("auto_max_stdev_factor", 999.0);
118 
119  defaults_.setValue("auto_max_percentile", 95, "parameter for 'max_intensity' estimation (if 'auto_mode' == 1): auto_max_percentile th percentile", ListUtils::create<String>("advanced"));
120  defaults_.setMinInt("auto_max_percentile", 0);
121  defaults_.setMaxInt("auto_max_percentile", 100);
122 
123  defaults_.setValue("auto_mode", 0, "method to use to determine maximal intensity: -1 --> use 'max_intensity'; 0 --> 'auto_max_stdev_factor' method (default); 1 --> 'auto_max_percentile' method", ListUtils::create<String>("advanced"));
124  defaults_.setMinInt("auto_mode", -1);
125  defaults_.setMaxInt("auto_mode", 1);
126 
127  defaults_.setValue("win_len", 200.0, "window length in Thomson");
128  defaults_.setMinFloat("win_len", 1.0);
129 
130  defaults_.setValue("bin_count", 30, "number of bins for intensity values");
131  defaults_.setMinInt("bin_count", 3);
132 
133  defaults_.setValue("min_required_elements", 10, "minimum number of elements required in a window (otherwise it is considered sparse)");
134  defaults_.setMinInt("min_required_elements", 1);
135 
136  defaults_.setValue("noise_for_empty_window", std::pow(10.0, 20), "noise value used for sparse windows", ListUtils::create<String>("advanced"));
137 
138  defaults_.setValue("write_log_messages", "true", "Write out log messages in case of sparse windows or median in rightmost histogram bin");
139  defaults_.setValidStrings("write_log_messages", ListUtils::create<String>("true,false"));
140 
142  }
143 
146  SignalToNoiseEstimator<Container>(source)
147  {
148  updateMembers_();
149  }
150 
156  {
157  if (&source == this) return *this;
158 
160  updateMembers_();
161  return *this;
162  }
163 
165 
168  {}
169 
171  double getSparseWindowPercent() const
172  {
173  return sparse_window_percent_;
174  }
175 
178  {
179  return histogram_oob_percent_;
180  }
181 
182 protected:
183 
184 
191  void computeSTN_(const PeakIterator & scan_first_, const PeakIterator & scan_last_) override
192  {
193  // reset counter for sparse windows
195  // reset counter for histogram overflow
197 
198  // reset the results
199  stn_estimates_.clear();
200 
201  // maximal range of histogram needs to be calculated first
202  if (auto_mode_ == AUTOMAXBYSTDEV)
203  {
204  // use MEAN+auto_max_intensity_*STDEV as threshold
205  GaussianEstimate gauss_global = SignalToNoiseEstimator<Container>::estimate_(scan_first_, scan_last_);
206  max_intensity_ = gauss_global.mean + std::sqrt(gauss_global.variance) * auto_max_stdev_Factor_;
207  }
208  else if (auto_mode_ == AUTOMAXBYPERCENT)
209  {
210  // get value at "auto_max_percentile_"th percentile
211  // we use a histogram approach here as well.
212  if ((auto_max_percentile_ < 0) || (auto_max_percentile_ > 100))
213  {
215  throw Exception::InvalidValue(__FILE__,
216  __LINE__,
217  OPENMS_PRETTY_FUNCTION,
218  "auto_mode is on AUTOMAXBYPERCENT! auto_max_percentile is not in [0,100]. Use setAutoMaxPercentile(<value>) to change it!",
219  s);
220  }
221 
222  std::vector<int> histogram_auto(100, 0);
223 
224  // find maximum of current scan
225  int size = 0;
226  typename PeakType::IntensityType maxInt = 0;
227  PeakIterator run = scan_first_;
228  while (run != scan_last_)
229  {
230  maxInt = std::max(maxInt, (*run).getIntensity());
231  ++size;
232  ++run;
233  }
234 
235  double bin_size = maxInt / 100;
236 
237  // fill histogram
238  run = scan_first_;
239  while (run != scan_last_)
240  {
241  ++histogram_auto[(int) (((*run).getIntensity() - 1) / bin_size)];
242  ++run;
243  }
244 
245  // add up element counts in histogram until ?th percentile is reached
246  int elements_below_percentile = (int) (auto_max_percentile_ * size / 100);
247  int elements_seen = 0;
248  int i = -1;
249  run = scan_first_;
250 
251  while (run != scan_last_ && elements_seen < elements_below_percentile)
252  {
253  ++i;
254  elements_seen += histogram_auto[i];
255  ++run;
256  }
257 
258  max_intensity_ = (((double)i) + 0.5) * bin_size;
259  }
260  else //if (auto_mode_ == MANUAL)
261  {
262  if (max_intensity_ <= 0)
263  {
265  throw Exception::InvalidValue(__FILE__,
266  __LINE__,
267  OPENMS_PRETTY_FUNCTION,
268  "auto_mode is on MANUAL! max_intensity is <=0. Needs to be positive! Use setMaxIntensity(<value>) or enable auto_mode!",
269  s);
270  }
271  }
272 
273  if (max_intensity_ < 0)
274  {
275  std::cerr << "TODO SignalToNoiseEstimatorMedian: the max_intensity_ value should be positive! " << max_intensity_ << std::endl;
276  return;
277  }
278 
279  PeakIterator window_pos_center = scan_first_;
280  PeakIterator window_pos_borderleft = scan_first_;
281  PeakIterator window_pos_borderright = scan_first_;
282 
283  double window_half_size = win_len_ / 2;
284  double bin_size = std::max(1.0, max_intensity_ / bin_count_); // at least size of 1 for intensity bins
285  int bin_count_minus_1 = bin_count_ - 1;
286 
287  std::vector<int> histogram(bin_count_, 0);
288  std::vector<double> bin_value(bin_count_, 0);
289  // calculate average intensity that is represented by a bin
290  for (int bin = 0; bin < bin_count_; bin++)
291  {
292  histogram[bin] = 0;
293  bin_value[bin] = (bin + 0.5) * bin_size;
294  }
295  // bin in which a datapoint would fall
296  int to_bin = 0;
297 
298  // index of bin where the median is located
299  int median_bin = 0;
300  // additive number of elements from left to x in histogram
301  int element_inc_count = 0;
302 
303  // tracks elements in current window, which may vary because of unevenly spaced data
304  int elements_in_window = 0;
305  // number of windows
306  int window_count = 0;
307 
308  // number of elements where we find the median
309  int element_in_window_half = 0;
310 
311  double noise; // noise value of a datapoint
312 
313  // determine how many elements we need to estimate (for progress estimation)
314  int windows_overall = 0;
315  PeakIterator run = scan_first_;
316  while (run != scan_last_)
317  {
318  ++windows_overall;
319  ++run;
320  }
321  SignalToNoiseEstimator<Container>::startProgress(0, windows_overall, "noise estimation of data");
322 
323  // MAIN LOOP
324  while (window_pos_center != scan_last_)
325  {
326 
327  // erase all elements from histogram that will leave the window on the LEFT side
328  while ((*window_pos_borderleft).getMZ() < (*window_pos_center).getMZ() - window_half_size)
329  {
330  to_bin = std::max(std::min<int>((int)((*window_pos_borderleft).getIntensity() / bin_size), bin_count_minus_1), 0);
331  --histogram[to_bin];
332  --elements_in_window;
333  ++window_pos_borderleft;
334  }
335 
336  // add all elements to histogram that will enter the window on the RIGHT side
337  while ((window_pos_borderright != scan_last_)
338  && ((*window_pos_borderright).getMZ() <= (*window_pos_center).getMZ() + window_half_size))
339  {
340  //std::cerr << (*window_pos_borderright).getIntensity() << " " << bin_size << " " << bin_count_minus_1 << std::endl;
341  to_bin = std::max(std::min<int>((int)((*window_pos_borderright).getIntensity() / bin_size), bin_count_minus_1), 0);
342  ++histogram[to_bin];
343  ++elements_in_window;
344  ++window_pos_borderright;
345  }
346 
347  if (elements_in_window < min_required_elements_)
348  {
349  noise = noise_for_empty_window_;
351  }
352  else
353  {
354  // find bin i where ceil[elements_in_window/2] <= sum_c(0..i){ histogram[c] }
355  median_bin = -1;
356  element_inc_count = 0;
357  element_in_window_half = (elements_in_window + 1) / 2;
358  while (median_bin < bin_count_minus_1 && element_inc_count < element_in_window_half)
359  {
360  ++median_bin;
361  element_inc_count += histogram[median_bin];
362  }
363 
364  // increase the error count
365  if (median_bin == bin_count_minus_1) {++histogram_oob_percent_; }
366 
367  // just avoid division by 0
368  noise = std::max(1.0, bin_value[median_bin]);
369  }
370 
371  // store result
372  stn_estimates_[*window_pos_center] = (*window_pos_center).getIntensity() / noise;
373 
374 
375  // advance the window center by one datapoint
376  ++window_pos_center;
377  ++window_count;
378  // update progress
380 
381  } // end while
382 
384 
385  sparse_window_percent_ = sparse_window_percent_ * 100 / window_count;
386  histogram_oob_percent_ = histogram_oob_percent_ * 100 / window_count;
387 
388  // warn if percentage of sparse windows is above 20%
390  {
391  LOG_WARN << "WARNING in SignalToNoiseEstimatorMedian: "
393  << "% of all windows were sparse. You should consider increasing 'win_len' or decreasing 'min_required_elements'"
394  << std::endl;
395  }
396 
397  // warn if percentage of possibly wrong median estimates is above 1%
399  {
400  LOG_WARN << "WARNING in SignalToNoiseEstimatorMedian: "
402  << "% of all Signal-to-Noise estimates are too high, because the median was found in the rightmost histogram-bin. "
403  << "You should consider increasing 'max_intensity' (and maybe 'bin_count' with it, to keep bin width reasonable)"
404  << std::endl;
405  }
406 
407  } // end of shiftWindow_
408 
410  void updateMembers_() override
411  {
412  max_intensity_ = (double)param_.getValue("max_intensity");
413  auto_max_stdev_Factor_ = (double)param_.getValue("auto_max_stdev_factor");
414  auto_max_percentile_ = param_.getValue("auto_max_percentile");
415  auto_mode_ = param_.getValue("auto_mode");
416  win_len_ = (double)param_.getValue("win_len");
417  bin_count_ = param_.getValue("bin_count");
418  min_required_elements_ = param_.getValue("min_required_elements");
419  noise_for_empty_window_ = (double)param_.getValue("noise_for_empty_window");
420  write_log_messages_ = (bool)param_.getValue("write_log_messages").toBool();
421  is_result_valid_ = false;
422  }
423 
433  double win_len_;
441 
442  // whether to write out log messages in the case of failure
444 
445  // counter for sparse windows
447  // counter for histogram overflow
449 
450 
451  };
452 
453 } // namespace OpenMS
454 
Param defaults_
Container for default parameters. This member should be filled in the constructor of derived classes!...
Definition: DefaultParamHandler.h:156
void setValue(const String &key, const DataValue &value, const String &description="", const StringList &tags=StringList())
Sets a value.
SignalToNoiseEstimator< Container >::PeakType PeakType
Definition: SignalToNoiseEstimatorMedian.h:97
A more convenient string class.
Definition: String.h:57
double auto_max_percentile_
parameter for initial automatic estimation of "max_intensity_" percentile or a stdev ...
Definition: SignalToNoiseEstimatorMedian.h:429
Definition: SignalToNoiseEstimatorMedian.h:87
int auto_mode_
determines which method shall be used for estimating "max_intensity_". valid are MANUAL=-1, AUTOMAXBYSTDEV=0 or AUTOMAXBYPERCENT=1
Definition: SignalToNoiseEstimatorMedian.h:431
Param param_
Container for current parameters.
Definition: DefaultParamHandler.h:149
void endProgress() const
Ends the progress display.
int bin_count_
number of bins in the histogram
Definition: SignalToNoiseEstimatorMedian.h:435
SignalToNoiseEstimator & operator=(const SignalToNoiseEstimator &source)
Assignment operator.
Definition: SignalToNoiseEstimator.h:91
double win_len_
range of data points which belong to a window in Thomson
Definition: SignalToNoiseEstimatorMedian.h:433
double getSparseWindowPercent() const
Returns how many percent of the windows were sparse.
Definition: SignalToNoiseEstimatorMedian.h:171
Container::const_iterator PeakIterator
Definition: SignalToNoiseEstimator.h:64
double max_intensity_
maximal intensity considered during binning (values above get discarded)
Definition: SignalToNoiseEstimatorMedian.h:425
Main OpenMS namespace.
Definition: FeatureDeconvolution.h:46
SignalToNoiseEstimator< Container >::GaussianEstimate GaussianEstimate
Definition: SignalToNoiseEstimatorMedian.h:99
bool toBool() const
Conversion to bool.
SignalToNoiseEstimatorMedian(const SignalToNoiseEstimatorMedian &source)
Copy Constructor.
Definition: SignalToNoiseEstimatorMedian.h:145
#define LOG_WARN
Macro if a warning, a piece of information which should be read by the user, should be logged...
Definition: LogStream.h:450
bool is_result_valid_
flag: set to true if SignalToNoise estimates are calculated and none of the params were changed...
Definition: SignalToNoiseEstimator.h:214
void setMaxInt(const String &key, Int max)
Sets the maximum value for the integer or integer list parameter key.
GaussianEstimate estimate_(const PeakIterator &scan_first_, const PeakIterator &scan_last_) const
calculate mean & stdev of intensities of a spectrum
Definition: SignalToNoiseEstimator.h:174
protected struct to store parameters my, sigma for a Gaussian distribution
Definition: SignalToNoiseEstimator.h:166
void computeSTN_(const PeakIterator &scan_first_, const PeakIterator &scan_last_) override
Definition: SignalToNoiseEstimatorMedian.h:191
bool write_log_messages_
Definition: SignalToNoiseEstimatorMedian.h:443
const DataValue & getValue(const String &key) const
Returns a value of a parameter.
void setProgress(SignedSize value) const
Sets the current progress.
double auto_max_stdev_Factor_
parameter for initial automatic estimation of "max_intensity_": a stdev multiplier ...
Definition: SignalToNoiseEstimatorMedian.h:427
void setMaxFloat(const String &key, double max)
Sets the maximum value for the floating point or floating point list parameter key.
IntensityThresholdCalculation
method to use for estimating the maximal intensity that is used for histogram calculation ...
Definition: SignalToNoiseEstimatorMedian.h:87
PeakIterator::value_type PeakType
Definition: SignalToNoiseEstimator.h:65
void updateMembers_() override
overridden function from DefaultParamHandler to keep members up to date, when a parameter is changed ...
Definition: SignalToNoiseEstimatorMedian.h:410
~SignalToNoiseEstimatorMedian() override
Destructor.
Definition: SignalToNoiseEstimatorMedian.h:167
double getHistogramRightmostPercent() const
Returns the percentage where the median was found in the rightmost bin.
Definition: SignalToNoiseEstimatorMedian.h:177
double noise_for_empty_window_
Definition: SignalToNoiseEstimatorMedian.h:440
Definition: SignalToNoiseEstimatorMedian.h:87
void setValidStrings(const String &key, const std::vector< String > &strings)
Sets the valid strings for the parameter key.
Invalid value exception.
Definition: Exception.h:335
SignalToNoiseEstimatorMedian & operator=(const SignalToNoiseEstimatorMedian &source)
Definition: SignalToNoiseEstimatorMedian.h:155
Estimates the signal/noise (S/N) ratio of each data point in a scan by using the median (histogram ba...
Definition: SignalToNoiseEstimatorMedian.h:80
SignalToNoiseEstimatorMedian()
default constructor
Definition: SignalToNoiseEstimatorMedian.h:102
int min_required_elements_
minimal number of elements a window needs to cover to be used
Definition: SignalToNoiseEstimatorMedian.h:437
void setMinInt(const String &key, Int min)
Sets the minimum value for the integer or integer list parameter key.
void startProgress(SignedSize begin, SignedSize end, const String &label) const
Initializes the progress display.
This class represents the abstract base class of a signal to noise estimator.
Definition: SignalToNoiseEstimator.h:56
std::map< PeakType, double, typename PeakType::PositionLess > stn_estimates_
stores the noise estimate for each peak
Definition: SignalToNoiseEstimator.h:207
void setName(const String &name)
Mutable access to the name.
double histogram_oob_percent_
Definition: SignalToNoiseEstimatorMedian.h:448
Definition: SignalToNoiseEstimatorMedian.h:87
double sparse_window_percent_
Definition: SignalToNoiseEstimatorMedian.h:446
SignalToNoiseEstimator< Container >::PeakIterator PeakIterator
Definition: SignalToNoiseEstimatorMedian.h:96
void setMinFloat(const String &key, double min)
Sets the minimum value for the floating point or floating point list parameter key.
void defaultsToParam_()
Updates the parameters after the defaults have been set in the constructor.