Home  · Classes  · Annotated Classes  · Modules  · Members  · Namespaces  · Related Pages
StatisticFunctions.h
Go to the documentation of this file.
1 // --------------------------------------------------------------------------
2 // OpenMS -- Open-Source Mass Spectrometry
3 // --------------------------------------------------------------------------
4 // Copyright The OpenMS Team -- Eberhard Karls University Tuebingen,
5 // ETH Zurich, and Freie Universitaet Berlin 2002-2017.
6 //
7 // This software is released under a three-clause BSD license:
8 // * Redistributions of source code must retain the above copyright
9 // notice, this list of conditions and the following disclaimer.
10 // * Redistributions in binary form must reproduce the above copyright
11 // notice, this list of conditions and the following disclaimer in the
12 // documentation and/or other materials provided with the distribution.
13 // * Neither the name of any author or any participating institution
14 // may be used to endorse or promote products derived from this software
15 // without specific prior written permission.
16 // For a full list of authors, refer to the file AUTHORS.
17 // --------------------------------------------------------------------------
18 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
19 // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21 // ARE DISCLAIMED. IN NO EVENT SHALL ANY OF THE AUTHORS OR THE CONTRIBUTING
22 // INSTITUTIONS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
23 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
24 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
25 // OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
26 // WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
27 // OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
28 // ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 //
30 // --------------------------------------------------------------------------
31 // $Maintainer: Timo Sachsenberg $
32 // $Authors: Clemens Groepl, Johannes Junker, Mathias Walzer, Chris Bielow $
33 // --------------------------------------------------------------------------
34 #ifndef OPENMS_MATH_STATISTICS_STATISTICFUNCTIONS_H
35 #define OPENMS_MATH_STATISTICS_STATISTICFUNCTIONS_H
36 
37 #include <vector>
39 #include <OpenMS/CONCEPT/Types.h>
40 
41 // array_wrapper needs to be included before it is used
42 // only in boost1.64+. See issue #2790
43 #if OPENMS_BOOST_VERSION_MINOR >= 64
44 #include <boost/serialization/array_wrapper.hpp>
45 #endif
46 #include <boost/accumulators/accumulators.hpp>
47 #include <boost/accumulators/statistics/covariance.hpp>
48 #include <boost/accumulators/statistics/mean.hpp>
49 #include <boost/accumulators/statistics/stats.hpp>
50 #include <boost/accumulators/statistics/variance.hpp>
51 #include <boost/accumulators/statistics/variates/covariate.hpp>
52 #include <boost/function/function_base.hpp>
53 #include <boost/lambda/casts.hpp>
54 #include <boost/lambda/lambda.hpp>
55 
56 #include <iterator>
57 #include <algorithm>
58 
59 using std::iterator_traits;
60 
61 namespace OpenMS
62 {
63 
64  namespace Math
65  {
73  template <typename IteratorType>
74  static void checkIteratorsNotNULL(IteratorType begin, IteratorType end)
75  {
76  if (begin == end)
77  {
78  throw Exception::InvalidRange(__FILE__, __LINE__, OPENMS_PRETTY_FUNCTION);
79  }
80  }
81 
89  template <typename IteratorType>
90  static void checkIteratorsEqual(IteratorType begin, IteratorType end)
91  {
92  if (begin != end)
93  {
94  throw Exception::InvalidRange(__FILE__, __LINE__, OPENMS_PRETTY_FUNCTION);
95  }
96  }
97 
105  template <typename IteratorType1, typename IteratorType2>
107  IteratorType1 begin_b, IteratorType1 end_b,
108  IteratorType2 begin_a, IteratorType2 end_a)
109  {
110  if (begin_b != end_b && begin_a == end_a)
111  {
112  throw Exception::InvalidRange(__FILE__, __LINE__, OPENMS_PRETTY_FUNCTION);
113  }
114  }
115 
121  template <typename IteratorType>
122  static double sum(IteratorType begin, IteratorType end)
123  {
124  return std::accumulate(begin, end, 0.0);
125  }
126 
134  template <typename IteratorType>
135  static double mean(IteratorType begin, IteratorType end)
136  {
137  checkIteratorsNotNULL(begin, end);
138  return sum(begin, end) / std::distance(begin, end);
139  }
140 
152  template <typename IteratorType>
153  static double median(IteratorType begin, IteratorType end,
154  bool sorted = false)
155  {
156  checkIteratorsNotNULL(begin, end);
157  if (!sorted)
158  {
159  std::sort(begin, end);
160  }
161 
162  Size size = std::distance(begin, end);
163  if (size % 2 == 0) // even size => average two middle values
164  {
165  IteratorType it1 = begin;
166  std::advance(it1, size / 2 - 1);
167  IteratorType it2 = it1;
168  std::advance(it2, 1);
169  return (*it1 + *it2) / 2.0;
170  }
171  else
172  {
173  IteratorType it = begin;
174  std::advance(it, (size - 1) / 2);
175  return *it;
176  }
177  }
178 
179 
199  template <typename IteratorType>
200  double MAD(IteratorType begin, IteratorType end, double median_of_numbers)
201  {
202  std::vector<double> diffs;
203  diffs.reserve(std::distance(begin, end));
204  for (IteratorType it = begin; it != end; ++it)
205  {
206  diffs.push_back(fabs(*it - median_of_numbers));
207  }
208  return median(diffs.begin(), diffs.end(), false);
209  }
210 
224  template <typename IteratorType>
225  static double quantile1st(IteratorType begin, IteratorType end,
226  bool sorted = false)
227  {
228  checkIteratorsNotNULL(begin, end);
229 
230  if (!sorted)
231  {
232  std::sort(begin, end);
233  }
234 
235  Size size = std::distance(begin, end);
236  if (size % 2 == 0)
237  {
238  return median(begin, begin + (size/2)-1, true); //-1 to exclude median values
239  }
240  return median(begin, begin + (size/2), true);
241  }
242 
256  template <typename IteratorType>
257  static double quantile3rd(
258  IteratorType begin, IteratorType end, bool sorted = false)
259  {
260  checkIteratorsNotNULL(begin, end);
261  if (!sorted)
262  {
263  std::sort(begin, end);
264  }
265 
266  Size size = std::distance(begin, end);
267  return median(begin + (size/2)+1, end, true); //+1 to exclude median values
268  }
269 
279  template <typename IteratorType>
280  static double variance(IteratorType begin, IteratorType end,
281  double mean = std::numeric_limits<double>::max())
282  {
283  checkIteratorsNotNULL(begin, end);
284  double sum = 0.0;
285  if (mean == std::numeric_limits<double>::max())
286  {
287  mean = Math::mean(begin, end);
288  }
289  for (IteratorType iter=begin; iter!=end; ++iter)
290  {
291  double diff = *iter - mean;
292  sum += diff * diff;
293  }
294  return sum / (std::distance(begin, end)-1);
295  }
296 
306  template <typename IteratorType>
307  static double sd(IteratorType begin, IteratorType end,
308  double mean = std::numeric_limits<double>::max())
309  {
310  checkIteratorsNotNULL(begin, end);
311  return std::sqrt( variance(begin, end, mean) );
312  }
313 
321  template <typename IteratorType>
322  static double absdev(IteratorType begin, IteratorType end,
323  double mean = std::numeric_limits<double>::max())
324  {
325  checkIteratorsNotNULL(begin, end);
326  double sum = 0.0;
327  if (mean == std::numeric_limits<double>::max())
328  {
329  mean = Math::mean(begin, end);
330  }
331  for (IteratorType iter=begin; iter!=end; ++iter)
332  {
333  sum += *iter - mean;
334  }
335  return sum / std::distance(begin, end);
336  }
337 
347  template <typename IteratorType1, typename IteratorType2>
348  static double covariance(IteratorType1 begin_a, IteratorType1 end_a,
349  IteratorType2 begin_b, IteratorType2 end_b)
350  {
351  //no data or different lengths
352  checkIteratorsNotNULL(begin_a, end_a);
353 
354  double sum = 0.0;
355  double mean_a = Math::mean(begin_a, end_a);
356  double mean_b = Math::mean(begin_b, end_b);
357  IteratorType1 iter_a = begin_a;
358  IteratorType2 iter_b = begin_b;
359  for (; iter_a != end_a; ++iter_a, ++iter_b)
360  {
361  /* assure both ranges have the same number of elements */
362  checkIteratorsAreValid(begin_b, end_b, begin_a, end_a);
363  sum += (*iter_a - mean_a) * (*iter_b - mean_b);
364  }
365  /* assure both ranges have the same number of elements */
366  checkIteratorsEqual(iter_b, end_b);
367  Size n = std::distance(begin_a, end_a);
368  return sum / (n-1);
369  }
370 
371 
372 
373 
383  template <typename IteratorType1, typename IteratorType2>
384  static double meanSquareError(IteratorType1 begin_a, IteratorType1 end_a,
385  IteratorType2 begin_b, IteratorType2 end_b)
386  {
387  //no data or different lengths
388  checkIteratorsNotNULL(begin_a, end_a);
389 
390  SignedSize dist = std::distance(begin_a, end_a);
391  double error = 0;
392  IteratorType1 iter_a = begin_a;
393  IteratorType2 iter_b = begin_b;
394  for (; iter_a != end_a; ++iter_a, ++iter_b)
395  {
396  /* assure both ranges have the same number of elements */
397  checkIteratorsAreValid(iter_b, end_b, iter_a, end_a);
398 
399  double tmp(*iter_a - *iter_b);
400  error += tmp * tmp;
401  }
402  /* assure both ranges have the same number of elements */
403  checkIteratorsEqual(iter_b, end_b);
404 
405  return error / dist;
406  }
407 
417  template <typename IteratorType1, typename IteratorType2>
418  static double classificationRate(IteratorType1 begin_a, IteratorType1 end_a,
419  IteratorType2 begin_b, IteratorType2 end_b)
420  {
421  //no data or different lengths
422  checkIteratorsNotNULL(begin_a, end_a);
423 
424  SignedSize dist = std::distance(begin_a, end_a);
425  SignedSize correct = dist;
426  IteratorType1 iter_a = begin_a;
427  IteratorType2 iter_b = begin_b;
428  for (; iter_a != end_a; ++iter_a, ++iter_b)
429  {
430  /* assure both ranges have the same number of elements */
431  checkIteratorsAreValid(iter_b, end_b, iter_a, end_a);
432  if ((*iter_a < 0 && *iter_b >= 0) || (*iter_a >= 0 && *iter_b < 0))
433  {
434  --correct;
435  }
436 
437  }
438  /* assure both ranges have the same number of elements */
439  checkIteratorsEqual(iter_b, end_b);
440 
441  return double(correct) / dist;
442  }
443 
456  template <typename IteratorType1, typename IteratorType2>
458  IteratorType1 begin_a, IteratorType1 end_a,
459  IteratorType2 begin_b, IteratorType2 end_b)
460  {
461  //no data or different lengths
462  checkIteratorsNotNULL(begin_a, end_b);
463 
464  double tp = 0;
465  double fp = 0;
466  double tn = 0;
467  double fn = 0;
468  IteratorType1 iter_a = begin_a;
469  IteratorType2 iter_b = begin_b;
470  for (; iter_a != end_a; ++iter_a, ++iter_b)
471  {
472  /* assure both ranges have the same number of elements */
473  checkIteratorsAreValid(iter_b, end_b, iter_a, end_a);
474 
475  if (*iter_a < 0 && *iter_b >= 0)
476  {
477  ++fn;
478  }
479  else if (*iter_a < 0 && *iter_b < 0)
480  {
481  ++tn;
482  }
483  else if (*iter_a >= 0 && *iter_b >= 0)
484  {
485  ++tp;
486  }
487  else if (*iter_a >= 0 && *iter_b < 0)
488  {
489  ++fp;
490  }
491  }
492  /* assure both ranges have the same number of elements */
493  checkIteratorsEqual(iter_b, end_b);
494 
495  return (tp * tn - fp * fn) / sqrt((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn));
496  }
497 
509  template <typename IteratorType1, typename IteratorType2>
511  IteratorType1 begin_a, IteratorType1 end_a,
512  IteratorType2 begin_b, IteratorType2 end_b)
513  {
514  //no data or different lengths
515  checkIteratorsNotNULL(begin_a, end_a);
516 
517  //calculate average
518  SignedSize dist = std::distance(begin_a, end_a);
519  double avg_a = std::accumulate(begin_a, end_a, 0.0) / dist;
520  double avg_b = std::accumulate(begin_b, end_b, 0.0) / dist;
521 
522  double numerator = 0;
523  double denominator_a = 0;
524  double denominator_b = 0;
525  IteratorType1 iter_a = begin_a;
526  IteratorType2 iter_b = begin_b;
527  for (; iter_a != end_a; ++iter_a, ++iter_b)
528  {
529  /* assure both ranges have the same number of elements */
530  checkIteratorsAreValid(iter_b, end_b, iter_a, end_a);
531  double temp_a = *iter_a - avg_a;
532  double temp_b = *iter_b - avg_b;
533  numerator += (temp_a * temp_b);
534  denominator_a += (temp_a * temp_a);
535  denominator_b += (temp_b * temp_b);
536  }
537  /* assure both ranges have the same number of elements */
538  checkIteratorsEqual(iter_b, end_b);
539  return numerator / sqrt(denominator_a * denominator_b);
540  }
541 
543  template <typename Value>
544  static void computeRank(std::vector<Value> & w)
545  {
546  Size i = 0; // main index
547  Size z = 0; // "secondary" index
548  Value rank = 0;
549  Size n = (w.size() - 1);
550  //store original indices for later
551  std::vector<std::pair<Size, Value> > w_idx;
552  for (Size j = 0; j < w.size(); ++j)
553  {
554  w_idx.push_back(std::make_pair(j, w[j]));
555  }
556  //sort
557  std::sort(w_idx.begin(), w_idx.end(),
558  boost::lambda::ret<bool>((&boost::lambda::_1->*& std::pair<Size, Value>::second) <
559  (&boost::lambda::_2->*& std::pair<Size, Value>::second)));
560  //replace pairs <orig_index, value> in w_idx by pairs <orig_index, rank>
561  while (i < n)
562  {
563  // test for equality with tolerance:
564  if (fabs(w_idx[i + 1].second - w_idx[i].second) > 0.0000001 * fabs(w_idx[i + 1].second)) // no tie
565  {
566  w_idx[i].second = Value(i + 1);
567  ++i;
568  }
569  else // tie, replace by mean rank
570  {
571  // count number of ties
572  for (z = i + 1; (z <= n) && fabs(w_idx[z].second - w_idx[i].second) <= 0.0000001 * fabs(w_idx[z].second); ++z)
573  {
574  }
575  // compute mean rank of tie
576  rank = 0.5 * (i + z + 1);
577  // replace intensities by rank
578  for (Size v = i; v <= z - 1; ++v)
579  {
580  w_idx[v].second = rank;
581  }
582  i = z;
583  }
584  }
585  if (i == n)
586  w_idx[n].second = Value(n + 1);
587  //restore original order and replace elements of w with their ranks
588  for (Size j = 0; j < w.size(); ++j)
589  {
590  w[w_idx[j].first] = w_idx[j].second;
591  }
592  }
593 
605  template <typename IteratorType1, typename IteratorType2>
607  IteratorType1 begin_a, IteratorType1 end_a,
608  IteratorType2 begin_b, IteratorType2 end_b)
609  {
610  //no data or different lengths
611  checkIteratorsNotNULL(begin_a, end_a);
612 
613  // store and sort intensities of model and data
614  SignedSize dist = std::distance(begin_a, end_a);
615  std::vector<double> ranks_data;
616  ranks_data.reserve(dist);
617  std::vector<double> ranks_model;
618  ranks_model.reserve(dist);
619  IteratorType1 iter_a = begin_a;
620  IteratorType2 iter_b = begin_b;
621  for (; iter_a != end_a; ++iter_a, ++iter_b)
622  {
623  /* assure both ranges have the same number of elements */
624  checkIteratorsAreValid(iter_b, end_b, iter_a, end_a);
625 
626  ranks_model.push_back(*iter_a);
627  ranks_data.push_back(*iter_b);
628  }
629  /* assure both ranges have the same number of elements */
630  checkIteratorsEqual(iter_b, end_b);
631 
632  // replace entries by their ranks
633  computeRank(ranks_data);
634  computeRank(ranks_model);
635 
636  double mu = double(ranks_data.size() + 1) / 2.; // mean of ranks
637  // Was the following, but I think the above is more correct ... (Clemens)
638  // double mu = (ranks_data.size() + 1) / 2;
639 
640  double sum_model_data = 0;
641  double sqsum_data = 0;
642  double sqsum_model = 0;
643 
644  for (Int i = 0; i < dist; ++i)
645  {
646  sum_model_data += (ranks_data[i] - mu) * (ranks_model[i] - mu);
647  sqsum_data += (ranks_data[i] - mu) * (ranks_data[i] - mu);
648  sqsum_model += (ranks_model[i] - mu) * (ranks_model[i] - mu);
649  }
650 
651  // check for division by zero
652  if (!sqsum_data || !sqsum_model)
653  {
654  return 0;
655  }
656 
657  return sum_model_data / (sqrt(sqsum_data) * sqrt(sqsum_model));
658  }
659 
661  template<typename T>
663  {
665  :mean(0), variance(0), min(0), lowerq(0), median(0), upperq(0), max(0)
666  {
667  }
668 
669  // Ctor with data
671  {
672  count = data.size();
673  // Sanity check: avoid core dump if no data points present.
674  if (data.empty())
675  {
676  mean = variance = min = lowerq = median = upperq = max = 0.0;
677  }
678  else
679  {
680  sort(data.begin(), data.end());
681  mean = Math::mean(data.begin(), data.end());
682  variance = Math::variance(data.begin(), data.end(), mean);
683  min = data.front();
684  lowerq = Math::quantile1st(data.begin(), data.end(), true);
685  median = Math::median(data.begin(), data.end(), true);
686  upperq = Math::quantile3rd(data.begin(), data.end(), true);
687  max = data.back();
688  }
689  }
690 
692  typename T::value_type min, max;
693  size_t count;
694  };
695 
696  } // namespace Math
697 } // namespace OpenMS
698 
699 #endif // OPENMS_MATH_STATISTICS_STATISTICFUNCTIONS_H
SummaryStatistics()
Definition: StatisticFunctions.h:664
double variance
Definition: StatisticFunctions.h:691
static double meanSquareError(IteratorType1 begin_a, IteratorType1 end_a, IteratorType2 begin_b, IteratorType2 end_b)
Calculates the mean square error for the values in [begin_a, end_a) and [begin_b, end_b) ...
Definition: StatisticFunctions.h:384
T::value_type max
Definition: StatisticFunctions.h:692
double MAD(IteratorType begin, IteratorType end, double median_of_numbers)
median absolute deviation (MAD)
Definition: StatisticFunctions.h:200
double lowerq
Definition: StatisticFunctions.h:691
static double variance(IteratorType begin, IteratorType end, double mean=std::numeric_limits< double >::max())
Calculates the variance of a range of values.
Definition: StatisticFunctions.h:280
static double sum(IteratorType begin, IteratorType end)
Calculates the sum of a range of values.
Definition: StatisticFunctions.h:122
static void checkIteratorsAreValid(IteratorType1 begin_b, IteratorType1 end_b, IteratorType2 begin_a, IteratorType2 end_a)
Helper function checking if an iterator and a co-iterator both have a next element.
Definition: StatisticFunctions.h:106
static double quantile1st(IteratorType begin, IteratorType end, bool sorted=false)
Calculates the first quantile of a range of values.
Definition: StatisticFunctions.h:225
static void computeRank(std::vector< Value > &w)
Replaces the elements in vector w by their ranks.
Definition: StatisticFunctions.h:544
static double covariance(IteratorType1 begin_a, IteratorType1 end_a, IteratorType2 begin_b, IteratorType2 end_b)
Calculates the covariance of two ranges of values.
Definition: StatisticFunctions.h:348
ptrdiff_t SignedSize
Signed Size type e.g. used as pointer difference.
Definition: Types.h:135
static void checkIteratorsEqual(IteratorType begin, IteratorType end)
Helper function checking if two iterators are equal.
Definition: StatisticFunctions.h:90
Main OpenMS namespace.
Definition: FeatureDeconvolution.h:47
static double mean(IteratorType begin, IteratorType end)
Calculates the mean of a range of values.
Definition: StatisticFunctions.h:135
static double sd(IteratorType begin, IteratorType end, double mean=std::numeric_limits< double >::max())
Calculates the standard deviation of a range of values.
Definition: StatisticFunctions.h:307
static double matthewsCorrelationCoefficient(IteratorType1 begin_a, IteratorType1 end_a, IteratorType2 begin_b, IteratorType2 end_b)
Calculates the Matthews correlation coefficient for the values in [begin_a, end_a) and [begin_b...
Definition: StatisticFunctions.h:457
static double absdev(IteratorType begin, IteratorType end, double mean=std::numeric_limits< double >::max())
Calculates the absolute deviation of a range of values.
Definition: StatisticFunctions.h:322
static double pearsonCorrelationCoefficient(IteratorType1 begin_a, IteratorType1 end_a, IteratorType2 begin_b, IteratorType2 end_b)
Calculates the Pearson correlation coefficient for the values in [begin_a, end_a) and [begin_b...
Definition: StatisticFunctions.h:510
Helper class to gather (and dump) some statistics from a e.g. vector<double>.
Definition: StatisticFunctions.h:662
static void checkIteratorsNotNULL(IteratorType begin, IteratorType end)
Helper function checking if two iterators are not equal.
Definition: StatisticFunctions.h:74
double upperq
Definition: StatisticFunctions.h:691
static double quantile3rd(IteratorType begin, IteratorType end, bool sorted=false)
Calculates the third quantile of a range of values.
Definition: StatisticFunctions.h:257
double median
Definition: StatisticFunctions.h:691
static double median(IteratorType begin, IteratorType end, bool sorted=false)
Calculates the median of a range of values.
Definition: StatisticFunctions.h:153
SummaryStatistics(T &data)
Definition: StatisticFunctions.h:670
Invalid range exception.
Definition: Exception.h:286
static double rankCorrelationCoefficient(IteratorType1 begin_a, IteratorType1 end_a, IteratorType2 begin_b, IteratorType2 end_b)
calculates the rank correlation coefficient for the values in [begin_a, end_a) and [begin_b...
Definition: StatisticFunctions.h:606
size_t count
Definition: StatisticFunctions.h:693
size_t Size
Size type e.g. used as variable which can hold result of size()
Definition: Types.h:128
T::value_type min
Definition: StatisticFunctions.h:692
static double classificationRate(IteratorType1 begin_a, IteratorType1 end_a, IteratorType2 begin_b, IteratorType2 end_b)
Calculates the classification rate for the values in [begin_a, end_a) and [begin_b, end_b)
Definition: StatisticFunctions.h:418
double mean
Definition: StatisticFunctions.h:691
int Int
Signed integer type.
Definition: Types.h:103

OpenMS / TOPP release 2.3.0 Documentation generated on Tue Jan 9 2018 18:22:04 using doxygen 1.8.13