OpenMS  2.4.0
StatisticFunctions.h
Go to the documentation of this file.
1 // --------------------------------------------------------------------------
2 // OpenMS -- Open-Source Mass Spectrometry
3 // --------------------------------------------------------------------------
4 // Copyright The OpenMS Team -- Eberhard Karls University Tuebingen,
5 // ETH Zurich, and Freie Universitaet Berlin 2002-2018.
6 //
7 // This software is released under a three-clause BSD license:
8 // * Redistributions of source code must retain the above copyright
9 // notice, this list of conditions and the following disclaimer.
10 // * Redistributions in binary form must reproduce the above copyright
11 // notice, this list of conditions and the following disclaimer in the
12 // documentation and/or other materials provided with the distribution.
13 // * Neither the name of any author or any participating institution
14 // may be used to endorse or promote products derived from this software
15 // without specific prior written permission.
16 // For a full list of authors, refer to the file AUTHORS.
17 // --------------------------------------------------------------------------
18 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
19 // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21 // ARE DISCLAIMED. IN NO EVENT SHALL ANY OF THE AUTHORS OR THE CONTRIBUTING
22 // INSTITUTIONS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
23 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
24 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
25 // OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
26 // WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
27 // OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
28 // ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 //
30 // --------------------------------------------------------------------------
31 // $Maintainer: Timo Sachsenberg $
32 // $Authors: Clemens Groepl, Johannes Junker, Mathias Walzer, Chris Bielow $
33 // --------------------------------------------------------------------------
34 #pragma once
35 
36 #include <vector>
38 #include <OpenMS/CONCEPT/Types.h>
39 
40 // array_wrapper needs to be included before it is used
41 // only in boost1.64+. See issue #2790
42 #if OPENMS_BOOST_VERSION_MINOR >= 64
43 #include <boost/serialization/array_wrapper.hpp>
44 #endif
45 #include <boost/accumulators/accumulators.hpp>
46 #include <boost/accumulators/statistics/covariance.hpp>
47 #include <boost/accumulators/statistics/mean.hpp>
48 #include <boost/accumulators/statistics/stats.hpp>
49 #include <boost/accumulators/statistics/variance.hpp>
50 #include <boost/accumulators/statistics/variates/covariate.hpp>
51 #include <boost/function/function_base.hpp>
52 #include <boost/lambda/casts.hpp>
53 #include <boost/lambda/lambda.hpp>
54 
55 #include <iterator>
56 #include <algorithm>
57 
58 using std::iterator_traits;
59 
60 namespace OpenMS
61 {
62 
63  namespace Math
64  {
72  template <typename IteratorType>
73  static void checkIteratorsNotNULL(IteratorType begin, IteratorType end)
74  {
75  if (begin == end)
76  {
77  throw Exception::InvalidRange(__FILE__, __LINE__, OPENMS_PRETTY_FUNCTION);
78  }
79  }
80 
88  template <typename IteratorType>
89  static void checkIteratorsEqual(IteratorType begin, IteratorType end)
90  {
91  if (begin != end)
92  {
93  throw Exception::InvalidRange(__FILE__, __LINE__, OPENMS_PRETTY_FUNCTION);
94  }
95  }
96 
104  template <typename IteratorType1, typename IteratorType2>
106  IteratorType1 begin_b, IteratorType1 end_b,
107  IteratorType2 begin_a, IteratorType2 end_a)
108  {
109  if (begin_b != end_b && begin_a == end_a)
110  {
111  throw Exception::InvalidRange(__FILE__, __LINE__, OPENMS_PRETTY_FUNCTION);
112  }
113  }
114 
120  template <typename IteratorType>
121  static double sum(IteratorType begin, IteratorType end)
122  {
123  return std::accumulate(begin, end, 0.0);
124  }
125 
133  template <typename IteratorType>
134  static double mean(IteratorType begin, IteratorType end)
135  {
136  checkIteratorsNotNULL(begin, end);
137  return sum(begin, end) / std::distance(begin, end);
138  }
139 
151  template <typename IteratorType>
152  static double median(IteratorType begin, IteratorType end,
153  bool sorted = false)
154  {
155  checkIteratorsNotNULL(begin, end);
156  if (!sorted)
157  {
158  std::sort(begin, end);
159  }
160 
161  Size size = std::distance(begin, end);
162  if (size % 2 == 0) // even size => average two middle values
163  {
164  IteratorType it1 = begin;
165  std::advance(it1, size / 2 - 1);
166  IteratorType it2 = it1;
167  std::advance(it2, 1);
168  return (*it1 + *it2) / 2.0;
169  }
170  else
171  {
172  IteratorType it = begin;
173  std::advance(it, (size - 1) / 2);
174  return *it;
175  }
176  }
177 
178 
198  template <typename IteratorType>
199  double MAD(IteratorType begin, IteratorType end, double median_of_numbers)
200  {
201  std::vector<double> diffs;
202  diffs.reserve(std::distance(begin, end));
203  for (IteratorType it = begin; it != end; ++it)
204  {
205  diffs.push_back(fabs(*it - median_of_numbers));
206  }
207  return median(diffs.begin(), diffs.end(), false);
208  }
209 
223  template <typename IteratorType>
224  static double quantile1st(IteratorType begin, IteratorType end,
225  bool sorted = false)
226  {
227  checkIteratorsNotNULL(begin, end);
228 
229  if (!sorted)
230  {
231  std::sort(begin, end);
232  }
233 
234  Size size = std::distance(begin, end);
235  if (size % 2 == 0)
236  {
237  return median(begin, begin + (size/2)-1, true); //-1 to exclude median values
238  }
239  return median(begin, begin + (size/2), true);
240  }
241 
255  template <typename IteratorType>
256  static double quantile3rd(
257  IteratorType begin, IteratorType end, bool sorted = false)
258  {
259  checkIteratorsNotNULL(begin, end);
260  if (!sorted)
261  {
262  std::sort(begin, end);
263  }
264 
265  Size size = std::distance(begin, end);
266  return median(begin + (size/2)+1, end, true); //+1 to exclude median values
267  }
268 
278  template <typename IteratorType>
279  static double variance(IteratorType begin, IteratorType end,
280  double mean = std::numeric_limits<double>::max())
281  {
282  checkIteratorsNotNULL(begin, end);
283  double sum = 0.0;
284  if (mean == std::numeric_limits<double>::max())
285  {
286  mean = Math::mean(begin, end);
287  }
288  for (IteratorType iter=begin; iter!=end; ++iter)
289  {
290  double diff = *iter - mean;
291  sum += diff * diff;
292  }
293  return sum / (std::distance(begin, end)-1);
294  }
295 
305  template <typename IteratorType>
306  static double sd(IteratorType begin, IteratorType end,
307  double mean = std::numeric_limits<double>::max())
308  {
309  checkIteratorsNotNULL(begin, end);
310  return std::sqrt( variance(begin, end, mean) );
311  }
312 
320  template <typename IteratorType>
321  static double absdev(IteratorType begin, IteratorType end,
322  double mean = std::numeric_limits<double>::max())
323  {
324  checkIteratorsNotNULL(begin, end);
325  double sum = 0.0;
326  if (mean == std::numeric_limits<double>::max())
327  {
328  mean = Math::mean(begin, end);
329  }
330  for (IteratorType iter=begin; iter!=end; ++iter)
331  {
332  sum += *iter - mean;
333  }
334  return sum / std::distance(begin, end);
335  }
336 
346  template <typename IteratorType1, typename IteratorType2>
347  static double covariance(IteratorType1 begin_a, IteratorType1 end_a,
348  IteratorType2 begin_b, IteratorType2 end_b)
349  {
350  //no data or different lengths
351  checkIteratorsNotNULL(begin_a, end_a);
352 
353  double sum = 0.0;
354  double mean_a = Math::mean(begin_a, end_a);
355  double mean_b = Math::mean(begin_b, end_b);
356  IteratorType1 iter_a = begin_a;
357  IteratorType2 iter_b = begin_b;
358  for (; iter_a != end_a; ++iter_a, ++iter_b)
359  {
360  /* assure both ranges have the same number of elements */
361  checkIteratorsAreValid(begin_b, end_b, begin_a, end_a);
362  sum += (*iter_a - mean_a) * (*iter_b - mean_b);
363  }
364  /* assure both ranges have the same number of elements */
365  checkIteratorsEqual(iter_b, end_b);
366  Size n = std::distance(begin_a, end_a);
367  return sum / (n-1);
368  }
369 
370 
371 
372 
382  template <typename IteratorType1, typename IteratorType2>
383  static double meanSquareError(IteratorType1 begin_a, IteratorType1 end_a,
384  IteratorType2 begin_b, IteratorType2 end_b)
385  {
386  //no data or different lengths
387  checkIteratorsNotNULL(begin_a, end_a);
388 
389  SignedSize dist = std::distance(begin_a, end_a);
390  double error = 0;
391  IteratorType1 iter_a = begin_a;
392  IteratorType2 iter_b = begin_b;
393  for (; iter_a != end_a; ++iter_a, ++iter_b)
394  {
395  /* assure both ranges have the same number of elements */
396  checkIteratorsAreValid(iter_b, end_b, iter_a, end_a);
397 
398  double tmp(*iter_a - *iter_b);
399  error += tmp * tmp;
400  }
401  /* assure both ranges have the same number of elements */
402  checkIteratorsEqual(iter_b, end_b);
403 
404  return error / dist;
405  }
406 
416  template <typename IteratorType1, typename IteratorType2>
417  static double classificationRate(IteratorType1 begin_a, IteratorType1 end_a,
418  IteratorType2 begin_b, IteratorType2 end_b)
419  {
420  //no data or different lengths
421  checkIteratorsNotNULL(begin_a, end_a);
422 
423  SignedSize dist = std::distance(begin_a, end_a);
424  SignedSize correct = dist;
425  IteratorType1 iter_a = begin_a;
426  IteratorType2 iter_b = begin_b;
427  for (; iter_a != end_a; ++iter_a, ++iter_b)
428  {
429  /* assure both ranges have the same number of elements */
430  checkIteratorsAreValid(iter_b, end_b, iter_a, end_a);
431  if ((*iter_a < 0 && *iter_b >= 0) || (*iter_a >= 0 && *iter_b < 0))
432  {
433  --correct;
434  }
435 
436  }
437  /* assure both ranges have the same number of elements */
438  checkIteratorsEqual(iter_b, end_b);
439 
440  return double(correct) / dist;
441  }
442 
455  template <typename IteratorType1, typename IteratorType2>
457  IteratorType1 begin_a, IteratorType1 end_a,
458  IteratorType2 begin_b, IteratorType2 end_b)
459  {
460  //no data or different lengths
461  checkIteratorsNotNULL(begin_a, end_b);
462 
463  double tp = 0;
464  double fp = 0;
465  double tn = 0;
466  double fn = 0;
467  IteratorType1 iter_a = begin_a;
468  IteratorType2 iter_b = begin_b;
469  for (; iter_a != end_a; ++iter_a, ++iter_b)
470  {
471  /* assure both ranges have the same number of elements */
472  checkIteratorsAreValid(iter_b, end_b, iter_a, end_a);
473 
474  if (*iter_a < 0 && *iter_b >= 0)
475  {
476  ++fn;
477  }
478  else if (*iter_a < 0 && *iter_b < 0)
479  {
480  ++tn;
481  }
482  else if (*iter_a >= 0 && *iter_b >= 0)
483  {
484  ++tp;
485  }
486  else if (*iter_a >= 0 && *iter_b < 0)
487  {
488  ++fp;
489  }
490  }
491  /* assure both ranges have the same number of elements */
492  checkIteratorsEqual(iter_b, end_b);
493 
494  return (tp * tn - fp * fn) / sqrt((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn));
495  }
496 
508  template <typename IteratorType1, typename IteratorType2>
510  IteratorType1 begin_a, IteratorType1 end_a,
511  IteratorType2 begin_b, IteratorType2 end_b)
512  {
513  //no data or different lengths
514  checkIteratorsNotNULL(begin_a, end_a);
515 
516  //calculate average
517  SignedSize dist = std::distance(begin_a, end_a);
518  double avg_a = std::accumulate(begin_a, end_a, 0.0) / dist;
519  double avg_b = std::accumulate(begin_b, end_b, 0.0) / dist;
520 
521  double numerator = 0;
522  double denominator_a = 0;
523  double denominator_b = 0;
524  IteratorType1 iter_a = begin_a;
525  IteratorType2 iter_b = begin_b;
526  for (; iter_a != end_a; ++iter_a, ++iter_b)
527  {
528  /* assure both ranges have the same number of elements */
529  checkIteratorsAreValid(iter_b, end_b, iter_a, end_a);
530  double temp_a = *iter_a - avg_a;
531  double temp_b = *iter_b - avg_b;
532  numerator += (temp_a * temp_b);
533  denominator_a += (temp_a * temp_a);
534  denominator_b += (temp_b * temp_b);
535  }
536  /* assure both ranges have the same number of elements */
537  checkIteratorsEqual(iter_b, end_b);
538  return numerator / sqrt(denominator_a * denominator_b);
539  }
540 
542  template <typename Value>
543  static void computeRank(std::vector<Value> & w)
544  {
545  Size i = 0; // main index
546  Size z = 0; // "secondary" index
547  Value rank = 0;
548  Size n = (w.size() - 1);
549  //store original indices for later
550  std::vector<std::pair<Size, Value> > w_idx;
551  for (Size j = 0; j < w.size(); ++j)
552  {
553  w_idx.push_back(std::make_pair(j, w[j]));
554  }
555  //sort
556  std::sort(w_idx.begin(), w_idx.end(),
557  boost::lambda::ret<bool>((&boost::lambda::_1->*& std::pair<Size, Value>::second) <
558  (&boost::lambda::_2->*& std::pair<Size, Value>::second)));
559  //replace pairs <orig_index, value> in w_idx by pairs <orig_index, rank>
560  while (i < n)
561  {
562  // test for equality with tolerance:
563  if (fabs(w_idx[i + 1].second - w_idx[i].second) > 0.0000001 * fabs(w_idx[i + 1].second)) // no tie
564  {
565  w_idx[i].second = Value(i + 1);
566  ++i;
567  }
568  else // tie, replace by mean rank
569  {
570  // count number of ties
571  for (z = i + 1; (z <= n) && fabs(w_idx[z].second - w_idx[i].second) <= 0.0000001 * fabs(w_idx[z].second); ++z)
572  {
573  }
574  // compute mean rank of tie
575  rank = 0.5 * (i + z + 1);
576  // replace intensities by rank
577  for (Size v = i; v <= z - 1; ++v)
578  {
579  w_idx[v].second = rank;
580  }
581  i = z;
582  }
583  }
584  if (i == n)
585  w_idx[n].second = Value(n + 1);
586  //restore original order and replace elements of w with their ranks
587  for (Size j = 0; j < w.size(); ++j)
588  {
589  w[w_idx[j].first] = w_idx[j].second;
590  }
591  }
592 
604  template <typename IteratorType1, typename IteratorType2>
606  IteratorType1 begin_a, IteratorType1 end_a,
607  IteratorType2 begin_b, IteratorType2 end_b)
608  {
609  //no data or different lengths
610  checkIteratorsNotNULL(begin_a, end_a);
611 
612  // store and sort intensities of model and data
613  SignedSize dist = std::distance(begin_a, end_a);
614  std::vector<double> ranks_data;
615  ranks_data.reserve(dist);
616  std::vector<double> ranks_model;
617  ranks_model.reserve(dist);
618  IteratorType1 iter_a = begin_a;
619  IteratorType2 iter_b = begin_b;
620  for (; iter_a != end_a; ++iter_a, ++iter_b)
621  {
622  /* assure both ranges have the same number of elements */
623  checkIteratorsAreValid(iter_b, end_b, iter_a, end_a);
624 
625  ranks_model.push_back(*iter_a);
626  ranks_data.push_back(*iter_b);
627  }
628  /* assure both ranges have the same number of elements */
629  checkIteratorsEqual(iter_b, end_b);
630 
631  // replace entries by their ranks
632  computeRank(ranks_data);
633  computeRank(ranks_model);
634 
635  double mu = double(ranks_data.size() + 1) / 2.; // mean of ranks
636  // Was the following, but I think the above is more correct ... (Clemens)
637  // double mu = (ranks_data.size() + 1) / 2;
638 
639  double sum_model_data = 0;
640  double sqsum_data = 0;
641  double sqsum_model = 0;
642 
643  for (Int i = 0; i < dist; ++i)
644  {
645  sum_model_data += (ranks_data[i] - mu) * (ranks_model[i] - mu);
646  sqsum_data += (ranks_data[i] - mu) * (ranks_data[i] - mu);
647  sqsum_model += (ranks_model[i] - mu) * (ranks_model[i] - mu);
648  }
649 
650  // check for division by zero
651  if (!sqsum_data || !sqsum_model)
652  {
653  return 0;
654  }
655 
656  return sum_model_data / (sqrt(sqsum_data) * sqrt(sqsum_model));
657  }
658 
660  template<typename T>
662  {
664  :mean(0), variance(0), min(0), lowerq(0), median(0), upperq(0), max(0)
665  {
666  }
667 
668  // Ctor with data
670  {
671  count = data.size();
672  // Sanity check: avoid core dump if no data points present.
673  if (data.empty())
674  {
675  mean = variance = min = lowerq = median = upperq = max = 0.0;
676  }
677  else
678  {
679  sort(data.begin(), data.end());
680  mean = Math::mean(data.begin(), data.end());
681  variance = Math::variance(data.begin(), data.end(), mean);
682  min = data.front();
683  lowerq = Math::quantile1st(data.begin(), data.end(), true);
684  median = Math::median(data.begin(), data.end(), true);
685  upperq = Math::quantile3rd(data.begin(), data.end(), true);
686  max = data.back();
687  }
688  }
689 
691  typename T::value_type min, max;
692  size_t count;
693  };
694 
695  } // namespace Math
696 } // namespace OpenMS
697 
SummaryStatistics()
Definition: StatisticFunctions.h:663
double variance
Definition: StatisticFunctions.h:690
static double meanSquareError(IteratorType1 begin_a, IteratorType1 end_a, IteratorType2 begin_b, IteratorType2 end_b)
Calculates the mean square error for the values in [begin_a, end_a) and [begin_b, end_b) ...
Definition: StatisticFunctions.h:383
T::value_type max
Definition: StatisticFunctions.h:691
double MAD(IteratorType begin, IteratorType end, double median_of_numbers)
median absolute deviation (MAD)
Definition: StatisticFunctions.h:199
double lowerq
Definition: StatisticFunctions.h:690
static double variance(IteratorType begin, IteratorType end, double mean=std::numeric_limits< double >::max())
Calculates the variance of a range of values.
Definition: StatisticFunctions.h:279
static double sum(IteratorType begin, IteratorType end)
Calculates the sum of a range of values.
Definition: StatisticFunctions.h:121
static void checkIteratorsAreValid(IteratorType1 begin_b, IteratorType1 end_b, IteratorType2 begin_a, IteratorType2 end_a)
Helper function checking if an iterator and a co-iterator both have a next element.
Definition: StatisticFunctions.h:105
static double quantile1st(IteratorType begin, IteratorType end, bool sorted=false)
Calculates the first quantile of a range of values.
Definition: StatisticFunctions.h:224
static void computeRank(std::vector< Value > &w)
Replaces the elements in vector w by their ranks.
Definition: StatisticFunctions.h:543
static double covariance(IteratorType1 begin_a, IteratorType1 end_a, IteratorType2 begin_b, IteratorType2 end_b)
Calculates the covariance of two ranges of values.
Definition: StatisticFunctions.h:347
ptrdiff_t SignedSize
Signed Size type e.g. used as pointer difference.
Definition: Types.h:134
static void checkIteratorsEqual(IteratorType begin, IteratorType end)
Helper function checking if two iterators are equal.
Definition: StatisticFunctions.h:89
Main OpenMS namespace.
Definition: FeatureDeconvolution.h:46
static double mean(IteratorType begin, IteratorType end)
Calculates the mean of a range of values.
Definition: StatisticFunctions.h:134
static double sd(IteratorType begin, IteratorType end, double mean=std::numeric_limits< double >::max())
Calculates the standard deviation of a range of values.
Definition: StatisticFunctions.h:306
static double matthewsCorrelationCoefficient(IteratorType1 begin_a, IteratorType1 end_a, IteratorType2 begin_b, IteratorType2 end_b)
Calculates the Matthews correlation coefficient for the values in [begin_a, end_a) and [begin_b...
Definition: StatisticFunctions.h:456
static double absdev(IteratorType begin, IteratorType end, double mean=std::numeric_limits< double >::max())
Calculates the absolute deviation of a range of values.
Definition: StatisticFunctions.h:321
static double pearsonCorrelationCoefficient(IteratorType1 begin_a, IteratorType1 end_a, IteratorType2 begin_b, IteratorType2 end_b)
Calculates the Pearson correlation coefficient for the values in [begin_a, end_a) and [begin_b...
Definition: StatisticFunctions.h:509
Helper class to gather (and dump) some statistics from a e.g. vector<double>.
Definition: StatisticFunctions.h:661
static void checkIteratorsNotNULL(IteratorType begin, IteratorType end)
Helper function checking if two iterators are not equal.
Definition: StatisticFunctions.h:73
double upperq
Definition: StatisticFunctions.h:690
static double quantile3rd(IteratorType begin, IteratorType end, bool sorted=false)
Calculates the third quantile of a range of values.
Definition: StatisticFunctions.h:256
double median
Definition: StatisticFunctions.h:690
static double median(IteratorType begin, IteratorType end, bool sorted=false)
Calculates the median of a range of values.
Definition: StatisticFunctions.h:152
SummaryStatistics(T &data)
Definition: StatisticFunctions.h:669
Invalid range exception.
Definition: Exception.h:285
static double rankCorrelationCoefficient(IteratorType1 begin_a, IteratorType1 end_a, IteratorType2 begin_b, IteratorType2 end_b)
calculates the rank correlation coefficient for the values in [begin_a, end_a) and [begin_b...
Definition: StatisticFunctions.h:605
size_t count
Definition: StatisticFunctions.h:692
size_t Size
Size type e.g. used as variable which can hold result of size()
Definition: Types.h:127
T::value_type min
Definition: StatisticFunctions.h:691
static double classificationRate(IteratorType1 begin_a, IteratorType1 end_a, IteratorType2 begin_b, IteratorType2 end_b)
Calculates the classification rate for the values in [begin_a, end_a) and [begin_b, end_b)
Definition: StatisticFunctions.h:417
double mean
Definition: StatisticFunctions.h:690
int Int
Signed integer type.
Definition: Types.h:102