OpenMS
StatisticFunctions.h
Go to the documentation of this file.
1 // Copyright (c) 2002-2023, The OpenMS Team -- EKU Tuebingen, ETH Zurich, and FU Berlin
2 // SPDX-License-Identifier: BSD-3-Clause
3 //
4 // --------------------------------------------------------------------------
5 // $Maintainer: Timo Sachsenberg $
6 // $Authors: Clemens Groepl, Johannes Junker, Mathias Walzer, Chris Bielow $
7 // --------------------------------------------------------------------------
8 #pragma once
9 
10 #include <vector>
12 #include <OpenMS/CONCEPT/Types.h>
13 
14 #include <algorithm>
15 #include <cmath>
16 #include <iterator>
17 #include <numeric>
18 
19 namespace OpenMS
20 {
21 
22  namespace Math
23  {
24 
32  template <typename IteratorType>
33  static void checkIteratorsNotNULL(IteratorType begin, IteratorType end)
34  {
35  if (begin == end)
36  {
37  throw Exception::InvalidRange(__FILE__, __LINE__, OPENMS_PRETTY_FUNCTION);
38  }
39  }
40 
48  template <typename IteratorType>
49  static void checkIteratorsEqual(IteratorType begin, IteratorType end)
50  {
51  if (begin != end)
52  {
53  throw Exception::InvalidRange(__FILE__, __LINE__, OPENMS_PRETTY_FUNCTION);
54  }
55  }
56 
64  template <typename IteratorType1, typename IteratorType2>
66  IteratorType1 begin_b, IteratorType1 end_b,
67  IteratorType2 begin_a, IteratorType2 end_a)
68  {
69  if (begin_b != end_b && begin_a == end_a)
70  {
71  throw Exception::InvalidRange(__FILE__, __LINE__, OPENMS_PRETTY_FUNCTION);
72  }
73  }
74 
80  template <typename IteratorType>
81  static double sum(IteratorType begin, IteratorType end)
82  {
83  return std::accumulate(begin, end, 0.0);
84  }
85 
93  template <typename IteratorType>
94  static double mean(IteratorType begin, IteratorType end)
95  {
96  checkIteratorsNotNULL(begin, end);
97  return sum(begin, end) / std::distance(begin, end);
98  }
99 
111  template <typename IteratorType>
112  static double median(IteratorType begin, IteratorType end,
113  bool sorted = false)
114  {
115  checkIteratorsNotNULL(begin, end);
116  if (!sorted)
117  {
118  std::sort(begin, end);
119  }
120 
121  Size size = std::distance(begin, end);
122  if (size % 2 == 0) // even size => average two middle values
123  {
124  IteratorType it1 = begin;
125  std::advance(it1, size / 2 - 1);
126  IteratorType it2 = it1;
127  std::advance(it2, 1);
128  return (*it1 + *it2) / 2.0;
129  }
130  else
131  {
132  IteratorType it = begin;
133  std::advance(it, (size - 1) / 2);
134  return *it;
135  }
136  }
137 
138 
158  template <typename IteratorType>
159  double MAD(IteratorType begin, IteratorType end, double median_of_numbers)
160  {
161  std::vector<double> diffs;
162  diffs.reserve(std::distance(begin, end));
163  for (IteratorType it = begin; it != end; ++it)
164  {
165  diffs.push_back(fabs(*it - median_of_numbers));
166  }
167  return median(diffs.begin(), diffs.end(), false);
168  }
169 
188  template <typename IteratorType>
189  double MeanAbsoluteDeviation(IteratorType begin, IteratorType end, double mean_of_numbers)
190  {
191  double mean_value {0};
192  for (IteratorType it = begin; it != end; ++it)
193  {
194  mean_value += fabs(*it - mean_of_numbers);
195  }
196  return mean_value / std::distance(begin, end);
197  }
198 
212  template <typename IteratorType>
213  static double quantile1st(IteratorType begin, IteratorType end,
214  bool sorted = false)
215  {
216  checkIteratorsNotNULL(begin, end);
217 
218  if (!sorted)
219  {
220  std::sort(begin, end);
221  }
222 
223  Size size = std::distance(begin, end);
224  if (size % 2 == 0)
225  {
226  return median(begin, begin + (size/2)-1, true); //-1 to exclude median values
227  }
228  return median(begin, begin + (size/2), true);
229  }
230 
244  template <typename IteratorType>
245  static double quantile3rd(
246  IteratorType begin, IteratorType end, bool sorted = false)
247  {
248  checkIteratorsNotNULL(begin, end);
249  if (!sorted)
250  {
251  std::sort(begin, end);
252  }
253 
254  Size size = std::distance(begin, end);
255  return median(begin + (size/2)+1, end, true); //+1 to exclude median values
256  }
257 
267  template <typename IteratorType>
268  static double variance(IteratorType begin, IteratorType end,
269  double mean = std::numeric_limits<double>::max())
270  {
271  checkIteratorsNotNULL(begin, end);
272  double sum_value = 0.0;
273  if (mean == std::numeric_limits<double>::max())
274  {
275  mean = Math::mean(begin, end);
276  }
277  for (IteratorType iter=begin; iter!=end; ++iter)
278  {
279  double diff = *iter - mean;
280  sum_value += diff * diff;
281  }
282  return sum_value / (std::distance(begin, end)-1);
283  }
284 
294  template <typename IteratorType>
295  static double sd(IteratorType begin, IteratorType end,
296  double mean = std::numeric_limits<double>::max())
297  {
298  checkIteratorsNotNULL(begin, end);
299  return std::sqrt( variance(begin, end, mean) );
300  }
301 
309  template <typename IteratorType>
310  static double absdev(IteratorType begin, IteratorType end,
311  double mean = std::numeric_limits<double>::max())
312  {
313  checkIteratorsNotNULL(begin, end);
314  double sum_value = 0.0;
315  if (mean == std::numeric_limits<double>::max())
316  {
317  mean = Math::mean(begin, end);
318  }
319  for (IteratorType iter=begin; iter!=end; ++iter)
320  {
321  sum_value += *iter - mean;
322  }
323  return sum_value / std::distance(begin, end);
324  }
325 
335  template <typename IteratorType1, typename IteratorType2>
336  static double covariance(IteratorType1 begin_a, IteratorType1 end_a,
337  IteratorType2 begin_b, IteratorType2 end_b)
338  {
339  //no data or different lengths
340  checkIteratorsNotNULL(begin_a, end_a);
341 
342  double sum_value = 0.0;
343  double mean_a = Math::mean(begin_a, end_a);
344  double mean_b = Math::mean(begin_b, end_b);
345  IteratorType1 iter_a = begin_a;
346  IteratorType2 iter_b = begin_b;
347  for (; iter_a != end_a; ++iter_a, ++iter_b)
348  {
349  /* assure both ranges have the same number of elements */
350  checkIteratorsAreValid(begin_b, end_b, begin_a, end_a);
351  sum_value += (*iter_a - mean_a) * (*iter_b - mean_b);
352  }
353  /* assure both ranges have the same number of elements */
354  checkIteratorsEqual(iter_b, end_b);
355  Size n = std::distance(begin_a, end_a);
356  return sum_value / (n-1);
357  }
358 
368  template <typename IteratorType1, typename IteratorType2>
369  static double meanSquareError(IteratorType1 begin_a, IteratorType1 end_a,
370  IteratorType2 begin_b, IteratorType2 end_b)
371  {
372  //no data or different lengths
373  checkIteratorsNotNULL(begin_a, end_a);
374 
375  SignedSize dist = std::distance(begin_a, end_a);
376  double error = 0;
377  IteratorType1 iter_a = begin_a;
378  IteratorType2 iter_b = begin_b;
379  for (; iter_a != end_a; ++iter_a, ++iter_b)
380  {
381  /* assure both ranges have the same number of elements */
382  checkIteratorsAreValid(iter_b, end_b, iter_a, end_a);
383 
384  double tmp(*iter_a - *iter_b);
385  error += tmp * tmp;
386  }
387  /* assure both ranges have the same number of elements */
388  checkIteratorsEqual(iter_b, end_b);
389 
390  return error / dist;
391  }
392 
402  template <typename IteratorType1, typename IteratorType2>
403  static double classificationRate(IteratorType1 begin_a, IteratorType1 end_a,
404  IteratorType2 begin_b, IteratorType2 end_b)
405  {
406  //no data or different lengths
407  checkIteratorsNotNULL(begin_a, end_a);
408 
409  SignedSize dist = std::distance(begin_a, end_a);
410  SignedSize correct = dist;
411  IteratorType1 iter_a = begin_a;
412  IteratorType2 iter_b = begin_b;
413  for (; iter_a != end_a; ++iter_a, ++iter_b)
414  {
415  /* assure both ranges have the same number of elements */
416  checkIteratorsAreValid(iter_b, end_b, iter_a, end_a);
417  if ((*iter_a < 0 && *iter_b >= 0) || (*iter_a >= 0 && *iter_b < 0))
418  {
419  --correct;
420  }
421 
422  }
423  /* assure both ranges have the same number of elements */
424  checkIteratorsEqual(iter_b, end_b);
425 
426  return double(correct) / dist;
427  }
428 
441  template <typename IteratorType1, typename IteratorType2>
443  IteratorType1 begin_a, IteratorType1 end_a,
444  IteratorType2 begin_b, IteratorType2 end_b)
445  {
446  //no data or different lengths
447  checkIteratorsNotNULL(begin_a, end_b);
448 
449  double tp = 0;
450  double fp = 0;
451  double tn = 0;
452  double fn = 0;
453  IteratorType1 iter_a = begin_a;
454  IteratorType2 iter_b = begin_b;
455  for (; iter_a != end_a; ++iter_a, ++iter_b)
456  {
457  /* assure both ranges have the same number of elements */
458  checkIteratorsAreValid(iter_b, end_b, iter_a, end_a);
459 
460  if (*iter_a < 0 && *iter_b >= 0)
461  {
462  ++fn;
463  }
464  else if (*iter_a < 0 && *iter_b < 0)
465  {
466  ++tn;
467  }
468  else if (*iter_a >= 0 && *iter_b >= 0)
469  {
470  ++tp;
471  }
472  else if (*iter_a >= 0 && *iter_b < 0)
473  {
474  ++fp;
475  }
476  }
477  /* assure both ranges have the same number of elements */
478  checkIteratorsEqual(iter_b, end_b);
479 
480  return (tp * tn - fp * fn) / std::sqrt((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn));
481  }
482 
494  template <typename IteratorType1, typename IteratorType2>
496  IteratorType1 begin_a, IteratorType1 end_a,
497  IteratorType2 begin_b, IteratorType2 end_b)
498  {
499  //no data or different lengths
500  checkIteratorsNotNULL(begin_a, end_a);
501 
502  //calculate average
503  SignedSize dist = std::distance(begin_a, end_a);
504  double avg_a = std::accumulate(begin_a, end_a, 0.0) / dist;
505  double avg_b = std::accumulate(begin_b, end_b, 0.0) / dist;
506 
507  double numerator = 0;
508  double denominator_a = 0;
509  double denominator_b = 0;
510  IteratorType1 iter_a = begin_a;
511  IteratorType2 iter_b = begin_b;
512  for (; iter_a != end_a; ++iter_a, ++iter_b)
513  {
514  /* assure both ranges have the same number of elements */
515  checkIteratorsAreValid(iter_b, end_b, iter_a, end_a);
516  double temp_a = *iter_a - avg_a;
517  double temp_b = *iter_b - avg_b;
518  numerator += (temp_a * temp_b);
519  denominator_a += (temp_a * temp_a);
520  denominator_b += (temp_b * temp_b);
521  }
522  /* assure both ranges have the same number of elements */
523  checkIteratorsEqual(iter_b, end_b);
524  return numerator / std::sqrt(denominator_a * denominator_b);
525  }
526 
528  template <typename Value>
529  static void computeRank(std::vector<Value> & w)
530  {
531  Size i = 0; // main index
532  Size z = 0; // "secondary" index
533  Value rank = 0;
534  Size n = (w.size() - 1);
535  //store original indices for later
536  std::vector<std::pair<Size, Value> > w_idx;
537  for (Size j = 0; j < w.size(); ++j)
538  {
539  w_idx.push_back(std::make_pair(j, w[j]));
540  }
541  //sort
542  std::sort(w_idx.begin(), w_idx.end(),
543  [](const auto& pair1, const auto& pair2) { return pair1.second < pair2.second; });
544  //replace pairs <orig_index, value> in w_idx by pairs <orig_index, rank>
545  while (i < n)
546  {
547  // test for equality with tolerance:
548  if (fabs(w_idx[i + 1].second - w_idx[i].second) > 0.0000001 * fabs(w_idx[i + 1].second)) // no tie
549  {
550  w_idx[i].second = Value(i + 1);
551  ++i;
552  }
553  else // tie, replace by mean rank
554  {
555  // count number of ties
556  for (z = i + 1; (z <= n) && fabs(w_idx[z].second - w_idx[i].second) <= 0.0000001 * fabs(w_idx[z].second); ++z)
557  {
558  }
559  // compute mean rank of tie
560  rank = 0.5 * (i + z + 1);
561  // replace intensities by rank
562  for (Size v = i; v <= z - 1; ++v)
563  {
564  w_idx[v].second = rank;
565  }
566  i = z;
567  }
568  }
569  if (i == n)
570  w_idx[n].second = Value(n + 1);
571  //restore original order and replace elements of w with their ranks
572  for (Size j = 0; j < w.size(); ++j)
573  {
574  w[w_idx[j].first] = w_idx[j].second;
575  }
576  }
577 
589  template <typename IteratorType1, typename IteratorType2>
591  IteratorType1 begin_a, IteratorType1 end_a,
592  IteratorType2 begin_b, IteratorType2 end_b)
593  {
594  //no data or different lengths
595  checkIteratorsNotNULL(begin_a, end_a);
596 
597  // store and sort intensities of model and data
598  SignedSize dist = std::distance(begin_a, end_a);
599  std::vector<double> ranks_data;
600  ranks_data.reserve(dist);
601  std::vector<double> ranks_model;
602  ranks_model.reserve(dist);
603  IteratorType1 iter_a = begin_a;
604  IteratorType2 iter_b = begin_b;
605  for (; iter_a != end_a; ++iter_a, ++iter_b)
606  {
607  /* assure both ranges have the same number of elements */
608  checkIteratorsAreValid(iter_b, end_b, iter_a, end_a);
609 
610  ranks_model.push_back(*iter_a);
611  ranks_data.push_back(*iter_b);
612  }
613  /* assure both ranges have the same number of elements */
614  checkIteratorsEqual(iter_b, end_b);
615 
616  // replace entries by their ranks
617  computeRank(ranks_data);
618  computeRank(ranks_model);
619 
620  double mu = double(ranks_data.size() + 1) / 2.; // mean of ranks
621  // Was the following, but I think the above is more correct ... (Clemens)
622  // double mu = (ranks_data.size() + 1) / 2;
623 
624  double sum_model_data = 0;
625  double sqsum_data = 0;
626  double sqsum_model = 0;
627 
628  for (Int i = 0; i < dist; ++i)
629  {
630  sum_model_data += (ranks_data[i] - mu) * (ranks_model[i] - mu);
631  sqsum_data += (ranks_data[i] - mu) * (ranks_data[i] - mu);
632  sqsum_model += (ranks_model[i] - mu) * (ranks_model[i] - mu);
633  }
634 
635  // check for division by zero
636  if (!sqsum_data || !sqsum_model)
637  {
638  return 0;
639  }
640 
641  return sum_model_data / (std::sqrt(sqsum_data) * std::sqrt(sqsum_model));
642  }
643 
645  template<typename T>
647  {
648  SummaryStatistics() = default;
649 
650  // Ctor with data
652  {
653  count = data.size();
654  // Sanity check: avoid core dump if no data points present.
655  if (data.empty())
656  {
657  mean = variance = min = lowerq = median = upperq = max = 0.0;
658  }
659  else
660  {
661  sort(data.begin(), data.end());
662  mean = Math::mean(data.begin(), data.end());
663  variance = Math::variance(data.begin(), data.end(), mean);
664  min = data.front();
665  lowerq = Math::quantile1st(data.begin(), data.end(), true);
666  median = Math::median(data.begin(), data.end(), true);
667  upperq = Math::quantile3rd(data.begin(), data.end(), true);
668  max = data.back();
669  }
670  }
671 
672  double mean = 0, variance = 0 , lowerq = 0, median = 0, upperq = 0;
673  typename T::value_type min = 0, max = 0;
674  size_t count = 0;
675  };
676 
677  } // namespace Math
678 } // namespace OpenMS
679 
Invalid range exception.
Definition: Exception.h:252
int Int
Signed integer type.
Definition: Types.h:76
ptrdiff_t SignedSize
Signed Size type e.g. used as pointer difference.
Definition: Types.h:108
size_t Size
Size type e.g. used as variable which can hold result of size()
Definition: Types.h:101
static double classificationRate(IteratorType1 begin_a, IteratorType1 end_a, IteratorType2 begin_b, IteratorType2 end_b)
Calculates the classification rate for the values in [begin_a, end_a) and [begin_b,...
Definition: StatisticFunctions.h:403
static double median(IteratorType begin, IteratorType end, bool sorted=false)
Calculates the median of a range of values.
Definition: StatisticFunctions.h:112
static double mean(IteratorType begin, IteratorType end)
Calculates the mean of a range of values.
Definition: StatisticFunctions.h:94
static double covariance(IteratorType1 begin_a, IteratorType1 end_a, IteratorType2 begin_b, IteratorType2 end_b)
Calculates the covariance of two ranges of values.
Definition: StatisticFunctions.h:336
static double quantile3rd(IteratorType begin, IteratorType end, bool sorted=false)
Calculates the third quantile of a range of values.
Definition: StatisticFunctions.h:245
static void checkIteratorsNotNULL(IteratorType begin, IteratorType end)
Helper function checking if two iterators are not equal.
Definition: StatisticFunctions.h:33
static double matthewsCorrelationCoefficient(IteratorType1 begin_a, IteratorType1 end_a, IteratorType2 begin_b, IteratorType2 end_b)
Calculates the Matthews correlation coefficient for the values in [begin_a, end_a) and [begin_b,...
Definition: StatisticFunctions.h:442
double MeanAbsoluteDeviation(IteratorType begin, IteratorType end, double mean_of_numbers)
mean absolute deviation (MeanAbsoluteDeviation)
Definition: StatisticFunctions.h:189
static double sum(IteratorType begin, IteratorType end)
Calculates the sum of a range of values.
Definition: StatisticFunctions.h:81
static double absdev(IteratorType begin, IteratorType end, double mean=std::numeric_limits< double >::max())
Calculates the absolute deviation of a range of values.
Definition: StatisticFunctions.h:310
static double pearsonCorrelationCoefficient(IteratorType1 begin_a, IteratorType1 end_a, IteratorType2 begin_b, IteratorType2 end_b)
Calculates the Pearson correlation coefficient for the values in [begin_a, end_a) and [begin_b,...
Definition: StatisticFunctions.h:495
static double sd(IteratorType begin, IteratorType end, double mean=std::numeric_limits< double >::max())
Calculates the standard deviation of a range of values.
Definition: StatisticFunctions.h:295
double MAD(IteratorType begin, IteratorType end, double median_of_numbers)
median absolute deviation (MAD)
Definition: StatisticFunctions.h:159
static double rankCorrelationCoefficient(IteratorType1 begin_a, IteratorType1 end_a, IteratorType2 begin_b, IteratorType2 end_b)
calculates the rank correlation coefficient for the values in [begin_a, end_a) and [begin_b,...
Definition: StatisticFunctions.h:590
static void checkIteratorsAreValid(IteratorType1 begin_b, IteratorType1 end_b, IteratorType2 begin_a, IteratorType2 end_a)
Helper function checking if an iterator and a co-iterator both have a next element.
Definition: StatisticFunctions.h:65
static double quantile1st(IteratorType begin, IteratorType end, bool sorted=false)
Calculates the first quantile of a range of values.
Definition: StatisticFunctions.h:213
static void checkIteratorsEqual(IteratorType begin, IteratorType end)
Helper function checking if two iterators are equal.
Definition: StatisticFunctions.h:49
static double variance(IteratorType begin, IteratorType end, double mean=std::numeric_limits< double >::max())
Calculates the variance of a range of values.
Definition: StatisticFunctions.h:268
static double meanSquareError(IteratorType1 begin_a, IteratorType1 end_a, IteratorType2 begin_b, IteratorType2 end_b)
Calculates the mean square error for the values in [begin_a, end_a) and [begin_b, end_b)
Definition: StatisticFunctions.h:369
static void computeRank(std::vector< Value > &w)
Replaces the elements in vector w by their ranks.
Definition: StatisticFunctions.h:529
Main OpenMS namespace.
Definition: FeatureDeconvolution.h:22
Helper class to gather (and dump) some statistics from a e.g. vector<double>.
Definition: StatisticFunctions.h:647
double lowerq
Definition: StatisticFunctions.h:672
double variance
Definition: StatisticFunctions.h:672
T::value_type max
Definition: StatisticFunctions.h:673
SummaryStatistics(T &data)
Definition: StatisticFunctions.h:651
double median
Definition: StatisticFunctions.h:672
size_t count
Definition: StatisticFunctions.h:674
double mean
Definition: StatisticFunctions.h:672
double upperq
Definition: StatisticFunctions.h:672
T::value_type min
Definition: StatisticFunctions.h:673