libcootapi
 
Loading...
Searching...
No Matches
stats.hh
1/* analysis/stats.hh
2 *
3 * Copyright 2016 by Medical Research Council
4 * Author: Paul Emsley
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 3 of the License, or (at
9 * your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful, but
12 * WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
19 * 02110-1301, USA
20 */
21
22#ifndef INCLUDE_STATS_HH
23#define INCLUDE_STATS_HH
24
25#include <vector>
26#include <utility>
27#include <algorithm>
28
29#include <math.h>
30
31#include <gsl/gsl_sf_erf.h>
32
33namespace coot {
34
35 namespace stats {
36
// 1-d data
//
// A growable sample of doubles together with simple descriptive
// statistics.  All moments are population statistics (they divide by N,
// not N-1) and are recomputed from the stored values on every call.
//
// Caching of the kurtosis was tried and then disabled: a (possible)
// compiler bug showed up when sorting (bonds_vec_k_sorter()) with
// g++ (Ubuntu 4.4.3-4ubuntu5.1) 4.4.3.
class single {

public:
   // The observations, in insertion order.
   std::vector<double> v;

   single() {}
   explicit single(const std::vector<double> &v_in) : v(v_in) {}

   // Number of observations.
   unsigned int size() const { return static_cast<unsigned int>(v.size()); }

   // True when there are no observations.
   bool empty() const { return v.empty(); }

   // Append one observation.
   void add(const double &a) {
      v.push_back(a);
   }

   // Append all the observations of s.
   void add(const single &s) {
      if (&s == this) {
         // Inserting a vector's own range into itself is undefined
         // behaviour, so append from a snapshot instead.
         const std::vector<double> snapshot(v);
         v.insert(v.end(), snapshot.begin(), snapshot.end());
      } else {
         v.insert(v.end(), s.v.begin(), s.v.end());
      }
   }

   // Arithmetic mean, or 0 when there are no observations.
   double mean() const {
      if (v.empty()) return 0.0;
      double sum = 0.0;
      for (std::vector<double>::const_iterator it = v.begin(); it != v.end(); ++it)
         sum += *it;
      return sum / static_cast<double>(v.size());
   }

   // Population variance, or 0 when there are no observations.
   //
   // Computed in two passes (mean first, then the squared deviations
   // from it).  The one-pass form E[x^2] - E[x]^2 loses all precision
   // when the mean is large compared with the spread; this form cannot
   // go negative and needs no clamping.
   double variance() const {
      if (v.empty()) return 0.0;
      const double m = mean();
      double sum_sq_dev = 0.0;
      for (std::vector<double>::const_iterator it = v.begin(); it != v.end(); ++it) {
         const double delta = *it - m;
         sum_sq_dev += delta * delta;
      }
      return sum_sq_dev / static_cast<double>(v.size());
   }

   // Population skewness: the mean cubed deviation divided by sigma^3.
   //
   // Returns 0 when there are no observations, and also when sigma^3 is
   // exactly zero (e.g. a single observation), where the ratio would
   // otherwise be 0/0.
   double skew() const {
      if (v.empty()) return 0.0;
      const double m = mean();
      const double sigma = sqrt(variance());
      const double s3 = sigma * sigma * sigma;
      if (s3 == 0.0) return 0.0; // guard against division by zero
      double sum_cubed = 0.0;
      for (std::vector<double>::const_iterator it = v.begin(); it != v.end(); ++it) {
         const double delta = *it - m;
         sum_cubed += delta * delta * delta;
      }
      return (sum_cubed / static_cast<double>(v.size())) / s3;
   }

   // Kurtosis, $k$, of $N$ observations:
   //
   //    k = \frac{\Sigma(x_i - \mu)^4} {N \sigma^4}
   //
   // Note that 3 is NOT subtracted: this is the plain fourth standardised
   // moment, not the excess kurtosis.  For reference,
   //
   //    (x_i - \mu)^4 = x_i^4 - 4x_i^3\mu + 6x_i^2\mu^2 - 4x_i\mu^3 + \mu^4
   //
   // Returns the sentinel -999 when there are no observations or when the
   // variance is not positive.
   double kurtosis() const {
      double k = -999;
      if (v.empty()) return k;

      const double m = mean();
      const double var = variance();
      if (var > 0) {
         double sum_to_the_4 = 0.0;
         for (std::vector<double>::const_iterator it = v.begin(); it != v.end(); ++it) {
            const double t = *it - m;
            sum_to_the_4 += t * t * t * t;
         }
         k = sum_to_the_4 / (static_cast<double>(v.size()) * var * var);
      }
      return k;
   }

   // Return (median, inter-quartile range) of the observations.
   //
   // The median of an even-sized sample is the mean of the two central
   // values.  The quartiles are taken as the sorted values at index
   // floor(0.25 N) and floor(0.75 N), without interpolation.
   // Returns (0, 0) when there are no observations.
   std::pair<double, double> median_and_iqr() const {

      if (v.empty())
         return std::pair<double, double>(0.0, 0.0);

      std::vector<double> vv = v;
      std::sort(vv.begin(), vv.end());
      const std::vector<double>::size_type n = vv.size();

      // For n >= 1, floor(0.75 n) <= n - 1, so both indices are in range.
      const std::vector<double>::size_type idx_q1 =
         static_cast<std::vector<double>::size_type>(0.25 * static_cast<double>(n));
      const std::vector<double>::size_type idx_q3 =
         static_cast<std::vector<double>::size_type>(0.75 * static_cast<double>(n));
      const double iqr = vv[idx_q3] - vv[idx_q1];

      const std::vector<double>::size_type mid = n / 2;
      double m = vv[mid];
      if (n % 2 == 0)
         m = (vv[mid - 1] + vv[mid]) * 0.5; // the two central values
      return std::pair<double, double>(m, iqr);
   }

   // Return the idx-th largest observation (idx 0 is the maximum).
   // Throws std::out_of_range if idx is not less than the number of
   // observations.
   double get_ith_highest(unsigned int idx) const {
      std::vector<double> vv = v;
      std::sort(vv.begin(), vv.end());    // ascending...
      std::reverse(vv.begin(), vv.end()); // ...then highest first
      return vv.at(idx);
   }

   // Return the idx-th smallest observation (idx 0 is the minimum).
   // Throws std::out_of_range if idx is not less than the number of
   // observations.
   double get_ith_lowest(unsigned int idx) const {
      std::vector<double> vv = v;
      std::sort(vv.begin(), vv.end()); // lowest first
      return vv.at(idx);
   }

};
171
172 class pnorm {
173
174 // return a cumulative probability for this number of standard deviations from the mean
175 // e.g. return for 0, return 0.5 and -1 return 0.1586
176
177 void init() { }
178 public:
179 pnorm() { init(); }
180 double erf(const double &z) const; // public for testing.
181 double get(const double &x) const {
182 // return 0.5 * (1 + erf(x/sqrt(2.0)));
183 return 0.5 * (1 + gsl_sf_erf(x/sqrt(2.0)));
184 }
185 };
186
// 20150807-PE
//
// Declaration only; the definition is not in this header.
//
// Going by the name and parameter names, this compares the sample v1
// with a normal distribution that has the given reference mean and
// standard deviation using a Kolmogorov-Smirnov test.
// NOTE(review): confirm against the implementation what the returned
// double represents and how an empty v1 is handled.
double
get_kolmogorov_smirnov_vs_normal(const std::vector<double> &v1,
                                 const double &reference_mean,
                                 const double &reference_sd);
192 }
193}
194
195#endif // INCLUDE_STATS_HH