Kstars

robuststatistics.h
1/*
2 SPDX-FileCopyrightText: 2022 Sophie Taylor <sophie@spacekitteh.moe>
3
4 SPDX-License-Identifier: GPL-2.0-or-later
5*/
6
7// The purpose of Robust Statistics is to provide a more robust way of calculating typical statistical measures
8// such as averages. More information is available here: https://en.wikipedia.org/wiki/Robust_statistics
9// The basic idea is that traditional measures of "location" such as the mean are susceptible to outliers in the
10// distribution. More information on location is available here: https://en.wikipedia.org/wiki/Location_parameter
11// The same applies to measures of scale such as variance (more information available here:
12// https://en.wikipedia.org/wiki/Scale_parameter)
13//
14// Robust Statistics uses underlying GNU Science Library (GSL) routines and provides a wrapper around these routines.
15// See the Statistics section of the GSL documentation: https://www.gnu.org/software/gsl/doc/html/statistics.html
16//
17// Currently the following are provided:
18// Location: LOCATION_MEAN - calculate the mean of input data
19// LOCATION_MEDIAN - calculate the median
20// LOCATION_TRIMMEDMEAN - discard a specified fraction of high/low values before calculating the mean
21// LOCATION_GASTWIRTH - use the Gastwirth estimator based on combining different quantiles
22// see https://www.gnu.org/software/gsl/doc/html/statistics.html#gastwirth-estimator
23// LOCATION_SIGMACLIPPING - single step sigma clipping routine to sigma clip outliers fron the
24// input data and calculate the mean from the remaining data
25//
26// Scale: SCALE_VARIANCE - variance
27// SCALE_MAD - median absolute deviation is the median of absolute differences between each
28// datapoint and the median.
29// SCALE_BWMV - biweight midvariance is a more complex calculate detailed here
30// https://en.wikipedia.org/wiki/Robust_measures_of_scale#The_biweight_midvariance
31// implemenated internally - not provided by GSL
32// SCALE_SESTIMATOR - Qn and Sn estimators are pairwise alternative to MAD by Rousseeuw and Croux
33// SCALE_QESTIMATOR https://www.researchgate.net/publication/221996720_Alternatives_to_Median_Absolute_Deviation
34// SCALE_PESTIMATOR - Pairwise mean scale estimator (Pn). This is the interquartile range of the
35// pairwise means. Implemented internally, not provided by GSL.
36//
37// Where necessary data is sorted by the routines and functionality to use a user selected array sride is included.
38// C++ Templates are used to provide access to the GSL routines based on the datatype of the input data.
39
40#pragma once
41
42#include <limits>
43#include <vector>
44#include <cmath>
45#include <QObject>
46#include <QVector>
47
48#include "gslhelpers.h"
49
50namespace Mathematics::RobustStatistics
51{
52
53template<typename Base>
54double Pn_from_sorted_data(const Base sorted_data[],
55 const size_t stride,
56 const size_t n,
57 Base work[],
58 int work_int[]);
59
60enum ScaleCalculation { SCALE_VARIANCE, SCALE_BWMV, SCALE_SESTIMATOR, SCALE_QESTIMATOR, SCALE_MAD, SCALE_PESTIMATOR };
61
62
63enum LocationCalculation { LOCATION_MEAN, LOCATION_MEDIAN, LOCATION_TRIMMEDMEAN, LOCATION_GASTWIRTH,
64 LOCATION_SIGMACLIPPING
65 };
66
67struct SampleStatistics
68{
69 constexpr SampleStatistics() : location(0), scale(0), weight(std::numeric_limits<double>::infinity()) {}
70 constexpr SampleStatistics(const double location, const double scale, const double weight) :
71 location(location), scale(scale), weight(weight) {}
72 double location;
73 double scale;
74 double weight;
75};
76
77//template<typename Base=double>
78//struct Estimator
79//{
80// virtual double Estimate(std::vector<Base> data, const size_t stride = 1)
81// {
82// std::sort(Mathematics::GSLHelpers::make_strided_iter(data.begin(), stride),
83// Mathematics::GSLHelpers::make_strided_iter(data.end(), stride));
84// return EstimateFromSortedData(data, stride);
85// }
86// virtual double Estimate(const std::vector<Base> &data, const size_t stride = 1)
87// {
88// auto sorted = data;
89// std::sort(Mathematics::GSLHelpers::make_strided_iter(sorted.begin(), stride),
90// Mathematics::GSLHelpers::make_strided_iter(sorted.end(), stride));
91// return EstimateFromSortedData(sorted, stride);
92// }
93
94// virtual double EstimateFromSortedData(const std::vector<Base> &data, const size_t stride = 1) = 0;
95//};
96
97template<typename Base> struct LocationEstimator;
98
99template<typename Base = double>
100struct ScaleEstimator //: public virtual Estimator<Base>
101{
102 LocationEstimator<Base> locationEstimator;
103 virtual double EstimateScale(std::vector<Base> data, const size_t stride = 1)
104 {
105 std::sort(Mathematics::GSLHelpers::make_strided_iter(data.begin(), stride),
106 Mathematics::GSLHelpers::make_strided_iter(data.end(), stride));
107 return EstimateScaleFromSortedData(data, stride);
108 }
109 virtual double EstimateScale(const std::vector<Base> &data, const size_t stride = 1)
110 {
111 auto sorted = data;
112 std::sort(Mathematics::GSLHelpers::make_strided_iter(sorted.begin(), stride),
113 Mathematics::GSLHelpers::make_strided_iter(sorted.end(), stride));
114 return EstimateScaleFromSortedData(sorted, stride);
115 }
116
117 virtual double ConvertScaleToWeight(const double scale)
118 {
119 return 1 / (scale * scale);
120 }
121
122 virtual double EstimateScaleFromSortedData(const std::vector<Base> &data, const size_t stride = 1) = 0;
123};
124
125template <typename Base = double>
126struct MAD : public virtual ScaleEstimator<Base>
127{
128 virtual double EstimateScale(const std::vector<Base> &data, const size_t stride = 1) override
129 {
130 auto const size = data.size();
131 auto work = std::make_unique<double[]>(size / stride);
132 return Mathematics::GSLHelpers::gslMAD(data.data(), stride, size, work.get());
133 }
134 virtual double EstimateScale(std::vector<Base> data, const size_t stride = 1) override
135 {
136 return EstimateScale(data, stride);
137 }
138 virtual double EstimateScaleFromSortedData(const std::vector<Base> &data, const size_t stride = 1) override
139 {
140 return EstimateScale(data, stride);
141 }
142};
143template <typename Base = double>
144struct Variance : public virtual ScaleEstimator<Base>
145{
146 virtual double EstimateScaleFromSortedData(const std::vector<Base> &data, const size_t stride = 1) override
147 {
148 return Mathematics::GSLHelpers::gslVariance(data.data(), stride, data.size());
149 }
150 virtual double ConvertScaleToWeight(const double variance) override
151 {
152 return 1 / variance;
153 }
154};
155
156template <typename Base = double>
157struct SnEstimator : public virtual ScaleEstimator<Base>
158{
159 virtual double EstimateScaleFromSortedData(const std::vector<Base> &data, const size_t stride = 1) override
160 {
161 auto size = data.size();
162 auto work = std::make_unique<Base[]>(size);
163 return Mathematics::GSLHelpers::gslSnFromSortedData(data.data(), stride, size, work.get());
164 }
165};
166
167template <typename Base = double>
168struct QnEstimator : public virtual ScaleEstimator<Base>
169{
170 virtual double EstimateScaleFromSortedData(const std::vector<Base> &data, const size_t stride = 1) override
171 {
172 auto size = data.size();
173 auto work = std::make_unique<Base[]>(3 * size);
174 auto intWork = std::make_unique<Base[]>(5 * size);
175 return Mathematics::GSLHelpers::gslQnFromSortedData(data.data(), stride, size, work.get(), intWork.get());
176 }
177};
178
179template <typename Base = double>
180struct PnEstimator : public virtual ScaleEstimator<Base>
181{
182 virtual double EstimateScaleFromSortedData(const std::vector<Base> &data, const size_t stride = 1) override
183 {
184 auto size = data.size();
185 auto work = std::make_unique<Base[]>(3 * size);
186 auto intWork = std::make_unique<Base[]>(5 * size);
187 return Pn_from_sorted_data(data.data(), stride, size, work.get(), intWork.get());
188 }
189};
190
191
192template <typename Base = double>
193struct BiweightMidvariance : public virtual ScaleEstimator<Base>
194{
195 virtual double EstimateScaleFromSortedData(const std::vector<Base> &data, const size_t stride = 1) override
196 {
197 auto const size = data.size();
198 auto const theData = data.data();
199 auto work = std::make_unique<double[]>(size / stride);
200 auto const adjustedMad = 9.0 * Mathematics::GSLHelpers::gslMAD(theData, stride, size, work.get());
201 auto const median = this->locationEstimator.EstimateLocationFromSortedData(theData,
202 stride); //gslMedianFromSortedData(theData, stride, size);
203
204 auto const begin = Mathematics::GSLHelpers::make_strided_iter(data.cbegin(), stride);
205 auto const end = Mathematics::GSLHelpers::make_strided_iter(data.cend(), stride);
206
207 auto ys = std::vector<Base>(size / stride);
208 std::transform(begin, end, ys.begin(), [ = ](Base x)
209 {
210 return (x - median) / adjustedMad;
211 });
212 auto const indicator = [](Base y)
213 {
214 return std::fabs(y) < 1 ? 1 : 0;
215 };
216 auto const top = std::transform_reduce(begin, end, ys.begin(), 0, std::plus{},
217 [ = ](Base x, Base y)
218 {
219 return indicator(y) * pow(x - median, 2) * pow(1 - pow(y, 2), 4);
220 });
221 auto const bottomSum = std::transform_reduce(begin, end, 0, std::plus{},
222 [ = ](Base y)
223 {
224 return indicator(y) * (1.0 - pow(y, 2)) * (1.0 - 5.0 * pow(y, 2));
225 });
226 // The -1 is for Bessel's correction.
227 auto const bottom = bottomSum * (bottomSum - 1);
228
229 return (size / stride) * (top / bottom);
230 }
231
232 virtual double ConvertScaleToWeight(const double variance) override
233 {
234 return 1 / variance;
235 }
236};
237
238template<typename Base = double>
239struct LocationEstimator //: public virtual Estimator<Base>
240{
241 ScaleEstimator<Base> scaleEstimator;
242 virtual double EstimateLocation(std::vector<Base> data, const size_t stride = 1)
243 {
244 std::sort(Mathematics::GSLHelpers::make_strided_iter(data.begin(), stride),
245 Mathematics::GSLHelpers::make_strided_iter(data.end(), stride));
246 return EstimateLocationFromSortedData(data, stride);
247 }
248 virtual double EstimateLocation(const std::vector<Base> &data, const size_t stride = 1)
249 {
250 auto sorted = data;
251 std::sort(Mathematics::GSLHelpers::make_strided_iter(sorted.begin(), stride),
252 Mathematics::GSLHelpers::make_strided_iter(sorted.end(), stride));
253 return EstimateLocationFromSortedData(sorted, stride);
254 }
255 virtual double EstimateLocationFromSortedData(const std::vector<Base> &data, const size_t stride = 1) = 0;
256};
257
258template<typename Base = double>
259struct StatisticalCoEstimator : public virtual LocationEstimator<Base>, public virtual ScaleEstimator<Base>
260{
261 private:
262 LocationEstimator<Base> locationEstimator;
263 ScaleEstimator<Base> scaleEstimator;
264 public:
265 StatisticalCoEstimator(LocationEstimator<Base> locationEstimator, ScaleEstimator<Base> scaleEstimator)
266 : locationEstimator(locationEstimator), scaleEstimator(scaleEstimator) {}
267 virtual double EstimateLocationFromSortedData(const std::vector<Base> &data, const size_t stride = 1) override
268 {
269 auto const stats = EstimateStatisticsFromSortedData(data, stride);
270 return stats.location;
271 }
272 virtual double EstimateScaleFromSortedData(const std::vector<Base> &data, const size_t stride = 1) override
273 {
274 auto const stats = EstimateStatisticsFromSortedData(data, stride);
275 return stats.scale;
276 }
277 virtual SampleStatistics EstimateStatisticsFromSortedData(const std::vector<Base> &data, const size_t stride = 1)
278 {
279 auto const location = EstimateLocationFromSortedData(data, stride);
280 auto const scale = EstimateScaleFromSortedData(data, stride);
281 auto const weight = ConvertScaleToWeight(scale);
282
283 return SampleStatistics{location, scale, weight};
284 }
285};
286
287/**
288 * @short Computes a estimate of the statistical scale of the input sample.
289 *
290 * @param scaleMethod The estimator to use.
291 * @param data The sample to estimate the scale of.
292 * @param stride The stide of the data.
293 */
294template<typename Base = double>
295double ComputeScale(const ScaleCalculation scaleMethod, std::vector<Base> data,
296 const size_t stride = 1)
297{
298 if (scaleMethod != SCALE_VARIANCE)
299 std::sort(data.begin(), data.end());
300 return ComputeScaleFromSortedData(scaleMethod, data, stride);
301}
302//[[using gnu : pure]]
303template<typename Base = double>
304double ComputeScaleFromSortedData(const ScaleCalculation scaleMethod,
305 const std::vector<Base> &data,
306 const size_t stride = 1);
307/**
308 * @short Computes a estimate of the statistical location of the input sample.
309 *
310 * @param scaleMethod The estimator to use.
311 * @param data The sample to estimate the location of.
312 * @param sortedData The sorted data.
313 * @param trimAmount When using the trimmed mean estimate, the percentage quartile to remove either side.
314 * When using sigma clipping, the number of standard deviations a sample has to be from the mean
315 * to be considered an outlier.
316 * @param stride The stide of the data. Note that in cases other than the simple mean, the stride will only apply
317 * AFTER sorting.
318 */
319template<typename Base = double>
320double ComputeLocation(const LocationCalculation locationMethod, std::vector<Base> data,
321 const double trimAmount = 0.25, const size_t stride = 1)
322{
323 if (locationMethod != LOCATION_MEAN)
324 std::sort(data.begin(), data.end());
325 return ComputeLocationFromSortedData(locationMethod, data, trimAmount, stride);
326}
327
328//[[using gnu : pure]]
329template<typename Base = double>
330double ComputeLocationFromSortedData(const LocationCalculation locationMethod,
331 const std::vector<Base> &data, const double trimAmount = 0.25, const size_t stride = 1);
332
333/**
334 * @short Computes a weight for use in regression.
335 *
336 * @param scaleMethod The estimator to use.
337 * @param data The sample to estimate the scale of.
338 * @param sortedData The sorted data.
339 * @param stride The stide of the data. Note that in cases other than the simple variance, the stride will only
340 * apply AFTER sorting.
341 */
342template<typename Base = double>
343double ComputeWeight(const ScaleCalculation scaleMethod, std::vector<Base> data, const size_t stride = 1)
344{
345 if (scaleMethod != SCALE_VARIANCE)
346 std::sort(data.begin(), data.end());
347 return ComputeWeightFromSortedData(scaleMethod, data, stride);
348}
349//[[using gnu : pure]]
350template<typename Base = double>
351double ComputeWeightFromSortedData(const ScaleCalculation scaleMethod, const std::vector<Base> &data,
352 const size_t stride = 1)
353{
354 auto const scale = ComputeScaleFromSortedData(scaleMethod, data, stride);
355 return ConvertScaleToWeight(scaleMethod, scale);
356}
357
358SampleStatistics ComputeSampleStatistics(std::vector<double> data,
359 const LocationCalculation locationMethod = LOCATION_TRIMMEDMEAN,
360 const ScaleCalculation scaleMethod = SCALE_QESTIMATOR,
361 double trimAmount = 0.25,
362 const size_t stride = 1);
363
364[[using gnu : pure]]
365constexpr double ConvertScaleToWeight(const ScaleCalculation scaleMethod, double scale)
366{
367 // If the passed in scale is zero or near zero return a very small weight rather than infinity.
368 // This fixes the situation where, e.g. there is a single datapoint so scale is zero and
369 // weighting this datapoint with infinite weight is incorrect (and breaks the LM solver)
370 switch (scaleMethod)
371 {
372 // Variance, biweight midvariance are variances.
373 case SCALE_VARIANCE:
374 [[fallthrough]];
375 case SCALE_BWMV:
376 return (scale < 1e-10) ? 1e-10 : 1 / scale;
377 // MAD, Sn, Qn and Pn estimators are all calibrated to estimate the standard deviation of Gaussians.
378 case SCALE_MAD:
379 [[fallthrough]];
380 case SCALE_SESTIMATOR:
381 [[fallthrough]];
382 case SCALE_QESTIMATOR:
383 [[fallthrough]];
384 case SCALE_PESTIMATOR:
385 return (scale < 1e-10) ? 1e-10 : 1 / (scale * scale);
386 }
387 return -1;
388}
389
390
391} // namespace Mathematics
QVariant location(const QVariant &res)
const QList< QKeySequence > & begin()
const QList< QKeySequence > & end()
This file is part of the KDE documentation.
Documentation copyright © 1996-2024 The KDE developers.
Generated on Fri Sep 6 2024 11:56:56 by doxygen 1.12.0 written by Dimitri van Heesch, © 1997-2006

KDE's Doxygen guidelines are available online.