Kstars

robuststatistics.cpp
1/*
2 SPDX-FileCopyrightText: 2022 Sophie Taylor <sophie@spacekitteh.moe>
3
4 SPDX-License-Identifier: GPL-2.0-or-later
5*/
6
7#include "robuststatistics.h"
8#include <cmath>
9
10namespace Mathematics::RobustStatistics
11{
12using namespace Mathematics::GSLHelpers;
13
14// int64 code is currently deactiovated by GSLHELPERS_INT64. It doesn't compile on Mac becuase it
15// won't cast a long long * to a long * even though they are both 64bit pointers.
16// On 32bit systems this would be an issue because they are not the same.
17// If the int64 versions are required in future then this "problem" will need to
18// be resolved.
19
20
21template<typename Base = double>
22double Pn_from_sorted_data(const Base sorted_data[],
23 const size_t stride,
24 const size_t n,
25 Base work[],
26 int work_int[]);
27
28template<typename Base>
29double ComputeScaleFromSortedData(const ScaleCalculation scaleMethod,
30 const std::vector<Base> &data,
31 const size_t stride)
32{
33 auto const size = data.size();
34 auto const theData = data.data();
35 switch (scaleMethod)
36 {
37 // Compute biweight midvariance
38 case SCALE_BWMV:
39 {
40 auto work = std::make_unique<double[]>(size);
41 auto const adjustedMad = 9.0 * gslMAD(theData, stride, size, work.get());
42 auto const median = gslMedianFromSortedData(theData, stride, size);
43
44 auto const begin = Mathematics::GSLHelpers::make_strided_iter(data.cbegin(), stride);
45 auto const end = Mathematics::GSLHelpers::make_strided_iter(data.cend(), stride);
46
47 auto ys = std::vector<Base>(size / stride);
48 std::transform(begin, end, ys.begin(), [ = ](Base x)
49 {
50 return (x - median) / adjustedMad;
51 });
52 auto const indicator = [](Base y)
53 {
54 return std::fabs(y) < 1 ? 1 : 0;
55 };
56 auto const top = std::transform_reduce(begin, end, ys.begin(), 0, std::plus{},
57 [ = ](Base x, Base y)
58 {
59 return indicator(y) * pow(x - median, 2) * pow(1 - pow(y, 2), 4);
60 });
61 auto const bottomSum = std::transform_reduce(begin, end, 0, std::plus{},
62 [ = ](Base y)
63 {
64 return indicator(y) * (1.0 - pow(y, 2)) * (1.0 - 5.0 * pow(y, 2));
65 });
66 // The -1 is for Bessel's correction.
67 auto const bottom = bottomSum * (bottomSum - 1);
68
69 return (size / stride) * (top / bottom);
70
71 }
72 case SCALE_MAD:
73 {
74 auto work = std::make_unique<double[]>(size);
75 return gslMAD(theData, stride, size, work.get());
76 }
77 case SCALE_SESTIMATOR:
78 {
79 auto work = std::make_unique<Base[]>(size);
80 return gslSnFromSortedData(theData, stride, size, work.get());
81 }
82 case SCALE_QESTIMATOR:
83 {
84 auto work = std::make_unique<Base[]>(3 * size);
85 auto intWork = std::make_unique<int[]>(5 * size);
86
87 return gslQnFromSortedData(theData, stride, size, work.get(), intWork.get());
88 }
89 case SCALE_PESTIMATOR:
90 {
91 auto work = std::make_unique<Base[]>(3 * size);
92 auto intWork = std::make_unique<int[]>(5 * size);
93
94 return Pn_from_sorted_data(theData, stride, size, work.get(), intWork.get());
95 }
96 case SCALE_VARIANCE:
97 [[fallthrough]];
98 default:
99 return gslVariance(theData, stride, size);
100 }
101}
102
103
104template double ComputeScaleFromSortedData(const ScaleCalculation scaleMethod,
105 const std::vector<double> &data,
106 const size_t stride);
107template double ComputeScaleFromSortedData(const ScaleCalculation scaleMethod,
108 const std::vector<float> &data,
109 const size_t stride);
110template double ComputeScaleFromSortedData(const ScaleCalculation scaleMethod,
111 const std::vector<uint8_t> &data,
112 const size_t stride);
113template double ComputeScaleFromSortedData(const ScaleCalculation scaleMethod,
114 const std::vector<uint16_t> &data,
115 const size_t stride);
116template double ComputeScaleFromSortedData(const ScaleCalculation scaleMethod,
117 const std::vector<int16_t> &data,
118 const size_t stride);
119template double ComputeScaleFromSortedData(const ScaleCalculation scaleMethod,
120 const std::vector<uint32_t> &data,
121 const size_t stride);
122template double ComputeScaleFromSortedData(const ScaleCalculation scaleMethod,
123 const std::vector<int32_t> &data,
124 const size_t stride);
125#ifdef GSLHELPERS_INT64
126template double ComputeScaleFromSortedData(const ScaleCalculation scaleMethod,
127 const std::vector<int64_t> &data,
128 const size_t stride);
129#endif
130
131template<typename Base> double ComputeLocationFromSortedData(
132 const LocationCalculation locationMethod,
133 const std::vector<Base> &data,
134 double trimAmount, const size_t stride)
135{
136 auto const size = data.size();
137 auto const theData = data.data();
138
139 switch (locationMethod)
140 {
141 case LOCATION_MEDIAN:
142 {
143 return gslMedianFromSortedData(theData, stride, size);
144 }
145 case LOCATION_TRIMMEDMEAN:
146 {
147 return gslTrimmedMeanFromSortedData(trimAmount, theData, stride, size);
148 }
149 case LOCATION_GASTWIRTH:
150 {
151 return gslGastwirthMeanFromSortedData(theData, stride, size);
152 }
153 // TODO: Parameterise
154 case LOCATION_SIGMACLIPPING:
155 {
156 auto const median = gslMedianFromSortedData(theData, stride, size);
157 if (data.size() > 3)
158 {
159 auto const stddev = gslStandardDeviation(theData, stride, size);
160 auto begin = GSLHelpers::make_strided_iter(data.cbegin(), stride);
161 auto end = GSLHelpers::make_strided_iter(data.cend(), stride);
162
163 // Remove samples over 2 standard deviations away.
164 auto lower = std::lower_bound(begin, end, (median - stddev * trimAmount));
165 auto upper = std::upper_bound(lower, end, (median + stddev * trimAmount));
166
167 // New mean
168 auto sum = std::reduce(lower, upper);
169 const int num_remaining = std::distance(lower, upper);
170 if (num_remaining > 0) return sum / num_remaining;
171 }
172 return median;
173 }
174 case LOCATION_MEAN:
175 [[fallthrough]];
176 default:
177 return gslMean(theData, stride, size);
178
179 }
180}
181
182template double ComputeLocationFromSortedData(const LocationCalculation scaleMethod,
183 const std::vector<double> &data,
184 double trimAmount, const size_t stride);
185template double ComputeLocationFromSortedData(const LocationCalculation scaleMethod,
186 const std::vector<float> &data,
187 double trimAmount, const size_t stride);
188template double ComputeLocationFromSortedData(const LocationCalculation scaleMethod,
189 const std::vector<uint8_t> &data,
190 double trimAmount, const size_t stride);
191template double ComputeLocationFromSortedData(const LocationCalculation scaleMethod,
192 const std::vector<uint16_t> &data,
193 double trimAmount, const size_t stride);
194template double ComputeLocationFromSortedData(const LocationCalculation scaleMethod,
195 const std::vector<int16_t> &data,
196 double trimAmount, const size_t stride);
197template double ComputeLocationFromSortedData(const LocationCalculation scaleMethod,
198 const std::vector<uint32_t> &data,
199 double trimAmount, const size_t stride);
200template double ComputeLocationFromSortedData(const LocationCalculation scaleMethod,
201 const std::vector<int32_t> &data,
202 double trimAmount, const size_t stride);
203#ifdef GSLHELPERS_INT64
204template double ComputeLocationFromSortedData(const LocationCalculation scaleMethod,
205 const std::vector<int64_t> &data,
206 double trimAmount, const size_t stride);
207#endif
208
209
210SampleStatistics ComputeSampleStatistics(std::vector<double> data,
211 const RobustStatistics::LocationCalculation locationMethod,
212 const RobustStatistics::ScaleCalculation scaleMethod,
213 double trimAmount,
214 const size_t stride)
215{
216 std::sort(data.begin(), data.end());
217 double location = RobustStatistics::ComputeLocationFromSortedData(locationMethod, data, trimAmount, stride);
218 double scale = RobustStatistics::ComputeScaleFromSortedData(scaleMethod, data, stride);
219 double weight = RobustStatistics::ConvertScaleToWeight(scaleMethod, scale);
220 return RobustStatistics::SampleStatistics{location, scale, weight};
221}
222
223/*
224 Algorithm to compute the weighted high median in O(n) time.
225 The whimed is defined as the smallest a[j] such that the sum
226 of the weights of all a[i] <= a[j] is strictly greater than
227 half of the total weight.
228 Arguments:
229 a: double array containing the observations
230 n: number of observations
231 w: array of (int/double) weights of the observations.
232*/
233
234template<typename Base = double> Base whimed(Base* a, int * w, int n, Base* a_cand, Base* a_srt, int * w_cand)
235{
236 int n2, i, kcand;
237 /* sum of weights: `int' do overflow when n ~>= 1e5 */
238 int64_t wleft, wmid, wright, w_tot, wrest;
239
240 Base trial;
241
242 w_tot = 0;
243 for (i = 0; i < n; ++i)
244 w_tot += w[i];
245
246 wrest = 0;
247
248 /* REPEAT : */
249 do
250 {
251 for (i = 0; i < n; ++i)
252 a_srt[i] = a[i];
253
254 n2 = n / 2; /* =^= n/2 +1 with 0-indexing */
255
256 gslSort(a_srt, 1, n);
257
258 trial = a_srt[n2];
259
260 wleft = 0;
261 wmid = 0;
262 wright = 0;
263
264 for (i = 0; i < n; ++i)
265 {
266 if (a[i] < trial)
267 wleft += w[i];
268 else if (a[i] > trial)
269 wright += w[i];
270 else
271 wmid += w[i];
272 }
273
274 /* wleft = sum_{i; a[i] < trial} w[i]
275 * wmid = sum_{i; a[i] == trial} w[i] at least one 'i' since trial is one a[]!
276 * wright= sum_{i; a[i] > trial} w[i]
277 */
278 kcand = 0;
279 if (2 * (wrest + wleft) > w_tot)
280 {
281 for (i = 0; i < n; ++i)
282 {
283 if (a[i] < trial)
284 {
285 a_cand[kcand] = a[i];
286 w_cand[kcand] = w[i];
287 ++kcand;
288 }
289 }
290 }
291 else if (2 * (wrest + wleft + wmid) <= w_tot)
292 {
293 for (i = 0; i < n; ++i)
294 {
295 if (a[i] > trial)
296 {
297 a_cand[kcand] = a[i];
298 w_cand[kcand] = w[i];
299 ++kcand;
300 }
301 }
302
303 wrest += wleft + wmid;
304 }
305 else
306 {
307 return trial;
308 }
309
310 n = kcand;
311 for (i = 0; i < n; ++i)
312 {
313 a[i] = a_cand[i];
314 w[i] = w_cand[i];
315 }
316 }
317 while(1);
318}
319
320
321/*
322gsl_stats_Qn0_from_sorted_data()
323 Efficient algorithm for the scale estimator:
324 Q_n0 = { |x_i - x_j|; i<j }_(k) [ = Qn without scaling ]
325i.e. the k-th order statistic of the |x_i - x_j|, where:
326k = (floor(n/2) + 1 choose 2)
327Inputs: sorted_data - sorted array containing the observations
328 stride - stride
329 n - length of 'sorted_data'
330 work - workspace of length 3n of type BASE
331 work_int - workspace of length 5n of type int
332Return: Q_n statistic (without scale/correction factor); same type as input data
333*/
334
335template<typename Base = double> double Pn0k_from_sorted_data(const Base sorted_data[],
336 const double k,
337 const size_t stride,
338 const size_t n,
339 Base work[],
340 int work_int[])
341{
342 const int ni = (int) n;
343 Base * a_srt = &work[n];
344 Base * a_cand = &work[2 * n];
345
346 int *left = &work_int[0];
347 int *right = &work_int[n];
348 int *p = &work_int[2 * n];
349 int *q = &work_int[3 * n];
350 int *weight = &work_int[4 * n];
351
352 Base trial = (Base)0.0;
353 int found = 0;
354
355 int h, i, j, jh;
356
357 /* following should be `long long int' : they can be of order n^2 */
358 int64_t knew, nl, nr, sump, sumq;
359
360 /* check for quick return */
361 if (n < 2)
362 return (Base)0.0;
363
364 h = n / 2 + 1;
365
366 for (i = 0; i < ni; ++i)
367 {
368 left[i] = ni - i + 1;
369 right[i] = (i <= h) ? ni : ni - (i - h);
370
371 /* the n - (i-h) is from the paper; original code had `n' */
372 }
373
374 nl = (int64_t)n * (n + 1) / 2;
375 nr = (int64_t)n * n;
376 knew = k + nl;/* = k + (n+1 \over 2) */
377
378 /* L200: */
379 while (!found && nr - nl > ni)
380 {
381 j = 0;
382 /* Truncation to float : try to make sure that the same values are got later (guard bits !) */
383 for (i = 1; i < ni; ++i)
384 {
385 if (left[i] <= right[i])
386 {
387 weight[j] = right[i] - left[i] + 1;
388 jh = left[i] + weight[j] / 2;
389 work[j] = (sorted_data[i * stride] + sorted_data[(ni - jh) * stride]) / 2 ;
390 ++j;
391 }
392 }
393
394 trial = whimed(work, weight, j, a_cand, a_srt, /*iw_cand*/ p);
395
396 j = 0;
397 for (i = ni - 1; i >= 0; --i)
398 {
399 while (j < ni && ((double)(sorted_data[i * stride] + sorted_data[(ni - j - 1) * stride])) / 2 < trial)
400 ++j;
401
402 p[i] = j;
403 }
404
405 j = ni + 1;
406 for (i = 0; i < ni; ++i)
407 {
408 while ((double)(sorted_data[i * stride] + sorted_data[(ni - j + 1) * stride]) / 2 > trial)
409 --j;
410
411 q[i] = j;
412 }
413
414 sump = 0;
415 sumq = 0;
416
417 for (i = 0; i < ni; ++i)
418 {
419 sump += p[i];
420 sumq += q[i] - 1;
421 }
422
423 if (knew <= sump)
424 {
425 for (i = 0; i < ni; ++i)
426 right[i] = p[i];
427
428 nr = sump;
429 }
430 else if (knew > sumq)
431 {
432 for (i = 0; i < ni; ++i)
433 left[i] = q[i];
434
435 nl = sumq;
436 }
437 else /* sump < knew <= sumq */
438 {
439 found = 1;
440 }
441 } /* while */
442
443 if (found)
444 {
445 return trial;
446 }
447 else
448 {
449 j = 0;
450 for (i = 1; i < ni; ++i)
451 {
452 int jj;
453
454 for (jj = left[i]; jj <= right[i]; ++jj)
455 {
456 work[j] = (sorted_data[i * stride] + sorted_data[(ni - jj) * stride]) / 2;
457 j++;
458 }/* j will be = sum_{i=2}^n ((right[i] + left[i] + 1)_{+})/2 */
459 }
460
461 /* return pull(work, j - 1, knew - nl) : */
462 knew -= (nl + 1); /* -1: 0-indexing */
463
464 /* sort work array */
465 gslSort(work, 1, j);
466
467 return (work[knew]);
468 }
469}
470
471
472template<typename Base>
473double Pn_from_sorted_data(const Base sorted_data[],
474 const size_t stride,
475 const size_t n,
476 Base work[],
477 int work_int[])
478{
479 if (n <= 2)
480 return sqrt(gslVariance(sorted_data, stride, n));
481
482 double upper = Pn0k_from_sorted_data(sorted_data, (3.0 / 4.0), stride, n, work, work_int);
483 double lower = Pn0k_from_sorted_data(sorted_data, (1.0 / 4.0), stride, n, work, work_int);
484
485 const double asymptoticCorrection = 1 / 0.9539;
486
487 auto const asymptoticEstimate = asymptoticCorrection * (upper - lower);
488
489 static double correctionFactors[] = {1.128, 1.303, 1.109, 1.064, 1.166, 1.103, 1.087, 1.105, 1.047, 1.063, 1.057,
490 1.040, 1.061, 1.047, 1.043, 1.048, 1.031, 1.037, 1.035, 1.028, 1.036, 1.030,
491 1.029, 1.032, 1.023, 1.025, 1.024, 1.021, 1.026, 1.022, 1.021, 1.023, 1.018,
492 1.020, 1.019, 1.017, 1.020, 1.018, 1.017, 1.018, 1.015, 1.016, 1.016, 1.014,
493 1.016, 1.015, 1.014, 1.015
494 };
495
496 if (n <= 42)
497 {
499 }
500 else
501 {
502 return asymptoticEstimate * (n / (n - 0.7));
503 }
504
505}
506
507
508} // namespace Mathematics
QVariant location(const QVariant &res)
const QList< QKeySequence > & begin()
const QList< QKeySequence > & end()
QTextStream & left(QTextStream &stream)
QTextStream & right(QTextStream &stream)
This file is part of the KDE documentation.
Documentation copyright © 1996-2024 The KDE developers.
Generated on Fri May 3 2024 11:49:50 by doxygen 1.10.0 written by Dimitri van Heesch, © 1997-2006

KDE's Doxygen guidelines are available online.