author | Andreas Thurnherr <ant@ldeo.columbia.edu> |
Mon, 13 Apr 2020 11:06:22 -0400 | |
changeset 40 | c1803ae2540f |
parent 39 | 56bdfe65a697 |
child 47 | dde46143288c |
permissions | -rw-r--r-- |
0 | 1 |
#====================================================================== |
39 | 2 |
# . . / L I B / L I B S T A T S . P L |
0 | 3 |
# doc: Wed Mar 24 13:59:27 1999 |
40 | 4 |
# dlm: Tue Mar 26 12:59:32 2019 |
0 | 5 |
# (c) 1999 A.M. Thurnherr |
40 | 6 |
# uE-Info: 50 31 NIL 0 0 72 0 2 4 NIL ofnI |
0 | 7 |
#====================================================================== |
8 |
||
9 |
# HISTORY: |
|
10 |
# Mar 24, 1999: - created for the ANO paper |
|
11 |
# Mar 27, 1999: - extended |
|
12 |
# Sep 18, 1999: - argument typechecking |
|
13 |
# Sep 30, 1999: - added gauss() |
|
14 |
# Oct 04, 1999: - moved gauss() to [./libfuns.pl] (changed from specfuns) |
|
15 |
# Oct 20, 1999: - changed, 'cause I understand it better |
|
16 |
# Oct 21, 1999: - changed &Fishers_z to &r2z(); added &z2r() |
|
17 |
# - added &sig_rr(), &sig_rrtrue |
|
18 |
# Jan 22, 2002: - added N(), avg(), stddev(), min(), max() |
|
19 |
# Feb 27, 2006: - adjusted median() for compat with NR (even # of points) |
|
20 |
# - added medianF() |
|
21 |
# Jun 27, 2006: - added medianFNaN() |
|
22 |
# Jul 1, 2006: - Version 3.3 [HISTORY] |
|
23 |
# - made median respect nan based on perlfunc(1) |
|
24 |
# Apr 25, 2008: - added &bootstrap() |
|
25 |
# Oct 24, 2010: - replaced grep { $_ == $_ } by grep { numberp($_) } everywhere |
|
26 |
# - added &fixLowSampStat() |
|
27 |
# Nov 5, 2010: - added std (stderr would have been better but that's used in libPOSIX.pl |
|
28 |
# Dec 18, 2010: - added stddev2, mad, mad2 |
|
29 |
# Dec 31, 2010: - added rms() |
|
30 |
# Jul 2, 2011: - added mad2F() |
|
31 |
# Mar 10, 2012: - medianF() -> medianAnts_(); mad2F() -> mad2Ants_() |
|
32 |
# - added sum() |
|
33 |
# Apr 26, 2012: - BUG: std() did not allow nan as stddev input |
|
3 | 34 |
# Oct 15, 2012: - added max_i(), min_i() |
5 | 35 |
# Nov 24, 2013: - renamed N to ndata |
36 |
# - added fdiff() |
|
10 | 37 |
# Jan 30, 2015: - added log_avg() |
38 |
# - added noise_avg() |
|
39 | 39 |
# Mar 26, 2019: - added regress() |
0 | 40 |
|
41 |
require "$ANTS/libfuns.pl"; |
|
42 |
||
43 |
#---------------------------------------------------------------------- |
|
39 | 44 |
# simple linear regression |
45 |
# - returns regression coefficient (slope) |
|
46 |
#---------------------------------------------------------------------- |
|
47 |
||
48 |
sub regress(@) |
|
49 |
{ |
|
50 |
my($n) = @_/2; |
|
51 |
return nan unless ($n > 1); |
|
52 |
||
53 |
my(@X) = @_[0..$n-1]; |
|
54 |
my(@Y) = @_[$n..2*$n-1]; |
|
55 |
||
56 |
my($sumx) = 0; |
|
57 |
for (my($i)=0; $i<$n; $i++) { $sumx += $X[$i]; } |
|
58 |
my($midx) = $sumx / @Y; |
|
59 |
my($rcoeff) = my($sumxdsq) = 0; |
|
60 |
for (my($i)=0; $i<$n; $i++) { |
|
61 |
my($xd) = $X[$i] - $midx; |
|
62 |
$sumxdsq += $xd**2; |
|
63 |
$rcoeff += $xd * $Y[$i]; |
|
64 |
} |
|
65 |
$rcoeff /= $sumxdsq; |
|
66 |
return $rcoeff; |
|
67 |
} |
|
68 |
||
69 |
#---------------------------------------------------------------------- |
|
0 | 70 |
# estimate stderr given stddev & degrees of freedom |
71 |
# - return nan for dof <= 0 |
|
72 |
#---------------------------------------------------------------------- |
|
73 |
||
74 |
sub std(@) |
|
75 |
{ |
|
76 |
my($sig,$dof) = |
|
77 |
&antsFunUsage(2,".c","stddev, deg_of_freedom",@_); |
|
78 |
return nan unless ($dof > 0); |
|
79 |
return $sig / sqrt($dof); |
|
80 |
} |
|
81 |
||
82 |
#---------------------------------------------------------------------- |
|
83 |
# calc standard stats from vector of vals |
|
5 | 84 |
# - used, e.g., in [rfilt] |
0 | 85 |
#---------------------------------------------------------------------- |
86 |
||
87 |
sub min(@) |
|
88 |
{ |
|
89 |
my($min) = 9e99; |
|
90 |
for (my($i)=0; $i<=$#_; $i++) { |
|
91 |
$min = $_[$i] if (numberp($_[$i]) && $_[$i] < $min); |
|
92 |
} |
|
93 |
return $min<9e99 ? $min : nan; |
|
94 |
} |
|
95 |
||
3 | 96 |
sub min_i(@) |
97 |
{ |
|
98 |
my($min) = 9e99; |
|
99 |
my($min_i); |
|
100 |
||
101 |
for (my($i)=0; $i<=$#_; $i++) { |
|
102 |
$min_i=$i,$min=$_[$i] if (numberp($_[$i]) && $_[$i] < $min); |
|
103 |
} |
|
104 |
return $min<9e99 ? $min_i : nan; |
|
105 |
} |
|
106 |
||
0 | 107 |
sub max(@) |
108 |
{ |
|
109 |
my($max) = -9e99; |
|
110 |
for (my($i)=0; $i<=$#_; $i++) { |
|
111 |
$max = $_[$i] if (numberp($_[$i]) && $_[$i] > $max); |
|
112 |
} |
|
113 |
return $max>-9e99 ? $max : nan; |
|
114 |
} |
|
115 |
||
3 | 116 |
sub max_i(@) |
117 |
{ |
|
118 |
my($max) = -9e99; |
|
119 |
my($max_i); |
|
120 |
||
121 |
for (my($i)=0; $i<=$#_; $i++) { |
|
122 |
$max_i=$i,$max=$_[$i] if (numberp($_[$i]) && $_[$i] > $max); |
|
123 |
} |
|
124 |
return $max>-9e99 ? $max_i : nan; |
|
125 |
} |
|
126 |
||
5 | 127 |
sub ndata(@) |
0 | 128 |
{ |
129 |
my($N) = 0; |
|
130 |
for (my($i)=0; $i<=$#_; $i++) { $N++ if (numberp($_[$i])); } |
|
131 |
return $N; |
|
132 |
} |
|
133 |
||
134 |
sub sum(@) |
|
135 |
{ |
|
136 |
my($N) = my($sum) = 0; |
|
4
ff72b00b4342
after adding $pi and some other stuff
A.M. Thurnherr <athurnherr@yahoo.com>
parents:
3
diff
changeset
|
137 |
@_ = split(/\s+/,$_[0]) if (@_ == 1 && !numberp($_[0])); |
0 | 138 |
for (my($i)=0; $i<=$#_; $i++) { $N++,$sum+=$_[$i] if (numberp($_[$i])); } |
139 |
return ($N>0)?$sum:nan; |
|
140 |
} |
|
141 |
||
142 |
sub avg(@) |
|
143 |
{ |
|
144 |
my($N) = my($sum) = 0; |
|
145 |
for (my($i)=0; $i<=$#_; $i++) { $N++,$sum+=$_[$i] if (numberp($_[$i])); } |
|
146 |
return ($N>0)?$sum/$N:nan; |
|
147 |
} |
|
148 |
||
10 | 149 |
#---------------------------------------------------------------------- |
150 |
# log_avg does the average in log space, which may be useful |
|
151 |
# for log-normal distributions. However, when I tired this |
|
152 |
# in response to a reviewer suggestion, the results were |
|
153 |
# noticably but not significantly worse, even though at least |
|
154 |
# some of the distributions looked quite log-normal. |
|
155 |
# It appears, however, that 32 samples are enough to sample |
|
156 |
# the distribution of dissipation adequately. |
|
157 |
#---------------------------------------------------------------------- |
|
158 |
||
159 |
sub log_avg(@) # exp(avg(log(x))) |
|
160 |
{ |
|
161 |
my($N) = my($sum) = 0; |
|
162 |
for (my($i)=0; $i<=$#_; $i++) { $N++,$sum+=log($_[$i]) if (numberp($_[$i])); } |
|
163 |
return ($N>0)?exp($sum/$N):nan; |
|
164 |
} |
|
165 |
||
166 |
#---------------------------------------------------------------------- |
|
167 |
# noise_avg(noise_lvl,frac) calculates an arithmetic average after |
|
168 |
# setting all samples below noise level to a certain fraction of the |
|
169 |
# noise level. 66% was used for the VKE paper. |
|
170 |
# Requires $noise_avg to be set, e.g. with &antsFunOpt() |
|
171 |
#---------------------------------------------------------------------- |
|
172 |
||
173 |
sub noise_avg(@) |
|
174 |
{ |
|
175 |
my($nlv,$frac) = split(',',$noise_avg); |
|
176 |
croak("libstats.pl: noise_avg: cannot decode \$noise_avg = <$noise_avg> <$nlv,$frac> (noise level, fraction)\n") |
|
177 |
unless ($nlv > 0) && ($frac < 1); |
|
178 |
my($nsamp) = my($sum) = 0; |
|
179 |
for (my($i)=0; $i<=$#_; $i++) { |
|
180 |
if (numberp($_[$i])) { |
|
181 |
$nsamp++; |
|
182 |
if ($_[$i] > $nlv) { |
|
183 |
$sum += $_[$i]; |
|
184 |
} else { |
|
185 |
$sum += $frac*$nlv; |
|
186 |
} |
|
187 |
} |
|
188 |
} |
|
189 |
return ($nsamp > 0) ? $sum/$nsamp : nan; |
|
190 |
} |
|
191 |
||
192 |
||
0 | 193 |
sub stddev2(@) # avg, val, val, val, ... |
194 |
{ |
|
195 |
my($N) = my($sum) = 0; |
|
196 |
for (my($i)=1; $i<=$#_; $i++) { |
|
197 |
$N++,$sum+=($_[0]-$_[$i])**2 if (numberp($_[$i])); |
|
198 |
} |
|
199 |
return ($N>1)?sqrt($sum/($N-1)):nan; |
|
200 |
} |
|
201 |
||
202 |
sub stddev(@) |
|
203 |
{ |
|
204 |
my($avg) = &avg(@_); |
|
205 |
return numberp($avg) ? stddev2($avg,@_) : nan; |
|
206 |
} |
|
207 |
||
208 |
sub rms(@) |
|
209 |
{ |
|
210 |
my($N) = my($sum) = 0; |
|
211 |
for (my($i)=0; $i<=$#_; $i++) { $N++,$sum+=$_[$i]**2 if (numberp($_[$i])); } |
|
212 |
return ($N>0)?sqrt($sum/$N):nan; |
|
213 |
} |
|
214 |
||
215 |
sub median(@) |
|
216 |
{ |
|
217 |
my(@svals) = sort {$a <=> $b} grep { numberp($_) } @_; |
|
218 |
return nan if (@svals == 0); |
|
219 |
return (@svals & 1) ? |
|
220 |
$svals[$#svals/2] : |
|
221 |
0.5 * ($svals[$#svals/2] + $svals[$#svals/2+1]); |
|
222 |
} |
|
223 |
||
224 |
sub medianAnts_($) |
|
225 |
{ |
|
226 |
my($fnr) = @_; |
|
227 |
my(@svals) = sort {@{$a}[$fnr] <=> @{$b}[$fnr]} grep { numberp(@{$_}[$fnr]) } @ants_; |
|
228 |
return nan if (@svals == 0); |
|
229 |
return (@svals & 1) ? |
|
230 |
$svals[$#svals/2][$fnr] : |
|
231 |
0.5 * ($svals[$#svals/2][$fnr] + $svals[$#svals/2+1][$fnr]); |
|
232 |
} |
|
233 |
||
234 |
sub mad2(@) # avg, val, val, val, ... |
|
235 |
{ |
|
236 |
my($N) = my($sum) = 0; |
|
237 |
for (my($i)=1; $i<=$#_; $i++) { |
|
238 |
$N++,$sum+=abs($_[0]-$_[$i]) if (numberp($_[$i])); |
|
239 |
} |
|
240 |
return ($N>0)?sqrt($sum/$N):nan; |
|
241 |
} |
|
242 |
||
243 |
sub mad(@) |
|
244 |
{ |
|
245 |
my($median) = &median(@_); |
|
246 |
return numberp($median) ? mad2($median,@_) : nan; |
|
247 |
} |
|
248 |
||
249 |
sub mad2Ants_($$) |
|
250 |
{ |
|
251 |
my($median,$fnr) = @_; |
|
252 |
my($sum,$n); |
|
253 |
||
254 |
for (my($r)=0; $r<@ants_; $r++) { |
|
255 |
next unless numberp($ants_[$r][$fnr]); |
|
256 |
$sum += abs($median - $ants_[$r][$fnr]); |
|
257 |
$n++ |
|
258 |
} |
|
259 |
return ($n>0) ? $sum/$n : nan; |
|
260 |
} |
|
261 |
||
5 | 262 |
sub fdiff(@) # finite difference, e.g. for [rfilt] |
263 |
{ |
|
264 |
return (numberp($_[0]) && numberp($_[$#_])) ? $_[$#_] - $_[0] : nan; |
|
265 |
} |
|
266 |
||
267 |
||
0 | 268 |
#---------------------------------------------------------------------- |
269 |
# &bootstrap(nDraw,cLim,statFun,val[,...]) |
|
270 |
# nDraw number of synthetic samples to draw |
|
271 |
# cLim confidence limit (e.g. 0.95) |
|
272 |
# statFun pointer to stats function |
|
273 |
# val[,...] data values |
|
274 |
# |
|
275 |
# e.g. bootstrap(1000,.5,\&avg,1,2,1000) |
|
276 |
#---------------------------------------------------------------------- |
|
277 |
||
278 |
sub bootstrap($$$@) |
|
279 |
{ |
|
280 |
my($nDraw,$cLim,$statFun,@vals) = @_; |
|
281 |
my(@sv,@stats); |
|
282 |
||
283 |
for (my($s)=0; $s<$nDraw; $s++) { |
|
284 |
for (my($i)=0; $i<@vals; $i++) { |
|
285 |
$sv[$i] = $vals[int(rand(@vals))]; |
|
286 |
} |
|
287 |
$stats[$s] = &$statFun(@sv); |
|
288 |
} |
|
289 |
@stats = sort {$a <=> $b} grep { numberp($_) } @stats; |
|
290 |
my($cli) = int($nDraw*(1-$cLim)/2); |
|
291 |
return ($stats[$cli],$stats[$#stats-$cli]); |
|
292 |
} |
|
293 |
||
294 |
#---------------------------------------------------------------------- |
|
295 |
# &fixLowSampStat(statRef,nsamp[]) |
|
296 |
# - replace stat (variance, stddev, stderr) based on small (<10) samples |
|
297 |
# with median calculated from all stats |
|
298 |
# - median of all stats is chosen to allow routine to work even if all |
|
299 |
# stats are based on small samples |
|
300 |
#---------------------------------------------------------------------- |
|
301 |
||
302 |
sub fixLowSampStat($@) |
|
303 |
{ |
|
304 |
my($statR,@nsamp) = @_; |
|
305 |
||
306 |
my($medStat) = median(@{$statR}); |
|
307 |
for (my($i)=0; $i<@{$statR}; $i++) { |
|
308 |
$statR->[$i] = $medStat |
|
309 |
unless ($nsamp[$i]>=10 || !defined($statR->[$i]) || $statR->[$i]>$medStat); |
|
310 |
} |
|
311 |
} |
|
312 |
||
313 |
#---------------------------------------------------------------------- |
|
314 |
# significance of difference of means (NR, 2nd ed, 14.2) |
|
315 |
#---------------------------------------------------------------------- |
|
316 |
||
317 |
sub Students_t(@) |
|
318 |
{ |
|
319 |
my($mu1,$sig1,$N1,$mu2,$sig2,$N2) = |
|
320 |
&antsFunUsage(6,"ffcffc","mu1, sigma1, N1, mu2, sigma2, N2",@_); |
|
321 |
my($var1) = $sig1 * $sig1; |
|
322 |
my($var2) = $sig2 * $sig2; |
|
323 |
my($sd) = sqrt($var1 + $var2 / ($N1+$N2-2) * (1/$N1 + 1/$N2)); |
|
324 |
return ($mu1-$mu2) / $sd; |
|
325 |
} |
|
326 |
||
327 |
sub slevel_mudiff1(@) |
|
328 |
{ |
|
329 |
my($mu1,$sig1,$N1,$mu2,$sig2,$N2) = |
|
330 |
&antsFunUsage(6,"ffcffc","mean1, sqrt(var1), N1, mean2, sqrt(var2), N2",@_); |
|
331 |
my($df) = $N1 + $N2 - 2; |
|
332 |
return &betai(0.5*$df,0.5, |
|
333 |
$df/($df + &Students_t(mu1,$sig1,$N1,$mu2,$sig2,$N2)**2)); |
|
334 |
} |
|
335 |
||
336 |
#---------------------------------------------------------------------- |
|
337 |
# significance of correlation coefficient (NR, 2nd ed, 14.5) |
|
338 |
#---------------------------------------------------------------------- |
|
339 |
||
340 |
sub slevel_r(@) |
|
341 |
{ |
|
342 |
my($r,$N) = &antsFunUsage(2,"fc","r, N",@_); |
|
343 |
return &erfcc(abs($r) * sqrt($N/2)); |
|
344 |
} |
|
345 |
||
346 |
#---------------------------------------------------------------------- |
|
347 |
# significance of difference btw two measured correlation coeffs |
|
348 |
# using Fisher's z (from NR, 2nd ed, 14.5). NB: averaging correlation |
|
349 |
# coefficients is done using [avgr] |
|
350 |
# NB: significance level is only good if correlated variables form |
|
351 |
# a binormal distribution |
|
352 |
#---------------------------------------------------------------------- |
|
353 |
||
354 |
sub r2z(@) |
|
355 |
{ |
|
356 |
my($r) = &antsFunUsage(1,"f","<r>",@_); |
|
357 |
return 0.5 * log((1+$r)/(1-$r)); |
|
358 |
} |
|
359 |
||
360 |
sub z2r(@) |
|
361 |
{ |
|
362 |
my($z) = &antsFunUsage(1,"f","<z>",@_); |
|
363 |
my($e) = exp(2*$z); |
|
364 |
return ($e-1) / ($e+1); |
|
365 |
} |
|
366 |
||
367 |
sub slevel_rrtrue(@) |
|
368 |
{ |
|
369 |
my($r,$N,$rtrue) = &antsFunUsage(3,"fcf","r, N, r_true",@_); |
|
370 |
croak("$0 (libstats.pl): N (=$N) < 10 in &slevel_rrtrue()\n") |
|
371 |
if ($N < 10); |
|
372 |
return &erfcc(abs(&r2z($r) - (&r2z($rtrue) + $rtrue/(2*$N-2))) * |
|
373 |
sqrt($N - 3) / sqrt(2)); |
|
374 |
} |
|
375 |
||
376 |
sub slevel_zz(@) |
|
377 |
{ |
|
378 |
my($z1,$N1,$z2,$N2) = &antsFunUsage(4,"fcfc","z1, N1, z2, N2",@_); |
|
379 |
croak("$0 (libstats.pl): N (=$N1,$N2) < 10 in &slevel_zz()\n") |
|
380 |
if ($N1 < 10 || $N2 < 10); |
|
381 |
return &erfcc(abs($z1-$z2) / sqrt(2/($N1-3) + 2/($N2-3))); |
|
382 |
} |
|
383 |
||
384 |
sub slevel_rr(@) |
|
385 |
{ |
|
386 |
my($r1,$N1,$r2,$N2) = &antsFunUsage(4,"fcfc","r1, N1, r2, N2",@_); |
|
387 |
return &slevel_zz(&r2z($r1),$N1,&r2z($r2),$N2); |
|
388 |
} |
|
389 |
||
390 |
#---------------------------------------------------------------------- |
|
391 |
# significance of difference btw two measured correlation coeffs |
|
392 |
# from brookes+dick63, p.216f |
|
393 |
# NB: result is returned as ratio of difference in Fisher's z to |
|
394 |
# standard error of Fisher's z; values >> 1 indicate that difference |
|
395 |
# is significant |
|
396 |
#---------------------------------------------------------------------- |
|
397 |
||
398 |
sub sig_rrtrue(@) |
|
399 |
{ |
|
400 |
my($r,$N,$rtrue) = &antsFunUsage(3,"fcf","r, N, r_true",@_); |
|
401 |
return abs(&r2z($r) - &r2z($rtrue)) * sqrt($N-3); |
|
402 |
||
403 |
} |
|
404 |
||
405 |
sub sig_rr(@) |
|
406 |
{ |
|
407 |
my($r1,$N1,$r2,$N2) = &antsFunUsage(4,"fcfc","r1, N1, r2, N2",@_); |
|
408 |
return abs(&r2z($r1) - &r2z($r2)) / (1/sqrt($N1-3)+1/sqrt($N2-3)); |
|
409 |
} |
|
410 |
||
411 |
#---------------------------------------------------------------------- |
|
412 |
||
413 |
1; |
|
414 |