author | Andreas Thurnherr <ant@ldeo.columbia.edu> |
Tue, 05 Mar 2019 10:03:40 -0500 | |
changeset 38 | 15c603bc4f70 |
parent 10 | 3dfa16523886 |
child 39 | 56bdfe65a697 |
permissions | -rw-r--r-- |
0 | 1 |
#====================================================================== |
2 |
# L I B S T A T S . P L |
|
3 |
# doc: Wed Mar 24 13:59:27 1999 |
|
10 | 4 |
# dlm: Sat Jan 31 15:51:14 2015 |
0 | 5 |
# (c) 1999 A.M. Thurnherr |
10 | 6 |
# uE-Info: 150 0 NIL 0 0 70 2 2 4 NIL ofnI |
0 | 7 |
#====================================================================== |
8 |
||
9 |
# HISTORY: |
|
10 |
# Mar 24, 1999: - created for the ANO paper |
|
11 |
# Mar 27, 1999: - extended |
|
12 |
# Sep 18, 1999: - argument typechecking |
|
13 |
# Sep 30, 1999: - added gauss() |
|
14 |
# Oct 04, 1999: - moved gauss() to [./libfuns.pl] (changed from specfuns) |
|
15 |
# Oct 20, 1999: - changed, 'cause I understand it better |
|
16 |
# Oct 21, 1999: - changed &Fishers_z to &r2z(); added &z2r() |
|
17 |
# - added &sig_rr(), &sig_rrtrue |
|
18 |
# Jan 22, 2002: - added N(), avg(), stddev(), min(), max() |
|
19 |
# Feb 27, 2006: - adjusted median() for compat with NR (even # of points) |
|
20 |
# - added medianF() |
|
21 |
# Jun 27, 2006: - added medianFNaN() |
|
22 |
# Jul 1, 2006: - Version 3.3 [HISTORY] |
|
23 |
# - made median respect nan based on perlfunc(1) |
|
24 |
# Apr 25, 2008: - added &bootstrap() |
|
25 |
# Oct 24, 2010: - replaced grep { $_ == $_ } by grep { numberp($_) } everywhere |
|
26 |
# - added &fixLowSampStat() |
|
27 |
# Nov 5, 2010: - added std (stderr would have been better but that's used in libPOSIX.pl |
|
28 |
# Dec 18, 2010: - added stddev2, mad, mad2 |
|
29 |
# Dec 31, 2010: - added rms() |
|
30 |
# Jul 2, 2011: - added mad2F() |
|
31 |
# Mar 10, 2012: - medianF() -> medianAnts_(); mad2F() -> mad2Ants_() |
|
32 |
# - added sum() |
|
33 |
# Apr 26, 2012: - BUG: std() did not allow nan as stddev input |
|
3 | 34 |
# Oct 15, 2012: - added max_i(), min_i() |
5 | 35 |
# Nov 24, 2013: - renamed N to ndata |
36 |
# - added fdiff() |
|
10 | 37 |
# Jan 30, 2015: - added log_avg() |
38 |
# - added noise_avg() |
|
0 | 39 |
|
40 |
require "$ANTS/libfuns.pl"; |
|
41 |
||
42 |
#---------------------------------------------------------------------- |
|
43 |
# estimate stderr given stddev & degrees of freedom |
|
44 |
# - return nan for dof <= 0 |
|
45 |
#---------------------------------------------------------------------- |
|
46 |
||
47 |
sub std(@) |
|
48 |
{ |
|
49 |
my($sig,$dof) = |
|
50 |
&antsFunUsage(2,".c","stddev, deg_of_freedom",@_); |
|
51 |
return nan unless ($dof > 0); |
|
52 |
return $sig / sqrt($dof); |
|
53 |
} |
|
54 |
||
55 |
#---------------------------------------------------------------------- |
|
56 |
# calc standard stats from vector of vals |
|
5 | 57 |
# - used, e.g., in [rfilt] |
0 | 58 |
#---------------------------------------------------------------------- |
59 |
||
60 |
sub min(@) |
|
61 |
{ |
|
62 |
my($min) = 9e99; |
|
63 |
for (my($i)=0; $i<=$#_; $i++) { |
|
64 |
$min = $_[$i] if (numberp($_[$i]) && $_[$i] < $min); |
|
65 |
} |
|
66 |
return $min<9e99 ? $min : nan; |
|
67 |
} |
|
68 |
||
3 | 69 |
sub min_i(@) |
70 |
{ |
|
71 |
my($min) = 9e99; |
|
72 |
my($min_i); |
|
73 |
||
74 |
for (my($i)=0; $i<=$#_; $i++) { |
|
75 |
$min_i=$i,$min=$_[$i] if (numberp($_[$i]) && $_[$i] < $min); |
|
76 |
} |
|
77 |
return $min<9e99 ? $min_i : nan; |
|
78 |
} |
|
79 |
||
0 | 80 |
sub max(@) |
81 |
{ |
|
82 |
my($max) = -9e99; |
|
83 |
for (my($i)=0; $i<=$#_; $i++) { |
|
84 |
$max = $_[$i] if (numberp($_[$i]) && $_[$i] > $max); |
|
85 |
} |
|
86 |
return $max>-9e99 ? $max : nan; |
|
87 |
} |
|
88 |
||
3 | 89 |
sub max_i(@) |
90 |
{ |
|
91 |
my($max) = -9e99; |
|
92 |
my($max_i); |
|
93 |
||
94 |
for (my($i)=0; $i<=$#_; $i++) { |
|
95 |
$max_i=$i,$max=$_[$i] if (numberp($_[$i]) && $_[$i] > $max); |
|
96 |
} |
|
97 |
return $max>-9e99 ? $max_i : nan; |
|
98 |
} |
|
99 |
||
5 | 100 |
sub ndata(@) |
0 | 101 |
{ |
102 |
my($N) = 0; |
|
103 |
for (my($i)=0; $i<=$#_; $i++) { $N++ if (numberp($_[$i])); } |
|
104 |
return $N; |
|
105 |
} |
|
106 |
||
107 |
sub sum(@) |
|
108 |
{ |
|
109 |
my($N) = my($sum) = 0; |
|
4
ff72b00b4342
after adding $pi and some other stuff
A.M. Thurnherr <athurnherr@yahoo.com>
parents:
3
diff
changeset
|
110 |
@_ = split(/\s+/,$_[0]) if (@_ == 1 && !numberp($_[0])); |
0 | 111 |
for (my($i)=0; $i<=$#_; $i++) { $N++,$sum+=$_[$i] if (numberp($_[$i])); } |
112 |
return ($N>0)?$sum:nan; |
|
113 |
} |
|
114 |
||
115 |
sub avg(@) |
|
116 |
{ |
|
117 |
my($N) = my($sum) = 0; |
|
118 |
for (my($i)=0; $i<=$#_; $i++) { $N++,$sum+=$_[$i] if (numberp($_[$i])); } |
|
119 |
return ($N>0)?$sum/$N:nan; |
|
120 |
} |
|
121 |
||
10 | 122 |
#---------------------------------------------------------------------- |
123 |
# log_avg does the average in log space, which may be useful |
|
124 |
# for log-normal distributions. However, when I tired this |
|
125 |
# in response to a reviewer suggestion, the results were |
|
126 |
# noticably but not significantly worse, even though at least |
|
127 |
# some of the distributions looked quite log-normal. |
|
128 |
# It appears, however, that 32 samples are enough to sample |
|
129 |
# the distribution of dissipation adequately. |
|
130 |
#---------------------------------------------------------------------- |
|
131 |
||
132 |
sub log_avg(@) # exp(avg(log(x))) |
|
133 |
{ |
|
134 |
my($N) = my($sum) = 0; |
|
135 |
for (my($i)=0; $i<=$#_; $i++) { $N++,$sum+=log($_[$i]) if (numberp($_[$i])); } |
|
136 |
return ($N>0)?exp($sum/$N):nan; |
|
137 |
} |
|
138 |
||
139 |
#---------------------------------------------------------------------- |
|
140 |
# noise_avg(noise_lvl,frac) calculates an arithmetic average after |
|
141 |
# setting all samples below noise level to a certain fraction of the |
|
142 |
# noise level. 66% was used for the VKE paper. |
|
143 |
# Requires $noise_avg to be set, e.g. with &antsFunOpt() |
|
144 |
#---------------------------------------------------------------------- |
|
145 |
||
146 |
sub noise_avg(@) |
|
147 |
{ |
|
148 |
my($nlv,$frac) = split(',',$noise_avg); |
|
149 |
croak("libstats.pl: noise_avg: cannot decode \$noise_avg = <$noise_avg> <$nlv,$frac> (noise level, fraction)\n") |
|
150 |
unless ($nlv > 0) && ($frac < 1); |
|
151 |
my($nsamp) = my($sum) = 0; |
|
152 |
for (my($i)=0; $i<=$#_; $i++) { |
|
153 |
if (numberp($_[$i])) { |
|
154 |
$nsamp++; |
|
155 |
if ($_[$i] > $nlv) { |
|
156 |
$sum += $_[$i]; |
|
157 |
} else { |
|
158 |
$sum += $frac*$nlv; |
|
159 |
} |
|
160 |
} |
|
161 |
} |
|
162 |
return ($nsamp > 0) ? $sum/$nsamp : nan; |
|
163 |
} |
|
164 |
||
165 |
||
0 | 166 |
sub stddev2(@) # avg, val, val, val, ... |
167 |
{ |
|
168 |
my($N) = my($sum) = 0; |
|
169 |
for (my($i)=1; $i<=$#_; $i++) { |
|
170 |
$N++,$sum+=($_[0]-$_[$i])**2 if (numberp($_[$i])); |
|
171 |
} |
|
172 |
return ($N>1)?sqrt($sum/($N-1)):nan; |
|
173 |
} |
|
174 |
||
175 |
sub stddev(@) |
|
176 |
{ |
|
177 |
my($avg) = &avg(@_); |
|
178 |
return numberp($avg) ? stddev2($avg,@_) : nan; |
|
179 |
} |
|
180 |
||
181 |
sub rms(@) |
|
182 |
{ |
|
183 |
my($N) = my($sum) = 0; |
|
184 |
for (my($i)=0; $i<=$#_; $i++) { $N++,$sum+=$_[$i]**2 if (numberp($_[$i])); } |
|
185 |
return ($N>0)?sqrt($sum/$N):nan; |
|
186 |
} |
|
187 |
||
188 |
sub median(@) |
|
189 |
{ |
|
190 |
my(@svals) = sort {$a <=> $b} grep { numberp($_) } @_; |
|
191 |
return nan if (@svals == 0); |
|
192 |
return (@svals & 1) ? |
|
193 |
$svals[$#svals/2] : |
|
194 |
0.5 * ($svals[$#svals/2] + $svals[$#svals/2+1]); |
|
195 |
} |
|
196 |
||
197 |
sub medianAnts_($) |
|
198 |
{ |
|
199 |
my($fnr) = @_; |
|
200 |
my(@svals) = sort {@{$a}[$fnr] <=> @{$b}[$fnr]} grep { numberp(@{$_}[$fnr]) } @ants_; |
|
201 |
return nan if (@svals == 0); |
|
202 |
return (@svals & 1) ? |
|
203 |
$svals[$#svals/2][$fnr] : |
|
204 |
0.5 * ($svals[$#svals/2][$fnr] + $svals[$#svals/2+1][$fnr]); |
|
205 |
} |
|
206 |
||
207 |
sub mad2(@) # avg, val, val, val, ... |
|
208 |
{ |
|
209 |
my($N) = my($sum) = 0; |
|
210 |
for (my($i)=1; $i<=$#_; $i++) { |
|
211 |
$N++,$sum+=abs($_[0]-$_[$i]) if (numberp($_[$i])); |
|
212 |
} |
|
213 |
return ($N>0)?sqrt($sum/$N):nan; |
|
214 |
} |
|
215 |
||
216 |
sub mad(@) |
|
217 |
{ |
|
218 |
my($median) = &median(@_); |
|
219 |
return numberp($median) ? mad2($median,@_) : nan; |
|
220 |
} |
|
221 |
||
222 |
sub mad2Ants_($$) |
|
223 |
{ |
|
224 |
my($median,$fnr) = @_; |
|
225 |
my($sum,$n); |
|
226 |
||
227 |
for (my($r)=0; $r<@ants_; $r++) { |
|
228 |
next unless numberp($ants_[$r][$fnr]); |
|
229 |
$sum += abs($median - $ants_[$r][$fnr]); |
|
230 |
$n++ |
|
231 |
} |
|
232 |
return ($n>0) ? $sum/$n : nan; |
|
233 |
} |
|
234 |
||
5 | 235 |
sub fdiff(@) # finite difference, e.g. for [rfilt] |
236 |
{ |
|
237 |
return (numberp($_[0]) && numberp($_[$#_])) ? $_[$#_] - $_[0] : nan; |
|
238 |
} |
|
239 |
||
240 |
||
0 | 241 |
#---------------------------------------------------------------------- |
242 |
# &bootstrap(nDraw,cLim,statFun,val[,...]) |
|
243 |
# nDraw number of synthetic samples to draw |
|
244 |
# cLim confidence limit (e.g. 0.95) |
|
245 |
# statFun pointer to stats function |
|
246 |
# val[,...] data values |
|
247 |
# |
|
248 |
# e.g. bootstrap(1000,.5,\&avg,1,2,1000) |
|
249 |
#---------------------------------------------------------------------- |
|
250 |
||
251 |
sub bootstrap($$$@) |
|
252 |
{ |
|
253 |
my($nDraw,$cLim,$statFun,@vals) = @_; |
|
254 |
my(@sv,@stats); |
|
255 |
||
256 |
for (my($s)=0; $s<$nDraw; $s++) { |
|
257 |
for (my($i)=0; $i<@vals; $i++) { |
|
258 |
$sv[$i] = $vals[int(rand(@vals))]; |
|
259 |
} |
|
260 |
$stats[$s] = &$statFun(@sv); |
|
261 |
} |
|
262 |
@stats = sort {$a <=> $b} grep { numberp($_) } @stats; |
|
263 |
my($cli) = int($nDraw*(1-$cLim)/2); |
|
264 |
return ($stats[$cli],$stats[$#stats-$cli]); |
|
265 |
} |
|
266 |
||
267 |
#---------------------------------------------------------------------- |
|
268 |
# &fixLowSampStat(statRef,nsamp[]) |
|
269 |
# - replace stat (variance, stddev, stderr) based on small (<10) samples |
|
270 |
# with median calculated from all stats |
|
271 |
# - median of all stats is chosen to allow routine to work even if all |
|
272 |
# stats are based on small samples |
|
273 |
#---------------------------------------------------------------------- |
|
274 |
||
275 |
sub fixLowSampStat($@) |
|
276 |
{ |
|
277 |
my($statR,@nsamp) = @_; |
|
278 |
||
279 |
my($medStat) = median(@{$statR}); |
|
280 |
for (my($i)=0; $i<@{$statR}; $i++) { |
|
281 |
$statR->[$i] = $medStat |
|
282 |
unless ($nsamp[$i]>=10 || !defined($statR->[$i]) || $statR->[$i]>$medStat); |
|
283 |
} |
|
284 |
} |
|
285 |
||
286 |
#---------------------------------------------------------------------- |
|
287 |
# significance of difference of means (NR, 2nd ed, 14.2) |
|
288 |
#---------------------------------------------------------------------- |
|
289 |
||
290 |
sub Students_t(@) |
|
291 |
{ |
|
292 |
my($mu1,$sig1,$N1,$mu2,$sig2,$N2) = |
|
293 |
&antsFunUsage(6,"ffcffc","mu1, sigma1, N1, mu2, sigma2, N2",@_); |
|
294 |
my($var1) = $sig1 * $sig1; |
|
295 |
my($var2) = $sig2 * $sig2; |
|
296 |
my($sd) = sqrt($var1 + $var2 / ($N1+$N2-2) * (1/$N1 + 1/$N2)); |
|
297 |
return ($mu1-$mu2) / $sd; |
|
298 |
} |
|
299 |
||
300 |
sub slevel_mudiff1(@) |
|
301 |
{ |
|
302 |
my($mu1,$sig1,$N1,$mu2,$sig2,$N2) = |
|
303 |
&antsFunUsage(6,"ffcffc","mean1, sqrt(var1), N1, mean2, sqrt(var2), N2",@_); |
|
304 |
my($df) = $N1 + $N2 - 2; |
|
305 |
return &betai(0.5*$df,0.5, |
|
306 |
$df/($df + &Students_t(mu1,$sig1,$N1,$mu2,$sig2,$N2)**2)); |
|
307 |
} |
|
308 |
||
309 |
#---------------------------------------------------------------------- |
|
310 |
# significance of correlation coefficient (NR, 2nd ed, 14.5) |
|
311 |
#---------------------------------------------------------------------- |
|
312 |
||
313 |
sub slevel_r(@) |
|
314 |
{ |
|
315 |
my($r,$N) = &antsFunUsage(2,"fc","r, N",@_); |
|
316 |
return &erfcc(abs($r) * sqrt($N/2)); |
|
317 |
} |
|
318 |
||
319 |
#---------------------------------------------------------------------- |
|
320 |
# significance of difference btw two measured correlation coeffs |
|
321 |
# using Fisher's z (from NR, 2nd ed, 14.5). NB: averaging correlation |
|
322 |
# coefficients is done using [avgr] |
|
323 |
# NB: significance level is only good if correlated variables form |
|
324 |
# a binormal distribution |
|
325 |
#---------------------------------------------------------------------- |
|
326 |
||
327 |
sub r2z(@) |
|
328 |
{ |
|
329 |
my($r) = &antsFunUsage(1,"f","<r>",@_); |
|
330 |
return 0.5 * log((1+$r)/(1-$r)); |
|
331 |
} |
|
332 |
||
333 |
sub z2r(@) |
|
334 |
{ |
|
335 |
my($z) = &antsFunUsage(1,"f","<z>",@_); |
|
336 |
my($e) = exp(2*$z); |
|
337 |
return ($e-1) / ($e+1); |
|
338 |
} |
|
339 |
||
340 |
sub slevel_rrtrue(@) |
|
341 |
{ |
|
342 |
my($r,$N,$rtrue) = &antsFunUsage(3,"fcf","r, N, r_true",@_); |
|
343 |
croak("$0 (libstats.pl): N (=$N) < 10 in &slevel_rrtrue()\n") |
|
344 |
if ($N < 10); |
|
345 |
return &erfcc(abs(&r2z($r) - (&r2z($rtrue) + $rtrue/(2*$N-2))) * |
|
346 |
sqrt($N - 3) / sqrt(2)); |
|
347 |
} |
|
348 |
||
349 |
sub slevel_zz(@) |
|
350 |
{ |
|
351 |
my($z1,$N1,$z2,$N2) = &antsFunUsage(4,"fcfc","z1, N1, z2, N2",@_); |
|
352 |
croak("$0 (libstats.pl): N (=$N1,$N2) < 10 in &slevel_zz()\n") |
|
353 |
if ($N1 < 10 || $N2 < 10); |
|
354 |
return &erfcc(abs($z1-$z2) / sqrt(2/($N1-3) + 2/($N2-3))); |
|
355 |
} |
|
356 |
||
357 |
sub slevel_rr(@) |
|
358 |
{ |
|
359 |
my($r1,$N1,$r2,$N2) = &antsFunUsage(4,"fcfc","r1, N1, r2, N2",@_); |
|
360 |
return &slevel_zz(&r2z($r1),$N1,&r2z($r2),$N2); |
|
361 |
} |
|
362 |
||
363 |
#---------------------------------------------------------------------- |
|
364 |
# significance of difference btw two measured correlation coeffs |
|
365 |
# from brookes+dick63, p.216f |
|
366 |
# NB: result is returned as ratio of difference in Fisher's z to |
|
367 |
# standard error of Fisher's z; values >> 1 indicate that difference |
|
368 |
# is significant |
|
369 |
#---------------------------------------------------------------------- |
|
370 |
||
371 |
sub sig_rrtrue(@) |
|
372 |
{ |
|
373 |
my($r,$N,$rtrue) = &antsFunUsage(3,"fcf","r, N, r_true",@_); |
|
374 |
return abs(&r2z($r) - &r2z($rtrue)) * sqrt($N-3); |
|
375 |
||
376 |
} |
|
377 |
||
378 |
sub sig_rr(@) |
|
379 |
{ |
|
380 |
my($r1,$N1,$r2,$N2) = &antsFunUsage(4,"fcfc","r1, N1, r2, N2",@_); |
|
381 |
return abs(&r2z($r1) - &r2z($r2)) / (1/sqrt($N1-3)+1/sqrt($N2-3)); |
|
382 |
} |
|
383 |
||
384 |
#---------------------------------------------------------------------- |
|
385 |
||
386 |
1; |
|
387 |