# edapy_12_00: clear all variables and import various modules
%reset -f
import os
from datetime import date
from math import exp, pi, sin, cos, tan, sqrt, floor, ceil, log, atan2
import numpy as np
import scipy.sparse.linalg as las
import scipy.interpolate as ip
import scipy.spatial as sp
from scipy import sparse
import scipy.linalg as la
import scipy.signal as sg
import scipy.stats as stats
from matplotlib import pyplot as plt
import matplotlib.cm as cm
from matplotlib.colors import ListedColormap
# function to make a numpy N-by-1 column vector
# c=eda_cvec1(v) from an entity v that is array-like,
# including a number, a list of numbers, a tuple
# of numbers, a 1D np array of length N and an N-by-1 np array.
# It promotes integers to floats, and integers
# and floats to complex, by context. The function
# also ensures that, if v is an np array, c is
# a copy, as contrasted to a view, of it.
# This version takes just one input argument,
# whereas c=eda_cvec(v1,v2,...) concatenates
# many arguments.
def eda_cvec1(v):
if isinstance(v, int):
w = np.zeros((1,1),dtype=int);
w[0,0] = v;
return w;
elif isinstance(v, float):
w = np.zeros((1,1),dtype=float);
w[0,0] = v;
return w;
elif isinstance(v, complex):
w = np.zeros((1,1),dtype=complex);
w[0,0] = v;
return w;
elif isinstance(v, np.ndarray):
s = np.shape(v);
if len(s) == 1:
return np.copy(np.reshape(v,(s[0],1)));
else:
[r,c]=s;
if( c==1 ):
return(np.copy(v));
elif(r==1):
return(np.copy(v.T));
else:
raise TypeError("eda_cvec: %d by %d ndarray not allowed" % (r, c));
elif isinstance(v, list):
r = len(v);
t = int;
for vi in v:
if isinstance(vi,int):
pass;
elif isinstance(vi,float):
t=float;
elif isinstance(vi,complex):
t=complex;
break;
else:
raise TypeError("eda_cvec: list contains unsupported type %s" % type(vi));
w = np.zeros((r,1),dtype=t);
w[:,0] = v;
return w;
elif isinstance(v, tuple):
r = len(v);
t = int;
for vi in v:
if isinstance(vi,int):
pass;
elif isinstance(vi,float):
t=float;
elif isinstance(vi,complex):
t=complex;
break;
else:
raise TypeError("eda_cvec: tuple contains unsupported type %s" % type(vi));
w = np.zeros((r,1),dtype=t);
w[:,0] = v;
return w;
else:
raise TypeError("eda_cvec: %s not supported" % type(v));
# function to make a numpy N-by-1 column vector
# c=eda_cvec(v1, v2, ...) from a list of several
# array-like entities v1, v2, ..., including a number,
# a list of numbers, a tuple of numbers, a 1D np array
# of length N and an N-by-1 np array. The function
# also ensures that, if an argument is an np array,
# c is a copy, as contrasted to a view, of it.
# It promotes integers to floats, and integers
# and floats to complex, by context.
# This version concatenates many arguments,
# whereas c=eda_cvec1(v1) takes just one argument.
# I recommend always using eda_cvec(v1, v2, ...)
def eda_cvec(*argv):
t = int;
Nt = 0;
for a in argv:
v = eda_cvec1(a);
N,M = np.shape(v);
Nt = Nt + N;
if( N==0 ):
continue; # skip vector of zero length
if (t==int) and isinstance(v[0,0],float):
t=float;
elif isinstance(v[0,0],complex):
t=complex;
w = np.zeros((Nt,1),dtype=t);
Nt = 0;
for a in argv:
v = eda_cvec1(a);
N,M = np.shape(v);
w[Nt:Nt+N,0] = v[0:N,0];
Nt = Nt + N;
return w;
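# minimal usage illustration (added; the example values are arbitrary):
# scalars, lists, tuples and column vectors concatenate into one column vector
vdemo = eda_cvec( 1, [2.0, 3.0], np.array([[4.0],[5.0]]) );
print( vdemo.shape );  # (5, 1), promoted to float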
# eda_draw function makes a "pictorial matrix equation"
# arguments are vectors, matrices and strings,
# which are plotted in the order that they appear,
# except that strings starting with 'title ' are plotted
# under the subsequent matrix or vector
# always returns a status of 1
def eda_draw(*argv):
bw = np.zeros((256,4));
v = 0.9*(256 - np.linspace( 0, 255, 256 ))/255;
bw[:,0] = v;
bw[:,1] = v;
bw[:,2] = v;
bw[:,3] = np.ones(256);
bwcmap = ListedColormap(bw);
# size of plot
W = 16;
H = 4;
fig1 = plt.figure(1);
# figsize width and height in inches
fig1.set_size_inches(W,H);
ax1 = plt.subplot(1,1,1);
plt.axis([0, W, -H/2, H/2]);
plt.axis('off');
LM = W/6; # matrix width and height
LV = W/40; # vector width
FS = 0.12; # character width
TO = 0.4; # title vertical offset
SP = 0.2; # space between objects
LS = 0.2; # leading space
p = LS; # starting x-position
istitle=0; # flags presence of a title
for a in argv:
if isinstance(a,np.ndarray):
sh = np.shape(a);
if len(sh) == 1: # conversion to nx1 array
n = sh[0];
m = 1;
ap = a;
a = np.zeros((n,1));
a[:,0] = ap;
else:
n = sh[0];
m = sh[1];
if m==1:
pold=p;
left=p;
right=p+LV;
bottom=-LM/2;
top=LM/2;
plt.imshow( a, cmap=bwcmap, vmin=np.min(a), vmax=np.max(a), extent=(left,right,bottom,top) );
p = p+LV;
pm = (p+pold)/2;
if istitle:
plt.text(pm,-(LM/2)-TO,titlestr,horizontalalignment='center');
istitle=0;
p = p+SP;
else:
pold=p;
left=p;
right=p+LM;
bottom=-LM/2;
top=LM/2;
plt.imshow( a, cmap=bwcmap, vmin=np.min(a), vmax=np.max(a), extent=(left,right,bottom,top) );
p = p+LM;
pm = (p+pold)/2;
if istitle:
plt.text(pm,-(LM/2)-TO,titlestr,horizontalalignment='center');
istitle=0;
p = p+SP;
elif isinstance(a,str):
ns = len(a);
istitle=0;
if( ns>=6 ):
if 'title ' in a[0:6]:
istitle=1;
titlestr=a[6:];
if( istitle != 1):
plt.text(p,0,a);
p = p + ns*FS + SP;
plt.show();
return 1;
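# minimal usage sketch (added; the arrays here are arbitrary random examples):
# a 'title ' string captions the next object, so this draws a matrix and a
# vector, each with a caption underneath
eda_draw('title G', np.random.rand(20,20), 'title m', np.random.rand(20,1));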
# edapy_12_01: p.d.f. of square of a random variable
# standard set-up of e-axis
L=101; # length L
emin=0.01; # start value; avoid zero
emax=3.00; # end value
De=(emax-emin)/(L-1); # sampling interval
e = eda_cvec( emin+De*np.linspace(0,L-1,L)); # vector e
# Normal p.d.f. for p(e), for e>=0
pe = sqrt(2/pi)*np.exp(-0.5*np.power(e,2));
Ae = De*np.sum(pe); # area under p.d.f.
# standard setup of E axis
Emin=0.01; # start value, avoid zero
Emax=3; # end value
DE=(Emax-Emin)/(L-1); # sampling interval
E = eda_cvec(Emin+DE*np.linspace(0,L-1,L)); # vector E
pE = np.sqrt(np.reciprocal(2*pi*E)) * np.exp(-0.5*E); # chi2 p.d.f.
AE = DE*np.sum(pE); # area under p.d.f.
eda_draw(' ',pe,'title p(e)',' ',pE,'title p(E)');
print("area under p(e): %.4f" % (Ae) );
print("area under p(E): %.4f" % (AE) );
# edapy_12_02: chi-squared p.d.f.
# standard set-up of X2-axis
L=1001; # length
X2min=0.0; # start value
X2max=10.0; # end value
DX2=(X2max-X2min)/(L-1); # sampling
X2 = eda_cvec(np.linspace(X2min,X2max,L)); # vector X2
# plot setup
fig1 = plt.figure(1,figsize=(10,6));
plt.axis([X2min, X2max, 0, 1]);
plt.xlabel('X2');
plt.ylabel('p(X2)');
plt.title('chi-squared pdf');
# plot p(X2,N) p.d.f for several values of N
for N in range(1,6): # loop over degrees of freedom N
pX2 = eda_cvec(stats.chi2.pdf(X2,N)); # chi-squared p.d.f.
plt.plot(X2,pX2,'k-');
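# added check: the chi-squared distribution with N degrees of freedom has
# mean N and variance 2N, which scipy confirms directly
for N in range(1,6):
    mu, var = stats.chi2.stats(N, moments='mv');
    print("N=%d: mean %.1f, variance %.1f" % (N, mu, var));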
# edapy_12_03: student t p.d.f.
# standard set-up of t-axis
L=1001; # length
tmin=-5.0; # start value
tmax=5.0; # end value
Dt=(tmax-tmin)/(L-1); # sampling interval
t = eda_cvec(np.linspace(tmin,tmax,L));
# plot setup
fig1 = plt.figure(1,figsize=(10,6));
plt.axis([tmin, tmax, 0, 0.5]);
plt.xlabel('x');
plt.ylabel('p(x)');
plt.title("Student's t pdf");
# plot p(t,N) for several values of N
pt = np.zeros((L,1));
for N in range(1,6):
pt = eda_cvec(stats.t.pdf(t,N)); # t p.d.f.
plt.plot(t,pt,'k-');
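# added illustration: as N grows, Student's t approaches the standard Normal;
# compare the peak values p(0)
print("t pdf at 0 for N=5: %.4f; Normal pdf at 0: %.4f" % (stats.t.pdf(0,5), stats.norm.pdf(0)) );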
# edapy_12_04: F p.d.f.
# make a vector F
L=1001; # length
Fmin=0.0; # start value
Fmax=5.0; # end value
DF=(Fmax-Fmin)/(L-1); # sampling interval
F=eda_cvec(np.linspace(Fmin,Fmax,L));
# plot setup
fig1 = plt.figure(1,figsize=(12,12));
# plot of p(F,N,M) has several frames, M constant for each frame
i = 1;
for M in [2, 5, 25, 50]:
ax1 = plt.subplot(4,1,i);
plt.axis([Fmin, Fmax, 0, 2]);
plt.xlabel('F');
plt.ylabel("p(F,%d,.)" % (M) );
if(i==1):
plt.title('F pdf for M, N in [2, 5, 25, 50]' );
i=i+1;
for N in [2, 5, 25, 50]:
pF = eda_cvec(stats.f.pdf(F,N,M));
plt.plot(F,pF,'k-');
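# added check: for M > 2 the F distribution has mean M/(M-2), which depends
# only on the denominator degrees of freedom (here with N=25, for example)
for M in [5, 25, 50]:
    print("M=%d: mean %.4f, M/(M-2) %.4f" % (M, stats.f.stats(25, M, moments='m'), M/(M-2)) );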
# edapy_12_05, simulated particle size data example
# note: to prevent this code from overwriting the existing
# files testA.txt and testB.txt, I have changed the filenames
# to mytestA.txt and mytestB.txt
N=25;
d=100;
s2dtrue=1;
sdtrue=sqrt(s2dtrue);
# calibration test 1
dobs1 = np.random.normal(d,sdtrue,(N,1));
np.savetxt("../Data/mytestA.txt",dobs1, delimiter="\t");
# calibration test 2
dobs2 = np.random.normal(d,sdtrue,(N,1));
np.savetxt("../Data/mytestB.txt",dobs2, delimiter="\t");
# edapy_12_06: Z and chi-squared tests of particle size data
# Z distribution, two-sided test
Z=0.278;
Pi = stats.norm.cdf(abs(Z),0.0,1.0)-stats.norm.cdf(-abs(Z),0.0,1.0);
Po = 1.0-(stats.norm.cdf(abs(Z),0.0,1.0)-stats.norm.cdf(-abs(Z),0.0,1.0));
print("Z distribution, with Z=%.4f, two sided test:" % (Z) );
print(" The probability that Z is inside the interval %.4f to %.4f" % (-Z, Z) );
print(" is %.4f" % (Pi) );
print(" The probability that Z is outside the interval %.4f to %.4f" % (-Z, Z));
print(" is %.4f" % (Po) );
# Chi-squared distribution, one-sided test
X2=21.9;
N=25;
Pi=stats.chi2.cdf(X2,N);
Po=1.0-stats.chi2.cdf(X2,N);
print("Chi-squared distribution with %d degrees of freedom, one sized test:" % (N) );
print(" The Probability that X2 is inside the interval 0 to %.4f" % (X2) );
print(" is %.4f" % (Pi) );
print(" The Probability that X2 is outside the interval 0 to %.4f" % (X2) );
print(" is %.4f" % (Po));
# edapy_12_07: statistical tests of particle size data
# Part A: first dataset
# load data
D=np.genfromtxt('../Data/testA.txt', delimiter='\t');
s = np.shape(D);
NA = s[0];
dobsA=eda_cvec(D);
dtrueA = 100.0; # known size of calibration standard
dbartrueA = dtrueA; # known mean of measurement
sd2trueA=1.0; # known variance of measurement
sdtrueA=1.0; # known root-variance of measurement
# calculate useful statistics
print('file testA.txt');
# number of observations
print("N is %d" % (NA) );
dbartrueA = dtrueA;
print("true mean %.4f" % (dbartrueA) );
dbarestA = np.sum(dobsA)/NA;
print("estimated mean %.4f" % (dbarestA) );
print("true variance %.4f" % (sd2trueA) );
print("true root-variance %.4f" % (sdtrueA) );
sd2estA = np.matmul( (dobsA-dbartrueA).T, (dobsA-dbartrueA))[0,0] / NA; # scalar, not 1x1 array
print("estimated variance (given true mean) %.4f" % sd2estA );
sdestA = sqrt(sd2estA);
print("estimated root-variance (given true mean) %.4f" % (sdestA) );
sd2estpA = np.matmul( (dobsA-dbarestA).T, (dobsA-dbarestA))[0,0] / (NA-1);
print("estimated variance (given estimated mean) %.4f" % (sd2estpA) );
sdestpA=sqrt(sd2estpA);
print("estimated root-variance (given estimated mean) %.4f" % (sdestpA));
ZA = (dbarestA-dbartrueA)/(sdtrueA/sqrt(NA));
print("Z is %.4f" % (ZA) );
PZA = 1 - (stats.norm.cdf(abs(ZA),0.0,1.0)-stats.norm.cdf(-abs(ZA),0.0,1.0));
print("Probability of |Z| exceeding %.4f is %.4f" % (abs(ZA), PZA) );
chi2A = np.matmul( ((dobsA-dtrueA).T/sdtrueA), ((dobsA-dtrueA)/sdtrueA) )[0,0];
print("chi-squared is %.4f" % (chi2A) );
Pchi2A = 1 - stats.chi2.cdf(chi2A,NA);
print("Probability of chi2 exceeding %.4f is %.4f" % (chi2A, Pchi2A) );
tA = (dbarestA-dbartrueA)/(sdestA/sqrt(NA));
print("t is %.4f" % (tA) );
PtA = 1.0 - (stats.t.cdf(abs(tA),NA) - stats.t.cdf(-abs(tA),NA));
print("Probability of |t| exceeding %.4f is %.4f" % (abs(tA), PtA) );
print(' ');
# Part B: second dataset
# load data
D=np.genfromtxt('../Data/testB.txt', delimiter='\t');
s = np.shape(D);
NB = s[0];
dobsB=eda_cvec(D);
dtrueB = 100.0; # known size of calibration standard
dbartrueB = dtrueB; # known mean
sd2trueB = 1.0; # known variance of measurement
sdtrueB = sqrt(sd2trueB); # known root-variance of measurement
# calculate useful statistics
print('file testB.txt');
# number of observations
print("N is %d" % (NB) );
print("true mean %.4f" % (dbartrueB) );
dbarestB = np.sum(dobsB)/NB;
print("estimated mean %.4f" % (dbarestB) );
print("true variance %.4f" % (sd2trueB) );
print("true root-variance %.4f" % (sdtrueB) );
sd2estB = np.matmul( (dobsB-dbartrueB).T, (dobsB-dbartrueB))[0,0] / NB;
print("estimated variance (given true mean) %.4f" % sd2estB );
sdestB = sqrt(sd2estB);
print("estimated root-variance (given true mean) %.4f" % (sdestB) );
sd2estpB = np.matmul( (dobsB-dbarestB).T, (dobsB-dbarestB))[0,0] / (NB-1);
print("estimated variance (given estimated mean) %.4f" % (sd2estpB) );
sdestpB=sqrt(sd2estpB);
print("estimated root-variance (given estimated mean) %.4f" % (sdestpB));
ZB = (dbarestB-dbartrueB)/(sdtrueB/sqrt(NB));
print("Z is %.4f" % (ZB) );
PZB = 1 - (stats.norm.cdf(abs(ZB),0.0,1.0)-stats.norm.cdf(-abs(ZB),0.0,1.0));
print("Probability of |Z| exceeding %.4f is %.4f" % (abs(ZB), PZB) );
chi2B = np.matmul( ((dobsB-dtrueB).T/sdtrueB), ((dobsB-dtrueB)/sdtrueB) )[0,0];
print("chi-squared is %.4f" % (chi2B) );
Pchi2B = 1 - stats.chi2.cdf(chi2B,NB);
print("Probability of chi2 exceeding %.4f is %.4f" % (chi2B, Pchi2B) );
tB = (dbarestB-dbartrueB)/(sdestB/sqrt(NB));
print("t is %.4f" % (tB) );
PtB = 1.0 - ( stats.t.cdf(abs(tB),NB) - stats.t.cdf(-abs(tB),NB) );
print("Probability of |t| exceeding %.4f is %.4f" % (abs(tB), PtB) );
print(' ');
# Part C: comparison of the two datasets
# are the means of the two tests significantly different ?
print("significance of difference between two means, using Z-test");
A = dbarestA - dbarestB;
B = sqrt(sd2trueA/NA+sd2trueB/NB); # Z-test uses the known (true) variances
Z = A/B;
print("Z is %.4f" % (Z) );
# are the means of the two tests significantly different ?
PZ = 1 - (stats.norm.cdf(abs(Z),0,1)-stats.norm.cdf(-abs(Z),0,1));
print("Probability of |Z| exceeding %.4f is %.4f" % (abs(Z), PZ) );
print(' ');
print("difference between two means with t-test");
A = dbarestA - dbarestB;
B = sqrt(sd2estpA/NA+sd2estpB/NB);
t = A/B;
C = (sd2estpA/NA+sd2estpB/NB)**2;
D = ((sd2estpA/NA)**2)/(NA-1) + ((sd2estpB/NB)**2)/(NB-1);
M = floor(C/D+0.5); # Welch-Satterthwaite approximation to degrees of freedom
print("t is %.4f and has %d degrees of freedom" % (t, M) );
# are the means of the two tests significantly different ?
Pt = 1 - (stats.t.cdf(abs(t),M)-stats.t.cdf(-abs(t),M));
print("Probability of |t| exceeding %.4f is %.4f" % (abs(t), Pt) );
print(' ');
# are the variances of the two tests significantly different ?
F = (chi2A/NA) / (chi2B/NB);
if( F<1 ):
F=1/F;
PF = 1 - (stats.f.cdf(F,NA,NB)-stats.f.cdf(1/F,NA,NB));
print("1/F is %.4f and F is %.4f" % (1/F, F) );
print("Probability of F outside interval (%.4f,%.4f) is %.4f" % (1/F, F, PF) );
# edapy_12_08: F-test to assess difference in fit
# illustration of F-test to assess difference in fit of two models
# using a small fragment of Black Rock Forest temperature data
# load data from file
D=np.genfromtxt('../Data/brf_fragment.txt', delimiter='\t');
s = np.shape(D);
N = s[0];
t = eda_cvec(10.0+np.linspace(0,N-1,N)); # hours from start of original record
dobs = eda_cvec(D);
# plot setup
fig1 = plt.figure(1,figsize=(10,6));
plt.plot(t,dobs,'ko');
plt.title('linear fit');
plt.xlabel('time t, hours');
plt.ylabel('d(i)');
# fit 1, straight line
M=2; # number of model parameters
G=np.zeros((N,M)); # set up data kernel
G[:,0]=np.ones((N,1)).ravel();
G[:,1]=t.ravel();
mestA=la.solve( np.matmul(G.T,G), np.matmul(G.T, dobs) ); # least squares fit
dpreA = np.matmul( G, mestA ); # predicted data
plt.plot(t,dpreA,'k--');
EA = np.matmul( (dobs-dpreA).T, (dobs-dpreA) )[0,0]; # total error (scalar)
vA = N-M; # degrees of freedom
plt.show();
# plot setup
fig2 = plt.figure(2,figsize=(10,6));
plt.plot(t,dobs,'ko');
plt.title('cubic fit');
plt.xlabel('time t, hours');
plt.ylabel('d(i)');
# fit 2, cubic fit
M=4; # number of model parameters
G=np.zeros((N,M)); # set up data kernel
G[:,0]=np.ones((N,1)).ravel();
G[:,1]=t.ravel();
G[:,2]=np.power(t,2).ravel();
G[:,3]=np.power(t,3).ravel();
mestB=la.solve( np.matmul(G.T,G), np.matmul(G.T, dobs) ); # least squares solution
dpreB = np.matmul( G, mestB ); # predicted data
plt.plot(t,dpreB,'k--');
EB = np.matmul( (dobs-dpreB).T, (dobs-dpreB) )[0,0]; # total error (scalar)
vB = N-M; # degrees of freedom
plt.show();
print("linear error %.2f, degrees of freedom %d" % (EA, vA) );
print("cubic error %.2f, degrees of freedom %d" % (EB, vB) );
print("improvement in error %.2f" % ((EA-EB)/EA) );
F = (EA/vA) / (EB/vB);
print("1/F %.4f F %.4f" % (1/F,F) );
if( F<1 ):
F=1/F
P = 1 - (stats.f.cdf(F,vA,vB)-stats.f.cdf(1/F,vA,vB));
print("P(F<%.4f or F>%.4f) = %f" % (1/F, F, P) );
if( P>0.05):
print('Null hypothesis cannot be excluded to 95% confidence')
else:
print('Null hypothesis can be excluded to 95% confidence')
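# added aside: for nested models such as these, an alternative is the F-test
# on the error *improvement*, F = ((EA-EB)/(vA-vB)) / (EB/vB),
# with (vA-vB, vB) degrees of freedom
Fimp = ((EA-EB)/(vA-vB)) / (EB/vB);
Pimp = stats.f.sf(Fimp, vA-vB, vB);
print("improvement F %.4f with (%d,%d) dof, P %.4f" % (Fimp, vA-vB, vB, Pimp) );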
# edapy_12_09: variance of power spectral density (untapered)
# case of untapered random timeseries
# generic time set up of t-axis
N=floor(2**10); # length of time vector
Dt=0.025; # sampling interval
T = N*Dt; # duration
t = eda_cvec(Dt*np.linspace(0,N-1,N)); # time vector
# create an uncorrelated random timeseries
sd2true=64.0; # variance
sdtrue=sqrt(sd2true); # standard deviation
draw = np.random.normal(0,sdtrue,(N,1)); # time series of random data
# generic frequency set up
No2 = floor(N/2); # integer version of N/2
fmax=1/(2.0*Dt); # nyquist frequency
Df=fmax/No2; # frequency sampling
Nf=No2+1; # number of non-negative frequencies
# full (pos and neg) frequency vector
f=eda_cvec(Df*np.concatenate((np.linspace(0,No2,Nf),np.linspace(-No2+1,-1,No2-1)),axis=0));
Dw=2.0*pi*Df; # angular frequency sampling
w=2.0*pi*f; # full (pos and neg) angular frequency vector
Nw=Nf; # number of non-negative angular frequencies
fpos=eda_cvec(Df * np.linspace(0,No2,Nf)); # vector of non-negative frequencies
wpos=2.0*pi*fpos; # vector of non-negative angular frequencies
# constant window function
W = np.ones((N,1));
d = np.multiply( W, draw );
# window changes variance of signal by ff
ff = np.sum(np.multiply(W,W))/N;
# true and estimated variance of d
sd2est=np.std(d)**2; # estimated variance
print("true data variance %.4f" % (ff*sd2true) );
print("estimated data variance %.4f" % (sd2est));
# compute power spectral density
fftraw = np.fft.fft(d,axis=0); # Fourier transform
dbar = eda_cvec(Dt*fftraw[0:Nf,0]); # keep non-negative frequencies only
s2 = (2/T) * np.power(np.abs(dbar),2); # power spectral density
# statistics of power spectral density
s2meanest = np.mean(s2); # mean of p.s.d.
s2varest=np.std(s2)**2; # variance of p.s.d.
s2sigmaest=sqrt(s2varest); # standard deviation
sd2psd=Nf*Df*s2meanest; # variance of data estimated from PSD
print("variance of data estimated from PSD %.2f" % (sd2psd));
print(' ');
# true mean and variance
p=2; # degrees of freedom
c = (ff*sd2true ) / (2*Nf*Df); # normalization factor
s2meantrue = p*c;
s2vartrue = 2*p*(c**2);
print("power spectral density");
print("true mean %.4f variance %.4f" % (s2meantrue,s2vartrue));
print("estimated mean %.4f variance %.4f" % (s2meanest,s2varest));
print(' ');
# plot data
fig1 = plt.figure(1,figsize=(10,6));
damax = np.amax(d);
plt.axis([0, T, -1.5*damax, 1.5*damax]);
plt.plot(t,d,'k-');
plt.xlabel('time t (s)');
plt.ylabel('d(t)');
plt.title('random time series');
plt.plot([0, T], [-2*sdtrue*sqrt(ff), -2*sdtrue*sqrt(ff)], 'k-' );
plt.plot([0, T], [0, 0], 'k-', lw=1 );
plt.plot([0, T], [2*sdtrue*sqrt(ff), 2*sdtrue*sqrt(ff)], 'k-' );
# plot PSD
fig2 = plt.figure(2,figsize=(10,6));
plt.xlabel('frequency f (Hz)');
plt.ylabel('ss(f)');
plt.title('power spectral density');
plt.plot(fpos,s2,'k-');
# plot mean on spectra
plt.plot([0, fmax], [s2meantrue, s2meantrue],'k-');
# histogram of values
Nhist=100; # number of bins in histogram
s2min = np.min(s2); # minimum bin value
s2max = np.max(s2); # maximum bin value
ct, ed = np.histogram(s2,Nhist,(s2min,s2max)); # histogram
Nc = len(ct); # length of count vector ct
Ne = len(ed); # length of edges vector ed
s2hist = eda_cvec(ct); # counts as a vector
centers = eda_cvec(0.5*(ed[1:Ne]+ed[0:Ne-1])); # bin centers as a vector
# chi-squared pdf
Db=(centers[1,0]-centers[0,0])/c; # sampling divided by c
s2histtrue=Nf*Db*stats.chi2.pdf(centers/c,p); # normalized chi-squared p.d.f.
# plot 95% confidence level on spectra
pv=0.95;
cv = stats.chi2.ppf(pv,p);
plt.plot([fpos[0,0], fpos[Nf-1,0]], [cv*c, cv*c],'k-');
plt.show();
print("%.4f confidence level is at s2 = %.4f" % (pv,cv*c) );
percent = 100 * np.sum( s2hist[centers<=cv*c]) / np.sum(s2hist);
print("%.4f percent of points are below %.4f confidence" % (percent,pv));
print(' ');
# plot histograms
fig3 = plt.figure(3,figsize=(10,6));
plt.axis([0, np.max(centers), 0, np.max(s2hist)]);
plt.xlabel('s2');
plt.ylabel('p(s2)');
plt.title('histogram of power spectral density');
plt.plot(centers,s2hist,'k-');
plt.plot(centers,s2histtrue,'k-');
plt.plot([cv*c, cv*c], [0, 0.1*np.max(s2hist)],'k:');
plt.show();
# edapy_12_10: variance of power spectral density (tapered)
# case of random timeseries tapered with Hamming window
# generic time set up
N=floor(2**10); # number of points in time series
Dt=0.025; # sampling interval
T = N*Dt; # duration
t = eda_cvec(Dt*np.linspace(0,N-1,N)); # time vector
# create an uncorrelated random timeseries
sd2true=64.0; # variance
sdtrue=sqrt(sd2true); # std dev
draw = np.random.normal(0,sdtrue,(N,1)); # time series of random data
# generic frequency set up
No2 = floor(N/2); # integer version of N/2
fmax=1/(2.0*Dt); # nyquist frequency
Df=fmax/No2; # frequency sampling
Nf=No2+1; # number of non-negative frequencies
# full (pos and neg) frequency vector
f=eda_cvec(Df*np.concatenate((np.linspace(0,No2,Nf),np.linspace(-No2+1,-1,No2-1)),axis=0));
Dw=2.0*pi*Df; # angular frequency sampling
w=2.0*pi*f; # full (pos and neg) angular frequency vector
Nw=Nf; # number of non-negative angular frequencies
fpos=eda_cvec(Df * np.linspace(0,No2,Nf)); # vector of non-negative frequencies
wpos=2.0*pi*fpos; # vector of non-negative angular frequencies
# Hamming window function
W = eda_cvec( 0.54 - 0.46*np.cos(2*pi*np.linspace(0,N-1,N)/(N-1)) );
d = np.multiply( W, draw );
# window changes variance of signal by ff
ff = np.sum(np.multiply(W,W))/N;
# true and estimated variance of d
sd2est=np.std(d)**2;
print("true data variance %.4f" % (ff*sd2true) );
print("estimated data variance %.4f" % (sd2est));
# compute power spectral density
fftraw = np.fft.fft(d,axis=0); # Fourier transform
dbar = eda_cvec(Dt*fftraw[0:Nf,0]); # keep non-negative frequencies, only
s2 = (2/T) * np.power(np.abs(dbar),2); # power spectral density
# statistics of power spectral density
s2meanest = np.mean(s2); # mean
s2varest=np.std(s2)**2; # variance
s2sigmaest=sqrt(s2varest); # standard deviation
sd2psd=Nf*Df*s2meanest; # variance of data estimated from PSD
print("variance of data estimated from PSD %.2f" % (sd2psd));
print(' ');
# true mean and variance
p=2; # degrees of freedom
c = ( ff*sd2true ) / (2*Nf*Df); # normalization factor
s2meantrue = p*c; # mean of chi-squared p.d.f., with normalization
s2vartrue = 2*p*(c**2); # variance of chi-squared p.d.f., with normalization
print("power spectral density");
print("true mean %.4f variance %.4f" % (s2meantrue,s2vartrue));
print("estimated mean %.4f variance %.4f" % (s2meanest,s2varest));
print(' ');
# plot data
fig1 = plt.figure(1,figsize=(10,6));
damax = np.amax(d);
plt.axis([0, T, -1.5*damax, 1.5*damax]);
plt.plot(t,d,'k-');
plt.xlabel('time t (s)');
plt.ylabel('d(t)');
plt.title('random time series');
plt.plot([0, T], [-2*sdtrue*sqrt(ff), -2*sdtrue*sqrt(ff)], 'k-' );
plt.plot([0, T], [0, 0], 'k-', lw=1 );
plt.plot([0, T], [2*sdtrue*sqrt(ff), 2*sdtrue*sqrt(ff)], 'k-' );
# plot p.s.d.
fig2 = plt.figure(2,figsize=(10,6));
plt.xlabel('frequency f (Hz)');
plt.ylabel('ss(f)');
plt.title('power spectral density');
plt.plot(fpos,s2,'k-');
# plot mean on spectra
plt.plot([0, fmax], [s2meantrue, s2meantrue],'k-');
# histogram of values
Nhist=100; # number of bins in histogram
s2min = np.min(s2); # start bin
s2max = np.max(s2); # end bin
ct, ed = np.histogram(s2,Nhist,(s2min,s2max)); # histogram
Nc = len(ct); # number of counts ct
Ne = len(ed); # number of edges ed
s2hist = eda_cvec(ct); # counts as a vector
centers = eda_cvec(0.5*(ed[1:Ne]+ed[0:Ne-1])); # bin centers as a vector
# chi-squared pdf
Db=(centers[1,0]-centers[0,0])/c; # bin sampling divided by c
s2histtrue=Nf*Db*stats.chi2.pdf(centers/c,p); # true histogram from chi-squared p.d.f.
# plot 95% confidence level on spectra
pv=0.95;
cv = stats.chi2.ppf(pv,p);
plt.plot([fpos[0,0], fpos[Nf-1,0]], [cv*c, cv*c],'k-');
plt.show();
print("%.4f confidence level is at s2 = %.4f" % (pv,cv*c) );
percent = 100 * np.sum( s2hist[centers<=cv*c]) / np.sum(s2hist);
print("%.4f percent of points are below %.4f confidence" % (percent,pv));
print(' ');
fig3 = plt.figure(3,figsize=(10,6));
plt.axis([0, np.max(centers), 0, np.max(s2hist)]);
plt.xlabel('s2');
plt.ylabel('p(s2)');
plt.title('histogram of power spectral density');
plt.plot(centers,s2hist,'k-');
plt.plot(centers,s2histtrue,'k-');
plt.plot([cv*c, cv*c], [0, 0.1*np.max(s2hist)],'k:');
plt.show();
# edapy_12_11, confidence of spectral peak
# for random noise plus a small amplitude sinusoid
# generic time set up
N=floor(2**10); # number of points in time series
Dt=0.025; # sampling interval
T = N*Dt; # duration
t = eda_cvec(Dt*np.linspace(0,N-1,N)); # time vector
# generic frequency set up
No2 = floor(N/2); # integer version of N/2
fmax=1/(2.0*Dt); # nyquist frequency
Df=fmax/No2; # frequency sampling
Nf=No2+1; # number of non-negative frequencies
# full (pos and neg) frequency vector
f=eda_cvec(Df*np.concatenate((np.linspace(0,No2,Nf),np.linspace(-No2+1,-1,No2-1)),axis=0));
Dw=2.0*pi*Df; # angular frequency sampling
w=2.0*pi*f; # full (pos and neg) angular frequency vector
Nw=Nf; # number of non-negative angular frequencies
fpos=eda_cvec(Df * np.linspace(0,No2,Nf)); # vector of non-negative frequencies
wpos=2.0*pi*fpos; # vector of non-negative angular frequencies
# low amplitude periodic signal buried in the noise
sd2true=64.0; # variance
sdtrue=sqrt(sd2true); # standard deviation
# data = signal + noise
draw = 0.25*sdtrue*np.cos(2*pi*(fmax/4)*t) + np.random.normal(0,sdtrue,(N,1));
# Hamming window function
W = eda_cvec(0.54 - 0.46*np.cos(2*pi*np.linspace(0,N-1,N)/(N-1)));
d = np.multiply( W, draw );
# window changes variance of signal by ff
ff = np.sum(np.multiply(W,W))/N;
# true and estimated variance of d
sd2est=np.std(d)**2;
print("true data variance %.4f" % (ff*sd2true) );
print("estimated data variance %.4f" % (sd2est));
# compute power spectral density
fftraw = np.fft.fft(d,axis=0); # Fourier transform
dbar = eda_cvec(Dt*fftraw[0:Nf,0]); # keep non-negative frequencies, only
s2 = (2/T) * np.power(np.abs(dbar),2); # power spectral density
# statistics of power spectral density
s2meanest = np.mean(s2); # mean
s2varest=np.std(s2)**2; # variance
s2sigmaest=sqrt(s2varest); # standard deviation
sd2psd=Nf*Df*s2meanest; # variance of data estimated from PSD
print("variance of data estimated from PSD %.2f" % (sd2psd));
print(' ');
# true mean and variance
p=2; # degrees of freedom
c = ( ff*sd2true ) / (2*Nf*Df); # normalization factor
s2meantrue = p*c; # true mean of normalized chi-squared distribution
s2vartrue = 2*p*(c**2); # true variance of normalized chi-squared distribution
print("power spectral density");
print("true mean %.4f variance %.4f" % (s2meantrue,s2vartrue));
print("estimated mean %.4f variance %.4f" % (s2meanest,s2varest));
print(' ');
# plot data
fig1 = plt.figure(1,figsize=(10,6));
damax = np.amax(d);
plt.axis([0, T, -1.5*damax, 1.5*damax]);
plt.plot(t,d,'k-');
plt.xlabel('time t (s)');
plt.ylabel('d(t)');
plt.title('random time series');
plt.plot([0, T], [-2*sdtrue*sqrt(ff), -2*sdtrue*sqrt(ff)], 'k-' );
plt.plot([0, T], [0, 0], 'k-', lw=1 );
plt.plot([0, T], [2*sdtrue*sqrt(ff), 2*sdtrue*sqrt(ff)], 'k-' );
# plot PSD
fig2 = plt.figure(2,figsize=(10,6));
plt.xlabel('frequency f (Hz)');
plt.ylabel('ss(f)');
plt.title('power spectral density');
plt.plot(fpos,s2,'k-');
# plot mean on spectra
plt.plot([0, fmax], [s2meantrue, s2meantrue],'k-');
# histogram of values
Nhist=100; # number of bins in histogram
s2min = np.min(s2); # start bin
s2max = np.max(s2); # end bin
ct, ed = np.histogram(s2,Nhist,(s2min,s2max)); # histogram
Nc = len(ct); # number of counts ct
Ne = len(ed); # number of edges ed
s2hist = eda_cvec(ct); # counts as a vector
centers = eda_cvec(0.5*(ed[1:Ne]+ed[0:Ne-1])); # bin centers as a vector
# chi-squared pdf
Db=(centers[1,0]-centers[0,0])/c;
# true histogram derived from normalized chi-squared distribution
s2histtrue=Nf*Db*stats.chi2.pdf(centers/c,p);
# plot 95% confidence level on spectra
pv=0.95;
cv = stats.chi2.ppf(pv,p);
plt.plot([fpos[0,0], fpos[Nf-1,0]], [cv*c, cv*c],'k-');
plt.show();
print("%.4f confidence level is at s2 = %.4f" % (pv,cv*c) );
percent = 100 * np.sum( s2hist[centers<=cv*c]) / np.sum(s2hist);
print("%.4f percent of points are below %.4f confidence" % (percent,pv));
print(' ');
fig3 = plt.figure(3,figsize=(10,6));
plt.axis([0, np.max(centers), 0, np.max(s2hist)]);
plt.xlabel('s2');
plt.ylabel('p(s2)');
plt.title('histogram of power spectral density');
plt.plot(centers,s2hist,'k-');
plt.plot(centers,s2histtrue,'k-');
plt.plot([s2meantrue, s2meantrue], [0, 0.1*np.max(s2hist)],'k:');
plt.plot([cv*c, cv*c], [0, 0.1*np.max(s2hist)],'k:');
plt.show();
# probability that PSD exceeds maximum observed value
speak = np.max(s2);
print("largest spectral peak: %e" % (speak) );
ppeak = stats.chi2.cdf(speak/c,p);
print("probability of laregst peak %.8f" % (ppeak) );
print("probability of PSD exceeding maximum observed value %.8f" % (1-ppeak) );
print("Nf probability of PSD exceeding maximum observed value %.8f" % (1-(ppeak**Nf)) );
# edapy_12_12, p.s.d. of AR1 process
# tunable constants
beta = 0.60; # beta of AR1 process
sigma = 100; # sigma of x
N=1024; # length of time series
# standard set up a time axis, t
Dt=0.6;
T = N*Dt;
t= eda_cvec(Dt*np.linspace(0,N-1,N));
# standard setup of frequency axis, f
No2 = floor(N/2); # integer version of N/2
fmax=1/(2.0*Dt); # nyquist frequency
Df=fmax/No2; # frequency sampling
Nf=No2+1; # number of non-negative frequencies
# full (pos and neg) frequency vector
f=eda_cvec(Df*np.concatenate((np.linspace(0,No2,Nf),np.linspace(-No2+1,-1,No2-1)),axis=0));
Dw=2.0*pi*Df; # angular frequency sampling
w=2.0*pi*f; # full (pos and neg) angular frequency vector
Nw=Nf; # number of non-negative angular frequencies
fpos=eda_cvec(Df * np.linspace(0,No2,Nf)); # vector of non-negative frequencies
wpos=2.0*pi*fpos; # vector of non-negative angular frequencies
# keep only the non-negative frequencies on the axes
f=f[0:Nf,0:1];
w=w[0:Nf,0:1];
# plot of spectra with different beta's
betalist = eda_cvec( 0.25, 0.5, 0.75);
NB,i = np.shape(betalist);
print('p.s.d. of AR1 process with these betas');
print(betalist.T);
fig1 = plt.figure(1,figsize=(10,6));
plt.axis( [0, fmax, 0, 1.0/((1.0-betalist[NB-1,0])**2) ] ); # y-max is the peak 1/(1-beta)^2 for the largest beta
plt.xlabel('f');
plt.ylabel('1/|v(f)|^2');
for beta in betalist:
denom = (1 + (beta**2) - 2*beta*np.cos(2*pi*0.5*f/fmax) );
plt.plot( f, np.divide(1,denom), 'k-', lw=2 );
# edapy_12_13, 95% confidence limit of AR1 process
# tunable constants
beta = 0.6; # beta of AR1 process
sigma = 100.0; # sigma of x
N=1024; # length of time series
tmaxplot=120.0; # max time on enlarged plot
ADDSINUSOID=0; # when 1, add a small sinusoid to the random time series
EXACT95=0; # when 1, exact 95% confidence level; when 0, (mean+2sigma) approximation
# standard set up a time axis, t
Dt=0.6;
T = N*Dt;
t=eda_cvec(Dt*np.linspace(0,N-1,N));
# standard setup of frequency axis, f
No2 = floor(N/2); # integer version of N/2
fmax=1/(2.0*Dt); # nyquist frequency
Df=fmax/No2; # frequency sampling
Nf=No2+1; # number of non-negative frequencies
# full (pos and neg) frequency vector
f=eda_cvec(Df*np.concatenate((np.linspace(0,No2,Nf),np.linspace(-No2+1,-1,No2-1)),axis=0));
Dw=2.0*pi*Df; # angular frequency sampling
w=2.0*pi*f; # full (pos and neg) angular frequency vector
Nw=Nf; # number of non-negative angular frequencies
fpos=eda_cvec(Df * np.linspace(0,No2,Nf)); # vector of non-negative frequencies
wpos=2.0*pi*fpos; # vector of non-negative angular frequencies
# uncorrelated random time series x(t)
x = np.random.normal(loc=0,scale=sigma,size=(N,1));
# AR1 process y(t)
y = np.zeros((N,1));
y[0,0] = x[0,0];
for i in range(1,N):
y[i,0] = beta*y[i-1,0] + x[i,0];
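# equivalent one-liner (added aside): the AR1 recursion is an IIR filter, so
# scipy.signal.lfilter reproduces y exactly
ycheck = sg.lfilter([1.0], [1.0, -beta], x, axis=0);
print("max |y - lfilter result|: %e" % (np.max(np.abs(y-ycheck))) );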
if( ADDSINUSOID ):
sigmay2 = np.var(y);
sigmay = sqrt(sigmay2);
Ac=0.25*sigmay;
fc=fmax/4.0;
y = y + Ac*np.cos(2.0*pi*fc*t);
print("A small amplitude sinusoid of amplitude %.2f and frequency %.2f has been added",(Ac,fc));
# plot entire time series
print('plot time series x(t) and y(t)');
fig1 = plt.figure(1,figsize=(10,6));
ax1 = plt.subplot(2,1,1);
plt.axis( [0, Dt*(N-1), np.min(x), np.max(x) ] );
plt.plot( t, x, 'k-', lw=2 );
plt.xlabel('t');
plt.ylabel('x(t)');
ax1 = plt.subplot(2,1,2);
plt.axis( [0, Dt*(N-1), np.min(y), np.max(y) ] );
plt.plot( t, y, 'k-', lw=2 );
plt.xlabel('t');
plt.ylabel('y(t)');
plt.show();
# plot enlarged portion of time series
print('plot enlarged portions of time series x(t) and y(t)');
fig2 = plt.figure(2,figsize=(10,6));
ax1 = plt.subplot(2,1,1);
plt.axis( [0, tmaxplot, np.min(x), np.max(x) ] );
plt.plot( t, x, 'k-', lw=2 );
plt.xlabel('t');
plt.ylabel('x(t)');
ax1 = plt.subplot(2,1,2);
plt.axis( [0, tmaxplot, np.min(y), np.max(y) ] );
plt.plot( t, y, 'k-', lw=2 );
plt.xlabel('t');
plt.ylabel('y(t)');
plt.show();
# compute power spectral density
xfftraw = np.fft.fft(x,axis=0); # raw DFT output
xbar = Dt*xfftraw[0:Nf,0:1]; # Fourier transform, + freqs only
xs2 = (2/T)*np.power(np.abs(xbar),2); # power spectral density
xs2=xs2[0:Nf,0:1]; # nonnegative f's only
yfftraw = np.fft.fft(y,axis=0); # raw DFT output
ybar = Dt*yfftraw[0:Nf,0:1]; # Fourier transform, + freqs only
ys2 = (2/T) * np.power(abs(ybar),2); # power spectral density
# keep only the non-negative frequencies on the axes
f=f[0:Nf];
w=w[0:Nf];
# plot of spectra
print('plot power spectral density, |x(f)|^2 and |y(f)|^2');
fig3 = plt.figure(3,figsize=(10,6));
ax1 = plt.subplot(2,1,1);
plt.axis( [0, np.max(f), 0, np.max(xs2) ] );
plt.plot( f, xs2, 'k-', lw=2 );
plt.xlabel('f');
plt.ylabel('|x(f)|**2');
ax1 = plt.subplot(2,1,2);
plt.axis( [0, np.max(f), 0, np.max(ys2) ] );
plt.plot( f, ys2, 'k-', lw=2 );
plt.xlabel('f');
plt.ylabel('|y(f)|**2');
plt.show();
# plot of spectra, onto which other stuff will be overlain
print('plot power spectral density, |x(f)|**2 and |y(f)|**2');
print('with other stuff overlain');
fig4 = plt.figure(4,figsize=(10,6));
ax1 = plt.subplot(2,1,1);
plt.axis( [0, np.max(f), 0, np.max(xs2) ] );
plt.plot( f, xs2, 'k-', lw=2 );
plt.xlabel('f');
plt.ylabel('|x(f)|**2');
ax1 = plt.subplot(2,1,2);
plt.axis( [0, np.max(f), 0, np.max(ys2) ] );
plt.plot( f, ys2, 'k-', lw=2 );
plt.xlabel('f');
plt.ylabel('|y(f)|**2');
Nr = 1000;
Nr95 = 950;
Nr50 = 500;
S = np.zeros((Nf,Nr));
# repeat for Nr realizations
for ir in range(Nr):
# uncorrelated time series
x = np.random.normal(loc=0.0,scale=sigma,size=(N,1));
    # build AR1 time series
y = np.zeros((N,1));
y[0,0] = x[0,0];
for i in range(1,N):
y[i,0] = beta*y[i-1,0] + x[i,0];
yfftraw = np.fft.fft(y,axis=0); # FFT of y
ybar = Dt*yfftraw[0:Nf,0:1]; # FT of nonnegative f's
ys2 = (2/T) * np.power(np.abs(ybar),2); # p.s.d.
S[0:Nf,ir:ir+1]=ys2[0:Nf,0:1]; # save p.s.d.
# statistics of the ensemble of p.s.d.'s
S95 = np.zeros((Nf,1)); # 95 percent level
Smean = np.zeros((Nf,1)); # mean
for ifr in range(Nf): # loop over frequencies
v = S[ifr:ifr+1,0:Nr].T; # the p.s.d. at one frequency
v = eda_cvec(np.sort(v,axis=0)); # sort the values
S95[ifr,0]=v[Nr95,0]; # 95% less than this value
Smean[ifr,0]=np.mean(v); # mean
plt.plot( f, S95, '-', lw=5, color=[0.8,0.8,0.8] );
plt.plot( f, Smean, '-', lw=5, color=[0.8,0.8,0.8] );
# theoretical spectrum
p=2; # degrees of freedom
# in this code, the window function is just w(i)=1
ff = 1.0; # variance of window function
c = ( ff*(sigma**2) ) / (2*Nf*Df); # scaling constant
xs2meantrue = p*c; # mean of p.s.d. of x
xs2vartrue = 2*p*(c**2); # variance of p.s.d. of x
print("scaling constant c: %e" % (c) );
print("pc: %e" % (xs2meantrue) );
print("2pc**2: %e" % (xs2vartrue) );
# the AR1 filter v(f)
denom = (1.0 + (beta**2) - 2.0*beta*np.cos(pi*f/fmax) );
# mean of p.s.d. of y
ys2mean = xs2meantrue*np.reciprocal(denom);
# variance of p.s.d. of y
if( EXACT95 ):
C95 = stats.chi2.ppf(0.95,p);
ys295 = C95*c*np.reciprocal(denom);
    print('Exact 95% confidence level used');
else:
ys295 = ys2mean + 2.0*sqrt(xs2vartrue)*np.reciprocal(denom);
print('Approximate 95% confidence level used');
plt.plot( f, ys2mean, 'k--', lw=2 );
plt.plot( f, ys295, 'k-', lw=2 );
# edapy_12_14: bootstrap statistics of slope
# in least squares fit of a straight line to a small
# fragment of Black Rock Forest temperature data
# load data and set up time axis
D=np.genfromtxt('../Data/brf_fragment.txt', delimiter='\t');
s = np.shape(D);
N = s[0]; # number of data
torig = eda_cvec(10+np.linspace(0,N-1,N)); # time vector, hours from start of original record
dorig = eda_cvec(D); # data vector
# least-squares slope and its variance of original data
M=2; # number of model parameters
G=np.zeros((N,M)); # data kernel setup
G[:,0]=np.ones((N,));
G[:,1]=torig.ravel();
GTG = np.matmul(G.T,G); # G-transpose times G
mest = la.solve( GTG, np.matmul(G.T,dorig) ); # least squares solution
slopeorig=mest[1,0]; # break out estimated slope
# usual error estimate based on propagation
# of posterior estimate of sigma2 of data into
# a sigma2 of slope
e = dorig - np.matmul(G,mest); # individual errors
E = np.matmul(e.T,e)[0,0]; # total error (scalar)
sigma2dorig = E/N; # posterior variance of data
Cmorig = sigma2dorig * la.inv(GTG); # posterior covariance of model parameters
sigma2morig = Cmorig[1,1]; # break out variance of slope
# number of realizations;
Nr = 10000;
slope = np.zeros((Nr,1)); # vector of bootstrapped slopes
# loop over realizations
for p in range(Nr):
# resample
    # N random integers, each between 0 and N-1
rowindex=np.random.randint(N,size=N);
# random resampling of time with duplication
t = eda_cvec(torig[rowindex,0]);
# random resampling of data with duplication
d = eda_cvec(dorig[rowindex,0]);
# straight line fit
M=2; # number of model parameters
    G=np.zeros((N,M)); # data kernel setup
G[:,0]=np.ones((N,));
G[:,1]=t.ravel();
GTG = np.matmul(G.T,G); # G-transpose times G
mest=la.solve(GTG,np.matmul(G.T,d)); # least sq soln
slope[p,0]=mest[1,0]; # save slope in vector
# compute histogram of slopes and normalize it
# to a probability density distribution
# with Ds*sum(pbootstrap)=1
Nhist=1000; # number of bins
slopemin = 0.2; # starting bin
slopemax = 0.8; # ending bin
ct, ed = np.histogram(slope,Nhist,(slopemin,slopemax)); # histogram
Nc = len(ct); # length of counts ct
Ne = len(ed); # length of edges ed
slopehist = eda_cvec(ct); # counts as a vector
# bin centers as a vector
centers = eda_cvec(0.5*(ed[1:Ne]+ed[0:Ne-1]));
# approximate pdf via normalized histogram
Ds = centers[1,0] - centers[0,0]; # bin sampling
norm = Ds*np.sum(slopehist); # normalization factor
pbootstrap =slopehist/norm; # p.d.f. from histogram
# compute Normal distribution of slopes
# using parameters estimated from original data
norm = (1/sqrt(2*pi*sigma2morig));
pnormal = norm * np.exp(-np.power((centers-slopeorig),2)/(2*sigma2morig));
# plot pdf
fig1 = plt.figure(1,figsize=(10,6));
pdfmax = np.max(pbootstrap);
plt.axis([0.4, 0.6, 0, 1.5*pdfmax]);
plt.plot(centers,pbootstrap,'k-');
plt.plot(centers,pnormal,'k:');
plt.xlabel('slope b');
plt.ylabel('p(b)');
# mean (expectation) and variance of bootstrap distribution
# approximate integral for mean as sum
mb = Ds*np.sum( np.multiply( centers, pbootstrap) );
# approximate integral for variance as sum
vb = Ds*np.sum( np.multiply( np.power(centers-mb,2) , pbootstrap) );
# 2.5% and 97.5% confidence bounds from the
# cumulative probability function
Pbootstrap = eda_cvec(Ds*np.cumsum(pbootstrap));
# find 0.025 cumulative probability
ilovec = np.where(Pbootstrap>=0.025);
# np.where() returns a bunch of stuff,
# but what we want, ilo, is the first thing
ilox = ilovec[0];
ilo = ilox[0];
blo = centers[ilo,0];
# find 0.975 cumulative probability
ihivec = np.where(Pbootstrap>=0.975);
ihix = ihivec[0];
ihi = ihix[0];
bhi = centers[ihi,0];
plt.plot( [blo, blo], [0, 0.2*pdfmax], 'k-' );
plt.plot( [bhi, bhi], [0, 0.2*pdfmax], 'k-' );
plt.plot( [mb, mb], [0, 0.3*pdfmax], 'k-' );
plt.show();
print("standard method: slope %.4f +/- %.4f (2 sigma)" % (slopeorig, 2*sqrt(sigma2morig)));
print("bootstrap method: slope %.4f +/- %.4f (2 sigma)" % (mb, 2*sqrt(vb)))
print("bootstrap method: slope between %.4f and %.4f (95 percent confidence)" % (blo, bhi));
# edapy_12_15: bootstrap statistics using the Atlantic Rocks dataset
# bootstrap error analysis for
# CaO/Na2O ratio of varimax factor 2
# note: CaO and Na2O are elements 6 and 7 in MATLAB numbering (5 and 6 in Python)
# load data
Draw = np.genfromtxt('../Data/rocks.txt', delimiter='\t')
N, M = np.shape(Draw);
# number of realizations
Nr=10000;
ratio = np.zeros((Nr,1));
for ir in range(Nr):
# randomly resample
rowindex=np.random.randint(N,size=N); # N random integers, each between 0 and N-1
D = np.zeros((N,M));
D[:,:] = Draw[rowindex,:];
    # compute factors and factor loadings using singular value decomposition
[U, s, VT] = la.svd(D,full_matrices=False);
sh = np.shape(s);
Ns = sh[0];
# keep only first few singular values
P=5;
F = np.copy(VT[0:P,:]);
C = np.matmul( U[:,0:P], np.diag(s[0:P]));
# spike these factors using the varimax procedure
k = [1, 2, 3, 4];
Nk = len(k);
    # initial rotated factor matrix, FP, and rotation matrix, MR
MR=np.identity(P);
FP=np.copy(F);
fA = np.zeros((M,1));
fB = np.zeros((M,1));
fAp = np.zeros((M,1));
fBp = np.zeros((M,1));
mA = np.zeros((P,1));
mB = np.zeros((P,1));
mAp = np.zeros((P,1));
mBp = np.zeros((P,1));
for iter in range(3):
for ii in range(Nk):
for jj in range(ii+1,Nk):
# spike factors i and j
i=k[ii];
j=k[jj];
# copy factors from matrix to vectors
fA[:,0] = np.copy(FP[i,:]);
fB[:,0] = np.copy(FP[j,:]);
# standard varimax procedure to determine rotation angle q
u = np.power(fA,2) - np.power(fB,2);
v = 2.0 * np.multiply(fA, fB);
AA = 2*M*np.matmul(u.T,v);
BB = np.sum(u)*np.sum(v);
top = AA - BB;
CC = M*(np.matmul(u.T,u)-np.matmul(v.T,v));
DD = (np.sum(u)**2)-(np.sum(v)**2);
bot = CC - DD;
q = 0.25 * atan2(top,bot);
# rotate factors
cq = cos(q);
sq = sin(q);
fAp = cq*fA + sq*fB;
fBp = -sq*fA + cq*fB;
# put back into factor matrix, FP
FP[i,:] = fAp.ravel();
FP[j,:] = fBp.ravel();
# accumulate rotation
mA[:,0] = np.copy(MR[i,:]);
mB[:,0] = np.copy(MR[j,:]);
mAp = cq*mA + sq*mB;
mBp = -sq*mA + cq*mB;
MR[i,:] = mAp.ravel();
MR[j,:] = mBp.ravel();
# new factor loadings
CP = np.matmul(C,MR.T);
    # MATLAB: ratio of elements 6 and 7 of factor 2
    # Python: ratio of elements 5 and 6 of factor 1
    ratio[ir,0]=FP[1,5]/FP[1,6];
# make histogram of ratios
Nhist=500; # number of bins
ratiomin = 0.2; # start bin
ratiomax = 0.8; # end bin
ct, ed = np.histogram(ratio,Nhist,(ratiomin,ratiomax)); # histogram
Nc = len(ct); # number of counts ct
Ne = len(ed); # number of edges ed
ratiohist = eda_cvec(ct); # counts as a vector
centers = eda_cvec(0.5*(ed[1:Ne]+ed[0:Ne-1])); # bin centers as a vector
# approximate pdf via normalized histogram
Dr = centers[1,0] - centers[0,0]; # bin spacing
norm = Dr*np.sum(ratiohist); # normalization of p.d.f.
pbootstrap = ratiohist/norm; # p.d.f. estimated from histogram
# mean (expectation) and variance of bootstrap distribution
mr = Dr*np.sum( np.multiply( centers, pbootstrap) );
vr = Dr*np.sum( np.multiply( np.power(centers-mr,2) , pbootstrap) );
print("CaO/Na2O ratio of Factor 2");
print("mean %.6f variance %.6f" % (mr,vr) );
# 2.5% and 97.5% confidence bounds of bootstrap distribution
Pbootstrap = eda_cvec(Dr*np.cumsum(pbootstrap));
# 2.5% confidence
ilovec = np.where( (Pbootstrap>=0.025).ravel() );
# np.where() returns a bunch of stuff, we want the first, ilo
ilox = ilovec[0];
ilo = ilox[0];
blo = centers[ilo,0];
# 97.5% confidence
ihivec = np.where(Pbootstrap>=0.975);
# np.where() returns a bunch of stuff, we want the first, ihi
ihix = ihivec[0];
ihi = ihix[0];
bhi = centers[ihi,0];
# plot pdf
fig1 = plt.figure(1,figsize=(10,6));
pdfmax = np.max(pbootstrap);
plt.axis([0.4, 0.6, 0, 1.5*pdfmax]);
plt.plot(centers,pbootstrap,'k-');
plt.xlabel('CaO/Na2O ratio, r');
plt.ylabel('p(r)');
plt.plot( [blo, blo], [0, 0.2*pdfmax], 'k-' );
plt.plot( [bhi, bhi], [0, 0.2*pdfmax], 'k-' );
plt.plot( [mr, mr], [0, 0.3*pdfmax], 'k-' );
plt.show();
print("bootstrap method: ratio %.4f +/- %.4f (2 sigma)" % (mr, 2*sqrt(vr)))
print("bootstrap method: ratio between %.4f and %.4f (95 percent confidence)" % (blo, bhi));