In [31]:
# edapy_08_00 clear all variables and import various modules
# History
# 2022/06/26 - checked with most recent Python & etc. software
# 2023/07/13 - fixed some issues, incl colormap, loglog plot, eda_cvec()
# 2024/05/26 - Patches to accommodate new versions of numpy, scipy, matplotlib
# patched dot product to return scalar value
%reset -f
import os
from datetime import date
from math import exp, pi, sin, cos, sqrt, floor, ceil, atan2
import numpy as np
import scipy.sparse.linalg as las
from scipy import sparse
import scipy.linalg as la
import scipy.signal as sg
import matplotlib
from matplotlib import pyplot as plt
from matplotlib import patches as pch
from matplotlib.colors import ListedColormap
from mpl_toolkits.mplot3d import Axes3D
# eda_draw function makes a "pictorial matrix equation"
# arguments are vectors, matrices and strings
# which are plotted in the order that they appear,
# except that strings starting with 'title ' are plotted
# under the subsequent matrix or vector
# always returns a status of 1
def eda_draw(*argv):
bw = np.zeros((256,4));
v = 0.9*(256 - np.linspace( 0, 255, 256 ))/255;
bw[:,0] = v;
bw[:,1] = v;
bw[:,2] = v;
bw[:,3] = np.ones(256);
bwcmap = ListedColormap(bw);
# size of plot
W = 16;
H = 4;
fig1 = plt.figure(1);
# figsize width and height in inches
fig1.set_size_inches(W,H);
ax1 = plt.subplot(1,1,1);
plt.axis([0, W, -H/2, H/2]);
plt.axis('off');
LM = W/6; # matrix width and height
LV = W/40; # vector width
FS = 0.12; # character width
TO = 0.4; # title vertical offset
SP = 0.2; # space between objects
LS = 0.2; # leading space
p = LS; # starting x-position
istitle=0; # flags presence of a title
for a in argv:
if isinstance(a,np.ndarray):
sh = np.shape(a);
if len(sh) == 1: # conversion to nx1 array
n = sh[0];
m = 1;
ap = a;
a = np.zeros((n,1));
a[:,0] = ap;
else:
n = sh[0];
m = sh[1];
if m==1:
pold=p;
left=p;
right=p+LV;
bottom=-LM/2;
top=LM/2;
plt.imshow( a, cmap=bwcmap, vmin=np.min(a), vmax=np.max(a), extent=(left,right,bottom,top) );
p = p+LV;
pm = (p+pold)/2;
if istitle:
plt.text(pm,-(LM/2)-TO,titlestr,horizontalalignment='center');
istitle=0;
p = p+SP;
else:
pold=p;
left=p;
right=p+LM;
bottom=-LM/2;
top=LM/2;
plt.imshow( a, cmap=bwcmap, vmin=np.min(a), vmax=np.max(a), extent=(left,right,bottom,top) );
p = p+LM;
pm = (p+pold)/2;
if istitle:
plt.text(pm,-(LM/2)-TO,titlestr,horizontalalignment='center');
istitle=0;
p = p+SP;
elif isinstance(a,str):
ns = len(a);
istitle=0;
if( ns>=6 ):
if 'title ' in a[0:6]:
istitle=1;
titlestr=a[6:];
if( istitle != 1):
plt.text(p,0,a);
p = p + ns*FS + SP;
plt.show();
return 1;
def eda_kmeans(S, mu):
# k-means cluster analysis of NxM sample matrix S
# starting from initial guess KxM mu of cluster means
# returns:
# newmu: new cluster means
# k: Nx1 vector of cluster assignments
# dist2: Nx1 vector of squared distance to cluster center
# count: Kx1 vector of samples in cluster
# Kdist2: KxK matrix of inter-cluster squared distances
N, M = np.shape(S); # N samples with M elements
K, M2 = np.shape(mu); # K cluster means
k = np.zeros((N,1),dtype=int); # assoc sample w cluster
dist2 = np.zeros((N,1)); # sq distance from cluster mean
Kdist2 = np.zeros((K,K)); # cluster to nearest cluster sq distance
oldmu = np.copy(mu); # old cluster means
MAXP = 2*K*M; # max iterations
for p in range(MAXP): # iterate
# cluster assignment step
changes = 0; # number of changes
for i in range(N): # loop over samples
for j in range(K): # loop over cluster means
# deviation of sample from mean
delta = (S[i:i+1,0:M] - oldmu[j:j+1,0:M]).T;
# squared distance
r2 = np.matmul(delta.T,delta); r2=r2[0,0];
if( j==0 ): # always accept first distance
r2min = r2;
newk = j;
elif r2<r2min: # accept if smaller
r2min = r2;
newk = j;
if k[i,0] != newk: # count reassignments
changes=changes+1;
k[i,0] = newk; # reassigned cluster
dist2[i,0] = r2min; # sq distance to that cluster
# end iterating when no further changes occur
if( (p!=0) and (changes==0) ):
break;
# means step
newmu = np.zeros((K,M)); # new cluster means
count = np.zeros((K,1),dtype=int); # samples in cluster
for i in range(N): # loop over samples
for j in range(M): # loop over elements
# cluster sum
newmu[k[i,0],j] = newmu[k[i,0],j] + S[i,j];
# count samples in cluster
count[k[i,0],0] = count[k[i,0],0] + 1;
for j in range(K): # loop over clusters
if( count[j,0] == 0 ): # cluster with no samples
# no change in cluster mean
newmu[j,0:M] = oldmu[j,0:M];
else:
# new cluster mean
newmu[j,0:M] = newmu[j,0:M]/count[j,0]
oldmu = np.copy(newmu); # update cluster means
# inter-cluster distances
for i in range(K-1): # loop over cluster means
for j in range(i+1,K): # loop over cluster means of higher index
delta = (newmu[i:i+1,0:M] - newmu[j:j+1,0:M]).T;
r2 = np.matmul(delta.T,delta); r2=r2[0,0];
Kdist2[i,j] = r2; # sq distance between clusters
Kdist2[j,i] = r2; # sq distance between clusters
return newmu, k, dist2, count, Kdist2;
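# A minimal usage sketch of eda_kmeans (illustrative values): two
# well-separated 1D clusters and a K=2 initial guess for the means.
Sdemo = np.array([[0.0],[0.1],[5.0],[5.1]]); # four 1D samples
mudemo = np.array([[0.0],[5.0]]); # initial guess of the two means
mudemo2, kdemo, ddemo, cdemo, Kddemo = eda_kmeans(Sdemo, mudemo);
assert np.all(cdemo.ravel() == np.array([2,2])); # two samples per cluster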
def eda_kmeans_mod(S, mu):
# k-means cluster analysis of NxM sample matrix S
# modified per Bradley & Fayyad (1998) never to
# return cluster with zero samples.
# Starts with initial guess KxM mu of cluster means
# returns:
# mymu2: new cluster means
# myk2: Nx1 vector of cluster assignments
# mydist2: Nx1 vector of squared distance to cluster center
# mycount2: Kx1 vector of samples in cluster
# myKdist2: KxK matrix of inter-cluster squared distances
N, M = np.shape(S); # N samples with M elements (M used below)
initmu = np.copy(mu);
mymu, myk, mydist, mycount, myKdist = eda_kmeans(S, initmu);
while np.min(mycount)==0: # while cluster with no samples
ic=np.argmin(mycount); # index of cluster
ir=np.argmax(mydist); # index of most distant sample
ik=myk[ir,0]; # cluster of this sample
mymu[ic:ic+1,0:M] = S[ir:ir+1,0:M]; # new mean
mycount[ic,0]=1; # reset cluster count
myk[ir,0]=ic; # reset sample membership
mydist[ir,0]=0.0; # reset sample distance
# note: in rare cases, next line could create
# infinite loop, so don't do it.
# mycount[ik,0]=mycount[ik,0]-1;
# redo k-means
mymu2, myk2, mydist2, mycount2, myKdist2 = eda_kmeans(S, mymu);
return mymu2, myk2, mydist2, mycount2, myKdist2;
def eda_forgy_mean(S,K):
# create initial k-means cluster means from K samples
# randomly drawn from the NxM sample matrix S
# returns KxM matrix mu of means
N,M=np.shape(S);
k = np.random.randint(low=0,high=N,size=K,dtype=int);
mu = S[k,0:M];
return mu;
def eda_random_partition_mean(S,K):
# create initial k-means cluster means, by K random
# partitions of NxM sample matrix S
# returns KxM matrix mu of means
N,M=np.shape(S);
# random cluster-assignments
k = np.random.randint(low=0,high=K,size=N,dtype=int);
mu = np.zeros((K,M));
for i in range(K): # loop over clusters
r = np.where(k==i); # samples in cluster i
kk = r[0];
mu[i:i+1,0:M] = np.mean(S[kk,0:M],axis=0); # their means
return mu;
def eda_bradley_fayyad_mean(S,K,Nsets,fraction):
# create initial k-means cluster means, by
# the Bradley & Fayyad (1998) method, which
# is based on clustering clusters. The NxM
# sample matrix S is randomly subsampled Nsets
# times, with each set containing (fraction*N+2*K)
# samples.
# returns KxM matrix mu of means
N,M=np.shape(S);
initmu = eda_random_partition_mean(S,K);
KiMAX = K*Nsets; # maximum number of means mui
Ni = floor(fraction*N)+2*K; # size of small sets
# Step 1, create KixM table CM of means of subsampled data
CM = np.zeros((KiMAX,M));
Ki=0; # counter for rows of CM
for j in range(Nsets):
k = np.random.randint(low=0,high=N,size=Ni,dtype=int);
Si = S[k,0:M];
mymu, myk, mydist, mycount, myKdist = eda_kmeans_mod(Si, initmu);
CM[Ki:Ki+K,0:M] = mymu[0:K,0:M];
Ki=Ki+K;
# Step 2, create KixM table FMS of means of cluster means
FMS = np.zeros((KiMAX,M));
Ki=0; # counter for rows of FMS
for j in range(Nsets):
mymu, myk, mydist, mycount, myKdist = eda_kmeans(CM, CM[K*j:K*j+K,0:M]);
FMS[Ki:Ki+K,0:M] = mymu[0:K,0:M];
Ki=Ki+K;
# Step 3A: associate each KxM member FMi of FMS
# with all individual 1xM elements of CM
# and compute the overall error
error = np.zeros((Nsets,1));
Ki=0;
for i in range(Nsets):
FMi = FMS[Ki:Ki+K,0:M] # one group of means
Ki=Ki+K;
for j in range(KiMAX):
CMj = CM[j:j+1,0:M]; # one member of CM
for m in range(K):
delta = (CMj[0:1,0:M] - FMi[m:m+1,0:M]).T;
r2 = np.matmul(delta.T,delta); r2=r2[0,0];
if( m==0 ): # always accept first distance
r2min = r2;
elif r2<r2min: # accept if smaller
r2min = r2;
error[i,0] = error[i,0] + r2min;
# Step 3B: select the FMi with lowest error
i = np.argmin(error);
mu = np.copy( FMS[K*i:K*i+K,0:M] );
return(mu);
# function to make a numpy N-by-1 column vector
# c=eda_cvec(v1, v2, ...) from a list of several
# array-like entities v1, v2, ..., each of which may be
# a number, a list of numbers, a tuple of numbers,
# a 1D np array, or an N-by-1 np array. The function
# also ensures that, if an argument is an np array,
# c is a copy of it, as contrasted to a view.
# It promotes integers to floats, and integers
# and floats to complex, by context.
# This version concatenates many arguments,
# whereas c=eda_cvec1(v1) takes just one argument.
# I recommend always using eda_cvec(v1, v2, ...)
def eda_cvec(*argv):
t = int;
Nt = 0;
for a in argv:
v = eda_cvec1(a);
N,M = np.shape(v);
Nt = Nt + N;
if( N==0 ):
continue; # skip vector of zero length
if (t==int) and isinstance(v[0,0],float):
t=float;
elif isinstance(v[0,0],complex):
t=complex;
w = np.zeros((Nt,1),dtype=t);
Nt = 0;
for a in argv:
v = eda_cvec1(a);
N,M = np.shape(v);
w[Nt:Nt+N,0] = v[0:N,0];
Nt = Nt + N;
return w;
# function to make a numpy N-by-1 column vector
# c=eda_cvec1(v) from an entity v that is array-like,
# including a number, a list of numbers, a tuple
# of numbers, a 1D np array, or an N-by-1 np array.
# It promotes integers to floats, and integers
# and floats to complex, by context. The function
# also ensures that, if v is an np array, c is a
# copy of it, as contrasted to a view.
# This version takes just one input argument,
# whereas c=eda_cvec(v1,v2,...) concatenates
# many arguments.
def eda_cvec1(v):
if isinstance(v, (int, np.integer)):
w = np.zeros((1,1),dtype=int);
w[0,0] = v;
return w;
elif isinstance(v, float):
w = np.zeros((1,1),dtype=float);
w[0,0] = v;
return w;
elif isinstance(v, complex):
w = np.zeros((1,1),dtype=complex);
w[0,0] = v;
return w;
elif isinstance(v, np.ndarray):
s = np.shape(v);
if len(s) == 1:
return np.copy(np.reshape(v,(s[0],1)));
else:
[r,c]=s;
if( c==1 ):
return(np.copy(v));
elif(r==1):
return(np.copy(v.T));
else:
raise TypeError("eda_cvec: %d by %d ndarray not allowed" % (r, c));
elif isinstance(v, list):
r = len(v);
t = int;
for vi in v:
if isinstance(vi,(int, np.integer)):
pass;
elif isinstance(vi,float):
t=float;
elif isinstance(vi,complex):
t=complex;
break;
else:
raise TypeError("eda_cvec: list contains unsupported type %s" % type(vi));
w = np.zeros((r,1),dtype=t);
w[:,0] = v;
return w;
elif isinstance(v, tuple):
r = len(v);
t = int;
for vi in v:
if isinstance(vi,(int, np.integer)):
pass;
elif isinstance(vi,float):
t=float;
elif isinstance(vi,complex):
t=complex;
break;
else:
raise TypeError("eda_cvec: tuple contains unsupported type %s" % type(vi));
w = np.zeros((r,1),dtype=t);
w[:,0] = v;
return w;
else:
raise TypeError("eda_cvec: %s not supported" % type(v));
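# A quick usage sketch of eda_cvec (illustrative values): scalars,
# lists, tuples and np arrays concatenate into one column vector,
# with integers promoted to floats by context.
vdemo = eda_cvec( 1, [2.0, 3.0], np.array([4.0, 5.0]) );
assert np.shape(vdemo) == (5,1);
assert vdemo.dtype == float; # the int argument was promoted to float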
In [2]:
# edapy_08_01, factors shown on ternary diagram
# vectors of constants needed to make ternary diagram
a = eda_cvec( [0.0,0.0] );
b = eda_cvec( [1.0,0.0] );
c = eda_cvec( [0.5,0.5*sqrt(3)] );
# factor 1
f1 = eda_cvec( [0.15, 0.25, 0.60] );
f1 = f1/np.sum(f1); # normalize factor 1
# factor 2
f2 = eda_cvec( [0.25, 0.45, 0.30] );
f2 = f2/np.sum(f2); # normalize factor 2
# ternary diagram
fig1 = plt.figure(1,figsize=(6,6));
ax1 = plt.subplot(1,1,1);
plt.axis( [0, 1, 0, 1] );
plt.axis('off');
# bounding triangle
plt.plot( [a[0,0], b[0,0]], [a[1,0], b[1,0]], 'k-');
plt.plot( [b[0,0], c[0,0]], [b[1,0], c[1,0]], 'k-');
plt.plot( [c[0,0], a[0,0]], [c[1,0], a[1,0]], 'k-');
plt.text(a[0,0]-0.05, a[1,0],'A',horizontalalignment='center');
plt.text(b[0,0]+0.05, b[1,0],'B',horizontalalignment='center');
plt.text(c[0,0], c[1,0]+0.05,'C',horizontalalignment='center');
# first set of grid lines
L=10;
x = np.zeros((2,2))
for j in range(L):
for k in range(2):
v1 = float(j+1)/float(L);
v2 = (1.0-v1)*float(k);
v3 = 1.0 - (v1+v2);
x[:,k]=(v1*a+v2*b+v3*c).ravel();
plt.plot( [x[0,0], x[0,1]], [x[1,0], x[1,1]], 'k:' );
# second set of grid lines
L=10;
for j in range(L):
for k in range(2):
v3 = float(j+1)/float(L);
v1 =(1-v3)*float(k);
v2 = 1.0 - (v1+v3);
x[:,k]=(v1*a+v2*b+v3*c).ravel();
plt.plot( [x[0,0], x[0,1]], [x[1,0], x[1,1]], 'k:' );
#third set of grid lines
L=10;
for j in range(L):
for k in range(2):
v2 = float(j+1)/float(L);
v1 = (1-v2)*float(k);
v3 = 1.0 - (v1+v2);
x[:,k]=(v1*a+v2*b+v3*c).ravel();
plt.plot( [x[0,0], x[0,1]], [x[1,0], x[1,1]], 'k:' );
# plot factors
y = f1[0]*a+f1[1]*b+f1[2]*c;
plt.plot( y[0], y[1], 'k*' );
plt.text(y[0], y[1]+0.025,'f1',horizontalalignment='center');
y = f2[0]*a+f2[1]*b+f2[2]*c;
plt.plot( y[0], y[1], 'k*' );
plt.text(y[0], y[1]-0.05,'f2',horizontalalignment='center');
# create and plot samples lying between f1 and f2
N=10;
for j in range(N-1):
c1=float(j+1)/float(N);
c2=1-c1;
s = c1*f1 + c2*f2;
y = s[0]*a+s[1]*b+s[2]*c;
plt.plot( y[0], y[1], 'ko' );
plt.show();
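# A quick check (illustrative) of the barycentric mapping used above:
# any composition s with non-negative entries summing to 1 maps to a
# point y = s1*a + s2*b + s3*c inside the bounding triangle.
sdemo = eda_cvec( [0.2, 0.3, 0.5] );
ydemo = sdemo[0,0]*a + sdemo[1,0]*b + sdemo[2,0]*c;
print('demo point inside triangle:', ydemo.ravel());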
In [3]:
# edapy_08_02, factors shown on ternary diagram
# this is edapy_08_01 w/o the grid lines and w/o annotation of factors
# constants needed to make ternary diagram
a = eda_cvec( [0.0,0.0] );
b = eda_cvec( [1.0,0.0] );
c = eda_cvec( [0.5,0.5*sqrt(3)] );
# factor 1
f1 = eda_cvec( [0.15, 0.25, 0.60] );
f1 = f1/np.sum(f1); # normalize factor 1
# factor 2
f2 = eda_cvec( [0.25, 0.45, 0.30] );
f2 = f2/np.sum(f2); # normalize factor 2
# ternary diagram
fig1 = plt.figure(1,figsize=(6,6));
ax1 = plt.subplot(1,1,1);
plt.axis( [0, 1, 0, 1] );
plt.axis('off');
# bounding triangle
plt.plot( [a[0,0], b[0,0]], [a[1,0], b[1,0]], 'k-');
plt.plot( [b[0,0], c[0,0]], [b[1,0], c[1,0]], 'k-');
plt.plot( [c[0,0], a[0,0]], [c[1,0], a[1,0]], 'k-');
plt.text(a[0,0]-0.05, a[1,0],'A',horizontalalignment='center');
plt.text(b[0,0]+0.05, b[1,0],'B',horizontalalignment='center');
plt.text(c[0,0], c[1,0]+0.05,'C',horizontalalignment='center');
# plot factors
y = f1[0]*a+f1[1]*b+f1[2]*c;
plt.plot( y[0], y[1], 'k*' );
y = f2[0]*a+f2[1]*b+f2[2]*c;
plt.plot( y[0], y[1], 'k*' );
# create and plot samples lying between f1 and f2
N=10;
for j in range(N-1):
c1=float(j+1)/float(N);
c2=1-c1;
s = c1*f1 + c2*f2;
y = s[0]*a+s[1]*b+s[2]*c;
plt.plot( y[0], y[1], 'ko' );
plt.show();
In [4]:
# edapy_08_03, alternative factors shown on ternary diagram
# this is edapy_08_02 with a different choice of factors
# constants needed to make ternary diagram
a = eda_cvec( [0.0,0.0] );
b = eda_cvec( [1.0,0.0] );
c = eda_cvec( [0.5,0.5*sqrt(3)] );
# factor 1
f1 = eda_cvec( [0.15, 0.25, 0.60] );
f1 = f1/np.sum(f1); # normalize factor 1
# factor 2
f2 = eda_cvec( [0.25, 0.45, 0.30] );
f2 = f2/np.sum(f2); # normalize factor 2
f1a = 0.5*(f1+f2); # factor 1a
f2a = f1+0.5*(f1-f2); # factor 2a
# ternary diagram
fig1 = plt.figure(1,figsize=(6,6));
ax1 = plt.subplot(1,1,1);
plt.axis( [0, 1, 0, 1] );
plt.axis('off');
# bounding triangle
plt.plot( [a[0,0], b[0,0]], [a[1,0], b[1,0]], 'k-');
plt.plot( [b[0,0], c[0,0]], [b[1,0], c[1,0]], 'k-');
plt.plot( [c[0,0], a[0,0]], [c[1,0], a[1,0]], 'k-');
plt.text(a[0,0]-0.05, a[1,0],'A',horizontalalignment='center');
plt.text(b[0,0]+0.05, b[1,0],'B',horizontalalignment='center');
plt.text(c[0,0], c[1,0]+0.05,'C',horizontalalignment='center');
# create and plot samples lying between f1 and f2
N=10;
for j in range(N-1):
c1=float(j+1)/float(N);
c2=1-c1;
s = c1*f1 + c2*f2;
y = s[0]*a+s[1]*b+s[2]*c;
plt.plot( y[0], y[1], 'ko' );
# plot factors
y = f1a[0]*a+f1a[1]*b+f1a[2]*c;
plt.plot( y[0], y[1], '*', color=(0.7,0.7,0.7) );
y = f2a[0]*a+f2a[1]*b+f2a[2]*c;
plt.plot( y[0], y[1], '*', color=(0.7,0.7,0.7) );
plt.show();
In [7]:
# edapy_08_04, factor analysis of Atlantic Rocks dataset
# using singular value decomposition
# load data
S = np.genfromtxt('../Data/rocks.txt', delimiter='\t')
N, M = np.shape(S);
Smax = np.amax(S);
# break out data into vectors
sio2 = eda_cvec( S[:,0] ); # SiO2
tio2 = eda_cvec( S[:,1] ); # TiO2
als03 = eda_cvec( S[:,2] ); # Al2O3
feot = eda_cvec( S[:,3] ); # FeO-total
mgo = eda_cvec( S[:,4] ); # MgO
cao = eda_cvec( S[:,5] ); # CaO
na20 = eda_cvec( S[:,6] ); # Na2O
k20 = eda_cvec( S[:,7] ); # K2O
# compute factors F and factor loadings C using singular value decomposition
[U, sigma, VT] = la.svd(S,full_matrices=False);
sh = np.shape(sigma);
Ns = sh[0];
F = np.copy(VT);
C = np.matmul( U, np.diag(sigma) );
# check error of reconstruction
Emax = np.amax(S - np.matmul(C,F));
print('max relative error in CF ', Emax/Smax);
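# The rows of F=VT are orthonormal, so the loadings can be recovered
# directly as C = S F.T; a quick consistency check (illustrative):
assert np.allclose(np.matmul(F,F.T), np.identity(Ns));
Crec = np.matmul(S,F.T);
assert np.max(np.abs(C-Crec)) <= 1.0e-6*np.max(np.abs(C));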
# plot singular values
fig1 = plt.figure(1,figsize=(8,4));
ax1 = plt.subplot(1,1,1);
plt.axis( [0, Ns+1, 0, np.max(sigma)] );
plt.plot( np.linspace(1,Ns,Ns), sigma, 'k-' );
plt.plot( np.linspace(1,Ns,Ns), sigma, 'ko' );
plt.title('singular values, s(i)');
plt.xlabel('index, i');
plt.ylabel('s(i)');
# print first five factors
for j in range(5):
fj = np.zeros((M,1)); # factor j as a column vector
fj[0:M,0:1] = F[j:j+1,0:M].T; # is extracted from a row of F
printstr = "factor %d" % (j);
print(printstr);
printstr = " SiO2 %f" % (fj[0,0]);
print(printstr);
printstr = " TiO2 %f" % (fj[1,0]);
print(printstr);
printstr = " Al2O3 %f" % (fj[2,0]);
print(printstr);
printstr = " FeO-total %f" % (fj[3,0]);
print(printstr);
printstr = " MgO %f" % (fj[4,0]);
print(printstr);
printstr = " CaO %f" % (fj[5,0]);
print(printstr);
printstr = " Na2O %f" % (fj[6,0]);
print(printstr);
printstr = " K2O %f" % (fj[7,0]);
print(printstr);
print(' ');
cmin=(-50);
cmax=50;
fig2 = plt.figure(2,figsize=(6,6));
ax1 = fig2.add_subplot(111, projection='3d')
ax1.axes.set_xlim3d(left=cmin, right=cmax);
ax1.axes.set_ylim3d(bottom=cmin, top=cmax);
ax1.axes.set_zlim3d(bottom=cmin, top=cmax);
ax1.view_init(30, 30)
ax1.scatter( xs=C[:,1], ys=C[:,2], zs=C[:,3], color=(0,0,0) );
ax1.plot( xs=[cmin,cmin], ys=[cmin,cmin], zs=[cmin,cmax], color=(0.8,0.8,0.8) );
ax1.plot( xs=[cmin,cmin], ys=[cmin,cmax], zs=[cmin,cmin], color=(0.8,0.8,0.8) );
ax1.plot( xs=[cmin,cmax], ys=[cmin,cmin], zs=[cmin,cmin], color=(0.8,0.8,0.8) );
ax1.plot( xs=[cmax,cmax], ys=[cmax,cmax], zs=[cmax,cmin], color=(0.8,0.8,0.8) );
ax1.plot( xs=[cmax,cmax], ys=[cmax,cmin], zs=[cmax,cmax], color=(0.8,0.8,0.8) );
ax1.plot( xs=[cmax,cmin], ys=[cmax,cmax], zs=[cmax,cmax], color=(0.8,0.8,0.8) );
ax1.plot( xs=[cmax,cmin], ys=[cmin,cmin], zs=[cmax,cmax], color=(0.8,0.8,0.8) );
ax1.plot( xs=[cmax,cmax], ys=[cmin,cmin], zs=[cmax,cmin], color=(0.8,0.8,0.8) );
ax1.plot( xs=[cmin,cmin], ys=[cmax,cmin], zs=[cmax,cmax], color=(0.8,0.8,0.8) );
ax1.plot( xs=[cmin,cmin], ys=[cmax,cmax], zs=[cmax,cmin], color=(0.8,0.8,0.8) );
ax1.plot( xs=[cmax,cmax], ys=[cmax,cmin], zs=[cmin,cmin], color=(0.8,0.8,0.8) );
ax1.plot( xs=[cmax,cmin], ys=[cmax,cmax], zs=[cmin,cmin], color=(0.8,0.8,0.8) );
ax1.set_xlabel('C(2)');
ax1.set_ylabel('C(3)');
ax1.set_zlabel('C(4)');
plt.show();
max relative error in CF  2.9341870488937076e-15
factor 0
 SiO2 -0.908829
 TiO2 -0.024638
 Al2O3 -0.275168
 FeO-total -0.177851
 MgO -0.141341
 CaO -0.209989
 Na2O -0.044611
 K2O -0.003430

factor 1
 SiO2 0.007684
 TiO2 -0.037474
 Al2O3 -0.301583
 FeO-total -0.018421
 MgO 0.923193
 CaO -0.226917
 Na2O -0.058457
 K2O -0.007204

factor 2
 SiO2 -0.161689
 TiO2 -0.126343
 Al2O3 0.567828
 FeO-total -0.659205
 MgO 0.255748
 CaO 0.365682
 Na2O -0.041738
 K2O -0.006464

factor 3
 SiO2 0.209819
 TiO2 0.151367
 Al2O3 0.176021
 FeO-total -0.427461
 MgO -0.118643
 CaO -0.780043
 Na2O 0.302367
 K2O 0.073403

factor 4
 SiO2 0.309495
 TiO2 -0.100476
 Al2O3 -0.670083
 FeO-total -0.585155
 MgO -0.195193
 CaO 0.207980
 Na2O -0.145318
 K2O 0.015035
In [11]:
# edapy_08_05: varimax procedure applied to 2 factors
# define two non-spiky factors, fA and fB
M=4;
fA = eda_cvec( [ 1.0, 1.0, 1.0, 1.0 ] );
fB = eda_cvec( [ 1.0, -1.0, 1.0, -1.0 ]);
fAsq = np.matmul(fA.T, fA); fAsq=fAsq[0,0]
fA = fA / sqrt(fAsq); # normalize to unit length
fBsq = np.matmul(fB.T, fB ); fBsq=fBsq[0,0];
fB = fB / sqrt(fBsq); # normalize to unit length
# standard varimax procedure to determine rotation angle q
u = np.power(fA,2) - np.power(fB,2);
v = 2.0 * np.multiply(fA, fB);
A = 2*M*np.matmul(u.T,v); A=A[0,0];
B = np.sum(u)*np.sum(v);
top = A - B;
C = M*(np.matmul(u.T,u)-np.matmul(v.T,v)); C=C[0,0];
D = (np.sum(u)**2)-(np.sum(v)**2);
bot = C - D;
q = 0.25 * atan2(top,bot);
# rotate factors
cq = cos(q);
sq = sin(q);
fAp = cq*fA + sq*fB;
fBp = -sq*fA + cq*fB;
# display results
print("fA  %.3f %.3f %.3f %.3f" % (fA[0,0], fA[1,0], fA[2,0], fA[3,0]) );
print("fB  %.3f %.3f %.3f %.3f" % (fB[0,0], fB[1,0], fB[2,0], fB[3,0]) );
print(' ');
print("fAp %.3f %.3f %.3f %.3f" % (fAp[0,0], fAp[1,0], fAp[2,0], fAp[3,0]) );
print("fBp %.3f %.3f %.3f %.3f" % (fBp[0,0], fBp[1,0], fBp[2,0], fBp[3,0]) );
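# a quick check (illustrative): the varimax rotation is orthogonal,
# so the rotated pair remains orthonormal (same span, spikier factors)
assert np.isclose(np.matmul(fAp.T,fAp)[0,0], 1.0);
assert np.isclose(np.matmul(fBp.T,fBp)[0,0], 1.0);
assert np.isclose(np.matmul(fAp.T,fBp)[0,0], 0.0);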
fA  0.500 0.500 0.500 0.500
fB  0.500 -0.500 0.500 -0.500

fAp 0.707 0.000 0.707 0.000
fBp 0.000 -0.707 0.000 -0.707
In [13]:
# edapy_08_06
# History
# 2022/06/26 - checked with most recent Python & etc. software
# factor analysis on Atlantic Rocks dataset
# using singular value decomposition
# and the varimax procedure
# load data
S = np.genfromtxt('../Data/rocks.txt', delimiter='\t')
N, M = np.shape(S);
Smax = np.max(np.abs(S));
# compute factors and factor loadings using singular value decomposition
[U, sigma, VT] = la.svd(S,full_matrices=False);
# keep only first five singular values
P=5;
F = np.copy(VT[0:P,:]); # P factors
C = np.matmul( U[:,0:P], np.diag(sigma[0:P])); # P loadings
SP = np.matmul(C,F); # reconstruction of S with P factors
# initial rotated factor matrix, FP, and rotation matrix, MR
MR=np.identity(P);
FP=np.copy(F);
fA = np.zeros((M,1)); # factor A of a pair
fB = np.zeros((M,1)); # factor B of a pair
fAp = np.zeros((M,1)); # factor A of a pair after rotation
fBp = np.zeros((M,1)); # factor B of a pair after rotation
mA = np.zeros((P,1)); # col of rotation matrix associated with A
mB = np.zeros((P,1)); # col of rotation matrix associated with B
mAp = np.zeros((P,1)); # col A of rotation matrix after rotation
mBp = np.zeros((P,1)); # col B of rotation matrix after rotation
# spike these factors using the varimax procedure
k = [1, 2, 3, 4];
Nk = len(k);
for iter in range(3): # three iterations almost always sufficient
for ii in range(Nk): # first factor of pair
for jj in range(ii+1,Nk): # second factor of pair
# spike factors i and j
i=k[ii];
j=k[jj];
# copy factors from matrix to vectors
fA[:,0] = np.copy(FP[i,:]);
fB[:,0] = np.copy(FP[j,:]);
# standard varimax procedure to determine rotation angle q
u = np.power(fA,2) - np.power(fB,2);
v = 2.0 * np.multiply(fA, fB);
AA = 2*M*np.matmul(u.T,v); AA=AA[0,0];
BB = np.sum(u)*np.sum(v);
top = AA - BB;
CC = M*(np.matmul(u.T,u)-np.matmul(v.T,v)); CC=CC[0,0];
DD = (np.sum(u)**2)-(np.sum(v)**2);
bot = CC - DD;
q = 0.25 * atan2(top,bot);
# rotate factors
cq = cos(q);
sq = sin(q);
fAp = cq*fA + sq*fB;
fBp = -sq*fA + cq*fB;
# put back into factor matrix, FP
FP[i:i+1,0:M] = fAp[0:M,0];
FP[j:j+1,0:M] = fBp[0:M,0];
# accumulate rotation
mA[:,0] = np.copy(MR[i,:]);
mB[:,0] = np.copy(MR[j,:]);
mAp = cq*mA + sq*mB;
mBp = -sq*mA + cq*mB;
MR[i:i+1,0:P] = mAp[0:P,0];
MR[j:j+1,0:P] = mBp[0:P,0];
# new factor loadings
CP = np.matmul(C,MR.T);
# reconstruction using new factors
SPP = np.matmul(CP,FP);
# check error of reconstruction
Smax = np.max(np.abs(S));
Emax1 = np.max(np.abs(S - SP));
Emax2 = np.max(np.abs(S - SPP));
print('max relative error in S-CF: %.4e' % (Emax1/Smax) );
print('max relative error in S-(CP)(FP) %.4e' % (Emax2/Smax) );
Fa = np.abs(F);
FPa = np.abs(FP);
eda_draw( 'title f1', Fa[1,:], 'title f2', Fa[2,:], 'title f3', Fa[3,:], 'title f4', Fa[4,:], 'becomes', 'title fp1', FPa[1,:], 'title fp2', FPa[2,:], 'title fp3', FPa[3,:], 'title fp4', FPa[4,:]);
max relative error in S-CF: 5.6630e-02
max relative error in S-(CP)(FP) 5.6630e-02
In [14]:
# edapy_08_07
# comparison of weighted and unweighted factor analysis
# Part 1: Make synthetic chemical data with a wide range of concentrations
# 7 elements, three factors
M = 7;
P = 3;
# typical analytic errors
# [Fe Cu Zn Pb As Ag Au]
se = eda_cvec( [0.010000, 0.010000, 0.010000, 0.001000, 0.001000, 0.0000010, 0.0000001 ] );
# factors as column vectors
v1 = np.zeros((M,1));
v2 = np.zeros((M,1));
v3 = np.zeros((M,1));
# [Fe Cu Zn Pb As Ag Au]
v1 = eda_cvec( [0.200000, 0.004000, 0.010000, 0.010000, 0.001000, 0.0000100, 0.0000010 ] );
v2 = eda_cvec( [0.050000, 0.180000, 0.180000, 0.030000, 0.000200, 0.0000400, 0.0000005 ] );
v3 = eda_cvec( [0.100000, 0.080000, 0.110000, 0.010000, 0.000700, 0.0000100, 0.0000001 ] );
# factor matrix
F = np.concatenate( (v1.T, v2.T, v3.T), axis=0 );
# randomly chosen loadings
N = 10000;
c1 = np.random.uniform(0.0,1.0,(N,1));
c2 = np.random.uniform(0.0,1.0,(N,1));
c3 = 1.0-(c1+c2);
C = np.concatenate( (c1, c2, c3), axis=1 )
# true sample matrix
Strue = np.matmul(C,F);
Sobs = np.zeros((N,M));
# observed sample matrix is true plus Normally-distributed random noise
for i in range(M):
Sobs[:,i] = Strue[:,i] + np.random.normal(0,se[i,0],N);
Smax = np.amax(Sobs);
# histogram limits
Lh = 100;
Fe_min = 0;
Fe_max = 5;
Au_min = 0;
Au_max = 5;
# Part 2: factor analysis without weighting
# factors F and loadings C
[U1, s1, V1T] = la.svd(Sobs,full_matrices=False);
Fpre1 = V1T; # factors
Cpre1 = np.matmul( U1, np.diag(s1) ); # loadings
# P significant factors
Fpre1P = np.copy( Fpre1[0:P,:] );
Cpre1P = np.copy( Cpre1[:,0:P] );
Spre1P = np.matmul( Cpre1P, Fpre1P );
fig1 = plt.figure(1,figsize=(8,4));
# error
e1 = np.abs(Strue-Spre1P); # absolute values of individual errors
Emax = np.max(e1); # maximum error
print('without weighting, max relative error in CF ', Emax/Smax);
c, e = np.histogram( e1[:,0]/se[0,0], Lh, (Fe_min,Fe_max));
Nc = len(c);
Ne = len(e);
count = eda_cvec( c );
edges = eda_cvec( e[0:Ne-1] );
ax1 = plt.subplot(1,2,1);
plt.axis( [Fe_min, Fe_max, 0, 1] );
plt.plot(edges,count/np.max(count),'k-');
plt.title('error in Fe');
plt.xlabel('error/se');
plt.ylabel('count');
c, e = np.histogram( e1[:,M-1]/se[M-1,0], Lh,(Au_min,Au_max));
Nc = len(c);
Ne = len(e);
count = eda_cvec( c );
edges = eda_cvec( e[0:Ne-1] );
ax1 = plt.subplot(1,2,2);
plt.axis( [Au_min, Au_max, 0, 1] );
plt.plot(edges,count/np.max(count),'k-');
plt.title('Error in Au');
plt.xlabel('error/se');
plt.ylabel('count');
# Part 3: factor analysis with weighting by analytic precision
# factors F and loadings C
w = np.reciprocal(se);
[U2, s2, V2T] = la.svd( np.matmul(Sobs,np.diag(w.ravel())),full_matrices=False);
Fpre2 = np.matmul(V2T,np.diag(se.ravel())); # factors
Cpre2 = np.matmul( U2, np.diag(s2) ); # loadings
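# Since W=diag(1/se) is invertible and all singular values are kept,
# Sobs = (U2 s2 V2T) W^(-1) = (Cpre2)(Fpre2) exactly; a quick
# consistency check (illustrative):
assert np.max(np.abs(Sobs-np.matmul(Cpre2,Fpre2))) <= 1.0e-6*np.max(np.abs(Sobs));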
# P significant factors
Fpre2P = np.copy( Fpre2[0:P,:] );
Cpre2P = np.copy( Cpre2[:,0:P] );
Spre2P = np.matmul( Cpre2P, Fpre2P );
fig2 = plt.figure(2,figsize=(8,4));
# check error of reconstruction
# Emax is absolute value of difference between true and reconstructed sample matrices
e2 = np.abs(Strue-Spre2P);
Emax = np.max(e2);
print('with weighting, max relative error in CF ', Emax/Smax);
c, e = np.histogram( e2[:,0]/se[0,0], Lh, (Fe_min,Fe_max));
Nc = len(c);
Ne = len(e);
count = eda_cvec( c );
edges = eda_cvec( e[0:Ne-1] );
ax1 = plt.subplot(1,2,1);
plt.axis( [Fe_min, Fe_max, 0, 1] );
plt.plot(edges,count/np.max(count),'k-');
plt.title('error in Fe');
plt.xlabel('error/se');
plt.ylabel('count');
c, e = np.histogram( e2[:,M-1]/se[M-1,0], Lh,(Au_min,Au_max));
Nc = len(c);
Ne = len(e);
count = eda_cvec( c );
edges = eda_cvec( e[0:Ne-1] );
ax1 = plt.subplot(1,2,2);
plt.axis( [Au_min, Au_max, 0, 1] );
plt.plot(edges,count/np.max(count),'k-');
plt.title('Error in Au');
plt.xlabel('error/se');
plt.ylabel('count');
without weighting, max relative error in CF  0.17895452318852598
with weighting, max relative error in CF  0.15821508184561714
In [16]:
# edapy_08_08
# factor analysis on Mid Atlantic Ridge Rocks dataset
# Q-mode factors with geographical plots
# load data
D = np.genfromtxt('../Data/rocks_with_lat_lon.txt', delimiter='\t')
N, K = np.shape(D);
M=K-2;
# break out (lat,lon) coordinates and sample matrix S into separate vectors
lat = eda_cvec( D[:,8] ); # lat
lon = eda_cvec( D[:,9] ); # lon
S = np.copy(D[:,0:M]); # sample matrix S
# svd
[U, s, VT] = la.svd(S.T,full_matrices=False);
# keep only first few singular values
P=4;
FP = np.copy(VT[0:P,:]); # factor matrix
CP = np.matmul( U[:,0:P], np.diag(s[0:P])); # loading matrix
# map bounds
left = -60.0;
right = 10.0;
bottom = -70.0;
top = 70.0;
# plot
fig1 = plt.figure(1,figsize=(16,10));
# PART 1: factor 1
ax1 = plt.subplot(1,3,1);
plt.axis( [left, right, bottom, top] );
# plot one rectangle per sample i, with color scaled according to
# the strength of the i-th element in factor nf=1
nf = 1;
FPmean = np.mean(FP[nf,:]);
FPstd = np.std(FP[nf,:]);
fmin = -2*FPstd
fmax = 2*FPstd
for i in range(N):
h = 3;
c = (FP[nf,i]-fmin)/(fmax-fmin);
if( c<0.0 ):
c=0.0;
elif ( c>1.0 ):
c=1.0;
ax1.add_patch( pch.Rectangle( (lon[i,0], lat[i,0]), h, h, fc=(c,c,c), ec=(0.0,0.0,0.0) ) );
plt.xlabel('longitude (deg)');
plt.ylabel('latitude (deg)');
plt.title('Factor 1');
# plot coastlines on maps, using coarse coastlines from the file coastdata.txt
# this file consists of lists of (lon,lat) pairs along segments of coastline,
# with each list delimited by a ">".
coastfile = '../Data/coastdata.txt';
fd = open(coastfile,"r");
NN = 0;
NNMAX = 10000;
mylat = np.zeros((NNMAX,1));
mylon = np.zeros((NNMAX,1));
for i in range(100000):
line = fd.readline();
if not line:
break;
if( line[0] == '>' ):
if( NN == 0):
continue;
plt.plot( mylon[0:NN,0], mylat[0:NN,0], '.', color=(0.7,0.7,0.7) );
NN=0;
else:
latlonstr=line.split();
mylat[NN,0] = float(latlonstr[1]);
mylon[NN,0] = float(latlonstr[0]);
NN=NN+1;
fd.close();
# PART 2: factor 2
ax2 = plt.subplot(1,3,2);
plt.axis( [left, right, bottom, top] );
# plot one rectangle per sample i, with color scaled according to
# the strength of the i-th element in factor nf=2
nf = 2;
FPmean = np.mean(FP[nf,:]);
FPstd = np.std(FP[nf,:]);
fmin = -2*FPstd
fmax = 2*FPstd
for i in range(N):
h = 3;
c = (FP[nf,i]-fmin)/(fmax-fmin);
if( c<0.0 ):
c=0.0;
elif ( c>1.0 ):
c=1.0;
ax2.add_patch( pch.Rectangle( (lon[i,0], lat[i,0]), h, h, fc=(c,c,c), ec=(0.0,0.0,0.0) ) );
plt.xlabel('longitude (deg)');
plt.ylabel('latitude (deg)');
plt.title('Factor 2');
# plot coastlines on maps, using coarse coastlines from the file coastdata.txt
# this file consists of lists of (lon,lat) pairs along segments of coastline,
# with each list delimited by a ">".
coastfile = '../Data/coastdata.txt';
fd = open(coastfile,"r");
NN = 0;
NNMAX = 10000;
mylat = np.zeros((NNMAX,1));
mylon = np.zeros((NNMAX,1));
for i in range(100000):
line = fd.readline();
if not line:
break;
if( line[0] == '>' ):
if( NN == 0):
continue;
plt.plot( mylon[0:NN,0], mylat[0:NN,0], '.', color=(0.7,0.7,0.7) );
NN=0;
else:
latlonstr=line.split();
mylat[NN,0] = float(latlonstr[1]);
mylon[NN,0] = float(latlonstr[0]);
NN=NN+1;
fd.close();
# PART 3: factor 3
ax3 = plt.subplot(1,3,3);
plt.axis( [left, right, bottom, top] );
# plot one rectangle per sample i, with color scaled according to
# the strength of the i-th element in factor nf=3
nf = 3;
FPmean = np.mean(FP[nf,:]);
FPstd = np.std(FP[nf,:]);
fmin = -2*FPstd
fmax = 2*FPstd
for i in range(N):
h = 3;
c = (FP[nf,i]-fmin)/(fmax-fmin);
if( c<0.0 ):
c=0.0;
elif ( c>1.0 ):
c=1.0;
ax3.add_patch( pch.Rectangle( (lon[i,0], lat[i,0]), h, h, fc=(c,c,c), ec=(0.0,0.0,0.0) ) );
plt.xlabel('longitude (deg)');
plt.ylabel('latitude (deg)');
plt.title('Factor 3');
# plot coastlines on maps, using coarse coastlines from the file coastdata.txt
# this file consists of lists of (lon,lat) pairs along segments of coastline,
# with each list delimited by a ">".
coastfile = '../Data/coastdata.txt';
fd = open(coastfile,"r");
NN = 0;
NNMAX = 10000;
mylat = np.zeros((NNMAX,1));
mylon = np.zeros((NNMAX,1));
for i in range(100000):
line = fd.readline();
if not line:
break;
if( line[0] == '>' ):
if( NN == 0):
continue;
plt.plot( mylon[0:NN,0], mylat[0:NN,0], '.', color=(0.7,0.7,0.7) );
NN=0;
else:
latlonstr=line.split();
mylat[NN,0] = float(latlonstr[1]);
mylon[NN,0] = float(latlonstr[0]);
NN=NN+1;
fd.close();
plt.show();
In [17]:
# edapy_08_09
# CAC Pacific Sea Surface Temperature (SST) dataset
# read in and plot the data
# load the data
D = np.genfromtxt('../Data/cac_sst.txt', delimiter='\t')
# set up sizes of arrays
I, J = np.shape(D);
NBLOCK=85;
NLON = NBLOCK-1;
NLAT = J-1;
IMAGES = floor(I/NBLOCK);
Dt = 1/12;
print('Images:',IMAGES)
# arrays for SST, date, lats, lons
SST = np.zeros((NLON,NLAT,IMAGES));
thedate = np.zeros((IMAGES,1)); # in month.year format
theyear = np.zeros((IMAGES,1)); # year extracted from thedate
themonth = np.zeros((IMAGES,1)); # month extracted from thedate
lats=np.zeros((NLAT,1));
lons=np.zeros((NLON,1));
# cut up data into SST arrays
lats = np.copy(D[0,1:J]).T;
lons = np.copy(D[1:NBLOCK,0]);
for j in range(IMAGES):
k1 = j*NBLOCK;
k2 = k1+NLON;
thedate[j,0]=D[k1,0];
themonth[j,0]=floor(thedate[j,0]);
theyear[j,0]=floor(10000*(thedate[j,0]-themonth[j,0])+0.1);
SST[:,:,j] = np.copy(D[k1+1:k2+1,1:J]);
print("start year %d month %d" % (theyear[0,0], themonth[0,0]));
print("end year %d month %d" % (theyear[IMAGES-1,0], themonth[IMAGES-1,0]));
# some definitions related to plotting
MONTHS=12; # months in year
YEARS=ceil(IMAGES/12); # years of images
YBLOCKSIZE = 6; # 6 years per plot block of years
YBLOCKS = ceil(YEARS/YBLOCKSIZE); # number of blocks
print("%d blocks of years" % (YBLOCKS) );
# grey-scale colormap
bw = np.zeros((256,4));
v = (255 - np.linspace( 0, 255, 256 ))/255;
bw[:,0] = v;
bw[:,1] = v;
bw[:,2] = v;
bw[:,3] = np.ones(256);
bwcmap = ListedColormap(bw);
# plot data, 12 months by six years of maps per figure
lastmonth = (IMAGES % (YBLOCKSIZE*12)) % YBLOCKSIZE;
print("lastmonth %d" % (lastmonth));
done=0;
for yb in range(YBLOCKS):
fig = plt.figure(yb+2,figsize=(16,10));
for y in range(YBLOCKSIZE):
for m in range(MONTHS):
j = m+MONTHS*y+MONTHS*YBLOCKSIZE*yb;
k = y + m*YBLOCKSIZE;
if( j>=IMAGES):
done=1;
break;
ax = plt.subplot(MONTHS, YBLOCKSIZE, k+1);
left=0;
right=3.0;
bottom=0.0;
top=1.0;
plt.axis( [left, right, bottom, top] );
ax.xaxis.set_ticks([]);
ax.yaxis.set_ticks([]);
MAP=np.flipud(SST[:,:,j].T); # reorient so it plots as map
MAPmin = np.min(MAP);
MAPmax = np.max(MAP);
MAPrange = MAPmax-MAPmin;
if( MAPrange<=0.0 ):
MAPrange=1;
plt.imshow( (MAP-MAPmin)/MAPrange, cmap=bwcmap, vmin=0, vmax=1, extent=(left,right,bottom,top) );
if( y==0 ):
plt.ylabel("%d" % (themonth[j,0]) );
if( (m==11) or ((yb==(YBLOCKS-1)) and (m==(lastmonth-1))) ):
plt.xlabel("%d" % (theyear[j,0]) );
if( done==1 ):
break;
if( done==1 ):
break;
plt.show();
Images: 399
start year 1970 month 1
end year 2003 month 3
6 blocks of years
lastmonth 3
In [18]:
# edapy_08_10
# CAC Pacific Sea Surface Temperature (SST) dataset
# plot singular values
# load the data
D = np.genfromtxt('../Data/cac_sst.txt', delimiter='\t')
# set up sizes of arrays
I, J = np.shape(D);
NBLOCK=85;
NLON = NBLOCK-1;
NLAT = J-1;
IMAGES = floor(I/NBLOCK);
Dt = 1/12;
# arrays for SST, date, lats, lons
SST = np.zeros((NLON,NLAT,IMAGES));
thedate = np.zeros((IMAGES,1)); # in month.year format
theyear = np.zeros((IMAGES,1)); # year extracted from thedate
themonth = np.zeros((IMAGES,1)); # month extracted from thedate
lats=np.zeros((NLAT,1));
lons=np.zeros((NLON,1));
# cut up data into SST arrays
lats = np.copy(D[0,1:J]).T;
lons = np.copy(D[1:NBLOCK,0]);
for j in range(IMAGES):
k1 = j*NBLOCK;
k2 = k1+NLON;
thedate[j,0]=D[k1,0];
themonth[j,0]=floor(thedate[j,0]);
theyear[j,0]=floor(10000*(thedate[j,0]-themonth[j,0])+0.1);
SST[:,:,j] = np.copy(D[k1+1:k2+1,1:J]);
# some definitions related to plotting
MONTHS=12;
YEARS=ceil(IMAGES/12);
YBLOCKSIZE = 6;
YBLOCKS = ceil(YEARS/YBLOCKSIZE);
# fold out the images into the sample array
SSTV = np.zeros( (IMAGES, NLAT*NLON) );
for j in range(IMAGES):
for p in range(NLAT):
for q in range(NLON):
k = p + q*NLAT;
SSTV[j,k] = SST[q, p, j];
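# the triple loop above is equivalent to a transpose-and-reshape,
# since row-major flattening of each (NLON,NLAT) map gives the same
# index k = p + q*NLAT; a quick equivalence check (illustrative):
SSTV2 = np.reshape(np.transpose(SST,(2,0,1)),(IMAGES,NLAT*NLON));
assert np.allclose(SSTV,SSTV2);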
# singular value decomposition
[U, sigma, VT] = la.svd(SSTV,full_matrices=False); # singular values sigma
sh = np.shape(sigma);
Ns = sh[0];
F = np.copy(VT); # factors
C = np.matmul( U, np.diag(sigma) ); # factor loadings
# plot singular values
fig1 = plt.figure(1,figsize=(8,4));
ax1 = plt.subplot(1,1,1);
plt.axis( [0, Ns+1, 0, 1.1*np.max(sigma)] );
plt.plot( np.linspace(1,Ns,Ns), sigma, 'k-');
plt.title('Singular Values of the CAC dataset');
plt.xlabel('index i');
plt.ylabel('singular value i');
plt.show();
In [19]:
# edapy_08_11
# CAC Pacific Sea Surface Temperature (SST) dataset
# plot factors as maps and loadings as timeseries
# load the data
D = np.genfromtxt('../Data/cac_sst.txt', delimiter='\t');
# set up sizes of arrays
I, J = np.shape(D);
NBLOCK=85;
NLON = NBLOCK-1;
NLAT = J-1;
IMAGES = floor(I/NBLOCK);
Dt = 1/12;
# arrays for SST, date, lats, lons
SST = np.zeros((NLON,NLAT,IMAGES));
thedate = np.zeros((IMAGES,1)); # in month.year format
theyear = np.zeros((IMAGES,1)); # year extracted from thedate
themonth = np.zeros((IMAGES,1)); # month extracted from thedate
lats=np.zeros((NLAT,1));
lons=np.zeros((NLON,1));
# cut up data into SST arrays
lats = np.copy(D[0,1:J]).T;
lons = np.copy(D[1:NBLOCK,0]);
for j in range(IMAGES):
k1 = j*NBLOCK;
k2 = k1+NLON;
thedate[j,0]=D[k1,0];
themonth[j,0]=floor(thedate[j,0]);
theyear[j,0]=floor(10000*(thedate[j,0]-themonth[j,0])+0.1);
SST[:,:,j] = np.copy(D[k1+1:k2+1,1:J]);
# some definitions related to plotting
MONTHS=12;
YEARS=ceil(IMAGES/12);
YBLOCKSIZE = 6;
YBLOCKS = ceil(YEARS/YBLOCKSIZE);
# fold out the images into the sample matrix
SSTV = np.zeros( (IMAGES, NLAT*NLON) );
for j in range(IMAGES):
for p in range(NLAT):
for q in range(NLON):
k = p + q*NLAT;
SSTV[j,k] = SST[q, p, j];
# singular value decomposition
[U, sigma, VT] = la.svd(SSTV,full_matrices=False);
sh = np.shape(sigma); # sigma is the singular values
Ns = sh[0];
# keep only P singular values
P=Ns;
F = np.copy(VT[0:P,:]); # factors
C = np.matmul( U[:,0:P], np.diag(sigma[0:P])); # factor loadings
# fold EOF's back into images (maps)
Nf = 12; # just do a few EOF's
SSTF = np.zeros((NLON,NLAT,Nf));
for j in range(Nf):
for p in range(NLAT):
for q in range(NLON):
k = p + q*NLAT;
SSTF[ q, p, j ] = F[j,k];
# grey-scale colormap
bw = np.zeros((256,4));
v = (255 - np.linspace( 0, 255, 256 ))/255;
bw[:,0] = v;
bw[:,1] = v;
bw[:,2] = v;
bw[:,3] = np.ones(256);
bwcmap = ListedColormap(bw);
# plot these EOF's
fig1 = plt.figure(1,figsize=(16,6));
for f in range(Nf):
ax = plt.subplot(3, 4, f+1);
left=0;
right=3.0;
bottom=0.0;
top=1.0;
plt.axis( [left, right, bottom, top] );
ax.xaxis.set_ticks([]);
ax.yaxis.set_ticks([]);
MAP=np.flipud(SSTF[:,:,f].T); # reorient so it plots as map
MAPmin = np.min(MAP);
MAPmax = np.max(MAP);
MAPrange = MAPmax-MAPmin;
if( MAPrange<=0.0 ):
MAPrange=1;
plt.imshow( (MAP-MAPmin)/MAPrange, cmap=bwcmap, vmin=0, vmax=1, extent=(left,right,bottom,top) );
plt.title("EOF %d" % (f+1) );
plt.show();
# plot corresponding amplitude timeseries
fig2 = plt.figure(2,figsize=(16,12));
for f in range(Nf):
ax = plt.subplot(3, 4, f+1);
plt.axis( [theyear[0,0], theyear[IMAGES-1,0]+1, np.min(C[:,f]), np.max(C[:,f]) ] );
plt.plot( theyear[0,0]+np.linspace(1,IMAGES,IMAGES)/12, C[:,f], 'k-' );
plt.title("C(%d,t)" % (f+1) );
plt.show();
In [20]:
# edapy_08_12, approximate SST's using lowest factors
# CAC Pacific Sea Surface Temperature (SST) dataset
# load the data
D = np.genfromtxt('../Data/cac_sst.txt', delimiter='\t')
# set up sizes of arrays
I, J = np.shape(D);
NBLOCK=85;
NLON = NBLOCK-1;
NLAT = J-1;
IMAGES = floor(I/NBLOCK);
Dt = 1/12;
print('Images:',IMAGES)
# arrays for SST, date, lats, lons
SST = np.zeros((NLON,NLAT,IMAGES));
thedate = np.zeros((IMAGES,1)); # in month.year format
theyear = np.zeros((IMAGES,1)); # year extracted from thedate
themonth = np.zeros((IMAGES,1)); # month extracted from thedate
lats=np.zeros((NLAT,1));
lons=np.zeros((NLON,1));
# cut up data into SST arrays
lats = np.copy(D[0,1:J]).T;
lons = np.copy(D[1:NBLOCK,0]);
for j in range(IMAGES):
k1 = j*NBLOCK;
k2 = k1+NLON;
thedate[j,0]=D[k1,0];
themonth[j,0]=floor(thedate[j,0]);
theyear[j,0]=floor(10000*(thedate[j,0]-themonth[j,0])+0.1);
SST[:,:,j] = np.copy(D[k1+1:k2+1,1:J]);
print("start year %d month %d" % (theyear[0,0], themonth[0,0]));
print("end year %d month %d" % (theyear[IMAGES-1,0], themonth[IMAGES-1,0]));
# some definitions related to plotting
MONTHS=12;
YEARS=ceil(IMAGES/12);
YBLOCKSIZE = 6;
YBLOCKS = ceil(YEARS/YBLOCKSIZE);
# fold out images into the sample matrix
SSTV = np.zeros( (IMAGES, NLAT*NLON) );
for j in range(IMAGES):
for p in range(NLAT):
for q in range(NLON):
k = p + q*NLAT;
SSTV[j,k] = SST[q, p, j];
# singular value decomposition
[U, sigma, VT] = la.svd(SSTV,full_matrices=False);
sh = np.shape(sigma); # sigma is the singular values
Ns = sh[0];
# keep only first few singular values
P=5;
FP = np.copy(VT[0:P,:]); # factors
CP = np.matmul( U[:,0:P], np.diag(sigma[0:P])); # factor loadings
# reconstruct with a small number of EOFs
SSTVp = np.matmul(CP, FP);
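# The rank-P reconstruction is optimal in the least squares sense;
# its total squared misfit equals the sum of squares of the discarded
# singular values (Eckart-Young); a quick check (illustrative):
E2 = np.sum(np.power(SSTV-SSTVp,2));
assert np.isclose( E2, np.sum(np.power(sigma[P:Ns],2)) );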
# fold reconstructed sample matrix back into images (maps)
Nf = IMAGES;
SSTp = np.zeros((NLON,NLAT,Nf));
for j in range(Nf):
for p in range(NLAT):
for q in range(NLON):
k = p + q*NLAT;
SSTp[ q, p, j ] = SSTVp[j,k];
# grey-scale colormap
bw = np.zeros((256,4));
v = (255 - np.linspace( 0, 255, 256 ))/255;
bw[:,0] = v;
bw[:,1] = v;
bw[:,2] = v;
bw[:,3] = np.ones(256);
bwcmap = ListedColormap(bw);
# plot data, 12 months by six years of maps per figure
lastmonth = (IMAGES % (YBLOCKSIZE*12)) % YBLOCKSIZE;
done=0;
for yb in range(YBLOCKS):
fig = plt.figure(yb+2,figsize=(16,10));
for y in range(YBLOCKSIZE):
for m in range(MONTHS):
j = m+MONTHS*y+MONTHS*YBLOCKSIZE*yb;
k = y + m*YBLOCKSIZE;
if( j>=IMAGES):
done=1;
break;
ax = plt.subplot(MONTHS, YBLOCKSIZE, k+1);
left=0;
right=3.0;
bottom=0.0;
top=1.0;
plt.axis( [left, right, bottom, top] );
ax.xaxis.set_ticks([]);
ax.yaxis.set_ticks([]);
MAP=np.flipud(SSTp[:,:,j].T); # reorient so it plots as map
MAPmin = np.min(MAP);
MAPmax = np.max(MAP);
MAPrange = MAPmax-MAPmin;
if( MAPrange<=0.0 ):
MAPrange=1;
plt.imshow( (MAP-MAPmin)/MAPrange, cmap=bwcmap, vmin=0, vmax=1, extent=(left,right,bottom,top) );
if( y==0 ):
plt.ylabel("%d" % (themonth[j,0]) );
if( (m==11) or ((yb==(YBLOCKS-1)) and (m==(lastmonth-1))) ):
plt.xlabel("%d" % (theyear[j,0]) );
if( done==1 ):
break;
if( done==1 ):
break;
plt.show();
Images: 399
start year 1970 month 1
end year 2003 month 3
In [21]:
# edapy_08_13, create a dataset for cluster analysis
# Note: In order to prevent overwriting of the originally
# distributed dataset, filenames have been changed from
# "../Data/threeclusters.txt" to "../Data/mythreeclusters.txt" and
# "../Data/threeclustermeans.txt" to "../Data/mythreeclustermeans.txt"
# change them back if you want ...
K=3; # three clusters
# 100 samples per cluster
N1=100;
N2=100;
N3=100;
N=N1+N2+N3;
M=2; # two elements
# means set by hand
mu = np.zeros((K,M));
mu[0,0] = 1.0;
mu[0,1] = 2.0;
mu[1,0] = 3.0;
mu[1,1] = 4.0;
mu[2,0] = 4.0;
mu[2,1] = 1.0;
# sample matrix: Normally-distributed scatter about each cluster mean
S = np.zeros((N,M));
sc = 0.5;
S[0:N1,0:1] = np.random.normal( loc=mu[0,0], scale=sc, size=(N1,1) );
S[0:N1,1:2] = np.random.normal( loc=mu[0,1], scale=sc, size=(N1,1) );
S[N1:N1+N2,0:1] = np.random.normal( loc=mu[1,0], scale=sc, size=(N2,1) );
S[N1:N1+N2,1:2] = np.random.normal( loc=mu[1,1], scale=sc, size=(N2,1) );
S[N1+N2:N,0:1] = np.random.normal( loc=mu[2,0], scale=sc, size=(N3,1) );
S[N1+N2:N,1:2] = np.random.normal( loc=mu[2,1], scale=sc, size=(N3,1) );
# cluster membership
k1 = np.zeros((N1,1),dtype=int);
k2 = np.ones((N2,1),dtype=int);
k3 = 2*np.ones((N3,1),dtype=int);
myk = np.concatenate((k1,k2,k3),axis=0);
fig1 = plt.figure(1,figsize=(5,5));
ax1 = plt.subplot(1,1,1);
plt.axis( [0, 5, 0, 5] );
for i in range(N):
if myk[i,0]==0:
plt.plot( S[i,0], S[i,1], 'k.', lw=1 );
elif myk[i,0]==1:
plt.plot( S[i,0], S[i,1], 'k+', lw=1 );
else:
plt.plot( S[i,0], S[i,1], 'ko', lw=1 );
plt.plot( mu[0:K,0:1], mu[0:K,1:2], 'b^', lw=2 );
plt.xlabel('x');
plt.ylabel('y');
plt.show();
Dm = np.concatenate( (S,myk),axis=1);
np.savetxt("../Data/mythreeclusters.txt",Dm, delimiter="\t");
np.savetxt("../Data/mythreeclustermeans.txt",mu, delimiter="\t");
In [26]:
# edapy_08_14, cluster analysis w/ prior mean initialization
# load the data
K = 3;
D = np.genfromtxt("../Data/threeclusters.txt", delimiter='\t');
N, M = np.shape(D);
M = M-1;
S = np.copy( D[0:N,0:2] )
myk = np.ndarray.astype(D[0:N,2:3], int);
mu = np.genfromtxt("../Data/threeclustermeans.txt", delimiter='\t');
# initial means are the prior means
initmu = np.zeros((K,M));
initmu[0,0] = 1.2;
initmu[0,1] = 2.2;
initmu[1,0] = 3.4;
initmu[1,1] = 4.4;
initmu[2,0] = 4.1;
initmu[2,1] = 1.1;
# k-means analysis
mymu, myk, mydist2, mycount, myKdist2 = eda_kmeans(S, initmu);
print('cluster means');
print(mymu);
print('mean sample distance');
print(np.mean(np.sqrt(mydist2)));
print('cluster counts');
print(mycount);
print('mean inter-cluster distance');
print(np.mean(np.sqrt(myKdist2)));
fig2 = plt.figure(2,figsize=(5,5));
ax1 = plt.subplot(1,1,1);
plt.axis( [0, 5, 0, 5] );
plt.plot( initmu[0:K,0:1], initmu[0:K,1:2], 'r^', lw=2 )
for i in range(N):
if myk[i,0]==0:
plt.plot( S[i,0], S[i,1], 'k.', lw=1 );
elif myk[i,0]==1:
plt.plot( S[i,0], S[i,1], 'k+', lw=1 );
else:
plt.plot( S[i,0], S[i,1], 'ko', lw=1 );
plt.plot( mymu[0:K,0:1], mymu[0:K,1:2], 'b^', lw=2 )
plt.xlabel('x');
plt.ylabel('y');
plt.show();
cluster means
[[1.07859747 2.02539117]
 [3.04311042 4.05478594]
 [4.05072009 1.03414835]]
mean sample distance
0.6470009786672823
cluster counts
[[100]
 [100]
 [100]]
mean inter-cluster distance
2.0315147021424114
In [32]:
# edapy_08_15, cluster analysis w/ random-partition initialization
# load the data
K = 3;
D = np.genfromtxt("../Data/threeclusters.txt", delimiter='\t');
N, M = np.shape(D);
M = M-1;
S = np.copy( D[0:N,0:2] )
myk = np.ndarray.astype(D[0:N,2:3], int);
mu = np.genfromtxt("../Data/threeclustermeans.txt", delimiter='\t');
# initial means are from random partition of the data set
initmu = eda_random_partition_mean(S,K);
# k-means analysis
mymu, myk, mydist, mycount, myKdist = eda_kmeans(S, initmu);
print('cluster means');
print(mymu);
print('mean sample distance');
print(np.mean(np.sqrt(mydist)));
print('cluster counts');
print(mycount);
print('mean inter-cluster distance');
print(np.mean(np.sqrt(myKdist)));
fig3 = plt.figure(3,figsize=(5,5));
ax1 = plt.subplot(1,1,1);
plt.axis( [0, 5, 0, 5] );
plt.plot( initmu[0:K,0:1], initmu[0:K,1:2], 'r^', lw=4 )
for i in range(N):
if myk[i,0]==0:
plt.plot( S[i,0], S[i,1], 'k.', lw=1 );
elif myk[i,0]==1:
plt.plot( S[i,0], S[i,1], 'k+', lw=1 );
else:
plt.plot( S[i,0], S[i,1], 'ko', lw=1 );
plt.plot( mymu[0:K,0:1], mymu[0:K,1:2], 'b^', lw=4 );
plt.xlabel('x');
plt.ylabel('y');
plt.show();
cluster means
[[3.04311042 4.05478594]
 [4.05072009 1.03414835]
 [1.07859747 2.02539117]]
mean sample distance
0.6470009786672823
cluster counts
[[100]
 [100]
 [100]]
mean inter-cluster distance
2.0315147021424114
In [33]:
# edapy_08_16, cluster analysis w/ Forgy initialization
# load the data
K = 3;
D = np.genfromtxt("../Data/threeclusters.txt", delimiter='\t');
N, M = np.shape(D);
M = M-1;
S = np.copy( D[0:N,0:2] )
myk = np.ndarray.astype(D[0:N,2:3], int);
mu = np.genfromtxt("../Data/threeclustermeans.txt", delimiter='\t');
# initial means are randomly-chosen samples
initmu = eda_forgy_mean(S,K);
# k-means analysis
mymu, myk, mydist2, mycount, myKdist2 = eda_kmeans(S, initmu);
print('cluster means');
print(mymu);
print('mean sample distance');
print(np.mean(np.sqrt(mydist2)));
print('cluster counts');
print(mycount);
print('mean inter-cluster distance');
print(np.mean(np.sqrt(myKdist2)));
fig4 = plt.figure(4,figsize=(5,5));
ax1 = plt.subplot(1,1,1);
plt.axis( [0, 5, 0, 5] );
plt.plot( initmu[0:K,0:1], initmu[0:K,1:2], 'r^', lw=4 )
for i in range(N):
if myk[i,0]==0:
plt.plot( S[i,0], S[i,1], 'k.', lw=1 );
elif myk[i,0]==1:
plt.plot( S[i,0], S[i,1], 'k+', lw=1 );
else:
plt.plot( S[i,0], S[i,1], 'ko', lw=1 );
plt.plot( mymu[0:K,0:1], mymu[0:K,1:2], 'b^', lw=4 )
plt.xlabel('x');
plt.ylabel('y');
plt.show();
cluster means
[[1.07859747 2.02539117]
 [3.04311042 4.05478594]
 [4.05072009 1.03414835]]
mean sample distance
0.6470009786672823
cluster counts
[[100]
 [100]
 [100]]
mean inter-cluster distance
2.0315147021424114
In [34]:
# edapy_08_17, cluster analysis w/ Bradley-Fayyad initialization
# load the data
K = 3;
D = np.genfromtxt("../Data/threeclusters.txt", delimiter='\t');
N, M = np.shape(D);
M = M-1;
S = np.copy( D[0:N,0:2] )
myk = np.ndarray.astype(D[0:N,2:3], int);
mu = np.genfromtxt("../Data/threeclustermeans.txt", delimiter='\t');
# Bradley-Fayyad parameters
Nsets = 100; # number of small sets
fraction = 0.05; # size of small sets = fraction of N
# initial means are Bradley-Fayyad means
initmu = eda_bradley_fayyad_mean(S,K,Nsets,fraction);
# k-means analysis
mymu, myk, mydist2, mycount, myKdist2 = eda_kmeans(S, initmu);
print('cluster means');
print(mymu);
print('mean sample distance');
print(np.mean(np.sqrt(mydist2)));
print('cluster counts');
print(mycount);
print('mean inter-cluster distance');
print(np.mean(np.sqrt(myKdist2)));
fig5 = plt.figure(5,figsize=(5,5));
ax1 = plt.subplot(1,1,1);
plt.axis( [0, 5, 0, 5] );
plt.plot( initmu[0:K,0:1], initmu[0:K,1:2], 'r^', lw=4 )
for i in range(N):
if myk[i,0]==0:
plt.plot( S[i,0], S[i,1], 'k.', lw=1 );
elif myk[i,0]==1:
plt.plot( S[i,0], S[i,1], 'k+', lw=1 );
else:
plt.plot( S[i,0], S[i,1], 'ko', lw=1 );
plt.plot( mymu[0:K,0:1], mymu[0:K,1:2], 'b^', lw=4 )
plt.xlabel('x');
plt.ylabel('y');
plt.show();
cluster means
[[1.07859747 2.02539117]
 [3.04311042 4.05478594]
 [4.05072009 1.03414835]]
mean sample distance
0.6470009786672823
cluster counts
[[100]
 [100]
 [100]]
mean inter-cluster distance
2.0315147021424114
In [35]:
# edapy_08_18, more clusters than actually occur
# load the data
K = 3;
D = np.genfromtxt("../Data/threeclusters.txt", delimiter='\t');
N, M = np.shape(D);
M = M-1;
S = np.copy( D[0:N,0:2] )
myk = np.ndarray.astype(D[0:N,2:3], int);
mu = np.genfromtxt("../Data/threeclustermeans.txt", delimiter='\t');
# six clusters
K6=6;
# initial means are Forgy means
initmu = eda_forgy_mean(S,K6);
# k-means analysis
mymu, myk, mydist2, mycount, myKdist2 = eda_kmeans(S, initmu);
print('cluster means');
print(mymu);
print('mean sample distance');
print(np.mean(np.sqrt(mydist2)));
print('cluster counts');
print(mycount);
print('mean inter-cluster distance');
print(np.mean(np.sqrt(myKdist2)));
print('inter-cluster distances');
print(np.sqrt(myKdist2));
fig4 = plt.figure(4,figsize=(5,5));
ax1 = plt.subplot(1,1,1);
plt.axis( [0, 5, 0, 5] );
for i in range(N):
if myk[i,0]==0:
plt.plot( S[i,0], S[i,1], 'k.', lw=1 );
elif myk[i,0]==1:
plt.plot( S[i,0], S[i,1], 'k+', lw=1 );
elif myk[i,0]==2:
plt.plot( S[i,0], S[i,1], 'k^', lw=1 );
elif myk[i,0]==3:
plt.plot( S[i,0], S[i,1], 'kx', lw=1 );
elif myk[i,0]==4:
plt.plot( S[i,0], S[i,1], 'kv', lw=1 );
else:
plt.plot( S[i,0], S[i,1], 'ks', lw=1 );
plt.plot( mymu[0:K6,0:1], mymu[0:K6,1:2], 'r^', lw=2 )
plt.xlabel('x');
plt.ylabel('y');
plt.show();
cluster means
[[0.97471318 2.52414481]
 [4.05072009 1.03414835]
 [1.53932691 1.89484044]
 [2.58293311 4.04672791]
 [3.50328773 4.06284396]
 [0.70045566 1.46208681]]
mean sample distance
0.5349723526734314
cluster counts
[[ 40]
 [100]
 [ 32]
 [ 50]
 [ 50]
 [ 28]]
mean inter-cluster distance
2.0665781214090626
inter-cluster distances
[[0.         3.41788063 0.84546594 2.21464008 2.95994669 1.09689761]
 [3.41788063 0.         2.6547856  3.35112432 3.07777181 3.37748473]
 [0.84546594 2.6547856  0.         2.39159646 2.92530022 0.94391773]
 [2.21464008 3.35112432 2.39159646 0.         0.92049571 3.19751325]
 [2.95994669 3.07777181 2.92530022 0.92049571 0.         3.82358541]
 [1.09689761 3.37748473 0.94391773 3.19751325 3.82358541 0.        ]]
In [36]:
# edapy_08_19, clusters in the Atlantic Rocks dataset
# read data
S = np.genfromtxt('../Data/rocks.txt', delimiter='\t');
N, M = np.shape(S);
# examine mean sample-to-center distance for a sequence of
# numbers of clusters, K
KMAX = 11;
Klist = np.zeros((KMAX,1));
Dlist = np.zeros((KMAX,1));
print('K','mean_dist');
for K in range(1,KMAX):
initmu = eda_forgy_mean(S,K);
mymu, myk, mydist2, mycount, myKdist2 = eda_kmeans(S, initmu);
Klist[K,0]=K;
Dlist[K,0]=np.mean(np.sqrt(mydist2));
print(K,Dlist[K,0]);
# plot of distances
fig1 = plt.figure(1,figsize=(10,5));
ax1 = plt.subplot(1,1,1);
plt.axis( [0, KMAX, 0, 3] );
plt.plot( Klist[1:KMAX,0:1], Dlist[1:KMAX,0:1], 'k-', lw=2 )
plt.plot( Klist[1:KMAX,0:1], Dlist[1:KMAX,0:1], 'ko', lw=2 )
plt.xlabel('K');
plt.ylabel('mean Rij');
plt.show();
K mean_dist
1 2.450872469766111
2 2.159369027284355
3 1.87730362348076
4 1.6844183908052115
5 1.5978670360338794
6 1.523469254491392
7 1.4510963869048237
8 1.42084090605399
9 1.3982566808767423
10 1.3737027099481425
In [37]:
# edapy_08_20, print 8 clusters in the Atlantic Rocks dataset
# read the data
S = np.genfromtxt('../Data/rocks.txt', delimiter='\t')
N, M = np.shape(S);
K=8; # 8 clusters
initmu = eda_forgy_mean(S,K); # Forgy initial means
# k-means analysis
mymu, myk, mydist, mycount, myKdist = eda_kmeans(S, initmu);
# print table of cluster means
names = ["SiO2", "TiO2", "Al2O3", "FeO-t", "MgO", "CaO", "Na2O", "K2O" ];
print('cluster\t', end='');
for j in range(M):
print("%s\t" % (names[j]), end='');
print('count');
for i in range(K):
print("%d\t" % (i), end='');
for j in range(M):
print("%.2f\t" % (mymu[i,j]), end='');
print("%d" % (mycount[i,0]));
cluster SiO2   TiO2  Al2O3  FeO-t  MgO    CaO    Na2O  K2O   count
0       44.90  0.06  2.51   8.20   40.22  1.52   0.13  0.03  54
1       51.38  1.82  15.04  10.33  6.64   10.65  3.00  0.30  815
2       48.03  0.42  21.22  5.32   6.61   15.14  1.60  0.09  40
3       48.94  0.86  16.15  8.96   9.67   12.12  2.16  0.11  497
4       50.67  1.56  14.08  12.22  6.86   11.50  2.26  0.15  850
5       50.91  1.18  14.88  9.87   7.76   12.28  2.20  0.15  1397
6       48.54  1.19  16.74  8.54   6.99   12.92  2.32  0.28  516
7       50.52  1.44  15.68  9.40   7.83   11.43  2.71  0.19  2187
In [38]:
# edapy_08_21, 3D plot of clusters in Atlantic Rocks dataset
# read the data
S = np.genfromtxt('../Data/rocks.txt', delimiter='\t')
N, M = np.shape(S);
names = ["SiO2", "TiO2", "Al2O3", "FeO-t", "MgO", "CaO", "Na2O", "K2O" ];
K=8; # 8 clusters
initmu = eda_forgy_mean(S,K); # Forgy initial means
# k-means analysis
mymu, myk, mydist, mycount, myKdist = eda_kmeans(S, initmu);
# elements to plot (and their scale)
m1 = 2; # Al2O3
xmin = 0.0
xmax = 50.0;
m2 = 4; # MgO
ymin = 0.0
ymax = 50.0;
m3 = 5; # CaO
zmin = 0.0
zmax = 20.0;
# 3D plot
fig2 = plt.figure(2,figsize=(10,5));
ax1 = fig2.add_subplot(121, projection='3d')
ax1.axes.set_xlim3d(left=xmin, right=xmax);
ax1.axes.set_ylim3d(bottom=ymin, top=ymax);
ax1.axes.set_zlim3d(bottom=zmin, top=zmax);
ax1.view_init(1, 30);
ax1.scatter( xs=S[:,m1], ys=S[:,m2], zs=S[:,m3], color=(0,0,0) );
ax1.plot( xs=[xmin,xmin], ys=[ymin,ymin], zs=[zmin,zmax], color=(0.8,0.8,0.8) );
ax1.plot( xs=[xmin,xmin], ys=[ymin,ymax], zs=[zmin,zmin], color=(0.8,0.8,0.8) );
ax1.plot( xs=[xmin,xmax], ys=[ymin,ymin], zs=[zmin,zmin], color=(0.8,0.8,0.8) );
ax1.plot( xs=[xmax,xmax], ys=[ymax,ymax], zs=[zmax,zmin], color=(0.8,0.8,0.8) );
ax1.plot( xs=[xmax,xmax], ys=[ymax,ymin], zs=[zmax,zmax], color=(0.8,0.8,0.8) );
ax1.plot( xs=[xmax,xmin], ys=[ymax,ymax], zs=[zmax,zmax], color=(0.8,0.8,0.8) );
ax1.plot( xs=[xmax,xmin], ys=[ymin,ymin], zs=[zmax,zmax], color=(0.8,0.8,0.8) );
ax1.plot( xs=[xmax,xmax], ys=[ymin,ymin], zs=[zmax,zmin], color=(0.8,0.8,0.8) );
ax1.plot( xs=[xmin,xmin], ys=[ymax,ymin], zs=[zmax,zmax], color=(0.8,0.8,0.8) );
ax1.plot( xs=[xmin,xmin], ys=[ymax,ymax], zs=[zmax,zmin], color=(0.8,0.8,0.8) );
ax1.plot( xs=[xmax,xmax], ys=[ymax,ymin], zs=[zmin,zmin], color=(0.8,0.8,0.8) );
ax1.plot( xs=[xmax,xmin], ys=[ymax,ymax], zs=[zmin,zmin], color=(0.8,0.8,0.8) );
ax1.set_xlabel(names[m1]);
ax1.set_ylabel(names[m2]);
ax1.set_zlabel(names[m3]);
ax1 = fig2.add_subplot(122, projection='3d')
ax1.axes.set_xlim3d(left=xmin, right=xmax);
ax1.axes.set_ylim3d(bottom=ymin, top=ymax);
ax1.axes.set_zlim3d(bottom=zmin, top=zmax);
ax1.view_init(1, 30);
ax1.scatter( xs=mymu[:,m1], ys=mymu[:,m2], zs=mymu[:,m3], color=(0,0,0) );
ax1.plot( xs=[xmin,xmin], ys=[ymin,ymin], zs=[zmin,zmax], color=(0.8,0.8,0.8) );
ax1.plot( xs=[xmin,xmin], ys=[ymin,ymax], zs=[zmin,zmin], color=(0.8,0.8,0.8) );
ax1.plot( xs=[xmin,xmax], ys=[ymin,ymin], zs=[zmin,zmin], color=(0.8,0.8,0.8) );
ax1.plot( xs=[xmax,xmax], ys=[ymax,ymax], zs=[zmax,zmin], color=(0.8,0.8,0.8) );
ax1.plot( xs=[xmax,xmax], ys=[ymax,ymin], zs=[zmax,zmax], color=(0.8,0.8,0.8) );
ax1.plot( xs=[xmax,xmin], ys=[ymax,ymax], zs=[zmax,zmax], color=(0.8,0.8,0.8) );
ax1.plot( xs=[xmax,xmin], ys=[ymin,ymin], zs=[zmax,zmax], color=(0.8,0.8,0.8) );
ax1.plot( xs=[xmax,xmax], ys=[ymin,ymin], zs=[zmax,zmin], color=(0.8,0.8,0.8) );
ax1.plot( xs=[xmin,xmin], ys=[ymax,ymin], zs=[zmax,zmax], color=(0.8,0.8,0.8) );
ax1.plot( xs=[xmin,xmin], ys=[ymax,ymax], zs=[zmax,zmin], color=(0.8,0.8,0.8) );
ax1.plot( xs=[xmax,xmax], ys=[ymax,ymin], zs=[zmin,zmin], color=(0.8,0.8,0.8) );
ax1.plot( xs=[xmax,xmin], ys=[ymax,ymax], zs=[zmin,zmin], color=(0.8,0.8,0.8) );
ax1.set_xlabel(names[m1]);
ax1.set_ylabel(names[m2]);
ax1.set_zlabel(names[m3]);
plt.show();
In [39]:
# edapy_08_22, Script to support Cribsheet 8.2 on EOF's
# standard set-up of v-axis
M=100;
Dv = 1.0;
vmin = 0.0;
vmax = vmin + Dv*(M-1);
v = eda_cvec(np.linspace(vmin,vmax,M));
# standard set-up of u-axis
N=1000;
Du = 1.0;
umin = 0.0;
umax = umin + Du*(N-1);
u = eda_cvec(np.linspace(umin,umax,N));
# grey-scale colormap
bw = np.zeros((256,4));
vv = (255 - np.linspace( 0, 255, 256 ))/255;
bw[:,0] = vv;
bw[:,1] = vv;
bw[:,2] = vv;
bw[:,3] = np.ones(256);
bwcmap = ListedColormap(bw);
# Part 1: make synthetic data
# four true patterns
fv1 = np.mod(v,10)/10;
v0 = (vmin+vmax)/2;
s2 = (vmax-vmin)/5;
fv2 = np.exp(-0.5*np.power((v-v0)/s2,2));
v1 = (vmin+vmax)/4;
v2 = 3*(vmin+vmax)/4;
s2 = (vmax-vmin)/10;
fv3 = np.exp(-0.5*np.power((v-v1)/s2,2))-np.exp(-0.5*np.power((v-v2)/s2,2));
L = vmax-vmin;
n=10;
fv4 = np.cos(n*pi*v/L);
# plot u dependence of true patterns
print('true patterns');
fig1 = plt.figure(1,figsize=(10,5));
fvlist = [fv1, fv2, fv3, fv4];
for i in range(4):
fvi = fvlist[i];
ax1 = plt.subplot(4,1,i+1);
fmax = np.max(np.abs(fvi));
plt.axis( [vmin, vmax, -fmax, fmax] );
plt.plot( v, fvi, 'k-', lw=2 );
plt.xlabel('v');
plt.ylabel("fv%d" % (i+1));
plt.show()
# true amplitudes of patterns
L = umax-umin;
cu1 = (u-umin)/L;
cu2 = np.power((u-umin),0.25)/(L**0.25);
n=10;
cu3 = np.cos(n*pi*u/L);
cu4 = (100-np.mod(u,100))/100;
# plot true v dependence of amplitudes
print('amplitudes of true patterns');
fig2 = plt.figure(2,figsize=(10,5));
culist = [cu1, cu2, cu3, cu4];
for i in range(4):
cui = culist[i];
ax1 = plt.subplot(4,1,i+1);
cmax = np.max(np.abs(cui));
plt.axis( [umin, umax, -cmax, cmax] );
plt.plot( u, cui, 'k-', lw=2 );
plt.xlabel('u');
plt.ylabel("cu%d" % (i+1));
plt.show()
# build hypothetical sample matrix
S = np.matmul(
np.concatenate( (cu1, cu2, cu3, cu4), axis=1),
np.concatenate( (fv1.T, fv2.T, fv3.T, fv4.T), axis=0) );
Smax = np.max(np.abs(S));
S = S + np.random.normal(loc=0,scale=(0.1*Smax),size=(N,M));
# plot S
fig3 = plt.figure(3,figsize=(10,5));
ax1 = plt.subplot(1,1,1);
Smax = np.max(np.abs(S));
plt.axis( [vmin,vmax,umin,umax] );
plt.imshow(S,cmap=bwcmap,vmin=-Smax,vmax=Smax,extent=(vmin,vmax,umin,umax),aspect='auto');
plt.xlabel('v');
plt.ylabel('u');
plt.title('S');
plt.show();
# Part 2: singular value decomposition
# F is a matrix of the EOF's and
# C is a matrix of their amplitudes
# factors F and loadings C
[U, sigma, VT] = la.svd(S,full_matrices=False);
F = VT; # EOF's (factors)
C = np.matmul( U, np.diag(sigma) ); # amplitudes (loadings)
fig4 = plt.figure(4,figsize=(10,5));
ax1 = plt.subplot(1,1,1);
plt.axis( [0, M, 0, np.max(sigma)] );
plt.plot( np.linspace(1,M,M), sigma, 'k-', lw=1 );
plt.plot( np.linspace(1,M,M), sigma, 'ko', lw=2 );
plt.xlabel('i');
plt.ylabel('sigma i');
plt.title('singular values');
# Part 3: selection of the number P of significant EOF's
# On the basis of the last plot, P=4
P = 4;
FP = np.copy( F[0:P,:] );
CP = np.copy( C[:,0:P] );
SP = np.matmul( CP, FP );
# plot SP
fig5 = plt.figure(5,figsize=(10,5));
ax1 = plt.subplot(1,1,1);
SPmax = np.max(np.abs(SP));
plt.axis( [vmin,vmax,umin,umax] );
plt.imshow(SP,cmap=bwcmap,vmin=-Smax,vmax=Smax,extent=(vmin,vmax,umin,umax),aspect='auto');
plt.xlabel('v');
plt.ylabel('u');
plt.title('SP');
plt.show();
# Part 4: plot EOF's as a function of v
# and their amplitudes as a function of u
# plot EOF's
print('EOFs');
fig6 = plt.figure(6,figsize=(10,5));
for i in range(P):
ax1 = plt.subplot(4,1,i+1);
fvi = np.copy(FP[i:i+1,0:M]).T;
plt.axis( [vmin, vmax, -np.max(np.abs(fvi)), np.max(np.abs(fvi))])
plt.plot( v, fvi, 'k-', lw=2);
plt.xlabel('v');
plt.ylabel("fv%d" % (i+1));
plt.show();
# plot amplitudes
print('amplitudes of EOFs');
fig7 = plt.figure(7,figsize=(10,5));
for i in range(P):
ax1 = plt.subplot(4,1,i+1);
cui=np.copy(CP[0:N,i:i+1]);
plt.axis( [umin, umax, -np.max(np.abs(cui)), np.max(np.abs(cui))]);
plt.plot( u, cui, 'k-', lw=2);
plt.xlabel('u');
plt.ylabel("cu%d" % (i+1));
plt.show();
print('Note that the factors do not match the original patterns,');
print('although they do bear some resemblance to them.');
print("That's because any linear combination of factors will do.");
print("Note also that fv4 and cu4 are very noisy. That's because the");
print('P=4 singular value is close to the noise level.');
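# a quick demonstration (illustrative) of that non-uniqueness: rotate
# the first two factors by an arbitrary orthogonal matrix R while
# counter-rotating the loadings; the reconstruction CP*FP is unchanged
q0 = pi/6.0; # an arbitrary rotation angle
R = np.array([[cos(q0),sin(q0)],[-sin(q0),cos(q0)]]);
FPr = np.copy(FP); CPr = np.copy(CP);
FPr[0:2,0:M] = np.matmul(R,FP[0:2,0:M]);
CPr[0:N,0:2] = np.matmul(CP[0:N,0:2],R.T);
assert np.allclose(np.matmul(CPr,FPr), np.matmul(CP,FP));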
true patterns
amplitudes of true patterns
EOFs
amplitudes of EOFs
Note that the factors do not match the original patterns,
although they do bear some resemblance to them.
That's because any linear combination of factors will do.
Note also that fv4 and cu4 are very noisy. That's because the
P=4 singular value is close to the noise level.
In [ ]: