In [31]:
# edapy_08_00 clear all variables and import various modules
# History
# 2022/06/26 - checked with most recent Python & etc. software
# 2023/07/13 - fixed some issues, incl colormap, loglog plot, eda_cvec()
# 2024/05/26 - Patches to accommodate new versions of numpy, scipy, matplotlib
# patched dot product to return scalar value
%reset -f
import os
from datetime import date
from math import exp, pi, sin, cos, sqrt, floor, ceil, atan2
import numpy as np
import scipy.sparse.linalg as las
from scipy import sparse
import scipy.linalg as la
import scipy.signal as sg
import matplotlib
from matplotlib import pyplot as plt
from matplotlib import patches as pch
from matplotlib.colors import ListedColormap
from mpl_toolkits.mplot3d import Axes3D
# eda_draw function makes a "pictorial matrix equation"
# arguments are vectors, matrices and strings
# which are plotted in the order that they appear,
# except that strings starting with 'title ' are plotted
# under the subsequent matrix or vector
# always returns a status of 1
def eda_draw(*argv):
bw = np.zeros((256,4));
v = 0.9*(256 - np.linspace( 0, 255, 256 ))/255;
bw[:,0] = v;
bw[:,1] = v;
bw[:,2] = v;
bw[:,3] = np.ones(256);
bwcmap = ListedColormap(bw);
# size of plot
W = 16;
H = 4;
fig1 = plt.figure(1);
# figsize width and height in inches
fig1.set_size_inches(W,H);
ax1 = plt.subplot(1,1,1);
plt.axis([0, W, -H/2, H/2]);
plt.axis('off');
LM = W/6; # matrix width and height
LV = W/40; # vector width
FS = 0.12; # character width
TO = 0.4; # title vertical offset
SP = 0.2; # space between objects
LS = 0.2; # leading space
p = LS; # starting x-position
istitle=0; # flags presence of a title
for a in argv:
if isinstance(a,np.ndarray):
sh = np.shape(a);
if len(sh) == 1: # conversion to nx1 array
n = sh[0];
m = 1;
ap = a;
a = np.zeros((n,1));
a[:,0] = ap;
else:
n = sh[0];
m = sh[1];
if m==1:
pold=p;
left=p;
right=p+LV;
bottom=-LM/2;
top=LM/2;
plt.imshow( a, cmap=bwcmap, vmin=np.min(a), vmax=np.max(a), extent=(left,right,bottom,top) );
p = p+LV;
pm = (p+pold)/2;
if istitle:
plt.text(pm,-(LM/2)-TO,titlestr,horizontalalignment='center');
istitle=0;
p = p+SP;
else:
pold=p;
left=p;
right=p+LM;
bottom=-LM/2;
top=LM/2;
plt.imshow( a, cmap=bwcmap, vmin=np.min(a), vmax=np.max(a), extent=(left,right,bottom,top) );
p = p+LM;
pm = (p+pold)/2;
if istitle:
plt.text(pm,-(LM/2)-TO,titlestr,horizontalalignment='center');
istitle=0;
p = p+SP;
elif isinstance(a,str):
ns = len(a);
istitle=0;
if( ns>=6 ):
if 'title ' in a[0:6]:
istitle=1;
titlestr=a[6:];
if( istitle != 1):
plt.text(p,0,a);
p = p + ns*FS + SP;
plt.show();
return 1;
def eda_kmeans(S, mu):
# k-means cluster analysis of NxM sample matrix S
# starting from initial guess KxM mu of cluster means
# returns:
# newmu: new cluster means
# k: Nx1 vector of cluster assignments
# dist2: Nx1 vector of squared distance to cluster center
# count: Kx1 vector of samples in cluster
# Kdist2: KxK matrix of inter-cluster squared distances
N, M = np.shape(S); # N samples with M elements
K, M2 = np.shape(mu); # K cluster means
k = np.zeros((N,1),dtype=int); # assoc sample w cluster
dist2 = np.zeros((N,1)); # sq distance from cluster mean
Kdist2 = np.zeros((K,K)); # cluster to nearest cluster sq distance
oldmu = np.copy(mu); # old cluster means
MAXP = 2*K*M; # max iterations
for p in range(MAXP): # iterate
# cluster assignment step
changes = 0; # number of changes
for i in range(N): # loop over samples
for j in range(K): # loop over cluster means
# deviation of sample from mean
delta = (S[i:i+1,0:M] - oldmu[j:j+1,0:M]).T;
# squared distance
r2 = np.matmul(delta.T,delta); r2=r2[0,0];
if( j==0 ): # always accept first distance
r2min = r2;
newk = j;
elif r2<r2min: # accept if smaller
r2min = r2;
newk = j;
if k[i,0] != newk: # count reassignments
changes=changes+1;
k[i,0] = newk; # reassigned cluster
dist2[i,0] = r2min; # sq distance to that cluster
# end iterating when no further changes occur
if( (p!=0) and (changes==0) ):
break;
# means step
newmu = np.zeros((K,M)); # new cluster means
count = np.zeros((K,1),dtype=int); # samples in cluster
for i in range(N): # loop over samples
for j in range(M): # loop over elements
# cluster sum
newmu[k[i,0],j] = newmu[k[i,0],j] + S[i,j];
# count samples in cluster
count[k[i,0],0] = count[k[i,0],0] + 1;
for j in range(K): # loop over clusters
if( count[j,0] == 0 ): # cluster with no samples
# no change in cluster mean
newmu[j,0:M] = oldmu[j,0:M];
else:
# new cluster mean
newmu[j,0:M] = newmu[j,0:M]/count[j,0]
oldmu = np.copy(newmu); # update cluster means
# inter-cluster distances
for i in range(K-1): # loop over cluster means
for j in range(i+1,K): # loop over cluster means of higher index
delta = (newmu[i:i+1,0:M] - newmu[j:j+1,0:M]).T;
r2 = np.matmul(delta.T,delta); r2=r2[0,0];
Kdist2[i,j] = r2; # sq distance between clusters
Kdist2[j,i] = r2; # sq distance between clusters
return newmu, k, dist2, count, Kdist2;
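# A minimal usage sketch of eda_kmeans (illustrative values): two
# well-separated 1D clusters and a K=2 initial guess for the means.
Sdemo = np.array([[0.0],[0.1],[5.0],[5.1]]); # four 1D samples
mudemo = np.array([[0.0],[5.0]]); # initial guess of the two means
mudemo2, kdemo, ddemo, cdemo, Kddemo = eda_kmeans(Sdemo, mudemo);
assert np.all(cdemo.ravel() == np.array([2,2])); # two samples per cluster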
def eda_kmeans_mod(S, mu):
# k-means cluster analysis of NxM sample matrix S
# modified per Bradley & Fayyad (1998) never to
# return cluster with zero samples.
# Starts with initial guess KxM mu of cluster means
# returns:
# mymu2: new cluster means
# myk2: Nx1 vector of cluster assignments
# mydist2: Nx1 vector of squared distance to cluster center
# mycount2: Kx1 vector of samples in cluster
# myKdist2: KxK matrix of inter-cluster squared distances
N, M = np.shape(S); # N samples with M elements (M used below)
initmu = np.copy(mu);
mymu, myk, mydist, mycount, myKdist = eda_kmeans(S, initmu);
while np.min(mycount)==0: # while cluster with no samples
ic=np.argmin(mycount); # index of cluster
ir=np.argmax(mydist); # index of most distant sample
ik=myk[ir,0]; # cluster of this sample
mymu[ic:ic+1,0:M] = S[ir:ir+1,0:M]; # new mean
mycount[ic,0]=1; # reset cluster count
myk[ir,0]=ic; # reset sample membership
mydist[ir,0]=0.0; # reset sample distance
# note: in rare cases, next line could create
# infinite loop, so don't do it.
# mycount[ik,0]=mycount[ik,0]-1;
# redo k-means
mymu2, myk2, mydist2, mycount2, myKdist2 = eda_kmeans(S, mymu);
return mymu2, myk2, mydist2, mycount2, myKdist2;
def eda_forgy_mean(S,K):
# create initial k-means cluster means from K samples
# randomly drawn from the NxM sample matrix S
# returns KxM matrix mu of means
N,M=np.shape(S);
k = np.random.randint(low=0,high=N,size=K,dtype=int);
mu = S[k,0:M];
return mu;
def eda_random_partition_mean(S,K):
# create initial k-means cluster means, by K random
# partitions of NxM sample matrix S
# returns KxM matrix mu of means
N,M=np.shape(S);
# random cluster-assignments
k = np.random.randint(low=0,high=K,size=N,dtype=int);
mu = np.zeros((K,M));
for i in range(K): # loop over clusters
r = np.where(k==i); # samples in cluster i
kk = r[0];
mu[i:i+1,0:M] = np.mean(S[kk,0:M],axis=0); # their means
return mu;
def eda_bradley_fayyad_mean(S,K,Nsets,fraction):
# create initial k-means cluster means, by
# the Bradley & Fayyad (1998) method, which
# is based on clustering clusters. The NxM
# sample matrix S is randomly subsampled Nsets
# times, with each set containing (fraction*N+2*K)
# samples.
# returns KxM matrix mu of means
N,M=np.shape(S);
initmu = eda_random_partition_mean(S,K);
KiMAX = K*Nsets; # maximum number of means mui
Ni = floor(fraction*N)+2*K; # size of small sets
# Step 1, create KixM table CM of means of subsampled data
CM = np.zeros((KiMAX,M));
Ki=0; # counter for rows of CM
for j in range(Nsets):
k = np.random.randint(low=0,high=N,size=Ni,dtype=int);
Si = S[k,0:M];
mymu, myk, mydist, mycount, myKdist = eda_kmeans_mod(Si, initmu);
CM[Ki:Ki+K,0:M] = mymu[0:K,0:M];
Ki=Ki+K;
# Step 2, create KixM table FMS of means of cluster means
FMS = np.zeros((KiMAX,M));
Ki=0; # counter for rows of FMS
for j in range(Nsets):
mymu, myk, mydist, mycount, myKdist = eda_kmeans(CM, CM[K*j:K*j+K,0:M]);
FMS[Ki:Ki+K,0:M] = mymu[0:K,0:M];
Ki=Ki+K;
# Step 3A: associate each KxM member FMi of FMS
# with all individual 1xM elements of CM
# and compute the overall error
error = np.zeros((Nsets,1));
Ki=0;
for i in range(Nsets):
FMi = FMS[Ki:Ki+K,0:M] # one group of means
Ki=Ki+K;
for j in range(KiMAX):
CMj = CM[j:j+1,0:M]; # one member of CM
for m in range(K):
delta = (CMj[0:1,0:M] - FMi[m:m+1,0:M]).T;
r2 = np.matmul(delta.T,delta); r2=r2[0,0];
if( m==0 ): # always accept first distance
r2min = r2;
elif r2<r2min: # accept if smaller
r2min = r2;
error[i,0] = error[i,0] + r2min;
# Step 3B: select the FMi with lowest error
i = np.argmin(error);
mu = np.copy( FMS[K*i:K*i+K,0:M] );
return(mu);
# function to make a numpy N-by-1 column vector
# c=eda_cvec(v1, v2, ...) from a list of several
# array-like entities v1, v2, ..., each of which may be
# a number, a list of numbers, a tuple of numbers,
# a 1D np array, or an N-by-1 np array. The function
# also ensures that, if an argument is an np array,
# c is a copy of it, as contrasted to a view.
# It promotes integers to floats, and integers
# and floats to complex, by context.
# This version concatenates many arguments,
# whereas c=eda_cvec1(v1) takes just one argument.
# I recommend always using eda_cvec(v1, v2, ...)
def eda_cvec(*argv):
t = int;
Nt = 0;
for a in argv:
v = eda_cvec1(a);
N,M = np.shape(v);
Nt = Nt + N;
if( N==0 ):
continue; # skip vector of zero length
if (t==int) and isinstance(v[0,0],float):
t=float;
elif isinstance(v[0,0],complex):
t=complex;
w = np.zeros((Nt,1),dtype=t);
Nt = 0;
for a in argv:
v = eda_cvec1(a);
N,M = np.shape(v);
w[Nt:Nt+N,0] = v[0:N,0];
Nt = Nt + N;
return w;
# function to make a numpy N-by-1 column vector
# c=eda_cvec1(v) from an entity v that is array-like,
# including a number, a list of numbers, a tuple
# of numbers, a 1D np array, or an N-by-1 np array.
# It promotes integers to floats, and integers
# and floats to complex, by context. The function
# also ensures that, if v is an np array, c is a
# copy of it, as contrasted to a view.
# This version takes just one input argument,
# whereas c=eda_cvec(v1,v2,...) concatenates
# many arguments.
def eda_cvec1(v):
if isinstance(v, (int, np.integer)):
w = np.zeros((1,1),dtype=int);
w[0,0] = v;
return w;
elif isinstance(v, float):
w = np.zeros((1,1),dtype=float);
w[0,0] = v;
return w;
elif isinstance(v, complex):
w = np.zeros((1,1),dtype=complex);
w[0,0] = v;
return w;
elif isinstance(v, np.ndarray):
s = np.shape(v);
if len(s) == 1:
return np.copy(np.reshape(v,(s[0],1)));
else:
[r,c]=s;
if( c==1 ):
return(np.copy(v));
elif(r==1):
return(np.copy(v.T));
else:
raise TypeError("eda_cvec: %d by %d ndarray not allowed" % (r, c));
elif isinstance(v, list):
r = len(v);
t = int;
for vi in v:
if isinstance(vi,(int, np.integer)):
pass;
elif isinstance(vi,float):
t=float;
elif isinstance(vi,complex):
t=complex;
break;
else:
raise TypeError("eda_cvec: list contains unsupported type %s" % type(vi));
w = np.zeros((r,1),dtype=t);
w[:,0] = v;
return w;
elif isinstance(v, tuple):
r = len(v);
t = int;
for vi in v:
if isinstance(vi,(int, np.integer)):
pass;
elif isinstance(vi,float):
t=float;
elif isinstance(vi,complex):
t=complex;
break;
else:
raise TypeError("eda_cvec: tuple contains unsupported type %s" % type(vi));
w = np.zeros((r,1),dtype=t);
w[:,0] = v;
return w;
else:
raise TypeError("eda_cvec: %s not supported" % type(v));
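# A quick usage sketch of eda_cvec (illustrative values): scalars,
# lists, tuples and np arrays concatenate into one column vector,
# with integers promoted to floats by context.
vdemo = eda_cvec( 1, [2.0, 3.0], np.array([4.0, 5.0]) );
assert np.shape(vdemo) == (5,1);
assert vdemo.dtype == float; # the int argument was promoted to float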
In [2]:
# edapy_08_01, factors shown on ternary diagram
# vectors of constants needed to make ternary diagram
a = eda_cvec( [0.0,0.0] );
b = eda_cvec( [1.0,0.0] );
c = eda_cvec( [0.5,0.5*sqrt(3)] );
# factor 1
f1 = eda_cvec( [0.15, 0.25, 0.60] );
f1 = f1/np.sum(f1); # normalize factor 1
# factor 2
f2 = eda_cvec( [0.25, 0.45, 0.30] );
f2 = f2/np.sum(f2); # normalize factor 2
# ternary diagram
fig1 = plt.figure(1,figsize=(6,6));
ax1 = plt.subplot(1,1,1);
plt.axis( [0, 1, 0, 1] );
plt.axis('off');
# bounding triangle
plt.plot( [a[0,0], b[0,0]], [a[1,0], b[1,0]], 'k-');
plt.plot( [b[0,0], c[0,0]], [b[1,0], c[1,0]], 'k-');
plt.plot( [c[0,0], a[0,0]], [c[1,0], a[1,0]], 'k-');
plt.text(a[0,0]-0.05, a[1,0],'A',horizontalalignment='center');
plt.text(b[0,0]+0.05, b[1,0],'B',horizontalalignment='center');
plt.text(c[0,0], c[1,0]+0.05,'C',horizontalalignment='center');
# first set of grid lines
L=10;
x = np.zeros((2,2))
for j in range(L):
for k in range(2):
v1 = float(j+1)/float(L);
v2 = (1.0-v1)*float(k);
v3 = 1.0 - (v1+v2);
x[:,k]=(v1*a+v2*b+v3*c).ravel();
plt.plot( [x[0,0], x[0,1]], [x[1,0], x[1,1]], 'k:' );
# second set of grid lines
L=10;
for j in range(L):
for k in range(2):
v3 = float(j+1)/float(L);
v1 =(1-v3)*float(k);
v2 = 1.0 - (v1+v3);
x[:,k]=(v1*a+v2*b+v3*c).ravel();
plt.plot( [x[0,0], x[0,1]], [x[1,0], x[1,1]], 'k:' );
#third set of grid lines
L=10;
for j in range(L):
for k in range(2):
v2 = float(j+1)/float(L);
v1 = (1-v2)*float(k);
v3 = 1.0 - (v1+v2);
x[:,k]=(v1*a+v2*b+v3*c).ravel();
plt.plot( [x[0,0], x[0,1]], [x[1,0], x[1,1]], 'k:' );
# plot factors
y = f1[0]*a+f1[1]*b+f1[2]*c;
plt.plot( y[0], y[1], 'k*' );
plt.text(y[0], y[1]+0.025,'f1',horizontalalignment='center');
y = f2[0]*a+f2[1]*b+f2[2]*c;
plt.plot( y[0], y[1], 'k*' );
plt.text(y[0], y[1]-0.05,'f2',horizontalalignment='center');
# create and plot samples lying between f1 and f2
N=10;
for j in range(N-1):
c1=float(j+1)/float(N);
c2=1-c1;
s = c1*f1 + c2*f2;
y = s[0]*a+s[1]*b+s[2]*c;
plt.plot( y[0], y[1], 'ko' );
plt.show();
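# A quick check (illustrative) of the barycentric mapping used above:
# any composition s with non-negative entries summing to 1 maps to a
# point y = s1*a + s2*b + s3*c inside the bounding triangle.
sdemo = eda_cvec( [0.2, 0.3, 0.5] );
ydemo = sdemo[0,0]*a + sdemo[1,0]*b + sdemo[2,0]*c;
print('demo point inside triangle:', ydemo.ravel());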
In [3]:
# edapy_08_02, factors shown on ternary diagram
# this is edapy_08_01 w/o the grid lines and w/o annotation of factors
# constants needed to make ternary diagram
a = eda_cvec( [0.0,0.0] );
b = eda_cvec( [1.0,0.0] );
c = eda_cvec( [0.5,0.5*sqrt(3)] );
# factor 1
f1 = eda_cvec( [0.15, 0.25, 0.60] );
f1 = f1/np.sum(f1); # normalize factor 1
# factor 2
f2 = eda_cvec( [0.25, 0.45, 0.30] );
f2 = f2/np.sum(f2); # normalize factor 2
# ternary diagram
fig1 = plt.figure(1,figsize=(6,6));
ax1 = plt.subplot(1,1,1);
plt.axis( [0, 1, 0, 1] );
plt.axis('off');
# bounding triangle
plt.plot( [a[0,0], b[0,0]], [a[1,0], b[1,0]], 'k-');
plt.plot( [b[0,0], c[0,0]], [b[1,0], c[1,0]], 'k-');
plt.plot( [c[0,0], a[0,0]], [c[1,0], a[1,0]], 'k-');
plt.text(a[0,0]-0.05, a[1,0],'A',horizontalalignment='center');
plt.text(b[0,0]+0.05, b[1,0],'B',horizontalalignment='center');
plt.text(c[0,0], c[1,0]+0.05,'C',horizontalalignment='center');
# plot factors
y = f1[0]*a+f1[1]*b+f1[2]*c;
plt.plot( y[0], y[1], 'k*' );
y = f2[0]*a+f2[1]*b+f2[2]*c;
plt.plot( y[0], y[1], 'k*' );
# create and plot samples lying between f1 and f2
N=10;
for j in range(N-1):
c1=float(j+1)/float(N);
c2=1-c1;
s = c1*f1 + c2*f2;
y = s[0]*a+s[1]*b+s[2]*c;
plt.plot( y[0], y[1], 'ko' );
plt.show();
In [4]:
# edapy_08_03, alternative factors shown on ternary diagram
# this is edapy_08_02 with a different choice of factors
# constants needed to make ternary diagram
a = eda_cvec( [0.0,0.0] );
b = eda_cvec( [1.0,0.0] );
c = eda_cvec( [0.5,0.5*sqrt(3)] );
# factor 1
f1 = eda_cvec( [0.15, 0.25, 0.60] );
f1 = f1/np.sum(f1); # normalize factor 1
# factor 2
f2 = eda_cvec( [0.25, 0.45, 0.30] );
f2 = f2/np.sum(f2); # normalize factor 2
f1a = 0.5*(f1+f2); # factor 1a
f2a = f1+0.5*(f1-f2); # factor 2a
# ternary diagram
fig1 = plt.figure(1,figsize=(6,6));
ax1 = plt.subplot(1,1,1);
plt.axis( [0, 1, 0, 1] );
plt.axis('off');
# bounding triangle
plt.plot( [a[0,0], b[0,0]], [a[1,0], b[1,0]], 'k-');
plt.plot( [b[0,0], c[0,0]], [b[1,0], c[1,0]], 'k-');
plt.plot( [c[0,0], a[0,0]], [c[1,0], a[1,0]], 'k-');
plt.text(a[0,0]-0.05, a[1,0],'A',horizontalalignment='center');
plt.text(b[0,0]+0.05, b[1,0],'B',horizontalalignment='center');
plt.text(c[0,0], c[1,0]+0.05,'C',horizontalalignment='center');
# create and plot samples lying between f1 and f2
N=10;
for j in range(N-1):
c1=float(j+1)/float(N);
c2=1-c1;
s = c1*f1 + c2*f2;
y = s[0]*a+s[1]*b+s[2]*c;
plt.plot( y[0], y[1], 'ko' );
# plot factors
y = f1a[0]*a+f1a[1]*b+f1a[2]*c;
plt.plot( y[0], y[1], '*', color=(0.7,0.7,0.7) );
y = f2a[0]*a+f2a[1]*b+f2a[2]*c;
plt.plot( y[0], y[1], '*', color=(0.7,0.7,0.7) );
plt.show();
In [7]:
# edapy_08_04, factor analysis of Atlantic Rocks dataset
# using singular value decomposition
# load data
S = np.genfromtxt('../Data/rocks.txt', delimiter='\t')
N, M = np.shape(S);
Smax = np.amax(S);
# break out data into vectors
sio2 = eda_cvec( S[:,0] ); # SiO2
tio2 = eda_cvec( S[:,1] ); # TiO2
als03 = eda_cvec( S[:,2] ); # Al2O3
feot = eda_cvec( S[:,3] ); # FeO-total
mgo = eda_cvec( S[:,4] ); # MgO
cao = eda_cvec( S[:,5] ); # CaO
na20 = eda_cvec( S[:,6] ); # Na2O
k20 = eda_cvec( S[:,7] ); # K2O
# compute factors F and factor loadings C using singular value decomposition
[U, sigma, VT] = la.svd(S,full_matrices=False);
sh = np.shape(sigma);
Ns = sh[0];
F = np.copy(VT);
C = np.matmul( U, np.diag(sigma) );
# check error of reconstruction
Emax = np.amax(S - np.matmul(C,F));
print('max relative error in CF ', Emax/Smax);
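# The rows of F=VT are orthonormal, so the loadings can be recovered
# directly as C = S F.T; a quick consistency check (illustrative):
assert np.allclose(np.matmul(F,F.T), np.identity(Ns));
Crec = np.matmul(S,F.T);
assert np.max(np.abs(C-Crec)) <= 1.0e-6*np.max(np.abs(C));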
# plot singular values
fig1 = plt.figure(1,figsize=(8,4));
ax1 = plt.subplot(1,1,1);
plt.axis( [0, Ns+1, 0, np.max(sigma)] );
plt.plot( np.linspace(1,Ns,Ns), sigma, 'k-' );
plt.plot( np.linspace(1,Ns,Ns), sigma, 'ko' );
plt.title('singular values, s(i)');
plt.xlabel('index, i');
plt.ylabel('s(i)');
# print first five factors
for j in range(5):
fj = np.zeros((M,1)); # factor j as a column vector
fj[0:M,0:1] = F[j:j+1,0:M].T; # is extracted from a row of F
printstr = "factor %d" % (j);
print(printstr);
printstr = " SiO2 %f" % (fj[0,0]);
print(printstr);
printstr = " TiO2 %f" % (fj[1,0]);
print(printstr);
printstr = " Al2O3 %f" % (fj[2,0]);
print(printstr);
printstr = " FeO-total %f" % (fj[3,0]);
print(printstr);
printstr = " MgO %f" % (fj[4,0]);
print(printstr);
printstr = " CaO %f" % (fj[5,0]);
print(printstr);
printstr = " Na2O %f" % (fj[6,0]);
print(printstr);
printstr = " K2O %f" % (fj[7,0]);
print(printstr);
print(' ');
cmin=(-50);
cmax=50;
fig2 = plt.figure(2,figsize=(6,6));
ax1 = fig2.add_subplot(111, projection='3d')
ax1.axes.set_xlim3d(left=cmin, right=cmax);
ax1.axes.set_ylim3d(bottom=cmin, top=cmax);
ax1.axes.set_zlim3d(bottom=cmin, top=cmax);
ax1.view_init(30, 30)
ax1.scatter( xs=C[:,1], ys=C[:,2], zs=C[:,3], color=(0,0,0) );
ax1.plot( xs=[cmin,cmin], ys=[cmin,cmin], zs=[cmin,cmax], color=(0.8,0.8,0.8) );
ax1.plot( xs=[cmin,cmin], ys=[cmin,cmax], zs=[cmin,cmin], color=(0.8,0.8,0.8) );
ax1.plot( xs=[cmin,cmax], ys=[cmin,cmin], zs=[cmin,cmin], color=(0.8,0.8,0.8) );
ax1.plot( xs=[cmax,cmax], ys=[cmax,cmax], zs=[cmax,cmin], color=(0.8,0.8,0.8) );
ax1.plot( xs=[cmax,cmax], ys=[cmax,cmin], zs=[cmax,cmax], color=(0.8,0.8,0.8) );
ax1.plot( xs=[cmax,cmin], ys=[cmax,cmax], zs=[cmax,cmax], color=(0.8,0.8,0.8) );
ax1.plot( xs=[cmax,cmin], ys=[cmin,cmin], zs=[cmax,cmax], color=(0.8,0.8,0.8) );
ax1.plot( xs=[cmax,cmax], ys=[cmin,cmin], zs=[cmax,cmin], color=(0.8,0.8,0.8) );
ax1.plot( xs=[cmin,cmin], ys=[cmax,cmin], zs=[cmax,cmax], color=(0.8,0.8,0.8) );
ax1.plot( xs=[cmin,cmin], ys=[cmax,cmax], zs=[cmax,cmin], color=(0.8,0.8,0.8) );
ax1.plot( xs=[cmax,cmax], ys=[cmax,cmin], zs=[cmin,cmin], color=(0.8,0.8,0.8) );
ax1.plot( xs=[cmax,cmin], ys=[cmax,cmax], zs=[cmin,cmin], color=(0.8,0.8,0.8) );
ax1.set_xlabel('C(2)');
ax1.set_ylabel('C(3)');
ax1.set_zlabel('C(4)');
plt.show();
max relative error in CF  2.9341870488937076e-15
factor 0
 SiO2 -0.908829
 TiO2 -0.024638
 Al2O3 -0.275168
 FeO-total -0.177851
 MgO -0.141341
 CaO -0.209989
 Na2O -0.044611
 K2O -0.003430

factor 1
 SiO2 0.007684
 TiO2 -0.037474
 Al2O3 -0.301583
 FeO-total -0.018421
 MgO 0.923193
 CaO -0.226917
 Na2O -0.058457
 K2O -0.007204

factor 2
 SiO2 -0.161689
 TiO2 -0.126343
 Al2O3 0.567828
 FeO-total -0.659205
 MgO 0.255748
 CaO 0.365682
 Na2O -0.041738
 K2O -0.006464

factor 3
 SiO2 0.209819
 TiO2 0.151367
 Al2O3 0.176021
 FeO-total -0.427461
 MgO -0.118643
 CaO -0.780043
 Na2O 0.302367
 K2O 0.073403

factor 4
 SiO2 0.309495
 TiO2 -0.100476
 Al2O3 -0.670083
 FeO-total -0.585155
 MgO -0.195193
 CaO 0.207980
 Na2O -0.145318
 K2O 0.015035
In [11]:
# edapy_08_05: varimax procedure applied to 2 factors
# define two non-spiky factors, fA and fB
M=4;
fA = eda_cvec( [ 1.0, 1.0, 1.0, 1.0 ] );
fB = eda_cvec( [ 1.0, -1.0, 1.0, -1.0 ]);
fAsq = np.matmul(fA.T, fA); fAsq=fAsq[0,0]
fA = fA / sqrt(fAsq); # normalize to unit length
fBsq = np.matmul(fB.T, fB ); fBsq=fBsq[0,0];
fB = fB / sqrt(fBsq); # normalize to unit length
# standard varimax procedure to determine rotation angle q
u = np.power(fA,2) - np.power(fB,2);
v = 2.0 * np.multiply(fA, fB);
A = 2*M*np.matmul(u.T,v); A=A[0,0];
B = np.sum(u)*np.sum(v);
top = A - B;
C = M*(np.matmul(u.T,u)-np.matmul(v.T,v)); C=C[0,0];
D = (np.sum(u)**2)-(np.sum(v)**2);
bot = C - D;
q = 0.25 * atan2(top,bot);
# rotate factors
cq = cos(q);
sq = sin(q);
fAp = cq*fA + sq*fB;
fBp = -sq*fA + cq*fB;
# display results
print("fA  %.3f %.3f %.3f %.3f" % (fA[0,0], fA[1,0], fA[2,0], fA[3,0]) );
print("fB  %.3f %.3f %.3f %.3f" % (fB[0,0], fB[1,0], fB[2,0], fB[3,0]) );
print(' ');
print("fAp %.3f %.3f %.3f %.3f" % (fAp[0,0], fAp[1,0], fAp[2,0], fAp[3,0]) );
print("fBp %.3f %.3f %.3f %.3f" % (fBp[0,0], fBp[1,0], fBp[2,0], fBp[3,0]) );
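# a quick check (illustrative): the varimax rotation is orthogonal,
# so the rotated pair remains orthonormal (same span, spikier factors)
assert np.isclose(np.matmul(fAp.T,fAp)[0,0], 1.0);
assert np.isclose(np.matmul(fBp.T,fBp)[0,0], 1.0);
assert np.isclose(np.matmul(fAp.T,fBp)[0,0], 0.0);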
fA  0.500 0.500 0.500 0.500
fB  0.500 -0.500 0.500 -0.500

fAp 0.707 0.000 0.707 0.000
fBp 0.000 -0.707 0.000 -0.707
In [13]:
# edapy_08_06
# History
# 2022/06/26 - checked with most recent Python & etc. software
# factor analysis on Atlantic Rocks dataset
# using singular value decomposition
# and the varimax procedure
# load data
S = np.genfromtxt('../Data/rocks.txt', delimiter='\t')
N, M = np.shape(S);
Smax = np.max(np.abs(S));
# compute factors and factor loadings using singular value decomposition
[U, sigma, VT] = la.svd(S,full_matrices=False);
# keep only first five singular values
P=5;
F = np.copy(VT[0:P,:]); # P factors
C = np.matmul( U[:,0:P], np.diag(sigma[0:P])); # P loadings
SP = np.matmul(C,F); # reconstruction of S with P factors
# initial rotated factor matrix, FP, and rotation matrix, MR
MR=np.identity(P);
FP=np.copy(F);
fA = np.zeros((M,1)); # factor A of a pair
fB = np.zeros((M,1)); # factor B of a pair
fAp = np.zeros((M,1)); # factor A of a pair after rotation
fBp = np.zeros((M,1)); # factor B of a pair after rotation
mA = np.zeros((P,1)); # col of rotation matrix associated with A
mB = np.zeros((P,1)); # col of rotation matrix associated with B
mAp = np.zeros((P,1)); # col A of rotation matrix after rotation
mBp = np.zeros((P,1)); # col B of rotation matrix after rotation
# spike these factors using the varimax procedure
k = [1, 2, 3, 4];
Nk = len(k);
for iter in range(3): # three iterations almost always sufficient
for ii in range(Nk): # first factor of pair
for jj in range(ii+1,Nk): # second factor of pair
# spike factors i and j
i=k[ii];
j=k[jj];
# copy factors from matrix to vectors
fA[:,0] = np.copy(FP[i,:]);
fB[:,0] = np.copy(FP[j,:]);
# standard varimax procedure to determine rotation angle q
u = np.power(fA,2) - np.power(fB,2);
v = 2.0 * np.multiply(fA, fB);
AA = 2*M*np.matmul(u.T,v); AA=AA[0,0];
BB = np.sum(u)*np.sum(v);
top = AA - BB;
CC = M*(np.matmul(u.T,u)-np.matmul(v.T,v)); CC=CC[0,0];
DD = (np.sum(u)**2)-(np.sum(v)**2);
bot = CC - DD;
q = 0.25 * atan2(top,bot);
# rotate factors
cq = cos(q);
sq = sin(q);
fAp = cq*fA + sq*fB;
fBp = -sq*fA + cq*fB;
# put back into factor matrix, FP
FP[i:i+1,0:M] = fAp[0:M,0];
FP[j:j+1,0:M] = fBp[0:M,0];
# accumulate rotation
mA[:,0] = np.copy(MR[i,:]);
mB[:,0] = np.copy(MR[j,:]);
mAp = cq*mA + sq*mB;
mBp = -sq*mA + cq*mB;
MR[i:i+1,0:P] = mAp[0:P,0];
MR[j:j+1,0:P] = mBp[0:P,0];
# new factor loadings
CP = np.matmul(C,MR.T);
# reconstruction using new factors
SPP = np.matmul(CP,FP);
# check error of reconstruction
Smax = np.max(np.abs(S));
Emax1 = np.max(np.abs(S - SP));
Emax2 = np.max(np.abs(S - SPP));
print('max relative error in S-CF: %.4e' % (Emax1/Smax) );
print('max relative error in S-(CP)(FP) %.4e' % (Emax2/Smax) );
Fa = np.abs(F);
FPa = np.abs(FP);
eda_draw( 'title f1', Fa[1,:], 'title f2', Fa[2,:], 'title f3', Fa[3,:], 'title f4', Fa[4,:], 'becomes', 'title fp1', FPa[1,:], 'title fp2', FPa[2,:], 'title fp3', FPa[3,:], 'title fp4', FPa[4,:]);
max relative error in S-CF: 5.6630e-02
max relative error in S-(CP)(FP) 5.6630e-02
In [14]:
# edapy_08_07
# comparison of weighted and unweighted factor analysis
# Part 1: Make synthetic chemical data with a wide range of concentrations
# 7 elements, three factors
M = 7;
P = 3;
# typical analytic errors
# [Fe Cu Zn Pb As Ag Au]
se = eda_cvec( [0.010000, 0.010000, 0.010000, 0.001000, 0.001000, 0.0000010, 0.0000001 ] );
# factors as column vectors
v1 = np.zeros((M,1));
v2 = np.zeros((M,1));
v3 = np.zeros((M,1));
# [Fe Cu Zn Pb As Ag Au]
v1 = eda_cvec( [0.200000, 0.004000, 0.010000, 0.010000, 0.001000, 0.0000100, 0.0000010 ] );
v2 = eda_cvec( [0.050000, 0.180000, 0.180000, 0.030000, 0.000200, 0.0000400, 0.0000005 ] );
v3 = eda_cvec( [0.100000, 0.080000, 0.110000, 0.010000, 0.000700, 0.0000100, 0.0000001 ] );
# factor matrix
F = np.concatenate( (v1.T, v2.T, v3.T), axis=0 );
# randomly chosen loadings
N = 10000;
c1 = np.random.uniform(0.0,1.0,(N,1));
c2 = np.random.uniform(0.0,1.0,(N,1));
c3 = 1.0-(c1+c2);
C = np.concatenate( (c1, c2, c3), axis=1 )
# true sample matrix
Strue = np.matmul(C,F);
Sobs = np.zeros((N,M));
# observed sample matrix is true plus Normally-distributed random noise
for i in range(M):
Sobs[:,i] = Strue[:,i] + np.random.normal(0,se[i,0],N);
Smax = np.amax(Sobs);
# histogram limits
Lh = 100;
Fe_min = 0;
Fe_max = 5;
Au_min = 0;
Au_max = 5;
# Part 2: factor analysis without weighting
# factors F and loadings C
[U1, s1, V1T] = la.svd(Sobs,full_matrices=False);
Fpre1 = V1T; # factors
Cpre1 = np.matmul( U1, np.diag(s1) ); # loadings
# P significant factors
Fpre1P = np.copy( Fpre1[0:P,:] );
Cpre1P = np.copy( Cpre1[:,0:P] );
Spre1P = np.matmul( Cpre1P, Fpre1P );
fig1 = plt.figure(1,figsize=(8,4));
# error
e1 = np.abs(Strue-Spre1P); # absolute values of individual errors
Emax = np.max(e1); # maximum error
print('without weighting, max relative error in CF ', Emax/Smax);
c, e = np.histogram( e1[:,0]/se[0,0], Lh, (Fe_min,Fe_max));
Nc = len(c);
Ne = len(e);
count = eda_cvec( c );
edges = eda_cvec( e[0:Ne-1] );
ax1 = plt.subplot(1,2,1);
plt.axis( [Fe_min, Fe_max, 0, 1] );
plt.plot(edges,count/np.max(count),'k-');
plt.title('error in Fe');
plt.xlabel('error/se');
plt.ylabel('count');
c, e = np.histogram( e1[:,M-1]/se[M-1,0], Lh,(Au_min,Au_max));
Nc = len(c);
Ne = len(e);
count = eda_cvec( c );
edges = eda_cvec( e[0:Ne-1] );
ax1 = plt.subplot(1,2,2);
plt.axis( [Au_min, Au_max, 0, 1] );
plt.plot(edges,count/np.max(count),'k-');
plt.title('Error in Au');
plt.xlabel('error/se');
plt.ylabel('count');
# Part 3: factor analysis with weighting by analytic precision
# factors F and loadings C
w = np.reciprocal(se);
[U2, s2, V2T] = la.svd( np.matmul(Sobs,np.diag(w.ravel())),full_matrices=False);
Fpre2 = np.matmul(V2T,np.diag(se.ravel())); # factors
Cpre2 = np.matmul( U2, np.diag(s2) ); # loadings
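# Since W=diag(1/se) is invertible and all singular values are kept,
# Sobs = (U2 s2 V2T) W^(-1) = (Cpre2)(Fpre2) exactly; a quick
# consistency check (illustrative):
assert np.max(np.abs(Sobs-np.matmul(Cpre2,Fpre2))) <= 1.0e-6*np.max(np.abs(Sobs));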
# P significant factors
Fpre2P = np.copy( Fpre2[0:P,:] );
Cpre2P = np.copy( Cpre2[:,0:P] );
Spre2P = np.matmul( Cpre2P, Fpre2P );
fig2 = plt.figure(2,figsize=(8,4));
# check error of reconstruction
# Emax is absolute value of difference between true and reconstructed sample matrices
e2 = np.abs(Strue-Spre2P);
Emax = np.max(e2);
print('with weighting, max relative error in CF ', Emax/Smax);
c, e = np.histogram( e2[:,0]/se[0,0], Lh, (Fe_min,Fe_max));
Nc = len(c);
Ne = len(e);
count = eda_cvec( c );
edges = eda_cvec( e[0:Ne-1] );
ax1 = plt.subplot(1,2,1);
plt.axis( [Fe_min, Fe_max, 0, 1] );
plt.plot(edges,count/np.max(count),'k-');
plt.title('error in Fe');
plt.xlabel('error/se');
plt.ylabel('count');
c, e = np.histogram( e2[:,M-1]/se[M-1,0], Lh,(Au_min,Au_max));
Nc = len(c);
Ne = len(e);
count = eda_cvec( c );
edges = eda_cvec( e[0:Ne-1] );
ax1 = plt.subplot(1,2,2);
plt.axis( [Au_min, Au_max, 0, 1] );
plt.plot(edges,count/np.max(count),'k-');
plt.title('Error in Au');
plt.xlabel('error/se');
plt.ylabel('count');
without weighting, max relative error in CF  0.17895452318852598
with weighting, max relative error in CF  0.15821508184561714
In [16]:
# edapy_08_08
# factor analysis on Mid Atlantic Ridge Rocks dataset
# Q-mode factors with geographical plots
# load data
D = np.genfromtxt('../Data/rocks_with_lat_lon.txt', delimiter='\t')
N, K = np.shape(D);
M=K-2;
# break out (lat,lon) coordinates and sample matrix S into separate vectors
lat = eda_cvec( D[:,8] ); # lat
lon = eda_cvec( D[:,9] ); # lon
S = np.copy(D[:,0:M]); # sample matrix S
# svd
[U, s, VT] = la.svd(S.T,full_matrices=False);
# keep only first few singular values
P=4;
FP = np.copy(VT[0:P,:]); # factor matrix
CP = np.matmul( U[:,0:P], np.diag(s[0:P])); # loading matrix
# map bounds
left = -60.0;
right = 10.0;
bottom = -70.0;
top = 70.0;
# plot
fig1 = plt.figure(1,figsize=(16,10));
# PART 1: factor 1
ax1 = plt.subplot(1,3,1);
plt.axis( [left, right, bottom, top] );
# plot one rectangle per sample i, with color scaled according to
# the strength of the i-th element in factor nf=1
nf = 1;
FPmean = np.mean(FP[nf,:]);
FPstd = np.std(FP[nf,:]);
fmin = -2*FPstd
fmax = 2*FPstd
for i in range(N):
h = 3;
c = (FP[nf,i]-fmin)/(fmax-fmin);
if( c<0.0 ):
c=0.0;
elif ( c>1.0 ):
c=1.0;
ax1.add_patch( pch.Rectangle( (lon[i,0], lat[i,0]), h, h, fc=(c,c,c), ec=(0.0,0.0,0.0) ) );
plt.xlabel('longitude (deg)');
plt.ylabel('latitude (deg)');
plt.title('Factor 1');
# plot coastlines on maps, using coarse coastlines from the file coastdata.txt
# this file consists of lists of (lon,lat) pairs along segments of coastline,
# with each list delimited by a ">".
coastfile = '../Data/coastdata.txt';
fd = open(coastfile,"r");
NN = 0;
NNMAX = 10000;
mylat = np.zeros((NNMAX,1));
mylon = np.zeros((NNMAX,1));
for i in range(100000):
line = fd.readline();
if not line:
break;
if( line[0] == '>' ):
if( NN == 0):
continue;
plt.plot( mylon[0:NN,0], mylat[0:NN,0], '.', color=(0.7,0.7,0.7) );
NN=0;
else:
latlonstr=line.split();
mylat[NN,0] = float(latlonstr[1]);
mylon[NN,0] = float(latlonstr[0]);
NN=NN+1;
fd.close();
# PART 2: factor 2
ax2 = plt.subplot(1,3,2);
plt.axis( [left, right, bottom, top] );
# plot one rectangle per sample i, with color scaled according to
# the strength of the i-th element in factor nf=2
nf = 2;
FPmean = np.mean(FP[nf,:]);
FPstd = np.std(FP[nf,:]);
fmin = -2*FPstd
fmax = 2*FPstd
for i in range(N):
h = 3;
c = (FP[nf,i]-fmin)/(fmax-fmin);
if( c<0.0 ):
c=0.0;
elif ( c>1.0 ):
c=1.0;
ax2.add_patch( pch.Rectangle( (lon[i,0], lat[i,0]), h, h, fc=(c,c,c), ec=(0.0,0.0,0.0) ) );
plt.xlabel('longitude (deg)');
plt.ylabel('latitude (deg)');
plt.title('Factor 2');
# plot coastlines on maps, using coarse coastlines from the file coastdata.txt
# this file consists of lists of (lon,lat) pairs along segments of coastline,
# with each list delimited by a ">".
coastfile = '../Data/coastdata.txt';
fd = open(coastfile,"r");
NN = 0;
NNMAX = 10000;
mylat = np.zeros((NNMAX,1));
mylon = np.zeros((NNMAX,1));
for i in range(100000):
line = fd.readline();
if not line:
break;
if( line[0] == '>' ):
if( NN == 0):
continue;
plt.plot( mylon[0:NN,0], mylat[0:NN,0], '.', color=(0.7,0.7,0.7) );
NN=0;
else:
latlonstr=line.split();
mylat[NN,0] = float(latlonstr[1]);
mylon[NN,0] = float(latlonstr[0]);
NN=NN+1;
fd.close();
# PART 3: factor 3
ax3 = plt.subplot(1,3,3);
plt.axis( [left, right, bottom, top] );
# plot one rectangle per sample i, with color scaled according to
# the strength of the i-th element in factor nf=3
nf = 3;
FPmean = np.mean(FP[nf,:]);
FPstd = np.std(FP[nf,:]);
fmin = -2*FPstd
fmax = 2*FPstd
for i in range(N):
h = 3;
c = (FP[nf,i]-fmin)/(fmax-fmin);
if( c<0.0 ):
c=0.0;
elif ( c>1.0 ):
c=1.0;
ax3.add_patch( pch.Rectangle( (lon[i,0], lat[i,0]), h, h, fc=(c,c,c), ec=(0.0,0.0,0.0) ) );
plt.xlabel('longitude (deg)');
plt.ylabel('latitude (deg)');
plt.title('Factor 3');
# plot coastlines on maps, using coarse coastlines from the file coastdata.txt
# this file consists of lists of (lon,lat) pairs along segments of coastline,
# with each list delimited by a ">".
coastfile = '../Data/coastdata.txt';
fd = open(coastfile,"r");
NN = 0;
NNMAX = 10000;
mylat = np.zeros((NNMAX,1));
mylon = np.zeros((NNMAX,1));
for i in range(100000):
line = fd.readline();
if not line:
break;
if( line[0] == '>' ):
if( NN == 0):
continue;
plt.plot( mylon[0:NN,0], mylat[0:NN,0], '.', color=(0.7,0.7,0.7) );
NN=0;
else:
latlonstr=line.split();
mylat[NN,0] = float(latlonstr[1]);
mylon[NN,0] = float(latlonstr[0]);
NN=NN+1;
fd.close();
plt.show();
In [17]:
# edapy_08_09
# CAC Pacific Sea Surface Temperature (SST) dataset
# read in and plot the data
# load the data
D = np.genfromtxt('../Data/cac_sst.txt', delimiter='\t')
# set up sizes of arrays
I, J = np.shape(D);
NBLOCK=85;
NLON = NBLOCK-1;
NLAT = J-1;
IMAGES = floor(I/NBLOCK);
Dt = 1/12;
print('Images:',IMAGES)
# arrays for SST, date, lats, lons
SST = np.zeros((NLON,NLAT,IMAGES));
thedate = np.zeros((IMAGES,1)); # in month.year format
theyear = np.zeros((IMAGES,1)); # year extracted from thedate
themonth = np.zeros((IMAGES,1)); # month extracted from thedate
lats=np.zeros((NLAT,1));
lons=np.zeros((NLON,1));
# cut up data into SST arrays
lats = np.copy(D[0,1:J]).T;
lons = np.copy(D[1:NBLOCK,0]);
for j in range(IMAGES):
k1 = j*NBLOCK;
k2 = k1+NLON;
thedate[j,0]=D[k1,0];
themonth[j,0]=floor(thedate[j,0]);
theyear[j,0]=floor(10000*(thedate[j,0]-themonth[j,0])+0.1);
SST[:,:,j] = np.copy(D[k1+1:k2+1,1:J]);
print("start year %d month %d" % (theyear[0,0], themonth[0,0]));
print("end year %d month %d" % (theyear[IMAGES-1,0], themonth[IMAGES-1,0]));
# some definitions related to plotting
MONTHS=12; # months in year
YEARS=ceil(IMAGES/12); # years of images
YBLOCKSIZE = 6; # 6 years per plot block of years
YBLOCKS = ceil(YEARS/YBLOCKSIZE); # number of blocks
print("%d blocks of years" % (YBLOCKS) );
# grey-scale colormap
bw = np.zeros((256,4));
v = (255 - np.linspace( 0, 255, 256 ))/255;
bw[:,0] = v;
bw[:,1] = v;
bw[:,2] = v;
bw[:,3] = np.ones(256);
bwcmap = ListedColormap(bw);
# plot data, 12 months by six years of maps per figure
lastmonth = (IMAGES % (YBLOCKSIZE*12)) % YBLOCKSIZE;
print("lastmonth %d" % (lastmonth));
done=0;
for yb in range(YBLOCKS):
fig = plt.figure(yb+2,figsize=(16,10));
for y in range(YBLOCKSIZE):
for m in range(MONTHS):
j = m+MONTHS*y+MONTHS*YBLOCKSIZE*yb;
k = y + m*YBLOCKSIZE;
if( j>=IMAGES):
done=1;
break;
ax = plt.subplot(MONTHS, YBLOCKSIZE, k+1);
left=0;
right=3.0;
bottom=0.0;
top=1.0;
plt.axis( [left, right, bottom, top] );
ax.xaxis.set_ticks([]);
ax.yaxis.set_ticks([]);
MAP=np.flipud(SST[:,:,j].T); # reorient so it plots as map
MAPmin = np.min(MAP);
MAPmax = np.max(MAP);
MAPrange = MAPmax-MAPmin;
if( MAPrange<=0.0 ):
MAPrange=1;
plt.imshow( (MAP-MAPmin)/MAPrange, cmap=bwcmap, vmin=0, vmax=1, extent=(left,right,bottom,top) );
if( y==0 ):
plt.ylabel("%d" % (themonth[j,0]) );
if( (m==11) or ((yb==(YBLOCKS-1)) and (m==(lastmonth-1))) ):
plt.xlabel("%d" % (theyear[j,0]) );
if( done==1 ):
break;
if( done==1 ):
break;
plt.show();
Images: 399
start year 1970 month 1
end year 2003 month 3
6 blocks of years
lastmonth 3
In [18]:
# edapy_08_10
# CAC Pacific Sea Surface Temperature (SST) dataset
# plot singular values
# load the data
D = np.genfromtxt('../Data/cac_sst.txt', delimiter='\t')
# set up sizes of arrays
I, J = np.shape(D);
NBLOCK=85;
NLON = NBLOCK-1;
NLAT = J-1;
IMAGES = floor(I/NBLOCK);
Dt = 1/12;
# arrays for SST, date, lats, lons
SST = np.zeros((NLON,NLAT,IMAGES));
thedate = np.zeros((IMAGES,1)); # in month.year format
theyear = np.zeros((IMAGES,1)); # year extracted from thedate
themonth = np.zeros((IMAGES,1)); # month extracted from thedate
lats=np.zeros((NLAT,1));
lons=np.zeros((NLON,1));
# cut up data into SST arrays
lats = np.copy(D[0,1:J]).T;
lons = np.copy(D[1:NBLOCK,0]);
for j in range(IMAGES):
k1 = j*NBLOCK;
k2 = k1+NLON;
thedate[j,0]=D[k1,0];
themonth[j,0]=floor(thedate[j,0]);
theyear[j,0]=floor(10000*(thedate[j,0]-themonth[j,0])+0.1);
SST[:,:,j] = np.copy(D[k1+1:k2+1,1:J]);
# some definitions related to plotting
MONTHS=12;
YEARS=ceil(IMAGES/12);
YBLOCKSIZE = 6;
YBLOCKS = ceil(YEARS/YBLOCKSIZE);
# fold out the images into the sample array
SSTV = np.zeros( (IMAGES, NLAT*NLON) );
for j in range(IMAGES):
for p in range(NLAT):
for q in range(NLON):
k = p + q*NLAT;
SSTV[j,k] = SST[q, p, j];
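# the triple loop above is equivalent to a transpose-and-reshape,
# since row-major flattening of each (NLON,NLAT) map gives the same
# index k = p + q*NLAT; a quick equivalence check (illustrative):
SSTV2 = np.reshape(np.transpose(SST,(2,0,1)),(IMAGES,NLAT*NLON));
assert np.allclose(SSTV,SSTV2);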
# singular value decomposition
[U, sigma, VT] = la.svd(SSTV,full_matrices=False); # singular values sigma
sh = np.shape(sigma);
Ns = sh[0];
F = np.copy(VT); # factors
C = np.matmul( U, np.diag(sigma) ); # factor loadings
# plot singular values
fig1 = plt.figure(1,figsize=(8,4));
ax1 = plt.subplot(1,1,1);
plt.axis( [0, Ns+1, 0, 1.1*np.max(sigma)] );
plt.plot( np.linspace(1,Ns,Ns), sigma, 'k-');
plt.title('Singular Values of the CAC dataset');
plt.xlabel('index i');
plt.ylabel('singular value i');
plt.show();
In [19]:
# edapy_08_11
# CAC Pacific Sea Surface Temperature (SST) dataset
# plot factors as maps and loadings as timeseries
# load the data
D = np.genfromtxt('../Data/cac_sst.txt', delimiter='\t');
# set up sizes of arrays
I, J = np.shape(D);
NBLOCK=85;
NLON = NBLOCK-1;
NLAT = J-1;
IMAGES = floor(I/NBLOCK);
Dt = 1/12;
# arrays for SST, date, lats, lons
SST = np.zeros((NLON,NLAT,IMAGES));
thedate = np.zeros((IMAGES,1)); # in month.year format
theyear = np.zeros((IMAGES,1)); # year extracted from thedate
themonth = np.zeros((IMAGES,1)); # month extracted from thedate
lats=np.zeros((NLAT,1));
lons=np.zeros((NLON,1));
# cut up data into SST arrays
lats = np.copy(D[0,1:J]).T;
lons = np.copy(D[1:NBLOCK,0]);
for j in range(IMAGES):
k1 = j*NBLOCK;
k2 = k1+NLON;
thedate[j,0]=D[k1,0];
themonth[j,0]=floor(thedate[j,0]);
theyear[j,0]=floor(10000*(thedate[j,0]-themonth[j,0])+0.1);
SST[:,:,j] = np.copy(D[k1+1:k2+1,1:J]);
# some definitions related to plotting
MONTHS=12;
YEARS=ceil(IMAGES/12);
YBLOCKSIZE = 6;
YBLOCKS = ceil(YEARS/YBLOCKSIZE);
# fold out the images into the sample matrix
SSTV = np.zeros( (IMAGES, NLAT*NLON) );
for j in range(IMAGES):
for p in range(NLAT):
for q in range(NLON):
k = p + q*NLAT;
SSTV[j,k] = SST[q, p, j];
# singular value decomposition
[U, sigma, VT] = la.svd(SSTV,full_matrices=False);
sh = np.shape(sigma); # sigma is the singular values
Ns = sh[0];
# keep only P singular values
P=Ns;
F = np.copy(VT[0:P,:]); # factors
C = np.matmul( U[:,0:P], np.diag(sigma[0:P])); # factor loadings
# fold EOF's back into images (maps)
Nf = 12; # just do a few EOF's
SSTF = np.zeros((NLON,NLAT,Nf));
for j in range(Nf):
for p in range(NLAT):
for q in range(NLON):
k = p + q*NLAT;
SSTF[ q, p, j ] = F[j,k];
# grey-scale colormap
bw = np.zeros((256,4));
v = (255 - np.linspace( 0, 255, 256 ))/255;
bw[:,0] = v;
bw[:,1] = v;
bw[:,2] = v;
bw[:,3] = np.ones(256);
bwcmap = ListedColormap(bw);
# plot these EOF's
fig1 = plt.figure(1,figsize=(16,6));
for f in range(Nf):
ax = plt.subplot(3, 4, f+1);
left=0;
right=3.0;
bottom=0.0;
top=1.0;
plt.axis( [left, right, bottom, top] );
ax.xaxis.set_ticks([]);
ax.yaxis.set_ticks([]);
MAP=np.flipud(SSTF[:,:,f].T); # reorient so it plots as map
MAPmin = np.min(MAP);
MAPmax = np.max(MAP);
MAPrange = MAPmax-MAPmin;
if( MAPrange<=0.0 ):
MAPrange=1;
plt.imshow( (MAP-MAPmin)/MAPrange, cmap=bwcmap, vmin=0, vmax=1, extent=(left,right,bottom,top) );
plt.title("EOF %d" % (f+1) );
plt.show();
# plot corresponding amplitude timeseries
fig2 = plt.figure(2,figsize=(16,12));
for f in range(Nf):
ax = plt.subplot(3, 4, f+1);
plt.axis( [theyear[0,0], theyear[IMAGES-1,0]+1, np.min(C[:,f]), np.max(C[:,f]) ] );
plt.plot( theyear[0,0]+np.linspace(1,IMAGES,IMAGES)/12, C[:,f], 'k-' );
plt.title("C(%d,t)" % (f+1) );
plt.show();
In [20]:
# edapy_08_12, approximate SST's using lowest factors
# CAC Pacific Sea Surface Temperature (SST) dataset
# load the data
D = np.genfromtxt('../Data/cac_sst.txt', delimiter='\t')
# set up sizes of arrays
I, J = np.shape(D);
NBLOCK=85;
NLON = NBLOCK-1;
NLAT = J-1;
IMAGES = floor(I/NBLOCK);
Dt = 1/12;
print('Images:',IMAGES)
# arrays for SST, date, lats, lons
SST = np.zeros((NLON,NLAT,IMAGES));
thedate = np.zeros((IMAGES,1)); # in month.year format
theyear = np.zeros((IMAGES,1)); # year extracted from thedate
themonth = np.zeros((IMAGES,1)); # month extracted from thedate
lats=np.zeros((NLAT,1));
lons=np.zeros((NLON,1));
# cut up data into SST arrays
lats = np.copy(D[0,1:J]).T;
lons = np.copy(D[1:NBLOCK,0]);
for j in range(IMAGES):
k1 = j*NBLOCK;
k2 = k1+NLON;
thedate[j,0]=D[k1,0];
themonth[j,0]=floor(thedate[j,0]);
theyear[j,0]=floor(10000*(thedate[j,0]-themonth[j,0])+0.1);
SST[:,:,j] = np.copy(D[k1+1:k2+1,1:J]);
print("start year %d month %d" % (theyear[0,0], themonth[0,0]));
print("end year %d month %d" % (theyear[IMAGES-1,0], themonth[IMAGES-1,0]));
# some definitions related to plotting
MONTHS=12;
YEARS=ceil(IMAGES/12);
YBLOCKSIZE = 6;
YBLOCKS = ceil(YEARS/YBLOCKSIZE);
# fold out images into the sample matrix
SSTV = np.zeros( (IMAGES, NLAT*NLON) );
for j in range(IMAGES):
for p in range(NLAT):
for q in range(NLON):
k = p + q*NLAT;
SSTV[j,k] = SST[q, p, j];
# singular value decomposition
[U, sigma, VT] = la.svd(SSTV,full_matrices=False);
sh = np.shape(sigma); # sigma is the singular values
Ns = sh[0];
# keep only first few singular values
P=5;
FP = np.copy(VT[0:P,:]); # factors
CP = np.matmul( U[:,0:P], np.diag(sigma[0:P])); # factor loadings
# reconstruct with a small number of EOFs
SSTVp = np.matmul(CP, FP);
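# The rank-P reconstruction is optimal in the least squares sense;
# its total squared misfit equals the sum of squares of the discarded
# singular values (Eckart-Young); a quick check (illustrative):
E2 = np.sum(np.power(SSTV-SSTVp,2));
assert np.isclose( E2, np.sum(np.power(sigma[P:Ns],2)) );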
# fold reconstructed sample matrix back into images (maps)
Nf = IMAGES;
SSTp = np.zeros((NLON,NLAT,Nf));
for j in range(Nf):
for p in range(NLAT):
for q in range(NLON):
k = p + q*NLAT;
SSTp[ q, p, j ] = SSTVp[j,k];
# grey-scale colormap
bw = np.zeros((256,4));
v = (255 - np.linspace( 0, 255, 256 ))/255;
bw[:,0] = v;
bw[:,1] = v;
bw[:,2] = v;
bw[:,3] = np.ones(256);
bwcmap = ListedColormap(bw);
# plot data, 12 months by six years of maps per figure
lastmonth = (IMAGES % (YBLOCKSIZE*12)) % YBLOCKSIZE;
done=0;
for yb in range(YBLOCKS):
fig = plt.figure(yb+2,figsize=(16,10));
for y in range(YBLOCKSIZE):
for m in range(MONTHS):
j = m+MONTHS*y+MONTHS*YBLOCKSIZE*yb;
k = y + m*YBLOCKSIZE;
if( j>=IMAGES):
done=1;
break;
ax = plt.subplot(MONTHS, YBLOCKSIZE, k+1);
left=0;
right=3.0;
bottom=0.0;
top=1.0;
plt.axis( [left, right, bottom, top] );
ax.xaxis.set_ticks([]);
ax.yaxis.set_ticks([]);
MAP=np.flipud(SSTp[:,:,j].T); # reorient so it plots as map
MAPmin = np.min(MAP);
MAPmax = np.max(MAP);
MAPrange = MAPmax-MAPmin;
if( MAPrange<=0.0 ):
MAPrange=1;
plt.imshow( (MAP-MAPmin)/MAPrange, cmap=bwcmap, vmin=0, vmax=1, extent=(left,right,bottom,top) );
if( y==0 ):
plt.ylabel("%d" % (themonth[j,0]) );
if( (m==11) or ((yb==(YBLOCKS-1)) and (m==(lastmonth-1))) ):
plt.xlabel("%d" % (theyear[j,0]) );
if( done==1 ):
break;
if( done==1 ):
break;
plt.show();
Images: 399
start year 1970 month 1
end year 2003 month 3
In [21]:
# edapy_08_13, create a dataset for cluster analysis
# Note: In order to prevent overwriting of the originally
# distributed dataset, filenames have been changed from
# "../Data/threeclusters.txt" to "../Data/mythreeclusters.txt" and
# "../Data/threeclustermeans.txt" to "../Data/mythreeclustermeans.txt"
# change them back if you want ...
K=3; # three clusters
# 100 samples per cluster
N1=100;
N2=100;
N3=100;
N=N1+N2+N3;
M=2; # two elements
# means set by hand
mu = np.zeros((K,M));
mu[0,0] = 1.0;
mu[0,1] = 2.0;
mu[1,0] = 3.0;
mu[1,1] = 4.0;
mu[2,0] = 4.0;
mu[2,1] = 1.0;
# sample matrix: Normally-distributed scatter about each cluster mean
S = np.zeros((N,M));
sc = 0.5;
S[0:N1,0:1] = np.random.normal( loc=mu[0,0], scale=sc, size=(N1,1) );
S[0:N1,1:2] = np.random.normal( loc=mu[0,1], scale=sc, size=(N1,1) );
S[N1:N1+N2,0:1] = np.random.normal( loc=mu[1,0], scale=sc, size=(N2,1) );
S[N1:N1+N2,1:2] = np.random.normal( loc=mu[1,1], scale=sc, size=(N2,1) );
S[N1+N2:N,0:1] = np.random.normal( loc=mu[2,0], scale=sc, size=(N3,1) );
S[N1+N2:N,1:2] = np.random.normal( loc=mu[2,1], scale=sc, size=(N3,1) );
# cluster membership
k1 = np.zeros((N1,1),dtype=int);
k2 = np.ones((N2,1),dtype=int);
k3 = 2*np.ones((N3,1),dtype=int);
myk = np.concatenate((k1,k2,k3),axis=0);
fig1 = plt.figure(1,figsize=(5,5));
ax1 = plt.subplot(1,1,1);
plt.axis( [0, 5, 0, 5] );
for i in range(N):
if myk[i,0]==0:
plt.plot( S[i,0], S[i,1], 'k.', lw=1 );
elif myk[i,0]==1:
plt.plot( S[i,0], S[i,1], 'k+', lw=1 );
else:
plt.plot( S[i,0], S[i,1], 'ko', lw=1 );
plt.plot( mu[0:K,0:1], mu[0:K,1:2], 'b^', lw=2 );
plt.xlabel('x');
plt.ylabel('y');
plt.show();
Dm = np.concatenate( (S,myk),axis=1);
np.savetxt("../Data/mythreeclusters.txt",Dm, delimiter="\t");
np.savetxt("../Data/mythreeclustermeans.txt",mu, delimiter="\t");
In [26]:
# edapy_08_14, cluster analysis w/ prior mean initialization
# load the data
K = 3;
D = np.genfromtxt("../Data/threeclusters.txt", delimiter='\t');
N, M = np.shape(D);
M = M-1;
S = np.copy( D[0:N,0:2] )
myk = np.ndarray.astype(D[0:N,2:3], int);
mu = np.genfromtxt("../Data/threeclustermeans.txt", delimiter='\t');
# initial means are the prior means
initmu = np.zeros((K,M));
initmu[0,0] = 1.2;
initmu[0,1] = 2.2;
initmu[1,0] = 3.4;
initmu[1,1] = 4.4;
initmu[2,0] = 4.1;
initmu[2,1] = 1.1;
# k-means analysis
mymu, myk, mydist2, mycount, myKdist2 = eda_kmeans(S, initmu);
print('cluster means');
print(mymu);
print('mean sample distance');
print(np.mean(np.sqrt(mydist2)));
print('cluster counts');
print(mycount);
print('mean inter-cluster distance');
print(np.mean(np.sqrt(myKdist2)));
fig2 = plt.figure(2,figsize=(5,5));
ax1 = plt.subplot(1,1,1);
plt.axis( [0, 5, 0, 5] );
plt.plot( initmu[0:K,0:1], initmu[0:K,1:2], 'r^', lw=2 )
for i in range(N):
if myk[i,0]==0:
plt.plot( S[i,0], S[i,1], 'k.', lw=1 );
elif myk[i,0]==1:
plt.plot( S[i,0], S[i,1], 'k+', lw=1 );
else:
plt.plot( S[i,0], S[i,1], 'ko', lw=1 );
plt.plot( mymu[0:K,0:1], mymu[0:K,1:2], 'b^', lw=2 )
plt.xlabel('x');
plt.ylabel('y');
plt.show();
cluster means
[[1.07859747 2.02539117]
 [3.04311042 4.05478594]
 [4.05072009 1.03414835]]
mean sample distance
0.6470009786672823
cluster counts
[[100]
 [100]
 [100]]
mean inter-cluster distance
2.0315147021424114
In [32]:
# edapy_08_15, cluster analysis w/ random-partition initialization
# load the data
K = 3;
D = np.genfromtxt("../Data/threeclusters.txt", delimiter='\t');
N, M = np.shape(D);
M = M-1;
S = np.copy( D[0:N,0:2] )
myk = np.ndarray.astype(D[0:N,2:3], int);
mu = np.genfromtxt("../Data/threeclustermeans.txt", delimiter='\t');
# initial means are from random partition of the data set
initmu = eda_random_partition_mean(S,K);
# k-means analysis
mymu, myk, mydist, mycount, myKdist = eda_kmeans(S, initmu);
print('cluster means');
print(mymu);
print('mean sample distance');
print(np.mean(np.sqrt(mydist)));
print('cluster counts');
print(mycount);
print('mean inter-cluster distance');
print(np.mean(np.sqrt(myKdist)));
fig3 = plt.figure(3,figsize=(5,5));
ax1 = plt.subplot(1,1,1);
plt.axis( [0, 5, 0, 5] );
plt.plot( initmu[0:K,0:1], initmu[0:K,1:2], 'r^', lw=4 )
for i in range(N):
if myk[i,0]==0:
plt.plot( S[i,0], S[i,1], 'k.', lw=1 );
elif myk[i,0]==1:
plt.plot( S[i,0], S[i,1], 'k+', lw=1 );
else:
plt.plot( S[i,0], S[i,1], 'ko', lw=1 );
plt.plot( mymu[0:K,0:1], mymu[0:K,1:2], 'b^', lw=4 );
plt.xlabel('x');
plt.ylabel('y');
plt.show();
cluster means
[[3.04311042 4.05478594]
 [4.05072009 1.03414835]
 [1.07859747 2.02539117]]
mean sample distance
0.6470009786672823
cluster counts
[[100]
 [100]
 [100]]
mean inter-cluster distance
2.0315147021424114
In [33]:
# edapy_08_16, cluster analysis w/ Forgy initialization
# load the data
K = 3;
D = np.genfromtxt("../Data/threeclusters.txt", delimiter='\t');
N, M = np.shape(D);
M = M-1;
S = np.copy( D[0:N,0:2] )
myk = np.ndarray.astype(D[0:N,2:3], int);
mu = np.genfromtxt("../Data/threeclustermeans.txt", delimiter='\t');
# initial means are randomly-chosen samples
initmu = eda_forgy_mean(S,K);
# k-means analysis
mymu, myk, mydist2, mycount, myKdist2 = eda_kmeans(S, initmu);
print('cluster means');
print(mymu);
print('mean sample distance');
print(np.mean(np.sqrt(mydist2)));
print('cluster counts');
print(mycount);
print('mean inter-cluster distance');
print(np.mean(np.sqrt(myKdist2)));
fig4 = plt.figure(4,figsize=(5,5));
ax1 = plt.subplot(1,1,1);
plt.axis( [0, 5, 0, 5] );
plt.plot( initmu[0:K,0:1], initmu[0:K,1:2], 'r^', lw=4 )
for i in range(N):
if myk[i,0]==0:
plt.plot( S[i,0], S[i,1], 'k.', lw=1 );
elif myk[i,0]==1:
plt.plot( S[i,0], S[i,1], 'k+', lw=1 );
else:
plt.plot( S[i,0], S[i,1], 'ko', lw=1 );
plt.plot( mymu[0:K,0:1], mymu[0:K,1:2], 'b^', lw=4 )
plt.xlabel('x');
plt.ylabel('y');
plt.show();
cluster means
[[1.07859747 2.02539117]
 [3.04311042 4.05478594]
 [4.05072009 1.03414835]]
mean sample distance
0.6470009786672823
cluster counts
[[100]
 [100]
 [100]]
mean inter-cluster distance
2.0315147021424114
In [34]:
# edapy_08_17, cluster analysis w/ Bradley-Fayyad initialization
# load the data
K = 3;
D = np.genfromtxt("../Data/threeclusters.txt", delimiter='\t');
N, M = np.shape(D);
M = M-1;
S = np.copy( D[0:N,0:2] )
myk = np.ndarray.astype(D[0:N,2:3], int);
mu = np.genfromtxt("../Data/threeclustermeans.txt", delimiter='\t');
# Bradley-Fayyad parameters
Nsets = 100; # number of small sets
fraction = 0.05; # size of small sets = fraction of N
# initial means are Bradley-Fayyad means
initmu = eda_bradley_fayyad_mean(S,K,Nsets,fraction);
# k-means analysis
mymu, myk, mydist2, mycount, myKdist2 = eda_kmeans(S, initmu);
print('cluster means');
print(mymu);
print('mean sample distance');
print(np.mean(np.sqrt(mydist2)));
print('cluster counts');
print(mycount);
print('mean inter-cluster distance');
print(np.mean(np.sqrt(myKdist2)));
fig5 = plt.figure(5,figsize=(5,5));
ax1 = plt.subplot(1,1,1);
plt.axis( [0, 5, 0, 5] );
plt.plot( initmu[0:K,0:1], initmu[0:K,1:2], 'r^', lw=4 )
for i in range(N):
if myk[i,0]==0:
plt.plot( S[i,0], S[i,1], 'k.', lw=1 );
elif myk[i,0]==1:
plt.plot( S[i,0], S[i,1], 'k+', lw=1 );
else:
plt.plot( S[i,0], S[i,1], 'ko', lw=1 );
plt.plot( mymu[0:K,0:1], mymu[0:K,1:2], 'b^', lw=4 )
plt.xlabel('x');
plt.ylabel('y');
plt.show();
cluster means
[[1.07859747 2.02539117]
 [3.04311042 4.05478594]
 [4.05072009 1.03414835]]
mean sample distance
0.6470009786672823
cluster counts
[[100]
 [100]
 [100]]
mean inter-cluster distance
2.0315147021424114
In [35]:
# edapy_08_18, more clusters than actually occur
# load the data
K = 3;
D = np.genfromtxt("../Data/threeclusters.txt", delimiter='\t');
N, M = np.shape(D);
M = M-1;
S = np.copy( D[0:N,0:2] )
myk = np.ndarray.astype(D[0:N,2:3], int);
mu = np.genfromtxt("../Data/threeclustermeans.txt", delimiter='\t');
# six clusters
K6=6;
# initial means are Forgy means
initmu = eda_forgy_mean(S,K6);
# k-means analysis
mymu, myk, mydist2, mycount, myKdist2 = eda_kmeans(S, initmu);
print('cluster means');
print(mymu);
print('mean sample distance');
print(np.mean(np.sqrt(mydist2)));
print('cluster counts');
print(mycount);
print('mean inter-cluster distance');
print(np.mean(np.sqrt(myKdist2)));
print('inter-cluster distances');
print(np.sqrt(myKdist2));
fig4 = plt.figure(4,figsize=(5,5));
ax1 = plt.subplot(1,1,1);
plt.axis( [0, 5, 0, 5] );
for i in range(N):
if myk[i,0]==0:
plt.plot( S[i,0], S[i,1], 'k.', lw=1 );
elif myk[i,0]==1:
plt.plot( S[i,0], S[i,1], 'k+', lw=1 );
elif myk[i,0]==2:
plt.plot( S[i,0], S[i,1], 'k^', lw=1 );
elif myk[i,0]==3:
plt.plot( S[i,0], S[i,1], 'kx', lw=1 );
elif myk[i,0]==4:
plt.plot( S[i,0], S[i,1], 'kv', lw=1 );
else:
plt.plot( S[i,0], S[i,1], 'ks', lw=1 );
plt.plot( mymu[0:K6,0:1], mymu[0:K6,1:2], 'r^', lw=2 )
plt.xlabel('x');
plt.ylabel('y');
plt.show();
cluster means
[[0.97471318 2.52414481]
 [4.05072009 1.03414835]
 [1.53932691 1.89484044]
 [2.58293311 4.04672791]
 [3.50328773 4.06284396]
 [0.70045566 1.46208681]]
mean sample distance
0.5349723526734314
cluster counts
[[ 40]
 [100]
 [ 32]
 [ 50]
 [ 50]
 [ 28]]
mean inter-cluster distance
2.0665781214090626
inter-cluster distances
[[0.         3.41788063 0.84546594 2.21464008 2.95994669 1.09689761]
 [3.41788063 0.         2.6547856  3.35112432 3.07777181 3.37748473]
 [0.84546594 2.6547856  0.         2.39159646 2.92530022 0.94391773]
 [2.21464008 3.35112432 2.39159646 0.         0.92049571 3.19751325]
 [2.95994669 3.07777181 2.92530022 0.92049571 0.         3.82358541]
 [1.09689761 3.37748473 0.94391773 3.19751325 3.82358541 0.        ]]
In [36]:
# edapy_08_19, clusters in the Atlantic Rocks dataset
# read data
S = np.genfromtxt('../Data/rocks.txt', delimiter='\t');
N, M = np.shape(S);
# examine mean sample-to-center distance for a sequence of
# numbers of clusters, K
KMAX = 11;
Klist = np.zeros((KMAX,1));
Dlist = np.zeros((KMAX,1));
print('K','mean_dist');
for K in range(1,KMAX):
initmu = eda_forgy_mean(S,K);
mymu, myk, mydist2, mycount, myKdist2 = eda_kmeans(S, initmu);
Klist[K,0]=K;
Dlist[K,0]=np.mean(np.sqrt(mydist2));
print(K,Dlist[K,0]);
# plot of distances
fig1 = plt.figure(1,figsize=(10,5));
ax1 = plt.subplot(1,1,1);
plt.axis( [0, KMAX, 0, 3] );
plt.plot( Klist[1:KMAX,0:1], Dlist[1:KMAX,0:1], 'k-', lw=2 )
plt.plot( Klist[1:KMAX,0:1], Dlist[1:KMAX,0:1], 'ko', lw=2 )
plt.xlabel('K');
plt.ylabel('mean Rij');
plt.show();
K mean_dist
1 2.450872469766111
2 2.159369027284355
3 1.87730362348076
4 1.6844183908052115
5 1.5978670360338794
6 1.523469254491392
7 1.4510963869048237
8 1.42084090605399
9 1.3982566808767423
10 1.3737027099481425
In [37]:
# edapy_08_20, print 8 clusters in the Atlantic Rocks dataset
# read the data
S = np.genfromtxt('../Data/rocks.txt', delimiter='\t')
N, M = np.shape(S);
K=8; # 8 clusters
initmu = eda_forgy_mean(S,K); # Forgy initial means
# k-means analysis
mymu, myk, mydist, mycount, myKdist = eda_kmeans(S, initmu);
# print table of cluster means
names = ["SiO2", "TiO2", "Al2O3", "FeO-t", "MgO", "CaO", "Na2O", "K2O" ];
print('cluster\t', end='');
for j in range(M):
print("%s\t" % (names[j]), end='');
print('count');
for i in range(K):
print("%d\t" % (i), end='');
for j in range(M):
print("%.2f\t" % (mymu[i,j]), end='');
print("%d" % (mycount[i,0]));
cluster SiO2   TiO2  Al2O3  FeO-t  MgO    CaO    Na2O  K2O   count
0       44.90  0.06  2.51   8.20   40.22  1.52   0.13  0.03  54
1       51.38  1.82  15.04  10.33  6.64   10.65  3.00  0.30  815
2       48.03  0.42  21.22  5.32   6.61   15.14  1.60  0.09  40
3       48.94  0.86  16.15  8.96   9.67   12.12  2.16  0.11  497
4       50.67  1.56  14.08  12.22  6.86   11.50  2.26  0.15  850
5       50.91  1.18  14.88  9.87   7.76   12.28  2.20  0.15  1397
6       48.54  1.19  16.74  8.54   6.99   12.92  2.32  0.28  516
7       50.52  1.44  15.68  9.40   7.83   11.43  2.71  0.19  2187
In [38]:
# edapy_08_21, 3D plot of clusters in Atlantic Rocks dataset
# read the data
S = np.genfromtxt('../Data/rocks.txt', delimiter='\t')
N, M = np.shape(S);
names = ["SiO2", "TiO2", "Al2O3", "FeO-t", "MgO", "CaO", "Na2O", "K2O" ];
K=8; # 8 clusters
initmu = eda_forgy_mean(S,K); # Forgy initial means
# k-means analysis
mymu, myk, mydist, mycount, myKdist = eda_kmeans(S, initmu);
# elements to plot (and their scale)
m1 = 2; # Al2O3
xmin = 0.0
xmax = 50.0;
m2 = 4; # MgO
ymin = 0.0
ymax = 50.0;
m3 = 5; # CaO
zmin = 0.0
zmax = 20.0;
# 3D plot
fig2 = plt.figure(2,figsize=(10,5));
ax1 = fig2.add_subplot(121, projection='3d')
ax1.axes.set_xlim3d(left=xmin, right=xmax);
ax1.axes.set_ylim3d(bottom=ymin, top=ymax);
ax1.axes.set_zlim3d(bottom=zmin, top=zmax);
ax1.view_init(1, 30);
ax1.scatter( xs=S[:,m1], ys=S[:,m2], zs=S[:,m3], color=(0,0,0) );
ax1.plot( xs=[xmin,xmin], ys=[ymin,ymin], zs=[zmin,zmax], color=(0.8,0.8,0.8) );
ax1.plot( xs=[xmin,xmin], ys=[ymin,ymax], zs=[zmin,zmin], color=(0.8,0.8,0.8) );
ax1.plot( xs=[xmin,xmax], ys=[ymin,ymin], zs=[zmin,zmin], color=(0.8,0.8,0.8) );
ax1.plot( xs=[xmax,xmax], ys=[ymax,ymax], zs=[zmax,zmin], color=(0.8,0.8,0.8) );
ax1.plot( xs=[xmax,xmax], ys=[ymax,ymin], zs=[zmax,zmax], color=(0.8,0.8,0.8) );
ax1.plot( xs=[xmax,xmin], ys=[ymax,ymax], zs=[zmax,zmax], color=(0.8,0.8,0.8) );
ax1.plot( xs=[xmax,xmin], ys=[ymin,ymin], zs=[zmax,zmax], color=(0.8,0.8,0.8) );
ax1.plot( xs=[xmax,xmax], ys=[ymin,ymin], zs=[zmax,zmin], color=(0.8,0.8,0.8) );
ax1.plot( xs=[xmin,xmin], ys=[ymax,ymin], zs=[zmax,zmax], color=(0.8,0.8,0.8) );
ax1.plot( xs=[xmin,xmin], ys=[ymax,ymax], zs=[zmax,zmin], color=(0.8,0.8,0.8) );
ax1.plot( xs=[xmax,xmax], ys=[ymax,ymin], zs=[zmin,zmin], color=(0.8,0.8,0.8) );
ax1.plot( xs=[xmax,xmin], ys=[ymax,ymax], zs=[zmin,zmin], color=(0.8,0.8,0.8) );
ax1.set_xlabel(names[m1]);
ax1.set_ylabel(names[m2]);
ax1.set_zlabel(names[m3]);
ax1 = fig2.add_subplot(122, projection='3d')
ax1.axes.set_xlim3d(left=xmin, right=xmax);
ax1.axes.set_ylim3d(bottom=ymin, top=ymax);
ax1.axes.set_zlim3d(bottom=zmin, top=zmax);
ax1.view_init(1, 30);
ax1.scatter( xs=mymu[:,m1], ys=mymu[:,m2], zs=mymu[:,m3], color=(0,0,0) );
ax1.plot( xs=[xmin,xmin], ys=[ymin,ymin], zs=[zmin,zmax], color=(0.8,0.8,0.8) );
ax1.plot( xs=[xmin,xmin], ys=[ymin,ymax], zs=[zmin,zmin], color=(0.8,0.8,0.8) );
ax1.plot( xs=[xmin,xmax], ys=[ymin,ymin], zs=[zmin,zmin], color=(0.8,0.8,0.8) );
ax1.plot( xs=[xmax,xmax], ys=[ymax,ymax], zs=[zmax,zmin], color=(0.8,0.8,0.8) );
ax1.plot( xs=[xmax,xmax], ys=[ymax,ymin], zs=[zmax,zmax], color=(0.8,0.8,0.8) );
ax1.plot( xs=[xmax,xmin], ys=[ymax,ymax], zs=[zmax,zmax], color=(0.8,0.8,0.8) );
ax1.plot( xs=[xmax,xmin], ys=[ymin,ymin], zs=[zmax,zmax], color=(0.8,0.8,0.8) );
ax1.plot( xs=[xmax,xmax], ys=[ymin,ymin], zs=[zmax,zmin], color=(0.8,0.8,0.8) );
ax1.plot( xs=[xmin,xmin], ys=[ymax,ymin], zs=[zmax,zmax], color=(0.8,0.8,0.8) );
ax1.plot( xs=[xmin,xmin], ys=[ymax,ymax], zs=[zmax,zmin], color=(0.8,0.8,0.8) );
ax1.plot( xs=[xmax,xmax], ys=[ymax,ymin], zs=[zmin,zmin], color=(0.8,0.8,0.8) );
ax1.plot( xs=[xmax,xmin], ys=[ymax,ymax], zs=[zmin,zmin], color=(0.8,0.8,0.8) );
ax1.set_xlabel(names[m1]);
ax1.set_ylabel(names[m2]);
ax1.set_zlabel(names[m3]);
plt.show();
In [39]:
# edapy_08_22, Script to support Cribsheet 8.2 on EOF's
# standard set-up of v-axis
M=100;
Dv = 1.0;
vmin = 0.0;
vmax = vmin + Dv*(M-1);
v = eda_cvec(np.linspace(vmin,vmax,M));
# standard set-up of u-axis
N=1000;
Du = 1.0;
umin = 0.0;
umax = umin + Du*(N-1);
u = eda_cvec(np.linspace(umin,umax,N));
# grey-scale colormap
bw = np.zeros((256,4));
vv = (255 - np.linspace( 0, 255, 256 ))/255;
bw[:,0] = vv;
bw[:,1] = vv;
bw[:,2] = vv;
bw[:,3] = np.ones(256);
bwcmap = ListedColormap(bw);
# Part 1: make synthetic data
# four true patterns
fv1 = np.mod(v,10)/10;
v0 = (vmin+vmax)/2;
s2 = (vmax-vmin)/5;
fv2 = np.exp(-0.5*np.power((v-v0)/s2,2));
v1 = (vmin+vmax)/4;
v2 = 3*(vmin+vmax)/4;
s2 = (vmax-vmin)/10;
fv3 = np.exp(-0.5*np.power((v-v1)/s2,2))-np.exp(-0.5*np.power((v-v2)/s2,2));
L = vmax-vmin;
n=10;
fv4 = np.cos(n*pi*v/L);
# plot u dependence of true patterns
print('true patterns');
fig1 = plt.figure(1,figsize=(10,5));
fvlist = [fv1, fv2, fv3, fv4];
for i in range(4):
fvi = fvlist[i];
ax1 = plt.subplot(4,1,i+1);
fmax = np.max(np.abs(fvi));
plt.axis( [vmin, vmax, -fmax, fmax] );
plt.plot( v, fvi, 'k-', lw=2 );
plt.xlabel('v');
plt.ylabel("fv%d" % (i+1));
plt.show()
# true amplitudes of patterns
L = umax-umin;
cu1 = (u-umin)/L;
cu2 = np.power((u-umin),0.25)/(L**0.25);
n=10;
cu3 = np.cos(n*pi*u/L);
cu4 = (100-np.mod(u,100))/100;
# plot true v dependence of amplitudes
print('amplitudes of true patterns');
fig2 = plt.figure(2,figsize=(10,5));
culist = [cu1, cu2, cu3, cu4];
for i in range(4):
cui = culist[i];
ax1 = plt.subplot(4,1,i+1);
cmax = np.max(np.abs(cui));
plt.axis( [umin, umax, -cmax, cmax] );
plt.plot( u, cui, 'k-', lw=2 );
plt.xlabel('u');
plt.ylabel("cu%d" % (i+1));
plt.show()
# build hypothetical sample matrix
S = np.matmul(
np.concatenate( (cu1, cu2, cu3, cu4), axis=1),
np.concatenate( (fv1.T, fv2.T, fv3.T, fv4.T), axis=0) );
Smax = np.max(np.abs(S));
S = S + np.random.normal(loc=0,scale=(0.1*Smax),size=(N,M));
# plot S
fig3 = plt.figure(3,figsize=(10,5));
ax1 = plt.subplot(1,1,1);
Smax = np.max(np.abs(S));
plt.axis( [vmin,vmax,umin,umax] );
plt.imshow(S,cmap=bwcmap,vmin=-Smax,vmax=Smax,extent=(vmin,vmax,umin,umax),aspect='auto');
plt.xlabel('v');
plt.ylabel('u');
plt.title('S');
plt.show();
# Part 2: singular value decomposition
# F is a matrix of the EOF's and
# C is a matrix of their amplitudes
# factors F and loadings C
[U, sigma, VT] = la.svd(S,full_matrices=False);
F = VT; # EOF's (factors)
C = np.matmul( U, np.diag(sigma) ); # amplitudes (loadings)
fig4 = plt.figure(4,figsize=(10,5));
ax1 = plt.subplot(1,1,1);
plt.axis( [0, M, 0, np.max(sigma)] );
plt.plot( np.linspace(1,M,M), sigma, 'k-', lw=1 );
plt.plot( np.linspace(1,M,M), sigma, 'ko', lw=2 );
plt.xlabel('i');
plt.ylabel('sigma i');
plt.title('singular values');
# Part 3: selection of the number P of significant EOF's
# On the basis of the last plot, P=4
P = 4;
FP = np.copy( F[0:P,:] );
CP = np.copy( C[:,0:P] );
SP = np.matmul( CP, FP );
# plot SP
fig5 = plt.figure(5,figsize=(10,5));
ax1 = plt.subplot(1,1,1);
SPmax = np.max(np.abs(SP));
plt.axis( [vmin,vmax,umin,umax] );
plt.imshow(SP,cmap=bwcmap,vmin=-Smax,vmax=Smax,extent=(vmin,vmax,umin,umax),aspect='auto');
plt.xlabel('v');
plt.ylabel('u');
plt.title('SP');
plt.show();
# Part 4: plot EOF's as a function of v
# and their amplitudes as a function of u
# plot EOF's
print('EOFs');
fig6 = plt.figure(6,figsize=(10,5));
for i in range(P):
ax1 = plt.subplot(4,1,i+1);
fvi = np.copy(FP[i:i+1,0:M]).T;
plt.axis( [vmin, vmax, -np.max(np.abs(fvi)), np.max(np.abs(fvi))])
plt.plot( v, fvi, 'k-', lw=2);
plt.xlabel('v');
plt.ylabel("fv%d" % (i+1));
plt.show();
# plot amplitudes
print('amplitudes of EOFs');
fig7 = plt.figure(7,figsize=(10,5));
for i in range(P):
ax1 = plt.subplot(4,1,i+1);
cui=np.copy(CP[0:N,i:i+1]);
plt.axis( [umin, umax, -np.max(np.abs(cui)), np.max(np.abs(cui))]);
plt.plot( u, cui, 'k-', lw=2);
plt.xlabel('u');
plt.ylabel("cu%d" % (i+1));
plt.show();
print('Note that the factors do not match the original patterns,');
print('although they do bear some resemblance to them.');
print("That's because any linear combination of factors will do.");
print("Note also that fv4 and cu4 are very noisy. That's because the");
print('P=4 singular value is close to the noise level.');
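# a quick demonstration (illustrative) of that non-uniqueness: rotate
# the first two factors by an arbitrary orthogonal matrix R while
# counter-rotating the loadings; the reconstruction CP*FP is unchanged
q0 = pi/6.0; # an arbitrary rotation angle
R = np.array([[cos(q0),sin(q0)],[-sin(q0),cos(q0)]]);
FPr = np.copy(FP); CPr = np.copy(CP);
FPr[0:2,0:M] = np.matmul(R,FP[0:2,0:M]);
CPr[0:N,0:2] = np.matmul(CP[0:N,0:2],R.T);
assert np.allclose(np.matmul(CPr,FPr), np.matmul(CP,FP));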
true patterns
amplitudes of true patterns
EOFs
amplitudes of EOFs
Note that the factors do not match the original patterns,
although they do bear some resemblance to them.
That's because any linear combination of factors will do.
Note also that fv4 and cu4 are very noisy. That's because the
P=4 singular value is close to the noise level.
In [ ]: