#TODO: handle missing values
#TODO: for certain sizes, calculate the full covariance matrix, but then use an iterative method directly on it?
# EOF order axis
from pygeode.axis import Axis
class order(Axis):
name = 'eof'
formatstr = '%d'
plotatts = Axis.plotatts.copy()
plotatts['plotname'] = 'EOF'
del Axis
def apply_weights (var, weight):
# Normalize by area weight?
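# The weights are scaled to have unit mean, and their square root is applied
# to the field, so that products of two weighted fields (i.e. the covariances)
# carry the full area weight.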
if weight is not False:
if weight is True: weight = var.getweights()
W = weight.sum() / weight.size
weight /= W
weight = weight.sqrt()
name = var.name
var *= weight
var.name = name
return var
def remove_weights (var, weight):
# Undo normalization by area weight?
# (should be the exact opposite of the apply_weights function)
if weight is not False:
if weight is True: weight = var.getweights()
W = weight.sum() / weight.size
weight /= W
weight = weight.sqrt()
name = var.name
var /= weight
var.name = name
return var
# Parse the output list
def whichout (out):
# Valid options
namemap = [ 'eof|eofs', 'eig|eigenvalue|eigenvalues', 'pc|timeseries', 'var|variance', 'frac|varfrac', 'eig2' ]
namemap = [ n.split('|') for n in namemap ]
names = [ n for N in namemap for n in N ]
# Defaults
if out is None: out = ('eof','eig','pc')
# Single string (comma separated values?)
if isinstance(out,str): out = out.split(',')
assert isinstance(out,(list,tuple)), "Unknown output type"
for o in out: assert isinstance(o,str), "Expected string argument, found %s"%type(o)
# Lower case
out = [o.lower() for o in out]
for o in out: assert o in names, "Unrecognized output '%s'"%o
return out
# Prepare the variable: apply weights, determine the record and spatial axes
def prep (var, iaxis, weight, out):
from pygeode.timeaxis import Time
from pygeode.var import Var
from pygeode.axis import Axis
from pygeode.view import View
from warnings import warn
from pygeode import MAX_ARRAY_SIZE
assert isinstance(var,Var)
assert var.naxes >= 2, "need at least 2 axes"
# Check the outputs
out = whichout(out)
del out # not actually used here
# Keep the name
name = var.name
# Normalize by area weight?
var = apply_weights(var, weight=weight)
del weight
timeaxes = iaxis
del iaxis
if timeaxes is None:
if var.hasaxis(Time): timeaxes = Time
else:
warn ("No explicit record axis provided. Using the first axis.", stacklevel=2)
timeaxes = 0
# Keep the record axis/axes as a tuple
# (in case we have more than one axis, i.e. time and ensemble)
if not isinstance(timeaxes,(list,tuple)):
assert isinstance(timeaxes,(int,str,Axis)) or (isinstance(timeaxes,type) and issubclass(timeaxes,Axis)), 'unknown iaxis type %s'%type(timeaxes)
timeaxes = [timeaxes]
# Convert the axes to integer ids
timeaxes = [var.whichaxis(a) for a in timeaxes]
spaceaxes = [i for i in range(var.naxes) if i not in timeaxes]
# Convert to axis objects
timeaxes = [var.axes[i] for i in timeaxes]
spaceaxes = [var.axes[i] for i in spaceaxes]
# Create a view, to hold the axes together
# (provides us with other useful stuff, like a notion of 'shape' and 'size')
time = View(axes=timeaxes)
space = View(axes=spaceaxes)
# var = SquishedVar(var, timeaxes, spaceaxes)
# Preload the data, if possible
if var.size <= MAX_ARRAY_SIZE: var = var.load()
return var, time, space
# Remove weights, wrap as Vars
def finalize (var, time, space, eof, eig, pc, variance, weight, out):
from pygeode.var import Var
import numpy as np
# Keep the name
name = var.name
num = eof.shape[0]
# Conform to the proper shape
eof = eof.reshape((num,)+space.shape)
pc = pc.reshape((num,)+time.shape)
# Use a consistent sign convention
# (the first element of the eof is non-negative)
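# (flipping both the EOF and its PC leaves their product, and hence any
# reconstruction of the data, unchanged)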
sign = [-1 if eof.reshape(num,-1)[i,0] < 0 else 1 for i in range(num)]
for i,s in enumerate(sign):
eof[i,...] *= s
pc[i,...] *= s
# Copy the data into fresh array
# (that way, if these are views of much larger arrays, the rest of
# the unused data can be garbage collected)
eof = np.array(eof)
pc = np.array(pc)
eig = np.array(eig)
# EOF axis
orderaxis = order(num,name="order")
eof = Var((orderaxis,)+space.axes, values=eof)
eig = Var([orderaxis], values=eig)
pc = Var((orderaxis,)+time.axes, values = pc)
# Undo normalization by area weight?
eof = remove_weights(eof, weight=weight).load()
eof.name = name + "_EOF"
pc.name = name + "_timeseries"
eig.name = name + "_eigenvalues"
# Other things
# Fraction of total variance
frac = ((eig**2) / variance).load()
frac.name = name + "_fraction_of_variance"
# Eigenvalues of the covariance matrix
eig2 = (eig**2).load()
eig2.name = name + "_eigenvalues"
# Gather up all possible outputs
outdict = dict(eof=eof, eig=eig, eig2=eig2, pc=pc, var=variance, frac=frac)
out = whichout(out)
out = [outdict[o] for o in out]
return out
##################################################
# Iterative EOF method
##################################################
# After each iteration, does an explicit eigenvalue decomposition on a much
# smaller (subspace x subspace) matrix.  This should converge faster than a
# simple power method.
# Uses EOF_guess for the initial patterns.
# Assumes the mean has already been subtracted!
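# Each iteration:
#   1) apply the covariance matrix to the current trial patterns (one pass
#      through the data, in memory-sized chunks),
#   2) build a small surrogate matrix from inner products of the old and new
#      patterns,
#   3) eigendecompose the surrogate matrix, rotate the trial patterns
#      accordingly, and re-normalize.
# Iteration stops when the leading 'num' patterns stop changing.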
def EOF_iter (x, num=1, iaxis=None, subspace = -1, max_iter=1000, weight=True, out=None):
"""
(See svd.SVD for documentation on a similar function, but replace each xxx1 and xxx2 parameter with a single xxx parameter.)
"""
import numpy as np
from pygeode import libpath
from pygeode.view import View
from math import sqrt
from pygeode.varoperations import fill
from pygeode import svdcore as lib
# Need vector subspace to be at least as large as the number of EOFs extracted.
if subspace < num: subspace = num
# Run the single-pass guess to seed the first iteration
guess_eof, guess_eig, guess_pc = EOF_guess (x, subspace, iaxis, weight=weight, out=None)
# Convert NaNs to zeros so they don't screw up the matrix operations
guess_eof = fill (guess_eof, 0)
x, time, space = prep(var=x, iaxis=iaxis, weight=weight, out=out)
del iaxis
eofshape = (subspace,) + space.shape
pcshape = time.shape + (subspace,)
pcs = np.empty(pcshape,dtype='d')
oldeofs = np.empty(eofshape,dtype='d')
# Seed with initial guess (in the weighted space)
neweofs = apply_weights (guess_eof, weight=weight).get()
neweofs = np.array(neweofs, dtype='d') # so we can write
# neweofs = np.random.rand(*eofshape)
# Workspace for smaller representative matrix
work1 = np.empty([subspace,subspace], dtype='d')
work2 = np.empty([subspace,subspace], dtype='d')
NX = space.size
# Variance accumulation (on first iteration only)
variance = 0.0
for iter_num in range(1,max_iter+1):
print('iter_num: %d'%iter_num)
neweofs, oldeofs = oldeofs, neweofs
# Reset the accumulation arrays for the next approximations
neweofs[()] = 0
# Apply the covariance matrix
for inview in View(x.axes).loop_mem():
X = np.ascontiguousarray(inview.get(x), dtype='d')
assert X.size >= space.size, "spatial pattern is too large"
nt = inview.shape[0]
time_offset = inview.slices[0].start
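# build_eofs (in svdcore) applies this chunk of the data to the old patterns:
# it accumulates the chunk's contribution to (covariance x oldeofs) into
# neweofs and writes the chunk's projections onto the old patterns into pcs,
# starting at row time_offset (inferred from how the results are used below).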
ier = lib.build_eofs (subspace, nt, NX, X, oldeofs,
neweofs, pcs[time_offset,...])
assert ier == 0
# Compute variance?
if iter_num == 1:
variance += (X**2).sum()
# Useful dot products
lib.dot(subspace, NX, oldeofs, neweofs, work1)
lib.dot(subspace, NX, neweofs, neweofs, work2)
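# (presumably work1[i,j] = <oldeofs[i], neweofs[j]> and
# work2[i,j] = <neweofs[i], neweofs[j]>, inner products over the spatial
# dimension)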
# Compute surrogate matrix (using all available information from this iteration)
A, residues, rank, s = np.linalg.lstsq(work1,work2,rcond=1e-30)
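# A is the least-squares solution of work1 @ A = work2, i.e. a small
# (subspace x subspace) approximation of the covariance operator expressed
# in the current basis.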
# Eigendecomposition on surrogate matrix
w, P = np.linalg.eig(A)
# Sort by eigenvalue
S = np.argsort(w)[::-1]
w = w[S]
print(w)
# assert P.dtype.name == 'float64', P.dtype.name
P = np.ascontiguousarray(P[:,S], dtype='d')
# Translate the surrogate eigenvectors to an estimate of the true eigenvectors
lib.transform(subspace, NX, P, neweofs)
# Normalize
lib.normalize (subspace, NX, neweofs)
# # verify orthogonality
# for i in range(num):
# print [np.dot(neweofs[i,...].flatten(), neweofs[j,...].flatten()) for j in range(num)]
if np.allclose(oldeofs[:num,...],neweofs[:num,...], atol=0):
print('converged after %d iterations'%iter_num)
break
assert iter_num != max_iter, "no convergence"
# Wrap as pygeode vars, and return
# Only need some of the eofs for output (the rest might not have even converged yet)
eof = neweofs[:num]
pc = pcs[...,:num].transpose()
# Extract the eigenvalues
# (compute magnitude of pc arrays)
#TODO: keep eigenvalues as a separate variable in the iteration loop
eig = np.array([sqrt( (pc[i,...]**2).sum() ) for i in range(pc.shape[0]) ])
pc = np.dot(np.diag(1/eig), pc)
return finalize (x, time, space, eof, eig, pc, variance, weight=weight, out=out)
##################################################
# Single-pass EOF guess
# Gives a reasonable qualitative representation of the EOF patterns
##################################################
def EOF_guess (x, num=1, iaxis=None, weight=True, out=None):
import numpy as np
from pygeode.var import Var
from pygeode.view import View
from pygeode import eofcore as lib
x, time, space = prep (x, iaxis, weight=weight, out=out)
del iaxis
print("working on array shape %s"%(x.shape,))
# Initialize workspace
work = lib.start (num, space.size)
eof = np.empty((num,)+space.shape, dtype='d')
eig = np.empty([num], dtype='d')
pc = np.empty((num,)+time.shape, dtype='d')
# Variance accumulation
variance = 0.0
# Loop over chunks of the data
for inview in View(x.axes).loop_mem():
X = np.ascontiguousarray(inview.get(x), dtype='d')
assert X.size >= space.size, "Spatial pattern is too large"
nrec = X.size // space.size
lib.process (work, nrec, X)
# Accumulate variance
variance += (X**2).sum()
# Get result
lib.endloop (work, eof, eig, pc)
# Free workspace
lib.finish (work)
# Wrap as pygeode Vars and return
return finalize (x, time, space, eof, eig, pc, variance, weight=weight, out=out)
##################################################
# Compute EOFs directly from a singular value decomposition
# (works best on small data arrays)
##################################################
def EOF_svd (x, num=1, iaxis=None, weight=True, out=None):
import numpy as np
x, time, space = prep (x, iaxis, weight=weight, out=out)
del iaxis
# SVD
data = x.get().reshape([time.size, space.size])
variance = (data**2).sum()
u, s, v = np.linalg.svd(data, full_matrices=False)
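# Rows of v are the (unit-norm) spatial patterns, s holds the singular
# values, and columns of u are the corresponding unit-norm time series.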
# print '?', u.shape, s.shape, v.shape
eof = v[:num,...]
eig = s[:num]
pc = u.transpose()[:num,...]
# print eof.shape, eig.shape, pc.shape
return finalize (x, time, space, eof, eig, pc, variance, weight=weight, out=out)
##################################################
# Compute EOFs explicitly from the covariance matrix
# (works best when the covariance matrix is small)
# Steps:
# 1) Scan through the data, construct an explicit covariance matrix
# 2) Compute the eigenvalues & eigenvectors
# 3) Scan through the data, construct the timeseries data
##################################################
def EOF_cov (x, num=1, iaxis=None, weight=True, out=None):
import numpy as np
from pygeode.view import View
x, time, space = prep (x, iaxis, weight=weight, out=out)
del iaxis
# Initialize space for accumulating the covariance matrix
cov = np.zeros ([space.size, space.size], dtype='d')
# Accumulate the covariance
for inview in View(x.axes).loop_mem():
X = inview.get(x)
assert X.size >= space.size, "Spatial pattern is too large"
X = X.reshape(-1,space.size)
cov += np.dot(X.transpose(),X)
# Compute the eigenvalues & eigenvectors of the covariance matrix
w, v = np.linalg.eigh(cov/(time.size-1))
variance = w.sum()
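# eigh returns eigenvalues in ascending order, so reverse both the
# eigenvalues and eigenvectors to put the leading modes first.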
eig = np.sqrt(w[::-1][:num])
eof = v.transpose()[::-1,:][:num,:]
# Compute the timeseries
pc = []
for inview in View(x.axes).loop_mem():
X = inview.get(x).reshape(-1,space.size)
pc.append(np.dot(eof, X.transpose()))
pc = np.concatenate(pc, axis=1)
# Normalize
pc /= eig.reshape(num,1)
return finalize (x, time, space, eof, eig, pc, variance, weight=weight, out=out)
##################################################
# Compute EOFs
# Determine which routine to use (svd, covariance, iterative)
# based on the size of the data
##################################################
def EOF (x, num=1, iaxis=None, weight=True, out=None):
"""
Computes the leading Empirical Orthogonal Function(s) for the given
variable.
Parameters
----------
x : Var object
The data of interest
num : integer, optional
The number of leading EOFs to calculate. Default is ``1``.
iaxis : Axis object, Axis class, string, or integer, optional
Which axis/axes to treat as the record axis. Multiple axes can be
passed as a tuple. Default is the ``Time`` axis of the data
(if found), otherwise the leftmost axis.
weight : Var object or boolean, optional
Weights to use for the orthogonality condition.
If ``True``, it uses whatever internal weights the variable possesses.
If ``False`` or ``None``, it doesn't use any weights.
You can also pass in a Var object with explicit weights.
Default is ``True``.
out : string, optional
Which outputs to return. This is a comma-separated string,
built from the following keywords:
======= ==========================================================
keyword meaning
======= ==========================================================
EOF The EOFs (normalized to unit variance)
EIG Eigenvalues (of the singular value decomposition of
the variable). If you want the eigenvalues of the
covariance matrix, square these, or use EIG2.
EIG2 Eigenvalues (of the covariance matrix)
VAR Variance (a single scalar value).
FRAC Fraction of total variance explained by each EOF.
PC Principal components (timeseries data), normalized
to unit variance.
======= ==========================================================
Default is ``'EOF,EIG,PC'``
Returns
-------
eof_decomposition : tuple
A combination of EOFs, eigenvalues, or other computed quantities specified
by ``out``.
Notes
-----
This routine doesn't do any pre-processing of the data, such as removing
the mean or detrending. If you want to work with anomalies, then you'll
have to first compute the anomalies!
This routine tries to automatically determine the best way to solve the
EOFs. If you want to use a particular method, you can call the following
functions (with the same parameters):
========= ============================================================
function behaviour
========= ============================================================
EOF_iter Iterative solver (uses a variant of the power method).
EOF_cov Calculates the full covariance matrix, and then does an
explicit eigendecomposition.
EOF_svd Does an explicit singular value decomposition on the data.
EOF_guess Returns an approximation of the EOF decomposition from one
pass through the data. This may be useful if you have a
large dataset, and you just want the qualitative features
of the EOF spatial patterns.
========= ============================================================
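Examples
--------
A minimal usage sketch (illustrative only; ``x`` here stands for a Var of
anomalies with a time axis)::

  eof, eig, pc = EOF(x, num=2)
  frac, = EOF(x, num=2, out='frac')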
"""
from math import sqrt
from pygeode import MAX_ARRAY_SIZE
# Maximum size allowed for explicit solvers
#TODO: run some actual tests to determine this number
max_explicit_size = int(sqrt(MAX_ARRAY_SIZE))
# Get the time and space dimensions
junk, time, space = prep (x, iaxis=iaxis, weight=weight, out=out)
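# Small data array: do an explicit SVD.  Small spatial dimension: build the
# covariance matrix explicitly.  Otherwise, fall back on the iterative solver.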
if x.size <= max_explicit_size: f = EOF_svd
elif space.size**2 <= max_explicit_size: f = EOF_cov
else: f = EOF_iter
return f (x, num=num, iaxis=iaxis, weight=weight, out=out)