Source code for pygeode.eof

#TODO: handle missing values
#TODO: for certain sizes, calculate the full covariance matrix, but then use an iterative method directly on it?

# EOF order axis
from pygeode.axis import Axis
class order(Axis): 
  name = 'eof'
  formatstr = '%d'

  plotatts = Axis.plotatts.copy()
  plotatts['plotname'] = 'EOF'
del Axis


def apply_weights (var, weight):
  # Normalize by area weight?
  if weight is not False:
    if weight is True:  weight = var.getweights()
    W = weight.sum() / weight.size
    weight /= W
    weight = weight.sqrt()
    name = var.name
    var *= weight
    var.name = name
  return var

def remove_weights (var, weight):
  # Undo normalization by area weight?
  # (should be the exact opposite of the 'weight' function
  if weight is not False:
    if weight is True:  weight = var.getweights()
    W = weight.sum() / weight.size
    weight /= W
    weight = weight.sqrt()
    name = var.name
    var /= weight
    var.name = name
  return var

# Parse the output list
def whichout (out):
  # Valid options
  namemap = [ 'eof|eofs', 'eig|eigenvalue|eigenvalues', 'pc|timeseries', 'var|variance', 'frac|varfrac', 'eig2' ]
  namemap = [ n.split('|') for n in namemap ]
  names = [ n for N in namemap for n in N ]
  # Defaults
  if out is None: out = ('eof','eig','pc')
  # Single string (comma separated values?)
  if isinstance(out,str): out = out.split(',')
  assert isinstance(out,(list,tuple)), "Unknown output type"
  for o in out: assert isinstance(o,str), "Expected string argument, found %s"%type(o)
  # Lower case
  out = [o.lower() for o in out]
  for o in out: assert o in names, "Unrecognized output '%s'"%o
  return out

##prep the var
def prep (var, iaxis, weight, out):
  from pygeode.timeaxis import Time
  from pygeode.var import Var
  from pygeode.axis import Axis
  from pygeode.view import View
  from warnings import warn
  from pygeode import MAX_ARRAY_SIZE

  assert isinstance(var,Var)
  assert var.naxes >= 2, "need at least 2 axes"

  # Check the outputs
  out = whichout(out)
  del out # not actually used here

  # Keep the name
  name = var.name

  # Normalize by area weight?
  var = apply_weights(var, weight=weight)
  del weight

  timeaxes = iaxis
  del iaxis
  if timeaxes is None:
    if var.hasaxis(Time): timeaxes = Time
    else:
      warn ("No explicit record axis provided.  Using the first axis.", stacklevel=2)
      timeaxes = 0

  # Keep the record axis/axes as a tuple
  # (in case we have more than one axis, i.e. time and ensemble)
  if not isinstance(timeaxes,(list,tuple)):
    assert isinstance(timeaxes,int) or issubclass(timeaxes,Axis), 'unknown iaxis type %s'%type(timeaxes)
    timeaxes = [timeaxes]

  # Convert the axes to integer ids
  timeaxes = [var.whichaxis(a) for a in timeaxes]
  spaceaxes = [i for i in range(var.naxes) if i not in timeaxes]

  # Convert to axis objects
  timeaxes = [var.axes[i] for i in timeaxes]
  spaceaxes = [var.axes[i] for i in spaceaxes]

  # Create a view, to hold the axes together
  # (provides us with other useful stuff, like a notion of 'shape' and 'size')
  time = View(axes=timeaxes)
  space = View(axes=spaceaxes)

#  var = SquishedVar(var, timeaxes, spaceaxes)

  # Preload the data, if possible
  if var.size <= MAX_ARRAY_SIZE: var = var.load()

  return var, time, space



# Remove weights, wrap as Vars
def finalize (var, time, space, eof, eig, pc, variance, weight, out):
  from pygeode.var import Var
  import numpy as np

  # Keep the name
  name = var.name
  num = eof.shape[0]

  # Conform to the proper shape
  eof = eof.reshape((num,)+space.shape)
  pc = pc.reshape((num,)+time.shape)

  # Use a consistent sign convention
  # (the first element of the eof is non-negative)
  sign = [-1 if eof.reshape(num,-1)[i,0] < 0 else 1 for i in range(num)]
  for i,s in enumerate(sign):
    eof[i,...] *= s
    pc[i,...] *= s

  # Copy the data into fresh array
  # (that way, if these are view on much larger arrays, the rest of
  #  the unused data can be garbage collected)
  eof = np.array(eof)
  pc = np.array(pc)
  eig = np.array(eig)

  # EOF axis
  orderaxis = order(num,name="order")
  eof = Var((orderaxis,)+space.axes, values=eof)
  eig = Var([orderaxis], values=eig)
  pc = Var((orderaxis,)+time.axes, values = pc)

  # Undo normalization by area weight?
  eof = remove_weights(eof, weight=weight).load()

  eof.name = name + "_EOF"
  pc.name = name + "_timeseries"
  eig.name = name + "_eigenvalues"

  # Other things
  # Fraction of total variance
  frac = ((eig**2) / variance).load()
  frac.name = name + "_fraction_of_variance"
  # Eigenvalues of the covariance matrix
  eig2 = (eig**2).load()
  eig2.name = name + "_eigenvalues"

  # Gather up all possible outputs
  outdict = dict(eof=eof, eig=eig, eig2=eig2, pc=pc, var=variance, frac=frac)
  out = whichout(out)
  out = [outdict[o] for o in out]

  return out


##################################################
# Iterative EOF method
##################################################

# After each iteration, does an explicit eigenvalue decomposition on a much
# smaller matrix.  Hopefully, this will converge faster than a simple
# power-method.
# Uses EOF_guess for initial patterns

# Assume the mean has already been subtracted!
def EOF_iter (x, num=1, iaxis=None, subspace = -1, max_iter=1000, weight=True, out=None):
  """
  (See svd.SVD for documentation on a similar function, but replace each xxx1 and xxx2 parameter with a single xxx parameter.)
  """
  import numpy as np
  from pygeode import libpath
  from pygeode.view import View
  from math import sqrt
  from pygeode.varoperations import fill
  from pygeode import svdcore as lib

  # Need vector subspace to be at least as large as the number of EOFs extracted.
  if subspace < num: subspace = num

  # Run the single-pass guess to seed the first iteration
  guess_eof, guess_eig, guess_pc = EOF_guess (x, subspace, iaxis, weight=weight, out=None)
  # Convert NaNs to zeros so they don't screw up the matrix operations
  guess_eof = fill (guess_eof, 0)

  x, time, space = prep(var=x, iaxis=iaxis, weight=weight, out=out)
  del iaxis

  eofshape =  (subspace,) + space.shape
  pcshape =  time.shape + (subspace,)

  pcs = np.empty(pcshape,dtype='d')

  oldeofs = np.empty(eofshape,dtype='d')
  # Seed with initial guess (in the weighted space)
  neweofs = apply_weights (guess_eof, weight=weight).get()
  neweofs = np.array(neweofs, dtype='d')  # so we can write
#  neweofs = np.random.rand(*eofshape)

  # Workspace for smaller representative matrix
  work1 = np.empty([subspace,subspace], dtype='d')
  work2 = np.empty([subspace,subspace], dtype='d')

  NX = space.size

  # Variance accumulation (on first iteration only)
  variance = 0.0

  for iter_num in range(1,max_iter+1):

    print('iter_num: %d'%iter_num)

    neweofs, oldeofs = oldeofs, neweofs

    # Reset the accumulation arrays for the next approximations
    neweofs[()] = 0

    # Apply the covariance matrix
    for inview in View(x.axes).loop_mem():
      X = np.ascontiguousarray(inview.get(x), dtype='d')
      assert X.size >= space.size, "spatial pattern is too large"

      nt = inview.shape[0]
      time_offset = inview.slices[0].start
      ier = lib.build_eofs (subspace, nt, NX, X, oldeofs,
                            neweofs, pcs[time_offset,...])
      assert ier == 0

      # Compute variance?
      if iter_num == 1:
        variance += (X**2).sum()

    # Useful dot products
    lib.dot(subspace, NX, oldeofs, neweofs, work1)
    lib.dot(subspace, NX, neweofs, neweofs, work2)

    # Compute surrogate matrix (using all available information from this iteration)
    A, residues, rank, s = np.linalg.lstsq(work1,work2,rcond=1e-30)

    # Eigendecomposition on surrogate matrix
    w, P = np.linalg.eig(A)

    # Sort by eigenvalue
    S = np.argsort(w)[::-1]
    w = w[S]
    print(w)
#    assert P.dtype.name == 'float64', P.dtype.name
    P = np.ascontiguousarray(P[:,S], dtype='d')

    # Translate the surrogate eigenvectors to an estimate of the true eigenvectors
    lib.transform(subspace, NX, P, neweofs)

    # Normalize
    lib.normalize (subspace, NX, neweofs)

#    # verify orthogonality
#    for i in range(num):
#      print [np.dot(neweofs[i,...].flatten(), neweofs[j,...].flatten()) for j in range(num)]

    if np.allclose(oldeofs[:num,...],neweofs[:num,...], atol=0):
      print('converged after %d iterations'%iter_num)
      break

  assert iter_num != max_iter, "no convergence"

  # Wrap as pygeode vars, and return
  # Only need some of the eofs for output (the rest might not have even converged yet)
  eof = neweofs[:num]
  pc = pcs[...,:num].transpose()

  # Extract the eigenvalues
  # (compute magnitude of pc arrays)
  #TODO: keep eigenvalues as a separate variable in the iteration loop
  eig = np.array([sqrt( (pc[i,...]**2).sum() ) for i in range(pc.shape[0]) ])
  pc = np.dot(np.diag(1/eig), pc)

  return finalize (x, time, space, eof, eig, pc, variance, weight=weight, out=out)






##################################################
# Single-pass EOF guess
# Gives a reasonable qualitative representation of the EOF patterns
##################################################
def EOF_guess (x, num=1, iaxis=None, weight=True, out=None):
  import numpy as np
  from pygeode.var import Var
  from pygeode.view import View
  from pygeode import eofcore as lib

  x, time, space = prep (x, iaxis, weight=weight, out=out)
  del iaxis

  print("working on array shape %s"%(x.shape,))

  # Initialize workspace
  work = lib.start (num, space.size)

  eof = np.empty((num,)+space.shape, dtype='d')
  eig = np.empty([num], dtype='d')
  pc = np.empty((num,)+time.shape, dtype='d')

  # Variance accumulation
  variance = 0.0

  # Loop over chunks of the data
  for inview in View(x.axes).loop_mem():
    X = np.ascontiguousarray(inview.get(x), dtype='d')
    assert X.size >= space.size, "Spatial pattern is too large"
    nrec = X.size // space.size
    lib.process (work, nrec, X)

    # Accumulate variance
    variance += (X**2).sum()

  # Get result
  lib.endloop (work, eof, eig, pc)

  # Free workspace
  lib.finish (work)

  # Wrap the stuff
  return finalize (x, time, space, eof, eig, pc, variance, weight=weight, out=out)



##################################################
# Compute EOFs directly from a singular value decomposition
# (works best on small data arrays)
##################################################
def EOF_svd (x, num=1, iaxis=None, weight=True, out=None):
  import numpy as np

  x, time, space = prep (x, iaxis, weight=weight, out=out)
  del iaxis

  # SVD
  data = x.get().reshape([time.size, space.size])
  variance = (data**2).sum()
  u, s, v = np.linalg.svd(data, full_matrices=False)
#  print '?', u.shape, s.shape, v.shape

  eof = v[:num,...]
  eig = s[:num]
  pc = u.transpose()[:num,...]

#  print eof.shape, eig.shape, pc.shape

  return finalize (x, time, space, eof, eig, pc, variance, weight=weight, out=out)


##################################################
# Compute EOFs explicitly from the covariance matrix
# (works best when the covariance matrix is small)
# Steps:
#  1) Scan through the data, construct an explicit covariance matrix
#  2) Compute the eigenvalues & eigenvectors
#  3) Scan through the data, construct the timeseries data
##################################################
def EOF_cov (x, num=1, iaxis=None, weight=True, out=None):
  import numpy as np
  from pygeode.view import View

  x, time, space = prep (x, iaxis, weight=weight, out=out)
  del iaxis

  # Initialize space for accumulating the covariance matrix
  cov = np.zeros ([space.size, space.size], dtype='d')

  # Accumulate the covariance
  for inview in View(x.axes).loop_mem():
    X = inview.get(x)
    assert X.size >= space.size, "Spatial pattern is too large"
    X = X.reshape(-1,space.size)

    cov += np.dot(X.transpose(),X)

  # Decompose the eigenvectors & eigenvalues
  w, v = np.linalg.eigh(cov/(time.size-1))

  variance = w.sum()
  eig = np.sqrt(w[::-1][:num])
  eof = v.transpose()[::-1,:][:num,:]

  # Compute the timeseries
  pc = []
  for inview in View(x.axes).loop_mem():
    X = inview.get(x).reshape(-1,space.size)
    pc.append(np.dot(eof, X.transpose()))

  pc = np.concatenate(pc, axis=1)
  # Normalize
  pc /= eig.reshape(num,1)

  return finalize (x, time, space, eof, eig, pc, variance, weight=weight, out=out)


##################################################
# Compute EOFs
# Determine which routine to use (svd, covariance, iterative)
# based on the size of the data
##################################################
[docs]def EOF (x, num=1, iaxis=None, weight=True, out=None):
  """
    Computes the leading Empirical Orthogonal Function(s) for the given
    variable.

    Parameters
    ----------
    x : Var object
        The data of interest

    num : integer, optional
        The number of leading EOFs to calculate. Default is ``1``.

    iaxis : Axis object, Axis class, string, or integer, optional
        Which axis/axes to treat as the record axis.  Multiple axes can be
        passed as a tuple.  Default is the ``Time`` axis of the data
        (if found), otherwise the leftmost axis.

    weight : Var object or boolean, optional
        Weights to use use for the orthogonality condition.
        If ``True``, it uses whatever internal weights the variable posesses.
        If ``False`` or ``None``, it doesn't use any weights.
        You can also pass in a Var object with explicit weights.
        Default is ``True``.

    out : string, optional
        Which outputs to return.  This is a comma-separated string,
        built from the following keywords:

          =======   ==========================================================
          keyword   meaning
          =======   ==========================================================
          EOF       The EOFs (normalized to unit variance)
          EIG       Eigenvalues (of the singular value decomposition of
                    the variable).  If you want the eigenvalues of the
                    covariance matrix, square these, or use EIG2).
          EIG2      Eigenvalues (of the covariance matrix)
          VAR       Variance (a single scalar value).
          FRAC      Fraction of total variance explained by each EOF.
          PC        Principal components (timeseries data), normalized
                    to unit variance.
          =======   ==========================================================

        Default is ``'EOF,EIG,PC'``

    Returns
    -------
    eof_decomposition : tuple
      A combination of EOFs, eignenvalues or other computed quantities specified
      by ``out``.

    Notes
    -----

    This routine doesn't do any pre-processing of the data, such as removing
    the mean or detrending.  If you want to work with anomalies, then you'll
    have to first compute the anomalies!

    This routine tries to automatically determine the best way to solve the
    EOFs.  If you want to use a particular method, you can call the following
    functions (with the same parameters):

      =========   ============================================================
      function    behaviour
      =========   ============================================================
      EOF_iter    Iterative solver (uses a variant of the power method).
      EOF_cov     Calculates the full covariance matrix, and then does an
                  explicit eigendecomposition.
      EOF_svd     Does an explicit singular value decomposition on the data.
      EOF_guess   Returns an approximation of the EOF decomposition from one
                  pass through the data.  This may be useful if you have a
                  large dataset, and you just want the qualitative features
                  of the EOF spatial patterns.
      =========   ============================================================
  """
  from math import sqrt
  from pygeode import MAX_ARRAY_SIZE
  # Maximum size allowed for explicit solvers
  #TODO: run some actual tests to determine this number
  max_explicit_size = int(sqrt(MAX_ARRAY_SIZE))
  # Get the time and space dimensions
  junk, time, space = prep (x, iaxis=iaxis, weight=weight, out=out)
  if x.size <= max_explicit_size: f = EOF_svd
  elif space.size**2 <= max_explicit_size: f = EOF_cov
  else: f = EOF_iter
  return f (x, num=num, iaxis=iaxis, weight=weight, out=out)