Source code for pygeode.dataset

#TODO: filter function (similar to map, but use a boolean selection function
#      instead of a transformation).

# Dataset
class Dataset(object): # {{{
    """
    Container class for :class:`Var` objects.

    Provides tools for organizing and working with a set of variables.
    Every method that changes the contents returns a new :class:`Dataset`;
    the variables themselves are shared, not copied.

    :ivar vars
    :ivar axes
    """

    #: A list of variables contained by the Dataset. See `Var` class.
    vars = None

    #: A dictionary of variables contained by the Dataset, indexed by name.
    vardict = None

    #: A list of axes contained by the Dataset. See `Axis` class.
    axes = None

    #: A dictionary of axes contained by the Dataset, indexed by name.
    axisdict = None

    #: A dictionary of metadata associated with the dataset. Sometimes referred
    #: to as global attributes.
    atts = None

    # Get a variable by name
    def __getitem__ (self, key): # {{{
        """Gets a variable or axis object.

        Parameters
        ----------
        key : string
          Name of axis or variable to return.

        Returns
        -------
        :class:`Var` or :class:`Axis` object matching the requested name.
        Variables take precedence over axes when both share the name.

        Raises
        ------
        :class:`KeyError` if no such member is found.
        """
        if key in self.vardict: return self.vardict[key]
        if key in self.axisdict: return self.axisdict[key]
        raise KeyError("%s not in %s. Valid keys are %s"%(key,repr(self),list(self.vardict.keys())))
    # }}}

    # Check if we have a variable of the given name
    def __contains__ (self, key): # {{{
        """Returns True if ``key`` names a variable or an axis of the dataset."""
        # The lookup tables are still None on an instance whose __init__ has
        # not run (e.g. one created through __new__); treat that as "empty"
        # so membership tests report a miss instead of raising TypeError.
        return any(members is not None and key in members
                   for members in (self.vardict, self.axisdict))
    # }}}

    # Reference an axis/variable as an attribute (for the super lazy)
    def __getattr__ (self, name): # {{{
        """Exposes variables and axes as attributes (``dataset.Temp``).

        Only invoked when normal attribute lookup fails. Raises
        :class:`AttributeError` for unknown names.
        """
        # Disregard metaclass stuff: dunder names are never resolved to a
        # variable or axis.
        if name.startswith('__'): raise AttributeError(name)
        if name in self: return self[name]
        raise AttributeError(name)
    # }}}

    def __dir__(self): # {{{
        """Lists instance and class attributes plus variable and axis names."""
        return (list(self.__dict__) + dir(self.__class__)
                + list(self.vardict) + list(self.axisdict))
    # }}}

    # Iterate over the variables
    def __iter__(self):
        """Iterates over the variables of the dataset, in order."""
        return iter(self.vars)

    # Dataset Initialization
    # Takes a list of Vars, and any global attributes
    # Vars and axes may be renamed within the Dataset to ensure uniqueness
    def __init__ (self, vars, atts={}, print_warnings=True): # {{{
        """
        Create a new :class:`Dataset` from a list of variables.

        Parameters
        ----------
        vars: list
          The list of :class:`Var` objects to include.

        atts: dictionary, optional
          A dictionary of attributes available in the :attr:`Dataset.atts`
          attribute. The dictionary is copied, never modified.

        print_warnings: boolean, optional [True]
          If True, print out warnings when variables and axes are renamed.

        Returns
        -------
        The new :class:`Dataset` object.

        Notes
        -----
        Variable names and axis names must be unique. If multiple variables
        share the same name they will be renamed so that they are unique. If
        variables have axes with matching names but which are not matching,
        they will also be renamed. If any names are modified and
        ``print_warnings`` is True, a warning will be displayed indicating how
        objects have been renamed.
        """
        from collections import Counter
        from warnings import warn
        from pygeode.var import Var

        # Work on private copies so the caller's containers stay untouched
        atts = atts.copy()
        vars = list(vars)
        for v in vars: assert isinstance(v,Var)

        namedict = axis_name_clumping(vars)

        # Rename axes that share a common name.
        # newaxes maps id(original axis) -> the axis to use in this dataset.
        newaxes = {}
        for oldname, eqlist in namedict.items():
            # Case 1: the name is unique
            if len(eqlist) == 1:
                newaxis = eqlist[0][0]
                for a in eqlist[0]: newaxes[id(a)] = newaxis
            # Case 2: there is more than one axis with that name
            else:
                for i, eq in enumerate(eqlist):
                    newname = oldname + '%02d'%(i+1)
                    # It's possible that we still have a name conflict
                    while newname in namedict: newname += 'x'
                    if print_warnings:
                        warn ("renaming non-unique axis '%s' to '%s'"%(oldname,newname), stacklevel=2)
                    newaxis = eq[0].rename(newname)
                    for a in eq: newaxes[id(a)] = newaxis

        # Wrap the vars with these new axes
        for i, v in enumerate(vars):
            axes = [newaxes[id(a)] for a in v.axes]
            # Check if we're already using the right axes
            if all(a1 is a2 for a1, a2 in zip(v.axes, axes)): continue
            vars[i] = v.replace_axes(newaxes=axes)

        # Gather all axes together into a list (unique axes only, semi-ordered)
        axes = []
        seen_ids = set()
        for v in vars:
            for a in v.axes:
                if id(a) not in seen_ids:
                    seen_ids.add(id(a))
                    axes.append(a)
        self.axes = axes
        self.axisdict = {a.name: a for a in axes}
        self.atts = atts  # global attributes

        # Handle name clobbering here
        # Get list of names, fill in blanks
        for i, v in enumerate(list(vars)):
            if v.name == '':
                if print_warnings: warn ('unnamed variables found - using default name "var"')
                vars[i] = v.rename('var')

        # Check for duplicate variable names
        name_totals = Counter(v.name for v in vars)
        # Running suffix counter for every name that is in use
        suffix_count = dict.fromkeys(name_totals, 0)
        for i, v in enumerate(list(vars)):
            n = v.name
            if name_totals[n] == 1: continue
            if print_warnings: warn ('multiple variables with the name "%s" found - adding integer suffixes'%n)
            suffix_count[n] += 1
            newname = "%s%02d"%(n,suffix_count[n])
            # It's possible that we still have a naming conflict
            while newname in suffix_count: newname += 'x'
            vars[i] = v.rename(newname)

        self.vars = vars
        self.vardict = {v.name: v for v in vars}
    # }}}

    # String arrays representing the variables, dimensions, etc.
    # Feeds into __str__ below, to simplify it a bit
    def __str_vararr__ (self): # {{{
        """Yields a ``(name, axes, shape)`` triple of strings per variable."""
        for v in self.vars:
            # Get the name used as the reference for the dataset
            # (not necessarily the var's name)
            name = [n for n,v2 in self.vardict.items() if v2 is v].pop()
            axes = '(' + ','.join(a.name for a in v.axes) + ')'
            shape = ' (' + ','.join(str(len(a)) for a in v.axes) + ')'
            yield name, axes, shape
    # }}}

    # String representation
    def __str__ (self): # {{{
        """Returns a multi-line summary of variables, axes and attributes."""
        import textwrap

        # Degenerate case - no variables??
        if len(self.vars) == 0:
            return '<empty Dataset>'

        lines = list(self.__str_vararr__())
        pad1 = max(len(a[0]) for a in lines) + 1
        pad2 = max(len(a[1]) for a in lines) + 1

        # NOTE(review): the spacing inside the literals below is reproduced as
        # found; confirm the intended indent widths against expected output.
        parts = ['<' + self.__class__.__name__ + '>:\n', 'Vars:\n']
        for name, dims, shape in lines:
            parts.append(' ' + name.ljust(pad1) + dims.ljust(pad2) + shape + '\n')
        parts.append('Axes:\n ' + ' '.join([str(a)+'\n' for a in self.axes]))
        parts.append('Global Attributes:')
        if len(self.atts) == 0:
            parts.append('\n{}')
        for k, v in self.atts.items():
            # Long keys/values are truncated so each attribute stays on one line
            parts.append('\n ' + textwrap.shorten(str(k), 15).ljust(15)
                         + ': ' + textwrap.shorten(str(v), 61))
        return ''.join(parts)
    # }}}

    # Make a copy of a dataset
    # (copies the internal lists and dictionaries, does *not* copy the vars)
    def copy (self): # {{{
        """Creates a new copy of this dataset. New instances of the internal
        lists and dictionaries are created, but the variables still refer to
        the same :class:`Var` objects.

        Returns
        -------
        A new :class:`Dataset` object."""
        return asdataset(self, copy=True)
    # }}}

    # Rename some variables in the dataset
    # (need to update vars, vardict)
    def rename_vars (self, vardict={}, **kwargs): # {{{
        """
        Rename variables in dataset. Variables to rename can be passed as
        keyword arguments, or as a dictionary.

        Parameters
        ----------
        vardict: dictionary, optional
          A dictionary with keys corresponding to the existing variables to
          rename and values giving their new names. Not modified.

        **kwargs: keyword arguments
          One or more keyword arguments. The parameters are the existing
          variable names and the values are the new names to substitute.
          These take precedence over entries in ``vardict``.

        Returns
        -------
        A new :class:`Dataset` object with the same contents but renamed
        variables.

        Examples
        --------
        >>> from pygeode.tutorial import t2
        >>> print(t2)
        <Dataset>:
        Vars:
         Temp (time,pres,lat,lon) (3650,20,31,60)
         U (time,pres,lat,lon) (3650,20,31,60)
        Axes:
         time <ModelTime365>: Jan 1, 2011 00:00:00 to Dec 31, 2020 00:00:00 (3650 values)
         pres <Pres> : 1000 hPa to 50 hPa (20 values)
         lat <Lat> : 90 S to 90 N (31 values)
         lon <Lon> : 0 E to 354 E (60 values)
        Global Attributes:
         history : Synthetic Temperature and Wind data generated by pygeode
        >>> print(t2.rename_vars(Temp = 'T', U = 'Wind'))
        <Dataset>:
        Vars:
         T (time,pres,lat,lon) (3650,20,31,60)
         Wind (time,pres,lat,lon) (3650,20,31,60)
        Axes:
         time <ModelTime365>: Jan 1, 2011 00:00:00 to Dec 31, 2020 00:00:00 (3650 values)
         pres <Pres> : 1000 hPa to 50 hPa (20 values)
         lat <Lat> : 90 S to 90 N (31 values)
         lon <Lon> : 0 E to 354 E (60 values)
        Global Attributes:
         history : Synthetic Temperature and Wind data generated by pygeode
        """
        vardict = dict(vardict, **kwargs)
        varlist = list(self.vars)
        for i, v in enumerate(varlist):
            # Rename this var?
            oldname = v.name
            if oldname in vardict:
                newname = vardict[oldname]
                assert isinstance(newname,str)
                varlist[i] = v.rename(newname)
        return Dataset(varlist, atts=self.atts)
    # }}}

    # Remove some variables from the dataset
    def remove (self, *varnames): # {{{
        """Removes variables from the dataset.

        Parameters
        ----------
        *varnames : strings
          The names of the variables to remove.

        Returns
        -------
        A new :class:`Dataset` with the specified variables removed.

        Notes
        -----
        The subtraction operator is also overloaded to do the same thing; in
        that case provide a list of strings again specifying the names of the
        variables to remove.

        Examples
        --------
        >>> from pygeode.tutorial import t2
        >>> print(t2)
        <Dataset>:
        Vars:
         Temp (time,pres,lat,lon) (3650,20,31,60)
         U (time,pres,lat,lon) (3650,20,31,60)
        Axes:
         time <ModelTime365>: Jan 1, 2011 00:00:00 to Dec 31, 2020 00:00:00 (3650 values)
         pres <Pres> : 1000 hPa to 50 hPa (20 values)
         lat <Lat> : 90 S to 90 N (31 values)
         lon <Lon> : 0 E to 354 E (60 values)
        Global Attributes:
         history : Synthetic Temperature and Wind data generated by pygeode
        >>> print(t2.remove('Temp'))
        <Dataset>:
        Vars:
         U (time,pres,lat,lon) (3650,20,31,60)
        Axes:
         time <ModelTime365>: Jan 1, 2011 00:00:00 to Dec 31, 2020 00:00:00 (3650 values)
         pres <Pres> : 1000 hPa to 50 hPa (20 values)
         lat <Lat> : 90 S to 90 N (31 values)
         lon <Lon> : 0 E to 354 E (60 values)
        Global Attributes:
         history : Synthetic Temperature and Wind data generated by pygeode
        >>> print(t2 - ['U'])
        <Dataset>:
        Vars:
         Temp (time,pres,lat,lon) (3650,20,31,60)
        Axes:
         time <ModelTime365>: Jan 1, 2011 00:00:00 to Dec 31, 2020 00:00:00 (3650 values)
         pres <Pres> : 1000 hPa to 50 hPa (20 values)
         lat <Lat> : 90 S to 90 N (31 values)
         lon <Lon> : 0 E to 354 E (60 values)
        Global Attributes:
         history : Synthetic Temperature and Wind data generated by pygeode
        """
        for n in varnames:
            assert isinstance(n,str)
            assert n in self.vardict, "'%s' not found in the dataset"%n
        kept = [v for v in self.vars if v.name not in varnames]
        return Dataset(kept, atts=self.atts)
    # }}}

    def __sub__ (self, varnames): # {{{
        """``dataset - names``: see :meth:`Dataset.remove`."""
        if isinstance (varnames,(list,tuple)):
            return self.remove(*varnames)
        return self.remove(varnames)
    # }}}

    # Add some more variables to the dataset
    def add (self, *vars): # {{{
        """Adds variables to the dataset.

        Parameters
        ----------
        *vars : :class:`Var` or :class:`Dataset` objects
          The variables to add. Any Dataset given is expanded into its
          variables, which are appended after the individually passed ones.

        Returns
        -------
        A new :class:`Dataset` with the variables added. Its global attributes
        are the result of ``pygeode.tools.common_dict`` applied to the
        attributes of this dataset and of every Dataset passed in.

        Notes
        -----
        The same naming rules are applied in case of name collisions as in
        :meth:`Dataset.__init__`. The addition operator is also overloaded to
        do the same thing; in that case provide a list of the variables to add.

        Examples
        --------
        >>> from pygeode.tutorial import t1, t2
        >>> print(t2.add(t1.Temp.rename('Temp2')))
        <Dataset>:
        Vars:
         Temp (time,pres,lat,lon) (3650,20,31,60)
         U (time,pres,lat,lon) (3650,20,31,60)
         Temp2 (lat,lon) (31,60)
        Axes:
         time <ModelTime365>: Jan 1, 2011 00:00:00 to Dec 31, 2020 00:00:00 (3650 values)
         pres <Pres> : 1000 hPa to 50 hPa (20 values)
         lat <Lat> : 90 S to 90 N (31 values)
         lon <Lon> : 0 E to 354 E (60 values)
        Global Attributes:
         history : Synthetic Temperature and Wind data generated by pygeode
        >>> print(t2 + t1.Temp.rename('Temp2'))
        <Dataset>:
        Vars:
         Temp (time,pres,lat,lon) (3650,20,31,60)
         U (time,pres,lat,lon) (3650,20,31,60)
         Temp2 (lat,lon) (31,60)
        Axes:
         time <ModelTime365>: Jan 1, 2011 00:00:00 to Dec 31, 2020 00:00:00 (3650 values)
         pres <Pres> : 1000 hPa to 50 hPa (20 values)
         lat <Lat> : 90 S to 90 N (31 values)
         lon <Lon> : 0 E to 354 E (60 values)
        Global Attributes:
         history : Synthetic Temperature and Wind data generated by pygeode
        """
        from pygeode.var import Var
        from pygeode.tools import common_dict

        for v in vars: assert isinstance(v,(Var,Dataset)), "'%s' is not a Var"%repr(v)
        datasets = [d for d in vars if isinstance(d,Dataset)]

        # Collect global attributes (from any Datasets passed to us)
        atts = common_dict(*([self.atts] + [d.atts for d in datasets]))

        # Expand all Datasets to Vars
        extra = [v for v in vars if isinstance(v,Var)] + sum([d.vars for d in datasets],[])

        # Use the merged attributes; they were previously computed and then
        # dropped in favour of self.atts.
        return Dataset(list(self.vars) + extra, atts=atts)
    # }}}

    def __add__ (self, vars): # {{{
        """``dataset + vars``: see :meth:`Dataset.add`."""
        if isinstance(vars,(list,tuple)):
            return self.add(*vars)
        return self.add(vars)
    # }}}

    def __radd__ (self, vars):
        """``vars + dataset``: handled exactly like ``dataset + vars``."""
        return self.__add__(vars)

    # Replace one or more variables
    def replace_vars (self, vardict={}, **kwargs): # {{{
        """Replaces variables in the dataset.

        Parameters
        ----------
        vardict: dictionary, optional
          A dictionary with keys corresponding to the existing variables to
          replace and values giving the new :class:`Var` instances. Not
          modified.

        **kwargs: keyword arguments
          One or more keyword arguments. The parameters are the existing
          variable names and the values are the new :class:`Var` instances.
          These take precedence over entries in ``vardict``.

        Returns
        -------
        A new :class:`Dataset` with the specified variables replaced. Names
        that do not match an existing variable are ignored.

        Examples
        --------
        >>> from pygeode.tutorial import t1, t2
        >>> print(t2.replace_vars(Temp = t1.Temp))
        <Dataset>:
        Vars:
         Temp (lat,lon) (31,60)
         U (time,pres,lat,lon) (3650,20,31,60)
        Axes:
         lat <Lat> : 90 S to 90 N (31 values)
         lon <Lon> : 0 E to 354 E (60 values)
         time <ModelTime365>: Jan 1, 2011 00:00:00 to Dec 31, 2020 00:00:00 (3650 values)
         pres <Pres> : 1000 hPa to 50 hPa (20 values)
        Global Attributes:
         history : Synthetic Temperature and Wind data generated by pygeode
        """
        vardict = dict(vardict, **kwargs)
        varlist = list(self.vars)
        for i, v in enumerate(varlist):
            if v.name in vardict:
                varlist[i] = vardict[v.name]
        return Dataset(varlist, atts=self.atts)
    # }}}

    # Apply the specified var->var function to all variables in the dataset,
    # and make a new dataset. Anything that gets mapped to 'None' is ignored.
    def map (self, f, *args, **kwargs): # {{{
        """
        Calls a given function on every variable in the dataset.

        Parameters
        ----------
        f: callable or string
          Method to call. Must take the variable as its first argument, and
          return either a single variable, or None. A string is looked up as
          the name of a :class:`Var` method. Further positional and keyword
          arguments can be passed through ``args`` and ``kwargs``.

        args, kwargs: positional and keyword arguments
          These are passed on to f.

        Returns
        -------
        A new :class:`Dataset` with the results of the calls to f. f can
        return None; in that case no corresponding variable is included in
        the new Dataset object.
        """
        from pygeode.var import Var

        # Special case: f is a string representing a Var method
        if isinstance(f,str):
            fname = f
            assert hasattr(Var,fname), "unknown function '%s'"%fname
            f = getattr(Var,fname)
            assert hasattr(f,'__call__'), "Var.%s is not a function"%fname

        # Allow the function to gracefully fail on vars it can't be applied to.
        # Callables without a code object (builtins, functools.partial,
        # callable instances) cannot be inspected this way and are called
        # with the arguments exactly as given.
        code = getattr(f, '__code__', None)
        if code is not None and 'ignore_mismatch' in code.co_varnames:
            kwargs['ignore_mismatch'] = True

        results = [f(v, *args, **kwargs) for v in self.vars]
        varlist = [v for v in results if v is not None]
        for v in varlist:
            assert isinstance(v,Var), "%s does not map vars to vars"%f
        return Dataset(varlist, atts=self.atts.copy())
    # }}}

    # Load all the variables in the dataset
    def load(self): # {{{
        """
        Loads data from all variables in the dataset.

        Returns
        -------
        A new dataset in which :meth:`Var.load()` has been called on each
        variable, loading their data into memory.
        """
        loaded = [v.load() for v in self.vars]
        return Dataset(loaded, atts=self.atts)
    # }}}

    # Slicing
    # Applies the keyword-based axis slicing to *all* the vars in the dataset.
    #TODO: more efficient method?
    # Right now, each var is sliced independently, so the same axis will be sliced multiple times.
    def __call__ (self, **kwargs):
        """
        Subsets all variables in this dataset. Behaves in the same way as
        :meth:`Var.__call__`.

        Parameters
        ----------
        **kwargs : keyword arguments
          Passed unchanged to :meth:`Var.__call__` for every variable; see
          that method for details.

        Returns
        -------
        :class:`Dataset`
          A new Dataset, in which all variables have been restricted to the
          specified domain.

        Notes
        -----
        Not all variables need to have the axes being sliced (any slice that
        doesn't apply to a given variable is simply ignored). This is usually
        more convenient, but it does mean that if an axis name is misspelled
        (for example), the call will return successfully without performing
        any subsetting.

        See Also
        --------
        Var.__call__
        """
        return self.map ('__call__', **kwargs)

    def hasaxis (self, iaxis):
        """
        Checks if the specified axis exists in the dataset.

        Returns True when ``pygeode.tools.whichaxis`` resolves ``iaxis``
        against the dataset's axes, False when it raises :class:`KeyError`.

        See Also
        --------
        Var.hasaxis
        """
        from pygeode.tools import whichaxis
        try:
            whichaxis(self.axes,iaxis)
        except KeyError:
            return False
        return True

    def getaxis (self, iaxis):
        """
        Get an axis from the dataset.

        ``iaxis`` is resolved with ``pygeode.tools.whichaxis``; any exception
        it raises is propagated.

        See Also
        --------
        Var.getaxis
        """
        from pygeode.tools import whichaxis
        return self.axes[whichaxis(self.axes,iaxis)]
# }}} # Wrap a variable (or a list of variables) into a dataset # Use this if you want to make sure something is a dataset, in case it's # possible that it's currently a Var list.
def asdataset (vars, copy=False, print_warnings=True): # {{{
    """
    Tries to convert a collection of objects into a single dataset.

    Parameters
    ==========
    vars : collection
      The collection to convert. See Notes.

    copy : boolean
      Only consulted when ``vars`` is already a :class:`Dataset`. If False
      (the default) that dataset itself is returned; if True a shallow copy
      is returned, with new internal lists and dictionaries that refer to the
      same variables and axes.

    print_warnings : boolean
      Passed to the :class:`Dataset` constructor when ``vars`` is a single
      variable or a collection of variables.

    Returns
    =======
    dataset : :class:`Dataset`

    Notes
    =====
    If ``vars`` is a single variable or list of variables, asdataset() returns
    a Dataset wrapping them. If there are datasets present in the list, it
    merges them into a single dataset.
    """
    # Imported under an alias on purpose: ``from copy import copy`` would
    # rebind the ``copy`` parameter to a function, which is always truthy,
    # so the flag would be ignored and every Dataset would be copied.
    from copy import copy as shallow_copy
    from pygeode.var import Var

    # A collection that contains at least one Dataset: merge everything into
    # the first Dataset found.
    if hasattr(vars, '__len__') and any([isinstance(d, Dataset) for d in vars]):
        d = [d for d in vars if isinstance(d, Dataset)]
        v = [v for v in vars if isinstance(v, Var)]
        assert len(d) + len(v) == len(vars), '%s must consist of vars and datasets only.' % repr(vars)
        args = d[1:] + v
        return d[0].add(*args)

    if isinstance(vars,Dataset):
        if not copy: return vars
        dataset = shallow_copy(vars)
        #TODO: update if more members are added to this class
        # Detach the containers so the copy can be modified independently;
        # the Var and Axis objects inside them are still shared.
        dataset.vars = list(dataset.vars)
        dataset.vardict = dataset.vardict.copy()
        dataset.axes = list(dataset.axes)
        dataset.axisdict = dataset.axisdict.copy()
        dataset.atts = dataset.atts.copy()
        return dataset

    if isinstance(vars, Var): vars = [vars]

    return Dataset(vars, print_warnings=print_warnings)
# }}}

def axis_name_clumping (varlist): # {{{
    """Groups the axes of ``varlist`` by name and by equality.

    Returns a dictionary mapping each axis name to a list of "families".
    Every family is a list of axes that compare equal (``==``) to the
    family's first member; an axis object is recorded only once, even when
    several variables share it.
    """
    families_by_name = {}
    for var in varlist:
        for axis in var.axes:
            families = families_by_name.setdefault(axis.name, [])
            # This exact object has already been filed under this name
            if any(axis is member for family in families for member in family):
                continue
            # Join the first family whose representative it equals ...
            for family in families:
                if axis == family[0]:
                    family.append(axis)
                    break
            # ... or found a new family when none matches
            else:
                families.append([axis])
    return families_by_name
# }}}

# Concatenate a bunch of datasets together
def concat(*datasets): # {{{
    """Merges several datasets into one.

    Accepts the datasets as separate arguments or as one list. Variables
    sharing a name are joined with ``pygeode.concat.concat``; a name that
    occurs only once keeps its variable as is. Variable order follows first
    appearance. If ``pygeode.tools.common_dict`` returns a non-empty result
    for the datasets' attributes, it becomes the attributes of the result.
    A single dataset is returned unchanged.
    """
    from pygeode.concat import concat as concat_vars
    from pygeode.tools import common_dict, islist

    # Did we get passed a list of datasets already?
    # (need to break out of the outer list)
    if len(datasets) == 1 and islist(datasets[0]):
        datasets = list(datasets[0])

    # If we only have one dataset, then return it
    if len(datasets) == 1:
        return datasets[0]

    # Names in order of first appearance, and the segments found per name
    order = []
    segments = {}
    for ds in datasets:
        for var in ds.vars:
            pieces = segments.get(var.name)
            if pieces is None:
                pieces = segments[var.name] = []
                order.append(var.name)
            pieces.append(var)

    # Merge the var segments together
    # If only one segment, just use the variable itself
    merged = []
    for name in order:
        pieces = segments[name]
        merged.append(concat_vars(pieces) if len(pieces) > 1 else pieces[0])

    result = Dataset(merged)

    # Keep any common global attributes
    atts = common_dict(*[ds.atts for ds in datasets])
    if len(atts) > 0:
        result.atts = atts
    return result
# }}}

##################################################
# Hook in Var methods
##################################################

# Wrapper to convert functions from working on Vars to Datasets
def dataset_method (f):
# {{{
    """Wraps the Var-level function ``f`` so it can be called on a Dataset.

    The wrapper calls ``f`` directly when handed a single Var; anything else
    is treated as a dataset and dispatched through its ``map`` method.
    """
    from pygeode.var import Var
    method_name = f.__name__

    def new_f (dataset, *args, **kwargs):
        # Anything that is not a single var is mapped over its variables
        if not isinstance(dataset, Var):
            return dataset.map(f, *args, **kwargs)
        # Degenerate case: passed a single var (return a single var)
        return f(dataset, *args, **kwargs)

    new_f.__name__ = method_name
    new_f.__doc__ = 'Returns new dataset calling `Var.%s` on each variable.' % (method_name)
    return new_f
# }}}

from pygeode.var import class_hooks
for f in class_hooks:
    # Skip built-in operator methods (__xxx__) and anything that Dataset
    # already defines itself
    if (f.__name__.startswith('__') or f.__name__.endswith('__')
            or hasattr(Dataset, f.__name__)):
        continue
    setattr(Dataset, f.__name__, dataset_method(f))
del class_hooks, f