Source code for toyplot.data

# Copyright 2014, Sandia Corporation. Under the terms of Contract
# DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains certain
# rights in this software.

"""Classes and functions for working with raw data."""


import collections
import itertools
import numbers
import os
import sys
import xml.etree.ElementTree as xml

import numpy

import toyplot.color

_data_dir = os.path.abspath(os.path.dirname(__file__))

[docs]def minimax(items): """Compute the minimum and maximum of an arbitrary collection of scalar- or array-like items. The `items` parameter must be an iterable containing any combination of `None`, scalars, numpy arrays, or numpy masked arrays. None, NaN, masked values, and empty arrays are all handled correctly. Returns `(None, None)` if the inputs don't contain any usable values. Returns ------- min: minimum value of the input arrays, or None. max: maximum value of the input arrays, or None. """ group_min = None group_max = None for item in items: item_min = None item_max = None if isinstance(item, toyplot.data.Table): raise ValueError("toyplot.data.Table is not allowed.") # pragma: no cover elif isinstance(item, numpy.ma.MaskedArray): pass elif isinstance(item, numpy.ndarray): item = numpy.ma.array(item) elif item is None: item = numpy.ma.array([]) else: item = numpy.ma.array([item]) # Ignore null values selection = numpy.ma.getmaskarray(item) # Ignore NaN values if issubclass(item.dtype.type, numpy.number): selection = numpy.logical_or(selection, numpy.isnan(item).data) selection = numpy.logical_not(selection) if numpy.count_nonzero(selection): item_min = item[selection].min() item_max = item[selection].max() if group_min is None: group_min = item_min else: if item_min is not None: group_min = min(group_min, item_min) if group_max is None: group_max = item_max else: if item_max is not None: group_max = max(group_max, item_max) return group_min, group_max
[docs]def contiguous(a): """Split an array into a collection of contiguous ranges. """ i = 0 begin = [] end = [] values = [] for (value, group) in itertools.groupby(numpy.array(a).ravel()): begin.append(i) end.append(i + len(list(group))) values.append(value) i = end[-1] return numpy.array(begin), numpy.array(end), numpy.array(values)
[docs]class Table(object): """Encapsulates an ordered, heterogeneous collection of labelled data series. Parameters ---------- data: (data series, optional) You may initialize a toyplot.data.Table with any of the following: * None (the default) - creates an empty table (a table without any columns). * :class:`toyplot.data.Table` - creates a copy of the given table. * :class:`collections.OrderedDict` - creates a column for each key-value pair in the input, in the same order. Each value must be implicitly convertable to a numpy masked array, and every value must contain the same number of items. * object returned when loading a `.npz` file with :func:`numpy.load` - creates a column for each key-value pair in the given file, in the same order. Each array in the input file must contain the same number of items. * :class:`dict` / :class:`collections.abc.Mapping` - creates a column for each key-value pair in the input, sorted by key in lexicographical order. Each value must be implicitly convertable to a numpy masked array, and every value must contain the same number of items. * :class:`list` / :class:`collections.abc.Sequence` - creates a column for each key-value tuple in the input sequence, in the same order. Each value must be implicitly convertable to a numpy masked array, and every value must contain the same number of items. * :class:`numpy.ndarray` - creates a column for each column in a numpy matrix (2D array). The order of the columns is maintained, and each column is assigned a unique name. * :class:`pandas.DataFrame` - creates a column for each column in a `Pandas <http://pandas.pydata.org>`_ data frame. The order of the columns is maintained. index: bool or string, optional Controls whether to convert a `Pandas <http://pandas.pydata.org>`_ data frame index to columns in the resulting table. Use index=False (the default) to leave the data frame index out of the table. Use index=True to include the index in the table, using default column names (hierarchical indices will be stored in the table using multiple columns). Use index="format string" to include the index and control how the index column names are generated. The given format string can use positional `{}` / `{0}` or keyword `{index}` arguments to incorporate a zero-based index id into the column names. """ def __init__(self, data=None, index=False): self._columns = collections.OrderedDict() self._metadata = collections.defaultdict(dict) if data is not None: keys = None values = None # Input data for which an explicit column ordering is known. if isinstance(data, ( collections.OrderedDict, toyplot.data.Table, numpy.lib.npyio.NpzFile, )): keys = [key for key in data.keys()] values = [data[key] for key in keys] # Input data for which an explicit column ordering is not known. elif isinstance(data, (dict, collections.Mapping)): keys = [key for key in sorted(data.keys())] values = [data[key] for key in keys] # Input data based on sequences. elif isinstance(data, (list, collections.Sequence)): keys = [key for key, value in data] values = [value for key, value in data] # Input data based on numpy arrays. elif isinstance(data, numpy.ndarray): if data.ndim != 2: raise ValueError( "Only two-dimensional arrays are allowed.") keys = [str(i) for i in numpy.arange(data.shape[1])] values = [data[:, i] for i in numpy.arange(data.shape[1])] # Input data based on Pandas data structures. elif "pandas" in sys.modules: import pandas if isinstance(data, pandas.DataFrame): keys = [str(data.iloc[:, i].name) for i in range(data.shape[1])] values = [data.iloc[:, i] for i in range(data.shape[1])] if index: key_format = "index{}" if index == True else index keys = [key_format.format(i, index=i) for i in range(data.index.nlevels)] + keys values = [data.index.get_level_values(i) for i in range(data.index.nlevels)] + values if keys is None or values is None: raise ValueError("Can't create a toyplot.data.Table from an instance of %s" % type(data)) # Get the set of unique keys, so we can see if there are any duplicates. keys = numpy.array(keys, dtype="object") key_counter = collections.Counter(keys) key_dictionary = numpy.array(list(key_counter.keys())) key_counts = numpy.array(list(key_counter.values())) if numpy.any(key_counts > 1): toyplot.log.warning("Altering duplicate column names to make them unique.") # "Reserve" all of the keys that aren't duplicated. reserved_keys = set([key for key, count in zip(key_dictionary, key_counts) if count == 1]) # Now, iterate through the keys that do contain duplicates, altering them to make them unique, # while ensuring that the unique versions don't conflict with reserved keys. for key, count in zip(key_dictionary, key_counts): if count == 1: continue suffix = 1 for i in numpy.flatnonzero(keys == key): if key not in reserved_keys: reserved_keys.add(key) continue while "%s-%s" % (key, suffix) in reserved_keys: suffix += 1 keys[i] = "%s-%s" % (key, suffix) reserved_keys.add(keys[i]) # Store the data. for key, value in zip(keys, values): self[key] = value def __getitem__(self, index): column = None column_slice = None # Cases that return a column (array): # table["a"] if isinstance(index, str): column = index column_slice = slice(None, None, None) # table["a", 10], table["a", 10:20], table["a", [10, 12, 18]], etc. elif isinstance(index, tuple) and isinstance(index[0], str): column = index[0] column_slice = index[1] if column is not None and column_slice is not None: return self._columns[column][column_slice] row_slice = None columns = None # table[10] if isinstance(index, numbers.Integral): row_slice = slice(index, index + 1) columns = self._columns.keys() # table[10:20] elif isinstance(index, slice): row_slice = index columns = self._columns.keys() elif isinstance(index, tuple): # table[10, ] if isinstance(index[0], numbers.Integral): row_slice = slice(index[0], index[0] + 1) # table[10:20, ], table[[10, 12, 18], ], etc. else: row_slice = index[0] # table[, "a"] if isinstance(index[1], str): columns = [index[1]] # table[, ["a", "b", "c"]], etc. else: columns = index[1] else: index = numpy.array(index) if issubclass(index.dtype.type, numpy.character): columns = index row_slice = slice(None, None, None) else: row_slice = index columns = self._columns.keys() if row_slice is not None and columns is not None: return Table([(column, self._columns[column][row_slice]) for column in columns]) def __setitem__(self, index, value): if isinstance(index, str): value = numpy.ma.array(value) if value.ndim != 1: raise ValueError("Can't assign %s-dimensional array to the '%s' column." % (value.ndim, index)) for column in self._columns.values(): if column.shape != value.shape: raise ValueError("Expected %s values, received %s." % (column.shape[0], value.shape[0])) column = str(index) self._columns[column] = value return if isinstance(index, tuple): if isinstance(index[0], str) and isinstance(index[1], (int, slice)): column, column_slice = index self._columns[column][column_slice] = value return raise ValueError("Unsupported key for assignment: %s" % (index,)) def __delitem__(self, key): return self._columns.__delitem__(key) def __len__(self): return list(self._columns.values())[0].shape[0] if len(self._columns) else 0 def __iter__(self): for row in numpy.arange(self.__len__()): yield tuple([column[row] for column in self._columns.values()]) def _repr_html_(self): root_xml = xml.Element( "table", style="border-collapse:collapse; border:none; color: %s" % toyplot.color.black) root_xml.set("class", "toyplot-data-Table") header_xml = xml.SubElement( root_xml, "tr", style="border:none;border-bottom:1px solid %s" % toyplot.color.black) for name in self._columns.keys(): xml.SubElement( header_xml, "th", style="text-align:left;border:none;padding-right:1em;").text = str(name) iterators = [iter(column) for column in self._columns.values()] for _ in numpy.arange(len(self)): for index, iterator in enumerate(iterators): value = next(iterator) if isinstance(value, numbers.Number): value = "{:.12g}".format(value) else: value = str(value) if index == 0: row_xml = xml.SubElement( root_xml, "tr", style="border:none") xml.SubElement( row_xml, "td", style="border:none;padding-right:1em;").text = value return xml.tostring(root_xml, encoding="unicode", method="html") @property def shape(self): """The table shape (number of rows and columns). Returns ------- shape: tuple (number of rows, number of columns) tuple. """ return ( list(self._columns.values())[0].shape[0] if len(self._columns) else 0, len(self._columns), )
[docs] def items(self): """Return column names and columns, in column order. Returns ------- items: list Sequence of (name, column) tuples. """ return self._columns.items()
[docs] def keys(self): """Return the table column names, in column order. Returns ------- keys: sequence of :class:`str` column names. """ return self._columns.keys()
[docs] def values(self): """Return the table columns, in column order. Returns ------- values: sequence of :class:`numpy.ndarray` columns. """ return self._columns.values()
[docs] def metadata(self, column): """Return metadata for one of the table's columns. Parameters ---------- column: string. The name of an existing column. Returns ------- metadata: :class:`dict` containing key-value pairs. """ if column not in self._columns: raise ValueError("Unknown column name '%s'" % column) return self._metadata[column]
[docs] def matrix(self): """Convert the table to a matrix (2D numpy array). The data type of the returned array is chosen based on the types of the columns within the table. Tables containing a homogeneous set of column types will return an array of the the same type. If the table contains one or more string columns, the results will be an array of strings. Returns ------- matrix: :class:`numpy.ma.MaskedArray` The returned array will have two dimensions. """ return numpy.ma.column_stack(list(self._columns.values()))
[docs]def read_csv(fobj, convert=False): """Load a CSV (delimited text) file. Parameters ---------- fobj: file-like object or string, required The file to read. Use a string filepath, an open file, or a file-like object. convert: boolean, optional By default, the columns in a table will contain strings. If True, convert column types to integers and floats where possible. Returns ------- table: :class:`toyplot.data.Table` Notes ----- read_csv() is a simple tool for use in demos and tutorials. For more full-featured delimited text parsing, you should consider the :mod:`csv` module included in the Python standard library, or functionality provided by `numpy` or `Pandas`. """ import csv if isinstance(fobj, str): fobj = open(fobj, "r") rows = [row for row in csv.reader(fobj)] columns = zip(*rows) result = Table([(column[0], column[1:]) for column in columns]) if convert: for name in result.keys(): try: result[name] = result[name].astype("int") except: try: result[name] = result[name].astype("float") except: pass return result
[docs]def cars(): """Return sample automobile model data. Returns ------- table: :class:`toyplot.data.Table` Table containing descriptions of multiple makes and models of automobile. """ return read_csv(cars.path, convert=True)
cars.path = os.path.join(_data_dir, "cars.csv")
[docs]def communities(): """Return sample community detection data. Returns ------- edges: :class:`numpy.ndarray` An :math:`E \\times 2` matrix containing source and target vertex ids for :math:`E` edges. truth: :class:`numpy.ndarray` A :math:`V \\times 2` matrix containing a vertex id and ground-truth community id for :math:`V` vertices. assigned: :class:`numpy.ndarray` A :math:`V \\times 2` matrix containing a vertex id and alternate community id for :math:`V` vertices. """ edges = numpy.array([row.split() for row in open(os.path.join(_data_dir, "community-edges.csv"), "rb")], dtype="int") truth = numpy.array([row.split() for row in open(os.path.join(_data_dir, "community-truth.csv"), "rb")], dtype="int") assigned = numpy.array([row.split() for row in open(os.path.join(_data_dir, "community-assigned.csv"), "rb")], dtype="int") return edges, truth, assigned
[docs]def commute(): """Return sample OBD-II commuting data. Returns ------- table: :class:`toyplot.data.Table` Table containing a stream of OBD-II data collected from an automobile during a morning commute. """ return read_csv(commute.path)
commute.path = os.path.join(_data_dir, "commute.csv")
[docs]def deliveries(): """Return sample delivery data. Returns ------- table: :class:`toyplot.data.Table` Table containing a stream of OBD-II data collected from an automobile during a morning commute. """ return read_csv(deliveries.path)
deliveries.path = os.path.join(_data_dir, "deliveries.csv")
[docs]def temperatures(): """Return sample temperature data. Returns ------- table: :class:`toyplot.data.Table` Table containing temperature data collected by NOAA. """ return read_csv(temperatures.path)
temperatures.path = os.path.join(_data_dir, "temperatures.csv")