Module shoji.groupby

Grouping tensors and applying aggregations.

The available aggregation methods are:

    sum
    count
    nnz (number of non-zero elements)
    mean
    variance
    sd (standard deviation)
    skewness
    kurtosis
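
For orientation, here is a minimal, hypothetical sketch of how these aggregations are typically computed: group a 2-D tensor along its first dimension by the values of a rank-1 label tensor, then read one statistic per group. The workspace, dimension and tensor names below are made up, and the workspace is assumed to be already open; see the classes below for the full API.

from shoji.groupby import GroupDimensionBy

# ws is assumed to be an already-open shoji workspace with a dimension "cells",
# a rank-1 string tensor "CellType" and a 2-D tensor "Expression" whose first dimension is "cells"
gb = GroupDimensionBy(ws.cells, labels="CellType")
cell_types, means = gb.mean("Expression")  # one array of group labels, one row of means per group
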
Source code
"""
Grouping tensors and applying aggregations.

The available aggregation methods are

        sum

        count

        nnz (number of non-zero elements)

        mean

        variance

        sd (standard deviation)

        skewness

        kurtosis
"""
from typing import Dict, Union, Callable, Optional
import numpy as np
from sklearn.preprocessing import LabelEncoder
import shoji


# Based on https://github.com/mahmoud/lithoxyl/blob/master/lithoxyl/moment.py
# but adapted to use numpy arrays (element-wise) instead of single values
# and to support labeled groups of array-values
class Accumulator:
        def __init__(self):
                self._sum = None
                self._count = None
                self._nnz = None
                self._min = None
                self._max = None
                self._mean = None
                self._m2 = None
                self._m3 = None
                self._m4 = None
                self._first = None

        def add(self, x):
                if self._count is None:
                        self._sum = x.astype("float64")
                        self._count = np.ones_like(self._sum, dtype="uint64")
                        self._nnz = np.zeros_like(self._sum, dtype="uint64")
                        self._nnz[x > 0] += 1
                        self._min = x.astype("float64")
                        self._max = x.astype("float64")
                        self._mean = x.astype("float64")
                        self._m2 = np.zeros_like(self._mean)
                        self._m3 = np.zeros_like(self._mean)
                        self._m4 = np.zeros_like(self._mean)
                        self._first = x
                else:
                        self._nnz[x > 0] = np.add(self._nnz[x > 0], 1, casting="unsafe")  # This convoluted scheme is needed to avoid error when adding 1 to a scalar array
                        self._sum += x
                        self._min = np.minimum(self._min, x)
                        self._max = np.maximum(self._max, x)
                        np.add(self._count, 1, out=self._count, casting="unsafe")  # Same issue as above, but at least here we can do it in-place
                        n = self._count
                        delta = x - self._mean
                        delta_n = delta / n
                        delta_n2 = delta_n ** 2
                        self._mean += delta_n
                        term = delta * delta_n * (n - 1)
                        self._m4 = (self._m4 + term * delta_n2 * (n ** 2 - 3 * n + 3) + 6 * delta_n2 * self._m2 - 4 * delta_n * self._m3)
                        self._m3 = (self._m3 + term * delta_n * (n - 2) - 3 * delta_n * self._m2)
                        self._m2 = self._m2 + term

        @property
        def first(self):
                return self._first

        @property
        def count(self):
                return self._count

        @property
        def sum(self):
                return self._sum

        @property
        def nnz(self):
                return self._nnz

        @property
        def mean(self):
                return self._mean

        @property
        def variance(self):
                return self._m2 / (self._count - 1)

        @property
        def skewness(self):
                return ((self._count ** 0.5) * self._m3) / (self._m2 ** 1.5)

        @property
        def kurtosis(self):
                # TODO: subtract 3? (for normal curve = 0)
                return (self._count * self._m4) / (self._m2 ** 2)

        @property
        def sd(self):
                return self.variance ** 0.5


class GroupAccumulator:
        def __init__(self) -> None:
                self.groups: Dict[int, Accumulator] = {}

        def add(self, label, x) -> None:
                self.groups.setdefault(label, Accumulator()).add(x)

        def sum(self, label = None) -> np.ndarray:
                if label is None:
                        x = {label: x.sum for label, x in self.groups.items()}
                        return np.array(list(x.keys())), np.array(list(x.values()))
                return self.groups[label].sum

        def nnz(self, label = None) -> np.ndarray:
                if label is None:
                        x = {label: x.nnz for label, x in self.groups.items()}
                        return np.array(list(x.keys())), np.array(list(x.values()))
                return self.groups[label].nnz

        def count(self, label = None) -> np.ndarray:
                if label is None:
                        x = {label: x.count for label, x in self.groups.items()}
                        return np.array(list(x.keys())), np.array(list(x.values()))
                return self.groups[label].count

        def first(self, label = None) -> np.ndarray:
                if label is None:
                        x = {label: x.first for label, x in self.groups.items()}
                        return np.array(list(x.keys())), np.array(list(x.values()))
                return self.groups[label].first

        def mean(self, label = None) -> np.ndarray:
                if label is None:
                        x = {label: x.mean for label, x in self.groups.items()}
                        return np.array(list(x.keys())), np.array(list(x.values()))
                return self.groups[label].mean

        def variance(self, label = None) -> np.ndarray:
                if label is None:
                        x = {label: x.variance for label, x in self.groups.items()}
                        return np.array(list(x.keys())), np.array(list(x.values()))
                return self.groups[label].variance

        def skewness(self, label = None) -> np.ndarray:
                if label is None:
                        x = {label: x.skewness for label, x in self.groups.items()}
                        return np.array(list(x.keys())), np.array(list(x.values()))
                return self.groups[label].skewness

        def kurtosis(self, label = None) -> np.ndarray:
                if label is None:
                        x = {label: x.kurtosis for label, x in self.groups.items()}
                        return np.array(list(x.keys())), np.array(list(x.values()))
                return self.groups[label].kurtosis

        def sd(self, label = None) -> np.ndarray:
                if label is None:
                        x = {label: x.sd for label, x in self.groups.items()}
                        return np.array(list(x.keys())), np.array(list(x.values()))
                return self.groups[label].sd


class GroupViewBy:
        def __init__(self, view: "shoji.view.View", labels: Optional[Union[str, np.ndarray]], projection: Callable = None) -> None:
                self.view = view
                self.labels = labels
                if isinstance(self.labels, str):
                        tensor = view.wsm._get_tensor(self.labels)
                        if tensor.rank != 1:
                                raise ValueError(f"Cannot groupby('{self.labels}'); a rank-1 tensor is required")
                self.projection = projection
                self.acc: Dict[str, GroupAccumulator] = {}  # cache of computed statistics, keyed by tensor name

        def stats(self, of_tensor: str) -> GroupAccumulator:
                if of_tensor in self.acc:
                        return self.acc[of_tensor]
                tensor = self.view.wsm._get_tensor(of_tensor)
                if tensor.jagged:
                        raise NotImplementedError("Cannot group jagged tensor")
                n_rows = self.view.get_length(tensor.dims[0])
                if self.labels is None:
                        label_values = np.zeros(n_rows)
                elif isinstance(self.labels, np.ndarray):
                        label_values = self.labels
                else:
                        label_values = self.view[self.labels]
                if self.projection is not None:
                        label_values = [self.projection(x) for x in label_values]
                le = LabelEncoder()
                labels = le.fit_transform(label_values)  # Encode string labels and non-contiguous integers into integers 0, 1, 2, ...
                acc = GroupAccumulator()

                n_rows_per_batch = 1000
                for ix in range(0, n_rows, n_rows_per_batch):
                        batch = self.view._read_batch(tensor, ix, ix + n_rows_per_batch)
                        batch_labels = labels[ix: ix + n_rows_per_batch]
                        for i, label in enumerate(batch_labels):
                                acc.add(le.classes_[label], batch[i])
                self.acc[of_tensor] = acc
                return acc

        def sum(self, of_tensor: str) -> np.ndarray:
                return self.stats(of_tensor).sum()

        def count(self, of_tensor: str) -> np.ndarray:
                return self.stats(of_tensor).count()

        def first(self, of_tensor: str) -> np.ndarray:
                return self.stats(of_tensor).first()

        def nnz(self, of_tensor: str) -> np.ndarray:
                return self.stats(of_tensor).nnz()

        def mean(self, of_tensor: str) -> np.ndarray:
                return self.stats(of_tensor).mean()

        def variance(self, of_tensor: str) -> np.ndarray:
                return self.stats(of_tensor).variance()

        def skewness(self, of_tensor: str) -> np.ndarray:
                return self.stats(of_tensor).skewness()

        def kurtosis(self, of_tensor: str) -> np.ndarray:
                return self.stats(of_tensor).kurtosis()

        def sd(self, of_tensor: str) -> np.ndarray:
                return self.stats(of_tensor).sd()


class GroupDimensionBy:
        def __init__(self, dim: "shoji.dimension.Dimension", labels: Optional[Union[str, np.ndarray]], projection: Callable = None, chunk_size: int = 1000) -> None:
                self.dim = dim
                assert dim.wsm is not None, "Cannot group by unbound dimension"
                self.labels = labels
                if isinstance(self.labels, str):
                        tensor = dim.wsm._get_tensor(self.labels)
                        if tensor.rank != 1:
                                raise ValueError(f"Cannot groupby('{self.labels}'); a rank-1 tensor is required")
                self.chunk_size = chunk_size
                self.projection = projection
                self.acc: Dict[str, GroupAccumulator] = {}  # cache of computed statistics, keyed by tensor name

        def stats(self, of_tensor: str) -> GroupAccumulator:
                if of_tensor in self.acc:
                        return self.acc[of_tensor]
                assert self.dim.wsm is not None
                le = LabelEncoder()
                if self.labels is None:
                        label_values = np.zeros(self.dim.length)
                elif isinstance(self.labels, np.ndarray):
                        label_values = self.labels
                else:
                        label_values = self.dim.wsm[self.labels][:]
                if self.projection is not None:
                        label_values = [self.projection(x) for x in label_values]
                labels = le.fit_transform(label_values)  # Encode string labels and non-contiguous integers into integers 0, 1, 2, ...
                acc = GroupAccumulator()
                n_rows = self.dim.length
                n_rows_per_batch = self.chunk_size
                for ix in range(0, n_rows, n_rows_per_batch):
                        batch = self.dim.wsm._get_tensor(of_tensor)[ix: ix + n_rows_per_batch]
                        batch_labels = labels[ix: ix + n_rows_per_batch]
                        for i, label in enumerate(batch_labels):
                                acc.add(le.classes_[label], batch[i])
                self.acc[of_tensor] = acc
                return acc


        def sum(self, of_tensor: str) -> np.ndarray:
                return self.stats(of_tensor).sum()

        def count(self, of_tensor: str) -> np.ndarray:
                return self.stats(of_tensor).count()

        def nnz(self, of_tensor: str) -> np.ndarray:
                return self.stats(of_tensor).nnz()

        def mean(self, of_tensor: str) -> np.ndarray:
                return self.stats(of_tensor).mean()

        def variance(self, of_tensor: str) -> np.ndarray:
                return self.stats(of_tensor).variance()

        def skewness(self, of_tensor: str) -> np.ndarray:
                return self.stats(of_tensor).skewness()

        def kurtosis(self, of_tensor: str) -> np.ndarray:
                return self.stats(of_tensor).kurtosis()

        def sd(self, of_tensor: str) -> np.ndarray:
                return self.stats(of_tensor).sd()
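
For reference, Accumulator.add implements the standard numerically stable single-pass updates of the running mean and of the second to fourth central moment sums (all quantities are element-wise numpy arrays). With n the count after adding x, writing mu for the running mean and M2, M3, M4 for the moment sums, the update is

\[
\delta = x - \mu_{n-1}, \qquad
\mu_n = \mu_{n-1} + \frac{\delta}{n}, \qquad
M_{2,n} = M_{2,n-1} + \delta\,(x - \mu_n),
\]

with the corresponding (longer) single-pass updates for M3 and M4. The reported statistics are

\[
\mathrm{variance} = \frac{M_2}{n - 1}, \qquad
\mathrm{sd} = \sqrt{\mathrm{variance}}, \qquad
\mathrm{skewness} = \frac{\sqrt{n}\, M_3}{M_2^{3/2}}, \qquad
\mathrm{kurtosis} = \frac{n\, M_4}{M_2^{2}}.
\]

Note that kurtosis here is not excess kurtosis: a normal distribution gives a value near 3 rather than 0, as the TODO in the source points out.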

Classes

class Accumulator

A single-pass (streaming) accumulator of element-wise statistics over a sequence of equally shaped numpy arrays: it tracks sum, count, nnz, min, max, mean and the second to fourth central moments, from which variance, sd, skewness and kurtosis are derived. The full implementation is shown in the module source above.

Instance variables

var count
        The number of values accumulated so far (element-wise).
var first
        The first value that was added.
var kurtosis
        Element-wise kurtosis, count * m4 / m2**2 (not excess kurtosis; a normal distribution gives about 3).
var mean
        Element-wise running mean.
var nnz
        Element-wise number of values strictly greater than zero.
var sd
        Element-wise standard deviation (square root of the variance).
var skewness
        Element-wise skewness, sqrt(count) * m3 / m2**1.5.
var sum
        Element-wise sum.
var variance
        Element-wise sample variance, m2 / (count - 1).

Methods

def add(self, x)
        Add one value (a numpy array) to the running statistics, updating sum, count, nnz, min, max, mean and the second to fourth central moments in a single pass. All values added to the same Accumulator must have the same shape.
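
A minimal standalone sketch of Accumulator (the data below is made up): values are fed one row at a time and the statistics are accumulated element-wise across rows.

import numpy as np
from shoji.groupby import Accumulator

rng = np.random.default_rng(0)
rows = rng.normal(size=(100, 5))  # 100 observations of 5 variables

acc = Accumulator()
for row in rows:  # feed one 1-D array at a time
        acc.add(row)

# Element-wise statistics across the 100 rows
assert np.allclose(acc.mean, rows.mean(axis=0))
assert np.allclose(acc.variance, rows.var(axis=0, ddof=1))
print(acc.sd, acc.skewness, acc.kurtosis)
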
class GroupAccumulator

Keeps one Accumulator per group label. add(label, x) routes a value to the accumulator for its group. Each aggregation method returns the statistic for a given label or, when called without a label, a tuple of two arrays: the group labels and the corresponding per-group values.

Methods

def add(self, label, x) ‑> NoneType
def count(self, label=None) ‑> numpy.ndarray
def first(self, label=None) ‑> numpy.ndarray
def kurtosis(self, label=None) ‑> numpy.ndarray
def mean(self, label=None) ‑> numpy.ndarray
def nnz(self, label=None) ‑> numpy.ndarray
def sd(self, label=None) ‑> numpy.ndarray
def skewness(self, label=None) ‑> numpy.ndarray
def sum(self, label=None) ‑> numpy.ndarray
def variance(self, label=None) ‑> numpy.ndarray
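
A small sketch of GroupAccumulator on its own (labels and values made up): values are routed to per-label accumulators, and calling an aggregation without a label returns all group labels together with the per-group results.

import numpy as np
from shoji.groupby import GroupAccumulator

acc = GroupAccumulator()
acc.add("a", np.array([1.0, 2.0]))
acc.add("b", np.array([10.0, 20.0]))
acc.add("a", np.array([3.0, 4.0]))

labels, sums = acc.sum()  # labels: array(['a', 'b']), sums: array([[ 4.,  6.], [10., 20.]])
print(acc.mean("a"))      # a single group: array([2., 3.])
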
class GroupDimensionBy (dim: Dimension, labels: Union[str, numpy.ndarray, NoneType], projection: Callable = None, chunk_size: int = 1000)

Groups the rows of a tensor along a bound workspace dimension and computes per-group aggregates in a single streaming pass, reading the tensor in batches of chunk_size rows. The labels can be the name of a rank-1 tensor in the workspace, an explicit numpy array, or None (which puts everything in a single group); an optional projection callable is applied to each label before grouping.

Methods

def count(self, of_tensor: str) ‑> numpy.ndarray
def kurtosis(self, of_tensor: str) ‑> numpy.ndarray
def mean(self, of_tensor: str) ‑> numpy.ndarray
def nnz(self, of_tensor: str) ‑> numpy.ndarray
def sd(self, of_tensor: str) ‑> numpy.ndarray
def skewness(self, of_tensor: str) ‑> numpy.ndarray
def stats(self, of_tensor: str) ‑> GroupAccumulator
def sum(self, of_tensor: str) ‑> numpy.ndarray
def variance(self, of_tensor: str) ‑> numpy.ndarray
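
A hypothetical sketch of the projection and chunk_size parameters (the workspace, dimension and tensor names are made up, and the workspace is assumed to be already open): the projection is applied to each label before grouping, so integer ages, for example, can be grouped by decade. stats() returns the underlying GroupAccumulator, so several statistics can be read from a single pass over the tensor.

from shoji.groupby import GroupDimensionBy

# ws is assumed to be an already-open shoji workspace with a dimension "cells",
# an integer rank-1 tensor "Age" and a 2-D tensor "Expression" whose first dimension is "cells"
gb = GroupDimensionBy(
        ws.cells,
        labels="Age",
        projection=lambda age: age // 10,  # group by decade instead of exact age
        chunk_size=2000,                   # rows read per batch
)
acc = gb.stats("Expression")  # one streaming pass over the tensor
decades, means = acc.mean()
_, sds = acc.sd()
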
class GroupViewBy (view: View, labels: Union[str, numpy.ndarray, NoneType], projection: Callable = None)

The view-based counterpart of GroupDimensionBy: groups the rows selected by a shoji view and computes per-group aggregates by streaming the view in batches. The labels can be the name of a rank-1 tensor (read through the view), an explicit numpy array, or None for a single group, optionally passed through a projection callable. Jagged tensors are not supported.

Methods

def count(self, of_tensor: str) ‑> numpy.ndarray
def first(self, of_tensor: str) ‑> numpy.ndarray
def kurtosis(self, of_tensor: str) ‑> numpy.ndarray
def mean(self, of_tensor: str) ‑> numpy.ndarray
def nnz(self, of_tensor: str) ‑> numpy.ndarray
def sd(self, of_tensor: str) ‑> numpy.ndarray
def skewness(self, of_tensor: str) ‑> numpy.ndarray
def stats(self, of_tensor: str) ‑> GroupAccumulator
def sum(self, of_tensor: str) ‑> numpy.ndarray
def variance(self, of_tensor: str) ‑> numpy.ndarray
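
A sketch for the view-based variant (hypothetical names): view is assumed to be an existing shoji.view.View, created elsewhere, whose rows carry a rank-1 "CellType" tensor and a 2-D "Expression" tensor.

from shoji.groupby import GroupViewBy

gb = GroupViewBy(view, labels="CellType")
cell_types, nonzeros = gb.nnz("Expression")  # per group, the number of non-zero values in each column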