# Source code for tessif.identify.core

# src/tessif/identify/core.py
"""Tessif module providing the core identification utilities."""
import abc
import logging
import operator

import pandas as pd

from tessif.identify.auxilliary import parse_reference_df

logger = logging.getLogger(__name__)


class Identificier(abc.ABC):
    """Identification base class.

    The identification algorithm lives in
    :meth:`Identificier.cluster_interest`, which needs to be overridden by
    child specific implementations.

    Both abstract hooks (:meth:`cluster_interest` and
    :meth:`map_interest_results`) are invoked from ``__init__``, so a child
    class must be able to run them as soon as ``data``, the cluster
    conditions and the reference are stored on the instance.

    Parameters
    ----------
    data
        Result data to be analyzed for significant differences.

    conditions_dict: dict, None
        Dictionary keying :class:`container(s) <collections.abc.Container>`
        of dicts by the respective cluster labels "high", "medium" and "low".
        The dictionaries inside the tuples need to have following keywords:

            - ``thres`` specifying the threshold used
            - ``oprt`` specifying the :mod:`operator` used.

        Used to cluster :paramref:`~Identificier.data` by category/cluster
        label.

    reference: str, None, default=None
        Defines the reference results to be used for calculating the
        statistical error values and pearson correlation coefficients.

        In case ``None`` is used (default), the dataframes average is used as
        returned by :func:`average_timevarying_dataframe_results`.
    """

    def __init__(self, data, conditions_dict, reference=None):

        self.data = data
        self._conditions = conditions_dict
        self._reference = parse_reference_df(data, reference)

        # core utility: cluster component flows by interest
        # (runs the child implementation, hence after all state above is set)
        self._clustered_interest = self.cluster_interest()

        # map respective results
        # NOTE(review): presumably the child implementation populates the
        # ``_high/_medium/_low_interest_results`` attributes read by the
        # ``*_interest_results`` properties below -- confirm in subclasses.
        self.map_interest_results(data)

    def _select_by_label(self, label):
        """Return the entries of :attr:`clustered_interest` equal to ``label``.

        Entries carrying a different label are masked out before the frame is
        passed through a ``stack()``/``unstack()`` round trip.
        """
        dtf = self.clustered_interest
        # NOTE(review): the stack/unstack round trip presumably drops rows and
        # columns without any match; this depends on how the installed pandas
        # version treats NaN in ``stack`` -- confirm.
        return dtf[dtf == label].stack().unstack()

    @property
    def of_high_interest(self):
        """Node uid representations identified as of ``high interest``."""
        return self._select_by_label("high")

    @property
    def high(self):
        """Alias for :attr:`of_high_interest`."""
        return self.of_high_interest

    @property
    def high_interest_results(self):
        """Inter component results identified as highly interesting."""
        return self._high_interest_results

    @property
    def of_medium_interest(self):
        """Node uid representations identified as of ``medium interest``."""
        return self._select_by_label("medium")

    @property
    def medium(self):
        """Alias for :attr:`of_medium_interest`."""
        return self.of_medium_interest

    @property
    def medium_interest_results(self):
        """Inter component results identified as mediumly interesting."""
        return self._medium_interest_results

    @property
    def of_low_interest(self):
        """Node uid representations identified as of ``low interest``."""
        # Filters the clustered frame exactly like the high/medium variants,
        # instead of reading an attribute this class never assigns.
        return self._select_by_label("low")

    @property
    def low(self):
        """Alias for :attr:`of_low_interest`."""
        return self.of_low_interest

    @property
    def low_interest_results(self):
        """Inter component results identified as lowly interesting."""
        return self._low_interest_results

    @property
    def cluster_conditions(self):
        """Dictionary of clustering conditions used."""
        return self._conditions

    @property
    def clustered_interest(self):
        """Inter component results clustered by interest."""
        return self._clustered_interest

    @property
    def reference(self):
        """Reference model used for identifications."""
        return self._reference

    @abc.abstractmethod
    def cluster_interest(self):
        """Cluster inter component results by interest.

        Called once during construction; the return value is stored and
        exposed as :attr:`clustered_interest`.
        """

    @abc.abstractmethod
    def map_interest_results(self, data):
        """Map data to identified interest categories.

        Called once during construction, after :meth:`cluster_interest`.
        """


def cluster(values, conditions_dict):
    """Cluster value(s) on condition(s).

    Uses a dictionary of conditions utilizing pythons
    :mod:`operators <operator>`. The cluster labels are checked in the
    iteration order of :paramref:`~cluster.conditions_dict` and the first
    label whose conditions are all fulfilled is returned.

    Parameters
    ----------
    values: ~collections.abc.Sequence
        Indexable container of :class:`number(s) <numbers.Number>` on which
        the cluster conditions are checked on. The value at position ``i``
        is checked against the ``i``-th condition of each label.
    conditions_dict: dict
        Dictionary keying :class:`container(s) <collections.abc.Container>`
        of dicts by the respective cluster labels. The dictionaries inside
        the tuples need to have following keywords:

            - ``thres`` specifying the threshold used
            - ``oprt`` specifying the name of the :mod:`operator` function
              used (e.g. ``"gt"``), called as ``oprt(value, thres)``.

    Returns
    -------
    ~collections.abc.Hashable, None
        Dictionary key specifying the cluster. Usually a string or a number.
        ``None`` if no cluster matched, in which case a warning is logged.

    Examples
    --------
    Using a single value condition check with 2 categories/clusters. Note that
    on single value conditions both, the value itself as well as the inner
    conditions dict must be Containers. Hence the trailing ``,`` to turn both
    into tuples.

    >>> values = [(9000,), (9001,), (42,)]
    >>> conditions = {
    ...     "Its over 9000!": ({"oprt": "gt", "thres": 9000},),
    ...     "Nope": ({"oprt": "le", "thres": 9000},),
    ... }

    >>> for value in values:
    ...     print(cluster(value, conditions))
    Nope
    Its over 9000!
    Nope

    Multiple values and conditions (inner dict tuple length) can be used. Their
    length must match however:

    >>> values = [
    ...     ([0, 1], "high"),
    ...     ([1, 1],  "medium1"),
    ...     ([0, 0],  "medium2"),
    ...     ([1, 0],  "low"),
    ... ]

    >>> # first condition = pcc, second condition = nmae
    >>> conditions = {
    ...     "high": ({"oprt": "lt", "thres": 0.7}, {"oprt": "ge", "thres": 0.1}),
    ...     "medium1": ({"oprt": "ge", "thres": 0.7}, {"oprt": "ge", "thres": 0.1}),
    ...     "medium2": ({"oprt": "lt", "thres": 0.7}, {"oprt": "lt", "thres": 0.1}),
    ...     "low": ({"oprt": "ge", "thres": 0.7}, {"oprt": "lt", "thres": 0.1}),
    ... }

    >>> for value_pairing in values:
    ...     print(cluster(value_pairing[0], conditions))
    high
    medium1
    medium2
    low
    """
    # ``label`` (not ``cluster``) so the loop variable does not shadow this
    # function's own name.
    for label, conditions in conditions_dict.items():
        # Generator instead of a list: stops at the first failing condition.
        if all(
            getattr(operator, cond["oprt"])(values[pos], cond["thres"])
            for pos, cond in enumerate(conditions)
        ):
            return label

    # Only reached when none of the labels matched.
    logger.warning("Value could not be clustered.")
    return None