Source code for tessif.identify.timeframes

# src/tessif/identify/timeframes.py
"""Tessif module providing tools for identifying differing timeframes."""

import numpy as np
import pandas as pd


from tessif.identify.auxilliary import parse_reference_df
from tessif.identify.calculate import calc_ardiffs


def significant_differences(
        data,
        method="ardiffs",
        threshold=0.1,
        reference=None,
        neighs=True,
):
    """Identify significant differences between timeseries results.

    Designed to detect significant deviations between software specific
    flow results between the same components. Each continuous sequence of
    detected differences is stored as a separate DataFrame.

    The passed ``data`` is not modified; all index manipulation happens on
    an internal copy.

    Parameters
    ----------
    data: pandas.DataFrame, ~collections.abc.Container
        DataFrame of which each column is assumed to contain one flow
        result. Or container of flow results.

    method: {"ardiffs"}, method, default="ardiffs"
        String specifying which precoded function to use for calculating
        differences or function that takes ``data`` as
        :class:`pandas.DataFrame` or :class:`pandas.Series` and
        :paramref:`~significant_differences.reference` (as ``reference``)
        to return a dataframe indexed like
        :paramref:`~significant_differences.data`.

    threshold: ~numbers.Number, default=0.1
        Number specifying the threshold on which relative differences are
        seen as "significant". Comparisons are made based on
        :paramref:`~significant_differences.reference`

    reference: str, None, default=None
        Specifies which columns of
        :paramref:`~significant_differences.data` are to be used as
        reference results to calculate actual differences.

        For ``None`` (default), the dataframes' average is used as returned
        by ``numpy.mean(data, axis="columns")``

    neighs: bool, default=True
        If ``True``, each continuous sequence of detected differences is
        extended by the row directly before and directly after it, as far
        as these rows exist in ``data``. The added rows hold the reference
        values, which makes the returned frames suitable for step plots.

    Returns
    -------
    list
        List of DataFrames where each DataFrame represents one continuous
        sequence of detected differences.

    Examples
    --------
    >>> import pandas as pd
    >>> data=[
    ...     [10, 10, 10],
    ...     [10, 12, 10],
    ...     [10, 10, 10],
    ...     [10, 10, 10],
    ...     [10, 10, 12],
    ... ]

    Simple use case of integer indexed data frames:

    >>> dtf = pd.DataFrame(
    ...     data,
    ...     columns=["software1", "software2", "software3"],
    ... )
    >>> identified_differences = significant_differences(dtf, neighs=False)
    >>> for dtf in identified_differences:
    ...     print(dtf)
    ...     print(59*'-')
       software1  software2  software3
    1  10.666667       12.0  10.666667
    -----------------------------------------------------------
       software1  software2  software3
    4  10.666667  10.666667       12.0
    -----------------------------------------------------------

    Design use case of timeindex indexed dataframes including neighbouring
    averages for creating telling stepplots:

    >>> dtf2 = pd.DataFrame(
    ...     data,
    ...     columns=["software1", "software2", "software3"],
    ...     index=pd.date_range("1990-07-13", periods=5, freq="H"),
    ... )
    >>> identified_differences = significant_differences(dtf2, neighs=True)
    >>> print(identified_differences[0])
                         software1  software2  software3
    1990-07-13 00:00:00  10.000000       10.0  10.000000
    1990-07-13 01:00:00  10.666667       12.0  10.666667
    1990-07-13 02:00:00  10.000000       10.0  10.000000

    >>> from tessif.visualize import component_loads
    >>> axes = component_loads.step(identified_differences[0])
    >>> # axes.figure.show()

    .. image:: ../../_static/images/identify_timeframes_step1.png
        :align: center
        :alt: Step plot image of the first identified timeframes

    Note how the second dataframe of identified differences does not
    include an average on the last index, despite the above's
    ``neighs=True``. This is due to a significant difference being
    detected at the last entry where a neighbour is not added.

    >>> print(identified_differences[1])
                         software1  software2  software3
    1990-07-13 03:00:00  10.000000  10.000000       10.0
    1990-07-13 04:00:00  10.666667  10.666667       12.0

    >>> from tessif.visualize import component_loads
    >>> axes = component_loads.step(identified_differences[1])
    >>> # axes.figure.show()

    .. image:: ../../_static/images/identify_timeframes_step2.png
        :align: center
        :alt: Step plot image of the second identified timeframes

    Using "software2" as reference and setting threshold to 30% results in
    no significant differences being detected:

    >>> dtf2 = pd.DataFrame(
    ...     data,
    ...     columns=["software1", "software2", "software3"],
    ...     index=pd.date_range("1990-07-13", periods=5, freq="H"),
    ... )
    >>> identified_differences = significant_differences(
    ...     dtf2, reference="software2", threshold=0.3)
    >>> print(identified_differences)
    []

    Using "software2" as reference and resetting threshold to 10%. The
    index of ``dtf2`` is left untouched by the previous calls, hence the
    results are still time indexed:

    >>> identified_differences = significant_differences(
    ...     dtf2, reference="software2", neighs=False)
    >>> print(identified_differences[0])
                         software1  software2  software3
    1990-07-13 01:00:00         10         12         10

    >>> print(identified_differences[1])
                         software1  software2  software3
    1990-07-13 04:00:00         10         10         12
    """
    # parse data to DataFrame; frames handed in by the caller are copied
    # because the index is replaced below and that replacement must not
    # leak back into the caller's object
    if isinstance(data, pd.DataFrame):
        data = data.copy()
    else:
        data = pd.DataFrame.from_dict(data, orient='columns')

    # temporarily change data index to int to allow clustering and
    # neighbouring
    old_index = data.index
    data.index = range(len(data.index))

    # parse reference
    ref = parse_reference_df(data, reference)

    # calculate relative deviations
    if method == "ardiffs":
        relative_deviations = calc_ardiffs(data, ref)
    else:
        relative_deviations = method(data, ref)

    # identify only those significant (everything else becomes NaN)
    relative_significant_differences = relative_deviations[
        relative_deviations > threshold]

    # integer positions of all rows holding at least one significant value
    int_indices = list(
        relative_significant_differences.dropna(how="all").index)

    # cluster the integer indices into continuous sequences
    index_clusters = _continous_int_sequences(int_indices)

    # add neighbouring indices if requested
    if neighs:
        # the index is a plain range at this point, so its largest label is
        # known without scanning it once per cluster
        last_position = len(data.index) - 1
        index_clusters = [
            _add_integer_neighbours(idx_cl, upper_bound=last_position)
            for idx_cl in index_clusters
        ]

    # construct an all averaged dataframe
    all_averaged = pd.DataFrame(
        data=[ref] * len(data.columns),
        columns=data.index,
        index=data.columns,
    ).transpose()

    # update the all averaged dataframe with those from the original
    # data where significant differences were detected; the explicit
    # dropna() keeps the selection independent of whether stack() itself
    # discards NaN entries
    mindices = list(
        relative_significant_differences.stack().dropna().index)
    for idx, col in mindices:
        all_averaged.at[idx, col] = data.at[idx, col]

    # separate the detected differences into clusters of continuous
    # indices reinstating the old index
    all_averaged.index = old_index
    dataframes = [
        all_averaged.iloc[idx_cluster]
        for idx_cluster in index_clusters
    ]

    return dataframes
def _continous_int_sequences(integers): """Identify continous sequences of integers. Parameters ---------- integers: ~collections.abc.Container Container of sequence of integers Returns ------- list Nested list of identified continous integer sequences. Examples -------- Default Use Case: >>> sequences = [ ... (0, 1, 3, 5, 6, 10), ... (1, 4, 5, 6, 12), ... [3, 4, 5], ... (1, 3, 7,), ... ] >>> for seq in sequences: ... cnt_seq = _continous_int_sequences(seq) ... print(seq) ... print(cnt_seq) ... print(59*'-') (0, 1, 3, 5, 6, 10) [[0, 1], [3], [5, 6], [10]] ----------------------------------------------------------- (1, 4, 5, 6, 12) [[1], [4, 5, 6], [12]] ----------------------------------------------------------- [3, 4, 5] [[3, 4, 5]] ----------------------------------------------------------- (1, 3, 7) [[1], [3], [7]] ----------------------------------------------------------- """ integers = tuple(sorted(integers)) sequences = [] sequence = [] for number in integers: sequence.append(number) if number+1 not in integers: sequences.append(sequence) sequence = [] return sequences def _add_integer_neighbours(sequence, lower_bound=0, upper_bound=None): """ Add lower and upper integers to given sequence. Parameters ---------- sequence: int Sequence of integers of which neighbours are to be added at lower and upper bounds Examples -------- Default Use Case: >>> sequences = [ ... (0, 1, 3, 5, 6, 10), ... (1, 4, 5, 6, 12), ... [3, 4, 5], ... (1, 3, 7,), ... ] >>> for seq in sequences: ... neighboured_seq = _add_integer_neighbours(seq) ... print(seq) ... print(neighboured_seq) ... 
print(59*'-') (0, 1, 3, 5, 6, 10) [0, 1, 3, 5, 6, 10, 11] ----------------------------------------------------------- (1, 4, 5, 6, 12) [0, 1, 4, 5, 6, 12, 13] ----------------------------------------------------------- [3, 4, 5] [2, 3, 4, 5, 6] ----------------------------------------------------------- (1, 3, 7) [0, 1, 3, 7, 8] ----------------------------------------------------------- """ if sequence[0] != lower_bound: sequence = [sequence[0] - 1, *sequence] if upper_bound is None: upper_bound = float("+inf") if sequence[-1] != upper_bound: sequence = [*sequence, sequence[-1] + 1] return sequence