Source code for tessif.identify.calculate

# pylint: disable=trailing-whitespace
# src/tessif/identify/calculate.py
# pylint error disabled since the dataframe doctest results require those
"""Tessif module providing calc. tools for identifying result differences."""
import numpy as np
import pandas as pd


def calc_nmae(dataframes_dict, reference_df, method="mean"):
    """Calculate the Normalized Mean Absolute Error (NMAE) per column.

    For every dataframe the absolute deviation from
    :paramref:`~calc_nmae.reference_df` is averaged along the rows and
    divided by a normalization value derived from the reference.

    Designed for comparing timevarying (load) results of different
    softwares for the same component(s).

    Parameters
    ----------
    dataframes_dict: dict
        Dictionary mapping a label to a :class:`pandas.DataFrame`. Each
        dataframe is compared column by column to the reference.
    reference_df: pandas.DataFrame
        Dataframe indexed like those of
        :paramref:`~calc_nmae.dataframes_dict`.
    method: {"mean", "spread", "std"}, default = "mean"
        Method of normalization:

            - ``"mean"``: MAE is divided by ``mean(reference)``
            - ``"spread"``: MAE is divided by
              ``abs(max(reference)-min(reference))``
            - ``"std"``: MAE is divided by ``std(reference)``

    Returns
    -------
    pandas.DataFrame
        DataFrame holding the calculated NMAE. It has one column per key
        of :paramref:`~calc_nmae.dataframes_dict` and one row per column
        of the compared dataframes (i.e. columns and index are swapped in
        comparison to the dataframes passed as arguments).

    Raises
    ------
    KeyError
        If :paramref:`~calc_nmae.method` is not a supported normalization
        (raised by ``_normalize``).

    Examples
    --------
    >>> import pandas as pd
    >>> columns = pd.MultiIndex.from_tuples(
    ...     [("A", "B"), ("B", "C"), ("B", "D")])
    >>> software1 = pd.DataFrame(
    ...     data=[[10, 8, 2], [0, 0, 0], [20, 2, 18]], columns=columns)
    >>> reference_df = pd.DataFrame(
    ...     data=[[13, 7, 1990], [42, 0, 42], [90, 0, 0]], columns=columns)

    Normalized Mean Absolute Error:

    >>> nmae = calc_nmae(
    ...     dataframes_dict={"software1": software1},
    ...     reference_df=reference_df,
    ... )
    >>> print(nmae["software1"].round(6).tolist())
    [0.793103, 0.428571, 1.007874]

    Using "spread" instead of the default "mean" normalization:

    >>> nmae = calc_nmae(
    ...     dataframes_dict={"software1": software1},
    ...     reference_df=reference_df,
    ...     method="spread",
    ... )
    >>> print(nmae["software1"].round(6).tolist())
    [0.497835, 0.142857, 0.343049]
    """
    # The normalization only depends on the reference, so it is computed
    # once instead of once per compared dataframe.
    norm = _normalize(reference_df, method)

    result_data = {
        label: np.mean(np.abs(dtf - reference_df), axis="index") / norm
        for label, dtf in dataframes_dict.items()
    }

    # One column per label; the row index is the input's column index.
    return pd.concat(result_data, axis="columns")
def calc_nmbe(dataframes_dict, reference_df, method="mean"):
    """Calculate the Normalized Mean Biased Error (NMBE) per column.

    For every dataframe the signed deviation from
    :paramref:`~calc_nmbe.reference_df` is averaged along the rows and
    divided by a normalization value derived from the reference. In
    contrast to the NMAE the sign is kept, so deviations of opposite
    direction cancel each other out.

    Designed for comparing timevarying (load) results of different
    softwares for the same component(s).

    Parameters
    ----------
    dataframes_dict: dict
        Dictionary mapping a label to a :class:`pandas.DataFrame`. Each
        dataframe is compared column by column to the reference.
    reference_df: pandas.DataFrame
        Dataframe indexed like those of
        :paramref:`~calc_nmbe.dataframes_dict`.
    method: {"mean", "spread", "std"}, default = "mean"
        Method of normalization:

            - ``"mean"``: MBE is divided by ``mean(reference)``
            - ``"spread"``: MBE is divided by
              ``abs(max(reference)-min(reference))``
            - ``"std"``: MBE is divided by ``std(reference)``

    Returns
    -------
    pandas.DataFrame
        DataFrame holding the calculated NMBE. It has one column per key
        of :paramref:`~calc_nmbe.dataframes_dict` and one row per column
        of the compared dataframes (i.e. columns and index are swapped in
        comparison to the dataframes passed as arguments).

    Raises
    ------
    KeyError
        If :paramref:`~calc_nmbe.method` is not a supported normalization
        (raised by ``_normalize``).

    Examples
    --------
    >>> import pandas as pd
    >>> columns = pd.MultiIndex.from_tuples(
    ...     [("A", "B"), ("B", "C"), ("B", "D")])
    >>> software1 = pd.DataFrame(
    ...     data=[[10, 8, 2], [0, 0, 0], [20, 2, 18]], columns=columns)
    >>> reference_df = pd.DataFrame(
    ...     data=[[13, 7, 1990], [42, 0, 42], [90, 0, 0]], columns=columns)

    Normalized Mean Biased Error:

    >>> nmbe = calc_nmbe(
    ...     dataframes_dict={"software1": software1},
    ...     reference_df=reference_df,
    ... )
    >>> print(nmbe["software1"].round(6).tolist())
    [-0.793103, 0.428571, -0.990157]

    Using "spread" instead of the default "mean" normalization:

    >>> nmbe = calc_nmbe(
    ...     dataframes_dict={"software1": software1},
    ...     reference_df=reference_df,
    ...     method="spread",
    ... )
    >>> print(nmbe["software1"].round(6).tolist())
    [-0.497835, 0.142857, -0.337018]
    """
    # The normalization only depends on the reference, so it is computed
    # once instead of once per compared dataframe.
    norm = _normalize(reference_df, method)

    result_data = {
        label: np.mean(np.subtract(dtf, reference_df), axis="index") / norm
        for label, dtf in dataframes_dict.items()
    }

    # One column per label; the row index is the input's column index.
    return pd.concat(result_data, axis="columns")
def calc_nrmse(dataframes_dict, reference_df, method="mean"):
    """Calculate the Normalized Root Mean Square Error (NRMSE) per column.

    For every dataframe the squared deviation from
    :paramref:`~calc_nrmse.reference_df` is averaged along the rows, the
    square root is taken and the result is divided by a normalization
    value derived from the reference.

    Designed for comparing timevarying (load) results of different
    softwares for the same component(s).

    Parameters
    ----------
    dataframes_dict: dict
        Dictionary mapping a label to a :class:`pandas.DataFrame`. Each
        dataframe is compared column by column to the reference.
    reference_df: pandas.DataFrame
        Dataframe indexed like those of
        :paramref:`~calc_nrmse.dataframes_dict`.
    method: {"mean", "spread", "std"}, default = "mean"
        Method of normalization:

            - ``"mean"``: RMSE is divided by ``mean(reference)``
            - ``"spread"``: RMSE is divided by
              ``abs(max(reference)-min(reference))``
            - ``"std"``: RMSE is divided by ``std(reference)``

    Returns
    -------
    pandas.DataFrame
        DataFrame holding the calculated NRMSE. It has one column per key
        of :paramref:`~calc_nrmse.dataframes_dict` and one row per column
        of the compared dataframes (i.e. columns and index are swapped in
        comparison to the dataframes passed as arguments).

    Raises
    ------
    KeyError
        If :paramref:`~calc_nrmse.method` is not a supported
        normalization (raised by ``_normalize``).

    Examples
    --------
    >>> import pandas as pd
    >>> columns = pd.MultiIndex.from_tuples(
    ...     [("A", "B"), ("B", "C"), ("B", "D")])
    >>> software1 = pd.DataFrame(
    ...     data=[[10, 8, 2], [0, 0, 0], [20, 2, 18]], columns=columns)
    >>> reference_df = pd.DataFrame(
    ...     data=[[13, 7, 1990], [42, 0, 42], [90, 0, 0]], columns=columns)

    Normalized Root Mean Square Error:

    >>> nrmse = calc_nrmse(
    ...     dataframes_dict={"software1": software1},
    ...     reference_df=reference_df,
    ... )
    >>> print(nrmse["software1"].round(6).tolist())
    [0.975783, 0.553283, 1.694993]

    Using "spread" instead of the default "mean" normalization:

    >>> nrmse = calc_nrmse(
    ...     dataframes_dict={"software1": software1},
    ...     reference_df=reference_df,
    ...     method="spread",
    ... )
    >>> print(nrmse["software1"].round(6).tolist())
    [0.612504, 0.184428, 0.576922]
    """
    # The normalization only depends on the reference, so it is computed
    # once instead of once per compared dataframe.
    norm = _normalize(reference_df, method)

    result_data = {
        label: np.sqrt(
            np.mean(
                np.square(np.subtract(dtf, reference_df)),
                axis="index",
            )
        ) / norm
        for label, dtf in dataframes_dict.items()
    }

    # One column per label; the row index is the input's column index.
    return pd.concat(result_data, axis="columns")
def _calc_corr_between_two_dfs(dataframes, method="pearson", fillna=None):
    """Correlate identically labeled columns of two dataframes.

    Helper to quickly sort out differences between the results of two
    different softwares. Uses :attr:`pandas.DataFrame.corrwith` under the
    hood.

    Note
    ----
    Only the first two entries of :paramref:`dataframes` are used, so
    exactly two models can be compared per call.

    Parameters
    ----------
    dataframes: ~collections.abc.Sequence
        Sequence of :class:`pandas.DataFrame` objects. The columns of
        entry ``0`` are correlated with those of entry ``1``.
    method : {'pearson', 'kendall', 'spearman'} or callable
        Method of correlation, handed to
        :attr:`pandas.DataFrame.corrwith`:

            - pearson : standard correlation coefficient
            - kendall : Kendall Tau correlation coefficient
            - spearman : Spearman rank correlation
            - callable: callable with input two 1d ndarrays and
              returning a float.
    fillna: str, ~numbers.Number, None, default=None
        Replacement for ``NaN`` correlation results (e.g. when one of
        the correlated timeseries is constant). If ``None``, the
        :attr:`pandas.DataFrame.corrwith` output is returned unchanged.

    Returns
    -------
    pandas.Series
        Pairwise correlation results, one entry per column. Where:

            - ``1`` = perfect correlation (good)
            - ``0`` = no correlation at all (bad)
            - ``-1`` = trending in opposite directions

    See also
    --------
    `Pearson Correlation Coefficient
    <https://en.wikipedia.org/wiki/Pearson_correlation_coefficient>`_
    """
    # Explicit float cast; the original author notes that data read from
    # hdf5 is "sometimes odd".
    left, right = (dataframes[pos].astype('float64') for pos in (0, 1))

    coefficients = left.corrwith(right, axis="index", method=method)

    if fillna is None:
        return coefficients
    return coefficients.fillna(value=fillna)
def calc_avgs(dataframes):
    """Calculate average results of timevarying dataframes.

    Takes any number of :class:`pandas.DataFrame` objects, stacks them
    and averages all rows sharing the same index label, column by
    column. Designed to average the timevarying (load) results of
    different softwares for the same component(s).

    Parameters
    ----------
    dataframes: ~collections.abc.Container
        Container of :class:`pandas.DataFrame` objects to average.

    Returns
    -------
    pandas.DataFrame
        Averaged results, one row per distinct index label.

    Examples
    --------
    >>> import pandas as pd
    >>> columns = pd.MultiIndex.from_tuples(
    ...     [("A", "B"), ("B", "C"), ("B", "D")])
    >>> software1 = pd.DataFrame(
    ...     data=[[10, 8, 2], [0, 0, 0], [20, 2, 18]], columns=columns)
    >>> software2 = pd.DataFrame(
    ...     data=[[13, 7, 1990], [42, 0, 42], [90, 0, 0]], columns=columns)

    >>> averaged_results = calc_avgs([software1, software2])
    >>> print(averaged_results.values.tolist())
    [[11.5, 7.5, 996.0], [21.0, 0.0, 21.0], [55.0, 1.0, 9.0]]
    """
    # Stack all frames on top of each other, then collapse the rows that
    # share an index label into their mean.
    stacked = pd.concat(dataframes, sort=False)
    return stacked.groupby(stacked.index).mean()
def calc_evs(
        dataframes,
        labels=None,
        reference=None,
        error="NMAE",
        normalization="mean",
):
    """Calculate error values between timevarying dataframes.

    Takes any number of :class:`pandas.DataFrame` objects and calculates
    the chosen error value of each of them relative to a reference,
    column by column. Designed for comparing timevarying (load) results
    of different softwares for the same component(s).

    Parameters
    ----------
    dataframes: ~collections.abc.Container
        Container of :class:`pandas.DataFrame` objects to compare.
    labels: ~collections.abc.Container, None, default=None
        Container of labels, one per dataframe (the software names in the
        design case). If ``None``, the 0-indexed positions are used.
    reference: int, str, None, default=None
        Defines the reference results the error values relate to:

            - ``int``: 0-indexed position inside
              :paramref:`~calc_evs.dataframes`
            - anything else (usually ``str``): the respective entry of
              :paramref:`~calc_evs.labels`
            - ``None``: the average of all dataframes as returned by
              :func:`calc_avgs`
    error: str, default="NMAE"
        Abbreviation of the error value to calculate (case-insensitive).
        Currently supported are:

            - ``nmae`` for ``Normalized Mean Absolute Error`` (default)
            - ``nmbe`` for ``Normalized Mean Biased Error``
            - ``nrmse`` for ``Normalized Root Mean Square Error``
    normalization: {"mean", "spread", "std"}, default = "mean"
        Method of error value normalization:

            - ``"mean"``: error is divided by ``mean(reference)``
            - ``"spread"``: error is divided by
              ``abs(max(reference)-min(reference))``
            - ``"std"``: error is divided by ``std(reference)``

    Returns
    -------
    pandas.DataFrame
        DataFrame holding the calculated error values. It has one column
        per label and one row per column of the compared dataframes
        (i.e. columns and index are swapped in comparison to the
        dataframes passed as arguments).

    Raises
    ------
    KeyError
        If :paramref:`~calc_evs.error` is not one of the supported
        abbreviations, or a non-integer
        :paramref:`~calc_evs.reference` is not among the labels.

    Examples
    --------
    >>> import pandas as pd
    >>> columns = pd.MultiIndex.from_tuples(
    ...     [("A", "B"), ("B", "C"), ("B", "D")])
    >>> software1 = pd.DataFrame(
    ...     data=[[10, 8, 2], [0, 0, 0], [20, 2, 18]], columns=columns)
    >>> software2 = pd.DataFrame(
    ...     data=[[13, 7, 1990], [42, 0, 42], [90, 0, 0]], columns=columns)
    >>> software3 = software2.copy()

    Normalized Mean Absolute Error relative to ``software2``:

    >>> nmae = calc_evs(
    ...     dataframes=[software1, software2, software3],
    ...     labels=["software1", "software2", "software3"],
    ...     reference="software2",
    ...     error="nmae",
    ... )
    >>> print(list(nmae.columns))
    ['software1', 'software2', 'software3']
    >>> print(nmae.round(6).values.tolist())
    [[0.793103, 0.0, 0.0], [0.428571, 0.0, 0.0], [1.007874, 0.0, 0.0]]

    Using "spread" normalization:

    >>> nmae = calc_evs(
    ...     dataframes=[software1, software2, software3],
    ...     labels=["software1", "software2", "software3"],
    ...     reference="software2",
    ...     error="nmae",
    ...     normalization="spread",
    ... )
    >>> print(nmae.round(6).values.tolist())
    [[0.497835, 0.0, 0.0], [0.142857, 0.0, 0.0], [0.343049, 0.0, 0.0]]
    """
    if labels is None:
        labels = range(len(dataframes))

    dataframe_dict = dict(zip(labels, dataframes))

    if reference is None:
        reference_results = calc_avgs(dataframes)
    elif isinstance(reference, int):
        # Integers always address the position, never a label.
        reference_results = tuple(dataframes)[reference]
    else:
        reference_results = dataframe_dict[reference]

    ev_mapping = {
        "nmae": calc_nmae,
        "nmbe": calc_nmbe,
        "nrmse": calc_nrmse,
    }

    # The keys are lower case while the default argument is "NMAE", so
    # the lookup has to be case-insensitive for the default to work.
    try:
        calc_error = ev_mapping[str(error).lower()]
    except KeyError:
        raise KeyError(
            f"Unknown error value {error!r}; "
            f"expected one of {sorted(ev_mapping)}"
        ) from None

    return calc_error(
        dataframe_dict, reference_results, method=normalization)
def calc_corrs(
        dataframes,
        method="pearson",
        labels=None,
        reference=None,
        fillna=None,
):
    """Calculate correlation coefficients between timevarying dataframes.

    Takes any number of :class:`pandas.DataFrame` objects and correlates
    each of them with a reference, column by column. Designed for
    comparing timevarying (load) results of different softwares for the
    same component(s).

    Uses :attr:`pandas.DataFrame.corrwith` under the hood.

    Parameters
    ----------
    dataframes: ~collections.abc.Container
        Container of :class:`pandas.DataFrame` objects to correlate.
    method : {'pearson', 'kendall', 'spearman'} or callable
        Method of correlation:

            - pearson : standard correlation coefficient
            - kendall : Kendall Tau correlation coefficient
            - spearman : Spearman rank correlation
            - callable: callable with input two 1d ndarrays and
              returning a float.
    labels: ~collections.abc.Container, None, default=None
        Container of labels, one per dataframe (the software names in the
        design case). If ``None``, the 0-indexed positions are used.
    reference: int, str, None, default=None
        Defines the reference results to correlate with:

            - ``int``: 0-indexed position inside
              :paramref:`~calc_corrs.dataframes`
            - anything else (usually ``str``): the respective entry of
              :paramref:`~calc_corrs.labels`
            - ``None``: the average of all dataframes as returned by
              :func:`calc_avgs`
    fillna: str, ~numbers.Number, None, default=None
        Replacement for ``NaN`` correlation results (e.g. when one of
        the correlated timeseries is all zeros). If ``None``, the
        :attr:`pandas.DataFrame.corrwith` output is kept.

    Returns
    -------
    pandas.DataFrame
        DataFrame holding the calculated correlation coefficients. It
        has one column per label and one row per column of the compared
        dataframes (i.e. columns and index are swapped in comparison to
        the dataframes passed as arguments).

    Examples
    --------
    >>> import pandas as pd
    >>> columns = pd.MultiIndex.from_tuples(
    ...     [("A", "B"), ("B", "C"), ("B", "D")])
    >>> software1 = pd.DataFrame(
    ...     data=[[10, 8, 2], [0, 0, 0], [20, 2, 18]], columns=columns)
    >>> software2 = pd.DataFrame(
    ...     data=[[13, 7, 1990], [42, 0, 42], [90, 0, 0]], columns=columns)
    >>> software3 = software2.copy()

    Pearson Correlation Coefficients relative to ``software2``:

    >>> pcc = calc_corrs(
    ...     dataframes=[software1, software2, software3],
    ...     method="pearson",
    ...     labels=["software1", "software2", "software3"],
    ...     reference="software2",
    ... )
    >>> print(pcc.round(6).values.tolist())
    [[0.617145, 1.0, 1.0], [0.970725, 1.0, 1.0], [-0.426423, 1.0, 1.0]]
    """
    if labels is None:
        labels = range(len(dataframes))

    labelled_frames = dict(zip(labels, dataframes))

    # Pick the baseline every dataframe gets correlated with.
    if reference is None:
        baseline = calc_avgs(dataframes)
    elif isinstance(reference, int):
        baseline = tuple(dataframes)[reference]
    else:
        baseline = labelled_frames[reference]

    # Filled column by column, one column per label.
    correlations = pd.DataFrame()
    for name, frame in labelled_frames.items():
        correlations[name] = _calc_corr_between_two_dfs(
            dataframes=[baseline, frame],
            method=method,
            fillna=fillna,
        )

    return correlations
def _normalize(reference, method):
    """Return the column-wise normalization values of ``reference``.

    Parameters
    ----------
    reference: pandas.DataFrame
        Reference results the error values are normalized with.
    method: {"mean", "spread", "std"}
        Method of normalization:

            - ``"mean"``: ``np.mean`` along the index
            - ``"spread"``: ``abs(max(reference) - min(reference))``
            - ``"std"``: ``np.std`` along the index

    Returns
    -------
    pandas.Series
        One normalization value per column of ``reference``.

    Raises
    ------
    KeyError
        If ``method`` is not one of the supported names.
    """
    # Dispatch lazily, so only the requested normalization is computed.
    norms = {
        "mean": lambda: np.mean(reference, axis="index"),
        "spread": lambda: abs(reference.max() - reference.min()),
        # np.std is kept on purpose: it uses NumPy's default ddof=0,
        # whereas DataFrame.std() defaults to ddof=1.
        "std": lambda: np.std(reference, axis="index"),
    }

    try:
        compute = norms[method]
    except KeyError:
        raise KeyError(
            f"Unknown normalization method {method!r}; "
            f"expected one of {sorted(norms)}"
        ) from None

    return compute()
def calc_reldiffs(data, reference):
    """Calculate the signed relative difference between data frames.

    Computes ``(data - reference) / reference``. The sign of the
    deviation is kept; use :func:`calc_ardiffs` for the absolute values.

    Parameters
    ----------
    data: pandas.DataFrame
        Results to compare.
    reference: pandas.DataFrame
        Reference results the differences relate to.

    Returns
    -------
    pandas.DataFrame
        Relative (biased) differences, negative where ``data`` is below
        a positive ``reference``.
    """
    biased_differences = data.subtract(reference, axis="index")
    relative_biased_differences = biased_differences.divide(
        reference, axis="index")

    return relative_biased_differences
def calc_ardiffs(data, reference):
    """Calculate the absolute relative difference between data frames.

    Computes ``abs((data - reference) / reference)``.

    Parameters
    ----------
    data: pandas.DataFrame
        Results to compare.
    reference: pandas.DataFrame
        Reference results the differences relate to.

    Returns
    -------
    pandas.DataFrame
        Absolute values of the relative differences.
    """
    return (
        data.subtract(reference, axis="index")
        .divide(reference, axis="index")
        .abs()
    )