from __future__ import annotations
from datetime import timedelta
from functools import partial
from multiprocessing import cpu_count, Pool
import warnings
import numpy as np
import pandas as pd
from p_tqdm import p_umap
from typing_extensions import Self
from covsirphy.util.config import config
from covsirphy.util.error import EmptyError, NotEnoughDataError, UnExpectedNoneError, UnExpectedValueRangeError
from covsirphy.util.evaluator import Evaluator
from covsirphy.util.stopwatch import StopWatch
from covsirphy.util.validator import Validator
from covsirphy.util.term import Term
from covsirphy.visualization.compare_plot import compare_plot
from covsirphy.dynamics.ode import ODEModel
from covsirphy.dynamics._trend import _TrendAnalyzer
from covsirphy.dynamics._simulator import _Simulator
[docs]
class Dynamics(Term):
    """Class to handle phase-dependent SIR-derived ODE models.

    Args:
        model: definition of ODE model
        date_range: start date and end date of dynamics to analyze
        tau: tau value [min] or None (set later with data)
        name: name of dynamics to show in figures (e.g. "baseline") or None (un-set)
    """
def __init__(self, model: ODEModel, date_range: tuple[str | None, str | None], tau: int | None = None, name: str | None = None) -> None:
    """Validate the arguments and create the internal table with every date in phase 0.

    Args:
        model: definition of ODE model
        date_range: start date and end date of dynamics to analyze
        tau: tau value [min] or None (set later with data)
        name: name of dynamics to show in figures (e.g. "baseline") or None (un-set)
    """
    self._model = Validator(model, "model", accept_none=False).subclass(ODEModel)
    start, end = Validator(date_range, "date_range", accept_none=False).sequence(length=2)
    self._first = Validator(start, name="the first value of @date_range", accept_none=False).date()
    # The validated first date is used as the lower bound of the accepted range of the last date
    last_validator = Validator(end, name="the second date of @date_range", accept_none=False)
    self._last = last_validator.date(value_range=(self._first, None))
    self._tau = Validator(tau, "tau", accept_none=True).tau()
    if name is None:
        self._name = None
    else:
        self._name = Validator(name, "name").instance(str)
    self._parameters = self._model._PARAMETERS[:]
    # Index: Date, Columns: phase ID, S, I, F, R, ODE parameters
    daily_index = pd.date_range(start=self._first, end=self._last, freq="D")
    column_names = [self._PH, *self._SIRF, *self._parameters]
    self._df = pd.DataFrame({self._PH: 0}, index=daily_index, columns=column_names)
def __len__(self) -> int:
    """Return the number of distinct phase IDs in the internal table."""
    phase_ids = self._df[self._PH]
    return phase_ids.nunique()
@property
def model(self) -> ODEModel:
    """Return the ODE model class registered at initialization."""
    ode_model = self._model
    return ode_model
@property
def model_name(self) -> str:
    """Return the name of the ODE model (the `_NAME` attribute of the model class)."""
    ode_model = self._model
    return ode_model._NAME
@property
def tau(self) -> int | None:
    """Return the current tau value [min], or None when it is un-set."""
    current = self._tau
    return current


@tau.setter
def tau(self, value: int | None) -> None:
    # The validator is created with accept_none=True, so None is a permitted input
    tau_validator = Validator(value, "tau", accept_none=True)
    self._tau = tau_validator.tau()


@tau.deleter
def tau(self) -> None:
    # Deleting the property un-sets the tau value
    self._tau = None
@property
def name(self) -> str | None:
    """Return the name of dynamics shown in figures (e.g. "baseline"), or None when it is un-set."""
    current = self._name
    return current


@name.setter
def name(self, name: str | None) -> None:
    # The new value is checked as a str instance before it is stored
    name_validator = Validator(name, "name")
    self._name = name_validator.instance(str)


@name.deleter
def name(self) -> None:
    # Deleting the property un-sets the name
    self._name = None
[docs]
@classmethod
def from_sample(cls, model: ODEModel, date_range: tuple[str | None, str | None] | None = None, tau: int = 1440) -> Self:
    """Initialize model with sample data of one-phase ODE model.

    Args:
        model: definition of ODE model
        date_range: start date and end date of simulation
        tau: tau value [min]

    Returns:
        initialized model, named "Sample data"

    Note:
        Regarding @date_range, refer to covsirphy.ODEModel.from_sample().
    """
    Validator(model, "model", accept_none=False).subclass(ODEModel)
    sample = model.from_sample(date_range=date_range, tau=tau)
    settings = sample.settings()
    sample_range = settings["date_range"]
    # Only the first solved record is registered (as the initial values)
    solved_df = model.inverse_transform(sample.solve())
    initial_df = solved_df.iloc[[0]]
    # Parameter values are attached to the first date of the sample
    first_date = pd.to_datetime(sample_range[0])
    parameter_df = pd.DataFrame(settings["param_dict"], index=[first_date])
    parameter_df.index.name = cls.DATE
    registered_df = pd.concat([initial_df, parameter_df], axis=1)
    instance = cls(model=model, date_range=sample_range, tau=tau, name="Sample data")
    instance.register(data=registered_df)
    return instance
[docs]
@classmethod
def from_data(cls, model: ODEModel, data: pd.DataFrame, tau: int | None = 1440, name: str | None = None) -> Self:
    """Initialize model with data.

    Args:
        model: definition of ODE model
        data: data to register
            Index
                Date (pandas.Timestamp): Observation dates
            Columns
                Susceptible (int): the number of susceptible cases
                Infected (int): the number of currently infected cases
                Fatal (int): the number of fatal cases
                Recovered (int): the number of recovered cases
                (numpy.float64): ODE parameter values defined with the ODE model (optional)
        tau: tau value [min] or None (un-set)
        name: name of dynamics to show in figures (e.g. "baseline") or None (un-set)

    Returns:
        initialized model

    Note:
        The date range of the instance is taken from the minimum and maximum of the index of @data.
    """
    Validator(model, "model", accept_none=False).subclass(ODEModel)
    Validator(data, "data").dataframe(time_index=True)
    period = (data.index.min(), data.index.max())
    instance = cls(model=model, date_range=period, tau=tau, name=name)
    instance.register(data=data)
    return instance
[docs]
def register(self, data: pd.DataFrame | None = None) -> pd.DataFrame:
    """Register data to get initial values and ODE parameter values (if available).

    Args:
        data: new data to overwrite the current information or None (no new records)
            Index
                Date (pandas.Timestamp): Observation dates
            Columns
                Susceptible (int): the number of susceptible cases
                Infected (int): the number of currently infected cases
                Recovered (int): the number of recovered cases
                Fatal (int): the number of fatal cases
                (numpy.float64): ODE parameter values defined with the model

    Raises:
        EmptyError: S/I/F/R records on the first date are not registered
        UnExpectedValueRangeError: negative values are included

    Returns:
        dataframe of the current information:
            Index
                Date (pandas.Timestamp): Observation dates
            Columns
                Susceptible (int): the number of susceptible cases
                Infected (int): the number of currently infected cases
                Recovered (int): the number of recovered cases
                Fatal (int): the number of fatal cases
                (numpy.float64): ODE parameter values defined with model.PARAMETERS

    Note:
        Change points of ODE parameter values will be recognized as the change points of phases.
        Every date on which the (forward-filled) parameter values differ from those of the previous
        date is a change point, including a return to values already used in an older phase.

    Note:
        NA can be used in the newer phases because they are filled with the values of the older phases.
    """
    if data is not None:
        # Work on a copy: the index and dtypes are rewritten below and the caller's dataframe must stay untouched
        new_df = Validator(data, "data").dataframe(time_index=True).copy()
        new_df.index = pd.to_datetime(new_df.index).round("D")
        all_df = pd.DataFrame(
            np.nan,
            index=self._df.index,
            columns=self._df.columns,
        )
        all_df[self._PH] = 0
        # Use the same nullable float dtype on both sides before updating
        for col in new_df:
            new_df[col] = new_df[col].astype(pd.Float64Dtype())
            all_df[col] = all_df[col].astype(pd.Float64Dtype())
        all_df.update(new_df, overwrite=True)
        if all_df.loc[self._first, self._SIRF].isna().any():
            raise EmptyError(
                f"records on {self._first.strftime(self.DATE_FORMAT)}", details="Records must be registered for simulation")
        lowest = all_df.min().min()
        if lowest < 0:
            raise UnExpectedValueRangeError("minimum value of the data", lowest, (0, None))
        all_df.index.name = self.DATE
        self._df = all_df.convert_dtypes()
        # Find change points with parameter values: rows that differ from the previous row.
        # (drop_duplicates() would also discard a later phase that re-uses the values of an older phase.)
        param_df = all_df.loc[:, self._parameters].ffill().dropna(axis=0).astype("float64")
        if not param_df.empty:
            # The first row is compared with NaN (shifted) and hence is always selected
            changed = param_df.ne(param_df.shift()).any(axis=1)
            self._segment(points=param_df.index[changed.to_numpy()].tolist(), overwrite=True)
    return self._df.loc[:, [*self._SIRF, *self._parameters]]
def _segment(self, points: list[str], overwrite: bool) -> None:
    """Perform time-series segmentation with points.

    Args:
        points: dates of change points
        overwrite: whether remove all phases before segmentation or not

    Note:
        @points can include the first date, but not required.

    Note:
        @points must be selected from the first date to the date two days before the last date
        specified with covsirphy.Dynamics(date_range).
    """
    validated_dates = [Validator(raw, "a change point", accept_none=False).date() for raw in points]
    latest_allowed = self._last - timedelta(days=2)
    allowed_dates = pd.date_range(start=self._first, end=latest_allowed, freq="D")
    change_points = Validator(validated_dates, "points", accept_none=False).sequence(
        unique=True, candidates=allowed_dates)
    phase_df = self._df.copy()
    if overwrite:
        # Forget the current phases
        phase_df[self._PH] = 0
    # Every change point increments the phase ID of all dates on and after it
    for change_point in change_points:
        phase_df.loc[change_point:, self._PH] += 1
    self._df = phase_df.convert_dtypes()
[docs]
def segment(self, points: list[str] | None = None, overwrite: bool = False, **kwargs) -> Self:
    """Perform time-series segmentation with points manually selected or found with S-R trend analysis.

    Args:
        points: dates of change points or None (will be found with S-R trend analysis via .detect() method)
        overwrite: whether remove all phases before segmentation or not
        **kwargs: keyword arguments of covsirphy.Dynamics.detect()

    Returns:
        Updated Dynamics object

    Note:
        @points can include the first date, but not required.

    Note:
        An empty @points is handled in the same way as None, i.e. .detect() is called.
    """
    if points:
        change_points = points
    else:
        # The first element returned by .detect() is the detected change points
        change_points = self.detect(**kwargs)[0]
    self._segment(points=change_points, overwrite=overwrite)
    return self
[docs]
def detect(self, algo: str = "Binseg-normal", min_size: int = 7, display: bool = True, **kwargs) -> tuple[pd.Timestamp, pd.DataFrame]:
    """Perform S-R trend analysis to find change points of log10(S) - R of model-specific variables.

    Note that this method only finds the points; segmentation itself requires the .segment() method.

    Args:
        algo: detection algorithms and models
        min_size: minimum value of phase length [days], be equal to or over 3
        display: whether display figure of log10(S) - R plane or not
        **kwargs: keyword arguments of algorithm classes (ruptures.Pelt, .Binseg, BottomUp) except for "model",
            covsirphy.VisualizeBase(), matplotlib.legend.Legend()

    Raises:
        NotEnoughDataError: we have not enough records, the length of the records must be equal to or over min_size * 2

    Returns:
        - pandas.Timestamp: date of change points
        - pandas.Dataframe:
            Index
                R (int): actual R (R of the ODE model) values
            Columns
                Actual (float): actual log10(S) (common logarithm of S of the ODE model) values
                Fitted (float): log10(S) values fitted with y = a * R + b
                0th (float): log10(S) values fitted with y = a * R + b and 0th phase data
                1st, 2nd... (float): fitted values of 1st, 2nd phases

    Note:
        - Python library `ruptures` will be used for off-line change point detection.
        - Refer to documentation of `ruptures` library, https://centre-borelli.github.io/ruptures-docs/
        - Candidates of @algo are "Pelt-rbf", "Binseg-rbf", "Binseg-normal", "BottomUp-rbf", "BottomUp-normal".

    Note:
        - S-R trend analysis is original to Covsirphy, https://www.kaggle.com/code/lisphilar/covid-19-data-with-sir-model/notebook
        - "Phase" means a sequential dates in which the parameters of SIR-derived models are fixed.
        - "Change points" means the dates when trend was changed.
        - "Change points" is the same as the start dates of phases except for the 0th phase.
    """
    Validator(min_size, "min_size", accept_none=False).int(value_range=(3, None))
    # Only the dates with complete S/I/F/R records are analyzed
    records = self._df.dropna(how="any", subset=self._SIRF)
    required_n = min_size * 2
    if len(records) < required_n:
        raise NotEnoughDataError("the records of the number of cases without NAs", records, required_n=required_n)
    analyzer = _TrendAnalyzer(data=records, model=self._model, min_size=min_size)
    points = analyzer.find_points(algo=algo, **kwargs)
    fit_df = analyzer.fitting(points=points)
    if not display:
        return points, fit_df
    # The same keyword arguments are forwarded to the figure
    analyzer.display(points=points, fit_df=fit_df, name=self._name, **kwargs)
    return points, fit_df
[docs]
def summary(self) -> pd.DataFrame:
    """Summarize phase information.

    Returns:
        Summarized information.
            Index
                Phase (str): phase names, 0th, 1st,...
            Columns
                Start (pandas.Timestamp): start date of the phase
                End (pandas.Timestamp): end date of the phase
                Rt (float): phase-dependent reproduction number (if parameters are available)
                (float): parameter values, including rho (if available)
                (int or float): dimensional parameters, including 1/beta [days] (if tau and parameters are available)
    """
    records = self._df.reset_index()
    # Renumber the phase IDs as 0, 1, 2,... in order of appearance
    records[self._PH], _ = records[self._PH].factorize()
    head_df = records.groupby(self._PH).first()
    tail_df = records.groupby(self._PH).last()
    df = head_df.join(tail_df, rsuffix="_last")
    df = df.rename(columns={self.DATE: self.START, f"{self.DATE}_last": self.END})
    # Only the end date is taken from the last records of phases
    kept_columns = [col for col in df.columns if "_last" not in col]
    df = df.loc[:, kept_columns]
    df.index = [self.num2str(num) for num in df.index]
    df.index.name = self.PHASE  # type: ignore

    def _phase_model(params):
        # Model instance with the parameter values of one phase
        return self._model.from_data(data=self._df.reset_index(), param_dict=params.to_dict(), tau=self._tau)

    def _reproduction_number(params):
        if params.isna().any():
            return np.nan
        return _phase_model(params).r0()

    def _dimensional_parameters(params):
        if params.isna().any():
            return np.nan
        return _phase_model(params).dimensional_parameters()

    # Reproduction number
    df[self.RT] = df[self._parameters].apply(_reproduction_number, axis=1)
    # Day parameters
    if self._tau is not None:
        days_df = df[self._parameters].apply(_dimensional_parameters, axis=1, result_type="expand")
        df = pd.concat([df, days_df], axis=1)
    # Set the order of columns
    fixed_cols = [
        self.START, self.END, self.RT, *self._model._PARAMETERS, *self._model._DAY_PARAMETERS]
    excluded = set(fixed_cols) | set(self._SIRF)
    others = [col for col in df.columns if col not in excluded]
    ordered_df = df.reindex(columns=[*fixed_cols, *others])
    return ordered_df.dropna(how="all", axis=1).ffill().convert_dtypes()
[docs]
def track(self) -> pd.DataFrame:
    """Track reproduction number, parameter value and dimensional parameter values.

    Returns:
        Dataframe of time-series data of the values.
            Index
                Date (pandas.Timestamp): dates
            Columns
                Rt (float): phase-dependent reproduction number (if parameters are available)
                (float): parameter values, including rho (if available)
                (int or float): dimensional parameters, including 1/beta [days] (if tau and parameters are available)
    """
    phase_df = self.summary()

    def _dates_of_phase(row):
        # All dates from the start date to the end date of a phase
        return pd.date_range(start=row[self.START], end=row[self.END], freq="D")

    phase_df[self.DATE] = phase_df[[self.START, self.END]].apply(_dates_of_phase, axis=1)
    # One row per date: the values of a phase are repeated for all of its dates
    daily_df = phase_df.explode(self.DATE)
    return daily_df.set_index(self.DATE).drop([self.START, self.END], axis=1)
[docs]
def simulate(self, model_specific: bool = False) -> pd.DataFrame:
    """Perform simulation with phase-dependent ODE model.

    Args:
        model_specific (bool): whether convert S, I, F, R to model-specific variables or not

    Raises:
        UnExpectedNoneError: tau value is un-set
        NAFoundError: ODE parameter values on the start dates of phases are un-set

    Returns:
        dataframe of time-series simulated data.
            Index
                Date (pd.Timestamp): dates
            Columns
                if @model_specific is False:
                Susceptible (int): the number of susceptible cases
                Infected (int): the number of currently infected cases
                Recovered (int): the number of recovered cases
                Fatal (int): the number of fatal cases
                if @model_specific is True, variables defined by model.VARIABLES of covsirphy.Dynamics(model)
    """
    tau = self._tau
    if tau is None:
        raise UnExpectedNoneError(
            "tau", details="Tau value must be set with covsirphy.Dynamics(tau) or covsirphy.Dynamics.tau or covsirphy.Dynamics.estimate_tau()")
    simulator = _Simulator(model=self._model, data=self._df)
    simulated_df = simulator.run(tau=tau, model_specific=model_specific)
    return simulated_df.set_index(self.DATE)
[docs]
def estimate(self, **kwargs) -> Self:
    """Run covsirphy.Dynamics.estimate_tau() and covsirphy.Dynamics.estimate_params().

    Args:
        **kwargs: keyword arguments of covsirphy.Dynamics.estimate_tau() and covsirphy.Dynamics.estimate_params()

    Returns:
        Updated Dynamics object with estimated ODE parameter values.
    """
    # The keyword arguments are passed through Validator.kwargs() for .estimate_tau()
    tau_kwargs = Validator(kwargs).kwargs(self.estimate_tau)
    self.estimate_tau(**tau_kwargs)
    # All keyword arguments are forwarded to .estimate_params()
    self.estimate_params(**kwargs)
    return self
[docs]
def estimate_tau(self, metric: str = "RMSLE", q: float = 0.5, digits: int | None = None, n_jobs: int | None = None) -> tuple[float, pd.DataFrame]:
    """Set the best tau value for the registered data, estimating ODE parameters with quantiles.

    Args:
        metric: metric name for scoring when selecting best tau value
        q: the quantiles to compute, values between (0, 1)
        digits: effective digits of ODE parameter values or None (skip rounding)
        n_jobs: the number of parallel jobs or None (CPU count)

    Raises:
        NotEnoughDataError: less than three non-NA records are registered

    Returns:
        - float: tau value with best metric score
        - pandas.DataFrame: metric scores of tau candidates
            Index
                tau (int): candidate of tau values
            Columns
                {metric}: score of estimation with metric
    """
    records = self._df.dropna(how="any", subset=self._SIRF)
    if len(records) < 3:
        raise NotEnoughDataError("registered S/I/F/R data except NAs", records, 3)
    scorer = partial(self._score_with_tau, metric=metric, q=q, digits=digits)
    # Candidates of tau are the divisors of 1440
    candidates = [minutes for minutes in range(1, 1441) if 1440 % minutes == 0]
    n_jobs_validated = Validator(n_jobs, "n_jobs").int(value_range=(1, cpu_count()), default=cpu_count())
    with Pool(n_jobs_validated) as pool:
        scores = pool.map(scorer, candidates)
    score_dict = dict(zip(candidates, scores))
    # Select the minimum or the maximum score, depending on the metric
    pick_best = {True: min, False: max}[Evaluator.smaller_is_better(metric=metric)]
    best_tau, _ = pick_best(score_dict.items(), key=lambda item: item[1])
    self._tau = best_tau
    score_df = pd.DataFrame.from_dict(score_dict, orient="index", columns=[metric])
    return self._tau, score_df
def _score_with_tau(self, tau: int, metric: str, q: float, digits: int | None) -> float:
    """Return the metric score with tau.

    Args:
        tau: tau value [min]
        metric: metric name for scoring when selecting best tau value
        q: the quantiles to compute, values between (0, 1)
        digits: effective digits of ODE parameter values or None (skip rounding)

    Returns:
        metric score
    """
    parameters = self._model._PARAMETERS[:]
    all_df = self._df.dropna(how="any", subset=self._SIRF)
    all_df[parameters] = all_df.loc[:, parameters].astype("Float64")
    # The first and the last dates of each phase
    dates_by_phase = all_df.reset_index().groupby(self._PH)[self.DATE]
    starts = dates_by_phase.first().sort_values()
    ends = dates_by_phase.last().sort_values()
    for start, end in zip(starts, ends):
        phase_records = all_df.loc[start: end].reset_index()
        fitted = self._model.from_data_with_quantile(data=phase_records, tau=tau, q=q, digits=digits)
        # The parameter values are written to the start date of the phase
        all_df.loc[start, parameters] = pd.Series(fitted.settings()["param_dict"])
    simulator = _Simulator(model=self._model, data=all_df)
    simulated_df = simulator.run(tau=tau, model_specific=False).set_index(self.DATE)
    evaluator = Evaluator(all_df[self._SIRF], simulated_df[self._SIRF], how="inner")
    return evaluator.score(metric=metric)
[docs]
def estimate_params(self, metric: str = "RMSLE", digits: int | None = None, n_jobs: int | None = None, **kwargs) -> pd.DataFrame:
    """Set ODE parameter values optimized for the registered data with hyperparameter optimization using Optuna.

    Args:
        metric: metric name for scoring when optimizing ODE parameter values of phases
        digits: effective digits of ODE parameter values or None (skip rounding)
        n_jobs: the number of parallel jobs or None (CPU count)
        **kwargs: keyword arguments of optimization, refer to covsirphy.ODEModel.from_data_with_optimization()

    Raises:
        UnExpectedNoneError: tau value is un-set
        NotEnoughDataError: less than three non-NA records are registered

    Returns:
        Dataframe sorted by date.
            Index
                Date (pandas.Timestamp): dates
            Columns
                (numpy.float64): ODE parameter values defined with model.PARAMETERS
                {metric}: score with the estimated parameter values
                Trials (int): the number of trials
                Runtime (str): runtime of optimization, like 0 min 10 sec
    """
    if self._tau is None:
        raise UnExpectedNoneError(
            "tau", details="Tau value must be set with covsirphy.Dynamics(tau) or covsirphy.Dynamics.tau or covsirphy.Dynamics.estimate_tau()")
    all_df = self._df.loc[:, [self._PH, *self._SIRF]].dropna(how="any")
    if len(all_df) < 3:
        raise NotEnoughDataError("registered S/I/F/R data except NAs", all_df, 3)
    n_jobs_validated = Validator(n_jobs, "n_jobs").int(value_range=(1, cpu_count()), default=cpu_count())
    # The first and the last dates of each phase
    dates_by_phase = all_df.reset_index().groupby(self._PH)[self.DATE]
    starts = dates_by_phase.first().sort_values()
    ends = dates_by_phase.last().sort_values()
    est_f = partial(
        self._optimized_params, model=self._model, tau=self._tau, metric=metric, digits=digits, **kwargs)
    phase_dataframes = [all_df[start: end] for start, end in zip(starts, ends)]
    config.info(f"\n<{self._model._NAME}: parameter estimation>")
    config.info(f"Running optimization with {n_jobs_validated} CPUs...")
    stopwatch = StopWatch()
    # p-tqdm with Python 3.12: DeprecationWarning: datetime.datetime.utcfromtimestamp() is deprecated and scheduled for removal in a future version.
    # The warning is silenced only while p_umap runs so that the warning filters of the caller are left unchanged.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", category=DeprecationWarning)
        results = p_umap(est_f, phase_dataframes, num_cpus=n_jobs_validated)
    config.info(f"Completed optimization. Total: {stopwatch.stop_show()}\n")
    # p_umap is an unordered map: restore the chronological order before forward-filling
    est_df = pd.concat(results, sort=True, axis=0).sort_index()
    est_df = est_df.loc[:, [*self._parameters, metric, self.TRIALS, self.RUNTIME]].ffill().convert_dtypes()
    # Update registered parameter values
    r_df = self.register()
    for col in self._parameters:
        r_df[col] = r_df[col].astype(pd.Float64Dtype())
    r_df.update(est_df, overwrite=True)
    self.register(data=r_df)
    return est_df
def _optimized_params(self, phase_df: pd.DataFrame, model: ODEModel, tau: int, metric: str, digits: int | None, **kwargs) -> pd.DataFrame:
    """Return ODE parameter values of a phase, optimized for the records with hyperparameter optimization using Optuna.

    Args:
        phase_df: records of a phase
            Index
                Date (pandas.Timestamp): observation dates
            Columns
                variables of the model
        model: definition of ODE model
        tau: tau value [min]
        metric: metric name for scoring when optimizing ODE parameter values of phases
        digits: effective digits of ODE parameter values or None (skip rounding)
        **kwargs: keyword arguments of optimization, refer to covsirphy.ODEModel.from_data_with_optimization()

    Returns:
        Copy of @phase_df with the values registered on the first date of the phase.
            Index
                Date (pandas.Timestamp): dates
            Columns
                (numpy.float64): ODE parameter values defined with model.PARAMETERS
                {metric}: score with the estimated parameter values
                Trials (int): the number of trials
                Runtime (str): runtime of optimization, like 0 min 10 sec
    """
    df = phase_df.copy()
    first_date = df.index[0]
    # ODE parameter optimization
    model_instance = model.from_data_with_optimization(
        data=df.reset_index(), tau=tau, metric=metric, digits=digits, **kwargs)
    df.loc[first_date, model._PARAMETERS] = pd.Series(model_instance.settings()["param_dict"])
    # Get information regarding optimization
    est_dict = model_instance.settings(with_estimation=True)["estimation_dict"]
    est_dict = {k: v for k, v in est_dict.items() if k in {metric, self.TRIALS, self.RUNTIME}}
    # FutureWarning is silenced only for this assignment so that the process-wide warning filters are left unchanged
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", category=FutureWarning)
        df.loc[first_date, list(est_dict.keys())] = pd.Series(est_dict)
    return df
[docs]
def parse_phases(self, phases: list[str] | None = None) -> tuple[pd.Timestamp, pd.Timestamp]:
    """Return minimum date and maximum date of the phases.

    Args:
        phases: phases (0th, 1st, 2nd,... last) or None (all phases)

    Returns:
        minimum date and maximum date of the phases

    Note:
        "last" can be used to specify the last phase.
    """
    if phases is None:
        return self._first, self._last
    all_df = self._df.copy()
    # Renumber the phase IDs as 0, 1, 2,... in order of appearance
    all_df[self._PH], _ = all_df[self._PH].factorize()
    last_number = all_df[self._PH].max()
    phase_numbers = [last_number if ph == "last" else self.str2num(ph) for ph in phases]
    df = all_df.loc[all_df[self._PH].isin(phase_numbers)]
    # FutureWarning to be fixed by pandas version 3.0.0 release.
    # It is silenced only here so that the process-wide warning filters are left unchanged.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", category=FutureWarning)
        return df.index.min(), df.index.max()
[docs]
def parse_days(self, days: int, ref: pd.Timestamp | str | None = "last") -> tuple[pd.Timestamp, pd.Timestamp]:
    """Return min(ref, ref + days) and max(ref, ref + days).

    Args:
        days: the number of days
        ref: reference date or "first" (the first date of records) or "last"/None (the last date)

    Returns:
        minimum date and maximum date of the selected dates

    Note:
        Note that the days clipped with the first and the last dates of records.
    """
    days_n = Validator(days, "days", accept_none=False).int()
    if isinstance(ref, str):
        # "first" and "last" are aliases; the other strings are validated as dates
        aliases = {"first": self._first, "last": self._last}
        ref_value = aliases.get(ref, ref)
    else:
        ref_value = ref
    ref_date = Validator(ref_value, name="ref").date(
        value_range=(self._first, self._last), default=self._last)
    shifted_date = ref_date + timedelta(days=days_n)
    min_date = min(ref_date, shifted_date)
    max_date = max(ref_date, shifted_date)
    # Clip with the first and the last dates
    return max(min_date, self._first), min(max_date, self._last)
[docs]
def evaluate(self, date_range: tuple[str | pd.Timestamp | None, str | pd.Timestamp | None] | None = None, metric: str = "RMSLE", display: bool = True, **kwargs) -> float:
    """Compare the simulated results and actual records, and evaluate the differences.

    Args:
        date_range: range of dates to evaluate or None (the first and the last date)
        metric: metric to evaluate the difference
        display: whether display figure of comparison or not
        kwargs: keyword arguments of covsirphy.compare_plot()

    Returns:
        evaluation score
    """
    variables = [self.CI, self.F, self.R]
    start_date, end_date = Validator(date_range, name="date_range").sequence(
        default=(self._first, self._last), length=2)
    start = Validator(start_date, "date_range[0]").date(value_range=(self._first, self._last), default=self._first)
    end = Validator(end_date, "date_range[1]").date(value_range=(self._first, self._last), default=self._last)
    # Records with NAs are removed from both of actual and simulated data
    actual_df = self._df.loc[start:end, variables].dropna(how="any", axis=0)
    simulated_df = self.simulate(model_specific=False)
    sim_df = simulated_df.loc[start: end, variables].dropna(how="any", axis=0)
    joined_df = actual_df.join(sim_df, how="inner", lsuffix="_actual", rsuffix="_simulated")
    if display:
        compare_plot(joined_df, variables=variables, groups=["actual", "simulated"], **kwargs)
    evaluator = Evaluator(actual_df, sim_df)
    return evaluator.score(metric=metric)
[docs]
def start_dates(self) -> list[str]:
    """Return the start dates of phases.

    Returns:
        start dates (the first date of each phase, in order of phases)
    """
    records = self._df.reset_index()
    # Renumber the phase IDs as 0, 1, 2,... in order of appearance
    records[self._PH] = records[self._PH].factorize()[0]
    first_dates = records.groupby(self._PH)[self.DATE].first()
    return first_dates.tolist()