from __future__ import annotations
from datetime import datetime, timedelta
from functools import partial
import math
import optuna
import numpy as np
import pandas as pd
from scipy.integrate import solve_ivp
from typing_extensions import Any, NoReturn, Self
from covsirphy.util.error import UnExpectedNoneError, NotNoneError
from covsirphy.util.stopwatch import StopWatch
from covsirphy.util.evaluator import Evaluator
from covsirphy.util.validator import Validator
from covsirphy.util.term import Term
class ODEModel(Term):
"""Basic class of ordinary differential equation (ODE) model.
Args:
date_range: start date and end date of simulation
tau: tau value [min]
initial_dict: initial values
param_dict: non-dimensional parameter values
"""
_logS: str = "log10(S)"
_r: str = "R"
# Name of ODE model
_NAME: str = "ODE Model"
# Variables
_VARIABLES: list[str] = []
# Non-dimensional parameters
_PARAMETERS: list[str] = []
# Dimensional parameters
_DAY_PARAMETERS: list[str] = []
# Variables that increase monotonically
_VARS_INCREASE: list[str] = []
# Sample data
_SAMPLE_DICT: dict[str, dict[str, Any]] = {
"initial_dict": dict.fromkeys(_VARIABLES),
"param_dict": dict.fromkeys(_PARAMETERS)
}
def __init__(self, date_range: tuple[str, str], tau: int, initial_dict: dict[str, int], param_dict: dict[str, float]) -> None:
start_date, end_date = Validator(date_range, "date_range", accept_none=False).sequence(length=2)
self._start: pd.Timestamp = Validator(
start_date, name="the first value of @date_range", accept_none=False).date()
self._end: pd.Timestamp = Validator(
end_date, name="the second value of @date_range", accept_none=False).date(value_range=(self._start, None))
self._tau: int = Validator(tau, "tau", accept_none=False).tau() # type: ignore
self._initial_dict: dict[str, int] = Validator(initial_dict, "initial_dict", accept_none=False).dict(
required_keys=self._VARIABLES, errors="raise")
self._param_dict: dict[str, float] = Validator(param_dict, "param_dict", accept_none=False).dict(
required_keys=self._PARAMETERS, errors="raise")
# Total population
self._population: int = sum(initial_dict.values())
# Information regarding ODE parameter estimation
self._estimation_dict: dict[str, Any] | None = None
def __str__(self) -> str:
return self._NAME
def __repr__(self) -> str:
_dict = self.settings()
return f"{type(self).__name__}({', '.join([f'{k}={v}' for k, v in _dict.items()])})"
def __eq__(self, other) -> bool:
return repr(self) == repr(other)
@classmethod
def name(cls) -> str:
"""Return name of ODE model.
"""
return cls._NAME
@classmethod
def definitions(cls) -> dict[str, Any]:
"""Return definitions of ODE model.
Returns:
- "name" (str): ODE model name
- "variables" (list of [str]): variable names
- "parameters" (list of [str]): non-dimensional parameter names
- "dimensional_parameters" (list of [str]): dimensional parameter names
"""
return {
"name": cls._NAME,
"variables": cls._VARIABLES,
"parameters": cls._PARAMETERS,
"dimensional_parameters": cls._DAY_PARAMETERS,
}
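# Hedged illustration of definitions() for a concrete child class (SIRFModel is
# used as an example child class; the base class defines empty lists):
#   >>> covsirphy.SIRFModel.definitions()
#   {"name": ..., "variables": [...], "parameters": [...], "dimensional_parameters": [...]}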
def settings(self, with_estimation: bool = False) -> dict[str, Any]:
"""Return settings.
Args:
with_estimation (bool): whether to include information regarding ODE parameter estimation or not
Returns:
- date_range (tuple of (str, str)): start date and end date of simulation
- tau (int): tau value [min]
- initial_dict (dict of {str: int}): initial values
- param_dict (dict of {str: float}): non-dimensional parameter values
- estimation_dict (dict of {str: str or int}): information regarding ODE parameter estimation, included when @with_estimation is True
- method (str): method of estimation, "with_quantile" or "with_optimization" or "not_performed"
- {metric} (int): score of hyperparameter optimization, if available
- Trials (int): the number of trials of hyperparameter optimization, if available
- Runtime (str): runtime of hyperparameter optimization, if available
- keyword arguments set with covsirphy.ODEModel.from_data_with_optimization(), if available
- keyword arguments set with covsirphy.ODEModel.from_data_with_quantile(), if available
"""
_dict = {
"date_range": (self._start.strftime(self.DATE_FORMAT), self._end.strftime(self.DATE_FORMAT)),
"tau": self._tau,
"initial_dict": self._initial_dict,
"param_dict": self._param_dict,
}
if with_estimation:
_dict["estimation_dict"] = self._estimation_dict or {"method": "not_performed"}
return _dict
@classmethod
def from_sample(cls, date_range: tuple[str | None, str | None] | None = None, tau: int = 1440) -> Self:
"""Initialize model with sample data.
Args:
date_range: start date and end date of simulation
tau: tau value [min]
Returns:
initialized model
Note:
- When @date_range or the first value of @date_range is None, the date of execution will be used as the start date.
- When @date_range or the second value of @date_range is None, 180 days after the start date will be used as the end date.
"""
start_date, end_date = Validator(date_range, "date_range").sequence(default=(None, None), length=2)
start = Validator(start_date, name="the first value of @date_range").date(default=datetime.now())
end = Validator(
end_date, name="the second value of @date_range").date(value_range=(start, None), default=start + timedelta(days=180))
return cls(date_range=(start, end), tau=tau, **cls._SAMPLE_DICT)
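# Minimal usage sketch (hedged), assuming a concrete child class such as covsirphy.SIRFModel;
# the base ODEModel has empty _VARIABLES/_PARAMETERS, so calling this on the base class would fail:
#   model = covsirphy.SIRFModel.from_sample(date_range=("01Jan2022", None), tau=1440)
#   model.settings()["date_range"]  # ("01Jan2022", "30Jun2022"): 180 days after the start date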
def _discretize(self, t: int, X: np.ndarray) -> np.ndarray:
"""Discretize the ODE.
Args:
t: discrete time-steps
X: the current values of the model
Returns:
the next values of the model
"""
raise NotImplementedError
def solve(self) -> pd.DataFrame:
"""Solve an initial value problem.
Return:
dataframe of analytical solutions.
Index
Date (pandas.Timestamp): dates from start date to end date
Columns
(pandas.Int64): model-specific dimensional variables of the model
"""
step_n = math.ceil((self._end - self._start) / timedelta(minutes=self._tau))
sol = solve_ivp(
fun=self._discretize,
t_span=[0, step_n],
y0=np.array([self._initial_dict[variable] for variable in self._VARIABLES]),
t_eval=np.arange(0, step_n + 1, 1),
dense_output=False
)
df = pd.DataFrame(data=sol["y"].T.copy(), columns=self._VARIABLES)
df = self._non_dim_to_date(data=df, tau=self._tau, start_date=self._start)
return df.round().convert_dtypes()
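# Usage sketch (hedged): solve() integrates the ODE with scipy.integrate.solve_ivp at
# tau-minute steps, then resamples to daily values; with a child class such as SIRFModel:
#   df = covsirphy.SIRFModel.from_sample(tau=1440).solve()
#   df.index.name, list(df.columns)  # "Date", model-specific variable names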
@staticmethod
def _date_to_non_dim(series: pd.DatetimeIndex, tau: int | None) -> pd.DatetimeIndex | pd.Index:
"""Convert date information (TIME) to time(x) = (TIME(x) - TIME(0)) / tau
Args:
series: date information
tau: tau value [min]
Returns:
as-is @series when tau is None else converted time information without series name
"""
Validator(series, "index of data").instance(pd.DatetimeIndex)
if tau is None:
return series
Validator(tau, "tau", accept_none=False).tau()
converted = (series - series.min()) / np.timedelta64(tau, "m")
return converted.rename(None).astype("Int64")
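# Worked example (not executed): with tau=1440 [min], a DatetimeIndex of
# 01Jan2022, 02Jan2022, 03Jan2022 becomes 0, 1, 2 because each step of
# (TIME(x) - TIME(0)) / 1440 [min] corresponds to one whole day.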
@classmethod
def _non_dim_to_date(cls, data: pd.DataFrame, tau: int | None, start_date: str | pd.Timestamp | None) -> pd.DataFrame:
"""Convert non-dimensional date information (time) to TIME(x) = TIME(0) + tau * time(x) and resample with dates.
Args:
data:
Index
non-dimensional date information pandas.DatetimeIndex
Columns
any columns
tau: tau value [min]
start_date: start date of records, i.e. TIME(0)
Raises:
NotNoneError: @tau is None, but start_date is not None
UnExpectedNoneError: @tau is not None, but start_date is None
Returns:
pandas.DataFrame:
Index
Date (pandas.DatetimeIndex) or as-is @data (when @tau is None)
Columns
any columns of @data
Note:
The first values on date will be selected when resampling.
"""
df = Validator(data, "data").dataframe()
if (tau is None and start_date is None) or isinstance(df.index, pd.DatetimeIndex):
return data
if tau is None:
raise NotNoneError("start_date", start_date, details="None is required because tau is None")
if start_date is None:
raise UnExpectedNoneError("start_date", details="Not None value is required because tau is not None")
Validator(tau, "tau", accept_none=False).tau()
start = Validator(start_date, "start_date", accept_none=False).date()
df[cls.DATE] = (start + pd.Series(df.index * tau).apply(lambda x: timedelta(minutes=x))).tolist()
return df.set_index(cls.DATE).resample("D").first()
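# Worked example (not executed): with tau=720 [min] and start_date="01Jan2022",
# steps 0, 1, 2 map to 01Jan2022 00:00, 01Jan2022 12:00 and 02Jan2022 00:00;
# resample("D").first() then keeps the first row of each date, one row per day.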
def r0(self) -> NoReturn:
"""Calculate basic reproduction number.
Returns:
reproduction number of the ODE model and parameters
Note:
This method must be defined by child classes.
"""
raise NotImplementedError
def dimensional_parameters(self) -> NoReturn:
"""Calculate dimensional parameter values.
Returns:
dictionary of dimensional parameter values
Note:
This method must be defined by child classes.
"""
raise NotImplementedError
@classmethod
def from_data(cls, data: pd.DataFrame, param_dict: dict[str, float], tau: int = 1440, digits: int | None = None) -> Self:
"""Initialize model with data and ODE parameter values.
Args:
data:
Index
reset index
Columns
- Date (pd.Timestamp): Observation date
- Susceptible (int): the number of susceptible cases
- Infected (int): the number of currently infected cases
- Recovered (int): the number of recovered cases
- Fatal (int): the number of fatal cases
param_dict: non-dimensional parameter values
tau: tau value [min]
digits: effective digits of ODE parameter values or None (skip rounding)
Returns:
initialized model
"""
Validator(data, "data", accept_none=False).dataframe(columns=[cls.DATE, *cls._SIRF], empty_ok=False)
Validator(tau, "tau", accept_none=False).tau()
Validator(param_dict, "param_dict", accept_none=False).dict(required_keys=cls._PARAMETERS)
start, end = data[cls.DATE].min(), data[cls.DATE].max()
trans_df = cls.transform(data=data.set_index(cls.DATE), tau=tau)
initial_dict = trans_df.iloc[0].to_dict()
return cls(
date_range=(start, end), tau=tau, initial_dict=initial_dict,
param_dict=param_dict if digits is None else {k: Validator(v, k).float(
value_range=(0, 1), digits=digits) for k, v in param_dict.items()}
)
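# Usage sketch (hedged; the parameter names and values assume a SIR-F-like child
# class and are illustrative only):
#   model = covsirphy.SIRFModel.from_data(
#       data=observed_df,  # Date/Susceptible/Infected/Recovered/Fatal columns
#       param_dict={"theta": 0.002, "kappa": 0.005, "rho": 0.2, "sigma": 0.075}, tau=1440)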
@classmethod
def from_data_with_quantile(cls, data: pd.DataFrame, tau: int = 1440, q: float = 0.5, digits: int | None = None) -> Self:
"""Initialize model with data, estimating ODE parameters with quantiles.
Args:
data:
Index
reset index
Columns
- Date (pd.Timestamp): Observation date
- Susceptible (int): the number of susceptible cases
- Infected (int): the number of currently infected cases
- Recovered (int): the number of recovered cases
- Fatal (int): the number of fatal cases
tau: tau value [min]
q: the quantile to compute, a value in (0, 1)
digits: effective digits of ODE parameter values or None (skip rounding)
Returns:
initialized model
"""
Validator(data, "data", accept_none=False).dataframe(columns=[cls.DATE, *cls._SIRF], empty_ok=False)
Validator(tau, "tau", accept_none=False).tau()
Validator(q, "q", accept_none=False).float(value_range=(0, 1))
trans_df = cls.transform(data=data.set_index(cls.DATE), tau=tau)
cls_obj = cls.from_data(data=data, param_dict=cls._param_quantile(data=trans_df, q=q), tau=tau, digits=digits)
cls_obj._estimation_dict = dict(method="with_quantile", tau=tau, q=q, digits=digits)
return cls_obj
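# Usage sketch (hedged): with q=0.5, parameters are taken from the median of the
# (X, dX/dt) combinations computed by the child class's _param_quantile():
#   model = covsirphy.SIRFModel.from_data_with_quantile(data=observed_df, tau=1440, q=0.5)
#   model.settings(with_estimation=True)["estimation_dict"]["method"]  # "with_quantile"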
@classmethod
def _param_quantile(cls, data: pd.DataFrame, q: float | pd.Series = 0.5) -> NoReturn:
"""With combinations (X, dX/dt) for variables, calculate quantile values of ODE parameters.
Args:
data: transformed data with covsirphy.ODEModel.transform(data=data, tau=tau)
q: the quantile(s) to compute, value(s) between (0, 1)
Returns:
parameter values at the quantile(s)
Note:
This method must be defined by child classes.
"""
raise NotImplementedError
@classmethod
def _clip(cls, values: float | pd.Series, lower: float, upper: float) -> float | pd.Series:
"""
Trim values at input threshold.
Args:
values: values to trim
lower: minimum threshold
upper: maximum threshold
Returns:
the clipped value (float input) or the clipped series (pandas.Series input)
"""
return min(max(values, lower), upper) if isinstance(values, float) else pd.Series(values).clip(lower=lower, upper=upper)
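# Illustration (not executed): cls._clip(1.3, 0.0, 1.0) returns 1.0, while
# cls._clip(pd.Series([-0.1, 0.5, 1.3]), 0.0, 1.0) returns pd.Series([0.0, 0.5, 1.0]).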
@classmethod
def from_data_with_optimization(cls, data: pd.DataFrame, tau: int = 1440, metric: str = "RMSLE", digits: int | None = None, **kwargs) -> Self:
"""Initialize model with data, estimating ODE parameters hyperparameter optimization using Optuna.
Args:
data:
Index
reset index
Columns
- Date (pd.Timestamp): Observation date
- Susceptible (int): the number of susceptible cases
- Infected (int): the number of currently infected cases
- Fatal (int): the number of fatal cases
- Recovered (int): the number of recovered cases
tau: tau value [min]
metric: metric to minimize, refer to covsirphy.Evaluator.score()
digits: effective digits of ODE parameter values or None (skip rounding)
**kwargs: keyword arguments of optimization
- quantiles (tuple(float, float)): quantiles to cut parameter range, like confidence interval, (0.1, 0.9) as default
- timeout (int): timeout of optimization, 180 as default
- timeout_iteration (int): timeout of one iteration, 1 as default
- tail_n (int): the number of recent iterations to check when deciding whether the score has stopped changing, 4 as default
- allowance (tuple(float, float)): the allowance of the max predicted values, (0.99, 1.01) as default
- pruner (str): kind of pruner (hyperband, median, threshold or percentile), "threshold" as default
- upper (float): used with the "threshold" pruner; a trial is pruned when its intermediate score is larger than this value, 0.5 as default
- percentile (float): used with the "percentile" pruner; a trial is pruned when its best intermediate value is in the bottom percentile among trials, 50 as default
- constant_liar (bool): whether to use the constant liar heuristic to reduce search effort, False as default
Returns:
initialized model
"""
kwargs_default = {
"quantiles": (0.1, 0.9),
"timeout": 180,
"timeout_iteration": 1,
"tail_n": 4,
"allowance": (0.99, 1.01),
"pruner": "threshold",
"upper": 0.5,
"percentile": 50,
"constant_liar": False,
}
kwargs_dict = Validator(kwargs, "kwargs").dict(default=kwargs_default)
Validator(data, "data", accept_none=False).dataframe(columns=[cls.DATE, *cls._SIRF], empty_ok=False)
Validator(tau, "tau", accept_none=False).tau()
Validator([metric], "metric").sequence(candidates=Evaluator.metrics())
trans_df = cls.transform(data=data.set_index(cls.DATE), tau=tau)
param_dict, score, n_trials, runtime = cls._estimate_params(
data=trans_df, tau=tau, metric=metric, **kwargs_dict)
cls_obj = cls.from_data(data=data, param_dict=param_dict, tau=tau, digits=digits)
cls_obj._estimation_dict = {
"method": "with_optimization", metric: score, cls.TRIALS: n_trials, cls.RUNTIME: runtime,
cls.TAU: tau, "digits": digits, **kwargs_dict, "data": data,
}
return cls_obj
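# Usage sketch (hedged; keyword values are illustrative):
#   model = covsirphy.SIRFModel.from_data_with_optimization(
#       data=observed_df, tau=1440, metric="RMSLE", timeout=60, pruner="threshold", upper=0.5)
#   model.settings(with_estimation=True)["estimation_dict"]["method"]  # "with_optimization"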
@classmethod
def _estimate_params(cls, data: pd.DataFrame, tau: int, metric: str, **kwargs) -> tuple[dict[str, float], float, int, str]:
"""Estimate ODE parameter values with hyperparameter optimization.
Args:
data: transformed data with covsirphy.ODEModel.transform(data=data, tau=tau)
tau: tau value [min]
metric: metric to minimize, refer to covsirphy.Evaluator.score()
**kwargs: keyword arguments of optimization, which must include
- quantiles (tuple(float, float)): quantiles to cut parameter range, like confidence interval
- timeout (int): timeout of optimization
- timeout_iteration (int): timeout of one iteration
- tail_n (int): the number of recent iterations to check when deciding whether the score has stopped changing
- allowance (tuple(float, float)): the allowance of the max predicted values
- pruner (str): kind of pruner (hyperband, median, threshold or percentile)
- upper (float): used with the "threshold" pruner; a trial is pruned when its intermediate score is larger than this value
- percentile (float): used with the "percentile" pruner; a trial is pruned when its best intermediate value is in the bottom percentile among trials
- constant_liar (bool): whether to use the constant liar heuristic to reduce search effort
Returns:
- dict of {str: float}: dictionary of parameter values
- float: score of the metric
- int: the number of trials of hyperparameter optimization
- str: runtime of hyperparameter optimization
"""
optuna.logging.set_verbosity(optuna.logging.WARNING) # type: ignore
# Create study of optimization
pruner_dict = {
"hyperband": optuna.pruners.HyperbandPruner,
"median": optuna.pruners.MedianPruner,
"threshold": optuna.pruners.ThresholdPruner,
"percentile": optuna.pruners.PercentilePruner,
}
pruner_name = Validator([kwargs["pruner"]], "pruner", accept_none=False).sequence(
candidates=pruner_dict.keys(), length=1)[0]
pruner_class = pruner_dict[pruner_name]
pruner = pruner_class(**Validator(kwargs, "keyword arguments").kwargs(pruner_class))
sampler = optuna.samplers.TPESampler(
**Validator(kwargs, "keyword arguments").kwargs(optuna.samplers.TPESampler))
study = optuna.create_study(direction="minimize", sampler=sampler, pruner=pruner)
# Create objective function
value_range_dict = cls._param_quantile(data, q=kwargs["quantiles"])
objective_func = partial(
cls._optuna_objective, data=data, tau=tau, value_range_dict=value_range_dict, metric=metric)
allowance = Validator(kwargs["allowance"], "allowance", accept_none=False).sequence(length=2)
# Iteration of optimization
iter_n = math.ceil(kwargs["timeout"] / kwargs["timeout_iteration"])
stopwatch = StopWatch()
scores = []
param_dict = {}
tail_n = Validator(kwargs["tail_n"], "tail_n").int(value_range=(1, None))
for _ in range(iter_n):
# Run iteration
study.optimize(objective_func, n_jobs=1, timeout=kwargs["timeout_iteration"])
param_dict = study.best_params.copy()
# If score did not change in the last iterations, stop running
scores.append(cls._optuna_score(param_dict, data, tau, metric))
if len(scores) >= tail_n and len(set(scores[-tail_n:])) == 1:
break
# Check max values are in the allowance
if cls._optuna_is_in_allowance(param_dict, data, tau, allowance):
break
return param_dict, scores[-1], len(study.trials), stopwatch.stop_show()
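# Reading of the loop above (a sketch): with timeout=180 and timeout_iteration=1,
# at most 180 short Optuna runs are performed; the loop exits early when the last
# tail_n scores are identical or the simulated maxima fall within the allowance.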
@classmethod
def _optuna_objective(cls, trial: optuna.Trial, data: pd.DataFrame, tau: int, value_range_dict: dict[str, pd.Series], metric: str) -> float:
"""Objective function to minimize (evaluation score of the difference of actual data and solved data) by Optuna.
Args:
trial: a trial of the study
data: transformed data with covsirphy.ODEModel.transform(data=data, tau=tau)
tau: tau value [min]
value_range_dict: dictionary of value range of ODE parameters
metric: metric to minimize, refer to covsirphy.Evaluator.score()
Returns:
score to minimize
"""
param_dict = {}
for (k, v) in value_range_dict.items():
try:
param_dict[k] = trial.suggest_float(k, *v)
except OverflowError:
param_dict[k] = trial.suggest_float(k, 0, 1)
return cls._optuna_score(param_dict, data, tau, metric)
@classmethod
def _optuna_score(cls, param_dict: dict[str, float], data: pd.DataFrame, tau: int, metric: str) -> float:
"""Score function to minimize (i.e. evaluation score of the difference of actual data and solved data).
Args:
param_dict: non-dimensional parameter values
data: transformed data with covsirphy.ODEModel.transform(data=data, tau=tau)
tau: tau value [min]
metric: metric to minimize, refer to covsirphy.Evaluator.score()
Returns:
score to minimize or positive infinity (when negative values are included in simulated values)
"""
df = cls._non_dim_to_date(data=data, tau=tau, start_date="01Jan2022")
model = cls.from_data(data=cls.inverse_transform(df).reset_index(), param_dict=param_dict, tau=tau, digits=None)
sim_df = model.solve()
evaluator = Evaluator(df, sim_df, how="inner", on=None)
return evaluator.score(metric=metric)
@classmethod
def _optuna_is_in_allowance(cls, param_dict: dict[str, float], data: pd.DataFrame, tau: int, allowance: tuple[float, float]) -> bool:
"""
Return whether all max values of the simulated variables are within the allowance.
Args:
param_dict: non-dimensional parameter values
data: transformed data with covsirphy.ODEModel.transform(data=data, tau=tau)
tau: tau value [min]
allowance: the allowance of the predicted value
Returns:
True when all max values of the simulated variables are within the allowance
"""
df = cls.inverse_transform(data=data, tau=tau, start_date="01Jan2022").reset_index()
max_dict = {v: data[v].max() for v in cls._VARIABLES}
model = cls.from_data(data=df, param_dict=param_dict, tau=tau, digits=None)
sim_df = model.solve()
sim_max_dict = {v: sim_df[v].max() for v in cls._VARIABLES}
# Check all max values are in allowance
allowance0, allowance1 = allowance
ok_list = [
a * allowance0 <= p <= a * allowance1 for (a, p) in zip(max_dict.values(), sim_max_dict.values())]
return all(ok_list)
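# Worked example (not executed): with allowance=(0.99, 1.01) and an actual max of
# 1000 for a variable, a simulated max between 990 and 1010 passes the check;
# estimation stops early only when every variable's simulated max is within its band.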
@classmethod
def sr(cls, data: pd.DataFrame) -> NoReturn:
"""Return log10(S) and R of model-specific variables for S-R trend analysis.
Args:
data:
Index
Date (pd.Timestamp): Observation date
Columns
Susceptible (int): the number of susceptible cases
Infected (int): the number of currently infected cases
Recovered (int): the number of recovered cases
Fatal (int): the number of fatal cases
Returns:
Index
Date (pandas.Timestamp): date
Columns
log10(S) (np.float64): common logarithm of S of the ODE model
R (np.int64): R of the model
Note:
This method must be defined by child classes.
"""
raise NotImplementedError