import contextlib
from pathlib import Path
import geopandas as gpd
from matplotlib import pyplot as plt
import pandas as pd
from covsirphy.util.error import NotRegisteredError, SubsetNotFoundError
from covsirphy.util.validator import Validator
from covsirphy.util.term import Term
from covsirphy.gis._subset import _SubsetManager
from covsirphy.gis._layer import _LayerAdjuster
from covsirphy.gis._geometry import _Geometry
from covsirphy.gis._choropleth import _ChoroplethMap
[docs]
class GIS(Term):
    """Geographic information system which handles geo-spatial time-series data.

    Args:
        layers (list[str] or None): names of the geographic layers or None (["ISO3", "Province", "City"])
        country (str or None): name of the country layer or None (countries are not included in the layers)
        date (str): column name of observation dates
        **kwargs: accepted, but not used by this constructor

    Raises:
        ValueError: @layers has duplicates

    Note:
        Country level data specified with @country will be stored with ISO3 codes.
    """

    def __init__(self, layers=None, country="ISO3", date="Date", **kwargs):
        # Name of the country layer; values of this layer are handled as ISO3 codes
        if country is None:
            self._country = None
        else:
            self._country = str(country)
        # Layers of locations: fall back to country/province/city when @layers is not given
        layer_candidates = layers or [self._country, self.PROVINCE, self.CITY]
        self._layers = Validator(layer_candidates, "layers").sequence()
        # Name of the column which has observation dates
        self._date = str(date)
        # The adjuster keeps the registered records, aligned with the layers
        self._adjuster = _LayerAdjuster(layers=self._layers, country=self._country, date=self._date)
        # Switched to False by GIS.register()
        self._un_registered = True
[docs]
def all(self, variables=None, errors="raise"):
    """Return all available data.

    Args:
        variables (list[str] or None): list of variables to collect or None (all available variables)
        errors (str): 'raise' or 'coerce'

    Raises:
        NotRegisteredError: No records have been registered yet and @errors is 'raise'

    Returns:
        pandas.DataFrame:
            Index
                reset index
            Columns
                - (pandas.Category): columns defined by covsirphy.GIS(layers)
                - (pandas.Timestamp): observation dates, column defined by covsirphy.GIS(date)
                - columns defined by @variables
    """
    if self._un_registered and errors == "raise":
        # Same call form as GIS.layer() and GIS.subset(): the first argument is the method to run in advance
        raise NotRegisteredError("GIS.register()", details="No records have been registered yet")
    df = self._adjuster.all(variables=variables)
    # Layer columns are returned as categorical data
    return df.astype(dict.fromkeys(self._layers, "category"))
[docs]
def citations(self, variables=None):
    """Return citation list of the secondary data sources.

    Args:
        variables (list[str] or None): list of variables to collect or None (all available variables)

    Returns:
        list[str]: citation list
    """
    # Citations are stored by the layer adjuster together with the registered records
    adjuster = self._adjuster
    return adjuster.citations(variables=variables)
[docs]
def register(self, data, layers=None, date="Date", variables=None, citations=None, convert_iso3=True, **kwargs):
    """Register new data.

    Args:
        data (pandas.DataFrame): new data
            Index
                reset index
            Columns
                - columns defined by @layers
                - column defined by @date
                - columns defined by @variables
        layers (list[str] or None): layers of the data or None (as the same as covsirphy.GIS(layers))
        date (str): column name of observation dates of the data
        variables (list[str] or None): list of variables to add or None (all available columns)
        citations (list[str] or str or None): citations of the dataset or None (["my own dataset"])
        convert_iso3 (bool): whether convert country names to ISO3 codes or not
        **kwargs: keyword arguments of pandas.to_datetime() including "dayfirst (bool): whether date format is DD/MM or not"

    Raises:
        ValueError: @layers has duplicates

    Returns:
        covsirphy.GIS: self
    """
    # Only the keyword arguments selected for pandas.to_datetime() are forwarded
    datetime_kwargs = Validator(kwargs, "keyword arguments").kwargs(pd.to_datetime)
    self._adjuster.register(
        data=data,
        layers=layers,
        date=date,
        variables=variables,
        citations=citations,
        convert_iso3=convert_iso3,
        **datetime_kwargs,
    )
    # Records are available from now on
    self._un_registered = False
    return self
[docs]
def layer(self, geo=None, start_date=None, end_date=None, variables=None, errors="raise"):
    """Return the data at the selected layer in the date range.

    Args:
        geo (tuple(list[str] or tuple(str) or str) or str or None): location names to specify the layer or None (the top level)
        start_date (str or None): start date, like 22Jan2020
        end_date (str or None): end date, like 01Feb2020
        variables (list[str] or None): list of variables to add or None (all available columns)
        errors (str): whether raise errors or not, 'raise' or 'coerce'

    Raises:
        TypeError: @geo has un-expected types
        ValueError: the length of @geo is larger than the length of layers
        NotRegisteredError: No records have been registered at the layer yet

    Returns:
        pandas.DataFrame:
            Index:
                reset index
            Columns
                - (str): columns defined by covsirphy.GIS(layers)
                - (pandas.Timestamp): observation dates, column defined by covsirphy.GIS(date)
                - columns defined by @variables

    Note:
        Note that records with NAs as country names will be always removed.

    Note:
        When `geo=None` or `geo=(None,)`, returns country-level data, assuming we have country/province/city as layers here.

    Note:
        When `geo=("Japan",)` or `geo="Japan"`, returns province-level data in Japan.

    Note:
        When `geo=(["Japan", "UK"],)`, returns province-level data in Japan and UK.

    Note:
        When `geo=("Japan", "Kanagawa")`, returns city-level data in Kanagawa/Japan.

    Note:
        When `geo=("Japan", ["Tokyo", "Kanagawa"])`, returns city-level data in Tokyo/Japan and Kanagawa/Japan.
    """
    # All registered records
    if self._un_registered and errors == "raise":
        raise NotRegisteredError("GIS.register()", details="No records have been registered yet")
    records = self._adjuster.all(variables=variables)
    # Select the records of the layer specified with @geo
    parsed_geo = self._parse_geo(geo=geo, data=records)
    subset_manager = _SubsetManager(layers=self._layers)
    selected = subset_manager.layer(data=records, geo=parsed_geo)
    if selected.empty and errors == "raise":
        raise NotRegisteredError("GIS.register()", details="No records have been registered at the layer yet")
    # Restrict to the date range; the first/last observation dates are the defaults
    dates = selected[self._date]
    start = Validator(start_date).date(default=dates.min())
    end = Validator(end_date).date(default=dates.max())
    in_range = selected.loc[selected[self._date].between(start, end)]
    if in_range.empty and errors == "raise":
        raise NotRegisteredError(
            "GIS.register()", details=f"No records have been registered at the layer yet from {start_date} to {end_date}")
    # One representative record for each location and date
    keys = [*self._layers, self._date]
    representative = in_range.groupby(keys, dropna=True, observed=True).first()
    return representative.reset_index().convert_dtypes()
[docs]
def to_geopandas(self, geo=None, on=None, variables=None, directory=None, natural_earth=None):
    """Add geometry information with GeoJSON file of "Natural Earth" GitHub repository to data.

    Args:
        geo (tuple(list[str] or tuple(str) or str) or str or None): location names to specify the layer or None (the top level)
        on (str or None): the date, like 22Jan2020, or None (the last date of each location)
        variables (list[str] or None): list of variables to add or None (all available columns)
        directory (list[str] or tuple(str) or str): top directory name(s) to save GeoJSON files or None ("Natural_Earth" next to this script)
        natural_earth (str or None): title of GeoJSON file (without extension) of "Natural Earth" GitHub repository or None (automatically determined)

    Raises:
        ValueError: country layer is not included in the dataset

    Returns:
        geopandas.GeoDataFrame:
            Index:
                - reset index
            Columns:
                - (str): layer focused on with @geo and GIS.layer()
                - (pandas.Timestamp): observation dates, column defined by covsirphy.GIS(date)
                - geometry: geometric information

    Note:
        Regarding @geo argument, please refer to covsirphy.GIS.layer().

    Note:
        GeoJSON files are listed in https://github.com/nvkelso/natural-earth-vector/tree/master/geojson
        https://www.naturalearthdata.com/
        https://github.com/nvkelso/natural-earth-vector
        Natural Earth (Free vector and raster map data at naturalearthdata.com, Public Domain)
    """
    if self._country not in self._layers:
        raise ValueError("This cannot be done because country layer is not included in the dataset.")
    records = self.layer(geo=geo, variables=variables)
    if on is not None:
        # Records on the specified date only
        records = records.loc[records[self._date] == Validator(on).date()]
    else:
        # The last record of each location
        ordered = records.sort_values(self._date, ascending=True)
        records = ordered.groupby(self._layers, observed=True).last().reset_index()
    # The focused layer is the lowest layer which has values other than NA
    populated = [name for name in self._layers if records[name][records[name] != self.NA].nunique() > 0]
    focused_layer = populated[-1]
    save_to = directory or Path(__file__).with_name("Natural_Earth")
    geometry = _Geometry(data=records, layer=focused_layer, directory=save_to)
    # Below the country level, the ISO3 code of the first country in the records is passed
    if focused_layer == self._country:
        iso3 = None
    else:
        iso3 = self._to_iso3(list(records[self._country].unique())[0])
    gdf = geometry.to_geopandas(iso3=iso3, natural_earth=natural_earth)
    # Keep only the focused layer as the location column
    return gdf.drop(set(self._layers) - {focused_layer}, axis=1)
[docs]
def choropleth(self, variable, filename, title="Choropleth map", logscale=True, **kwargs):
    """Create choropleth map.

    Args:
        variable (str): variable name to show
        filename (str or None): filename to save the figure or None (display)
        title (str): title of the map
        logscale (bool): whether convert the value to log10 scale values or not
        kwargs: keyword arguments of the following classes and methods.
            - covsirphy.GIS.to_geopandas() except for @variables,
            - matplotlib.pyplot.savefig(), matplotlib.pyplot.legend(), and
            - pandas.DataFrame.plot()
    """
    validator = Validator(kwargs, "keyword arguments")
    # Geometry and values of the variable
    geo_kwargs = validator.kwargs(functions=GIS.to_geopandas, default=None)
    gdf = self.to_geopandas(variables=[variable], **geo_kwargs)
    # The first layer column found in the geo-dataframe gives the location names
    remaining_layers = [name for name in self._layers if name in gdf.columns]
    focused_layer = remaining_layers[0]
    gdf = gdf.rename(columns={focused_layer: "Location", variable: "Variable"})
    # Plotting
    savefig_kwargs = validator.kwargs(functions=plt.savefig, default=None)
    with _ChoroplethMap(filename=filename, **savefig_kwargs) as cm:
        cm.title = str(title)
        plot_kwargs = validator.kwargs(functions=gpd.GeoDataFrame.plot, default=None)
        cm.plot(data=gdf, logscale=logscale, **plot_kwargs)
[docs]
def subset(self, geo=None, start_date=None, end_date=None, variables=None, errors="raise"):
    """Return subset of the location and date range.

    Args:
        geo (tuple(list[str] or tuple(str) or str) or str or None): location names to filter or None (total at the top level)
        start_date (str or None): start date, like 22Jan2020
        end_date (str or None): end date, like 01Feb2020
        variables (list[str] or None): list of variables to add or None (all available columns)
        errors (str): whether raise errors or not, 'raise' or 'coerce'

    Raises:
        TypeError: @geo has un-expected types
        ValueError: the length of @geo is larger than the length of layers
        NotRegisteredError: No records have been registered yet
        SubsetNotFoundError: no records were found for the country and @errors is 'raise'

    Returns:
        pandas.DataFrame:
            Index:
                reset index
            Columns
                - (pandas.Timestamp): observation dates, column defined by covsirphy.GIS(date)
                - columns defined by @variables

    Note:
        Note that records with NAs as country names will be always removed.

    Note:
        When `geo=None` or `geo=(None,)`, returns global scale records (total values of all country-level data), assuming we have country/province/city as layers here.

    Note:
        When `geo=("Japan",)` or `geo="Japan"`, returns country-level data in Japan.

    Note:
        When `geo=(["Japan", "UK"],)`, returns country-level data of Japan and UK.

    Note:
        When `geo=("Japan", "Tokyo")`, returns province-level data of Tokyo/Japan.

    Note:
        When `geo=("Japan", ["Tokyo", "Kanagawa"])`, returns total values of province-level data of Tokyo/Japan and Kanagawa/Japan.

    Note:
        When `geo=("Japan", "Kanagawa", "Yokohama")`, returns city-level data of Yokohama/Kanagawa/Japan.

    Note:
        When `geo=("Japan", "Kanagawa", ["Yokohama", "Kawasaki"])`, returns total values of city-level data of Yokohama/Kanagawa/Japan and Kawasaki/Kanagawa/Japan.
    """
    # All registered records
    if self._un_registered and errors == "raise":
        raise NotRegisteredError("GIS.register()", details="No records have been registered yet.")
    records = self._adjuster.all(variables=variables)
    # Select the records of the locations specified with @geo
    parsed_geo = self._parse_geo(geo=geo, data=records)
    subset_manager = _SubsetManager(layers=self._layers)
    df = subset_manager.filter(data=records, geo=parsed_geo)
    if df.empty and errors == "raise":
        raise SubsetNotFoundError(geo=geo)
    # Restrict to the date range; the first/last observation dates are the defaults
    dates = df[self._date]
    start = Validator(start_date).date(default=dates.min())
    end = Validator(end_date).date(default=dates.max())
    df = df.loc[(df[self._date] >= start) & (df[self._date] <= end)]
    if df.empty and errors == "raise":
        raise SubsetNotFoundError(geo=geo, start_date=start_date, end_date=end_date)
    top_layer = self._layers[0]
    # Without a location, prepare the top-level records for totaling:
    # take the last value per date/location, then fill gaps forward (0 before the first record)
    if geo is None or geo[0] is None:
        variables_agg = list(set(df.columns) - {*self._layers, self._date})
        df = df.pivot_table(values=variables_agg, index=self._date, columns=top_layer, aggfunc="last")
        df = df.ffill().fillna(0).stack().reset_index()
    # Lower layers are not necessary for the total (they may have been removed already)
    try:
        df = df.drop(self._layers[1:], axis=1)
    except (IndexError, KeyError):
        pass
    # One representative record for each location and date, then total by date
    per_location = df.groupby([top_layer, self._date], dropna=True, observed=True).first()
    per_location = per_location.reset_index(level=self._date)
    return per_location.groupby(self._date, as_index=False).sum().convert_dtypes()
[docs]
@classmethod
def area_name(cls, geo=None):
    """Return area name of the geographic information, like 'Japan', 'Tokyo/Japan', 'Japan_UK', 'the world'.

    Args:
        geo (tuple(list[str] or tuple(str) or str) or str or None): location names

    Returns:
        str: area name, joined with the separator of the class from the lowest layer to the top layer
    """
    if geo is None or geo[0] is None:
        return "the world"

    def _label(info):
        # Two or more locations of the same layer are concatenated with underscores
        if isinstance(info, str):
            return info
        return "_".join(list(info))

    elements = [geo] if isinstance(geo, str) else geo
    labels = [_label(info) for info in elements]
    return cls.SEP.join(reversed(labels))
def _parse_geo(self, geo, data):
"""Parse geographic specifier.
Args:
geo (tuple(list[str] or tuple(str) or str) or str or None): location names
data (pandas.DataFrame):
Index
reset index
Columns
- (str): column defined by @country (of covsirphy.GIS) if @country is not None
Returns:
geo (tuple(list[str] or tuple(str) or str or None) or str or None): parsed location names
"""
if geo is None:
return geo
return [self._info_to_iso3(info, self._layers[i], data) for i, info in enumerate([geo] if isinstance(geo, str) else geo)]
def _info_to_iso3(self, geo_info, layer, data):
"""Convert a element of geographic specifier to ISO3 code.
Args:
geo_info (list[str] or tuple(str) or str or None): element of geographic specifier
layer (str): layer of geographic information
data (pandas.DataFrame):
Index
reset index
Columns
- (str): column defined by @country if @country is not None
"""
if layer != self._country or geo_info is None or set(geo_info).issubset(data[layer].unique()):
return geo_info
return self._to_iso3(geo_info)