Source code for covsirphy.gis.gis

import contextlib
from pathlib import Path
try:
    import geopandas as gpd
except DeprecationWarning:
    import warnings
    with warnings.catch_warnings():
        # Issue #1819
        warnings.simplefilter("ignore", category=DeprecationWarning)
        import geopandas as gpd
from matplotlib import pyplot as plt
import pandas as pd
from covsirphy.util.error import NotRegisteredError, SubsetNotFoundError
from covsirphy.util.validator import Validator
from covsirphy.util.term import Term
from covsirphy.gis._subset import _SubsetManager
from covsirphy.gis._layer import _LayerAdjuster
from covsirphy.gis._geometry import _Geometry
from covsirphy.gis._choropleth import _ChoroplethMap


class GIS(Term):
    """Class of geographic information system to handle geo-spatial time-series data.

    Args:
        layers (list[str] or None): list of layers of geographic information or None (["ISO3", "Province", "City"])
        country (str or None): layer name of countries or None (countries are not included in the layers)
        date (str): column name of observation dates
        **kwargs: accepted but not used by this constructor

    Raises:
        ValueError: @layers has duplicates

    Note:
        Country level data specified with @country will be stored with ISO3 codes.
    """

    def __init__(self, layers=None, country="ISO3", date="Date", **kwargs):
        # Countries will be specified with ISO3 codes and this requires conversion
        self._country = None if country is None else str(country)
        # Location data
        self._layers = Validator(layers or [self._country, self.PROVINCE, self.CITY], "layers").sequence()
        # Date column
        self._date = str(date)
        # Layer adjuster
        self._adjuster = _LayerAdjuster(layers=self._layers, country=self._country, date=self._date)
        # Flipped to False by the first successful call of .register()
        self._un_registered = True

    def all(self, variables=None, errors="raise"):
        """Return all available data.

        Args:
            variables (list[str] or None): list of variables to collect or None (all available variables)
            errors (str): 'raise' or 'coerce'

        Raises:
            NotRegisteredError: No records have been registered yet

        Returns:
            pandas.DataFrame:
                Index
                    reset index
                Columns
                    - (pandas.Category): columns defined by covsirphy.GIS(layers)
                    - (pandas.Timestamp): observation dates, column defined by covsirphy.GIS(date)
                    - columns defined by @variables
        """
        if self._un_registered and errors == "raise":
            # Same call style as .layer() and .subset(): the method to run first, then the details
            raise NotRegisteredError("GIS.register()", details="No records have been registered yet.")
        df = self._adjuster.all(variables=variables)
        return df.astype(dict.fromkeys(self._layers, "category"))

    def citations(self, variables=None):
        """Return citation list of the secondary data sources.

        Args:
            variables (list[str] or None): list of variables to collect or None (all available variables)

        Returns:
            list[str]: citation list
        """
        return self._adjuster.citations(variables=variables)

    def register(self, data, layers=None, date="Date", variables=None, citations=None, convert_iso3=True, **kwargs):
        """Register new data.

        Args:
            data (pandas.DataFrame): new data
                Index
                    reset index
                Columns
                    - columns defined by @layers
                    - column defined by @date
                    - columns defined by @variables
            layers (list[str] or None): layers of the data or None (as the same as covsirphy.GIS(layer))
            date (str): column name of observation dates of the data
            variables (list[str] or None): list of variables to add or None (all available columns)
            citations (list[str] or str or None): citations of the dataset or None (["my own dataset"])
            convert_iso3 (bool): whether convert country names to ISO3 codes or not
            **kwargs: keyword arguments of pandas.to_datetime() including "dayfirst (bool): whether date format is DD/MM or not"

        Raises:
            ValueError: @data_layers has duplicates

        Returns:
            covsirphy.GIS: self
        """
        self._adjuster.register(
            data=data, layers=layers, date=date, variables=variables, citations=citations,
            convert_iso3=convert_iso3, **Validator(kwargs, "keyword arguments").kwargs(pd.to_datetime))
        self._un_registered = False
        return self

    def layer(self, geo=None, start_date=None, end_date=None, variables=None, errors="raise"):
        """Return the data at the selected layer in the date range.

        Args:
            geo (tuple(list[str] or tuple(str) or str) or str or None): location names to specify the layer or None (the top level)
            start_date (str or None): start date, like 22Jan2020
            end_date (str or None): end date, like 01Feb2020
            variables (list[str] or None): list of variables to add or None (all available columns)
            errors (str): whether raise errors or not, 'raise' or 'coerce'

        Raises:
            TypeError: @geo has un-expected types
            ValueError: the length of @geo is larger than the length of layers
            NotRegisteredError: No records have been registered at the layer yet

        Returns:
            pandas.DataFrame:
                Index:
                    reset index
                Columns
                    - (str): columns defined by covsirphy.GIS(layers)
                    - (pandas.Timestamp): observation dates, column defined by covsirphy.GIS(date)
                    - columns defined by @variables

        Note:
            Note that records with NAs as country names will be always removed.

        Note:
            When `geo=None` or `geo=(None,)`, returns country-level data, assuming we have country/province/city as layers here.

        Note:
            When `geo=("Japan",)` or `geo="Japan"`, returns province-level data in Japan.

        Note:
            When `geo=(["Japan", "UK"],)`, returns province-level data in Japan and UK.

        Note:
            When `geo=("Japan", "Kanagawa")`, returns city-level data in Kanagawa/Japan.

        Note:
            When `geo=("Japan", ["Tokyo", "Kanagawa"])`, returns city-level data in Tokyo/Japan and Kanagawa/Japan.
        """
        # Get all data
        if self._un_registered and errors == "raise":
            raise NotRegisteredError("GIS.register()", details="No records have been registered yet")
        data = self._adjuster.all(variables=variables)
        # Filter with geo
        geo_converted = self._parse_geo(geo=geo, data=data)
        manager = _SubsetManager(layers=self._layers)
        df = manager.layer(data=data, geo=geo_converted)
        if df.empty and errors == "raise":
            raise NotRegisteredError("GIS.register()", details="No records have been registered at the layer yet")
        # Filter with date
        series = df[self._date].copy()
        start = Validator(start_date).date(default=series.min())
        end = Validator(end_date).date(default=series.max())
        df = df.loc[(df[self._date] >= start) & (df[self._date] <= end)]
        if df.empty and errors == "raise":
            raise NotRegisteredError(
                "GIS.register()", details=f"No records have been registered at the layer yet from {start_date} to {end_date}")
        # Get representative records for dates
        df = df.groupby([*self._layers, self._date], dropna=True, observed=True).first()
        return df.reset_index().convert_dtypes()

    def to_geopandas(self, geo=None, on=None, variables=None, directory=None, natural_earth=None):
        """Add geometry information with GeoJSON file of "Natural Earth" GitHub repository to data.

        Args:
            geo (tuple(list[str] or tuple(str) or str) or str or None): location names to specify the layer or None (the top level)
            on (str or None): the date, like 22Jan2020, or None (the last date of each location)
            variables (list[str] or None): list of variables to add or None (all available columns)
            directory (list[str] or tuple(str) or str): top directory name(s) to save GeoJSON files or None (directory of this script)
            natural_earth (str or None): title of GeoJSON file (without extension) of "Natural Earth" GitHub repository or None (automatically determined)

        Raises:
            ValueError: country layer is not included in the dataset

        Returns:
            geopandas.GeoDataFrame:
                Index:
                    - reset index
                Columns:
                    - (str): layer focused on with @gis and GIS.layer()
                    - (pandas.Timestamp): observation dates, column defined by covsirphy.GIS(date)
                    - geometry: geometric information

        Note:
            Regarding @geo argument, please refer to covsirphy.GIS.layer().

        Note:
            GeoJSON files are listed in https://github.com/nvkelso/natural-earth-vector/tree/master/geojson
            https://www.naturalearthdata.com/
            https://github.com/nvkelso/natural-earth-vector
            Natural Earth (Free vector and raster map data at naturalearthdata.com, Public Domain)
        """
        if self._country not in self._layers:
            raise ValueError("This cannot be done because country layer is not included in the dataset.")
        df = self.layer(geo=geo, variables=variables)
        if on is None:
            # Keep the latest record of each location
            df = df.sort_values(self._date, ascending=True).groupby(self._layers, observed=True).last().reset_index()
        else:
            df = df.loc[df[self._date] == Validator(on).date()]
        # The deepest layer which has at least one value other than the NA marker
        focused_layer = [layer for layer in self._layers if df[layer][df[layer] != self.NA].nunique() > 0][-1]
        geometry = _Geometry(
            data=df, layer=focused_layer, directory=directory or Path(__file__).with_name("Natural_Earth"))
        iso3 = None if focused_layer == self._country else self._to_iso3(list(df[self._country].unique())[0])
        # Ordered list of labels (rather than an unordered set) of the layers which are not focused on
        other_layers = [layer for layer in self._layers if layer != focused_layer]
        return geometry.to_geopandas(iso3=iso3, natural_earth=natural_earth).drop(other_layers, axis=1)

    def choropleth(self, variable, filename, title="Choropleth map", logscale=True, **kwargs):
        """Create choropleth map.

        Args:
            variable (str): variable name to show
            filename (str or None): filename to save the figure or None (display)
            title (str): title of the map
            logscale (bool): whether convert the value to log10 scale values or not
            kwargs: keyword arguments of the following classes and methods.

                - covsirphy.GIS.to_geopandas() except for @variables,
                - matplotlib.pyplot.savefig(), matplotlib.pyplot.legend(), and
                - pandas.DataFrame.plot()
        """
        v = Validator(kwargs, "keyword arguments")
        gdf = self.to_geopandas(variables=[variable], **v.kwargs(functions=GIS.to_geopandas, default=None))
        # .to_geopandas() drops the other layers, so the first remaining layer column is the focused one
        focused_layer = [layer for layer in self._layers if layer in gdf.columns][0]
        gdf.rename(columns={focused_layer: "Location", variable: "Variable"}, inplace=True)
        with _ChoroplethMap(filename=filename, **v.kwargs(functions=plt.savefig, default=None)) as cm:
            cm.title = str(title)
            cm.plot(data=gdf, logscale=logscale, **v.kwargs(functions=gpd.GeoDataFrame.plot, default=None))

    def subset(self, geo=None, start_date=None, end_date=None, variables=None, errors="raise"):
        """Return subset of the location and date range.

        Args:
            geo (tuple(list[str] or tuple(str) or str) or str or None): location names to filter or None (total at the top level)
            start_date (str or None): start date, like 22Jan2020
            end_date (str or None): end date, like 01Feb2020
            variables (list[str] or None): list of variables to add or None (all available columns)
            errors (str): whether raise errors or not, 'raise' or 'coerce'

        Raises:
            TypeError: @geo has un-expected types
            ValueError: the length of @geo is larger than the length of layers
            NotRegisteredError: No records have been registered yet
            SubsetNotFoundError: no records were found for the country and @errors is 'raise'

        Returns:
            pandas.DataFrame:
                Index:
                    reset index
                Columns
                    - (pandas.Timestamp): observation dates, column defined by covsirphy.GIS(date)
                    - columns defined by @variables

        Note:
            Note that records with NAs as country names will be always removed.

        Note:
            When `geo=None` or `geo=(None,)`, returns global scale records (total values of all country-level data),
            assuming we have country/province/city as layers here.

        Note:
            When `geo=("Japan",)` or `geo="Japan"`, returns country-level data in Japan.

        Note:
            When `geo=(["Japan", "UK"],)`, returns country-level data of Japan and UK.

        Note:
            When `geo=("Japan", "Tokyo")`, returns province-level data of Tokyo/Japan.

        Note:
            When `geo=("Japan", ["Tokyo", "Kanagawa"])`, returns total values of province-level data of Tokyo/Japan and Kanagawa/Japan.

        Note:
            When `geo=("Japan", "Kanagawa", "Yokohama")`, returns city-level data of Yokohama/Kanagawa/Japan.

        Note:
            When `geo=("Japan", "Kanagawa", ["Yokohama", "Kawasaki"])`, returns total values of city-level data of
            Yokohama/Kanagawa/Japan and Kawasaki/Kanagawa/Japan.
        """
        # Get all data
        if self._un_registered and errors == "raise":
            raise NotRegisteredError("GIS.register()", details="No records have been registered yet.")
        data = self._adjuster.all(variables=variables)
        # Filter with geo
        geo_converted = self._parse_geo(geo=geo, data=data)
        manager = _SubsetManager(layers=self._layers)
        df = manager.filter(data=data, geo=geo_converted)
        if df.empty and errors == "raise":
            raise SubsetNotFoundError(geo=geo)
        # Filter with date
        series = df[self._date].copy()
        start = Validator(start_date).date(default=series.min())
        end = Validator(end_date).date(default=series.max())
        df = df.loc[df[self._date].between(start, end)]
        if df.empty and errors == "raise":
            raise SubsetNotFoundError(geo=geo, start_date=start_date, end_date=end_date)
        # Calculate total value if geo=None
        if geo is None or geo[0] is None:
            variables_agg = list(set(df.columns) - {*self._layers, self._date})
            df = df.pivot_table(values=variables_agg, index=self._date, columns=self._layers[0], aggfunc="last")
            # Carry the last known values forward so that every location has a value on every date
            df = df.ffill().fillna(0).stack().reset_index()
        # Get representative records for dates
        with contextlib.suppress(IndexError, KeyError):
            # The lower layers may be absent already (e.g. after the pivot above)
            df = df.drop(self._layers[1:], axis=1)
        df = df.groupby([self._layers[0], self._date], dropna=True, observed=True).first().reset_index(level=self._date)
        return df.groupby(self._date, as_index=False).sum().convert_dtypes()

    @classmethod
    def area_name(cls, geo=None):
        """Return area name of the geographic information, like 'Japan', 'Tokyo/Japan', 'Japan_UK', 'the world'.

        Args:
            geo (tuple(list[str] or tuple(str) or str) or str or None): location names

        Returns:
            str: area name
        """
        if geo is None or geo[0] is None:
            return "the world"
        names = [
            info if isinstance(info, str) else "_".join(info)
            for info in ([geo] if isinstance(geo, str) else geo)]
        # The lowest layer comes first, like 'Tokyo/Japan'
        return cls.SEP.join(names[::-1])

    def _parse_geo(self, geo, data):
        """Parse geographic specifier.

        Args:
            geo (tuple(list[str] or tuple(str) or str) or str or None): location names
            data (pandas.DataFrame):
                Index
                    reset index
                Columns
                    - (str): column defined by @country (of covsirphy.GIS) if @country is not None

        Raises:
            ValueError: the length of @geo is larger than the length of layers

        Returns:
            list[list[str] or tuple(str) or str or None] or None: parsed location names
        """
        if geo is None:
            return geo
        specifiers = [geo] if isinstance(geo, str) else list(geo)
        # Check the length here: indexing the layers by position would raise IndexError instead
        if len(specifiers) > len(self._layers):
            raise ValueError(
                f"The length of @geo ({len(specifiers)}) is larger than the length of layers ({len(self._layers)}).")
        return [self._info_to_iso3(info, layer, data) for info, layer in zip(specifiers, self._layers)]

    def _info_to_iso3(self, geo_info, layer, data):
        """Convert a element of geographic specifier to ISO3 code.

        Args:
            geo_info (list[str] or tuple(str) or str or None): element of geographic specifier
            layer (str): layer of geographic information
            data (pandas.DataFrame):
                Index
                    reset index
                Columns
                    - (str): column defined by @country if @country is not None

        Returns:
            list[str] or tuple(str) or str or None: @geo_info as-is when @layer is not the country layer,
            @geo_info is None or all names are found in the data, otherwise the output of self._to_iso3()
        """
        if layer != self._country or geo_info is None:
            return geo_info
        # A plain string is one name: set("Japan") would split it into characters
        names = [geo_info] if isinstance(geo_info, str) else geo_info
        if set(names).issubset(data[layer].unique()):
            return geo_info
        return self._to_iso3(geo_info)