Source code for covsirphy.gis.gis

import contextlib
from pathlib import Path
import geopandas as gpd
from matplotlib import pyplot as plt
import pandas as pd
from covsirphy.util.error import NotRegisteredError, SubsetNotFoundError
from covsirphy.util.validator import Validator
from covsirphy.util.term import Term
from covsirphy.gis._subset import _SubsetManager
from covsirphy.gis._layer import _LayerAdjuster
from covsirphy.gis._geometry import _Geometry
from covsirphy.gis._choropleth import _ChoroplethMap


[docs] class GIS(Term): """Class of geographic information system to handle geo-spatial time-series data. Args: layers (list[str] or None): list of layers of geographic information or None (["ISO3", "Province", "City"]) country (str or None): layer name of countries or None (countries are not included in the layers) date (str): column name of observation dates Raises: ValueError: @layers has duplicates Note: Country level data specified with @country will be stored with ISO3 codes. """ def __init__(self, layers=None, country="ISO3", date="Date", **kwargs): # Countries will be specified with ISO3 codes and this requires conversion self._country = None if country is None else str(country) # Location data self._layers = Validator(layers or [self._country, self.PROVINCE, self.CITY], "layers").sequence() # Date column self._date = str(date) # Layer adjuster self._adjuster = _LayerAdjuster(layers=self._layers, country=self._country, date=self._date) self._un_registered = True
[docs] def all(self, variables=None, errors="raise"): """Return all available data. Args: variables (list[str] or None): list of variables to collect or None (all available variables) errors (str): 'raise' or 'coerce' Raises: NotRegisteredError: No records have been registered yet Returns: pandas.DataFrame: Index reset index Columns - (pandas.Category): columns defined by covsirphy.GIS(layers) - (pandas.Timestamp): observation dates, column defined by covsirphy.GIS(date) - columns defined by @variables """ if self._un_registered and errors == "raise": raise NotRegisteredError("No records have been registered yet.") df = self._adjuster.all(variables=variables) return df.astype(dict.fromkeys(self._layers, "category"))
[docs] def citations(self, variables=None): """ Return citation list of the secondary data sources. Args: variables (list[str] or None): list of variables to collect or None (all available variables) Returns: list[str]: citation list """ return self._adjuster.citations(variables=variables)
[docs] def register(self, data, layers=None, date="Date", variables=None, citations=None, convert_iso3=True, **kwargs): """Register new data. Args: data (pandas.DataFrame): new data Index reset index Columns - columns defined by @layers - column defined by @date - columns defined by @variables layers (list[str] or None): layers of the data or None (as the same as covsirphy.GIS(layer)) date (str): column name of observation dates of the data variables (list[str] or None): list of variables to add or None (all available columns) citations (list[str] or str or None): citations of the dataset or None (["my own dataset"]) convert_iso3 (bool): whether convert country names to ISO3 codes or not **kwargs: keyword arguments of pandas.to_datetime() including "dayfirst (bool): whether date format is DD/MM or not" Raises: ValueError: @data_layers has duplicates Returns: covsirphy.GIS: self """ self._adjuster.register( data=data, layers=layers, date=date, variables=variables, citations=citations, convert_iso3=convert_iso3, **Validator(kwargs, "keyword arguments").kwargs(pd.to_datetime)) self._un_registered = False return self
[docs] def layer(self, geo=None, start_date=None, end_date=None, variables=None, errors="raise"): """Return the data at the selected layer in the date range. Args: geo (tuple(list[str] or tuple(str) or str) or str or None): location names to specify the layer or None (the top level) start_date (str or None): start date, like 22Jan2020 end_date (str or None): end date, like 01Feb2020 variables (list[str] or None): list of variables to add or None (all available columns) errors (str): whether raise errors or not, 'raise' or 'coerce' Raises: TypeError: @geo has un-expected types ValueError: the length of @geo is larger than the length of layers NotRegisteredError: No records have been registered at the layer yet Returns: pandas.DataFrame: Index: reset index Columns - (str): columns defined by covsirphy.GIS(layers) - (pandas.Timestamp): observation dates, column defined by covsirphy.GIS(date) - columns defined by @variables Note: Note that records with NAs as country names will be always removed. Note: When `geo=None` or `geo=(None,)`, returns country-level data, assuming we have country/province/city as layers here. Note: When `geo=("Japan",)` or `geo="Japan"`, returns province-level data in Japan. Note: When `geo=(["Japan", "UK"],)`, returns province-level data in Japan and UK. Note: When `geo=("Japan", "Kanagawa")`, returns city-level data in Kanagawa/Japan. Note: When `geo=("Japan", ["Tokyo", "Kanagawa"])`, returns city-level data in Tokyo/Japan and Kanagawa/Japan. """ # Get all data if self._un_registered and errors == "raise": raise NotRegisteredError("GIS.register()", details="No records have been registered yet") data = self._adjuster.all(variables=variables) # Filter with geo geo_converted = self._parse_geo(geo=geo, data=data) manager = _SubsetManager(layers=self._layers) df = manager.layer(data=data, geo=geo_converted) if df.empty and errors == "raise": raise NotRegisteredError("GIS.register()", details="No records have been registered at the layer yet") # Filter with date series = df[self._date].copy() start = Validator(start_date).date(default=series.min()) end = Validator(end_date).date(default=series.max()) df = df.loc[(df[self._date] >= start) & (df[self._date] <= end)] if df.empty and errors == "raise": raise NotRegisteredError( "GIS.register()", details=f"No records have been registered at the layer yet from {start_date} to {end_date}") # Get representative records for dates df = df.groupby([*self._layers, self._date], dropna=True, observed=True).first() return df.reset_index().convert_dtypes()
[docs] def to_geopandas(self, geo=None, on=None, variables=None, directory=None, natural_earth=None): """Add geometry information with GeoJSON file of "Natural Earth" GitHub repository to data. Args: geo (tuple(list[str] or tuple(str) or str) or str or None): location names to specify the layer or None (the top level) on (str or None): the date, like 22Jan2020, or None (the last date of each location) variables (list[str] or None): list of variables to add or None (all available columns) directory (list[str] or tuple(str) or str): top directory name(s) to save GeoJSON files or None (directory of this this script) natural_earth (str or None): title of GeoJSON file (without extension) of "Natural Earth" GitHub repository or None (automatically determined) Raises: ValueError: country layer is not included in the dataset Returns: geopandas.GeoDataFrame: Index: - reset index Columns: - (str): layer focused on with @gis and GIS.layer() - (pandas.Timestamp): observation dates, column defined by covsirphy.GIS(date) - geometry: geometric information Note: Regarding @geo argument, please refer to covsirphy.GIS.layer(). Note: GeoJSON files are listed in https://github.com/nvkelso/natural-earth-vector/tree/master/geojson https://www.naturalearthdata.com/ https://github.com/nvkelso/natural-earth-vector Natural Earth (Free vector and raster map data at naturalearthdata.com, Public Domain) """ if self._country not in self._layers: raise ValueError("This cannot be done because country layer is not included in the dataset.") df = self.layer(geo=geo, variables=variables) if on is None: df = df.sort_values(self._date, ascending=True).groupby(self._layers, observed=True).last().reset_index() else: df = df.loc[df[self._date] == Validator(on).date()] focused_layer = [layer for layer in self._layers if df[layer][df[layer] != self.NA].nunique() > 0][-1] geometry = _Geometry( data=df, layer=focused_layer, directory=directory or Path(__file__).with_name("Natural_Earth")) iso3 = None if focused_layer == self._country else self._to_iso3(list(df[self._country].unique())[0]) return geometry.to_geopandas(iso3=iso3, natural_earth=natural_earth).drop(set(self._layers) - {focused_layer}, axis=1)
[docs] def choropleth(self, variable, filename, title="Choropleth map", logscale=True, **kwargs): """Create choropleth map. Args: variable (str): variable name to show filename (str or None): filename to save the figure or None (display) title (str): title of the map logscale (bool): whether convert the value to log10 scale values or not kwargs: keyword arguments of the following classes and methods. - covsirphy.GIS.to_geopandas() except for @variables, - matplotlib.pyplot.savefig(), matplotlib.pyplot.legend(), and - pandas.DataFrame.plot() """ v = Validator(kwargs, "keyword arguments") gdf = self.to_geopandas(variables=[variable], **v.kwargs(functions=GIS.to_geopandas, default=None)) focused_layer = [layer for layer in self._layers if layer in gdf.columns][0] gdf.rename(columns={focused_layer: "Location", variable: "Variable"}, inplace=True) with _ChoroplethMap(filename=filename, **v.kwargs(functions=plt.savefig, default=None)) as cm: cm.title = str(title) cm.plot(data=gdf, logscale=logscale, **v.kwargs(functions=gpd.GeoDataFrame.plot, default=None))
[docs] def subset(self, geo=None, start_date=None, end_date=None, variables=None, errors="raise"): """Return subset of the location and date range. Args: geo (tuple(list[str] or tuple(str) or str) or str or None): location names to filter or None (total at the top level) start_date (str or None): start date, like 22Jan2020 end_date (str or None): end date, like 01Feb2020 variables (list[str] or None): list of variables to add or None (all available columns) errors (str): whether raise errors or not, 'raise' or 'coerce' Raises: TypeError: @geo has un-expected types ValueError: the length of @geo is larger than the length of layers NotRegisteredError: No records have been registered yet SubsetNotFoundError: no records were found for the country and @errors is 'raise' Returns: pandas.DataFrame: Index: reset index Columns - (pandas.Timestamp): observation dates, column defined by covsirphy.GIS(date) - columns defined by @variables Note: Note that records with NAs as country names will be always removed. Note: When `geo=None` or `geo=(None,)`, returns global scale records (total values of all country-level data), assuming we have country/province/city as layers here. Note: When `geo=("Japan",)` or `geo="Japan"`, returns country-level data in Japan. Note: When `geo=(["Japan", "UK"],)`, returns country-level data of Japan and UK. Note: When `geo=("Japan", "Tokyo")`, returns province-level data of Tokyo/Japan. Note: When `geo=("Japan", ["Tokyo", "Kanagawa"])`, returns total values of province-level data of Tokyo/Japan and Kanagawa/Japan. Note: When `geo=("Japan", "Kanagawa", "Yokohama")`, returns city-level data of Yokohama/Kanagawa/Japan. Note: When `geo=(("Japan", "Kanagawa", ["Yokohama", "Kawasaki"])`, returns total values of city-level data of Yokohama/Kanagawa/Japan and Kawasaki/Kanagawa/Japan. """ # Get all data if self._un_registered and errors == "raise": raise NotRegisteredError("GIS.register()", details="No records have been registered yet.") data = self._adjuster.all(variables=variables) # Filter with geo geo_converted = self._parse_geo(geo=geo, data=data) manager = _SubsetManager(layers=self._layers) df = manager.filter(data=data, geo=geo_converted) if df.empty and errors == "raise": raise SubsetNotFoundError(geo=geo) # Filter with date series = df[self._date].copy() start = Validator(start_date).date(default=series.min()) end = Validator(end_date).date(default=series.max()) df = df.loc[df[self._date].between(start, end)] if df.empty and errors == "raise": raise SubsetNotFoundError(geo=geo, start_date=start_date, end_date=end_date) # Calculate total value if geo=None if geo is None or geo[0] is None: variables_agg = list(set(df.columns) - {*self._layers, self._date}) df = df.pivot_table(values=variables_agg, index=self._date, columns=self._layers[0], aggfunc="last") df = df.ffill().fillna(0).stack().reset_index() # Get representative records for dates with contextlib.suppress(IndexError, KeyError): df = df.drop(self._layers[1:], axis=1) df = df.groupby([self._layers[0], self._date], dropna=True, observed=True).first().reset_index(level=self._date) return df.groupby(self._date, as_index=False).sum().convert_dtypes()
[docs] @classmethod def area_name(cls, geo=None): """ Return area name of the geographic information, like 'Japan', 'Tokyo/Japan', 'Japan_UK', 'the world'. Args: geo (tuple(list[str] or tuple(str) or str) or str or None): location names Returns: str: area name """ if geo is None or geo[0] is None: return "the world" names = [ info if isinstance(info, str) else "_".join(list(info)) for info in ([geo] if isinstance(geo, str) else geo)] return cls.SEP.join(names[:: -1])
def _parse_geo(self, geo, data): """Parse geographic specifier. Args: geo (tuple(list[str] or tuple(str) or str) or str or None): location names data (pandas.DataFrame): Index reset index Columns - (str): column defined by @country (of covsirphy.GIS) if @country is not None Returns: geo (tuple(list[str] or tuple(str) or str or None) or str or None): parsed location names """ if geo is None: return geo return [self._info_to_iso3(info, self._layers[i], data) for i, info in enumerate([geo] if isinstance(geo, str) else geo)] def _info_to_iso3(self, geo_info, layer, data): """Convert a element of geographic specifier to ISO3 code. Args: geo_info (list[str] or tuple(str) or str or None): element of geographic specifier layer (str): layer of geographic information data (pandas.DataFrame): Index reset index Columns - (str): column defined by @country if @country is not None """ if layer != self._country or geo_info is None or set(geo_info).issubset(data[layer].unique()): return geo_info return self._to_iso3(geo_info)