# Source code for urbanpy.download.download

from typing import Optional, Tuple, Union
from warnings import warn

import geopandas as gpd
import numpy as np
import osmnx as ox
import pandas as pd
import requests
from geopandas import GeoDataFrame, GeoSeries
from hdx.api.configuration import Configuration
from hdx.data.dataset import Dataset
from pandas import DataFrame
from shapely.geometry import MultiPolygon, Polygon

from urbanpy.utils import (
    HDX_POPULATION_TYPES,
    get_hdx_label,
    overpass_to_gdf,
    to_overpass_query,
)

# Names exported by a wildcard import of this module.
__all__ = [
    "nominatim_osm",
    "overpass_pois",
    "overpass",
    "osmnx_graph",
    "search_hdx_dataset",
    "get_hdx_dataset",
    "hdx_fb_population",
    "hdx_dataset",
]

# HDX configuration created once, at import time: read-only access to the
# "prod" HDX site with "urbanpy" as the user agent.
hdx_config = Configuration.create(
    hdx_site="prod", user_agent="urbanpy", hdx_read_only=True
)



[docs]
def nominatim_osm(
    query: str, expected_position: Optional[int] = 0, email: str = ""
) -> GeoDataFrame:
    """
    Download OpenStreetMaps data for a specific city.

    Parameters
    ----------
    query: str
        Query string for OSM data to be downloaded (e.g. "Lima, Peru").
    expected_position: int or None
        Expected position of the polygon data within the Nominatim results.
        Default 0 returns the first result.
        If set to None, all the results are returned.
    email: str
        Contact email sent to Nominatim together with the request. It is
        required: calling the function without it raises ``ValueError``.

    Returns
    -------
    gdf: GeoDataFrame
        GeoDataFrame (EPSG:4326) with the fetched OSM data. When
        ``expected_position`` is an int, a one-row frame is returned (or an
        empty one if Nominatim returned fewer results than that position).

    Raises
    ------
    ValueError
        If ``email`` is empty or None.
    requests.HTTPError
        If Nominatim answers with an HTTP error status.
    requests.Timeout
        If Nominatim stops sending data for more than 60 seconds.

    Examples
    --------
    >>> lima = nominatim_osm('Lima, Peru', 2, email='you@example.com')
    >>> lima.head()
    geometry     | place_id  | osm_type | osm_id     | display_name | place_rank  |  category | type           | importance | icon
    MULTIPOLYGON | 235480647 | relation | 1944670.0  | Lima, Peru   | 12          |  boundary | administrative | 0.703484   | https://nominatim.openstreetmap.org/images/map...
    """
    # `not email` also rejects None, which would otherwise be silently
    # dropped from the query string by requests.
    if not email:
        raise ValueError("Please provide an email to avoid violating Nominatim API rules.")

    osm_url = "https://nominatim.openstreetmap.org/search.php"
    osm_parameters = {
        "q": query,
        "polygon_geojson": "1",
        "format": "geojson",
        "email": email,
    }

    # Without a timeout requests can block forever on a stalled connection.
    response = requests.get(osm_url, params=osm_parameters, timeout=60)
    # Fail with the real HTTP error instead of an obscure JSON/KeyError below.
    response.raise_for_status()

    all_results = response.json()
    gdf = gpd.GeoDataFrame.from_features(all_results["features"], crs="EPSG:4326")
    if expected_position is None:
        return gdf

    # Slice (rather than index) so the result is always a GeoDataFrame.
    return gdf.iloc[expected_position : expected_position + 1]




[docs]
def overpass_pois(bounds, facilities=None, custom_query=None):
    """
    Download POIs using Overpass API.

    Parameters
    ----------
    bounds: array_like
        Input bounds for query. Follows [minx,miny,maxx,maxy] pattern. Always
        required: it is sent as the ``bbox`` request parameter, also when a
        custom query is given.
    facilities: str. One of {'food', 'health', 'education', 'finance'}
        Type of facilities to download according to HOTOSM types. Based on this
        a different type of query is constructed. Ignored if 'custom_query'
        is given.
    custom_query: str (Optional). Default None.
        String with custom Overpass QL query (See https://wiki.openstreetmap.org/wiki/Overpass_API/Language_Guide).
        If this parameter is different than None, the 'facilities' value is
        ignored and the query is sent as is.

    Returns
    -------
    gdf: GeoDataFrame
        POIs from the selected type of facility. If 'custom_query' is given
        response is returned instead of gdf. If the query matches nothing an
        empty GeoDataFrame is returned.
    response: request.Response
        Returned only if 'custom_query' is given. Contains the server's response
        to the HTTP request from the Overpass API Server.

    Raises
    ------
    KeyError
        If 'custom_query' is None and 'facilities' is not one of the
        supported facility types.
    requests.HTTPError
        If the facilities query is answered with an HTTP error status.

    Examples
    --------
    >>> lima = nominatim_osm('Lima, Peru', 2, email='you@example.com')
    >>> urbanpy.download.overpass_pois(lima.total_bounds, 'health')
    type | id         | lat       | lon        | tags                                              | geometry                   | poi_type
    node | 367826732  | -0.944005 | -80.733941 | {'amenity': 'pharmacy', 'name': 'Fybeca'}         | POINT (-80.73394 -0.94401) | pharmacy
    node | 367830051  | -0.954086 | -80.742420 | {'amenity': 'hospital', 'emergency': 'yes', 'n... | POINT (-80.74242 -0.95409) | hospital
    node | 367830065  | -0.954012 | -80.741554 | {'amenity': 'hospital', 'name': 'Clínica del S... | POINT (-80.74155 -0.95401) | hospital
    node | 367830072  | -0.953488 | -80.740739 | {'amenity': 'hospital', 'name': 'Clínica Cente... | POINT (-80.74074 -0.95349) | hospital
    node | 3206491590 | -1.040708 | -80.665107 | {'amenity': 'hospital', 'name': 'Clínica Monte... | POINT (-80.66511 -1.04071) | hospital
    """
    minx, miny, maxx, maxy = bounds
    bbox_string = f"{minx},{miny},{maxx},{maxy}"
    overpass_url = "https://overpass-api.de/api/interpreter"

    if custom_query is not None:
        # Custom queries are passed through untouched and the raw response is
        # handed back; no client timeout is imposed because the query may
        # declare its own (arbitrarily long) server-side timeout.
        return requests.get(
            overpass_url, params={"data": custom_query, "bbox": bbox_string}
        )

    facilities_opt = {
        "food": 'node["amenity"="marketplace"];\nnode["shop"~"supermarket|kiosk|mall|convenience|butcher|greengrocer"];',
        "health": 'node["amenity"~"doctors|dentist|clinic|hospital|pharmacy"];',
        "education": 'node["amenity"~"kindergarten|school|college|university"];',
        "finance": 'node["amenity"~"mobile_money_agent|bureau_de_change|bank|microfinance|atm|sacco|money_transfer|post_office"];',
    }

    overpass_query = f"""
            [timeout:120][out:json][bbox];
            (
                 {facilities_opt[facilities]}
            );
            out body geom;
            """

    # Request data. The client timeout is kept above the 120 s server-side
    # timeout declared in the query so the server gets the chance to answer.
    response = requests.get(
        overpass_url,
        params={"data": overpass_query, "bbox": bbox_string},
        timeout=180,
    )
    # Surface rate limiting / server errors instead of a JSON decode failure.
    response.raise_for_status()
    data = response.json()

    df = pd.DataFrame.from_dict(data["elements"])
    if df.empty:
        # No POIs matched: provide the columns used below so an empty
        # GeoDataFrame is returned instead of failing with KeyError: 'lon'.
        df = pd.DataFrame(columns=["type", "id", "lat", "lon", "tags"])

    df_geom = gpd.points_from_xy(df["lon"], df["lat"])
    gdf = gpd.GeoDataFrame(df, geometry=df_geom, crs="EPSG:4326")

    # np.nan (lowercase) is used because the np.NaN alias no longer exists
    # in NumPy 2.0.
    gdf["poi_type"] = gdf["tags"].apply(lambda tag: tag.get("amenity", np.nan))

    if facilities == "food":
        # Food facilities also have their POI type within the shop tag (see query)
        also_poi_type = gdf["tags"].apply(lambda tag: tag.get("shop", np.nan))
        gdf["poi_type"] = gdf["poi_type"].fillna(also_poi_type)

    return gdf



[docs]
def overpass(
    type_of_data: str,
    query: dict,
    mask: Union[GeoDataFrame, GeoSeries, Polygon, MultiPolygon],
) -> Tuple[GeoDataFrame, Optional[DataFrame]]:
    """
    Download geographic data using Overpass API.

    Parameters
    ----------
    type_of_data : str
        One of {'node', 'way', 'relation', 'rel'}. OSM Data structure to be queried from Overpass API.
    query : dict
        Dict containing OSM tag filters. Dict keys can take OSM tags and Dict values can be a list of strings, str, or None.
        Example: { 'key0': ['v0a', 'v0b', 'v0c'], 'key1': 'v1', 'key2': None }
        Check keys [OSM Map Features](https://wiki.openstreetmap.org/wiki/Map_features).
    mask : GeoDataFrame, GeoSeries, Polygon, or MultiPolygon
        Total bounds of mask to be used for the query.

    Returns
    -------
    gdf : GeoDataFrame
        POIs from the selected type of facility.
    df : DataFrame
        Relations metadata such as ID and tags. Returns None if 'type_of_data' is other than 'relation'.

    Notes
    -----
    If the server's answer cannot be parsed as JSON (e.g. an HTML error page
    caused by rate limiting), a warning is emitted and the raw
    ``requests.Response`` is returned instead of the tuple described above,
    so the caller can inspect it.
    """
    # GeoDataFrame/GeoSeries expose `total_bounds`; plain shapely geometries
    # only expose `bounds`. Both yield (minx, miny, maxx, maxy).
    if hasattr(mask, "total_bounds"):
        minx, miny, maxx, maxy = mask.total_bounds
    else:
        minx, miny, maxx, maxy = mask.bounds
    bbox_string = f"{minx},{miny},{maxx},{maxy}"

    # Request data
    overpass_url = "https://overpass-api.de/api/interpreter"
    params = {
        # Parse query dict to build Overpass QL query
        "data": to_overpass_query(type_of_data, query),
        "bbox": bbox_string,
    }
    response = requests.get(overpass_url, params=params)

    try:
        data = response.json()
    except ValueError as err:
        # JSON decoding errors are ValueError subclasses. Keep the
        # best-effort contract: report the problem and return the response.
        warn(
            f"Overpass API response could not be parsed as JSON ({err}). "
            f"HTTP status: {response.status_code} {response.reason}. "
            "Returning the raw response.",
            stacklevel=2,
        )
        return response

    # Keys used in the query (e.g. "amenity", "shop", etc). Dict keys are
    # already unique; list() keeps them in a deterministic (insertion) order.
    ov_keys = list(query)

    return overpass_to_gdf(type_of_data, data, mask, ov_keys)




[docs]
def osmnx_graph(
    download_type: str,
    network_type="drive",
    query_str=None,
    geom=None,
    distance=None,
    **kwargs,
):
    """
    Download a graph from OSM using osmnx.

    Parameters
    ----------
    download_type: str. One of {'polygon', 'place', 'point'}
        Input download type. If polygon, the geom parameter must be provided
        as a Shapely Polygon or MultiPolygon.
    network_type: str. One of {'drive', 'drive_service', 'walk', 'bike', 'all', 'all_private'}
        Network type to download. Defaults to drive.
    query_str: str (Optional).
        Only required for place type downloads. Query string to download a network.
    geom: Shapely Polygon/MultiPolygon or Point (Optional).
        Polygon required for polygon type downloads, Point for point downloads.
        Polygons are used as bounds for network download, points as the center
        with a distance buffer. The value is handed to osmnx unchanged.
    distance: int
        Distance in meters to use as buffer from a point to download the network.
    **kwargs
        Extra keyword arguments forwarded to the underlying osmnx function
        (``graph_from_polygon``, ``graph_from_point`` or ``graph_from_place``).

    Returns
    -------
    G: networkx.MultiDiGraph
        Requested graph with simplified geometries. If the arguments required
        by the chosen download type are missing or unsupported, a message is
        printed and None is returned.

    Examples
    --------
    >>> poly = urbanpy.download.nominatim_osm('San Isidro, Peru', email='you@example.com')
    >>> G = urbanpy.download.osmnx_graph('polygon', geom=poly.iloc[0]['geometry'])
    >>> G
    <networkx.classes.multidigraph.MultiDiGraph at 0x1a2ba08150>
    """
    if download_type == "polygon":
        if geom is None:
            print("Please provide a polygon to download a network from.")
            return None
        # osmnx accepts both Polygon and MultiPolygon; anything else used to
        # fall through silently and return None without any hint.
        if getattr(geom, "geom_type", None) not in ("Polygon", "MultiPolygon"):
            print("Please provide a Shapely Polygon or MultiPolygon as geom.")
            return None
        return ox.graph_from_polygon(geom, network_type=network_type, **kwargs)

    if download_type == "place":
        if query_str is None:
            print("Please provide a query string to download a network from.")
            return None
        return ox.graph_from_place(query_str, network_type=network_type, **kwargs)

    if download_type == "point":
        if geom is not None and distance is not None:
            return ox.graph_from_point(
                geom, dist=distance, network_type=network_type, **kwargs
            )
        # Report every missing argument, not just the first one.
        if distance is None:
            print("Please provide a distance buffer for the point download")
        if geom is None:
            print("Please provide a Point geometry.")
        return None

    print(
        f"Unknown download_type {download_type!r}. "
        "Use one of 'polygon', 'place' or 'point'."
    )
    return None




[docs]
def search_hdx_dataset(
    country: str,
    repository="high-resolution-population-density-maps-demographic-estimates",
):
    """
    Dataset search within HDX repositories. Defaults to population density maps.

    Parameters
    ----------
    country: str
        Country to search datasets. It is lower-cased before searching.

    repository: str
        Repository name within the HDX database. Together with the country it
        forms the dataset title searched for (``"<country>-<repository>"``).

    Returns
    -------
    datasets: DataFrame or None
        DataFrame of available CSV resources within HDX with the columns
        'created', 'name', 'size_mb' and 'url'. The 'population' column is
        only included for the default population density repository.
        If the search returns no resources, "No datasets found" is printed
        and None is returned.

    Examples
    --------
    >>> resources_df = urbanpy.download.search_hdx_dataset('peru')
    >>> resources_df
       | created    | name                                              | population                                | size_mb | url
    id |            |                                                   |                                           |         |
    0  | 2019-06-11 | population_per_2018-10-01.csv.zip                 | Overall population density                | 19.36   | https://data.humdata.org/dataset/4e74db39-87f1...
    2  | 2019-06-11 | PER_children_under_five_2019-06-01_csv.zip        | Children (ages 0-5)                       | 16.60   | https://data.humdata.org/dataset/4e74db39-87f1...
    4  | 2019-06-11 | PER_elderly_60_plus_2019-06-01_csv.zip            | Elderly (ages 60+)                        | 16.59   | https://data.humdata.org/dataset/4e74db39-87f1...
    6  | 2019-06-11 | PER_men_2019-06-01_csv.zip                        | Men                                       | 16.64   | https://data.humdata.org/dataset/4e74db39-87f1...
    8  | 2019-06-11 | PER_women_2019-06-01_csv.zip                      | Women                                     | 16.63   | https://data.humdata.org/dataset/4e74db39-87f1...
    10 | 2019-06-11 | PER_women_of_reproductive_age_15_49_2019-06-01... | Women of reproductive age (ages 15-49)    | 16.62   | https://data.humdata.org/dataset/4e74db39-87f1...
    12 | 2019-06-11 | PER_youth_15_24_2019-06-01_csv.zip                | Youth (ages 15-24)                        | 16.61   | https://data.humdata.org/dataset/4e74db39-87f1...
    """
    # Get dataset list
    datasets = Dataset.search_in_hdx(f"title:{country.lower()}-{repository}")

    resources_records = Dataset.get_all_resources(datasets)
    resources_df = pd.DataFrame.from_records(resources_records)
    if resources_df.shape[0] == 0:
        print("No datasets found")
        return None

    # Keep only CSV resources. na=False treats a missing download URL as
    # "not a CSV" instead of producing an invalid boolean mask.
    is_csv = resources_df["download_url"].str.contains("csv", regex=False, na=False)
    resources_csv_df = resources_df[is_csv]

    resources_csv_df = resources_csv_df.assign(
        created=pd.to_datetime(resources_csv_df["created"]).dt.date,
        size_mb=(resources_csv_df["size"] / 2**20).round(2),  # bytes -> MiB
    )

    columns = ["created", "name", "size_mb", "url"]
    if repository == "high-resolution-population-density-maps-demographic-estimates":
        # The population label is only derived for this repository, so the
        # column is only selected in this case (selecting it unconditionally
        # raised KeyError for every other repository).
        resources_csv_df = resources_csv_df.assign(
            population=resources_csv_df["name"].apply(get_hdx_label)
        )
        columns.insert(2, "population")

    resources_csv_df.index.name = "id"

    return resources_csv_df[columns]




[docs]
def get_hdx_dataset(resources_df: DataFrame, ids: Union[int, list]) -> DataFrame:
    """
    HDX dataset download.

    Parameters
    ----------
    resources_df: pd.DataFrame
        Resources dataframe returned by search_hdx_dataset. Only its index
        and its 'url' column are used.
    ids: int or list
        IDs (index labels) in the resources dataframe

    Returns
    -------
    data: pd.DataFrame
        The corresponding dataset in DataFrame format. When several IDs are
        given the datasets are concatenated in the given order.

    Raises
    ------
    ValueError
        If ``ids`` selects no resource at all (e.g. an empty list).
    KeyError
        If an ID is not present in the index of ``resources_df``.

    Examples
    --------
    >>> resources_df = urbanpy.download.search_hdx_dataset('peru')
    >>> population_df = urbanpy.download.get_hdx_dataset(resources_df, 0)
    >>> population_df
    latitude   | longitude  | population_2015 | population_2020
    -18.339306 | -70.382361 | 11.318147       | 12.099885
    -18.335694 | -70.393750 | 11.318147       | 12.099885
    -18.335694 | -70.387361 | 11.318147       | 12.099885
    -18.335417 | -70.394028 | 11.318147       | 12.099885
    -18.335139 | -70.394306 | 11.318147       | 12.099885

    """
    selected = resources_df.loc[ids, "url"]

    # A scalar label yields a single URL that can be read directly.
    if not isinstance(selected, pd.Series):
        return pd.read_csv(selected)

    # A list of labels yields a Series of URLs, also when it holds a single
    # element; pd.read_csv cannot read a Series, so unpack it first.
    urls = selected.tolist()
    if not urls:
        raise ValueError("No resources selected: 'ids' does not match any dataset.")
    if len(urls) == 1:
        return pd.read_csv(urls[0])
    return pd.concat([pd.read_csv(url) for url in urls])




[docs]
def hdx_fb_population(country, map_type):
    """
    Download population density maps from Facebook HDX.

    Parameters
    ----------
    country: str. One of {'argentina', 'bolivia', 'brazil', 'chile', 'colombia', 'ecuador', 'paraguay', 'peru', 'uruguay'}
        Input country to download data from.
    map_type: str. One of {'full', 'children', 'youth', 'elderly'}
        Input population map to download. Must be a key of
        ``HDX_POPULATION_TYPES``; the older name 'full' is accepted as an
        alias of 'overall'.

    Returns
    -------
    population: DataFrame
        DataFrame with lat, lon, and population columns. Coordinates are in
        EPSG 4326.

    Raises
    ------
    ValueError
        If no dataset is found for the country, or none of the found
        resources matches the requested map type.
    KeyError
        If ``map_type`` is not a key of ``HDX_POPULATION_TYPES``.

    Examples
    --------
    >>> urbanpy.download.hdx_fb_population('peru', 'full')
    latitude   | longitude  | population_2015 | population_2020
    -18.339306 | -70.382361 | 11.318147       | 12.099885
    -18.335694 | -70.393750 | 11.318147       | 12.099885
    -18.335694 | -70.387361 | 11.318147       | 12.099885
    -18.335417 | -70.394028 | 11.318147       | 12.099885
    -18.335139 | -70.394306 | 11.318147       | 12.099885
    """
    resources_df = search_hdx_dataset(country)
    # search_hdx_dataset returns None when nothing is found; fail with a
    # clear message instead of a TypeError on the lookup below.
    if resources_df is None:
        raise ValueError(f"No HDX population datasets found for country {country!r}.")

    # Rename older "full" map_type to "overall"
    if map_type == "full":
        map_type = "overall"

    # Get correct dataset/s index
    population_label = HDX_POPULATION_TYPES[map_type]
    dataset_ix = resources_df[
        resources_df["population"] == population_label
    ].index.tolist()

    if not dataset_ix:
        raise ValueError(
            f"No {map_type!r} population dataset found for country {country!r}."
        )

    # A single match is passed as a scalar id so exactly one CSV is read;
    # several matches are passed as a list and concatenated.
    if len(dataset_ix) == 1:
        return get_hdx_dataset(resources_df, dataset_ix[0])
    return get_hdx_dataset(resources_df, dataset_ix)



# Added for backwards compatibility

[docs]
def hdx_dataset(resource):
    """
    Read a single HDX resource into a DataFrame (legacy entry point).

    The resource is addressed by the part of its URL that follows
    "https://data.humdata.org/dataset/". CSV (.csv) and zipped CSV
    (.csv.zip) files are the allowed formats.

    For example: '4e74db39-87f1-4383-9255-eaf8ebceb0c9/resource/317f1c39-8417-4bde-a076-99bd37feefce/download/population_per_2018-10-01.csv.zip'.

    Parameters
    ----------
    resource: str
        Address of the HDX dataset resource, relative to the base URL above.
        Every dataset lives under a different resource id, so only the base
        URL can be supplied by this library.

    Returns
    -------
    dataset: DataFrame
        Contains the requested HDX dataset resource.

    Warns
    -----
    DeprecationWarning
        Always emitted; search_hdx_dataset and get_hdx_dataset are the
        suggested replacements.

    Examples
    --------
    >>> hdx_data = hdx_dataset('4e74db39-87f1-4383-9255-eaf8ebceb0c9/resource/317f1c39-8417-4bde-a076-99bd37feefce/download/population_per_2018-10-01.csv.zip')
    >>> hdx_data.head()
    latitude   | longitude  | population_2015 | population_2020
    -18.339306 | -70.382361 | 11.318147       | 12.099885
    -18.335694 | -70.393750 | 11.318147       | 12.099885
    -18.335694 | -70.387361 | 11.318147       | 12.099885
    -18.335417 | -70.394028 | 11.318147       | 12.099885
    -18.335139 | -70.394306 | 11.318147       | 12.099885
    """
    # stacklevel=2 attributes the warning to the caller's line.
    warn(
        "This function will be deprecated. Please use search_hdx_dataset and get_hdx_dataset instead.",
        category=DeprecationWarning,
        stacklevel=2,
    )

    return pd.read_csv(f"https://data.humdata.org/dataset/{resource}")