Source code for urbanpy.download.download

from typing import Optional, Tuple, Union
from warnings import warn

import geopandas as gpd
import numpy as np
import osmnx as ox
import pandas as pd
import requests
from geopandas import GeoDataFrame, GeoSeries
from hdx.api.configuration import Configuration
from hdx.data.dataset import Dataset
from pandas import DataFrame
from shapely.geometry import MultiPolygon, Polygon

from urbanpy.utils import (
    HDX_POPULATION_TYPES,
    get_hdx_label,
    overpass_to_gdf,
    to_overpass_query,
)

# Public names of this module (what ``from <module> import *`` exports).
__all__ = [
    "nominatim_osm",
    "overpass_pois",
    "overpass",
    "osmnx_graph",
    "search_hdx_dataset",
    "get_hdx_dataset",
    "hdx_fb_population",
    "hdx_dataset",
]

# HDX configuration, created once as a side effect of importing this module:
# production site, "urbanpy" user agent, read-only access.
# NOTE(review): presumably this must exist before the ``Dataset`` calls used
# further down in this module can query HDX -- confirm against the
# hdx-python-api documentation.
hdx_config = Configuration.create(
    hdx_site="prod", user_agent="urbanpy", hdx_read_only=True
)


def nominatim_osm(
    query: str, expected_position: "int | None" = 0, email: str = ""
) -> GeoDataFrame:
    """
    Download OpenStreetMaps data for a specific city.

    Parameters
    ----------
    query: str
        Query string for OSM data to be downloaded (e.g. "Lima, Peru").
    expected_position: int 0:n
        Expected position of the polygon data within the Nominatim results.
        Default 0 returns the first result. If set to None, all the results
        are returned. A position past the end of the results yields an empty
        GeoDataFrame.
    email: str
        Contact email sent to Nominatim with the request. Mandatory: an empty
        value raises ``ValueError`` before any request is made.

    Returns
    -------
    gdf: GeoDataFrame
        GeoDataFrame (EPSG:4326) with the fetched OSM data.

    Raises
    ------
    ValueError
        If no email is provided.
    requests.HTTPError
        If Nominatim answers with an HTTP error status.

    Examples
    --------
    >>> lima = nominatim_osm('Lima, Peru', 2, email='you@example.com')
    >>> lima.head()
    geometry     | place_id  | osm_type | osm_id    | display_name | place_rank | category | type           | importance | icon
    MULTIPOLYGON | 235480647 | relation | 1944670.0 | Lima, Peru   | 12         | boundary | administrative | 0.703484   | https://nominatim.openstreetmap.org/images/map...

    """
    if not email:
        raise ValueError(
            "Please provide an email to avoid violating Nominatim API rules."
        )

    osm_url = "https://nominatim.openstreetmap.org/search.php"
    osm_parameters = {
        "q": query,
        "polygon_geojson": "1",
        "format": "geojson",
        "email": email,
    }

    response = requests.get(osm_url, params=osm_parameters)
    # Fail with the real HTTP error instead of a confusing JSON/KeyError later.
    response.raise_for_status()
    all_results = response.json()
    gdf = gpd.GeoDataFrame.from_features(all_results["features"], crs="EPSG:4326")

    if expected_position is None:
        return gdf

    # Slice (instead of indexing) so a single-row GeoDataFrame is returned.
    # For position -1 the naive stop would be 0 and always give an empty
    # slice, so use an open-ended stop in that case.
    stop = expected_position + 1
    return gdf.iloc[expected_position : (stop if stop != 0 else None)]
def overpass_pois(bounds, facilities=None, custom_query=None):
    """
    Download POIs using Overpass API.

    Parameters
    ----------
    bounds: array_like
        Input bounds for query. Follows [minx,miny,maxx,maxy] pattern.
        May be None when 'custom_query' is given.
    facilities: str. One of {'food', 'health', 'education', 'finance'}
        Type of facilities to download according to HOTOSM types. Based on
        this a different type of query is constructed.
    custom_query: str (Optional). Default None.
        String with custom Overpass QL query (See
        https://wiki.openstreetmap.org/wiki/Overpass_API/Language_Guide).
        If this parameter is different than None, 'facilities' is ignored and
        'bounds' (when not None) is only forwarded as the ``bbox`` request
        parameter.

    Returns
    -------
    gdf: GeoDataFrame
        POIs from the selected type of facility (EPSG:4326). Empty when the
        server returns no elements. If 'custom_query' is given response is
        returned instead of gdf.
    response: request.Response
        Returned only if 'custom_query' is given. Contains the server's
        response to the HTTP request from the Overpass API Server.

    Raises
    ------
    KeyError
        If 'facilities' is not one of the supported types and no
        'custom_query' is given.
    requests.HTTPError
        If the facilities query is answered with an HTTP error status.

    Examples
    --------
    >>> lima = nominatim_osm('Lima, Peru', 2)
    >>> urbanpy.download.overpass_pois(lima.total_bounds, 'health')
    type | id         | lat       | lon        | tags                                              | geometry                   | poi_type
    node | 367826732  | -0.944005 | -80.733941 | {'amenity': 'pharmacy', 'name': 'Fybeca'}         | POINT (-80.73394 -0.94401) | pharmacy
    node | 367830051  | -0.954086 | -80.742420 | {'amenity': 'hospital', 'emergency': 'yes', 'n... | POINT (-80.74242 -0.95409) | hospital
    node | 367830065  | -0.954012 | -80.741554 | {'amenity': 'hospital', 'name': 'Clínica del S... | POINT (-80.74155 -0.95401) | hospital
    node | 367830072  | -0.953488 | -80.740739 | {'amenity': 'hospital', 'name': 'Clínica Cente... | POINT (-80.74074 -0.95349) | hospital
    node | 3206491590 | -1.040708 | -80.665107 | {'amenity': 'hospital', 'name': 'Clínica Monte... | POINT (-80.66511 -1.04071) | hospital

    """
    overpass_url = "https://overpass-api.de/api/interpreter"

    if custom_query is not None:
        # Custom queries hand the raw response back to the caller; the bbox
        # is only sent along when bounds were actually supplied.
        params = {"data": custom_query}
        if bounds is not None:
            minx, miny, maxx, maxy = bounds
            params["bbox"] = f"{minx},{miny},{maxx},{maxy}"
        return requests.get(overpass_url, params=params)

    facilities_opt = {
        "food": 'node["amenity"="marketplace"];\nnode["shop"~"supermarket|kiosk|mall|convenience|butcher|greengrocer"];',
        "health": 'node["amenity"~"doctors|dentist|clinic|hospital|pharmacy"];',
        "education": 'node["amenity"~"kindergarten|school|college|university"];',
        "finance": 'node["amenity"~"mobile_money_agent|bureau_de_change|bank|microfinance|atm|sacco|money_transfer|post_office"];',
    }

    minx, miny, maxx, maxy = bounds
    bbox_string = f"{minx},{miny},{maxx},{maxy}"

    # An unsupported facility type raises KeyError here, before any request.
    overpass_query = f"""
        [timeout:120][out:json][bbox];
        (
             {facilities_opt[facilities]}
        );
        out body geom;
        """

    # Request data
    response = requests.get(
        overpass_url, params={"data": overpass_query, "bbox": bbox_string}
    )
    response.raise_for_status()
    data = response.json()

    df = pd.DataFrame.from_dict(data["elements"])
    if df.empty:
        # No POIs found: make sure the columns used below exist so an empty
        # GeoDataFrame is returned instead of failing with KeyError('lon').
        df = df.reindex(columns=["type", "id", "lat", "lon", "tags"])

    df_geom = gpd.points_from_xy(df["lon"], df["lat"])
    gdf = gpd.GeoDataFrame(df, geometry=df_geom, crs="EPSG:4326")

    # np.nan (not np.NaN, which was removed in NumPy 2.0) marks missing tags.
    gdf["poi_type"] = gdf["tags"].apply(lambda tag: tag.get("amenity", np.nan))

    if facilities == "food":
        # Food facilities also have its POI type within the shop tag (See query)
        also_poi_type = gdf["tags"].apply(lambda tag: tag.get("shop", np.nan))
        gdf["poi_type"] = gdf["poi_type"].fillna(also_poi_type)

    return gdf
def overpass(
    type_of_data: str,
    query: dict,
    mask: Union[GeoDataFrame, GeoSeries, Polygon, MultiPolygon],
) -> Tuple[GeoDataFrame, Optional[DataFrame]]:
    """
    Download geographic data using Overpass API.

    Parameters
    ----------
    type_of_data : str
        One of {'node', 'way', 'relation', 'rel'}. OSM Data structure to be
        queried from Overpass API.
    query : dict
        Dict containing OSM tag filters. Dict keys can take OSM tags and Dict
        values can be a list of strings, str, or None.
        Example: { 'key0': ['v0a', 'v0b', 'v0c'], 'key1': 'v1', 'key2': None }
        Check keys [OSM Map Features](https://wiki.openstreetmap.org/wiki/Map_features).
    mask : GeoDataFrame, GeoSeries, Polygon, or MultiPolygon
        Total bounds of mask to be used for the query.

    Returns
    -------
    gdf : GeoDataFrame
        POIs from the selected type of facility.
    df : DataFrame
        Relations metadata such as ID and tags. Returns None if
        'type_of_data' is other than 'relation'.

    Notes
    -----
    If the server response cannot be decoded as JSON, the error, status code
    and reason are printed and the raw ``requests.Response`` is returned
    instead of the tuple described above.
    """
    # GeoDataFrame/GeoSeries expose ``total_bounds``; plain shapely geometries
    # (Polygon, MultiPolygon) only expose ``bounds``.
    if hasattr(mask, "total_bounds"):
        minx, miny, maxx, maxy = mask.total_bounds
    else:
        minx, miny, maxx, maxy = mask.bounds
    bbox_string = f"{minx},{miny},{maxx},{maxy}"

    # Request data
    overpass_url = "https://overpass-api.de/api/interpreter"
    params = {
        # Parse query dict to build Overpass QL query
        "data": to_overpass_query(type_of_data, query),
        "bbox": bbox_string,
    }
    response = requests.get(overpass_url, params=params)

    try:
        data = response.json()
    except ValueError as e:
        # JSON decoding errors are ValueError subclasses; keep the existing
        # best-effort behaviour of reporting and handing back the response.
        print(e)
        print(response.status_code)
        print(response.reason)
        return response

    # Keys used in the query (e.g. "amenity", "shop", etc). Dict keys are
    # already unique, and list() keeps them in a deterministic order.
    ov_keys = list(query)

    return overpass_to_gdf(type_of_data, data, mask, ov_keys)
def osmnx_graph(
    download_type: str,
    network_type="drive",
    query_str=None,
    geom=None,
    distance=None,
    **kwargs,
):
    """
    Download a graph from OSM using osmnx.

    Parameters
    ----------
    download_type: str. One of {'polygon', 'place', 'point'}
        Input download type. If polygon, the geom parameter must be provided
        as a Shapely Polygon or MultiPolygon.
    network_type: str. One of {'drive', 'drive_service', 'walk', 'bike', 'all', 'all_private'}
        Network type to download. Defaults to drive.
    query_str: str (Optional).
        Only required for place type downloads. Query string to download a
        network.
    geom: Shapely Polygon, MultiPolygon or Point (Optional).
        Polygon required for polygon type downloads, Point for point
        downloads. Polygons are used as bounds for network download, points
        as the center with a distance buffer.
    distance: int
        Distance in meters to use as buffer from a point to download the
        network.
    **kwargs
        Extra keyword arguments forwarded to the underlying osmnx
        ``graph_from_*`` function.

    Returns
    -------
    G: networkx.MultiDiGraph
        Requested graph with simplified geometries. None (after printing a
        hint) when the arguments required by 'download_type' are missing or
        invalid.

    Examples
    --------
    >>> poly = urbanpy.download.nominatim_osm('San Isidro, Peru')
    >>> G = urbanpy.download.osmnx_graph('polygon', geom=poly.loc[0,'geometry'])
    >>> G
    <networkx.classes.multidigraph.MultiDiGraph at 0x1a2ba08150>

    """
    if download_type == "polygon":
        if geom is None:
            print("Please provide a polygon to download a network from.")
            return None
        # Nominatim boundaries are frequently MultiPolygons, so accept both.
        if isinstance(geom, (Polygon, MultiPolygon)):
            return ox.graph_from_polygon(geom, network_type=network_type, **kwargs)
        print("Please provide a Shapely Polygon or MultiPolygon geometry.")
        return None

    if download_type == "point":
        if distance is None:
            print("Please provide a distance buffer for the point download")
            return None
        if geom is None:
            print("Please provide a Point geometry.")
            return None
        return ox.graph_from_point(
            geom, dist=distance, network_type=network_type, **kwargs
        )

    if download_type == "place":
        if query_str is None:
            print("Please provide a query string to download a network from.")
            return None
        return ox.graph_from_place(query_str, network_type=network_type, **kwargs)

    print("Please provide a valid download type: 'polygon', 'place' or 'point'.")
    return None
def search_hdx_dataset(
    country: str,
    repository="high-resolution-population-density-maps-demographic-estimates",
):
    """
    Dataset search within HDX repositories. Defaults to population density maps.

    Parameters
    ----------
    country: str
        Country to search datasets
    repository: str
        Repository name within the HDX database. It is combined with the
        lower-cased country into the dataset title that is searched for.

    Returns
    -------
    datasets: DataFrame
        DataFrame of available CSV resources within HDX, indexed by 'id'.
        The 'population' column is only present for the default population
        density repository. None is returned (after printing a message) when
        the search yields no resources.

    Examples
    --------
    >>> resources_df = urbanpy.download.search_hdx_dataset('peru')
    >>> resources_df
       | created    | name                                               | population                             | size_mb | url
    id |            |                                                    |                                        |         |
    0  | 2019-06-11 | population_per_2018-10-01.csv.zip                  | Overall population density             | 19.36   | https://data.humdata.org/dataset/4e74db39-87f1...
    2  | 2019-06-11 | PER_children_under_five_2019-06-01_csv.zip         | Children (ages 0-5)                    | 16.60   | https://data.humdata.org/dataset/4e74db39-87f1...
    4  | 2019-06-11 | PER_elderly_60_plus_2019-06-01_csv.zip             | Elderly (ages 60+)                     | 16.59   | https://data.humdata.org/dataset/4e74db39-87f1...
    6  | 2019-06-11 | PER_men_2019-06-01_csv.zip                         | Men                                    | 16.64   | https://data.humdata.org/dataset/4e74db39-87f1...
    8  | 2019-06-11 | PER_women_2019-06-01_csv.zip                       | Women                                  | 16.63   | https://data.humdata.org/dataset/4e74db39-87f1...
    10 | 2019-06-11 | PER_women_of_reproductive_age_15_49_2019-06-01...  | Women of reproductive age (ages 15-49) | 16.62   | https://data.humdata.org/dataset/4e74db39-87f1...
    12 | 2019-06-11 | PER_youth_15_24_2019-06-01_csv.zip                 | Youth (ages 15-24)                     | 16.61   | https://data.humdata.org/dataset/4e74db39-87f1...

    """
    # Get dataset list
    datasets = Dataset.search_in_hdx(f"title:{country.lower()}-{repository}")
    resources_records = Dataset.get_all_resources(datasets)
    resources_df = pd.DataFrame.from_records(resources_records)

    if resources_df.shape[0] == 0:
        print("No datasets found")
        return None

    # Keep CSV resources only; na=False treats a missing URL as "not a CSV"
    # instead of producing an invalid boolean mask.
    is_csv = resources_df["download_url"].str.contains("csv", na=False)
    resources_csv_df = resources_df[is_csv]

    resources_csv_df = resources_csv_df.assign(
        created=pd.to_datetime(resources_csv_df["created"]).dt.date,
        size_mb=(resources_csv_df["size"] / 2**20).round(2),
    )

    columns = ["created", "name", "size_mb", "url"]
    if repository == "high-resolution-population-density-maps-demographic-estimates":
        resources_csv_df = resources_csv_df.assign(
            population=resources_csv_df["name"].apply(get_hdx_label)
        )
        # The label column only exists for this repository, so it is only
        # selected here; other repositories would otherwise hit a KeyError.
        columns.insert(2, "population")

    resources_csv_df.index.name = "id"

    return resources_csv_df[columns]
def get_hdx_dataset(resources_df: DataFrame, ids: Union[int, list]) -> DataFrame:
    """
    HDX dataset download.

    Parameters
    ----------
    resources_df: pd.DataFrame
        Resources dataframe from returned by search_hdx_dataset
    ids: int or list
        IDs in the resources dataframe. A single id or any list-like of ids
        (including a list with only one element).

    Returns
    -------
    data: pd.DataFrame
        The corresponding dataset in DataFrame format. When several ids are
        given the datasets are concatenated in the order of 'ids'.

    Raises
    ------
    ValueError
        If 'ids' is an empty list-like.

    Examples
    --------
    >>> resources_df = urbanpy.download.search_hdx_dataset('peru')
    >>> population_df = urbanpy.download.get_hdx_dataset(resources_df, 0)
    >>> population_df
    latitude   | longitude  | population_2015 | population_2020
    -18.339306 | -70.382361 | 11.318147       | 12.099885
    -18.335694 | -70.393750 | 11.318147       | 12.099885
    -18.335694 | -70.387361 | 11.318147       | 12.099885
    -18.335417 | -70.394028 | 11.318147       | 12.099885
    -18.335139 | -70.394306 | 11.318147       | 12.099885

    """
    # Normalise to a list so a scalar id, a one-element list and a longer
    # list all go through the same code path (a one-element list used to
    # hand a whole Series to pd.read_csv).
    id_list = list(ids) if pd.api.types.is_list_like(ids) else [ids]
    if not id_list:
        raise ValueError("No dataset ids were provided.")

    urls = resources_df.loc[id_list, "url"]
    frames = [pd.read_csv(url) for url in urls]

    if len(frames) == 1:
        return frames[0]
    return pd.concat(frames)
def hdx_fb_population(country, map_type):
    """
    Download population density maps from Facebook HDX.

    Parameters
    ----------
    country: str. One of {'argentina', 'bolivia', 'brazil', 'chile', 'colombia', 'ecuador', 'paraguay', 'peru', 'uruguay'}
        Input country to download data from.
    map_type: str. One of {'full', 'children', 'youth', 'elderly'}
        Input population map to download. Must be a key of
        HDX_POPULATION_TYPES; the older name 'full' is accepted as an alias
        of 'overall'.

    Returns
    -------
    population: DataFrame
        DataFrame with lat, lon, and population columns. Coordinates are in EPSG 4326.

    Raises
    ------
    KeyError
        If 'map_type' is not a key of HDX_POPULATION_TYPES.
    ValueError
        If no dataset is found for the country, or none of the found
        datasets matches the requested map type.

    Examples
    --------
    >>> urbanpy.download.hdx_fb_population('peru', 'full')
    latitude   | longitude  | population_2015 | population_2020
    -18.339306 | -70.382361 | 11.318147       | 12.099885
    -18.335694 | -70.393750 | 11.318147       | 12.099885
    -18.335694 | -70.387361 | 11.318147       | 12.099885
    -18.335417 | -70.394028 | 11.318147       | 12.099885
    -18.335139 | -70.394306 | 11.318147       | 12.099885

    """
    resources_df = search_hdx_dataset(country)
    if resources_df is None:
        # The search reports an empty result by returning None.
        raise ValueError(f"No HDX population datasets found for '{country}'.")

    # Rename older "full" map_type to "overall"
    if map_type == "full":
        map_type = "overall"

    # Get correct dataset/s index
    population_label = HDX_POPULATION_TYPES[map_type]
    matches = resources_df["population"] == population_label
    dataset_ix = resources_df[matches].index.tolist()
    if not dataset_ix:
        raise ValueError(
            f"No '{map_type}' population dataset found for '{country}'."
        )

    # A single match is requested by its scalar id so that exactly one CSV
    # is read; several matches are requested as a list.
    requested_ids = dataset_ix[0] if len(dataset_ix) == 1 else dataset_ix
    population = get_hdx_dataset(resources_df, requested_ids)

    return population
# Kept only so that code written against the older API keeps working.
def hdx_dataset(resource):
    """
    Download a single HDX dataset resource as a DataFrame.

    The allowed formats are CSV (.csv) and zipped CSV (.csv.zip). Only the
    part of the resource address that follows
    "https://data.humdata.org/dataset/" has to be passed, for example:
    '4e74db39-87f1-4383-9255-eaf8ebceb0c9/resource/317f1c39-8417-4bde-a076-99bd37feefce/download/population_per_2018-10-01.csv.zip'.

    A DeprecationWarning is emitted on every call.

    Parameters
    ----------
    resource: str
        Specific address to the HDX dataset resource. Since every dataset is
        referenced to a different resource id, only the base url can be
        provided by this library.

    Returns
    -------
    dataset: DataFrame
        Contains the requested HDX dataset resource.

    Examples
    --------
    >>> hdx_data = hdx_dataset('4e74db39-87f1-4383-9255-eaf8ebceb0c9/resource/317f1c39-8417-4bde-a076-99bd37feefce/download/population_per_2018-10-01.csv.zip')
    >>> hdx_data.head()
    latitude   | longitude  | population_2015 | population_2020
    -18.339306 | -70.382361 | 11.318147       | 12.099885
    -18.335694 | -70.393750 | 11.318147       | 12.099885
    -18.335694 | -70.387361 | 11.318147       | 12.099885
    -18.335417 | -70.394028 | 11.318147       | 12.099885
    -18.335139 | -70.394306 | 11.318147       | 12.099885

    """
    deprecation_notice = "This function will be deprecated. Please use search_hdx_dataset and get_hdx_dataset instead."
    # stacklevel=2 attributes the warning to the caller of this function.
    warn(deprecation_notice, DeprecationWarning, stacklevel=2)

    resource_url = "https://data.humdata.org/dataset/{}".format(resource)
    return pd.read_csv(resource_url)