"""Streamlit app that exports one country's GADM boundaries as Parquet.

The script downloads the GADM GeoPackage from a Hugging Face dataset
repository, lets the user pick a country, loads only that country's rows,
converts the geometries to WKT text and offers the result as a Parquet
download.
"""
import io
import warnings

import geopandas as gpd
import pandas as pd
import pyarrow.parquet as pq
import streamlit as st
from huggingface_hub import hf_hub_download

# Suppress specific RuntimeWarnings (USECOLS). Kept from the original script;
# presumably it silenced a warning triggered by the unsupported ``usecols``
# option, which is no longer passed.
warnings.filterwarnings("ignore", category=RuntimeWarning, message=".*USECOLS.*")

# Set Streamlit Page Configuration
st.set_page_config(page_title="Optimized GADM Data Processing", layout="wide")

# Hugging Face Credentials
hf_api_key = st.secrets["World_Map_Dataset_Cleaning"]
repo_id = "hussain2010/World_Map_Files"
filename = "gadm_410.gpkg"


def _sql_string_literal(value):
    """Return *value* as a single-quoted SQL string literal.

    Embedded single quotes are doubled so that names such as
    "Côte d'Ivoire" do not terminate the literal early and break the
    ``where`` filter.
    """
    return "'" + str(value).replace("'", "''") + "'"


@st.cache_data(show_spinner=False)
def _load_country_names(path):
    """Return the distinct ``NAME_0`` values stored in the first layer.

    Only the attribute column is requested and geometry is skipped, so the
    polygons are not read just to fill the country selector. The result is
    cached because Streamlit re-executes the script on every interaction.
    """
    attributes = gpd.read_file(
        path,
        layer=0,
        columns=["NAME_0"],
        ignore_geometry=True,
    )
    return attributes["NAME_0"].dropna().drop_duplicates().tolist()


# Streamlit App Title
st.title("🌍 Optimized GADM Geospatial Data Processing")

# Step 1: Download File from Hugging Face
st.subheader("📥 Downloading Dataset from Hugging Face")

try:
    dataset_path = hf_hub_download(
        repo_id=repo_id,
        filename=filename,
        token=hf_api_key,
        repo_type="dataset",
    )
    st.success("✅ Dataset downloaded successfully!")
except Exception as e:
    st.error(f"⚠️ Error downloading dataset: {e}")
    st.stop()

# Step 2: Efficiently Load the Dataset
st.subheader("📍 Loading Dataset Efficiently")

try:
    # Let user select a country first
    country_names = _load_country_names(dataset_path)
    selected_country = st.selectbox("🌎 Select a Country", country_names)

    # Load only the selected country's rows; the name is quoted safely so
    # apostrophes in country names cannot break the filter expression.
    country_filter = f"NAME_0 = {_sql_string_literal(selected_country)}"
    gdf = gpd.read_file(dataset_path, layer=0, where=country_filter)

    # Drop invalid geometries
    gdf = gdf[gdf.is_valid]

    # Convert CRS to EPSG:4326 if needed
    if gdf.crs and gdf.crs.to_string() != "EPSG:4326":
        gdf = gdf.to_crs("EPSG:4326")

    # Convert geometry to WKT to avoid PyArrow serialization errors
    wkt_values = gdf["geometry"].apply(lambda geom: geom.wkt if geom else None)

    # Continue with a plain pandas DataFrame: once the geometry column is
    # gone there is no active geometry left for GeoDataFrame.to_parquet.
    table = pd.DataFrame(gdf.drop(columns=["geometry"]))
    table["geometry_wkt"] = wkt_values

    # Serialize to Parquet in memory so concurrent sessions never share
    # (and overwrite) a file in the working directory.
    optimized_file = "optimized_gadm.parquet"
    parquet_buffer = io.BytesIO()
    table.to_parquet(parquet_buffer, engine="pyarrow")

    # Show optimized dataset
    st.write("✔ Memory-efficient dataset preview:")
    st.dataframe(table.head(100))  # Show only first 100 rows

    # Provide Download Option
    st.download_button(
        "📥 Download Optimized Geospatial Data",
        data=parquet_buffer.getvalue(),
        file_name=optimized_file,
        mime="application/octet-stream",
    )
except Exception as e:
    st.error(f"⚠️ Error processing file: {e}")