# app.py, from a Hugging Face Space by hussain2010.
# Last commit: "Update app.py" (ef6f6b5, verified).
import io
import warnings

import geopandas as gpd
import pandas as pd
import pyarrow.parquet as pq
import streamlit as st
from huggingface_hub import hf_hub_download
# Hide RuntimeWarnings whose message mentions USECOLS; every other warning
# still surfaces normally.
warnings.filterwarnings(
    action="ignore",
    message=".*USECOLS.*",
    category=RuntimeWarning,
)

# Page-level Streamlit settings: browser-tab title and a wide layout.
st.set_page_config(
    page_title="Optimized GADM Data Processing",
    layout="wide",
)

# Hugging Face access: the token is read from Streamlit secrets; the
# repository id and file name identify the GeoPackage the app works on.
hf_api_key = st.secrets["World_Map_Dataset_Cleaning"]
repo_id, filename = "hussain2010/World_Map_Files", "gadm_410.gpkg"

# Main heading rendered at the top of the page.
st.title("🌍 Optimized GADM Geospatial Data Processing")
# Step 1: Download File from Hugging Face
# Fetches `filename` from the `repo_id` dataset repository using the token in
# `hf_api_key` and binds the resulting local file path to `dataset_path`.
# On any failure the error is shown in the UI and the script run is halted,
# because every later step needs `dataset_path`.
st.subheader("📥 Downloading Dataset from Hugging Face")
try:
    # Keep the try body limited to the call that can actually fail.
    dataset_path = hf_hub_download(
        repo_id=repo_id,
        filename=filename,
        token=hf_api_key,
        repo_type="dataset",
    )
except Exception as e:  # top-level UI boundary: report and stop this run
    st.error(f"⚠️ Error downloading dataset: {e}")
    st.stop()
else:
    st.success("✅ Dataset downloaded successfully!")
# Step 2: Efficiently Load the Dataset
# Lets the user pick a country, loads only that country's features, converts
# the geometries to WKT text and offers the result as a Parquet download.
st.subheader("📁 Loading Dataset Efficiently")


def _sql_quote(value):
    """Return *value* as a single-quoted SQL string literal.

    Embedded single quotes are doubled so that names containing an
    apostrophe (e.g. "Côte d'Ivoire") cannot end the literal early and
    break, or alter, the attribute filter.
    """
    return "'" + str(value).replace("'", "''") + "'"


@st.cache_data(show_spinner=False)
def _load_country_table(path):
    """Return the distinct (GID_0, NAME_0) rows of layer 0, without geometry.

    Cached so that the script reruns Streamlit performs on widget
    interaction do not re-read the layer each time.
    """
    # `columns` + `ignore_geometry=True` restrict the read to the two
    # attribute columns and skip geometry decoding.
    # NOTE(review): `columns` is the geopandas >= 1.0 keyword — confirm the
    # deployed geopandas version.
    return gpd.read_file(
        path,
        layer=0,
        columns=["GID_0", "NAME_0"],
        ignore_geometry=True,
    ).drop_duplicates()


try:
    # Load only metadata (no geometry) for filtering
    metadata_df = _load_country_table(dataset_path)

    # Let user select a country first
    selected_country = st.selectbox(
        "🌎 Select a Country", metadata_df["NAME_0"].unique()
    )

    # Load only the selected country's rows; the name is escaped before it
    # is embedded in the filter expression.
    gdf = gpd.read_file(
        dataset_path,
        layer=0,
        where=f"NAME_0 = {_sql_quote(selected_country)}",
    )

    # Drop invalid geometries; copy so the column assignment below works on
    # an independent frame rather than a slice of the original.
    gdf = gdf[gdf.is_valid].copy()

    # Convert CRS to EPSG:4326 if needed
    if gdf.crs and gdf.crs.to_string() != "EPSG:4326":
        gdf = gdf.to_crs("EPSG:4326")

    # Convert geometry to WKT to avoid PyArrow serialization errors
    gdf["geometry_wkt"] = gdf["geometry"].apply(
        lambda geom: geom.wkt if geom else None
    )

    # Drop the geometry column and continue with a plain pandas DataFrame,
    # so the pandas Parquet writer is used for the geometry-free table.
    gdf = pd.DataFrame(gdf.drop(columns=["geometry"]))

    # Serialize to Parquet in memory instead of a fixed path on disk: a
    # shared file name would be overwritten by concurrent sessions and
    # would be left behind after the run.
    optimized_file = "optimized_gadm.parquet"
    parquet_buffer = io.BytesIO()
    gdf.to_parquet(parquet_buffer, engine="pyarrow")

    # Show optimized dataset
    st.write("✔ Memory-efficient dataset preview:")
    st.dataframe(gdf.head(100))  # Show only first 100 rows

    # Provide Download Option
    st.download_button(
        "📥 Download Optimized Geospatial Data",
        data=parquet_buffer.getvalue(),
        file_name=optimized_file,
        mime="application/octet-stream",
    )
except Exception as e:  # top-level UI boundary: surface the error to the user
    st.error(f"⚠️ Error processing file: {e}")