Spaces:

hussain2010
/

Geospatial_Cleaning_Preprocessing_Wrangling_FeatureEngineering

Sleeping

App Files Files Community

Geospatial_Cleaning_Preprocessing_Wrangling_FeatureEngineering / app.py

hussain2010's picture

Update app.py

ef6f6b5 verified 3 months ago

history blame contribute delete

2.59 kB

	import streamlit as st
	import geopandas as gpd
	import pandas as pd
	import pyarrow.parquet as pq
	from huggingface_hub import hf_hub_download
	import warnings

	# Suppress the RuntimeWarning that mentions USECOLS.
	# `warnings` treats `message` as a regular expression that must match from
	# the START of the warning text, so the pattern needs a leading ".*" to
	# match "USECOLS" anywhere in the message (".USECOLS." alone only matches
	# when "USECOLS" begins at the second character).
	warnings.filterwarnings("ignore", category=RuntimeWarning, message=".*USECOLS.*")

	# Set Streamlit Page Configuration
	st.set_page_config(page_title="Optimized GADM Data Processing", layout="wide")

	# Hugging Face Credentials: the access token is read from Streamlit secrets;
	# repo_id / filename identify the file fetched in the download step.
	hf_api_key = st.secrets["World_Map_Dataset_Cleaning"]
	repo_id = "hussain2010/World_Map_Files"
	filename = "gadm_410.gpkg"

	# Streamlit App Title
	st.title("🌍 Optimized GADM Geospatial Data Processing")

	# Step 1: Download File from Hugging Face
	st.subheader("📥 Downloading Dataset from Hugging Face")
	try:
	    # The returned value is kept as `dataset_path` and used as the file
	    # to read in the loading step.
	    dataset_path = hf_hub_download(
	        repo_id=repo_id,
	        filename=filename,
	        repo_type="dataset",
	        token=hf_api_key,
	    )
	    st.success("✅ Dataset downloaded successfully!")
	except Exception as download_error:
	    # Without the file nothing below can run: report the failure and
	    # halt this script run.
	    st.error(f"⚠️ Error downloading dataset: {download_error}")
	    st.stop()

	# Step 2: Efficiently Load the Dataset
	st.subheader("📍 Loading Dataset Efficiently")
	try:
	    # Load only the attribute columns needed for the country picker.
	    # `usecols` is not a geopandas.read_file parameter; NOTE(review): it
	    # appears to have been forwarded to the driver as an unknown open option
	    # (hence the USECOLS RuntimeWarning), so the full layer was being read.
	    # `columns` + `ignore_geometry=True` restrict the read as intended.
	    metadata_df = gpd.read_file(
	        dataset_path,
	        layer=0,
	        columns=["GID_0", "NAME_0"],
	        ignore_geometry=True,
	    ).drop_duplicates()

	    # Let user select a country first (missing names are not offered)
	    country_names = metadata_df["NAME_0"].dropna().unique()
	    selected_country = st.selectbox("🌎 Select a Country", country_names)

	    # Load only selected country data (efficient filtering).
	    # Double any single quote so names such as "Côte d'Ivoire" do not
	    # terminate the SQL string literal early and break the filter.
	    escaped_country = str(selected_country).replace("'", "''")
	    gdf = gpd.read_file(dataset_path, layer=0, where=f"NAME_0 = '{escaped_country}'")

	    # Drop invalid geometries; copy so the column assignment below writes
	    # to an independent frame rather than a slice of the filtered result.
	    gdf = gdf[gdf.is_valid].copy()

	    # Convert CRS to EPSG:4326 if needed
	    if gdf.crs and gdf.crs.to_string() != "EPSG:4326":
	        gdf = gdf.to_crs("EPSG:4326")

	    # Convert geometry to WKT to avoid PyArrow serialization errors
	    gdf["geometry_wkt"] = gdf["geometry"].apply(lambda geom: geom.wkt if geom else None)

	    # Drop original geometry column to reduce memory usage, and hand the
	    # result to pandas as a plain DataFrame so the regular (non-geospatial)
	    # Parquet writer is used for a table that no longer has a geometry.
	    gdf = pd.DataFrame(gdf.drop(columns=["geometry"]))

	    # Serialize to Parquet in memory (to_parquet returns bytes when no path
	    # is given). A fixed file name on disk would be shared by every
	    # concurrent session of the app and could be overwritten mid-download.
	    optimized_file = "optimized_gadm.parquet"
	    parquet_bytes = gdf.to_parquet(engine="pyarrow")

	    # Show optimized dataset
	    st.write("✔ Memory-efficient dataset preview:")
	    st.dataframe(gdf.head(100))  # Show only first 100 rows

	    # Provide Download Option
	    st.download_button(
	        "📥 Download Optimized Geospatial Data",
	        data=parquet_bytes,
	        file_name=optimized_file,
	        mime="application/octet-stream",
	    )

	except Exception as e:
	    st.error(f"⚠️ Error processing file: {e}")