# Batch Data Ingestion Portal — Streamlit app (deployed as a Hugging Face Space, CPU runtime).
import os
import re
import subprocess
from functools import lru_cache

import pandas as pd
import streamlit as st
from geopy.exc import GeocoderTimedOut, GeocoderUnavailable
from geopy.geocoders import Nominatim
@lru_cache(maxsize=256)
def _geocode(city: str, state: str, timeout: int):
    """Resolve "city, state, USA" through Nominatim; memoized per arguments.

    Successful lookups (including a "not found" None result) are cached so
    Streamlit reruns do not re-query the geocoder for the same city.
    Exceptions are NOT cached by lru_cache, so transient network failures
    are retried on the next call.
    """
    geolocator = Nominatim(user_agent="geo_locator")
    return geolocator.geocode(f"{city}, {state}, USA", timeout=timeout)


def get_coordinates(city: str, state: str, timeout: int = 10) -> str:
    """Return "latitude, longitude" for a US city, or "" on failure.

    Parameters:
        city: City name (e.g. "Carson").
        state: State name or USPS abbreviation appended to the query.
        timeout: Seconds to wait for the geocoding service.

    Returns:
        A "lat, lon" string, or "" when the service is unavailable,
        times out, or finds no match.
    """
    try:
        # Keep the try body minimal: only the network call can raise here.
        location = _geocode(city, state, timeout)
    except (GeocoderUnavailable, GeocoderTimedOut) as e:
        # Best-effort: log and fall back to "" so the UI still renders.
        print(f"Geocoding error: {e}")
        return ""
    if location is None:
        return ""
    return f"{location.latitude}, {location.longitude}"
def load_county_data(path: str = "us_counties.csv") -> pd.DataFrame:
    """Load the US county lookup table from a CSV file.

    Parameters:
        path: CSV file with at least "stateName" and "countyName" columns.
              Defaults to the bundled "us_counties.csv" for backward
              compatibility.

    Returns:
        A DataFrame with surrounding whitespace stripped from the
        "stateName" and "countyName" columns, so later equality matches
        against the state-name map are reliable.
    """
    df = pd.read_csv(path)
    # Strip stray whitespace so exact string comparisons work downstream.
    df["stateName"] = df["stateName"].str.strip()
    df["countyName"] = df["countyName"].str.strip()
    return df
# Load the county lookup table once at module import (app startup).
county_data = load_county_data()
# Full US state name -> USPS abbreviation (50 states + District of Columbia).
state_abbr_map = {
    "Alabama": "AL",
    "Alaska": "AK",
    "Arizona": "AZ",
    "Arkansas": "AR",
    "California": "CA",
    "Colorado": "CO",
    "Connecticut": "CT",
    "Delaware": "DE",
    "Florida": "FL",
    "Georgia": "GA",
    "Hawaii": "HI",
    "Idaho": "ID",
    "Illinois": "IL",
    "Indiana": "IN",
    "Iowa": "IA",
    "Kansas": "KS",
    "Kentucky": "KY",
    "Louisiana": "LA",
    "Maine": "ME",
    "Maryland": "MD",
    "Massachusetts": "MA",
    "Michigan": "MI",
    "Minnesota": "MN",
    "Mississippi": "MS",
    "Missouri": "MO",
    "Montana": "MT",
    "Nebraska": "NE",
    "Nevada": "NV",
    "New Hampshire": "NH",
    "New Jersey": "NJ",
    "New Mexico": "NM",
    "New York": "NY",
    "North Carolina": "NC",
    "North Dakota": "ND",
    "Ohio": "OH",
    "Oklahoma": "OK",
    "Oregon": "OR",
    "Pennsylvania": "PA",
    "Rhode Island": "RI",
    "South Carolina": "SC",
    "South Dakota": "SD",
    "Tennessee": "TN",
    "Texas": "TX",
    "Utah": "UT",
    "Vermont": "VT",
    "Virginia": "VA",
    "Washington": "WA",
    "West Virginia": "WV",
    "Wisconsin": "WI",
    "Wyoming": "WY",
    "District of Columbia": "DC",
}

# Reverse lookup, derived by inverting the map above: abbreviation -> full name.
abbr_to_full = {abbreviation: full_name for full_name, abbreviation in state_abbr_map.items()}
st.title("Batch Data Ingestion Portal")
st.write("Upload multiple PDF files of climate action plans. Files should be named as follows:")
st.write("**City, State Plan Type Year.pdf** (e.g., *Carson, CA Mitigation Only CAP 2017.pdf* or *Washington, District of Columbia Green Plan 2019.pdf*)")

uploaded_files = st.file_uploader("Upload PDF files", type="pdf", accept_multiple_files=True)
api_key = st.text_input("OpenAI API Key", type="password")

# Per-file metadata parsed from filenames plus user selections, keyed by
# the uploaded filename; consumed by the processing section below.
file_info = {}

if uploaded_files:
    with st.form("metadata_form"):
        st.write("### File Details and County Selection")
        for uploaded_file in uploaded_files:
            with st.expander(f"File: {uploaded_file.name}", expanded=True):
                base_name = os.path.splitext(uploaded_file.name)[0]
                # Regex with alternation:
                # - Either exactly two letters as state_abbr (only when followed by
                #   whitespace, so two-letter city words are not misread),
                # - Or a full state name (one or more words, matched lazily so the
                #   plan-type words are not swallowed into the state name).
                pattern = r"^(?P<city>.+?),\s*((?P<state_abbr>[A-Za-z]{2})(?=\s)|(?P<state_full>[A-Za-z\.]+(?:\s+[A-Za-z\.]+)*?))\s+(?P<plan_type>.+?)\s+(?P<year>\d{4})$"
                match = re.match(pattern, base_name)
                if not match:
                    st.error("Filename format is incorrect. Please ensure it follows 'City, State Plan Type Year.pdf'")
                    continue
                city = match.group("city").strip()
                # Determine if the state was captured as an abbreviation or full name.
                if match.group("state_abbr"):
                    state_abbrev = match.group("state_abbr").upper()
                    full_state = abbr_to_full.get(state_abbrev)
                    if not full_state:
                        st.error(f"State abbreviation {state_abbrev} not recognized.")
                        continue
                else:
                    full_state = match.group("state_full").strip()
                    # Normalize common variations for District of Columbia.
                    if full_state.lower() in ["district", "d.c.", "dc"]:
                        full_state = "District of Columbia"
                    if full_state in state_abbr_map:
                        state_abbrev = state_abbr_map[full_state]
                    else:
                        st.error(f"State name {full_state} not recognized.")
                        continue
                plan_type = match.group("plan_type").strip()
                year = match.group("year").strip()
                st.write(f"**City:** {city}")
                st.write(f"**State:** {full_state} ({state_abbrev})")
                st.write(f"**Plan Type:** {plan_type}")
                st.write(f"**Year:** {year}")
                # Offer only the counties belonging to the parsed state.
                county_options = county_data[county_data["stateName"] == full_state]["countyName"].tolist()
                # Widget keys include the filename so each file keeps its own state.
                selected_counties = st.multiselect("Select County(ies) for this plan", county_options, key=f"counties_{uploaded_file.name}")
                # NOTE(review): this geocoding call runs on every Streamlit rerun
                # for every file — confirm the volume is acceptable to Nominatim.
                default_coords = get_coordinates(city, state_abbrev)
                coords = st.text_input("City Center Coordinates (latitude, longitude)", value=default_coords, key=f"coords_{uploaded_file.name}")
                file_info[uploaded_file.name] = {
                    "uploaded_file": uploaded_file,
                    "city": city,
                    "state": state_abbrev,
                    "plan_type": plan_type,
                    "year": year,
                    "counties": selected_counties,
                    "coords": coords
                }
        form_submitted = st.form_submit_button("Process All Files")
def _run_script(args):
    """Run a helper script as a subprocess and surface a non-zero exit in the UI.

    Processing stays best-effort (matching the original flow: later steps
    still run after a failure), but failures are no longer silent.

    Returns:
        True when the script exited with status 0, False otherwise.
    """
    result = subprocess.run(args)
    if result.returncode != 0:
        st.error(f"Step failed (exit code {result.returncode}): {' '.join(args)}")
    return result.returncode == 0


# `form_submitted` is only defined when files were uploaded; the short-circuit
# guard avoids a NameError on the first render before any upload.
if uploaded_files and form_submitted:
    if not api_key:
        st.error("Please provide the OpenAI API Key.")
    else:
        with st.spinner("Processing files..."):
            for file_name, info in file_info.items():
                # api_key is already validated above; check only the per-file fields.
                required = (info["city"], info["state"], info["plan_type"],
                            info["year"], info["counties"], info["coords"])
                if not all(required):
                    st.error(f"Missing required fields for file {file_name}. Please fill in all fields.")
                    continue
                county_str = ", ".join(info["counties"])
                city = info["city"]
                state_abbrev = info["state"]
                plan_type = info["plan_type"]
                year = info["year"]
                coords = info["coords"]
                uploaded_file = info["uploaded_file"]
                # Canonical output names always use the state abbreviation,
                # even when the upload used the full state name.
                out_file_name = f"{city}, {state_abbrev} {plan_type} {year}.pdf"
                summary_file_name = f"{city}, {state_abbrev} {plan_type} {year}_Summary.md"
                file_path = os.path.join("CAPS", out_file_name)
                if os.path.exists(file_path):
                    st.error(f"File for {out_file_name} already exists. Skipping this file.")
                    continue
                os.makedirs("CAPS", exist_ok=True)
                with open(file_path, "wb") as f:
                    f.write(uploaded_file.getbuffer())
                st.write(f"Saved {out_file_name} to CAPS folder.")
                # Per-file ingestion pipeline: mapping, summary, vectorstore, dataset.
                _run_script(["python", "data_ingestion_helpers/city_county_mapping_addition.py", city, state_abbrev, county_str, coords])
                st.write(f"City, State, County(s), and Coordinates added for {out_file_name}.")
                _run_script(["python", "data_ingestion_helpers/summary_generation.py", api_key, file_path])
                st.write(f"Summary generated for {out_file_name}.")
                _run_script(["python", "data_ingestion_helpers/data_ingestion_vectorstores.py", api_key, out_file_name, summary_file_name])
                st.write(f"Vector store created for {out_file_name}.")
                _run_script(["python", "data_ingestion_helpers/dataset_addition.py", api_key, file_path])
                st.write(f"Data added to dataset for {out_file_name}.")
            # Run final batch scripts once after all files are processed.
            _run_script(["python", "batch_scripts/caps_directory_reader.py"])
            st.write("CAPS directory reader executed.")
            _run_script(["python", "maps_helpers/maps_data.py"])
            st.write("Maps data re-created.")
            _run_script(["python", "region_vectorstores.py", api_key])
            st.write("Region vectorstores created.")
            st.success("All files processed successfully!")