climate-policy-tracker / region_vectorstores.py
import argparse
import os

import pandas as pd
from langchain_community.vectorstores import FAISS
from langchain_core.documents import Document
from langchain_openai import OpenAIEmbeddings
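
# Combines per-city FAISS vector stores into one combined store per EPA region:
# it loads the state-to-region mapping from epa_regions.csv, groups the
# "City, ST PlanName_vectorstore" directories by region, and writes a combined
# vector store plus a CSV listing the cities for each region.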


def create_region_vectorstores(api_key):
    # Set the OpenAI API key in the environment
    os.environ["OPENAI_API_KEY"] = api_key

    # === Configuration ===
    # Folder containing individual vector stores
    input_folder = "Individual_All_Vectorstores"
    # Folder to output the combined regional vector stores
    output_folder = "Combined_By_Region_Vectorstores"
    # CSV file with EPA region mapping
    epa_csv_path = "epa_regions.csv"

    # === Step 1: Load EPA Region Mapping ===
    epa_df = pd.read_csv(epa_csv_path, skipinitialspace=True)
    state_abbr_to_region = {
        str(s).strip().strip('"'): region
        for s, region in zip(epa_df['States'], epa_df['Region'])
    }
    print("EPA Mapping keys:", list(state_abbr_to_region.keys()))

    # === Step 2: Group individual vector store directories by EPA region ===
    # Process only directories ending with "_vectorstore" but skip those that are summary stores.
    region_to_store_paths = {}
    cities_by_region = {}
    for fname in os.listdir(input_folder):
        # Skip if the folder does not end with '_vectorstore'
        if not fname.endswith("_vectorstore"):
            continue
        # Explicitly skip any summary vector stores
        if fname.endswith("_Summary_vectorstore"):
            continue
        try:
            # Expected filename format: "City, ST PlanName_vectorstore"
            parts = fname.split(", ")
            if len(parts) < 2:
                print(f"⚠️ Unexpected filename format: {fname}")
                continue
            city = parts[0].strip()
            state_rest = parts[1]
            state_abbr = state_rest.split(" ")[0].strip()
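            # For illustration (hypothetical folder name): "Boston, MA Climate
            # Action Plan_vectorstore" parses to city="Boston", state_abbr="MA".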
            # Determine the EPA region from the state abbreviation
            region = state_abbr_to_region.get(state_abbr)
            if region is None:
                print(f"⚠️ State abbreviation '{state_abbr}' not found in EPA mapping for file: {fname}")
                continue
            full_path = os.path.join(input_folder, fname)
            region_to_store_paths.setdefault(region, []).append(full_path)
            cities_by_region.setdefault(region, set()).add(city)
        except Exception as e:
            print(f"❌ Failed to parse filename: {fname}, error: {e}")

    # === Step 3: Create combined vector store and cities CSV for each region ===
    os.makedirs(output_folder, exist_ok=True)
    for region, paths in region_to_store_paths.items():
        region_dir = os.path.join(output_folder, f"Region_{region}")
        os.makedirs(region_dir, exist_ok=True)
        print(f"🔄 Combining {len(paths)} vector stores for EPA Region {region}")

        all_documents = []
        embedding_model = OpenAIEmbeddings(model="text-embedding-3-large")
        for store_path in paths:
            try:
                vector_store = FAISS.load_local(store_path, embedding_model, allow_dangerous_deserialization=True)
                # Extract stored documents using the underlying InMemoryDocstore's internal dictionary.
                docs = list(vector_store.docstore._dict.values())
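                # Note: _dict is a private attribute of LangChain's InMemoryDocstore;
                # this shortcut assumes the default in-memory docstore and is not a
                # stable public interface.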
                all_documents.extend(docs)
            except Exception as e:
                print(f"❌ Failed to load or extract documents from '{store_path}' for region {region}: {e}")

        if all_documents:
            combined_vector_store = FAISS.from_documents(all_documents, embedding_model)
            combined_store_path = os.path.join(region_dir, f"Region_{region}_vectorstore")
            combined_vector_store.save_local(combined_store_path)
            print(f"✅ Created combined vector store for Region {region} at {combined_store_path}")
        else:
            print(f"⚠️ No documents found for Region {region}, skipping vector store creation.")

        # Create a CSV file listing the cities in this region
        cities = sorted(cities_by_region.get(region, []))
        cities_df = pd.DataFrame(cities, columns=["City"])
        cities_csv_path = os.path.join(region_dir, f"Region_{region}_cities.csv")
        cities_df.to_csv(cities_csv_path, index=False)
        print(f"✅ Created cities CSV file for Region {region} at {cities_csv_path}")


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Create combined EPA Region vector stores")
    parser.add_argument("api_key", type=str, help="OpenAI API Key")
    args = parser.parse_args()
    create_region_vectorstores(args.api_key)
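
# Example invocation (the key is a placeholder value):
#     python region_vectorstores.py <OPENAI_API_KEY>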