# Combine per-city FAISS vector stores into per-EPA-region vector stores.
# (Runs as a HuggingFace Space on CPU.)
import os
import pandas as pd
import argparse
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_core.documents import Document
def create_region_vectorstores(api_key):
    """Combine per-city FAISS vector stores into one store per EPA region.

    Reads individual "City, ST PlanName_vectorstore" directories from
    ``Individual_All_Vectorstores``, groups them by EPA region using the
    state-to-region mapping in ``epa_regions.csv``, and writes one combined
    vector store plus a cities CSV per region under
    ``Combined_By_Region_Vectorstores``.

    Args:
        api_key: OpenAI API key; exported to the environment so the
            embeddings client can authenticate.
    """
    # Set the OpenAI API key in the environment for the embeddings client.
    os.environ["OPENAI_API_KEY"] = api_key

    # === Configuration ===
    input_folder = "Individual_All_Vectorstores"      # individual vector stores
    output_folder = "Combined_By_Region_Vectorstores"  # combined regional output
    epa_csv_path = "epa_regions.csv"                   # state -> EPA region map

    # Step 1: load the EPA region mapping.
    state_abbr_to_region = _load_epa_mapping(epa_csv_path)
    print("EPA Mapping keys:", list(state_abbr_to_region.keys()))

    # Step 2: group individual vector-store directories by EPA region.
    region_to_store_paths, cities_by_region = _group_stores_by_region(
        input_folder, state_abbr_to_region
    )

    # Step 3: create the combined vector store and cities CSV for each region.
    os.makedirs(output_folder, exist_ok=True)
    # Build the embedding client once; it is identical for every region.
    embedding_model = OpenAIEmbeddings(model="text-embedding-3-large")
    for region, paths in region_to_store_paths.items():
        _combine_region(
            region,
            paths,
            cities_by_region.get(region, set()),
            output_folder,
            embedding_model,
        )


def _load_epa_mapping(epa_csv_path):
    """Return a {state abbreviation: EPA region} dict from the mapping CSV."""
    epa_df = pd.read_csv(epa_csv_path, skipinitialspace=True)
    # Strip whitespace and stray quotes so lookups by bare abbreviation work.
    return {
        str(s).strip().strip('"'): region
        for s, region in zip(epa_df['States'], epa_df['Region'])
    }


def _group_stores_by_region(input_folder, state_abbr_to_region):
    """Group individual vector-store paths (and their cities) by EPA region.

    Only directories ending in "_vectorstore" are considered; summary stores
    ("*_Summary_vectorstore") are explicitly skipped. Directory names are
    expected to look like "City, ST PlanName_vectorstore".

    Returns:
        (region_to_store_paths, cities_by_region): dicts mapping each region
        to its list of store paths and to the set of its city names.
    """
    region_to_store_paths = {}
    cities_by_region = {}
    for fname in os.listdir(input_folder):
        # Skip anything that is not an individual vector store.
        if not fname.endswith("_vectorstore"):
            continue
        if fname.endswith("_Summary_vectorstore"):
            continue
        try:
            parts = fname.split(", ")
            if len(parts) < 2:
                print(f"⚠️ Unexpected filename format: {fname}")
                continue
            city = parts[0].strip()
            # The state abbreviation is the first token after "City, ".
            state_abbr = parts[1].split(" ")[0].strip()
            region = state_abbr_to_region.get(state_abbr)
            if region is None:
                print(f"⚠️ State abbreviation '{state_abbr}' not found in EPA mapping for file: {fname}")
                continue
            full_path = os.path.join(input_folder, fname)
            region_to_store_paths.setdefault(region, []).append(full_path)
            cities_by_region.setdefault(region, set()).add(city)
        except Exception as e:
            print(f"❌ Failed to parse filename: {fname}, error: {e}")
    return region_to_store_paths, cities_by_region


def _combine_region(region, paths, cities, output_folder, embedding_model):
    """Build the combined vector store and cities CSV for one EPA region."""
    region_dir = os.path.join(output_folder, f"Region_{region}")
    os.makedirs(region_dir, exist_ok=True)
    print(f"📦 Combining {len(paths)} vector stores for EPA Region {region}")

    all_documents = []
    for store_path in paths:
        try:
            vector_store = FAISS.load_local(
                store_path, embedding_model, allow_dangerous_deserialization=True
            )
            # LangChain's FAISS wrapper exposes no public iterator over its
            # documents, so reach into the InMemoryDocstore's internal dict.
            all_documents.extend(vector_store.docstore._dict.values())
        except Exception as e:
            print(f"❌ Failed to load or extract documents from '{store_path}' for region {region}: {e}")

    if all_documents:
        # NOTE(review): from_documents re-embeds every document via the
        # OpenAI API; FAISS.merge_from could reuse the stored vectors, but
        # behavior is kept as-is here.
        combined_vector_store = FAISS.from_documents(all_documents, embedding_model)
        combined_store_path = os.path.join(region_dir, f"Region_{region}_vectorstore")
        combined_vector_store.save_local(combined_store_path)
        print(f"✅ Created combined vector store for Region {region} at {combined_store_path}")
    else:
        print(f"⚠️ No documents found for Region {region}, skipping vector store creation.")

    # Write a CSV listing the cities in this region (written even when the
    # combined store was skipped, matching the original behavior).
    cities_df = pd.DataFrame(sorted(cities), columns=["City"])
    cities_csv_path = os.path.join(region_dir, f"Region_{region}_cities.csv")
    cities_df.to_csv(cities_csv_path, index=False)
    print(f"✅ Created cities CSV file for Region {region} at {cities_csv_path}")
if __name__ == "__main__":
    # Command-line entry point: the single positional argument is the
    # OpenAI API key, which is forwarded to the pipeline.
    cli = argparse.ArgumentParser(
        description="Create combined EPA Region vector stores"
    )
    cli.add_argument("api_key", type=str, help="OpenAI API Key")
    cli_args = cli.parse_args()
    create_region_vectorstores(cli_args.api_key)