from typing import Dict, List, Optional
from pathlib import Path
import os

import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

from indexes.csv_index_builder import EnhancedCSVReader


class CSVIndexManager:
    """Manages creation and retrieval of indexes for CSV files."""

    def __init__(self):
        self.csv_reader = EnhancedCSVReader()
        self.indexes = {}
        self.vectorizer = TfidfVectorizer()
        self.document_vectors = {}
        self.all_documents = {}

    def create_index(self, file_path: str) -> bool:
        """Create an index for a single CSV file."""
        # Use the filename (without extension) as the identifier
        file_id = Path(file_path).stem

        # Load documents with metadata
        try:
            documents = self.csv_reader.load_data(file_path)

            # Store documents
            self.all_documents[file_id] = documents

            # Collect document content for vectorization
            doc_contents = [doc["content"] for doc in documents]

            # Vectorize documents
            if doc_contents:
                if not self.document_vectors:
                    # First file: fit the vectorizer and build the vocabulary
                    vectors = self.vectorizer.fit_transform(doc_contents)
                else:
                    # Subsequent files: reuse the existing vocabulary
                    vectors = self.vectorizer.transform(doc_contents)

                self.document_vectors[file_id] = vectors

            # Store metadata
            self.indexes[file_id] = {
                "metadata": documents[0]["metadata"] if documents else {},
                "path": file_path,
            }

            return True
        except Exception as e:
            print(f"Error creating index for {file_path}: {e}")
            return False

    def index_directory(self, directory_path: str) -> Dict[str, bool]:
        """Index all CSV files in a directory."""
        indexed_files = {}

        # Find all CSV files in the directory
        csv_files = [f for f in os.listdir(directory_path) if f.lower().endswith('.csv')]

        # Create an index for each CSV file
        for csv_file in csv_files:
            file_path = os.path.join(directory_path, csv_file)
            file_id = Path(file_path).stem
            success = self.create_index(file_path)
            indexed_files[file_id] = success

        return indexed_files

    def find_relevant_csvs(self, query: str, top_k: int = 3) -> List[str]:
        """Find the most relevant CSV files for a given query."""
        # Nothing has been vectorized yet, so the vectorizer is unfitted; bail out
        if not self.document_vectors:
            return []

        # Vectorize the query
        query_vector = self.vectorizer.transform([query])

        # Calculate similarity with each CSV's content
        similarities = {}
        for file_id, vectors in self.document_vectors.items():
            # Use the maximum similarity across all documents in this CSV
            file_similarities = cosine_similarity(query_vector, vectors).flatten()
            similarities[file_id] = np.max(file_similarities)

        # Sort by similarity and return the top_k file ids
        sorted_files = sorted(similarities.items(), key=lambda x: x[1], reverse=True)
        return [file_id for file_id, _ in sorted_files[:top_k]]

    def get_documents(self, file_id: str) -> List[Dict]:
        """Get all documents for a specific CSV file."""
        return self.all_documents.get(file_id, [])
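

# Example usage: a minimal sketch only. The "data" directory and the query string
# are hypothetical, and this assumes EnhancedCSVReader.load_data returns a list of
# dicts with "content" and "metadata" keys, as the class above expects.
if __name__ == "__main__":
    manager = CSVIndexManager()

    # Index every CSV file found in the (assumed) data/ directory
    results = manager.index_directory("data")
    print(f"Indexed files: {results}")

    # Retrieve the ids of the CSV files most similar to the query
    relevant = manager.find_relevant_csvs("monthly sales by region", top_k=2)
    for file_id in relevant:
        docs = manager.get_documents(file_id)
        print(f"{file_id}: {len(docs)} documents")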