Chamin09 committed on
Commit
c158aae
·
verified ·
1 Parent(s): e135305

Create index_manager.py

Browse files
Files changed (1) hide show
  1. indexes/index_manager.py +97 -0
indexes/index_manager.py ADDED
@@ -0,0 +1,97 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Dict, List, Optional
2
+ from pathlib import Path
3
+ import os
4
+ import pandas as pd
5
+ import numpy as np
6
+ from sklearn.feature_extraction.text import TfidfVectorizer
7
+ from sklearn.metrics.pairwise import cosine_similarity
8
+
9
+ from indexes.csv_index_builder import EnhancedCSVReader
10
+
11
class CSVIndexManager:
    """Manages creation and retrieval of TF-IDF indexes for CSV files.

    Each indexed file is identified by its filename stem (``file_id``).
    The manager keeps three parallel mappings keyed by ``file_id``:

    * ``indexes`` -- ``{"metadata": ..., "path": ...}`` for every file that
      was indexed successfully (including files with zero documents).
    * ``all_documents`` -- the documents returned by the CSV reader.
    * ``document_vectors`` -- one TF-IDF row block per file that has at
      least one document.

    The vectorizer is refit on the complete corpus whenever a file is
    indexed, so the vocabulary and IDF weights always cover every file and
    not only the first one that happened to be indexed.
    """

    def __init__(self):
        self.csv_reader = EnhancedCSVReader()
        self.indexes = {}
        self.vectorizer = TfidfVectorizer()
        self.document_vectors = {}
        self.all_documents = {}

    def _vectorize_corpus(self, corpus: Dict[str, List[Dict]]) -> Dict:
        """Fit the vectorizer on every document in *corpus*.

        Args:
            corpus: Mapping of ``file_id`` to that file's documents. Each
                document is indexed with ``doc["content"]``.

        Returns:
            Mapping of ``file_id`` to the rows of the fitted TF-IDF matrix
            that belong to that file. Files without documents are omitted.
            An empty dict is returned (and the vectorizer is left
            untouched) when the corpus holds no documents at all.

        Raises:
            Whatever ``doc["content"]`` or ``fit_transform`` raises; the
            caller decides how to report it.
        """
        texts = []
        row_spans = {}
        for file_id, documents in corpus.items():
            contents = [doc["content"] for doc in documents]
            if contents:
                row_spans[file_id] = (len(texts), len(texts) + len(contents))
                texts.extend(contents)

        if not texts:
            return {}

        # One fit over the whole corpus keeps every file in the same
        # feature space; each file's vectors are a contiguous row slice.
        matrix = self.vectorizer.fit_transform(texts)
        return {
            file_id: matrix[start:stop]
            for file_id, (start, stop) in row_spans.items()
        }

    def create_index(self, file_path: str) -> bool:
        """Create (or refresh) the index for a single CSV file.

        Args:
            file_path: Path of the CSV file. Its stem becomes the file id.

        Returns:
            ``True`` when the file was loaded and vectorized, ``False``
            when any step raised (the error is printed). On failure the
            manager's stored documents, vectors and metadata are left as
            they were before the call.
        """
        # Extract filename as identifier
        file_id = Path(file_path).stem

        try:
            # Load documents with metadata
            documents = self.csv_reader.load_data(file_path)
            metadata = documents[0]["metadata"] if documents else {}

            # Vectorize a staged copy of the corpus so nothing is stored
            # unless loading and vectorization both succeed.
            staged_corpus = dict(self.all_documents)
            staged_corpus[file_id] = documents
            vectors = self._vectorize_corpus(staged_corpus)
        except Exception as e:
            print(f"Error creating index for {file_path}: {e}")
            return False

        # Commit: documents, refreshed vectors for every file, metadata.
        self.all_documents[file_id] = documents
        self.document_vectors.clear()
        self.document_vectors.update(vectors)
        self.indexes[file_id] = {
            "metadata": metadata,
            "path": file_path,
        }
        return True

    def index_directory(self, directory_path: str) -> Dict[str, bool]:
        """Index all CSV files in a directory.

        Entries whose name ends with ``.csv`` (case-insensitive) are
        indexed in sorted name order so results are reproducible. Each
        file triggers a refit of the shared vectorizer, so the cost grows
        with the total number of documents already indexed.

        Args:
            directory_path: Directory to scan (not recursive).

        Returns:
            Mapping of file id (filename stem) to whether indexing that
            file succeeded.
        """
        indexed_files = {}
        for entry in sorted(os.listdir(directory_path)):
            if not entry.lower().endswith('.csv'):
                continue
            file_path = os.path.join(directory_path, entry)
            indexed_files[Path(file_path).stem] = self.create_index(file_path)
        return indexed_files

    def find_relevant_csvs(self, query: str, top_k: int = 3) -> List[str]:
        """Find the most relevant CSV files for a given query.

        A file's score is the highest cosine similarity between the query
        and any of its documents.

        Args:
            query: Free-text query.
            top_k: Maximum number of file ids to return. Values ``<= 0``
                yield an empty list.

        Returns:
            File ids ordered from most to least similar. Files that were
            indexed without any documents are never returned. Returns an
            empty list when nothing has been vectorized yet.
        """
        # Guard on document_vectors as well as indexes: indexing only
        # empty files leaves the vectorizer unfitted.
        if top_k <= 0 or not self.indexes or not self.document_vectors:
            return []

        # Vectorize the query
        query_vector = self.vectorizer.transform([query])

        # Best-matching document per file decides that file's score.
        scores = {
            file_id: np.max(cosine_similarity(query_vector, vectors))
            for file_id, vectors in self.document_vectors.items()
        }

        ranked = sorted(scores, key=scores.get, reverse=True)
        return ranked[:top_k]

    def get_documents(self, file_id: str) -> List[Dict]:
        """Get all documents for a specific CSV file.

        Args:
            file_id: Filename stem used when the file was indexed.

        Returns:
            The stored documents, or an empty list for an unknown id.
        """
        return self.all_documents.get(file_id, [])