Chamin09 committed on
Commit
a784e49
·
verified ·
1 Parent(s): 3613a7e

Create query_engine.py

Browse files
Files changed (1) hide show
  1. indexes/query_engine.py +105 -0
indexes/query_engine.py ADDED
@@ -0,0 +1,105 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Dict, List, Any, Optional
2
+ import pandas as pd
3
+ from sklearn.metrics.pairwise import cosine_similarity
4
+ import numpy as np
5
+
6
class CSVQueryEngine:
    """Answer natural-language questions about one or more indexed CSV files.

    The engine asks the index manager which CSV files are relevant to a
    query, builds a textual context (metadata, sample rows and numeric
    statistics) from those files, and sends that context together with the
    query to a language model.

    Attributes:
        index_manager: Object exposing ``find_relevant_csvs(query_text)`` and
            an ``indexes`` mapping of CSV id to a dict with ``"metadata"``
            (keys ``"filename"``, ``"columns"``, ``"row_count"``) and
            ``"path"`` entries.
        llm: Object exposing ``complete(prompt)`` whose result has a ``text``
            attribute.
    """

    def __init__(self, index_manager, llm):
        """Store the index manager and language model used by queries."""
        self.index_manager = index_manager
        self.llm = llm

    def query(self, query_text: str) -> Dict[str, Any]:
        """Process a natural language query across CSV files.

        Args:
            query_text: The user's question.

        Returns:
            A dict with ``"answer"`` (the model's text, or a fixed message
            when no usable CSV was found) and ``"sources"`` (one entry per
            CSV that contributed context).
        """
        no_match = {
            "answer": "No relevant CSV files found for your query.",
            "sources": [],
        }

        relevant_csvs = self.index_manager.find_relevant_csvs(query_text)
        if not relevant_csvs:
            return no_match

        # Ids that are not in the index contribute neither context nor
        # sources; if none survive, calling the LLM would only invite an
        # answer that is not grounded in any data.
        indexed_csvs = [
            csv_id for csv_id in relevant_csvs
            if csv_id in self.index_manager.indexes
        ]
        if not indexed_csvs:
            return no_match

        context = self._prepare_context(query_text, indexed_csvs)
        prompt = self._generate_prompt(query_text, context)
        response = self.llm.complete(prompt)

        return {
            "answer": response.text,
            "sources": self._get_sources(indexed_csvs),
        }

    def _prepare_context(self, query: str, csv_ids: List[str]) -> str:
        """Build the textual context describing each relevant CSV file.

        For every id present in the index this emits the file name, column
        list and row count from the stored metadata, followed by the first
        five rows and mean/min/max of each numeric column read from disk.

        Args:
            query: The user's question (currently unused when building the
                context).
            csv_ids: Ids of the CSV files to describe; unknown ids are
                skipped.

        Returns:
            The context parts joined by blank lines, or an empty string when
            no id is indexed.
        """
        context_parts = []

        for csv_id in csv_ids:
            entry = self.index_manager.indexes.get(csv_id) \
                if hasattr(self.index_manager.indexes, "get") else None
            if entry is None:
                if csv_id not in self.index_manager.indexes:
                    continue
                entry = self.index_manager.indexes[csv_id]

            metadata = entry["metadata"]
            file_path = entry["path"]

            # Column labels are coerced to str: labels such as integers
            # (header-less CSVs) would otherwise make str.join raise.
            column_names = ", ".join(str(col) for col in metadata["columns"])

            context_parts.append(f"CSV File: {metadata['filename']}")
            context_parts.append(f"Columns: {column_names}")
            context_parts.append(f"Row Count: {metadata['row_count']}")

            # Reading the file is best-effort: a missing or malformed CSV is
            # reported inside the context instead of aborting the query.
            try:
                df = pd.read_csv(file_path)
                context_parts.append("\nSample Data:")
                context_parts.append(df.head(5).to_string())

                context_parts.append("\nNumeric Column Statistics:")
                numeric_cols = df.select_dtypes(include=['number']).columns
                for col in numeric_cols:
                    stats = df[col].describe()
                    context_parts.append(
                        f"{col} - mean: {stats['mean']:.2f}, "
                        f"min: {stats['min']:.2f}, max: {stats['max']:.2f}"
                    )
            except Exception as e:
                context_parts.append(f"Error reading CSV: {str(e)}")

        return "\n\n".join(context_parts)

    def _generate_prompt(self, query: str, context: str) -> str:
        """Generate the LLM prompt embedding the CSV context and the query.

        Args:
            query: The user's question.
            context: Text produced by ``_prepare_context``.

        Returns:
            The full prompt string, ending with ``"Answer:"``.
        """
        return f"""You are an AI assistant specialized in analyzing CSV data.
Your goal is to help users understand their data and extract insights.

Below is information about CSV files that might help answer the query:

{context}

User Query: {query}

Please provide a comprehensive and accurate answer based on the data.
If calculations are needed, explain your process.
If the data doesn't contain information to answer the query, say so clearly.

Answer:"""

    def _get_sources(self, csv_ids: List[str]) -> List[Dict[str, str]]:
        """Get source information for the response.

        Args:
            csv_ids: Ids of the CSV files used; unknown ids are skipped.

        Returns:
            One dict per indexed id with ``"csv"`` (file name) and
            ``"columns"`` (up to five column names, followed by ``"..."``
            when more exist).
        """
        sources = []

        for csv_id in csv_ids:
            if csv_id not in self.index_manager.indexes:
                continue

            metadata = self.index_manager.indexes[csv_id]["metadata"]
            # Same str coercion as in _prepare_context so non-string labels
            # cannot break the join.
            columns = [str(col) for col in metadata["columns"]]
            preview = ", ".join(columns[:5])
            if len(columns) > 5:
                preview += "..."

            sources.append({
                "csv": metadata["filename"],
                "columns": preview,
            })

        return sources