Adityabhaskar committed on
Commit
a9ad3d8
·
verified ·
1 Parent(s): 9c0f2dd

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +271 -0
app.py ADDED
@@ -0,0 +1,271 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import numpy as np
3
+ from langchain_openai import OpenAI
4
+ from langchain_core.documents import Document
5
+ from langchain_community.vectorstores import FAISS
6
+ from langchain_openai import OpenAIEmbeddings
7
+ from langchain_text_splitters import RecursiveCharacterTextSplitter
8
+ import re
9
+ import os
10
+ from typing import Dict, List, Any
11
+ import warnings
12
+ import gradio as gr
13
+ from dotenv import load_dotenv
14
+
15
# Install a blanket "ignore" filter so library warnings do not clutter the output
warnings.simplefilter('ignore')
# Read key/value pairs from a local .env file into the process environment
load_dotenv()
19
+
20
class ExcelAIQuerySystem:
    """
    A system to query Excel files using natural language, powered by OpenAI and LangChain.

    Typical use: construct with an API key, call ``load_excel_file`` once per
    workbook, then call ``query_data`` for each question.

    Attributes:
        llm: Completion model used for sheet descriptions and answers.
        embeddings: Embedding model used to build the vector store.
        excel_data: Mapping of sheet name -> cleaned DataFrame.
        sheet_descriptions: Mapping of sheet name -> text description.
        vectorstore: FAISS index over the descriptions, or None when not built.
        logs: Human-readable progress messages from the most recent load.
    """

    def __init__(self, openai_api_key: str):
        """Create the LLM/embedding clients for the given OpenAI API key."""
        # Hand the key to the clients directly instead of writing it into
        # os.environ: the environment is shared by the whole process, so one
        # session's key would otherwise be visible to (and overwritten by)
        # every other session served by the same app.
        self.llm = OpenAI(temperature=0, openai_api_key=openai_api_key)
        self.embeddings = OpenAIEmbeddings(openai_api_key=openai_api_key)
        self.excel_data = {}
        self.sheet_descriptions = {}
        self.vectorstore = None
        self.logs = []

    def load_excel_file(self, file_path: str) -> str:
        """Loads and processes an Excel file, generating descriptions and a vector store.

        Args:
            file_path: Path of the workbook to read.

        Returns:
            The progress log, one message per line.

        Raises:
            Exception: If the workbook cannot be opened, no sheet can be
                loaded, or the vector store cannot be built. The original
                error is attached as ``__cause__``.
        """
        # Start from a clean slate so a second load never mixes in sheets
        # from a previously loaded workbook.
        self.logs.clear()
        self.excel_data.clear()
        self.sheet_descriptions.clear()
        self.vectorstore = None
        try:
            # The context manager closes the workbook handle even on error,
            # and the workbook is opened once rather than once per sheet.
            with pd.ExcelFile(file_path) as excel_file:
                sheet_names = excel_file.sheet_names
                self.logs.append(f"✅ Found {len(sheet_names)} sheets: {', '.join(sheet_names)}")

                for sheet_name in sheet_names:
                    try:
                        df = pd.read_excel(excel_file, sheet_name=sheet_name)
                        df = self._clean_dataframe(df)
                        description = self._generate_sheet_description(sheet_name, df)
                    except Exception as e:
                        # Best effort: one broken sheet must not abort the rest.
                        self.logs.append(f"⚠️ Error loading sheet '{sheet_name}': {str(e)}")
                        continue
                    # Register data and description together so the two
                    # mappings always have the same keys.
                    self.excel_data[sheet_name] = df
                    self.sheet_descriptions[sheet_name] = description
                    self.logs.append(f"  - Loaded and described sheet '{sheet_name}' ({df.shape[0]} rows × {df.shape[1]} columns)")

            if not self.excel_data:
                raise ValueError("No sheets could be loaded from the file.")

            self._create_vectorstore()
            self.logs.append("✅ Vector store created successfully.")
            return "\n".join(self.logs)
        except Exception as e:
            raise Exception(f"Error loading Excel file: {str(e)}") from e

    def _clean_dataframe(self, df: pd.DataFrame) -> pd.DataFrame:
        """Cleans a DataFrame by removing empty rows/columns and converting data types.

        Fully empty rows and columns are dropped and the index is reset.
        Each text/object column is then converted to numeric if every value
        parses as a number, otherwise to datetime if every value parses as a
        date, otherwise it is left unchanged.
        """
        df = df.dropna(how='all').dropna(axis=1, how='all').reset_index(drop=True)
        for col in df.columns:
            series = df[col]
            if not (pd.api.types.is_object_dtype(series) or pd.api.types.is_string_dtype(series)):
                continue
            # Numeric first, and stop as soon as one conversion succeeds:
            # pd.to_numeric applied to an already-parsed datetime column
            # would turn the dates into raw int64 nanosecond counts.
            try:
                df[col] = pd.to_numeric(series)
                continue
            except (ValueError, TypeError):
                pass
            try:
                df[col] = pd.to_datetime(series)
            except (ValueError, TypeError, OverflowError):
                # Neither numeric nor date-like: keep the column as text.
                pass
        return df

    def _generate_sheet_description(self, sheet_name: str, df: pd.DataFrame) -> str:
        """Generates a text description of a DataFrame using an LLM.

        Falls back to a plain "sheet name + column list" string when the LLM
        call fails, so loading can proceed without a model response.
        """
        sample_data = df.head(3).to_string()
        prompt = f"""
        Analyze this Excel sheet and provide a concise one-paragraph summary.
        Sheet Name: {sheet_name}
        Columns: {list(df.columns)}
        Sample Data:
        {sample_data}

        Focus on the main purpose of the data, key metrics, and the time period covered.
        """
        try:
            return self.llm.invoke(prompt)
        except Exception:
            # Column labels are not guaranteed to be strings (e.g. numbers or
            # timestamps used as headers), so stringify before joining.
            return f"Sheet: {sheet_name}, Columns: {', '.join(map(str, df.columns))}"

    def _create_vectorstore(self):
        """Creates a FAISS vector store from sheet descriptions for similarity search.

        Leaves ``self.vectorstore`` as None when there are no descriptions to index.
        """
        documents = [
            Document(page_content=desc, metadata={"sheet_name": name})
            for name, desc in self.sheet_descriptions.items()
        ]
        if not documents:
            # Nothing to embed; callers treat a missing store as "use all sheets".
            self.vectorstore = None
            return
        text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
        splits = text_splitter.split_documents(documents)
        self.vectorstore = FAISS.from_documents(splits, self.embeddings)

    def identify_relevant_sheets(self, query: str, k: int = 3) -> List[str]:
        """Identifies the most relevant sheets for a given query using the vector store.

        Args:
            query: The user's question.
            k: Number of description chunks to retrieve from the vector store.

        Returns:
            Unique sheet names in retrieval order. Every loaded sheet is
            returned when no vector store exists or the search fails.
        """
        if self.vectorstore is None:
            return list(self.excel_data.keys())
        try:
            docs = self.vectorstore.similarity_search(query, k=k)
            sheet_names = [doc.metadata['sheet_name'] for doc in docs if 'sheet_name' in doc.metadata]
            # Several chunks can come from the same sheet; dict.fromkeys
            # de-duplicates while keeping the ranking order.
            return list(dict.fromkeys(sheet_names))
        except Exception:
            return list(self.excel_data.keys())

    def query_data(self, query: str) -> Dict[str, Any]:
        """Processes a user query against the loaded Excel data.

        Returns:
            A dict with keys 'query', 'relevant_sheets', 'sheet_results',
            'summary' and 'insights'. On failure the error message is placed
            in 'summary' instead of raising.
        """
        results = {'query': query, 'relevant_sheets': [], 'sheet_results': {}, 'summary': '', 'insights': []}
        try:
            relevant_sheets = self.identify_relevant_sheets(query)
            results['relevant_sheets'] = relevant_sheets

            for sheet_name in relevant_sheets:
                if sheet_name not in self.excel_data:
                    continue
                df = self.excel_data[sheet_name]
                # Only the column list and the first five rows are sent to
                # the model, not the full sheet.
                analysis_prompt = f"""
                Analyze the data from sheet '{sheet_name}' to answer the query: "{query}"
                Columns: {list(df.columns)}
                Sample Data:
                {df.head(5).to_string()}

                Provide a direct answer, including key numbers, trends, or patterns.
                """
                response = self.llm.invoke(analysis_prompt)
                results['sheet_results'][sheet_name] = {'response': response}

            results['summary'] = self._generate_summary(query, results['sheet_results'])
            results['insights'] = self._extract_insights(results['sheet_results'])
            return results
        except Exception as e:
            results['summary'] = f"Error processing query: {str(e)}"
            return results

    def _generate_summary(self, query: str, sheet_results: Dict) -> str:
        """Generates a final, consolidated summary from individual sheet analyses."""
        if not sheet_results:
            return "No relevant data found to answer the query."

        combined_responses = "\n\n".join(
            f"--- Analysis from Sheet '{name}' ---\n{res['response']}"
            for name, res in sheet_results.items()
        )
        prompt = f"""
        Based on the following analyses, provide a final, consolidated answer to the query.
        Original Query: {query}

        {combined_responses}

        Synthesize these findings into a clear and direct summary.
        """
        return self.llm.invoke(prompt)

    def _extract_insights(self, sheet_results: Dict) -> List[str]:
        """Extracts simple, actionable insights from the analysis results.

        Returns the insights in a stable order: sheets in the order they
        appear in ``sheet_results``, numeric insight before trend insight.
        """
        trend_keywords = ['increase', 'decrease', 'growth', 'decline', 'trend', 'pattern']
        # A list (not a set) keeps the output order deterministic; entries
        # cannot repeat because each message embeds a unique sheet name.
        insights = []
        for sheet_name, result in sheet_results.items():
            response = str(result.get('response', '')).lower()
            if re.search(r'\b\d+\.?\d*\b', response):
                insights.append(f"Numerical data found in '{sheet_name}'")
            if any(keyword in response for keyword in trend_keywords):
                insights.append(f"Trend analysis available in '{sheet_name}'")
        return insights
172
+
173
+ # --- Gradio Interface ---
174
+
175
def process_file(api_key, file_obj):
    """Gradio function to load the file and prepare the system.

    Args:
        api_key: OpenAI API key typed into the UI.
        file_obj: Value of the file component; either a filesystem path
            string or an object exposing the path as ``.name``.

    Returns:
        A 5-tuple: the loading log, the ready ``ExcelAIQuerySystem``, and
        three ``gr.update(visible=True)`` values, in the order of the
        outputs wired to the load button.

    Raises:
        gr.Error: If the key or file is missing, or loading fails.
    """
    # Keys are usually pasted; strip stray whitespace so a blank-looking
    # value is rejected here rather than failing later at the API call.
    api_key = (api_key or "").strip()
    if not api_key:
        raise gr.Error("OpenAI API Key is required.")
    if file_obj is None:
        raise gr.Error("Please upload an Excel file.")

    # Accept both shapes the file component can hand over: a plain path
    # string, or a file-like wrapper carrying the path in .name.
    file_path = file_obj if isinstance(file_obj, str) else file_obj.name

    try:
        excel_system = ExcelAIQuerySystem(api_key)
        loading_logs = excel_system.load_excel_file(file_path)
    except Exception as e:
        # Chain the cause so the server log keeps the full traceback.
        raise gr.Error(f"Failed to process file: {e}") from e

    return (
        loading_logs,
        excel_system,
        gr.update(visible=True),
        gr.update(visible=True),
        gr.update(visible=True)
    )
194
+
195
def generate_response(query, system_state):
    """Gradio callback: run a question through the loaded system and format the answer.

    Returns a ``(summary, details)`` pair of Markdown strings. Raises
    ``gr.Error`` when the query is empty, no system has been loaded, or the
    query itself fails.
    """
    if not query:
        raise gr.Error("Please enter a query.")
    if system_state is None:
        raise gr.Error("File not loaded. Please upload and load a file first.")

    try:
        outcome = system_state.query_data(query)
        summary_text = outcome.get('summary', 'No summary available.')
        sheet_list = ", ".join(outcome.get('relevant_sheets', []))
        insight_list = ", ".join(outcome.get('insights', []))

        # The sheets section is always present; the insights section is
        # appended only when there is something to show.
        sections = [f"**🔍 Relevant Sheets Identified:**\n{sheet_list}\n\n"]
        if insight_list:
            sections.append(f"**💡 Key Insights:**\n{insight_list}")

        return summary_text, "".join(sections)
    except Exception as e:
        raise gr.Error(f"Error during query: {e}")
215
+
216
+ # --- UI Layout ---
217
+
218
# Build the whole interface inside one Blocks context; `demo` is the app
# object launched at the bottom of the file.
with gr.Blocks(theme=gr.themes.Soft(), title="Excel AI Query System") as demo:
    # Per-session slot holding the loaded ExcelAIQuerySystem (None until a
    # file has been processed); written by process_file, read by generate_response.
    system_state = gr.State(None)

    gr.Markdown("# 📊 Excel AI Query System")
    gr.Markdown("Upload an Excel file, and ask questions about your data in plain English.")

    with gr.Row():
        # Left column: credentials, file upload and load status.
        with gr.Column(scale=1):
            gr.Markdown("### 1. Setup")
            # NOTE(review): the default value is the server's own
            # OPENAI_API_KEY; presumably that value is delivered to every
            # browser that opens the page — confirm this is acceptable for
            # public deployments.
            api_key_input = gr.Textbox(
                label="OpenAI API Key",
                type="password",
                placeholder="Enter your OpenAI API key...",
                value=os.getenv("OPENAI_API_KEY", "")
            )
            file_input = gr.File(label="Upload Excel File", file_types=[".xlsx", ".xls"])
            load_button = gr.Button("Load File", variant="primary")
            status_output = gr.Textbox(label="Loading Status", interactive=False, lines=5)

        # Right column: question box and results. These start hidden and are
        # made visible by the gr.update(visible=True) values process_file returns.
        with gr.Column(scale=2):
            gr.Markdown("### 2. Ask a Question")
            query_input = gr.Textbox(
                label="Your Question",
                placeholder="e.g., 'What were the total sales in Q3?' or 'Show me the performance trend for Product X.'",
                visible=False
            )
            ask_button = gr.Button("Get Answer", variant="primary", visible=False)

            results_accordion = gr.Accordion("Results", open=False, visible=False)
            with results_accordion:
                summary_output = gr.Markdown(label="Summary")
                details_output = gr.Markdown(label="Details")

    # --- Event Handlers ---

    # Load: the five outputs line up positionally with the 5-tuple that
    # process_file returns (log, system, then three visibility updates).
    load_button.click(
        fn=process_file,
        inputs=[api_key_input, file_input],
        outputs=[status_output, system_state, query_input, ask_button, results_accordion]
    )

    # Ask: fill in the two Markdown panes, then expand the accordion in a
    # follow-up step chained with .then().
    ask_button.click(
        fn=generate_response,
        inputs=[query_input, system_state],
        outputs=[summary_output, details_output]
    ).then(
        lambda: gr.update(open=True),
        outputs=results_accordion
    )
267
if __name__ == "__main__":
    # Bind to the port named by the PORT environment variable (set by the
    # Render hosting platform), defaulting to 10000 when it is absent.
    listen_port = int(os.getenv('PORT', 10000))
    # 0.0.0.0 listens on all interfaces so the app is reachable from outside the container.
    demo.launch(server_name="0.0.0.0", server_port=listen_port)