Adityabhaskar committed on
Commit
6ef497e
·
verified ·
1 Parent(s): 253800a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +245 -148
app.py CHANGED
@@ -1,180 +1,277 @@
 
 
 
 
 
 
 
 
1
  import os
 
 
2
  import gradio as gr
3
- import pandas as pd
4
- from typing import List, Dict, Any
5
 
6
- # --- LlamaIndex & LangChain Imports ---
7
- from llama_index.core import VectorStoreIndex, Document, Settings
8
- from llama_index.llms.openai import OpenAI as LlamaOpenAI
9
- from llama_index.embeddings.openai import OpenAIEmbedding
10
- from langchain_experimental.agents.agent_toolkits import create_pandas_dataframe_agent
11
- from langchain_openai import ChatOpenAI
12
- from langchain.agents.agent_types import AgentType
13
 
14
- class HybridExcelQuerySystem:
 
 
 
15
  def __init__(self, openai_api_key: str):
16
  os.environ["OPENAI_API_KEY"] = openai_api_key
17
- Settings.llm = LlamaOpenAI(model="gpt-4o")
18
- Settings.embed_model = OpenAIEmbedding(model="text-embedding-3-small")
19
- self.agent_llm = ChatOpenAI(temperature=0, model="gpt-4o")
20
-
21
- self.dataframes: Dict[str, pd.DataFrame] = {}
22
- self.vector_stores: Dict[str, VectorStoreIndex] = {}
23
  self.logs = []
24
- self.sheet_names = []
25
-
26
- def _pivot_numerical_data(self, df: pd.DataFrame) -> pd.DataFrame:
27
- """
28
- --- NEW, MORE ROBUST VERSION ---
29
- Intelligently reconstructs the 'Numerical Data' sheet into a clean, tidy format.
30
- """
31
- # Define expected months to correctly identify the data block
32
- months = ['Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec', 'Jan', 'Feb', 'Mar']
33
-
34
- # Find the first row that contains at least one month name
35
- header_row_index = df[df.isin(months)].dropna(how='all').index[0]
36
- header_series = df.iloc[header_row_index]
37
-
38
- # Filter this row to get only the actual month names
39
- actual_month_headers = [h for h in header_series if h in months]
40
-
41
- # Find the start and end column positions of the month data
42
- start_col_pos = header_series.tolist().index(actual_month_headers[0])
43
- end_col_pos = header_series.tolist().index(actual_month_headers[-1])
44
-
45
- # Find the start row of the financial metrics
46
- metric_col = df.iloc[:, 0].dropna()
47
- start_row_pos = metric_col.index[1] # The metrics start after "Profit & Loss Account"
48
-
49
- # Slice the core data block
50
- data = df.iloc[start_row_pos:, start_col_pos:end_col_pos+1]
51
- metrics = df.iloc[start_row_pos:, 0]
52
-
53
- # Create a new, clean DataFrame
54
- clean_df = pd.DataFrame(data.values, index=metrics, columns=actual_month_headers)
55
-
56
- # Transpose so months are rows
57
- clean_df = clean_df.T
58
- clean_df = clean_df.reset_index().rename(columns={'index': 'Month'})
59
-
60
- # Clean column names
61
- clean_df.columns = clean_df.columns.str.strip().str.replace(r'[^a-zA-Z0-9_%]', '_', regex=True).str.replace('__', '_')
62
-
63
- # Convert all columns except 'Month' to numeric
64
- for col in clean_df.columns:
65
- if col != 'Month':
66
- if clean_df[col].dtype == 'object':
67
- clean_df[col] = clean_df[col].astype(str).str.replace('%', '', regex=False)
68
- clean_df[col] = pd.to_numeric(clean_df[col], errors='coerce')
69
-
70
- return clean_df
71
-
72
 
73
  def load_excel_file(self, file_path: str) -> str:
 
74
  self.logs.clear()
75
  try:
76
- xls = pd.ExcelFile(file_path)
77
- self.sheet_names = xls.sheet_names
78
- self.logs.append(f"✅ Found {len(self.sheet_names)} sheets: {', '.join(self.sheet_names)}")
79
 
80
- for sheet_name in self.sheet_names:
81
- df = pd.read_excel(file_path, sheet_name=sheet_name, header=None)
82
-
83
- if sheet_name == "Numerical Data":
84
- agent_df = self._pivot_numerical_data(df.copy())
85
- else:
86
- agent_df = self._clean_dataframe_for_agent(df.copy())
87
- self.dataframes[sheet_name] = agent_df
88
-
89
- rag_df = self._clean_dataframe_for_rag(df.copy())
90
- markdown_text = rag_df.to_markdown(index=False)
91
- doc = Document(text=markdown_text, metadata={"source": sheet_name})
92
- self.vector_stores[sheet_name] = VectorStoreIndex.from_documents([doc])
93
-
94
- self.logs.append(f" - Prepared sheet '{sheet_name}' for both Lookup and Calculation.")
95
 
96
- self.logs.append("✅ All sheets are ready.")
 
97
  return "\n".join(self.logs)
98
  except Exception as e:
99
- raise Exception(f"Error loading Excel file: {e}")
100
 
101
- def _clean_dataframe_for_agent(self, df: pd.DataFrame) -> pd.DataFrame:
102
- df.columns = [f"Col_{i}" for i in range(len(df.columns))]
103
- return df
104
-
105
- def _clean_dataframe_for_rag(self, df: pd.DataFrame) -> pd.DataFrame:
106
  for col in df.columns:
107
- df[col] = df[col].astype(str)
 
 
 
 
 
 
 
 
108
  return df
109
 
110
- def _classify_query(self, query: str) -> str:
111
- prompt = f"""Classify the user's query about an Excel sheet as either "lookup" or "calculation". "lookup": for questions asking for specific data or summaries. "calculation": for questions requiring math, sorting, or filtering. Query: "{query}" Classification:"""
112
- response = self.agent_llm.invoke(prompt)
113
- classification = response.content.strip().lower()
114
- return "calculation" if "calculation" in classification else "lookup"
115
-
116
- def query(self, query: str, selected_sheet: str) -> Dict[str, Any]:
117
- if not selected_sheet:
118
- return {"answer": "Error: Please select a sheet first.", "tool_used": "None"}
119
- classification = self._classify_query(query)
120
- if classification == "calculation":
121
- return self._execute_agent_query(query, selected_sheet)
122
- else:
123
- return self._execute_rag_query(query, selected_sheet)
124
-
125
- def _execute_rag_query(self, query: str, sheet_name: str) -> Dict[str, Any]:
126
  try:
127
- query_engine = self.vector_stores[sheet_name].as_query_engine()
128
- response = query_engine.query(query)
129
- return {"answer": str(response), "tool_used": "Lookup (RAG Search)"}
130
- except Exception as e:
131
- return {"answer": f"Error during lookup: {e}", "tool_used": "Lookup (RAG Search)"}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
132
 
133
- def _execute_agent_query(self, query: str, sheet_name: str) -> Dict[str, Any]:
 
 
134
  try:
135
- df = self.dataframes[sheet_name]
136
- agent = create_pandas_dataframe_agent(self.agent_llm, df, agent_type=AgentType.ZERO_SHOT_REACT_DESCRIPTION, verbose=True, allow_dangerous_code=True, max_iterations=15, handle_parsing_errors=True)
137
- response = agent.invoke(query)
138
- return {"answer": response['output'], "tool_used": "Calculation (Pandas Agent)"}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
139
  except Exception as e:
140
- return {"answer": f"Error during calculation: {e}", "tool_used": "Calculation (Pandas Agent)"}
 
141
 
142
- # --- Gradio UI ---
143
- def process_excel(api_key: str, file_obj: gr.File):
144
- if not api_key: raise gr.Error("Please provide your OpenAI API key.")
145
- if not file_obj: raise gr.Error("Please upload an Excel file.")
146
-
147
- system = HybridExcelQuerySystem(openai_api_key=api_key)
148
- logs = system.load_excel_file(file_obj.name)
149
- sheet_names = system.sheet_names
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
150
 
151
- return (logs, system, gr.update(choices=sheet_names, value=sheet_names[0] if sheet_names else None, visible=True), gr.update(visible=True))
152
-
153
- def user_interaction(question: str, history: List, system_state: HybridExcelQuerySystem, selected_sheet: str):
154
- if not system_state: raise gr.Error("Please upload and process a file first.")
155
- result = system_state.query(question, selected_sheet)
156
- answer = result.get("answer", "No response.")
157
- tool_used = result.get("tool_used", "Unknown")
158
- full_response = f"{answer}\n\n*Tool Used: {tool_used}*"
159
- return full_response
160
-
161
- with gr.Blocks(theme=gr.themes.Soft(), title="Hybrid Excel Analyzer") as demo:
162
- system_state = gr.State()
163
- gr.Markdown("# 🤖 Hybrid Excel Analyzer")
164
- gr.Markdown("This app automatically chooses the best AI tool to answer your questions about an Excel file.")
 
 
 
 
 
 
 
 
165
  with gr.Row():
166
  with gr.Column(scale=1):
167
  gr.Markdown("### 1. Setup")
168
- openai_api_key = gr.Textbox(label="OpenAI API Key", type="password", value=os.getenv("OPENAI_API_KEY", ""))
169
- excel_upload = gr.File(label="Upload Excel File", file_types=[".xlsx", ".xls"])
170
- process_button = gr.Button("Process File", variant="primary")
171
- status_text = gr.Textbox(label="Processing Status", interactive=False, lines=8)
 
 
 
 
 
 
172
  with gr.Column(scale=2):
173
  gr.Markdown("### 2. Ask a Question")
174
- with gr.Group(visible=False) as query_ui:
175
- sheet_selector = gr.Dropdown(label="Select a Sheet")
176
- chat_interface = gr.ChatInterface(fn=user_interaction, additional_inputs=[system_state, sheet_selector], title="Chat with your Excel Data")
177
- process_button.click(fn=process_excel, inputs=[openai_api_key, excel_upload], outputs=[status_text, system_state, sheet_selector, query_ui])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
178
 
179
  if __name__ == "__main__":
180
- demo.launch()
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import numpy as np
3
+ from langchain_openai import OpenAI
4
+ from langchain_core.documents import Document
5
+ from langchain_community.vectorstores import FAISS
6
+ from langchain_openai import OpenAIEmbeddings
7
+ from langchain_text_splitters import RecursiveCharacterTextSplitter
8
+ import re
9
  import os
10
+ from typing import Dict, List, Any
11
+ import warnings
12
  import gradio as gr
13
+ from dotenv import load_dotenv
 
14
 
15
+ # Ignore warnings for a cleaner interface
16
+ warnings.filterwarnings('ignore')
17
+ # Load environment variables from .env file
18
+ load_dotenv()
 
 
 
19
 
20
class ExcelAIQuerySystem:
    """
    A system to query Excel files using natural language, powered by OpenAI and LangChain.

    Workflow: ``load_excel_file`` reads every sheet, cleans it, asks the LLM
    for a short description of each sheet and indexes those descriptions in a
    FAISS vector store.  ``query_data`` then picks the sheets whose
    descriptions are closest to the question, asks the LLM about each one and
    merges the per-sheet answers into a single summary.
    """

    def __init__(self, openai_api_key: str):
        """Create the LLM / embedding clients and empty per-workbook state.

        Args:
            openai_api_key: OpenAI API key used for completions and embeddings.
        """
        # Kept for backward compatibility with code that reads the key from
        # the environment.  The clients below receive the key explicitly so an
        # instance does not depend on this process-global value.
        os.environ["OPENAI_API_KEY"] = openai_api_key
        self.llm = OpenAI(temperature=0, openai_api_key=openai_api_key)
        self.embeddings = OpenAIEmbeddings(openai_api_key=openai_api_key)
        self.excel_data = {}          # sheet name -> cleaned DataFrame
        self.sheet_descriptions = {}  # sheet name -> text description
        self.vectorstore = None       # FAISS index over the descriptions
        self.logs = []                # human-readable progress messages

    def load_excel_file(self, file_path: str) -> str:
        """Loads and processes an Excel file, generating descriptions and a vector store.

        Args:
            file_path: Path of the workbook to read.

        Returns:
            The progress log, one message per line.

        Raises:
            Exception: If the workbook cannot be opened or indexing fails.
                A failure in a single sheet is only logged and that sheet is
                skipped.
        """
        # Start from a clean slate so a second load does not mix workbooks.
        self.logs.clear()
        self.excel_data.clear()
        self.sheet_descriptions.clear()
        self.vectorstore = None
        try:
            # Open the workbook once and make sure the handle is released.
            with pd.ExcelFile(file_path) as excel_file:
                sheet_names = excel_file.sheet_names
                self.logs.append(f"✅ Found {len(sheet_names)} sheets: {', '.join(sheet_names)}")

                for sheet_name in sheet_names:
                    try:
                        df = pd.read_excel(excel_file, sheet_name=sheet_name)
                        df = self._clean_dataframe(df)
                        self.excel_data[sheet_name] = df

                        description = self._generate_sheet_description(sheet_name, df)
                        self.sheet_descriptions[sheet_name] = description
                        self.logs.append(
                            f"  - Loaded and described sheet '{sheet_name}' "
                            f"({df.shape[0]} rows × {df.shape[1]} columns)"
                        )
                    except Exception as e:
                        # Best effort: one broken sheet must not abort the load.
                        self.logs.append(f"⚠️ Error loading sheet '{sheet_name}': {str(e)}")
                        continue

            self._create_vectorstore()
            if self.vectorstore is not None:
                self.logs.append("✅ Vector store created successfully.")
            else:
                self.logs.append("⚠️ No sheet could be described; vector store was not created.")
            return "\n".join(self.logs)
        except Exception as e:
            raise Exception(f"Error loading Excel file: {str(e)}") from e

    def _clean_dataframe(self, df: pd.DataFrame) -> pd.DataFrame:
        """Cleans a DataFrame by removing empty rows/columns and converting data types.

        Text columns are converted to numbers when every value parses as a
        number, otherwise to datetimes when every value parses as a date,
        otherwise they are left unchanged.

        Args:
            df: Raw sheet contents.

        Returns:
            A new DataFrame with a fresh ``0..n-1`` index.
        """
        df = df.dropna(how='all').dropna(axis=1, how='all').reset_index(drop=True)
        for col in df.columns:
            series = df[col]
            is_text = (
                pd.api.types.is_object_dtype(series)
                or pd.api.types.is_string_dtype(series)
            )
            if not is_text:
                continue
            # Numeric first: running to_numeric on an already converted
            # datetime column would turn the dates into integer nanoseconds.
            for convert in (pd.to_numeric, pd.to_datetime):
                try:
                    df[col] = convert(series)
                except (ValueError, TypeError, OverflowError):
                    continue
                break
        return df

    def _generate_sheet_description(self, sheet_name: str, df: pd.DataFrame) -> str:
        """Generates a text description of a DataFrame using an LLM.

        Args:
            sheet_name: Name of the sheet being described.
            df: Cleaned sheet contents; only the first three rows are sent.

        Returns:
            The LLM's summary, or a plain "sheet name + columns" line when the
            LLM call fails.
        """
        sample_data = df.head(3).to_string()
        prompt = f"""
        Analyze this Excel sheet and provide a concise one-paragraph summary.
        Sheet Name: {sheet_name}
        Columns: {list(df.columns)}
        Sample Data:
        {sample_data}

        Focus on the main purpose of the data, key metrics, and the time period covered.
        """
        try:
            return self.llm.invoke(prompt)
        except Exception:
            # Column labels may be ints or timestamps, so stringify them
            # before joining; otherwise the fallback itself would raise.
            return f"Sheet: {sheet_name}, Columns: {', '.join(map(str, df.columns))}"

    def _create_vectorstore(self):
        """Creates a FAISS vector store from sheet descriptions for similarity search.

        Leaves ``self.vectorstore`` as ``None`` when there is nothing to
        index, so ``identify_relevant_sheets`` falls back to all sheets.
        """
        if not self.sheet_descriptions:
            self.vectorstore = None
            return
        documents = [
            Document(page_content=desc, metadata={"sheet_name": name})
            for name, desc in self.sheet_descriptions.items()
        ]
        text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
        splits = text_splitter.split_documents(documents)
        self.vectorstore = FAISS.from_documents(splits, self.embeddings)

    def identify_relevant_sheets(self, query: str) -> List[str]:
        """Identifies the most relevant sheets for a given query using the vector store.

        Args:
            query: The user's question.

        Returns:
            Sheet names of the closest description chunks, without duplicates
            and in ranking order.  Every loaded sheet is returned when there
            is no vector store or the search fails.
        """
        if not self.vectorstore:
            return list(self.excel_data.keys())
        try:
            docs = self.vectorstore.similarity_search(query, k=3)
            sheet_names = [doc.metadata['sheet_name'] for doc in docs if 'sheet_name' in doc.metadata]
            # dict.fromkeys removes duplicates while keeping the ranking order.
            return list(dict.fromkeys(sheet_names))
        except Exception:
            return list(self.excel_data.keys())

    def query_data(self, query: str) -> Dict[str, Any]:
        """Processes a user query against the loaded Excel data.

        Args:
            query: The user's question.

        Returns:
            A dict with the keys ``query``, ``relevant_sheets``,
            ``sheet_results`` (sheet name -> ``{'response': ...}``),
            ``summary`` and ``insights``.  On failure ``summary`` holds the
            error message and the other fields keep what was gathered so far.
        """
        results = {'query': query, 'relevant_sheets': [], 'sheet_results': {}, 'summary': '', 'insights': []}
        try:
            relevant_sheets = self.identify_relevant_sheets(query)
            results['relevant_sheets'] = relevant_sheets

            for sheet_name in relevant_sheets:
                if sheet_name not in self.excel_data:
                    continue
                df = self.excel_data[sheet_name]
                # Only the first five rows are shown to the model.
                analysis_prompt = f"""
                Analyze the data from sheet '{sheet_name}' to answer the query: "{query}"
                Columns: {list(df.columns)}
                Sample Data:
                {df.head(5).to_string()}

                Provide a direct answer, including key numbers, trends, or patterns.
                """
                response = self.llm.invoke(analysis_prompt)
                results['sheet_results'][sheet_name] = {'response': response}

            results['summary'] = self._generate_summary(query, results['sheet_results'])
            results['insights'] = self._extract_insights(results['sheet_results'])
            return results
        except Exception as e:
            results['summary'] = f"Error processing query: {str(e)}"
            return results

    def _generate_summary(self, query: str, sheet_results: Dict) -> str:
        """Generates a final, consolidated summary from individual sheet analyses.

        Args:
            query: The user's original question.
            sheet_results: Mapping of sheet name to ``{'response': ...}``.

        Returns:
            The LLM's consolidated answer, or a fixed message when
            ``sheet_results`` is empty.
        """
        if not sheet_results:
            return "No relevant data found to answer the query."

        combined_responses = "\n\n".join(
            f"--- Analysis from Sheet '{name}' ---\n{res['response']}"
            for name, res in sheet_results.items()
        )
        prompt = f"""
        Based on the following analyses, provide a final, consolidated answer to the query.
        Original Query: {query}

        {combined_responses}

        Synthesize these findings into a clear and direct summary.
        """
        return self.llm.invoke(prompt)

    def _extract_insights(self, sheet_results: Dict) -> List[str]:
        """Extracts simple, actionable insights from the analysis results.

        Args:
            sheet_results: Mapping of sheet name to ``{'response': ...}``.

        Returns:
            Insight strings in a stable order: sheets in the order given, and
            for each sheet the numeric insight before the trend insight.
        """
        trend_keywords = ('increase', 'decrease', 'growth', 'decline', 'trend', 'pattern')
        # A list, not a set, so the output order is the same on every run.
        insights: List[str] = []
        for sheet_name, result in sheet_results.items():
            response = str(result.get('response', '')).lower()
            if re.search(r'\b\d+\.?\d*\b', response):
                insights.append(f"Numerical data found in '{sheet_name}'")
            if any(keyword in response for keyword in trend_keywords):
                insights.append(f"Trend analysis available in '{sheet_name}'")
        return insights
172
+
173
+ # --- Gradio Interface ---
174
+
175
def process_file(api_key, file_obj):
    """Gradio function to load the file and prepare the system.

    Args:
        api_key: OpenAI API key entered by the user.
        file_obj: Value of the file upload component: either an object with a
            ``name`` attribute holding the path, or the path itself.

    Returns:
        A 5-tuple: the loading log, the ready ``ExcelAIQuerySystem`` and three
        ``gr.update(visible=True)`` values, one per output component wired to
        this handler after the log and the state.

    Raises:
        gr.Error: If the key or the file is missing, or loading fails.
    """
    if not api_key:
        raise gr.Error("OpenAI API Key is required.")
    if file_obj is None:
        raise gr.Error("Please upload an Excel file.")
    try:
        # Accept both a file wrapper exposing ``.name`` and a bare path.
        file_path = getattr(file_obj, "name", file_obj)
        excel_system = ExcelAIQuerySystem(api_key)
        loading_logs = excel_system.load_excel_file(file_path)

        return (
            loading_logs,
            excel_system,
            gr.update(visible=True),
            gr.update(visible=True),
            gr.update(visible=True),
        )
    except Exception as e:
        # Chain the cause so the server log keeps the original traceback.
        raise gr.Error(f"Failed to process file: {e}") from e
194
+
195
def generate_response(query, system_state):
    """Gradio function to handle user queries and display results.

    Args:
        query: The question typed by the user.
        system_state: The loaded query system (anything exposing
            ``query_data(query) -> dict``), or ``None`` if no file is loaded.

    Returns:
        A ``(summary, details)`` pair of Markdown strings.

    Raises:
        gr.Error: If the query is blank, no file is loaded, or the query fails.
    """
    # A whitespace-only question would still trigger paid LLM calls.
    if not query or not query.strip():
        raise gr.Error("Please enter a query.")
    if system_state is None:
        raise gr.Error("File not loaded. Please upload and load a file first.")

    try:
        result = system_state.query_data(query.strip())
        summary = result.get('summary', 'No summary available.')
        sheets = ", ".join(result.get('relevant_sheets', []))
        insights = ", ".join(result.get('insights', []))

        details = f"**🔍 Relevant Sheets Identified:**\n{sheets}\n\n"
        if insights:
            details += f"**💡 Key Insights:**\n{insights}"

        return summary, details
    except Exception as e:
        # Chain the cause so the server log keeps the original traceback.
        raise gr.Error(f"Error during query: {e}") from e
215
+
216
+ # --- UI Layout ---
217
+
218
with gr.Blocks(theme=gr.themes.Soft(), title="Excel AI Query System") as demo:
    # Holds the loaded ExcelAIQuerySystem between events.
    system_state = gr.State(None)

    gr.Markdown("# 📊 Excel AI Query System")
    gr.Markdown("Upload an Excel file, and ask questions about your data in plain English.")

    with gr.Row():
        # Left column: credentials, upload and loading status.
        with gr.Column(scale=1):
            gr.Markdown("### 1. Setup")
            api_key_input = gr.Textbox(
                label="OpenAI API Key",
                type="password",
                placeholder="Enter your OpenAI API key...",
                value=os.getenv("OPENAI_API_KEY", ""),
            )
            file_input = gr.File(label="Upload Excel File", file_types=[".xlsx", ".xls"])
            load_button = gr.Button("Load File", variant="primary")
            status_output = gr.Textbox(label="Loading Status", interactive=False, lines=5)

        # Right column: question box and results, hidden until a file is loaded.
        with gr.Column(scale=2):
            gr.Markdown("### 2. Ask a Question")
            query_input = gr.Textbox(
                label="Your Question",
                placeholder="e.g., 'What were the total sales in Q3?' or 'Show me the performance trend for Product X.'",
                visible=False,
            )
            ask_button = gr.Button("Get Answer", variant="primary", visible=False)

            with gr.Accordion("Results", open=False, visible=False) as results_accordion:
                summary_output = gr.Markdown(label="Summary")
                details_output = gr.Markdown(label="Details")

    # --- Event Handlers ---

    def _open_results():
        """Expand the results accordion."""
        return gr.update(open=True)

    # Loading a file fills the status box, stores the system and reveals the
    # three hidden query widgets.
    load_outputs = [status_output, system_state, query_input, ask_button, results_accordion]
    load_button.click(
        fn=process_file,
        inputs=[api_key_input, file_input],
        outputs=load_outputs,
    )

    # Asking a question renders the answer, then opens the accordion.
    answer_event = ask_button.click(
        fn=generate_response,
        inputs=[query_input, system_state],
        outputs=[summary_output, details_output],
    )
    answer_event.then(_open_results, outputs=results_accordion)


if __name__ == "__main__":
    demo.launch(share=True)
271
+
272
+ # # --- To this ---
273
+ # if __name__ == "__main__":
274
+ # # Render provides the PORT environment variable
275
+ # port = int(os.environ.get('PORT', 10000))
276
+ # # Launch on 0.0.0.0 to make it accessible outside the container
277
+ # demo.launch(server_name="0.0.0.0", server_port=port)