Adityabhaskar committed on
Commit
9b5e972
·
verified ·
1 Parent(s): 6ef497e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +82 -156
app.py CHANGED
@@ -1,38 +1,39 @@
1
  import pandas as pd
2
- import numpy as np
3
- from langchain_openai import OpenAI
4
- from langchain_core.documents import Document
5
- from langchain_community.vectorstores import FAISS
6
- from langchain_openai import OpenAIEmbeddings
7
- from langchain_text_splitters import RecursiveCharacterTextSplitter
8
- import re
9
  import os
10
- from typing import Dict, List, Any
11
  import warnings
12
  import gradio as gr
13
  from dotenv import load_dotenv
14
 
 
 
 
 
15
  # Ignore warnings for a cleaner interface
16
  warnings.filterwarnings('ignore')
17
  # Load environment variables from .env file
18
  load_dotenv()
19
 
20
- class ExcelAIQuerySystem:
21
  """
22
- A system to query Excel files using natural language, powered by OpenAI and LangChain.
 
 
23
  """
24
  def __init__(self, openai_api_key: str):
 
25
  os.environ["OPENAI_API_KEY"] = openai_api_key
 
26
  self.llm = OpenAI(temperature=0)
27
- self.embeddings = OpenAIEmbeddings()
28
- self.excel_data = {}
29
- self.sheet_descriptions = {}
30
- self.vectorstore = None
31
  self.logs = []
32
 
33
- def load_excel_file(self, file_path: str) -> str:
34
- """Loads and processes an Excel file, generating descriptions and a vector store."""
 
 
 
35
  self.logs.clear()
 
36
  try:
37
  excel_file = pd.ExcelFile(file_path)
38
  sheet_names = excel_file.sheet_names
@@ -43,132 +44,57 @@ class ExcelAIQuerySystem:
43
  df = pd.read_excel(file_path, sheet_name=sheet_name)
44
  df = self._clean_dataframe(df)
45
  self.excel_data[sheet_name] = df
46
-
47
- description = self._generate_sheet_description(sheet_name, df)
48
- self.sheet_descriptions[sheet_name] = description
49
- self.logs.append(f" - Loaded and described sheet '{sheet_name}' ({df.shape[0]} rows × {df.shape[1]} columns)")
50
  except Exception as e:
51
  self.logs.append(f"⚠️ Error loading sheet '{sheet_name}': {str(e)}")
52
  continue
53
 
54
- self._create_vectorstore()
55
- self.logs.append("✅ Vector store created successfully.")
56
- return "\n".join(self.logs)
57
  except Exception as e:
58
  raise Exception(f"Error loading Excel file: {str(e)}")
59
 
60
  def _clean_dataframe(self, df: pd.DataFrame) -> pd.DataFrame:
61
- """Cleans a DataFrame by removing empty rows/columns and converting data types."""
62
  df = df.dropna(how='all').dropna(axis=1, how='all').reset_index(drop=True)
 
63
  for col in df.columns:
64
  if df[col].dtype == 'object':
65
  try:
66
- df[col] = pd.to_datetime(df[col], errors='ignore')
67
  except:
68
  pass
69
  try:
70
- df[col] = pd.to_numeric(df[col], errors='ignore')
71
  except:
72
  pass
73
  return df
74
 
75
- def _generate_sheet_description(self, sheet_name: str, df: pd.DataFrame) -> str:
76
- """Generates a text description of a DataFrame using an LLM."""
77
- sample_data = df.head(3).to_string()
78
- prompt = f"""
79
- Analyze this Excel sheet and provide a concise one-paragraph summary.
80
- Sheet Name: {sheet_name}
81
- Columns: {list(df.columns)}
82
- Sample Data:
83
- {sample_data}
84
-
85
- Focus on the main purpose of the data, key metrics, and the time period covered.
86
  """
 
 
 
 
 
 
 
87
  try:
88
- return self.llm.invoke(prompt)
89
- except Exception:
90
- return f"Sheet: {sheet_name}, Columns: {', '.join(list(df.columns))}"
91
-
92
- def _create_vectorstore(self):
93
- """Creates a FAISS vector store from sheet descriptions for similarity search."""
94
- documents = [
95
- Document(page_content=desc, metadata={"sheet_name": name})
96
- for name, desc in self.sheet_descriptions.items()
97
- ]
98
- text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
99
- splits = text_splitter.split_documents(documents)
100
- self.vectorstore = FAISS.from_documents(splits, self.embeddings)
101
-
102
- def identify_relevant_sheets(self, query: str) -> List[str]:
103
- """Identifies the most relevant sheets for a given query using the vector store."""
104
- if not self.vectorstore:
105
- return list(self.excel_data.keys())
106
- try:
107
- docs = self.vectorstore.similarity_search(query, k=3)
108
- sheet_names = [doc.metadata['sheet_name'] for doc in docs if 'sheet_name' in doc.metadata]
109
- return list(dict.fromkeys(sheet_names))[:5]
110
- except Exception:
111
- return list(self.excel_data.keys())
112
-
113
- def query_data(self, query: str) -> Dict[str, Any]:
114
- """Processes a user query against the loaded Excel data."""
115
- results = {'query': query, 'relevant_sheets': [], 'sheet_results': {}, 'summary': '', 'insights': []}
116
- try:
117
- relevant_sheets = self.identify_relevant_sheets(query)
118
- results['relevant_sheets'] = relevant_sheets
119
-
120
- for sheet_name in relevant_sheets:
121
- if sheet_name not in self.excel_data:
122
- continue
123
- df = self.excel_data[sheet_name]
124
- analysis_prompt = f"""
125
- Analyze the data from sheet '{sheet_name}' to answer the query: "{query}"
126
- Columns: {list(df.columns)}
127
- Sample Data:
128
- {df.head(5).to_string()}
129
-
130
- Provide a direct answer, including key numbers, trends, or patterns.
131
- """
132
- response = self.llm.invoke(analysis_prompt)
133
- results['sheet_results'][sheet_name] = {'response': response}
134
 
135
- results['summary'] = self._generate_summary(query, results['sheet_results'])
136
- results['insights'] = self._extract_insights(results['sheet_results'])
137
- return results
138
  except Exception as e:
139
- results['summary'] = f"Error processing query: {str(e)}"
140
- return results
141
-
142
- def _generate_summary(self, query: str, sheet_results: Dict) -> str:
143
- """Generates a final, consolidated summary from individual sheet analyses."""
144
- if not sheet_results:
145
- return "No relevant data found to answer the query."
146
-
147
- combined_responses = "\n\n".join(
148
- f"--- Analysis from Sheet '{name}' ---\n{res['response']}"
149
- for name, res in sheet_results.items()
150
- )
151
- prompt = f"""
152
- Based on the following analyses, provide a final, consolidated answer to the query.
153
- Original Query: {query}
154
-
155
- {combined_responses}
156
-
157
- Synthesize these findings into a clear and direct summary.
158
- """
159
- return self.llm.invoke(prompt)
160
-
161
- def _extract_insights(self, sheet_results: Dict) -> List[str]:
162
- """Extracts simple, actionable insights from the analysis results."""
163
- insights = set()
164
- for sheet_name, result in sheet_results.items():
165
- response = result.get('response', '').lower()
166
- if re.search(r'\b\d+\.?\d*\b', response):
167
- insights.add(f"Numerical data found in '{sheet_name}'")
168
- trend_keywords = ['increase', 'decrease', 'growth', 'decline', 'trend', 'pattern']
169
- if any(keyword in response for keyword in trend_keywords):
170
- insights.add(f"Trend analysis available in '{sheet_name}'")
171
- return list(insights)
172
 
173
  # --- Gradio Interface ---
174
 
@@ -179,47 +105,50 @@ def process_file(api_key, file_obj):
179
  if file_obj is None:
180
  raise gr.Error("Please upload an Excel file.")
181
  try:
182
- excel_system = ExcelAIQuerySystem(api_key)
183
- loading_logs = excel_system.load_excel_file(file_obj.name)
 
184
 
 
185
  return (
186
  loading_logs,
187
- excel_system,
 
 
 
188
  gr.update(visible=True),
189
  gr.update(visible=True),
190
- gr.update(visible=True)
 
191
  )
192
  except Exception as e:
193
  raise gr.Error(f"Failed to process file: {e}")
194
 
195
- def generate_response(query, system_state):
196
  """Gradio function to handle user queries and display results."""
197
  if not query:
198
- raise gr.Error("Please enter a query.")
 
 
199
  if system_state is None:
200
  raise gr.Error("File not loaded. Please upload and load a file first.")
201
 
202
  try:
203
- result = system_state.query_data(query)
204
- summary = result.get('summary', 'No summary available.')
205
- sheets = ", ".join(result.get('relevant_sheets', []))
206
- insights = ", ".join(result.get('insights', []))
207
 
208
- details = f"**🔍 Relevant Sheets Identified:**\n{sheets}\n\n"
209
- if insights:
210
- details += f"**💡 Key Insights:**\n{insights}"
211
-
212
- return summary, details
213
  except Exception as e:
214
  raise gr.Error(f"Error during query: {e}")
215
 
216
  # --- UI Layout ---
217
 
218
- with gr.Blocks(theme=gr.themes.Soft(), title="Excel AI Query System") as demo:
219
  system_state = gr.State(None)
220
 
221
- gr.Markdown("# 📊 Excel AI Query System")
222
- gr.Markdown("Upload an Excel file, and ask questions about your data in plain English.")
223
 
224
  with gr.Row():
225
  with gr.Column(scale=1):
@@ -232,46 +161,43 @@ with gr.Blocks(theme=gr.themes.Soft(), title="Excel AI Query System") as demo:
232
  )
233
  file_input = gr.File(label="Upload Excel File", file_types=[".xlsx", ".xls"])
234
  load_button = gr.Button("Load File", variant="primary")
235
- status_output = gr.Textbox(label="Loading Status", interactive=False, lines=5)
236
 
237
  with gr.Column(scale=2):
238
  gr.Markdown("### 2. Ask a Question")
 
 
 
 
 
 
239
  query_input = gr.Textbox(
240
  label="Your Question",
241
- placeholder="e.g., 'What were the total sales in Q3?' or 'Show me the performance trend for Product X.'",
242
- visible=False
 
243
  )
244
  ask_button = gr.Button("Get Answer", variant="primary", visible=False)
245
 
 
246
  results_accordion = gr.Accordion("Results", open=False, visible=False)
247
  with results_accordion:
248
- summary_output = gr.Markdown(label="Summary")
249
- details_output = gr.Markdown(label="Details")
250
 
251
  # --- Event Handlers ---
252
 
253
  load_button.click(
254
  fn=process_file,
255
  inputs=[api_key_input, file_input],
256
- outputs=[status_output, system_state, query_input, ask_button, results_accordion]
257
  )
258
 
259
  ask_button.click(
260
  fn=generate_response,
261
- inputs=[query_input, system_state],
262
- outputs=[summary_output, details_output]
263
- ).then(
264
- lambda: gr.update(open=True),
265
- outputs=results_accordion
266
  )
267
 
268
 
269
  if __name__ == "__main__":
270
- demo.launch(share=True)
271
-
272
- # # --- To this ---
273
- # if __name__ == "__main__":
274
- # # Render provides the PORT environment variable
275
- # port = int(os.environ.get('PORT', 10000))
276
- # # Launch on 0.0.0.0 to make it accessible outside the container
277
- # demo.launch(server_name="0.0.0.0", server_port=port)
 
1
  import pandas as pd
 
 
 
 
 
 
 
2
  import os
 
3
  import warnings
4
  import gradio as gr
5
  from dotenv import load_dotenv
6
 
7
+ # New imports for the Pandas Agent
8
+ from langchain_openai import OpenAI
9
+ from langchain_experimental.agents.agent_toolkits import create_pandas_dataframe_agent
10
+
11
  # Ignore warnings for a cleaner interface
12
  warnings.filterwarnings('ignore')
13
  # Load environment variables from .env file
14
  load_dotenv()
15
 
16
+ class ExcelPandasAgent:
17
  """
18
+ An agent-based system to query Excel files using natural language,
19
+ powered by an OpenAI LLM and a Pandas DataFrame Agent.
20
+ This version can perform mathematical calculations, comparisons, and data analysis.
21
  """
22
  def __init__(self, openai_api_key: str):
23
+ """Initializes the system with the OpenAI API key."""
24
  os.environ["OPENAI_API_KEY"] = openai_api_key
25
+ # Using a temperature of 0 for deterministic, factual answers.
26
  self.llm = OpenAI(temperature=0)
27
+ self.excel_data: dict[str, pd.DataFrame] = {}
 
 
 
28
  self.logs = []
29
 
30
+ def load_excel_file(self, file_path: str) -> tuple[str, list]:
31
+ """
32
+ Loads and processes an Excel file into multiple pandas DataFrames,
33
+ one for each sheet.
34
+ """
35
  self.logs.clear()
36
+ self.excel_data.clear()
37
  try:
38
  excel_file = pd.ExcelFile(file_path)
39
  sheet_names = excel_file.sheet_names
 
44
  df = pd.read_excel(file_path, sheet_name=sheet_name)
45
  df = self._clean_dataframe(df)
46
  self.excel_data[sheet_name] = df
47
+ self.logs.append(f" - Indexed sheet '{sheet_name}' ({df.shape[0]} rows × {df.shape[1]} columns)")
 
 
 
48
  except Exception as e:
49
  self.logs.append(f"⚠️ Error loading sheet '{sheet_name}': {str(e)}")
50
  continue
51
 
52
+ self.logs.append("✅ All sheets processed and indexed.")
53
+ return "\n".join(self.logs), sheet_names
 
54
  except Exception as e:
55
  raise Exception(f"Error loading Excel file: {str(e)}")
56
 
57
  def _clean_dataframe(self, df: pd.DataFrame) -> pd.DataFrame:
58
+ """Cleans a DataFrame by removing empty rows/columns."""
59
  df = df.dropna(how='all').dropna(axis=1, how='all').reset_index(drop=True)
60
+ # Attempt to convert object columns to numeric or datetime where possible
61
  for col in df.columns:
62
  if df[col].dtype == 'object':
63
  try:
64
+ df[col] = pd.to_numeric(df[col], errors='ignore')
65
  except:
66
  pass
67
  try:
68
+ df[col] = pd.to_datetime(df[col], errors='ignore')
69
  except:
70
  pass
71
  return df
72
 
73
+ def query_sheet(self, query: str, sheet_name: str) -> str:
 
 
 
 
 
 
 
 
 
 
74
  """
75
+ Processes a user query against a specific sheet using the Pandas Agent.
76
+ """
77
+ if sheet_name not in self.excel_data:
78
+ return f"Error: Sheet '{sheet_name}' not found. Please select a valid sheet."
79
+
80
+ df = self.excel_data[sheet_name]
81
+
82
  try:
83
+ # Create a new pandas agent for each query.
84
+ # verbose=True will print the agent's thought process to the console.
85
+ pandas_agent = create_pandas_dataframe_agent(
86
+ self.llm,
87
+ df,
88
+ verbose=True,
89
+ agent_executor_kwargs={"handle_parsing_errors": True} # Helps with robustness
90
+ )
91
+ # Invoke the agent with the user's query.
92
+ response = pandas_agent.invoke(query)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
93
 
94
+ # The final answer is in the 'output' key of the response dictionary.
95
+ return response.get('output', 'Sorry, I could not generate an answer.')
 
96
  except Exception as e:
97
+ return f"An error occurred while querying the agent: {str(e)}"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
98
 
99
  # --- Gradio Interface ---
100
 
 
105
  if file_obj is None:
106
  raise gr.Error("Please upload an Excel file.")
107
  try:
108
+ # Instantiate the agent system
109
+ agent_system = ExcelPandasAgent(api_key)
110
+ loading_logs, sheet_names = agent_system.load_excel_file(file_obj.name)
111
 
112
+ # Return updates to the UI components
113
  return (
114
  loading_logs,
115
+ agent_system,
116
+ # Populate and show the sheet selector dropdown
117
+ gr.update(choices=sheet_names, value=sheet_names[0] if sheet_names else None, visible=True),
118
+ # Show the query box and button
119
  gr.update(visible=True),
120
  gr.update(visible=True),
121
+ # Hide the results from any previous run
122
+ gr.update(visible=False, open=False)
123
  )
124
  except Exception as e:
125
  raise gr.Error(f"Failed to process file: {e}")
126
 
127
+ def generate_response(query, sheet_name, system_state):
128
  """Gradio function to handle user queries and display results."""
129
  if not query:
130
+ raise gr.Error("Please enter a question.")
131
+ if not sheet_name:
132
+ raise gr.Error("Please select a sheet to query from the dropdown.")
133
  if system_state is None:
134
  raise gr.Error("File not loaded. Please upload and load a file first.")
135
 
136
  try:
137
+ # Call the agent's query method
138
+ answer = system_state.query_sheet(query, sheet_name)
 
 
139
 
140
+ # Return the answer and make the results accordion visible and open
141
+ return answer, gr.update(visible=True, open=True)
 
 
 
142
  except Exception as e:
143
  raise gr.Error(f"Error during query: {e}")
144
 
145
  # --- UI Layout ---
146
 
147
+ with gr.Blocks(theme=gr.themes.Soft(primary_hue="blue"), title="Excel AI Agent") as demo:
148
  system_state = gr.State(None)
149
 
150
+ gr.Markdown("# 🤖 Excel AI Agent (Pandas Edition)")
151
+ gr.Markdown("This version uses a **Pandas Agent** to answer questions by executing code, allowing for mathematical calculations and data analysis.")
152
 
153
  with gr.Row():
154
  with gr.Column(scale=1):
 
161
  )
162
  file_input = gr.File(label="Upload Excel File", file_types=[".xlsx", ".xls"])
163
  load_button = gr.Button("Load File", variant="primary")
164
+ status_output = gr.Textbox(label="Indexing Status", interactive=False, lines=8)
165
 
166
  with gr.Column(scale=2):
167
  gr.Markdown("### 2. Ask a Question")
168
+ # This dropdown is now connected to the backend
169
+ sheet_selector = gr.Dropdown(
170
+ label="Select a sheet to query",
171
+ interactive=True,
172
+ visible=False
173
+ )
174
  query_input = gr.Textbox(
175
  label="Your Question",
176
+ placeholder="e.g., 'What is the sum of the sales column?' or 'Which product had the highest profit in March?'",
177
+ visible=False,
178
+ lines=3
179
  )
180
  ask_button = gr.Button("Get Answer", variant="primary", visible=False)
181
 
182
+ # Simplified results area
183
  results_accordion = gr.Accordion("Results", open=False, visible=False)
184
  with results_accordion:
185
+ answer_output = gr.Markdown(label="Answer")
 
186
 
187
  # --- Event Handlers ---
188
 
189
  load_button.click(
190
  fn=process_file,
191
  inputs=[api_key_input, file_input],
192
+ outputs=[status_output, system_state, sheet_selector, query_input, ask_button, results_accordion]
193
  )
194
 
195
  ask_button.click(
196
  fn=generate_response,
197
+ inputs=[query_input, sheet_selector, system_state],
198
+ outputs=[answer_output, results_accordion]
 
 
 
199
  )
200
 
201
 
202
  if __name__ == "__main__":
203
+ demo.launch(share=True)