Adityabhaskar committed on
Commit
3e327a8
·
verified ·
1 Parent(s): baf3f32

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +169 -85
app.py CHANGED
@@ -1,40 +1,38 @@
1
  import pandas as pd
2
  import numpy as np
 
 
 
 
 
 
3
  import os
 
4
  import warnings
5
  import gradio as gr
6
  from dotenv import load_dotenv
7
 
8
- # New imports for the Pandas Agent
9
- from langchain_openai import OpenAI
10
- from langchain_experimental.agents.agent_toolkits import create_pandas_dataframe_agent
11
-
12
  # Ignore warnings for a cleaner interface
13
  warnings.filterwarnings('ignore')
14
  # Load environment variables from .env file
15
  load_dotenv()
16
 
17
- class ExcelPandasAgent:
18
  """
19
- An agent-based system to query Excel files using natural language,
20
- powered by an OpenAI LLM and a Pandas DataFrame Agent.
21
- This version can perform mathematical calculations, comparisons, and data analysis.
22
  """
23
  def __init__(self, openai_api_key: str):
24
- """Initializes the system with the OpenAI API key."""
25
  os.environ["OPENAI_API_KEY"] = openai_api_key
26
- # Using a temperature of 0 for deterministic, factual answers.
27
  self.llm = OpenAI(temperature=0)
28
- self.excel_data: dict[str, pd.DataFrame] = {}
 
 
 
29
  self.logs = []
30
 
31
- def load_excel_file(self, file_path: str) -> tuple[str, list]:
32
- """
33
- Loads and processes an Excel file into multiple pandas DataFrames,
34
- one for each sheet.
35
- """
36
  self.logs.clear()
37
- self.excel_data.clear()
38
  try:
39
  excel_file = pd.ExcelFile(file_path)
40
  sheet_names = excel_file.sheet_names
@@ -43,65 +41,143 @@ class ExcelPandasAgent:
43
  for sheet_name in sheet_names:
44
  try:
45
  df = pd.read_excel(file_path, sheet_name=sheet_name)
46
- # The cleaning function is called here for each sheet
47
  df = self._clean_dataframe(df)
48
  self.excel_data[sheet_name] = df
49
- self.logs.append(f" - Indexed and cleaned sheet '{sheet_name}' ({df.shape[0]} rows Γ— {df.shape[1]} columns)")
 
 
 
50
  except Exception as e:
51
  self.logs.append(f"⚠️ Error loading sheet '{sheet_name}': {str(e)}")
52
  continue
53
 
54
- self.logs.append("βœ… All sheets processed and indexed.")
55
- return "\n".join(self.logs), sheet_names
 
56
  except Exception as e:
57
  raise Exception(f"Error loading Excel file: {str(e)}")
58
 
59
  def _clean_dataframe(self, df: pd.DataFrame) -> pd.DataFrame:
60
- """
61
- Cleans a DataFrame by removing empty rows/columns and robustly converting types.
62
- """
63
  df = df.dropna(how='all').dropna(axis=1, how='all').reset_index(drop=True)
64
  for col in df.columns:
65
- # Apply to object columns that might contain mixed numeric/text data
66
  if df[col].dtype == 'object':
67
- # This is the key change. It attempts to convert the column
68
- # to numbers. Any value that fails (like 'Apr') becomes NaN (Not a Number).
69
- df[col] = pd.to_numeric(df[col], errors='coerce')
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
70
 
71
- # Now, find all numeric columns (including those just converted)
72
- # and fill any resulting NaN values with 0. This prepares them for calculations.
73
- for col in df.select_dtypes(include=np.number).columns:
74
- df[col] = df[col].fillna(0)
 
 
 
 
 
75
 
76
- return df
 
 
 
 
 
 
 
 
 
77
 
78
def query_sheet(self, query: str, sheet_name: str) -> str:
    """Answer a natural-language question about one loaded sheet.

    A fresh pandas DataFrame agent is built for every call and invoked
    with the query. Failures are reported as a message string rather
    than raised.

    Args:
        query: The user's question.
        sheet_name: Key of the sheet in ``self.excel_data``.

    Returns:
        The agent's ``'output'`` text, a fallback sentence when that key is
        absent, or an error message.
    """
    try:
        frame = self.excel_data[sheet_name]
    except KeyError:
        return f"Error: Sheet '{sheet_name}' not found. Please select a valid sheet."

    agent_options = {
        "verbose": True,
        "max_iterations": 50,
        "max_execution_time": 300,
        "agent_executor_kwargs": {"handle_parsing_errors": True},
        # NOTE(review): this lets the agent execute LLM-generated Python
        # against the process; confirm that is acceptable for this deployment.
        "allow_dangerous_code": True,
    }
    try:
        agent = create_pandas_dataframe_agent(self.llm, frame, **agent_options)
        outcome = agent.invoke(query)
        # The final answer lives under the 'output' key of the response dict.
        return outcome.get('output', 'Sorry, I could not generate an answer.')
    except Exception as err:
        return f"An error occurred while querying the agent: {str(err)}"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
105
 
106
  # --- Gradio Interface ---
107
 
@@ -112,47 +188,53 @@ def process_file(api_key, file_obj):
112
  if file_obj is None:
113
  raise gr.Error("Please upload an Excel file.")
114
  try:
115
- # Instantiate the agent system
116
- agent_system = ExcelPandasAgent(api_key)
117
- loading_logs, sheet_names = agent_system.load_excel_file(file_obj.name)
 
 
 
118
 
119
- # Return updates to the UI components
120
  return (
121
  loading_logs,
122
- agent_system,
123
- gr.update(choices=sheet_names, value=sheet_names[0] if sheet_names else None, visible=True),
124
- gr.update(visible=True),
125
- gr.update(visible=True),
126
- gr.update(visible=False, open=False)
127
  )
128
  except Exception as e:
129
  raise gr.Error(f"Failed to process file: {e}")
130
 
131
def generate_response(query, sheet_name, system_state):
    """Gradio callback: validate the inputs, query the agent, show the answer.

    Returns:
        A tuple of the answer text and an update that makes the results
        accordion visible and open.

    Raises:
        gr.Error: when an input is missing or the query fails.
    """
    # Checks run in the same order the user fills in the form.
    preconditions = (
        (not query, "Please enter a question."),
        (not sheet_name, "Please select a sheet to query from the dropdown."),
        (system_state is None, "File not loaded. Please upload and load a file first."),
    )
    for failed, message in preconditions:
        if failed:
            raise gr.Error(message)

    try:
        reply = system_state.query_sheet(query, sheet_name)
        reveal_results = gr.update(visible=True, open=True)
        return reply, reveal_results
    except Exception as e:
        raise gr.Error(f"Error during query: {e}")
148
 
149
  # --- UI Layout ---
150
 
151
- with gr.Blocks(theme=gr.themes.Soft(primary_hue="blue"), title="Excel AI Agent") as demo:
152
  system_state = gr.State(None)
153
 
154
- gr.Markdown("# πŸ€– Excel AI Agent (Pandas Edition)")
155
- gr.Markdown("This version uses a **Pandas Agent** to answer questions by executing code, allowing for mathematical calculations and data analysis.")
156
 
157
  with gr.Row():
158
  with gr.Column(scale=1):
@@ -165,26 +247,26 @@ with gr.Blocks(theme=gr.themes.Soft(primary_hue="blue"), title="Excel AI Agent")
165
  )
166
  file_input = gr.File(label="Upload Excel File", file_types=[".xlsx", ".xls"])
167
  load_button = gr.Button("Load File", variant="primary")
168
- status_output = gr.Textbox(label="Indexing Status", interactive=False, lines=8)
169
 
170
  with gr.Column(scale=2):
171
  gr.Markdown("### 2. Ask a Question")
172
  sheet_selector = gr.Dropdown(
173
- label="Select a sheet to query",
174
- interactive=True,
175
  visible=False
176
  )
177
  query_input = gr.Textbox(
178
  label="Your Question",
179
- placeholder="e.g., 'What is the sum of the sales column?' or 'Which product had the highest profit in March?'",
180
- visible=False,
181
- lines=3
182
  )
183
  ask_button = gr.Button("Get Answer", variant="primary", visible=False)
184
 
185
  results_accordion = gr.Accordion("Results", open=False, visible=False)
186
  with results_accordion:
187
- answer_output = gr.Markdown(label="Answer")
 
188
 
189
  # --- Event Handlers ---
190
 
@@ -196,10 +278,12 @@ with gr.Blocks(theme=gr.themes.Soft(primary_hue="blue"), title="Excel AI Agent")
196
 
197
  ask_button.click(
198
  fn=generate_response,
199
- inputs=[query_input, sheet_selector, system_state],
200
- outputs=[answer_output, results_accordion]
 
 
 
201
  )
202
 
203
-
204
  if __name__ == "__main__":
205
  demo.launch(share=True)
 
1
  import pandas as pd
2
  import numpy as np
3
+ from langchain_openai import OpenAI
4
+ from langchain_core.documents import Document
5
+ from langchain_community.vectorstores import FAISS
6
+ from langchain_openai import OpenAIEmbeddings
7
+ from langchain_text_splitters import RecursiveCharacterTextSplitter
8
+ import re
9
  import os
10
+ from typing import Dict, List, Any
11
  import warnings
12
  import gradio as gr
13
  from dotenv import load_dotenv
14
 
 
 
 
 
15
  # Ignore warnings for a cleaner interface
16
  warnings.filterwarnings('ignore')
17
  # Load environment variables from .env file
18
  load_dotenv()
19
 
20
+ class ExcelAIQuerySystem:
21
  """
22
+ A system to query Excel files using natural language, powered by OpenAI and LangChain.
 
 
23
  """
24
  def __init__(self, openai_api_key: str):
 
25
  os.environ["OPENAI_API_KEY"] = openai_api_key
 
26
  self.llm = OpenAI(temperature=0)
27
+ self.embeddings = OpenAIEmbeddings()
28
+ self.excel_data = {}
29
+ self.sheet_descriptions = {}
30
+ self.vectorstore = None
31
  self.logs = []
32
 
33
+ def load_excel_file(self, file_path: str) -> str:
34
+ """Loads and processes an Excel file, generating descriptions and a vector store."""
 
 
 
35
  self.logs.clear()
 
36
  try:
37
  excel_file = pd.ExcelFile(file_path)
38
  sheet_names = excel_file.sheet_names
 
41
  for sheet_name in sheet_names:
42
  try:
43
  df = pd.read_excel(file_path, sheet_name=sheet_name)
 
44
  df = self._clean_dataframe(df)
45
  self.excel_data[sheet_name] = df
46
+
47
+ description = self._generate_sheet_description(sheet_name, df)
48
+ self.sheet_descriptions[sheet_name] = description
49
+ self.logs.append(f" - Loaded and described sheet '{sheet_name}' ({df.shape[0]} rows Γ— {df.shape[1]} columns)")
50
  except Exception as e:
51
  self.logs.append(f"⚠️ Error loading sheet '{sheet_name}': {str(e)}")
52
  continue
53
 
54
+ self._create_vectorstore()
55
+ self.logs.append("βœ… Vector store created successfully.")
56
+ return "\n".join(self.logs)
57
  except Exception as e:
58
  raise Exception(f"Error loading Excel file: {str(e)}")
59
 
60
  def _clean_dataframe(self, df: pd.DataFrame) -> pd.DataFrame:
61
+ """Cleans a DataFrame by removing empty rows/columns and converting data types."""
 
 
62
  df = df.dropna(how='all').dropna(axis=1, how='all').reset_index(drop=True)
63
  for col in df.columns:
 
64
  if df[col].dtype == 'object':
65
+ try:
66
+ df[col] = pd.to_datetime(df[col], errors='ignore')
67
+ except:
68
+ pass
69
+ try:
70
+ df[col] = pd.to_numeric(df[col], errors='ignore')
71
+ except:
72
+ pass
73
+ return df
74
+
75
+ def _generate_sheet_description(self, sheet_name: str, df: pd.DataFrame) -> str:
76
+ """Generates a text description of a DataFrame using an LLM."""
77
+ sample_data = df.head(3).to_string()
78
+ prompt = f"""
79
+ Analyze this Excel sheet and provide a concise one-paragraph summary.
80
+ Sheet Name: {sheet_name}
81
+ Columns: {list(df.columns)}
82
+ Sample Data:
83
+ {sample_data}
84
+
85
+ Focus on the main purpose of the data, key metrics, and the time period covered.
86
+ """
87
+ try:
88
+ return self.llm.invoke(prompt)
89
+ except Exception:
90
+ return f"Sheet: {sheet_name}, Columns: {', '.join(list(df.columns))}"
91
 
92
+ def _create_vectorstore(self):
93
+ """Creates a FAISS vector store from sheet descriptions for similarity search."""
94
+ documents = [
95
+ Document(page_content=desc, metadata={"sheet_name": name})
96
+ for name, desc in self.sheet_descriptions.items()
97
+ ]
98
+ text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
99
+ splits = text_splitter.split_documents(documents)
100
+ self.vectorstore = FAISS.from_documents(splits, self.embeddings)
101
 
102
def identify_relevant_sheets(self, query: str) -> List[str]:
    """Pick the sheets whose descriptions best match the query.

    Runs a top-3 similarity search over the vector store and returns the
    matching sheet names, de-duplicated in ranking order. Falls back to
    every loaded sheet when there is no vector store, when the search
    raises, or when the search yields no usable sheet name.

    Args:
        query: The user's question.

    Returns:
        Sheet names to analyse.
    """
    all_sheets = list(self.excel_data.keys())
    if not self.vectorstore:
        return all_sheets
    try:
        docs = self.vectorstore.similarity_search(query, k=3)
        sheet_names = [doc.metadata['sheet_name'] for doc in docs if 'sheet_name' in doc.metadata]
    except Exception:
        return all_sheets
    # Several chunks can come from one sheet; dict.fromkeys de-duplicates
    # while keeping ranking order. k=3 already bounds the result size.
    ranked = list(dict.fromkeys(sheet_names))
    # An empty hit list would make the caller answer from nothing at all.
    return ranked or all_sheets
112
 
113
def query_data(self, query: str, selected_sheet: str = None) -> Dict[str, Any]:
    """Answer a query against one chosen sheet or the most relevant sheets.

    When ``selected_sheet`` names a sheet (anything other than the
    auto-select placeholder) only that sheet is analysed; otherwise the
    sheets are chosen by ``identify_relevant_sheets``. Each sheet is
    analysed by the LLM from its column list and first five rows, then the
    per-sheet answers are merged into a summary and a list of insights.

    Returns:
        A dict with keys ``query``, ``relevant_sheets``, ``sheet_results``,
        ``summary`` and ``insights``. On any exception the dict is still
        returned, with the error text placed in ``summary``.
    """
    results = {'query': query, 'relevant_sheets': [], 'sheet_results': {}, 'summary': '', 'insights': []}
    try:
        # An explicit choice wins unless it is the dropdown's auto-select entry.
        explicit_choice = bool(selected_sheet) and selected_sheet != "Auto-Select based on Query"
        targets = [selected_sheet] if explicit_choice else self.identify_relevant_sheets(query)
        results['relevant_sheets'] = targets

        per_sheet = results['sheet_results']
        for name in targets:
            if name in self.excel_data:
                frame = self.excel_data[name]
                analysis_prompt = f"""
            Analyze the data from sheet '{name}' to answer the query: "{query}"
            Columns: {list(frame.columns)}
            Sample Data:
            {frame.head(5).to_string()}

            Provide a direct answer, including key numbers, trends, or patterns.
            """
                per_sheet[name] = {'response': self.llm.invoke(analysis_prompt)}

        results['summary'] = self._generate_summary(query, per_sheet)
        results['insights'] = self._extract_insights(per_sheet)
    except Exception as err:
        results['summary'] = f"Error processing query: {str(err)}"
    return results
150
+
151
+ def _generate_summary(self, query: str, sheet_results: Dict) -> str:
152
+ """Generates a final, consolidated summary from individual sheet analyses."""
153
+ if not sheet_results:
154
+ return "No relevant data found to answer the query."
155
+
156
+ combined_responses = "\n\n".join(
157
+ f"--- Analysis from Sheet '{name}' ---\n{res['response']}"
158
+ for name, res in sheet_results.items()
159
+ )
160
+ prompt = f"""
161
+ Based on the following analyses, provide a final, consolidated answer to the query.
162
+ Original Query: {query}
163
+
164
+ {combined_responses}
165
+
166
+ Synthesize these findings into a clear and direct summary.
167
+ """
168
+ return self.llm.invoke(prompt)
169
+
170
+ def _extract_insights(self, sheet_results: Dict) -> List[str]:
171
+ """Extracts simple, actionable insights from the analysis results."""
172
+ insights = set()
173
+ for sheet_name, result in sheet_results.items():
174
+ response = result.get('response', '').lower()
175
+ if re.search(r'\b\d+\.?\d*\b', response):
176
+ insights.add(f"Numerical data found in '{sheet_name}'")
177
+ trend_keywords = ['increase', 'decrease', 'growth', 'decline', 'trend', 'pattern']
178
+ if any(keyword in response for keyword in trend_keywords):
179
+ insights.add(f"Trend analysis available in '{sheet_name}'")
180
+ return list(insights)
181
 
182
  # --- Gradio Interface ---
183
 
 
188
  if file_obj is None:
189
  raise gr.Error("Please upload an Excel file.")
190
  try:
191
+ excel_system = ExcelAIQuerySystem(api_key)
192
+ loading_logs = excel_system.load_excel_file(file_obj.name)
193
+
194
+ # Get sheet names for the dropdown
195
+ sheet_names = list(excel_system.excel_data.keys())
196
+ dropdown_choices = ["Auto-Select based on Query"] + sheet_names
197
 
 
198
  return (
199
  loading_logs,
200
+ excel_system,
201
+ gr.update(choices=dropdown_choices, value=dropdown_choices[0], visible=True), # Update dropdown
202
+ gr.update(visible=True), # Query input
203
+ gr.update(visible=True), # Ask button
204
+ gr.update(visible=True) # Results accordion
205
  )
206
  except Exception as e:
207
  raise gr.Error(f"Failed to process file: {e}")
208
 
209
def generate_response(query, sheet_selection, system_state):
    """Gradio callback: run the query and format the summary and details.

    Returns:
        A tuple of the summary text and a Markdown details string listing
        the sheets used and, when present, the extracted insights.

    Raises:
        gr.Error: when the query is empty, no file is loaded, or the query
            fails.
    """
    # Checks run in the same order the user fills in the form.
    preconditions = (
        (not query, "Please enter a query."),
        (system_state is None, "File not loaded. Please upload and load a file first."),
    )
    for failed, message in preconditions:
        if failed:
            raise gr.Error(message)

    try:
        # The dropdown value is forwarded so an explicit sheet choice is honoured.
        outcome = system_state.query_data(query, selected_sheet=sheet_selection)
        summary = outcome.get('summary', 'No summary available.')
        sheet_list = ", ".join(outcome.get('relevant_sheets', []))
        insight_list = ", ".join(outcome.get('insights', []))

        parts = [f"**πŸ” Relevant Sheets Identified:**\n{sheet_list}\n\n"]
        if insight_list:
            parts.append(f"**πŸ’‘ Key Insights:**\n{insight_list}")

        return summary, "".join(parts)
    except Exception as e:
        raise gr.Error(f"Error during query: {e}")
230
 
231
  # --- UI Layout ---
232
 
233
+ with gr.Blocks(theme=gr.themes.Soft(), title="Excel AI Query System") as demo:
234
  system_state = gr.State(None)
235
 
236
+ gr.Markdown("# πŸ“Š Excel AI Query System")
237
+ gr.Markdown("Upload an Excel file, and ask questions about your data in plain English.")
238
 
239
  with gr.Row():
240
  with gr.Column(scale=1):
 
247
  )
248
  file_input = gr.File(label="Upload Excel File", file_types=[".xlsx", ".xls"])
249
  load_button = gr.Button("Load File", variant="primary")
250
+ status_output = gr.Textbox(label="Loading Status", interactive=False, lines=5)
251
 
252
  with gr.Column(scale=2):
253
  gr.Markdown("### 2. Ask a Question")
254
  sheet_selector = gr.Dropdown(
255
+ label="πŸ“„ Select a Sheet to Query",
256
+ info="Choose a specific sheet, or let the AI decide automatically.",
257
  visible=False
258
  )
259
  query_input = gr.Textbox(
260
  label="Your Question",
261
+ placeholder="e.g., 'What were the total sales in Q3?' or 'Show me the performance trend for Product X.'",
262
+ visible=False
 
263
  )
264
  ask_button = gr.Button("Get Answer", variant="primary", visible=False)
265
 
266
  results_accordion = gr.Accordion("Results", open=False, visible=False)
267
  with results_accordion:
268
+ summary_output = gr.Markdown(label="Summary")
269
+ details_output = gr.Markdown(label="Details")
270
 
271
  # --- Event Handlers ---
272
 
 
278
 
279
  ask_button.click(
280
  fn=generate_response,
281
+ inputs=[query_input, sheet_selector, system_state], # Add sheet_selector as an input
282
+ outputs=[summary_output, details_output]
283
+ ).then(
284
+ lambda: gr.update(open=True),
285
+ outputs=results_accordion
286
  )
287
 
 
288
  if __name__ == "__main__":
289
  demo.launch(share=True)