Spaces:
Paused
Paused
Update app.py
Browse files
app.py
CHANGED
@@ -1,38 +1,39 @@
|
|
1 |
import pandas as pd
|
2 |
-
import numpy as np
|
3 |
-
from langchain_openai import OpenAI
|
4 |
-
from langchain_core.documents import Document
|
5 |
-
from langchain_community.vectorstores import FAISS
|
6 |
-
from langchain_openai import OpenAIEmbeddings
|
7 |
-
from langchain_text_splitters import RecursiveCharacterTextSplitter
|
8 |
-
import re
|
9 |
import os
|
10 |
-
from typing import Dict, List, Any
|
11 |
import warnings
|
12 |
import gradio as gr
|
13 |
from dotenv import load_dotenv
|
14 |
|
|
|
|
|
|
|
|
|
15 |
# Ignore warnings for a cleaner interface
|
16 |
warnings.filterwarnings('ignore')
|
17 |
# Load environment variables from .env file
|
18 |
load_dotenv()
|
19 |
|
20 |
-
class
|
21 |
"""
|
22 |
-
|
|
|
|
|
23 |
"""
|
24 |
def __init__(self, openai_api_key: str):
|
|
|
25 |
os.environ["OPENAI_API_KEY"] = openai_api_key
|
|
|
26 |
self.llm = OpenAI(temperature=0)
|
27 |
-
self.
|
28 |
-
self.excel_data = {}
|
29 |
-
self.sheet_descriptions = {}
|
30 |
-
self.vectorstore = None
|
31 |
self.logs = []
|
32 |
|
33 |
-
def load_excel_file(self, file_path: str) -> str:
|
34 |
-
"""
|
|
|
|
|
|
|
35 |
self.logs.clear()
|
|
|
36 |
try:
|
37 |
excel_file = pd.ExcelFile(file_path)
|
38 |
sheet_names = excel_file.sheet_names
|
@@ -43,132 +44,57 @@ class ExcelAIQuerySystem:
|
|
43 |
df = pd.read_excel(file_path, sheet_name=sheet_name)
|
44 |
df = self._clean_dataframe(df)
|
45 |
self.excel_data[sheet_name] = df
|
46 |
-
|
47 |
-
description = self._generate_sheet_description(sheet_name, df)
|
48 |
-
self.sheet_descriptions[sheet_name] = description
|
49 |
-
self.logs.append(f" - Loaded and described sheet '{sheet_name}' ({df.shape[0]} rows × {df.shape[1]} columns)")
|
50 |
except Exception as e:
|
51 |
self.logs.append(f"⚠️ Error loading sheet '{sheet_name}': {str(e)}")
|
52 |
continue
|
53 |
|
54 |
-
self.
|
55 |
-
self.logs
|
56 |
-
return "\n".join(self.logs)
|
57 |
except Exception as e:
|
58 |
raise Exception(f"Error loading Excel file: {str(e)}")
|
59 |
|
60 |
def _clean_dataframe(self, df: pd.DataFrame) -> pd.DataFrame:
|
61 |
-
"""Cleans a DataFrame by removing empty rows/columns
|
62 |
df = df.dropna(how='all').dropna(axis=1, how='all').reset_index(drop=True)
|
|
|
63 |
for col in df.columns:
|
64 |
if df[col].dtype == 'object':
|
65 |
try:
|
66 |
-
df[col] = pd.
|
67 |
except:
|
68 |
pass
|
69 |
try:
|
70 |
-
df[col] = pd.
|
71 |
except:
|
72 |
pass
|
73 |
return df
|
74 |
|
75 |
-
def
|
76 |
-
"""Generates a text description of a DataFrame using an LLM."""
|
77 |
-
sample_data = df.head(3).to_string()
|
78 |
-
prompt = f"""
|
79 |
-
Analyze this Excel sheet and provide a concise one-paragraph summary.
|
80 |
-
Sheet Name: {sheet_name}
|
81 |
-
Columns: {list(df.columns)}
|
82 |
-
Sample Data:
|
83 |
-
{sample_data}
|
84 |
-
|
85 |
-
Focus on the main purpose of the data, key metrics, and the time period covered.
|
86 |
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
87 |
try:
|
88 |
-
|
89 |
-
|
90 |
-
|
91 |
-
|
92 |
-
|
93 |
-
|
94 |
-
|
95 |
-
|
96 |
-
|
97 |
-
|
98 |
-
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
|
99 |
-
splits = text_splitter.split_documents(documents)
|
100 |
-
self.vectorstore = FAISS.from_documents(splits, self.embeddings)
|
101 |
-
|
102 |
-
def identify_relevant_sheets(self, query: str) -> List[str]:
|
103 |
-
"""Identifies the most relevant sheets for a given query using the vector store."""
|
104 |
-
if not self.vectorstore:
|
105 |
-
return list(self.excel_data.keys())
|
106 |
-
try:
|
107 |
-
docs = self.vectorstore.similarity_search(query, k=3)
|
108 |
-
sheet_names = [doc.metadata['sheet_name'] for doc in docs if 'sheet_name' in doc.metadata]
|
109 |
-
return list(dict.fromkeys(sheet_names))[:5]
|
110 |
-
except Exception:
|
111 |
-
return list(self.excel_data.keys())
|
112 |
-
|
113 |
-
def query_data(self, query: str) -> Dict[str, Any]:
|
114 |
-
"""Processes a user query against the loaded Excel data."""
|
115 |
-
results = {'query': query, 'relevant_sheets': [], 'sheet_results': {}, 'summary': '', 'insights': []}
|
116 |
-
try:
|
117 |
-
relevant_sheets = self.identify_relevant_sheets(query)
|
118 |
-
results['relevant_sheets'] = relevant_sheets
|
119 |
-
|
120 |
-
for sheet_name in relevant_sheets:
|
121 |
-
if sheet_name not in self.excel_data:
|
122 |
-
continue
|
123 |
-
df = self.excel_data[sheet_name]
|
124 |
-
analysis_prompt = f"""
|
125 |
-
Analyze the data from sheet '{sheet_name}' to answer the query: "{query}"
|
126 |
-
Columns: {list(df.columns)}
|
127 |
-
Sample Data:
|
128 |
-
{df.head(5).to_string()}
|
129 |
-
|
130 |
-
Provide a direct answer, including key numbers, trends, or patterns.
|
131 |
-
"""
|
132 |
-
response = self.llm.invoke(analysis_prompt)
|
133 |
-
results['sheet_results'][sheet_name] = {'response': response}
|
134 |
|
135 |
-
|
136 |
-
|
137 |
-
return results
|
138 |
except Exception as e:
|
139 |
-
|
140 |
-
return results
|
141 |
-
|
142 |
-
def _generate_summary(self, query: str, sheet_results: Dict) -> str:
|
143 |
-
"""Generates a final, consolidated summary from individual sheet analyses."""
|
144 |
-
if not sheet_results:
|
145 |
-
return "No relevant data found to answer the query."
|
146 |
-
|
147 |
-
combined_responses = "\n\n".join(
|
148 |
-
f"--- Analysis from Sheet '{name}' ---\n{res['response']}"
|
149 |
-
for name, res in sheet_results.items()
|
150 |
-
)
|
151 |
-
prompt = f"""
|
152 |
-
Based on the following analyses, provide a final, consolidated answer to the query.
|
153 |
-
Original Query: {query}
|
154 |
-
|
155 |
-
{combined_responses}
|
156 |
-
|
157 |
-
Synthesize these findings into a clear and direct summary.
|
158 |
-
"""
|
159 |
-
return self.llm.invoke(prompt)
|
160 |
-
|
161 |
-
def _extract_insights(self, sheet_results: Dict) -> List[str]:
|
162 |
-
"""Extracts simple, actionable insights from the analysis results."""
|
163 |
-
insights = set()
|
164 |
-
for sheet_name, result in sheet_results.items():
|
165 |
-
response = result.get('response', '').lower()
|
166 |
-
if re.search(r'\b\d+\.?\d*\b', response):
|
167 |
-
insights.add(f"Numerical data found in '{sheet_name}'")
|
168 |
-
trend_keywords = ['increase', 'decrease', 'growth', 'decline', 'trend', 'pattern']
|
169 |
-
if any(keyword in response for keyword in trend_keywords):
|
170 |
-
insights.add(f"Trend analysis available in '{sheet_name}'")
|
171 |
-
return list(insights)
|
172 |
|
173 |
# --- Gradio Interface ---
|
174 |
|
@@ -179,47 +105,50 @@ def process_file(api_key, file_obj):
|
|
179 |
if file_obj is None:
|
180 |
raise gr.Error("Please upload an Excel file.")
|
181 |
try:
|
182 |
-
|
183 |
-
|
|
|
184 |
|
|
|
185 |
return (
|
186 |
loading_logs,
|
187 |
-
|
|
|
|
|
|
|
188 |
gr.update(visible=True),
|
189 |
gr.update(visible=True),
|
190 |
-
|
|
|
191 |
)
|
192 |
except Exception as e:
|
193 |
raise gr.Error(f"Failed to process file: {e}")
|
194 |
|
195 |
-
def generate_response(query, system_state):
|
196 |
"""Gradio function to handle user queries and display results."""
|
197 |
if not query:
|
198 |
-
raise gr.Error("Please enter a
|
|
|
|
|
199 |
if system_state is None:
|
200 |
raise gr.Error("File not loaded. Please upload and load a file first.")
|
201 |
|
202 |
try:
|
203 |
-
|
204 |
-
|
205 |
-
sheets = ", ".join(result.get('relevant_sheets', []))
|
206 |
-
insights = ", ".join(result.get('insights', []))
|
207 |
|
208 |
-
|
209 |
-
|
210 |
-
details += f"**💡 Key Insights:**\n{insights}"
|
211 |
-
|
212 |
-
return summary, details
|
213 |
except Exception as e:
|
214 |
raise gr.Error(f"Error during query: {e}")
|
215 |
|
216 |
# --- UI Layout ---
|
217 |
|
218 |
-
with gr.Blocks(theme=gr.themes.Soft(), title="Excel AI
|
219 |
system_state = gr.State(None)
|
220 |
|
221 |
-
gr.Markdown("#
|
222 |
-
gr.Markdown("
|
223 |
|
224 |
with gr.Row():
|
225 |
with gr.Column(scale=1):
|
@@ -232,46 +161,43 @@ with gr.Blocks(theme=gr.themes.Soft(), title="Excel AI Query System") as demo:
|
|
232 |
)
|
233 |
file_input = gr.File(label="Upload Excel File", file_types=[".xlsx", ".xls"])
|
234 |
load_button = gr.Button("Load File", variant="primary")
|
235 |
-
status_output = gr.Textbox(label="
|
236 |
|
237 |
with gr.Column(scale=2):
|
238 |
gr.Markdown("### 2. Ask a Question")
|
|
|
|
|
|
|
|
|
|
|
|
|
239 |
query_input = gr.Textbox(
|
240 |
label="Your Question",
|
241 |
-
placeholder="e.g., 'What
|
242 |
-
visible=False
|
|
|
243 |
)
|
244 |
ask_button = gr.Button("Get Answer", variant="primary", visible=False)
|
245 |
|
|
|
246 |
results_accordion = gr.Accordion("Results", open=False, visible=False)
|
247 |
with results_accordion:
|
248 |
-
|
249 |
-
details_output = gr.Markdown(label="Details")
|
250 |
|
251 |
# --- Event Handlers ---
|
252 |
|
253 |
load_button.click(
|
254 |
fn=process_file,
|
255 |
inputs=[api_key_input, file_input],
|
256 |
-
outputs=[status_output, system_state, query_input, ask_button, results_accordion]
|
257 |
)
|
258 |
|
259 |
ask_button.click(
|
260 |
fn=generate_response,
|
261 |
-
inputs=[query_input, system_state],
|
262 |
-
outputs=[
|
263 |
-
).then(
|
264 |
-
lambda: gr.update(open=True),
|
265 |
-
outputs=results_accordion
|
266 |
)
|
267 |
|
268 |
|
269 |
if __name__ == "__main__":
|
270 |
-
demo.launch(share=True)
|
271 |
-
|
272 |
-
# # --- To this ---
|
273 |
-
# if __name__ == "__main__":
|
274 |
-
# # Render provides the PORT environment variable
|
275 |
-
# port = int(os.environ.get('PORT', 10000))
|
276 |
-
# # Launch on 0.0.0.0 to make it accessible outside the container
|
277 |
-
# demo.launch(server_name="0.0.0.0", server_port=port)
|
|
|
1 |
import pandas as pd
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2 |
import os
|
|
|
3 |
import warnings
|
4 |
import gradio as gr
|
5 |
from dotenv import load_dotenv
|
6 |
|
7 |
+
# New imports for the Pandas Agent
|
8 |
+
from langchain_openai import OpenAI
|
9 |
+
from langchain_experimental.agents.agent_toolkits import create_pandas_dataframe_agent
|
10 |
+
|
11 |
# Ignore warnings for a cleaner interface
|
12 |
warnings.filterwarnings('ignore')
|
13 |
# Load environment variables from .env file
|
14 |
load_dotenv()
|
15 |
|
16 |
+
class ExcelPandasAgent:
|
17 |
"""
|
18 |
+
An agent-based system to query Excel files using natural language,
|
19 |
+
powered by an OpenAI LLM and a Pandas DataFrame Agent.
|
20 |
+
This version can perform mathematical calculations, comparisons, and data analysis.
|
21 |
"""
|
22 |
def __init__(self, openai_api_key: str):
    """Set up the agent system for the supplied OpenAI API key.

    Args:
        openai_api_key: Key exported through the ``OPENAI_API_KEY``
            environment variable before the LLM client is built.
    """
    # Export the key first; the client below is constructed without an
    # explicit key argument.
    os.environ.update({"OPENAI_API_KEY": openai_api_key})
    # Using a temperature of 0 for deterministic, factual answers.
    self.llm = OpenAI(temperature=0)
    # Cleaned DataFrames keyed by sheet name, filled by load_excel_file().
    self.excel_data: dict[str, pd.DataFrame] = dict()
    # Human-readable progress lines from the most recent load.
    self.logs = list()
|
29 |
|
30 |
+
def load_excel_file(self, file_path: str) -> tuple[str, list]:
|
31 |
+
"""
|
32 |
+
Loads and processes an Excel file into multiple pandas DataFrames,
|
33 |
+
one for each sheet.
|
34 |
+
"""
|
35 |
self.logs.clear()
|
36 |
+
self.excel_data.clear()
|
37 |
try:
|
38 |
excel_file = pd.ExcelFile(file_path)
|
39 |
sheet_names = excel_file.sheet_names
|
|
|
44 |
df = pd.read_excel(file_path, sheet_name=sheet_name)
|
45 |
df = self._clean_dataframe(df)
|
46 |
self.excel_data[sheet_name] = df
|
47 |
+
self.logs.append(f" - Indexed sheet '{sheet_name}' ({df.shape[0]} rows × {df.shape[1]} columns)")
|
|
|
|
|
|
|
48 |
except Exception as e:
|
49 |
self.logs.append(f"⚠️ Error loading sheet '{sheet_name}': {str(e)}")
|
50 |
continue
|
51 |
|
52 |
+
self.logs.append("✅ All sheets processed and indexed.")
|
53 |
+
return "\n".join(self.logs), sheet_names
|
|
|
54 |
except Exception as e:
|
55 |
raise Exception(f"Error loading Excel file: {str(e)}")
|
56 |
|
57 |
def _clean_dataframe(self, df: pd.DataFrame) -> pd.DataFrame:
|
58 |
+
"""Cleans a DataFrame by removing empty rows/columns."""
|
59 |
df = df.dropna(how='all').dropna(axis=1, how='all').reset_index(drop=True)
|
60 |
+
# Attempt to convert object columns to numeric or datetime where possible
|
61 |
for col in df.columns:
|
62 |
if df[col].dtype == 'object':
|
63 |
try:
|
64 |
+
df[col] = pd.to_numeric(df[col], errors='ignore')
|
65 |
except:
|
66 |
pass
|
67 |
try:
|
68 |
+
df[col] = pd.to_datetime(df[col], errors='ignore')
|
69 |
except:
|
70 |
pass
|
71 |
return df
|
72 |
|
73 |
+
def query_sheet(self, query: str, sheet_name: str) -> str:
    """Answer a natural-language question about one loaded sheet.

    Args:
        query: The user's question, passed to the agent unchanged.
        sheet_name: Key into ``self.excel_data`` selecting the DataFrame.

    Returns:
        The agent's ``'output'`` text, a fallback apology when that key is
        absent, or an error message string. Failures are reported in the
        returned string rather than raised.
    """
    try:
        frame = self.excel_data[sheet_name]
    except KeyError:
        return f"Error: Sheet '{sheet_name}' not found. Please select a valid sheet."

    try:
        # A fresh agent is built on every call, bound to the chosen sheet.
        # verbose=True: per the original author's note, this prints the
        # agent's reasoning to the console.
        executor = create_pandas_dataframe_agent(
            self.llm,
            frame,
            verbose=True,
            agent_executor_kwargs={"handle_parsing_errors": True},  # Helps with robustness
        )
        outcome = executor.invoke(query)
        # The final answer is in the 'output' key of the response dictionary.
        return outcome.get('output', 'Sorry, I could not generate an answer.')
    except Exception as exc:
        return f"An error occurred while querying the agent: {exc!s}"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
98 |
|
99 |
# --- Gradio Interface ---
|
100 |
|
|
|
105 |
if file_obj is None:
|
106 |
raise gr.Error("Please upload an Excel file.")
|
107 |
try:
|
108 |
+
# Instantiate the agent system
|
109 |
+
agent_system = ExcelPandasAgent(api_key)
|
110 |
+
loading_logs, sheet_names = agent_system.load_excel_file(file_obj.name)
|
111 |
|
112 |
+
# Return updates to the UI components
|
113 |
return (
|
114 |
loading_logs,
|
115 |
+
agent_system,
|
116 |
+
# Populate and show the sheet selector dropdown
|
117 |
+
gr.update(choices=sheet_names, value=sheet_names[0] if sheet_names else None, visible=True),
|
118 |
+
# Show the query box and button
|
119 |
gr.update(visible=True),
|
120 |
gr.update(visible=True),
|
121 |
+
# Hide the results from any previous run
|
122 |
+
gr.update(visible=False, open=False)
|
123 |
)
|
124 |
except Exception as e:
|
125 |
raise gr.Error(f"Failed to process file: {e}")
|
126 |
|
127 |
+
def generate_response(query, sheet_name, system_state):
    """Gradio callback: validate the inputs, run the query, reveal the results.

    Args:
        query: Text of the user's question.
        sheet_name: Sheet chosen in the dropdown.
        system_state: Object exposing ``query_sheet(query, sheet_name)``,
            or ``None`` when no file has been loaded.

    Returns:
        A 2-tuple of the answer and a ``gr.update`` that makes the results
        accordion visible and open.

    Raises:
        gr.Error: On a missing input, or wrapping any exception raised
            while querying.
    """
    # Checks run in this order; the first failing one decides the message.
    problem = None
    if not query:
        problem = "Please enter a question."
    elif not sheet_name:
        problem = "Please select a sheet to query from the dropdown."
    elif system_state is None:
        problem = "File not loaded. Please upload and load a file first."
    if problem is not None:
        raise gr.Error(problem)

    try:
        reply = system_state.query_sheet(query, sheet_name)
        reveal_results = gr.update(visible=True, open=True)
        return reply, reveal_results
    except Exception as e:
        raise gr.Error(f"Error during query: {e}")
|
144 |
|
145 |
# --- UI Layout ---
|
146 |
|
147 |
+
with gr.Blocks(theme=gr.themes.Soft(primary_hue="blue"), title="Excel AI Agent") as demo:
|
148 |
system_state = gr.State(None)
|
149 |
|
150 |
+
gr.Markdown("# 🤖 Excel AI Agent (Pandas Edition)")
|
151 |
+
gr.Markdown("This version uses a **Pandas Agent** to answer questions by executing code, allowing for mathematical calculations and data analysis.")
|
152 |
|
153 |
with gr.Row():
|
154 |
with gr.Column(scale=1):
|
|
|
161 |
)
|
162 |
file_input = gr.File(label="Upload Excel File", file_types=[".xlsx", ".xls"])
|
163 |
load_button = gr.Button("Load File", variant="primary")
|
164 |
+
status_output = gr.Textbox(label="Indexing Status", interactive=False, lines=8)
|
165 |
|
166 |
with gr.Column(scale=2):
|
167 |
gr.Markdown("### 2. Ask a Question")
|
168 |
+
# This dropdown is now connected to the backend
|
169 |
+
sheet_selector = gr.Dropdown(
|
170 |
+
label="Select a sheet to query",
|
171 |
+
interactive=True,
|
172 |
+
visible=False
|
173 |
+
)
|
174 |
query_input = gr.Textbox(
|
175 |
label="Your Question",
|
176 |
+
placeholder="e.g., 'What is the sum of the sales column?' or 'Which product had the highest profit in March?'",
|
177 |
+
visible=False,
|
178 |
+
lines=3
|
179 |
)
|
180 |
ask_button = gr.Button("Get Answer", variant="primary", visible=False)
|
181 |
|
182 |
+
# Simplified results area
|
183 |
results_accordion = gr.Accordion("Results", open=False, visible=False)
|
184 |
with results_accordion:
|
185 |
+
answer_output = gr.Markdown(label="Answer")
|
|
|
186 |
|
187 |
# --- Event Handlers ---
|
188 |
|
189 |
load_button.click(
|
190 |
fn=process_file,
|
191 |
inputs=[api_key_input, file_input],
|
192 |
+
outputs=[status_output, system_state, sheet_selector, query_input, ask_button, results_accordion]
|
193 |
)
|
194 |
|
195 |
ask_button.click(
|
196 |
fn=generate_response,
|
197 |
+
inputs=[query_input, sheet_selector, system_state],
|
198 |
+
outputs=[answer_output, results_accordion]
|
|
|
|
|
|
|
199 |
)
|
200 |
|
201 |
|
202 |
if __name__ == "__main__":
|
203 |
+
demo.launch(share=True)
|
|
|
|
|
|
|
|
|
|
|
|
|
|