Update app.py
app.py CHANGED
@@ -1,289 +1,139 @@
-import pandas as pd
-import numpy as np
-from langchain_openai import OpenAI
-from langchain_core.documents import Document
-from langchain_community.vectorstores import FAISS
-from langchain_openai import OpenAIEmbeddings
-from langchain_text_splitters import RecursiveCharacterTextSplitter
-import re
-import os
-from typing import Dict, List, Any
-import warnings
 import gradio as gr
-
 
-#
-
-# Load environment variables from .env file
-load_dotenv()
 
-
     """
-…
     """
-…
-        try:
-            excel_file = pd.ExcelFile(file_path)
-            sheet_names = excel_file.sheet_names
-            self.logs.append(f"✅ Found {len(sheet_names)} sheets: {', '.join(sheet_names)}")
-
-            for sheet_name in sheet_names:
-                try:
-                    df = pd.read_excel(file_path, sheet_name=sheet_name)
-                    df = self._clean_dataframe(df)
-                    self.excel_data[sheet_name] = df
-
-                    description = self._generate_sheet_description(sheet_name, df)
-                    self.sheet_descriptions[sheet_name] = description
-                    self.logs.append(f"  - Loaded and described sheet '{sheet_name}' ({df.shape[0]} rows × {df.shape[1]} columns)")
-                except Exception as e:
-                    self.logs.append(f"⚠️ Error loading sheet '{sheet_name}': {str(e)}")
-                    continue
-
-            self._create_vectorstore()
-            self.logs.append("✅ Vector store created successfully.")
-            return "\n".join(self.logs)
-        except Exception as e:
-            raise Exception(f"Error loading Excel file: {str(e)}")
 
-    def _clean_dataframe(self, df: pd.DataFrame) -> pd.DataFrame:
-        """Cleans a DataFrame by removing empty rows/columns and converting data types."""
-        df = df.dropna(how='all').dropna(axis=1, how='all').reset_index(drop=True)
-        for col in df.columns:
-            if df[col].dtype == 'object':
-                try:
-                    df[col] = pd.to_datetime(df[col], errors='ignore')
-                except:
-                    pass
-                try:
-                    df[col] = pd.to_numeric(df[col], errors='ignore')
-                except:
-                    pass
-        return df
 
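One caveat in the removed `_clean_dataframe`: `errors='ignore'` is deprecated for `pd.to_datetime` / `pd.to_numeric` as of pandas 2.2, so the silent-fallback intent is better spelled out explicitly. A minimal, version-safe sketch of the same cleanup, recast as a free function:

```python
import pandas as pd

def clean_dataframe(df: pd.DataFrame) -> pd.DataFrame:
    """Drop empty rows/columns, then best-effort convert object columns."""
    df = df.dropna(how='all').dropna(axis=1, how='all').reset_index(drop=True)
    for col in df.columns:
        if df[col].dtype == 'object':
            # Try datetime first, then numeric, mirroring the original order.
            for convert in (pd.to_datetime, pd.to_numeric):
                try:
                    df[col] = convert(df[col])  # raises instead of errors='ignore'
                    break
                except (ValueError, TypeError):
                    pass  # leave the column as-is if conversion fails
    return df
```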
-…
-        prompt = f"""
-        Analyze this Excel sheet and provide a concise one-paragraph summary.
-        Sheet Name: {sheet_name}
-        Columns: {list(df.columns)}
-        Sample Data:
-        {sample_data}
-
-        Focus on the main purpose of the data, key metrics, and the time period covered.
-        """
-        try:
-            return self.llm.invoke(prompt)
-        except Exception:
-            return f"Sheet: {sheet_name}, Columns: {', '.join(list(df.columns))}"
-
-    def _create_vectorstore(self):
-        """Creates a FAISS vector store from sheet descriptions for similarity search."""
-        documents = [
-            Document(page_content=desc, metadata={"sheet_name": name})
-            for name, desc in self.sheet_descriptions.items()
-        ]
-        text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
-        splits = text_splitter.split_documents(documents)
-        self.vectorstore = FAISS.from_documents(splits, self.embeddings)
 
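The method that consumed this index for sheet routing is mostly missing from the diff; only its tail survives in the next hunk. For orientation, here is a minimal sketch of what the lookup plausibly looked like, recast as a free function. The `similarity_search` call and `k=5` are assumptions; the method name `identify_relevant_sheets` comes from the call site that survives further down.

```python
from typing import List
from langchain_community.vectorstores import FAISS

def identify_relevant_sheets(vectorstore: FAISS, all_sheets: List[str], query: str) -> List[str]:
    """Route a query to likely sheets via the FAISS index of sheet descriptions."""
    try:
        docs = vectorstore.similarity_search(query, k=5)  # assumed k; the [:5] cap survives below
        sheet_names = [doc.metadata['sheet_name'] for doc in docs if 'sheet_name' in doc.metadata]
        return list(dict.fromkeys(sheet_names))[:5]  # dedupe while preserving order
    except Exception:
        return all_sheets  # fall back to every sheet, as the surviving except branch does
```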
-…
-            sheet_names = [doc.metadata['sheet_name'] for doc in docs if 'sheet_name' in doc.metadata]
-            return list(dict.fromkeys(sheet_names))[:5]
-        except Exception:
-            return list(self.excel_data.keys())
 
-…
-        """
-        results = {'query': query, 'relevant_sheets': [], 'sheet_results': {}, 'summary': '', 'insights': []}
-        try:
-            # If a specific sheet is selected (and it's not the default auto-select), use it.
-            if selected_sheet and selected_sheet != "Auto-Select based on Query":
-                relevant_sheets = [selected_sheet]
-            else:
-                relevant_sheets = self.identify_relevant_sheets(query)
-
-            results['relevant_sheets'] = relevant_sheets
 
-…
-                Analyze the data from sheet '{sheet_name}' to answer the query: "{query}"
-                Columns: {list(df.columns)}
-                Sample Data:
-                {df.head(5).to_string()}
-
-                Provide a direct answer, including key numbers, trends, or patterns.
-                """
-                response = self.llm.invoke(analysis_prompt)
-                results['sheet_results'][sheet_name] = {'response': response}
-
-            results['summary'] = self._generate_summary(query, results['sheet_results'])
-            results['insights'] = self._extract_insights(results['sheet_results'])
-            return results
-        except Exception as e:
-            results['summary'] = f"Error processing query: {str(e)}"
-            return results
 
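For orientation, the dict that `query_data` hands back to the UI layer has this shape (values are illustrative, not actual output):

```python
# Illustrative result structure; keys match the initializer on the line above.
results = {
    'query': 'What were the total sales in Q3?',
    'relevant_sheets': ['Q3 Sales'],
    'sheet_results': {'Q3 Sales': {'response': '...per-sheet LLM answer...'}},
    'summary': '...consolidated answer across sheets...',
    'insights': ["Numerical data found in 'Q3 Sales'"],
}
```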
-…
-        if not sheet_results:
-            return "No relevant data found to answer the query."
 
-…
 )
-        prompt = f"""
-        Based on the following analyses, provide a final, consolidated answer to the query.
-        Original Query: {query}
 
-…
 
-…
-    def _extract_insights(self, sheet_results: Dict) -> List[str]:
-        """Extracts simple, actionable insights from the analysis results."""
-        insights = set()
-        for sheet_name, result in sheet_results.items():
-            response = result.get('response', '').lower()
-            if re.search(r'\b\d+\.?\d*\b', response):
-                insights.add(f"Numerical data found in '{sheet_name}'")
-            trend_keywords = ['increase', 'decrease', 'growth', 'decline', 'trend', 'pattern']
-            if any(keyword in response for keyword in trend_keywords):
-                insights.add(f"Trend analysis available in '{sheet_name}'")
-        return list(insights)
-
-# --- Gradio Interface ---
-
-def process_file(api_key, file_obj):
-    """Gradio function to load the file and prepare the system."""
-    if not api_key:
-        raise gr.Error("OpenAI API Key is required.")
-    if file_obj is None:
-        raise gr.Error("Please upload an Excel file.")
-    try:
-        excel_system = ExcelAIQuerySystem(api_key)
-        loading_logs = excel_system.load_excel_file(file_obj.name)
 
-        # …
 
-…
-            excel_system,
-            gr.update(choices=dropdown_choices, value=dropdown_choices[0], visible=True),  # Update dropdown
-            gr.update(visible=True),  # Query input
-            gr.update(visible=True),  # Ask button
-            gr.update(visible=True)   # Results accordion
-        )
-    except Exception as e:
-        raise gr.Error(f"Failed to process file: {e}")
-
-def generate_response(query, sheet_selection, system_state):
-    """Gradio function to handle user queries and display results."""
-    if not query:
-        raise gr.Error("Please enter a query.")
-    if system_state is None:
-        raise gr.Error("File not loaded. Please upload and load a file first.")
-
-    try:
-        # Pass the selected sheet to the query function
-        result = system_state.query_data(query, selected_sheet=sheet_selection)
-        summary = result.get('summary', 'No summary available.')
-        sheets = ", ".join(result.get('relevant_sheets', []))
-        insights = ", ".join(result.get('insights', []))
 
-…
-        if insights:
-            details += f"**💡 Key Insights:**\n{insights}"
-
-        return summary, details
-    except Exception as e:
-        raise gr.Error(f"Error during query: {e}")
-
-# --- UI Layout ---
 
-…
 …
     with gr.Row():
         with gr.Column(scale=1):
-            gr.…
-…
-                placeholder="Enter your OpenAI API key...",
-                value=os.getenv("OPENAI_API_KEY", "")
-            )
-            file_input = gr.File(label="Upload Excel File", file_types=[".xlsx", ".xls"])
-            load_button = gr.Button("Load File", variant="primary")
-            status_output = gr.Textbox(label="Loading Status", interactive=False, lines=5)
 
         with gr.Column(scale=2):
-            gr.…
-…
-                info="Choose a specific sheet, or let the AI decide automatically.",
-                visible=False
-            )
-            query_input = gr.Textbox(
-                label="Your Question",
-                placeholder="e.g., 'What were the total sales in Q3?' or 'Show me the performance trend for Product X.'",
-                visible=False
-            )
-            ask_button = gr.Button("Get Answer", variant="primary", visible=False)
-
-            results_accordion = gr.Accordion("Results", open=False, visible=False)
-            with results_accordion:
-                summary_output = gr.Markdown(label="Summary")
-                details_output = gr.Markdown(label="Details")
-
-    # --- Event Handlers ---
 
-…
 )
 
-…
-        fn=…
-        inputs=[…
-        outputs=…
-    ).then(
-        lambda: gr.update(open=True),
-        outputs=results_accordion
     )
 
 if __name__ == "__main__":
-…

 import gradio as gr
+import os
+import pandas as pd
+import tempfile
+import nest_asyncio
+from llama_index.llms.openai import OpenAI
+from llama_index.core import VectorStoreIndex
+from llama_parse import LlamaParse
+from llama_index.core.node_parser import MarkdownElementNodeParser
 
+# Apply nest_asyncio to handle async operations in Gradio
+nest_asyncio.apply()
 
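Worth noting for readers unfamiliar with `nest_asyncio`: LlamaParse drives its parsing jobs through asyncio even when called synchronously, and Gradio already runs an event loop of its own, so without `nest_asyncio.apply()` the nested loop typically fails with an "event loop is already running" error.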
+def get_sheet_names(file):
     """
+    Reads an uploaded Excel file and returns its sheet names.
+    This function is triggered when a file is uploaded.
     """
+    if file is None:
+        # No file, so return an empty, non-interactive dropdown
+        return gr.Dropdown(choices=[], interactive=False)
+    try:
+        # Use pandas to quickly get sheet names without loading all data
+        xls = pd.ExcelFile(file.name)
+        sheet_names = xls.sheet_names
+        # Return an updated, interactive dropdown with the sheet names
+        return gr.Dropdown(choices=sheet_names, value=sheet_names[0], interactive=True)
+    except Exception as e:
+        gr.Warning(f"Could not read Excel file: {e}")
+        return gr.Dropdown(choices=[], interactive=False)
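Returning a fresh `gr.Dropdown(...)` from an event handler is the Gradio 4.x idiom for updating an existing component in place; only the properties you pass are changed. On a 3.x install the equivalent spelling would be (version-dependent, shown only for contrast):

```python
# Gradio 3.x spelling of the same update; not needed on 4.x.
return gr.Dropdown.update(choices=sheet_names, value=sheet_names[0], interactive=True)
```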
 
 
+def process_and_query(openai_api_key, llama_parse_api_key, uploaded_file, selected_sheet, query_text):
+    """
+    Processes a selected sheet from an uploaded Excel file and answers a user's query.
+
+    Args:
+        openai_api_key (str): The user's OpenAI API key.
+        llama_parse_api_key (str): The user's LlamaParse API key.
+        uploaded_file (gradio.File): The uploaded Excel file object.
+        selected_sheet (str): The name of the sheet to process.
+        query_text (str): The question to ask about the document.
+
+    Returns:
+        str: The answer to the query or an error message.
+    """
+    if not all([openai_api_key, llama_parse_api_key, uploaded_file, selected_sheet, query_text]):
+        return "Error: Please provide all inputs - both API keys, a file, a selected sheet, and a query."
+
+    # Create a temporary file to store the selected sheet's data
+    temp_file = None
+    try:
+        os.environ["OPENAI_API_KEY"] = openai_api_key
+        llm = OpenAI(model="gpt-4o-mini", api_key=openai_api_key)
+
+        # Read the selected sheet using pandas
+        df = pd.read_excel(uploaded_file.name, sheet_name=selected_sheet)
+
+        # Save the sheet's data to a temporary CSV file for LlamaParse
+        with tempfile.NamedTemporaryFile(mode='w+', delete=False, suffix=".csv", encoding='utf-8') as temp_file:
+            df.to_csv(temp_file.name, index=False)
+            temp_file_path = temp_file.name
+
+        # Initialize LlamaParse
+        parser = LlamaParse(
+            api_key=llama_parse_api_key,
+            result_type="markdown",
+            verbose=True
         )
 
+        # Load data from the temporary file containing only the selected sheet
+        documents = parser.load_data(temp_file_path)
 
+        # Initialize the node parser and process the documents
+        node_parser = MarkdownElementNodeParser(llm=llm, num_workers=4)
+        nodes = node_parser.get_nodes_from_documents(documents)
+        base_nodes, objects = node_parser.get_nodes_and_objects(nodes)
 
+        # Create the index and query engine
+        recursive_index = VectorStoreIndex(nodes=base_nodes + objects, llm=llm)
+        query_engine = recursive_index.as_query_engine(similarity_top_k=5, llm=llm)
 
+        # Execute the query
+        response = query_engine.query(query_text)
 
+        return str(response)
 
+    except Exception as e:
+        return f"An error occurred: {e}"
+    finally:
+        # Clean up the temporary file
+        if temp_file and os.path.exists(temp_file.name):
+            os.unlink(temp_file.name)
 
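Two things are worth noting about the pipeline above. First, the `get_nodes_and_objects` split is what makes this work on spreadsheet-shaped content: `MarkdownElementNodeParser` separates plain-text nodes (`base_nodes`) from index-node objects wrapping the tables LlamaParse extracted, and indexing both lets the query engine retrieve a table's summary and then drill into the underlying rows.

Second, because all the heavy lifting lives in `process_and_query`, it can be smoke-tested without launching the UI. A minimal sketch, assuming real keys and a local workbook; `SimpleNamespace` stands in for the `gr.File` object, of which only the `.name` attribute is used:

```python
from types import SimpleNamespace

# Hypothetical values; replace with real keys, a real workbook path, and a real sheet name.
answer = process_and_query(
    openai_api_key="sk-...",
    llama_parse_api_key="llx-...",
    uploaded_file=SimpleNamespace(name="databook.xlsx"),
    selected_sheet="Sheet1",
    query_text="What were the total revenues in 2022?",
)
print(answer)
```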
+# --- Gradio Interface ---
+with gr.Blocks(
+    title="Excel Sheet Q&A",
+    theme=gr.themes.Soft(),
+    css=".gradio-container {background: linear-gradient(to right, #C9D6FF, #E2E2E2)}"
+) as iface:
+    gr.Markdown(
+        """
+        # 📊 Excel Sheet Q&A
+
+        1. Enter your API keys.
+        2. Upload an Excel file.
+        3. Choose a specific sheet from the dropdown.
+        4. Ask a question about the data in that sheet.
+        """
+    )
+
     with gr.Row():
         with gr.Column(scale=1):
+            openai_key_input = gr.Textbox(label="OpenAI API Key", type="password")
+            llamaparse_key_input = gr.Textbox(label="LlamaParse API Key", type="password")
+            file_uploader = gr.File(label="Upload Excel Databook", file_types=[".xlsx", ".xls"])
+            sheet_selector = gr.Dropdown(label="Choose a Sheet", interactive=False)
 
         with gr.Column(scale=2):
+            query_input = gr.Textbox(label="Your Question", placeholder="e.g., What were the total revenues in 2022?", lines=8)
+            submit_button = gr.Button("Ask Question", variant="primary")
+            output_display = gr.Markdown(label="Answer")
 
+    # Define the interactive workflow
+    file_uploader.upload(
+        fn=get_sheet_names,
+        inputs=[file_uploader],
+        outputs=[sheet_selector]
     )
 
+    submit_button.click(
+        fn=process_and_query,
+        inputs=[openai_key_input, llamaparse_key_input, file_uploader, sheet_selector, query_input],
+        outputs=output_display
     )
 
 if __name__ == "__main__":
+    iface.launch(share=True)
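A final note on `iface.launch(share=True)`: the `share` flag asks Gradio to open a temporary public tunnel, which is handy for local runs; on Hugging Face Spaces the app is already publicly served, and Gradio simply ignores the flag with a warning, so leaving it in is harmless.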