Update app.py
app.py
CHANGED
@@ -1,178 +1,177 @@
-import pandas as pd
-from langchain_openai import OpenAIEmbeddings, ChatOpenAI
-from langchain_core.documents import Document
-from langchain_community.vectorstores import FAISS
-from langchain_text_splitters import RecursiveCharacterTextSplitter
-from langchain.chains.qa_with_sources import load_qa_with_sources_chain
 import os
-from typing import Dict, Any
-import warnings
 import gradio as gr

-class
     """
-    This method is good for lookups but not for mathematical aggregations.
     """
     def __init__(self, openai_api_key: str):
         os.environ["OPENAI_API_KEY"] = openai_api_key
         self.logs = []
         self.sheet_names = []

     def load_excel_file(self, file_path: str) -> str:
         self.logs.clear()
         try:
-            self.sheet_names =
             self.logs.append(f"✅ Found {len(self.sheet_names)} sheets: {', '.join(self.sheet_names)}")

-            text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
             for sheet_name in self.sheet_names:
-                    # Create a FAISS vector store for the chunks
-                    self.sheet_data_stores[sheet_name] = FAISS.from_documents(chunks, self.embeddings)
-                    self.logs.append(f" - Indexed sheet '{sheet_name}' ({df.shape[0]} rows × {df.shape[1]} columns)")
-                except Exception as e:
-                    self.logs.append(f"⚠️ Error processing sheet '{sheet_name}': {str(e)}")
-                    continue

-            self.logs.append("✅ All sheets
             return "\n".join(self.logs)
         except Exception as e:
-            raise Exception(f"Error loading Excel file: {

-    def
-        df = df.dropna(how='all').dropna(axis=1, how='all').reset_index(drop=True)
         df.columns = df.columns.str.strip().str.replace(r'[^a-zA-Z0-9_]', '_', regex=True)
         for col in df.columns:
             df[col] = df[col].astype(str)
         return df

-    def
-        """
         """
         try:
             )
-            return results
         except Exception as e:
-        )
-    except Exception as e:
-        raise gr.Error(f"Failed to process file: {e}")
-def generate_response(query, selected_sheet, system_state):
-    if not query: raise gr.Error("Please enter a query.")
-    if system_state is None: raise gr.Error("File not loaded. Please upload and load a file first.")
-    if not selected_sheet: raise gr.Error("Please select a sheet to query.")

-    gr.Markdown("# 📊 Excel AI Query System (Chunk & Search Edition)")
-    gr.Markdown("This version finds specific information in your Excel file. It is not designed for math or whole-dataset calculations.")
     with gr.Row():
         with gr.Column(scale=1):
             gr.Markdown("### 1. Setup")
         with gr.Column(scale=2):
             gr.Markdown("### 2. Ask a Question")
-                label="Select a
-        fn=process_file,
-        inputs=[api_key_input, file_input],
-        outputs=[status_output, system_state, sheet_selector, query_input, ask_button, results_accordion]
-    )
-    ask_button.click(
-        fn=generate_response,
-        inputs=[query_input, sheet_selector, system_state],
-        outputs=[summary_output, details_output]
-    ).then(
-        lambda: gr.update(open=True),
-        outputs=results_accordion
     )

 if __name__ == "__main__":
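Most of the removed implementation did not survive the page extraction; only the fragments shown above remain. For orientation, a minimal sketch of the chunk-and-search indexing those fragments imply (a per-sheet FAISS store built from RecursiveCharacterTextSplitter chunks) is given below. The function name, the CSV serialization, and the surrounding structure are assumptions, not the original code.

# Hypothetical reconstruction of the removed indexing step -- not the original app.py.
# Assumes each sheet was serialized to text, split into overlapping chunks, and
# indexed in its own FAISS vector store keyed by sheet name.
import pandas as pd
from langchain_openai import OpenAIEmbeddings
from langchain_core.documents import Document
from langchain_community.vectorstores import FAISS
from langchain_text_splitters import RecursiveCharacterTextSplitter

def index_sheet(df: pd.DataFrame, sheet_name: str) -> FAISS:
    embeddings = OpenAIEmbeddings()
    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    # Serialize the sheet to plain text; the original serialization is not recoverable.
    doc = Document(page_content=df.to_csv(index=False), metadata={"source": sheet_name})
    chunks = splitter.split_documents([doc])
    # Create a FAISS vector store for the chunks (mirrors the surviving comment above).
    return FAISS.from_documents(chunks, embeddings)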
 import os
 import gradio as gr
+import pandas as pd
+from typing import List, Dict, Any

+# --- LlamaIndex & LangChain Imports ---
+from llama_index.core import VectorStoreIndex, Document, Settings
+from llama_index.llms.openai import OpenAI as LlamaOpenAI
+from llama_index.embeddings.openai import OpenAIEmbedding
+from langchain_experimental.agents.agent_toolkits import create_pandas_dataframe_agent
+from langchain_openai import ChatOpenAI
+from langchain.agents.agent_types import AgentType
+from langchain.chains.qa_with_sources import load_qa_with_sources_chain

+class HybridExcelQuerySystem:
     """
+    Implements a hybrid system that uses a RAG tool for lookups and a Pandas Agent for calculations.
     """
     def __init__(self, openai_api_key: str):
         os.environ["OPENAI_API_KEY"] = openai_api_key
+        # For LlamaIndex (RAG)
+        Settings.llm = LlamaOpenAI(model="gpt-4o")
+        Settings.embed_model = OpenAIEmbedding(model="text-embedding-3-small")
+        # For LangChain Agent (Calculations)
+        self.agent_llm = ChatOpenAI(temperature=0, model="gpt-4o")
+
+        self.dataframes: Dict[str, pd.DataFrame] = {}
+        self.vector_stores: Dict[str, VectorStoreIndex] = {}
         self.logs = []
         self.sheet_names = []

     def load_excel_file(self, file_path: str) -> str:
+        """Loads data from an Excel file and prepares it for both RAG and Agent tools."""
         self.logs.clear()
         try:
+            xls = pd.ExcelFile(file_path)
+            self.sheet_names = xls.sheet_names
             self.logs.append(f"✅ Found {len(self.sheet_names)} sheets: {', '.join(self.sheet_names)}")

             for sheet_name in self.sheet_names:
+                df = pd.read_excel(file_path, sheet_name=sheet_name)
+
+                # --- Prepare for Agent ---
+                self.dataframes[sheet_name] = self._clean_dataframe_for_agent(df.copy())
+
+                # --- Prepare for RAG ---
+                rag_df = self._clean_dataframe_for_rag(df.copy())
+                markdown_text = rag_df.to_markdown(index=False)
+                doc = Document(text=markdown_text, metadata={"source": sheet_name})
+                self.vector_stores[sheet_name] = VectorStoreIndex.from_documents([doc])
+
+                self.logs.append(f" - Prepared sheet '{sheet_name}' for both Lookup and Calculation.")

+            self.logs.append("✅ All sheets are ready.")
             return "\n".join(self.logs)
         except Exception as e:
+            raise Exception(f"Error loading Excel file: {e}")

+    def _clean_dataframe_for_agent(self, df: pd.DataFrame) -> pd.DataFrame:
         df.columns = df.columns.str.strip().str.replace(r'[^a-zA-Z0-9_]', '_', regex=True)
+        return df
+
+    def _clean_dataframe_for_rag(self, df: pd.DataFrame) -> pd.DataFrame:
         for col in df.columns:
             df[col] = df[col].astype(str)
         return df

+    def _classify_query(self, query: str) -> str:
+        """Uses an LLM to classify the query as 'lookup' or 'calculation'."""
+        prompt = f"""
+        Classify the user's query about an Excel sheet as either "lookup" or "calculation".
+        - "lookup": Use for questions asking for specific data, text, or summaries that can likely be found in a few rows. Examples: 'What are the details for order X?', 'Summarize the notes for July'.
+        - "calculation": Use for questions that require mathematical operations (sum, average, count), sorting, filtering, or finding trends across the entire dataset. Examples: 'What is the total revenue?', 'Find the top 3 months by profit', 'How many entries are there?'.
+
+        User Query: "{query}"
+        Classification:
         """
+        response = self.agent_llm.invoke(prompt)
+        classification = response.content.strip().lower()
+        # Default to lookup for safety if classification is unclear
+        return "calculation" if "calculation" in classification else "lookup"
+
+    def query(self, query: str, selected_sheet: str) -> Dict[str, Any]:
+        """The main query function that routes to the appropriate tool."""
+        if not selected_sheet:
+            return {"answer": "Error: Please select a sheet first.", "tool_used": "None"}
+
+        classification = self._classify_query(query)
+
+        if classification == "calculation":
+            return self._execute_agent_query(query, selected_sheet)
+        else:  # Default to RAG for lookups
+            return self._execute_rag_query(query, selected_sheet)

+    def _execute_rag_query(self, query: str, sheet_name: str) -> Dict[str, Any]:
+        """Handles lookup queries using the RAG tool."""
         try:
+            query_engine = self.vector_stores[sheet_name].as_query_engine()
+            response = query_engine.query(query)
+            return {"answer": str(response), "tool_used": "Lookup (RAG Search)"}
+        except Exception as e:
+            return {"answer": f"Error during lookup: {e}", "tool_used": "Lookup (RAG Search)"}
+
+    def _execute_agent_query(self, query: str, sheet_name: str) -> Dict[str, Any]:
+        """Handles calculation queries using the Pandas Agent."""
+        try:
+            df = self.dataframes[sheet_name]
+            agent = create_pandas_dataframe_agent(
+                self.agent_llm,
+                df,
+                agent_type=AgentType.ZERO_SHOT_REACT_DESCRIPTION,
+                verbose=True,
+                allow_dangerous_code=True,  # required so the agent can execute the Python code it generates
+                max_iterations=15
             )
+            response = agent.invoke(query)
+            return {"answer": response['output'], "tool_used": "Calculation (Pandas Agent)"}
         except Exception as e:
+            return {"answer": f"Error during calculation: {e}", "tool_used": "Calculation (Pandas Agent)"}
+
+# --- Gradio UI ---
+def process_excel(api_key: str, file_obj: gr.File):
+    if not api_key: raise gr.Error("Please provide your OpenAI API key.")
+    if not file_obj: raise gr.Error("Please upload an Excel file.")
+
+    system = HybridExcelQuerySystem(openai_api_key=api_key)
+    logs = system.load_excel_file(file_obj.name)
+    sheet_names = system.sheet_names
+
+    return (
+        logs, system,
+        gr.update(choices=sheet_names, value=sheet_names[0] if sheet_names else None, visible=True),
+        gr.update(visible=True)
+    )
+
+def user_interaction(question: str, history: List, system_state: HybridExcelQuerySystem, selected_sheet: str):
+    if not system_state: raise gr.Error("Please upload and process a file first.")
+
+    result = system_state.query(question, selected_sheet)
+    answer = result.get("answer", "No response.")
+    tool_used = result.get("tool_used", "Unknown")

+    # Append the tool used to the answer for transparency
+    full_response = f"{answer}\n\n*Tool Used: {tool_used}*"
+    return full_response
+
+with gr.Blocks(theme=gr.themes.Soft(), title="Hybrid Excel Analyzer") as demo:
+    system_state = gr.State()
+
+    gr.Markdown("# 🤖 Hybrid Excel Analyzer")
+    gr.Markdown("This app automatically chooses the best AI tool—a search tool for lookups or a code-writing agent for calculations—to answer your questions about an Excel file.")
+
     with gr.Row():
         with gr.Column(scale=1):
             gr.Markdown("### 1. Setup")
+            openai_api_key = gr.Textbox(label="OpenAI API Key", type="password", value=os.getenv("OPENAI_API_KEY", ""))
+            excel_upload = gr.File(label="Upload Excel File", file_types=[".xlsx", ".xls"])
+            process_button = gr.Button("Process File", variant="primary")
+            status_text = gr.Textbox(label="Processing Status", interactive=False, lines=8)
+
         with gr.Column(scale=2):
             gr.Markdown("### 2. Ask a Question")
+            with gr.Group(visible=False) as query_ui:
+                sheet_selector = gr.Dropdown(label="Select a Sheet")
+                chat_interface = gr.ChatInterface(
+                    fn=user_interaction,
+                    additional_inputs=[system_state, sheet_selector],
+                    title="Chat with your Excel Data"
+                )
+
+    process_button.click(
+        fn=process_excel,
+        inputs=[openai_api_key, excel_upload],
+        outputs=[status_text, system_state, sheet_selector, query_ui]
     )

 if __name__ == "__main__":
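The diff ends at the if __name__ == "__main__": guard; the launch call that follows is cut off. For reference, a minimal way to exercise the new class outside the Gradio UI could look like the sketch below. The file name, sheet name, and API key are placeholders, and the dependency list is inferred from the imports rather than taken from the Space's requirements file.

# Assumed dependencies (inferred from the imports; not part of the diff):
#   pip install gradio pandas openpyxl tabulate langchain langchain-openai \
#       langchain-experimental llama-index llama-index-llms-openai llama-index-embeddings-openai

from app import HybridExcelQuerySystem  # assumes the new file is saved as app.py

system = HybridExcelQuerySystem(openai_api_key="sk-...")       # placeholder key
print(system.load_excel_file("report.xlsx"))                   # placeholder workbook
print(system.query("Summarize the notes for July", "Sales"))   # classified as "lookup" -> RAG query engine
print(system.query("What is the total revenue?", "Sales"))     # classified as "calculation" -> Pandas Agent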