Adityabhaskar committed on
Commit 0c623ad · verified · 1 Parent(s): ec25353

Update app.py

Files changed (1):
  1. app.py +137 -138
app.py CHANGED
@@ -1,178 +1,177 @@
-import pandas as pd
-from langchain_openai import OpenAIEmbeddings, ChatOpenAI
-from langchain_core.documents import Document
-from langchain_community.vectorstores import FAISS
-from langchain_text_splitters import RecursiveCharacterTextSplitter
-from langchain.chains.qa_with_sources import load_qa_with_sources_chain
 import os
-from typing import Dict, Any
-import warnings
 import gradio as gr
-from dotenv import load_dotenv
+import pandas as pd
+from typing import List, Dict, Any
 
-warnings.filterwarnings('ignore')
-load_dotenv()
+# --- LlamaIndex & LangChain Imports ---
+from llama_index.core import VectorStoreIndex, Document, Settings
+from llama_index.llms.openai import OpenAI as LlamaOpenAI
+from llama_index.embeddings.openai import OpenAIEmbedding
+from langchain_experimental.agents.agent_toolkits import create_pandas_dataframe_agent
+from langchain_openai import ChatOpenAI
+from langchain.agents.agent_types import AgentType
+from langchain.chains.qa_with_sources import load_qa_with_sources_chain
 
-class ExcelAIQuerySystem:
+class HybridExcelQuerySystem:
     """
-    A system to query Excel files using a reliable "Chunk and Search" (RAG) method.
-    This method is good for lookups but not for mathematical aggregations.
+    Implements a hybrid system that uses a RAG tool for lookups and a Pandas Agent for calculations.
     """
     def __init__(self, openai_api_key: str):
         os.environ["OPENAI_API_KEY"] = openai_api_key
-        self.llm = ChatOpenAI(temperature=0, model_name="gpt-3.5-turbo")
-        self.embeddings = OpenAIEmbeddings()
-        self.sheet_data_stores: Dict[str, FAISS] = {} # Store a vector store for each sheet
+        # For LlamaIndex (RAG)
+        Settings.llm = LlamaOpenAI(model="gpt-4o")
+        Settings.embed_model = OpenAIEmbedding(model="text-embedding-3-small")
+        # For LangChain Agent (Calculations)
+        self.agent_llm = ChatOpenAI(temperature=0, model="gpt-4o")
+
+        self.dataframes: Dict[str, pd.DataFrame] = {}
+        self.vector_stores: Dict[str, VectorStoreIndex] = {}
        self.logs = []
        self.sheet_names = []
 
     def load_excel_file(self, file_path: str) -> str:
+        """Loads data from an Excel file and prepares it for both RAG and Agent tools."""
         self.logs.clear()
         try:
-            excel_file = pd.ExcelFile(file_path)
-            self.sheet_names = excel_file.sheet_names
+            xls = pd.ExcelFile(file_path)
+            self.sheet_names = xls.sheet_names
             self.logs.append(f"✅ Found {len(self.sheet_names)} sheets: {', '.join(self.sheet_names)}")
 
-            text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
-
             for sheet_name in self.sheet_names:
-                try:
-                    df = pd.read_excel(file_path, sheet_name=sheet_name)
-                    df = self._clean_dataframe(df)
-
-                    # Convert dataframe to a single text document
-                    # Using markdown format for better structure
-                    markdown_text = df.to_markdown(index=False)
-
-                    # Create documents and split them into chunks
-                    doc = Document(page_content=markdown_text, metadata={"source": sheet_name})
-                    chunks = text_splitter.split_documents([doc])
-
-                    # Create a FAISS vector store for the chunks
-                    self.sheet_data_stores[sheet_name] = FAISS.from_documents(chunks, self.embeddings)
-                    self.logs.append(f" - Indexed sheet '{sheet_name}' ({df.shape[0]} rows × {df.shape[1]} columns)")
-                except Exception as e:
-                    self.logs.append(f"⚠️ Error processing sheet '{sheet_name}': {str(e)}")
-                    continue
+                df = pd.read_excel(file_path, sheet_name=sheet_name)
+
+                # --- Prepare for Agent ---
+                self.dataframes[sheet_name] = self._clean_dataframe_for_agent(df.copy())
+
+                # --- Prepare for RAG ---
+                rag_df = self._clean_dataframe_for_rag(df.copy())
+                markdown_text = rag_df.to_markdown(index=False)
+                doc = Document(text=markdown_text, metadata={"source": sheet_name})
+                self.vector_stores[sheet_name] = VectorStoreIndex.from_documents([doc])
+
+                self.logs.append(f" - Prepared sheet '{sheet_name}' for both Lookup and Calculation.")
 
-            self.logs.append("✅ All sheets processed and indexed.")
+            self.logs.append("✅ All sheets are ready.")
             return "\n".join(self.logs)
         except Exception as e:
-            raise Exception(f"Error loading Excel file: {str(e)}")
+            raise Exception(f"Error loading Excel file: {e}")
 
-    def _clean_dataframe(self, df: pd.DataFrame) -> pd.DataFrame:
-        df = df.dropna(how='all').dropna(axis=1, how='all').reset_index(drop=True)
+    def _clean_dataframe_for_agent(self, df: pd.DataFrame) -> pd.DataFrame:
         df.columns = df.columns.str.strip().str.replace(r'[^a-zA-Z0-9_]', '_', regex=True)
-        # Convert all data to string to ensure consistency for text processing
+        return df
+
+    def _clean_dataframe_for_rag(self, df: pd.DataFrame) -> pd.DataFrame:
         for col in df.columns:
             df[col] = df[col].astype(str)
         return df
 
-    def query_data(self, query: str, target_sheet: str) -> Dict[str, Any]:
-        """
-        --- NEW LOGIC ---
-        Searches for relevant data chunks and uses an LLM to answer based on them.
+    def _classify_query(self, query: str) -> str:
+        """Uses an LLM to classify the query as 'lookup' or 'calculation'."""
+        prompt = f"""
+        Classify the user's query about an Excel sheet as either "lookup" or "calculation".
+        - "lookup": Use for questions asking for specific data, text, or summaries that can likely be found in a few rows. Examples: 'What are the details for order X?', 'Summarize the notes for July'.
+        - "calculation": Use for questions that require mathematical operations (sum, average, count), sorting, filtering, or finding trends across the entire dataset. Examples: 'What is the total revenue?', 'Find the top 3 months by profit', 'How many entries are there?'.
+
+        User Query: "{query}"
+        Classification:
         """
-        results = {'query': query, 'summary': ''}
-
-        if not target_sheet or target_sheet not in self.sheet_data_stores:
-            results['summary'] = "Error: Please select a valid sheet to query."
-            return results
+        response = self.agent_llm.invoke(prompt)
+        classification = response.content.strip().lower()
+        # Default to lookup for safety if classification is unclear
+        return "calculation" if "calculation" in classification else "lookup"
+
+    def query(self, query: str, selected_sheet: str) -> Dict[str, Any]:
+        """The main query function that routes to the appropriate tool."""
+        if not selected_sheet:
+            return {"answer": "Error: Please select a sheet first.", "tool_used": "None"}
+
+        classification = self._classify_query(query)
+
+        if classification == "calculation":
+            return self._execute_agent_query(query, selected_sheet)
+        else: # Default to RAG for lookups
+            return self._execute_rag_query(query, selected_sheet)
 
+    def _execute_rag_query(self, query: str, sheet_name: str) -> Dict[str, Any]:
+        """Handles lookup queries using the RAG tool."""
         try:
-            vector_store = self.sheet_data_stores[target_sheet]
-
-            # Find the most relevant data chunks for the query
-            relevant_docs = vector_store.similarity_search(query, k=5)
-
-            # Create a Question-Answering chain
-            qa_chain = load_qa_with_sources_chain(self.llm, chain_type="stuff")
-
-            # Run the chain with the relevant docs
-            response = qa_chain.invoke(
-                {"input_documents": relevant_docs, "question": query},
-                return_only_outputs=True
+            query_engine = self.vector_stores[sheet_name].as_query_engine()
+            response = query_engine.query(query)
+            return {"answer": str(response), "tool_used": "Lookup (RAG Search)"}
+        except Exception as e:
+            return {"answer": f"Error during lookup: {e}", "tool_used": "Lookup (RAG Search)"}
+
+    def _execute_agent_query(self, query: str, sheet_name: str) -> Dict[str, Any]:
+        """Handles calculation queries using the Pandas Agent."""
+        try:
+            df = self.dataframes[sheet_name]
+            agent = create_pandas_dataframe_agent(
+                self.agent_llm,
+                df,
+                agent_type=AgentType.ZERO_SHOT_REACT_DESCRIPTION,
+                verbose=True,
+                allow_dangerous_code=True,
+                max_iterations=15
             )
-
-            results['summary'] = response.get('output_text', "Could not find an answer in the data.")
-            return results
+            response = agent.invoke(query)
+            return {"answer": response['output'], "tool_used": "Calculation (Pandas Agent)"}
         except Exception as e:
-            results['summary'] = f"An error occurred while querying the data: {str(e)}"
-            return results
-
-# --- Gradio Interface ---
-# Simplified to work with the new RAG logic
-
-def process_file(api_key, file_obj):
-    if not api_key: raise gr.Error("OpenAI API Key is required.")
-    if file_obj is None: raise gr.Error("Please upload an Excel file.")
-    try:
-        excel_system = ExcelAIQuerySystem(api_key)
-        loading_logs = excel_system.load_excel_file(file_obj.name)
-
-        # Now a sheet must be selected, so we don't include "Auto-Select"
-        sheet_names = excel_system.sheet_names
-
-        return (
-            loading_logs,
-            excel_system,
-            gr.update(choices=sheet_names, value=sheet_names[0] if sheet_names else None, visible=True),
-            gr.update(visible=True),
-            gr.update(visible=True),
-            gr.update(visible=True)
-        )
-    except Exception as e:
-        raise gr.Error(f"Failed to process file: {e}")
-
-def generate_response(query, selected_sheet, system_state):
-    if not query: raise gr.Error("Please enter a query.")
-    if system_state is None: raise gr.Error("File not loaded. Please upload and load a file first.")
-    if not selected_sheet: raise gr.Error("Please select a sheet to query.")
+            return {"answer": f"Error during calculation: {e}", "tool_used": "Calculation (Pandas Agent)"}
+
+# --- Gradio UI ---
+def process_excel(api_key: str, file_obj: gr.File):
+    if not api_key: raise gr.Error("Please provide your OpenAI API key.")
+    if not file_obj: raise gr.Error("Please upload an Excel file.")
+
+    system = HybridExcelQuerySystem(openai_api_key=api_key)
+    logs = system.load_excel_file(file_obj.name)
+    sheet_names = system.sheet_names
+
+    return (
+        logs, system,
+        gr.update(choices=sheet_names, value=sheet_names[0] if sheet_names else None, visible=True),
+        gr.update(visible=True)
+    )
+
+def user_interaction(question: str, history: List, system_state: HybridExcelQuerySystem, selected_sheet: str):
+    if not system_state: raise gr.Error("Please upload and process a file first.")
+
+    result = system_state.query(question, selected_sheet)
+    answer = result.get("answer", "No response.")
+    tool_used = result.get("tool_used", "Unknown")
 
-    try:
-        result = system_state.query_data(query, target_sheet=selected_sheet)
-        summary = result.get('summary', 'No summary available.')
-        details = f"**🔍 Searched in Sheet:**\n{selected_sheet}"
-        return summary, details
-    except Exception as e:
-        raise gr.Error(f"Error during query: {e}")
-
-with gr.Blocks(theme=gr.themes.Soft(), title="Excel AI Query System") as demo:
-    system_state = gr.State(None)
-    gr.Markdown("# 📊 Excel AI Query System (Chunk & Search Edition)")
-    gr.Markdown("This version finds specific information in your Excel file. It is not designed for math or whole-dataset calculations.")
+    # Append the tool used to the answer for transparency
+    full_response = f"{answer}\n\n*Tool Used: {tool_used}*"
+    return full_response
+
+with gr.Blocks(theme=gr.themes.Soft(), title="Hybrid Excel Analyzer") as demo:
+    system_state = gr.State()
+
+    gr.Markdown("# 🤖 Hybrid Excel Analyzer")
+    gr.Markdown("This app automatically chooses the best AI tool—a search tool for lookups or a code-writing agent for calculations—to answer your questions about an Excel file.")
+
     with gr.Row():
         with gr.Column(scale=1):
             gr.Markdown("### 1. Setup")
-            api_key_input = gr.Textbox(label="OpenAI API Key", type="password", placeholder="Enter your OpenAI API key...", value=os.getenv("OPENAI_API_KEY", ""))
-            file_input = gr.File(label="Upload Excel File", file_types=[".xlsx", ".xls"])
-            load_button = gr.Button("Load File", variant="primary")
-            status_output = gr.Textbox(label="Indexing Status", interactive=False, lines=10)
+            openai_api_key = gr.Textbox(label="OpenAI API Key", type="password", value=os.getenv("OPENAI_API_KEY", ""))
+            excel_upload = gr.File(label="Upload Excel File", file_types=[".xlsx", ".xls"])
+            process_button = gr.Button("Process File", variant="primary")
+            status_text = gr.Textbox(label="Processing Status", interactive=False, lines=8)
+
         with gr.Column(scale=2):
             gr.Markdown("### 2. Ask a Question")
-            sheet_selector = gr.Dropdown(
-                label="Select a sheet to query",
-                info="You must select a sheet.",
-                visible=False,
-                interactive=True
-            )
-            query_input = gr.Textbox(label="Your Question", placeholder="e.g., 'What are the details for order #12345?'", visible=False)
-            ask_button = gr.Button("Get Answer", variant="primary", visible=False)
-            with gr.Accordion("Results", open=False, visible=False) as results_accordion:
-                summary_output = gr.Markdown(label="Answer")
-                details_output = gr.Markdown(label="Details")
-    load_button.click(
-        fn=process_file,
-        inputs=[api_key_input, file_input],
-        outputs=[status_output, system_state, sheet_selector, query_input, ask_button, results_accordion]
-    )
-    ask_button.click(
-        fn=generate_response,
-        inputs=[query_input, sheet_selector, system_state],
-        outputs=[summary_output, details_output]
-    ).then(
-        lambda: gr.update(open=True),
-        outputs=results_accordion
+            with gr.Group(visible=False) as query_ui:
+                sheet_selector = gr.Dropdown(label="Select a Sheet")
+                chat_interface = gr.ChatInterface(
+                    fn=user_interaction,
+                    additional_inputs=[system_state, sheet_selector],
+                    title="Chat with your Excel Data"
+                )
+
+    process_button.click(
+        fn=process_excel,
+        inputs=[openai_api_key, excel_upload],
+        outputs=[status_text, system_state, sheet_selector, query_ui]
     )
 
 if __name__ == "__main__":
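
For reference, here is a minimal sketch (illustrative only, not part of the commit) of how the new HybridExcelQuerySystem could be exercised directly, without the Gradio UI. It assumes the updated app.py is importable as a module named app, that OPENAI_API_KEY is set in the environment, and that a workbook example.xlsx with a sheet named Sales exists; those names are placeholders.

# Standalone usage sketch under the assumptions stated above.
import os

from app import HybridExcelQuerySystem  # hypothetical import path for the updated app.py

system = HybridExcelQuerySystem(os.environ["OPENAI_API_KEY"])

# Index every sheet for both tools: raw dataframes for the pandas agent,
# one LlamaIndex vector store per sheet for RAG lookups.
print(system.load_excel_file("example.xlsx"))  # placeholder workbook name

# _classify_query() should label this "lookup", so the sheet's query engine
# (RAG search over the markdown-rendered rows) answers it.
lookup = system.query("What are the details for order #12345?", selected_sheet="Sales")
print(lookup["tool_used"], "->", lookup["answer"])

# This one should be labelled "calculation", so the pandas dataframe agent
# writes and runs pandas code against the cleaned dataframe.
calc = system.query("What is the total revenue across all rows?", selected_sheet="Sales")
print(calc["tool_used"], "->", calc["answer"])

Because _classify_query() falls back to "lookup" whenever the classification is unclear, ambiguous questions are routed to the cheaper RAG path rather than to the code-executing agent.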