Adityabhaskar committed on
Commit f837f58 · verified · 1 Parent(s): 0d5b676

Update app.py

Files changed (1)
  1. app.py +67 -102
app.py CHANGED
@@ -1,15 +1,11 @@
 import pandas as pd
-import numpy as np
-from langchain_openai import OpenAIEmbeddings, ChatOpenAI # MODIFIED
+from langchain_openai import OpenAIEmbeddings, ChatOpenAI
 from langchain_core.documents import Document
 from langchain_community.vectorstores import FAISS
 from langchain_text_splitters import RecursiveCharacterTextSplitter
-from langchain.agents.agent_types import AgentType
-from langchain_experimental.agents.agent_toolkits import create_pandas_dataframe_agent
-import re
+from langchain.chains.qa_with_sources import load_qa_with_sources_chain
 import os
-import io
-from typing import Dict, List, Any
+from typing import Dict, Any
 import warnings
 import gradio as gr
 from dotenv import load_dotenv
@@ -18,37 +14,48 @@ warnings.filterwarnings('ignore')
 load_dotenv()
 
 class ExcelAIQuerySystem:
+    """
+    A system to query Excel files using a reliable "Chunk and Search" (RAG) method.
+    This method is good for lookups but not for mathematical aggregations.
+    """
     def __init__(self, openai_api_key: str):
         os.environ["OPENAI_API_KEY"] = openai_api_key
-        # --- USE A MORE CAPABLE CHAT MODEL ---
         self.llm = ChatOpenAI(temperature=0, model_name="gpt-3.5-turbo")
         self.embeddings = OpenAIEmbeddings()
-        self.excel_data = {}
-        self.sheet_descriptions = {}
-        self.vectorstore = None
+        self.sheet_data_stores: Dict[str, FAISS] = {}  # Store a vector store for each sheet
         self.logs = []
+        self.sheet_names = []
 
     def load_excel_file(self, file_path: str) -> str:
         self.logs.clear()
         try:
             excel_file = pd.ExcelFile(file_path)
-            sheet_names = excel_file.sheet_names
-            self.logs.append(f"✅ Found {len(sheet_names)} sheets: {', '.join(sheet_names)}")
+            self.sheet_names = excel_file.sheet_names
+            self.logs.append(f"✅ Found {len(self.sheet_names)} sheets: {', '.join(self.sheet_names)}")
 
-            for sheet_name in sheet_names:
+            text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
+
+            for sheet_name in self.sheet_names:
                 try:
                     df = pd.read_excel(file_path, sheet_name=sheet_name)
                     df = self._clean_dataframe(df)
-                    self.excel_data[sheet_name] = df
-                    description = self._generate_sheet_description(sheet_name, df)
-                    self.sheet_descriptions[sheet_name] = description
-                    self.logs.append(f" - Loaded and described sheet '{sheet_name}' ({df.shape[0]} rows × {df.shape[1]} columns)")
+
+                    # Convert dataframe to a single text document
+                    # Using markdown format for better structure
+                    markdown_text = df.to_markdown(index=False)
+
+                    # Create documents and split them into chunks
+                    doc = Document(page_content=markdown_text, metadata={"sheet_name": sheet_name})
+                    chunks = text_splitter.split_documents([doc])
+
+                    # Create a FAISS vector store for the chunks
+                    self.sheet_data_stores[sheet_name] = FAISS.from_documents(chunks, self.embeddings)
+                    self.logs.append(f" - Indexed sheet '{sheet_name}' ({df.shape[0]} rows × {df.shape[1]} columns)")
                 except Exception as e:
-                    self.logs.append(f"⚠️ Error loading sheet '{sheet_name}': {str(e)}")
+                    self.logs.append(f"⚠️ Error processing sheet '{sheet_name}': {str(e)}")
                     continue
 
-            self._create_vectorstore()
-            self.logs.append("✅ Vector store created successfully.")
+            self.logs.append("✅ All sheets processed and indexed.")
             return "\n".join(self.logs)
         except Exception as e:
             raise Exception(f"Error loading Excel file: {str(e)}")
@@ -56,102 +63,60 @@ class ExcelAIQuerySystem:
     def _clean_dataframe(self, df: pd.DataFrame) -> pd.DataFrame:
         df = df.dropna(how='all').dropna(axis=1, how='all').reset_index(drop=True)
         df.columns = df.columns.str.strip().str.replace(r'[^a-zA-Z0-9_]', '_', regex=True)
+        # Convert all data to string to ensure consistency for text processing
         for col in df.columns:
-            if df[col].dtype == 'object':
-                try: df[col] = pd.to_datetime(df[col], errors='ignore')
-                except: pass
-                try: df[col] = pd.to_numeric(df[col], errors='ignore')
-                except: pass
+            df[col] = df[col].astype(str)
         return df
 
-    def _generate_sheet_description(self, sheet_name: str, df: pd.DataFrame) -> str:
-        buffer = io.StringIO()
-        df.info(buf=buffer)
-        prompt = f"""
-        Analyze the metadata of this Excel sheet to provide a concise, one-paragraph summary.
-        Sheet Name: {sheet_name}
-        Dataframe Info: {buffer.getvalue()}
-        First 3 Rows: {df.head(3).to_string()}
-        Summary Stats: {df.describe().to_string()}
-        Based on all the metadata, summarize the sheet's main purpose and the types of data it contains.
+    def query_data(self, query: str, target_sheet: str) -> Dict[str, Any]:
         """
-        try:
-            return self.llm.invoke(prompt).content
-        except Exception:
-            return f"Sheet: {sheet_name}, Columns: {', '.join(list(df.columns))}"
-
-    def _create_vectorstore(self):
-        documents = [Document(page_content=desc, metadata={"sheet_name": name}) for name, desc in self.sheet_descriptions.items()]
-        text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
-        splits = text_splitter.split_documents(documents)
-        self.vectorstore = FAISS.from_documents(splits, self.embeddings)
-
-    def identify_relevant_sheets(self, query: str) -> List[str]:
-        if not self.vectorstore: return list(self.excel_data.keys())
-        try:
-            docs = self.vectorstore.similarity_search(query, k=5)
-            sheet_names = [doc.metadata['sheet_name'] for doc in docs if 'sheet_name' in doc.metadata]
-            return list(dict.fromkeys(sheet_names))
-        except Exception:
-            return list(self.excel_data.keys())
-
-    def query_data(self, query: str, target_sheet: str = "Auto-Select") -> Dict[str, Any]:
-        results = {'query': query, 'relevant_sheets': [], 'sheet_results': {}, 'summary': ''}
+        --- NEW LOGIC ---
+        Searches for relevant data chunks and uses an LLM to answer based on them.
+        """
+        results = {'query': query, 'summary': ''}
 
-        try:
-            if target_sheet and target_sheet != "Auto-Select":
-                relevant_sheets = [target_sheet]
-                if target_sheet not in self.excel_data:
-                    results['summary'] = f"Error: The selected sheet '{target_sheet}' was not found or could not be loaded."
-                    return results
-            else:
-                relevant_sheets = self.identify_relevant_sheets(query)
-
-            results['relevant_sheets'] = relevant_sheets
+        if not target_sheet or target_sheet not in self.sheet_data_stores:
+            results['summary'] = "Error: Please select a valid sheet to query."
+            return results
 
-            for sheet_name in relevant_sheets:
-                if sheet_name not in self.excel_data: continue
-
-                df = self.excel_data[sheet_name]
-                # --- INCREASE THE ITERATION LIMIT ---
-                pandas_agent = create_pandas_dataframe_agent(
-                    self.llm,
-                    df,
-                    agent_type=AgentType.ZERO_SHOT_REACT_DESCRIPTION,
-                    verbose=True,
-                    allow_dangerous_code=True,
-                    max_iterations=50
-                )
-                response = pandas_agent.invoke(query)
-                results['sheet_results'][sheet_name] = {'response': response['output']}
+        try:
+            vector_store = self.sheet_data_stores[target_sheet]
+
+            # Find the most relevant data chunks for the query
+            relevant_docs = vector_store.similarity_search(query, k=5)
+
+            # Create a Question-Answering chain
+            qa_chain = load_qa_with_sources_chain(self.llm, chain_type="stuff")
 
-            results['summary'] = self._generate_summary(query, results['sheet_results'])
+            # Run the chain with the relevant docs
+            response = qa_chain.invoke(
+                {"input_documents": relevant_docs, "question": query},
+                return_only_outputs=True
+            )
+
+            results['summary'] = response.get('output_text', "Could not find an answer in the data.")
             return results
         except Exception as e:
             results['summary'] = f"An error occurred while querying the data: {str(e)}"
             return results
 
-    def _generate_summary(self, query: str, sheet_results: Dict) -> str:
-        if not sheet_results: return "No relevant data found to answer the query."
-        if len(sheet_results) == 1: return list(sheet_results.values())[0]['response']
+# --- Gradio Interface ---
+# Simplified to work with the new RAG logic
 
-        combined_responses = "\n\n".join([f"--- Analysis from Sheet '{name}' ---\n{res['response']}" for name, res in sheet_results.items()])
-        prompt = f"The following are answers to the query '{query}' from different data sheets. Synthesize them into a single, cohesive final answer.\n\n{combined_responses}\n\nProvide a final, consolidated answer."
-        return self.llm.invoke(prompt).content
-
-    # --- Gradio Interface (No changes needed) ---
 def process_file(api_key, file_obj):
     if not api_key: raise gr.Error("OpenAI API Key is required.")
     if file_obj is None: raise gr.Error("Please upload an Excel file.")
     try:
         excel_system = ExcelAIQuerySystem(api_key)
         loading_logs = excel_system.load_excel_file(file_obj.name)
-        sheet_names = ["Auto-Select"] + list(excel_system.excel_data.keys())
+
+        # Now a sheet must be selected, so we don't include "Auto-Select"
+        sheet_names = excel_system.sheet_names
 
         return (
             loading_logs,
             excel_system,
-            gr.update(choices=sheet_names, value="Auto-Select", visible=True),
+            gr.update(choices=sheet_names, value=sheet_names[0] if sheet_names else None, visible=True),
             gr.update(visible=True),
             gr.update(visible=True),
             gr.update(visible=True)
@@ -162,39 +127,39 @@ def process_file(api_key, file_obj):
 def generate_response(query, selected_sheet, system_state):
     if not query: raise gr.Error("Please enter a query.")
     if system_state is None: raise gr.Error("File not loaded. Please upload and load a file first.")
+    if not selected_sheet: raise gr.Error("Please select a sheet to query.")
 
     try:
         result = system_state.query_data(query, target_sheet=selected_sheet)
         summary = result.get('summary', 'No summary available.')
-        sheets = ", ".join(result.get('relevant_sheets', []))
-        details = f"**🔍 Sheets Queried:**\n{sheets}"
+        details = f"**🔍 Searched in Sheet:**\n{selected_sheet}"
         return summary, details
     except Exception as e:
         raise gr.Error(f"Error during query: {e}")
 
 with gr.Blocks(theme=gr.themes.Soft(), title="Excel AI Query System") as demo:
     system_state = gr.State(None)
-    gr.Markdown("# 📊 Excel AI Query System")
-    gr.Markdown("Upload an Excel file, choose a specific sheet or let the AI decide, and ask questions about your data.")
+    gr.Markdown("# 📊 Excel AI Query System (Chunk & Search Edition)")
+    gr.Markdown("This version finds specific information in your Excel file. It is not designed for math or whole-dataset calculations.")
     with gr.Row():
         with gr.Column(scale=1):
             gr.Markdown("### 1. Setup")
             api_key_input = gr.Textbox(label="OpenAI API Key", type="password", placeholder="Enter your OpenAI API key...", value=os.getenv("OPENAI_API_KEY", ""))
             file_input = gr.File(label="Upload Excel File", file_types=[".xlsx", ".xls"])
             load_button = gr.Button("Load File", variant="primary")
-            status_output = gr.Textbox(label="Loading Status", interactive=False, lines=10)
+            status_output = gr.Textbox(label="Indexing Status", interactive=False, lines=10)
         with gr.Column(scale=2):
             gr.Markdown("### 2. Ask a Question")
             sheet_selector = gr.Dropdown(
                 label="Select a sheet to query",
-                info="Choose 'Auto-Select' to let the AI find the best sheet.",
+                info="You must select a sheet.",
                 visible=False,
                 interactive=True
             )
-            query_input = gr.Textbox(label="Your Question", placeholder="e.g., 'What is the average revenue?'", visible=False)
+            query_input = gr.Textbox(label="Your Question", placeholder="e.g., 'What are the details for order #12345?'", visible=False)
             ask_button = gr.Button("Get Answer", variant="primary", visible=False)
             with gr.Accordion("Results", open=False, visible=False) as results_accordion:
-                summary_output = gr.Markdown(label="Summary")
+                summary_output = gr.Markdown(label="Answer")
                 details_output = gr.Markdown(label="Details")
     load_button.click(
        fn=process_file,
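
For reference, here is a minimal usage sketch of the updated class outside the Gradio UI. It assumes app.py is importable as a module and that OPENAI_API_KEY is set in the environment; the workbook path and example question are placeholders, not files from this repository.

import os

from app import ExcelAIQuerySystem  # the class defined in this commit

# Indexing: each sheet is cleaned, rendered to markdown, chunked, and stored in its own FAISS index.
system = ExcelAIQuerySystem(openai_api_key=os.environ["OPENAI_API_KEY"])
print(system.load_excel_file("sales_data.xlsx"))  # "sales_data.xlsx" is a placeholder path

# Querying: a specific sheet must be named; retrieval answers lookups, not whole-dataset math.
result = system.query_data(
    "What are the details for order #12345?",  # placeholder question
    target_sheet=system.sheet_names[0],
)
print(result["summary"])

Note that the new load path calls df.to_markdown(), which requires the tabulate package to be installed alongside pandas.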