Adityabhaskar committed
Commit 411ac2d · verified · Parent: 1c90d05

Update app.py

Files changed (1): app.py (+72 -109)
app.py CHANGED
@@ -5,8 +5,11 @@ from langchain_core.documents import Document
 from langchain_community.vectorstores import FAISS
 from langchain_openai import OpenAIEmbeddings
 from langchain_text_splitters import RecursiveCharacterTextSplitter
+from langchain.agents.agent_types import AgentType
+from langchain_experimental.agents.agent_toolkits import create_pandas_dataframe_agent
 import re
 import os
+import io
 from typing import Dict, List, Any
 import warnings
 import gradio as gr
@@ -19,11 +22,12 @@ load_dotenv()

 class ExcelAIQuerySystem:
     """
-    A system to query Excel files using natural language, powered by OpenAI and LangChain.
+    An improved system to query Excel files using a Pandas Agent for higher accuracy,
+    with an option to target a specific sheet.
     """
     def __init__(self, openai_api_key: str):
         os.environ["OPENAI_API_KEY"] = openai_api_key
-        self.llm = OpenAI(temperature=0)
+        self.llm = OpenAI(temperature=0)
         self.embeddings = OpenAIEmbeddings()
         self.excel_data = {}
         self.sheet_descriptions = {}
@@ -43,7 +47,6 @@ class ExcelAIQuerySystem:
                 df = pd.read_excel(file_path, sheet_name=sheet_name)
                 df = self._clean_dataframe(df)
                 self.excel_data[sheet_name] = df
-
                 description = self._generate_sheet_description(sheet_name, df)
                 self.sheet_descriptions[sheet_name] = description
                 self.logs.append(f" - Loaded and described sheet '{sheet_name}' ({df.shape[0]} rows × {df.shape[1]} columns)")
@@ -58,31 +61,28 @@ class ExcelAIQuerySystem:
             raise Exception(f"Error loading Excel file: {str(e)}")

     def _clean_dataframe(self, df: pd.DataFrame) -> pd.DataFrame:
-        """Cleans a DataFrame by removing empty rows/columns and converting data types."""
+        """Cleans a DataFrame by removing empty rows/columns, standardizing headers, and converting types."""
         df = df.dropna(how='all').dropna(axis=1, how='all').reset_index(drop=True)
+        df.columns = df.columns.str.strip().str.replace(r'[^a-zA-Z0-9_]', '_', regex=True)
         for col in df.columns:
             if df[col].dtype == 'object':
-                try:
-                    df[col] = pd.to_datetime(df[col], errors='ignore')
-                except:
-                    pass
-                try:
-                    df[col] = pd.to_numeric(df[col], errors='ignore')
-                except:
-                    pass
+                try: df[col] = pd.to_datetime(df[col], errors='ignore')
+                except: pass
+                try: df[col] = pd.to_numeric(df[col], errors='ignore')
+                except: pass
         return df

     def _generate_sheet_description(self, sheet_name: str, df: pd.DataFrame) -> str:
-        """Generates a text description of a DataFrame using an LLM."""
-        sample_data = df.head(3).to_string()
+        """Generates a richer, more detailed description of a DataFrame for better retrieval."""
+        buffer = io.StringIO()
+        df.info(buf=buffer)
         prompt = f"""
-        Analyze this Excel sheet and provide a concise one-paragraph summary.
+        Analyze the metadata of this Excel sheet to provide a concise, one-paragraph summary.
         Sheet Name: {sheet_name}
-        Columns: {list(df.columns)}
-        Sample Data:
-        {sample_data}
-
-        Focus on the main purpose of the data, key metrics, and the time period covered.
+        Dataframe Info: {buffer.getvalue()}
+        First 3 Rows: {df.head(3).to_string()}
+        Summary Stats: {df.describe().to_string()}
+        Based on all the metadata, summarize the sheet's main purpose and the types of data it contains.
         """
         try:
             return self.llm.invoke(prompt)
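
Note on the reworked _generate_sheet_description: instead of only a three-row sample, it now captures the DataFrame's schema via df.info(buf=...) and its summary statistics via df.describe(), and feeds all of that text to the LLM. A minimal sketch of that metadata-gathering step in isolation (the helper name and sample frame below are illustrative, not part of the commit):

import io
import pandas as pd

def sheet_metadata(df: pd.DataFrame) -> str:
    """Collect schema, a small preview, and summary stats as plain text for an LLM prompt."""
    buffer = io.StringIO()
    df.info(buf=buffer)  # dtypes, non-null counts, memory usage
    return "\n\n".join([
        buffer.getvalue(),
        df.head(3).to_string(),     # first rows as a preview
        df.describe().to_string(),  # numeric summary statistics
    ])

df = pd.DataFrame({"region": ["North", "South"], "sales": [120, 90]})
print(sheet_metadata(df))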
@@ -91,100 +91,75 @@ class ExcelAIQuerySystem:

     def _create_vectorstore(self):
         """Creates a FAISS vector store from sheet descriptions for similarity search."""
-        documents = [
-            Document(page_content=desc, metadata={"sheet_name": name})
-            for name, desc in self.sheet_descriptions.items()
-        ]
+        documents = [Document(page_content=desc, metadata={"sheet_name": name}) for name, desc in self.sheet_descriptions.items()]
         text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
         splits = text_splitter.split_documents(documents)
         self.vectorstore = FAISS.from_documents(splits, self.embeddings)

     def identify_relevant_sheets(self, query: str) -> List[str]:
         """Identifies the most relevant sheets for a given query using the vector store."""
-        if not self.vectorstore:
-            return list(self.excel_data.keys())
+        if not self.vectorstore: return list(self.excel_data.keys())
         try:
-            docs = self.vectorstore.similarity_search(query, k=3)
+            docs = self.vectorstore.similarity_search(query, k=5)
             sheet_names = [doc.metadata['sheet_name'] for doc in docs if 'sheet_name' in doc.metadata]
-            return list(dict.fromkeys(sheet_names))[:5]
+            return list(dict.fromkeys(sheet_names))
         except Exception:
             return list(self.excel_data.keys())

-    def query_data(self, query: str) -> Dict[str, Any]:
-        """Processes a user query against the loaded Excel data."""
-        results = {'query': query, 'relevant_sheets': [], 'sheet_results': {}, 'summary': '', 'insights': []}
+    def query_data(self, query: str, target_sheet: str = "Auto-Select") -> Dict[str, Any]:
+        """--- MODIFIED: Processes a query, either against a specific sheet or by auto-selecting the most relevant ones. ---"""
+        results = {'query': query, 'relevant_sheets': [], 'sheet_results': {}, 'summary': ''}
+
         try:
-            relevant_sheets = self.identify_relevant_sheets(query)
+            # Determine which sheets to query
+            if target_sheet and target_sheet != "Auto-Select":
+                relevant_sheets = [target_sheet]
+                if target_sheet not in self.excel_data:
+                    results['summary'] = f"Error: The selected sheet '{target_sheet}' was not found or could not be loaded."
+                    return results
+            else:
+                relevant_sheets = self.identify_relevant_sheets(query)
+
             results['relevant_sheets'] = relevant_sheets

             for sheet_name in relevant_sheets:
-                if sheet_name not in self.excel_data:
-                    continue
-                df = self.excel_data[sheet_name]
-                analysis_prompt = f"""
-                Analyze the data from sheet '{sheet_name}' to answer the query: "{query}"
-                Columns: {list(df.columns)}
-                Sample Data:
-                {df.head(5).to_string()}
+                if sheet_name not in self.excel_data: continue

-                Provide a direct answer, including key numbers, trends, or patterns.
-                """
-                response = self.llm.invoke(analysis_prompt)
-                results['sheet_results'][sheet_name] = {'response': response}
+                df = self.excel_data[sheet_name]
+                pandas_agent = create_pandas_dataframe_agent(self.llm, df, agent_type=AgentType.ZERO_SHOT_REACT_DESCRIPTION, verbose=True)
+                response = pandas_agent.invoke(query)
+                results['sheet_results'][sheet_name] = {'response': response['output']}

             results['summary'] = self._generate_summary(query, results['sheet_results'])
-            results['insights'] = self._extract_insights(results['sheet_results'])
             return results
         except Exception as e:
-            results['summary'] = f"Error processing query: {str(e)}"
+            results['summary'] = f"An error occurred while querying the data: {str(e)}"
             return results

     def _generate_summary(self, query: str, sheet_results: Dict) -> str:
         """Generates a final, consolidated summary from individual sheet analyses."""
-        if not sheet_results:
-            return "No relevant data found to answer the query."
-
-        combined_responses = "\n\n".join(
-            f"--- Analysis from Sheet '{name}' ---\n{res['response']}"
-            for name, res in sheet_results.items()
-        )
-        prompt = f"""
-        Based on the following analyses, provide a final, consolidated answer to the query.
-        Original Query: {query}
-
-        {combined_responses}
-
-        Synthesize these findings into a clear and direct summary.
-        """
-        return self.llm.invoke(prompt)
+        if not sheet_results: return "No relevant data found to answer the query."
+        if len(sheet_results) == 1: return list(sheet_results.values())[0]['response']

-    def _extract_insights(self, sheet_results: Dict) -> List[str]:
-        """Extracts simple, actionable insights from the analysis results."""
-        insights = set()
-        for sheet_name, result in sheet_results.items():
-            response = result.get('response', '').lower()
-            if re.search(r'\b\d+\.?\d*\b', response):
-                insights.add(f"Numerical data found in '{sheet_name}'")
-            trend_keywords = ['increase', 'decrease', 'growth', 'decline', 'trend', 'pattern']
-            if any(keyword in response for keyword in trend_keywords):
-                insights.add(f"Trend analysis available in '{sheet_name}'")
-        return list(insights)
+        combined_responses = "\n\n".join([f"--- Analysis from Sheet '{name}' ---\n{res['response']}" for name, res in sheet_results.items()])
+        prompt = f"The following are answers to the query '{query}' from different data sheets. Synthesize them into a single, cohesive final answer.\n\n{combined_responses}\n\nProvide a final, consolidated answer."
+        return self.llm.invoke(prompt)

 # --- Gradio Interface ---

 def process_file(api_key, file_obj):
-    """Gradio function to load the file and prepare the system."""
-    if not api_key:
-        raise gr.Error("OpenAI API Key is required.")
-    if file_obj is None:
-        raise gr.Error("Please upload an Excel file.")
+    """--- MODIFIED: Also returns the list of sheet names to populate the dropdown. ---"""
+    if not api_key: raise gr.Error("OpenAI API Key is required.")
+    if file_obj is None: raise gr.Error("Please upload an Excel file.")
     try:
         excel_system = ExcelAIQuerySystem(api_key)
         loading_logs = excel_system.load_excel_file(file_obj.name)
+        sheet_names = ["Auto-Select"] + list(excel_system.excel_data.keys())

         return (
             loading_logs,
             excel_system,
+            gr.update(choices=sheet_names, value="Auto-Select", visible=True), # Update dropdown
             gr.update(visible=True),
             gr.update(visible=True),
             gr.update(visible=True)
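
The central change in query_data is that each relevant sheet is now answered by a LangChain pandas DataFrame agent, which writes and executes pandas code against the sheet, rather than by prompting the LLM with a df.head() snippet. A standalone sketch of that call, assuming langchain_experimental is installed and OPENAI_API_KEY is set; recent langchain_experimental releases may additionally require allow_dangerous_code=True, since the agent executes generated Python:

import pandas as pd
from langchain_openai import OpenAI
from langchain.agents.agent_types import AgentType
from langchain_experimental.agents.agent_toolkits import create_pandas_dataframe_agent

df = pd.DataFrame({"product": ["A", "B", "A"], "revenue": [100, 250, 175]})
llm = OpenAI(temperature=0)

# The agent inspects df, generates pandas code, runs it, and returns the answer.
agent = create_pandas_dataframe_agent(
    llm,
    df,
    agent_type=AgentType.ZERO_SHOT_REACT_DESCRIPTION,
    verbose=True,
)
result = agent.invoke("What is the total revenue for product A?")
print(result["output"])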
@@ -192,55 +167,44 @@ def process_file(api_key, file_obj):
     except Exception as e:
         raise gr.Error(f"Failed to process file: {e}")

-def generate_response(query, system_state):
-    """Gradio function to handle user queries and display results."""
-    if not query:
-        raise gr.Error("Please enter a query.")
-    if system_state is None:
-        raise gr.Error("File not loaded. Please upload and load a file first.")
+def generate_response(query, selected_sheet, system_state):
+    """--- MODIFIED: Passes the selected sheet to the query function. ---"""
+    if not query: raise gr.Error("Please enter a query.")
+    if system_state is None: raise gr.Error("File not loaded. Please upload and load a file first.")

     try:
-        result = system_state.query_data(query)
+        result = system_state.query_data(query, target_sheet=selected_sheet)
         summary = result.get('summary', 'No summary available.')
         sheets = ", ".join(result.get('relevant_sheets', []))
-        insights = ", ".join(result.get('insights', []))
-
-        details = f"**🔍 Relevant Sheets Identified:**\n{sheets}\n\n"
-        if insights:
-            details += f"**💡 Key Insights:**\n{insights}"
-
+        details = f"**🔍 Sheets Queried:**\n{sheets}"
         return summary, details
     except Exception as e:
         raise gr.Error(f"Error during query: {e}")

-# --- UI Layout ---
-
 with gr.Blocks(theme=gr.themes.Soft(), title="Excel AI Query System") as demo:
     system_state = gr.State(None)

     gr.Markdown("# 📊 Excel AI Query System")
-    gr.Markdown("Upload an Excel file, and ask questions about your data in plain English.")
+    gr.Markdown("Upload an Excel file, choose a specific sheet or let the AI decide, and ask questions about your data.")

     with gr.Row():
         with gr.Column(scale=1):
             gr.Markdown("### 1. Setup")
-            api_key_input = gr.Textbox(
-                label="OpenAI API Key",
-                type="password",
-                placeholder="Enter your OpenAI API key...",
-                value=os.getenv("OPENAI_API_KEY", "")
-            )
+            api_key_input = gr.Textbox(label="OpenAI API Key", type="password", placeholder="Enter your OpenAI API key...", value=os.getenv("OPENAI_API_KEY", ""))
             file_input = gr.File(label="Upload Excel File", file_types=[".xlsx", ".xls"])
             load_button = gr.Button("Load File", variant="primary")
-            status_output = gr.Textbox(label="Loading Status", interactive=False, lines=5)
+            status_output = gr.Textbox(label="Loading Status", interactive=False, lines=10)

         with gr.Column(scale=2):
             gr.Markdown("### 2. Ask a Question")
-            query_input = gr.Textbox(
-                label="Your Question",
-                placeholder="e.g., 'What were the total sales in Q3?' or 'Show me the performance trend for Product X.'",
-                visible=False
+            # --- NEW: Dropdown for sheet selection ---
+            sheet_selector = gr.Dropdown(
+                label="Select a sheet to query",
+                info="Choose 'Auto-Select' to let the AI find the best sheet.",
+                visible=False,
+                interactive=True
             )
+            query_input = gr.Textbox(label="Your Question", placeholder="e.g., 'What is the average revenue?'", visible=False)
             ask_button = gr.Button("Get Answer", variant="primary", visible=False)

             results_accordion = gr.Accordion("Results", open=False, visible=False)
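
The new sheet_selector dropdown starts hidden and is only populated after a workbook loads: process_file returns gr.update(choices=..., value="Auto-Select", visible=True) in the output slot wired to the dropdown. A minimal, self-contained sketch of that pattern (component and function names here are illustrative; on Gradio 4+, returning gr.Dropdown(choices=..., visible=True) also works):

import gradio as gr

def load(_file):
    # Pretend these sheet names were read from the uploaded workbook.
    sheets = ["Auto-Select", "Sales", "Inventory"]
    return gr.update(choices=sheets, value="Auto-Select", visible=True)

with gr.Blocks() as demo:
    file_in = gr.File(label="Upload Excel File")
    sheet_dd = gr.Dropdown(label="Select a sheet to query", visible=False, interactive=True)
    file_in.upload(fn=load, inputs=file_in, outputs=sheet_dd)

if __name__ == "__main__":
    demo.launch()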
@@ -248,21 +212,20 @@ with gr.Blocks(theme=gr.themes.Soft(), title="Excel AI Query System") as demo:
                 summary_output = gr.Markdown(label="Summary")
                 details_output = gr.Markdown(label="Details")

-    # --- Event Handlers ---
-
     load_button.click(
         fn=process_file,
         inputs=[api_key_input, file_input],
-        outputs=[status_output, system_state, query_input, ask_button, results_accordion]
+        outputs=[status_output, system_state, sheet_selector, query_input, ask_button, results_accordion]
     )

     ask_button.click(
         fn=generate_response,
-        inputs=[query_input, system_state],
+        inputs=[query_input, sheet_selector, system_state], # Add sheet_selector to inputs
         outputs=[summary_output, details_output]
     ).then(
         lambda: gr.update(open=True),
         outputs=results_accordion
     )
+
 if __name__ == "__main__":
     demo.launch()
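
When "Auto-Select" is chosen, sheet routing still goes through the FAISS store built from the LLM-generated sheet descriptions: each description becomes a Document tagged with its sheet name, and similarity_search(query, k=5) returns candidates whose metadata decides which sheets to query. A compact sketch of that retrieval path, assuming OPENAI_API_KEY is set for the embeddings (the example descriptions are made up):

from langchain_core.documents import Document
from langchain_community.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings

descriptions = {
    "Sales": "Monthly revenue per region for 2023.",
    "Inventory": "Stock levels and reorder points per SKU.",
}
docs = [Document(page_content=d, metadata={"sheet_name": n}) for n, d in descriptions.items()]

store = FAISS.from_documents(docs, OpenAIEmbeddings())
hits = store.similarity_search("Which region had the highest revenue?", k=5)

# Deduplicate while preserving ranking order, as identify_relevant_sheets does.
relevant = list(dict.fromkeys(h.metadata["sheet_name"] for h in hits))
print(relevant)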
 