Adityabhaskar committed
Commit d51ad5a · verified · 1 Parent(s): 13d63c1

Update app.py

Files changed (1)
  1. app.py +109 -259
app.py CHANGED
@@ -1,289 +1,139 @@
- import pandas as pd
- import numpy as np
- from langchain_openai import OpenAI
- from langchain_core.documents import Document
- from langchain_community.vectorstores import FAISS
- from langchain_openai import OpenAIEmbeddings
- from langchain_text_splitters import RecursiveCharacterTextSplitter
- import re
- import os
- from typing import Dict, List, Any
- import warnings
  import gradio as gr
- from dotenv import load_dotenv

- # Ignore warnings for a cleaner interface
- warnings.filterwarnings('ignore')
- # Load environment variables from .env file
- load_dotenv()

- class ExcelAIQuerySystem:
      """
-     A system to query Excel files using natural language, powered by OpenAI and LangChain.
      """
-     def __init__(self, openai_api_key: str):
-         os.environ["OPENAI_API_KEY"] = openai_api_key
-         self.llm = OpenAI(temperature=0)
-         self.embeddings = OpenAIEmbeddings()
-         self.excel_data = {}
-         self.sheet_descriptions = {}
-         self.vectorstore = None
-         self.logs = []
-
-     def load_excel_file(self, file_path: str) -> str:
-         """Loads and processes an Excel file, generating descriptions and a vector store."""
-         self.logs.clear()
-         try:
-             excel_file = pd.ExcelFile(file_path)
-             sheet_names = excel_file.sheet_names
-             self.logs.append(f"✅ Found {len(sheet_names)} sheets: {', '.join(sheet_names)}")
-
-             for sheet_name in sheet_names:
-                 try:
-                     df = pd.read_excel(file_path, sheet_name=sheet_name)
-                     df = self._clean_dataframe(df)
-                     self.excel_data[sheet_name] = df
-
-                     description = self._generate_sheet_description(sheet_name, df)
-                     self.sheet_descriptions[sheet_name] = description
-                     self.logs.append(f" - Loaded and described sheet '{sheet_name}' ({df.shape[0]} rows × {df.shape[1]} columns)")
-                 except Exception as e:
-                     self.logs.append(f"⚠️ Error loading sheet '{sheet_name}': {str(e)}")
-                     continue
-
-             self._create_vectorstore()
-             self.logs.append("✅ Vector store created successfully.")
-             return "\n".join(self.logs)
-         except Exception as e:
-             raise Exception(f"Error loading Excel file: {str(e)}")

-     def _clean_dataframe(self, df: pd.DataFrame) -> pd.DataFrame:
-         """Cleans a DataFrame by removing empty rows/columns and converting data types."""
-         df = df.dropna(how='all').dropna(axis=1, how='all').reset_index(drop=True)
-         for col in df.columns:
-             if df[col].dtype == 'object':
-                 try:
-                     df[col] = pd.to_datetime(df[col], errors='ignore')
-                 except:
-                     pass
-                 try:
-                     df[col] = pd.to_numeric(df[col], errors='ignore')
-                 except:
-                     pass
-         return df

-     def _generate_sheet_description(self, sheet_name: str, df: pd.DataFrame) -> str:
-         """Generates a text description of a DataFrame using an LLM."""
-         sample_data = df.head(3).to_string()
-         prompt = f"""
-         Analyze this Excel sheet and provide a concise one-paragraph summary.
-         Sheet Name: {sheet_name}
-         Columns: {list(df.columns)}
-         Sample Data:
-         {sample_data}
-
-         Focus on the main purpose of the data, key metrics, and the time period covered.
-         """
-         try:
-             return self.llm.invoke(prompt)
-         except Exception:
-             return f"Sheet: {sheet_name}, Columns: {', '.join(list(df.columns))}"
-
-     def _create_vectorstore(self):
-         """Creates a FAISS vector store from sheet descriptions for similarity search."""
-         documents = [
-             Document(page_content=desc, metadata={"sheet_name": name})
-             for name, desc in self.sheet_descriptions.items()
-         ]
-         text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
-         splits = text_splitter.split_documents(documents)
-         self.vectorstore = FAISS.from_documents(splits, self.embeddings)

-     def identify_relevant_sheets(self, query: str) -> List[str]:
-         """Identifies the most relevant sheets for a given query using the vector store."""
-         if not self.vectorstore:
-             return list(self.excel_data.keys())
-         try:
-             docs = self.vectorstore.similarity_search(query, k=3)
-             sheet_names = [doc.metadata['sheet_name'] for doc in docs if 'sheet_name' in doc.metadata]
-             return list(dict.fromkeys(sheet_names))[:5]
-         except Exception:
-             return list(self.excel_data.keys())

-     def query_data(self, query: str, selected_sheet: str = None) -> Dict[str, Any]:
-         """
-         Processes a user query against the loaded Excel data.
-         If a sheet is selected, it queries that sheet directly.
-         Otherwise, it identifies the most relevant sheets.
-         """
-         results = {'query': query, 'relevant_sheets': [], 'sheet_results': {}, 'summary': '', 'insights': []}
-         try:
-             # If a specific sheet is selected (and it's not the default auto-select), use it.
-             if selected_sheet and selected_sheet != "Auto-Select based on Query":
-                 relevant_sheets = [selected_sheet]
-             else:
-                 relevant_sheets = self.identify_relevant_sheets(query)
-
-             results['relevant_sheets'] = relevant_sheets

-             for sheet_name in relevant_sheets:
-                 if sheet_name not in self.excel_data:
-                     continue
-                 df = self.excel_data[sheet_name]
-                 analysis_prompt = f"""
-                 Analyze the data from sheet '{sheet_name}' to answer the query: "{query}"
-                 Columns: {list(df.columns)}
-                 Sample Data:
-                 {df.head(5).to_string()}
-
-                 Provide a direct answer, including key numbers, trends, or patterns.
-                 """
-                 response = self.llm.invoke(analysis_prompt)
-                 results['sheet_results'][sheet_name] = {'response': response}
-
-             results['summary'] = self._generate_summary(query, results['sheet_results'])
-             results['insights'] = self._extract_insights(results['sheet_results'])
-             return results
-         except Exception as e:
-             results['summary'] = f"Error processing query: {str(e)}"
-             return results

-     def _generate_summary(self, query: str, sheet_results: Dict) -> str:
-         """Generates a final, consolidated summary from individual sheet analyses."""
-         if not sheet_results:
-             return "No relevant data found to answer the query."

-         combined_responses = "\n\n".join(
-             f"--- Analysis from Sheet '{name}' ---\n{res['response']}"
-             for name, res in sheet_results.items()
          )
-         prompt = f"""
-         Based on the following analyses, provide a final, consolidated answer to the query.
-         Original Query: {query}

-         {combined_responses}

-         Synthesize these findings into a clear and direct summary.
-         """
-         return self.llm.invoke(prompt)
-
-     def _extract_insights(self, sheet_results: Dict) -> List[str]:
-         """Extracts simple, actionable insights from the analysis results."""
-         insights = set()
-         for sheet_name, result in sheet_results.items():
-             response = result.get('response', '').lower()
-             if re.search(r'\b\d+\.?\d*\b', response):
-                 insights.add(f"Numerical data found in '{sheet_name}'")
-             trend_keywords = ['increase', 'decrease', 'growth', 'decline', 'trend', 'pattern']
-             if any(keyword in response for keyword in trend_keywords):
-                 insights.add(f"Trend analysis available in '{sheet_name}'")
-         return list(insights)
-
- # --- Gradio Interface ---
-
- def process_file(api_key, file_obj):
-     """Gradio function to load the file and prepare the system."""
-     if not api_key:
-         raise gr.Error("OpenAI API Key is required.")
-     if file_obj is None:
-         raise gr.Error("Please upload an Excel file.")
-     try:
-         excel_system = ExcelAIQuerySystem(api_key)
-         loading_logs = excel_system.load_excel_file(file_obj.name)

-         # Get sheet names for the dropdown
-         sheet_names = list(excel_system.excel_data.keys())
-         dropdown_choices = ["Auto-Select based on Query"] + sheet_names

-         return (
-             loading_logs,
-             excel_system,
-             gr.update(choices=dropdown_choices, value=dropdown_choices[0], visible=True), # Update dropdown
-             gr.update(visible=True), # Query input
-             gr.update(visible=True), # Ask button
-             gr.update(visible=True) # Results accordion
-         )
-     except Exception as e:
-         raise gr.Error(f"Failed to process file: {e}")
-
- def generate_response(query, sheet_selection, system_state):
-     """Gradio function to handle user queries and display results."""
-     if not query:
-         raise gr.Error("Please enter a query.")
-     if system_state is None:
-         raise gr.Error("File not loaded. Please upload and load a file first.")
-
-     try:
-         # Pass the selected sheet to the query function
-         result = system_state.query_data(query, selected_sheet=sheet_selection)
-         summary = result.get('summary', 'No summary available.')
-         sheets = ", ".join(result.get('relevant_sheets', []))
-         insights = ", ".join(result.get('insights', []))

-         details = f"**🔍 Relevant Sheets Identified:**\n{sheets}\n\n"
-         if insights:
-             details += f"**💡 Key Insights:**\n{insights}"
-
-         return summary, details
-     except Exception as e:
-         raise gr.Error(f"Error during query: {e}")
-
- # --- UI Layout ---

- with gr.Blocks(theme=gr.themes.Soft(), title="Excel AI Query System") as demo:
-     system_state = gr.State(None)
-
-     gr.Markdown("# 📊 Excel AI Query System")
-     gr.Markdown("Upload an Excel file, and ask questions about your data in plain English.")

      with gr.Row():
          with gr.Column(scale=1):
-             gr.Markdown("### 1. Setup")
-             api_key_input = gr.Textbox(
-                 label="OpenAI API Key",
-                 type="password",
-                 placeholder="Enter your OpenAI API key...",
-                 value=os.getenv("OPENAI_API_KEY", "")
-             )
-             file_input = gr.File(label="Upload Excel File", file_types=[".xlsx", ".xls"])
-             load_button = gr.Button("Load File", variant="primary")
-             status_output = gr.Textbox(label="Loading Status", interactive=False, lines=5)

          with gr.Column(scale=2):
-             gr.Markdown("### 2. Ask a Question")
-             sheet_selector = gr.Dropdown(
-                 label="📄 Select a Sheet to Query",
-                 info="Choose a specific sheet, or let the AI decide automatically.",
-                 visible=False
-             )
-             query_input = gr.Textbox(
-                 label="Your Question",
-                 placeholder="e.g., 'What were the total sales in Q3?' or 'Show me the performance trend for Product X.'",
-                 visible=False
-             )
-             ask_button = gr.Button("Get Answer", variant="primary", visible=False)
-
-     results_accordion = gr.Accordion("Results", open=False, visible=False)
-     with results_accordion:
-         summary_output = gr.Markdown(label="Summary")
-         details_output = gr.Markdown(label="Details")
-
-     # --- Event Handlers ---

-     load_button.click(
-         fn=process_file,
-         inputs=[api_key_input, file_input],
-         outputs=[status_output, system_state, sheet_selector, query_input, ask_button, results_accordion]
      )

-     ask_button.click(
-         fn=generate_response,
-         inputs=[query_input, sheet_selector, system_state], # Add sheet_selector as an input
-         outputs=[summary_output, details_output]
-     ).then(
-         lambda: gr.update(open=True),
-         outputs=results_accordion
      )

  if __name__ == "__main__":
-     demo.launch(share=True)
  import gradio as gr
+ import os
+ import pandas as pd
+ import tempfile
+ import nest_asyncio
+ from llama_index.llms.openai import OpenAI
+ from llama_index.core import VectorStoreIndex
+ from llama_parse import LlamaParse
+ from llama_index.core.node_parser import MarkdownElementNodeParser

+ # Apply nest_asyncio to handle async operations in Gradio
+ nest_asyncio.apply()

+ def get_sheet_names(file):
      """
+     Reads an uploaded Excel file and returns its sheet names.
+     This function is triggered when a file is uploaded.
      """
+     if file is None:
+         # No file, so return an empty, non-interactive dropdown
+         return gr.Dropdown(choices=[], interactive=False)
+     try:
+         # Use pandas to quickly get sheet names without loading all data
+         xls = pd.ExcelFile(file.name)
+         sheet_names = xls.sheet_names
+         # Return an updated, interactive dropdown with the sheet names
+         return gr.Dropdown(choices=sheet_names, value=sheet_names[0], interactive=True)
+     except Exception as e:
+         gr.Warning(f"Could not read Excel file: {e}")
+         return gr.Dropdown(choices=[], interactive=False)


+ def process_and_query(openai_api_key, llama_parse_api_key, uploaded_file, selected_sheet, query_text):
+     """
+     Processes a selected sheet from an uploaded Excel file and answers a user's query.

+     Args:
+         openai_api_key (str): The user's OpenAI API key.
+         llama_parse_api_key (str): The user's LlamaParse API key.
+         uploaded_file (gradio.File): The uploaded Excel file object.
+         selected_sheet (str): The name of the sheet to process.
+         query_text (str): The question to ask about the document.

+     Returns:
+         str: The answer to the query or an error message.
+     """
+     if not all([openai_api_key, llama_parse_api_key, uploaded_file, selected_sheet, query_text]):
+         return "Error: Please provide all inputs - both API keys, a file, a selected sheet, and a query."

+     # Create a temporary file to store the selected sheet's data
+     temp_file = None
+     try:
+         os.environ["OPENAI_API_KEY"] = openai_api_key
+         llm = OpenAI(model="gpt-4o-mini", api_key=openai_api_key)

+         # Read the selected sheet using pandas
+         df = pd.read_excel(uploaded_file.name, sheet_name=selected_sheet)

+         # Save the sheet's data to a temporary CSV file for LlamaParse
+         with tempfile.NamedTemporaryFile(mode='w+', delete=False, suffix=".csv", encoding='utf-8') as temp_file:
+             df.to_csv(temp_file.name, index=False)
+             temp_file_path = temp_file.name
+
+         # Initialize LlamaParse
+         parser = LlamaParse(
+             api_key=llama_parse_api_key,
+             result_type="markdown",
+             verbose=True
          )

+         # Load data from the temporary file containing only the selected sheet
+         documents = parser.load_data(temp_file_path)

+         # Initialize the parser and process the documents
+         node_parser = MarkdownElementNodeParser(llm=llm, num_workers=4)
+         nodes = node_parser.get_nodes_from_documents(documents)
+         base_nodes, objects = node_parser.get_nodes_and_objects(nodes)

+         # Create the index and query engine
+         recursive_index = VectorStoreIndex(nodes=base_nodes + objects, llm=llm)
+         query_engine = recursive_index.as_query_engine(similarity_top_k=5, llm=llm)

+         # Execute the query
+         response = query_engine.query(query_text)

+         return str(response)

+     except Exception as e:
+         return f"An error occurred: {e}"
+     finally:
+         # Clean up the temporary file
+         if temp_file and os.path.exists(temp_file.name):
+             os.unlink(temp_file.name)

+ # --- Gradio Interface ---
+ with gr.Blocks(
+     title="Excel Sheet Q&A",
+     theme=gr.themes.Soft(),
+     css=".gradio-container {background: linear-gradient(to right, #C9D6FF, #E2E2E2)}"
+ ) as iface:
+     gr.Markdown(
+         """
+         # 📊 Excel Sheet Q&A
+
+         1. Enter your API keys.
+         2. Upload an Excel file.
+         3. Choose a specific sheet from the dropdown.
+         4. Ask a question about the data in that sheet.
+         """
+     )
+
      with gr.Row():
          with gr.Column(scale=1):
+             openai_key_input = gr.Textbox(label="OpenAI API Key", type="password")
+             llamaparse_key_input = gr.Textbox(label="LlamaParse API Key", type="password")
+             file_uploader = gr.File(label="Upload Excel Databook", file_types=[".xlsx", ".xls"])
+             sheet_selector = gr.Dropdown(label="Choose a Sheet", interactive=False)

          with gr.Column(scale=2):
+             query_input = gr.Textbox(label="Your Question", placeholder="e.g., What were the total revenues in 2022?", lines=8)
+             submit_button = gr.Button("Ask Question", variant="primary")
+             output_display = gr.Markdown(label="Answer")

+     # Define the interactive workflow
+     file_uploader.upload(
+         fn=get_sheet_names,
+         inputs=[file_uploader],
+         outputs=[sheet_selector]
      )

+     submit_button.click(
+         fn=process_and_query,
+         inputs=[openai_key_input, llamaparse_key_input, file_uploader, sheet_selector, query_input],
+         outputs=output_display
      )

  if __name__ == "__main__":
+     iface.launch(share=True)
+
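
For quick verification outside the UI, a minimal smoke test of the new process_and_query path could look like the sketch below. It assumes the committed file is importable as app.py, that the two API keys are exported as the OPENAI_API_KEY and LLAMA_CLOUD_API_KEY environment variables, and that a local workbook sample.xlsx with a sheet named Sheet1 exists; those names are placeholders, not part of this commit.

# smoke_test.py: illustrative sketch only; assumes the file above is saved as app.py.
import os
from types import SimpleNamespace

from app import process_and_query  # the function added in this commit

# Placeholder inputs; adjust to your own workbook, sheet, and keys.
uploaded = SimpleNamespace(name="sample.xlsx")  # mimics the .name attribute of Gradio's File object

answer = process_and_query(
    openai_api_key=os.environ["OPENAI_API_KEY"],           # assumed environment variable
    llama_parse_api_key=os.environ["LLAMA_CLOUD_API_KEY"],  # assumed environment variable
    uploaded_file=uploaded,
    selected_sheet="Sheet1",                                # hypothetical sheet name
    query_text="What were the total revenues in 2022?",
)
print(answer)

Running this drives the same LlamaParse → MarkdownElementNodeParser → VectorStoreIndex pipeline that the "Ask Question" button triggers, which makes parsing or indexing problems easier to debug without launching Gradio.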