Spaces:
Paused
Paused
Update app.py
Browse files
app.py
CHANGED
@@ -24,39 +24,44 @@ class HybridExcelQuerySystem:
|
|
24 |
self.sheet_names = []
|
25 |
|
26 |
def _pivot_numerical_data(self, df: pd.DataFrame) -> pd.DataFrame:
|
27 |
-
"""
|
28 |
-
|
29 |
-
|
|
|
|
|
|
|
|
|
30 |
|
31 |
-
#
|
32 |
-
|
33 |
-
|
34 |
-
final_headers = []
|
35 |
-
for col in new_headers:
|
36 |
-
if col in seen:
|
37 |
-
seen[col] += 1
|
38 |
-
final_headers.append(f"{col}_{seen[col]}")
|
39 |
-
else:
|
40 |
-
seen[col] = 0
|
41 |
-
final_headers.append(col)
|
42 |
-
df.columns = final_headers
|
43 |
|
44 |
-
|
|
|
|
|
45 |
|
46 |
-
|
47 |
-
|
|
|
48 |
|
49 |
-
#
|
50 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
51 |
if col != 'Month':
|
52 |
-
|
53 |
-
|
|
|
54 |
|
55 |
-
|
56 |
-
df = df.rename(columns={'Period': 'Month'})
|
57 |
-
months = ['Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec', 'Jan', 'Feb', 'Mar']
|
58 |
-
df = df[df['Month'].isin(months)]
|
59 |
-
return df
|
60 |
|
61 |
def load_excel_file(self, file_path: str) -> str:
|
62 |
self.logs.clear()
|
@@ -66,7 +71,7 @@ class HybridExcelQuerySystem:
|
|
66 |
self.logs.append(f"✅ Found {len(self.sheet_names)} sheets: {', '.join(self.sheet_names)}")
|
67 |
|
68 |
for sheet_name in self.sheet_names:
|
69 |
-
df = pd.read_excel(file_path, sheet_name=sheet_name)
|
70 |
|
71 |
if sheet_name == "Numerical Data":
|
72 |
agent_df = self._pivot_numerical_data(df.copy())
|
@@ -87,7 +92,7 @@ class HybridExcelQuerySystem:
|
|
87 |
raise Exception(f"Error loading Excel file: {e}")
|
88 |
|
89 |
def _clean_dataframe_for_agent(self, df: pd.DataFrame) -> pd.DataFrame:
|
90 |
-
df.columns = df.columns
|
91 |
return df
|
92 |
|
93 |
def _clean_dataframe_for_rag(self, df: pd.DataFrame) -> pd.DataFrame:
|
@@ -96,14 +101,7 @@ class HybridExcelQuerySystem:
|
|
96 |
return df
|
97 |
|
98 |
def _classify_query(self, query: str) -> str:
|
99 |
-
prompt = f"""
|
100 |
-
Classify the user's query about an Excel sheet as either "lookup" or "calculation".
|
101 |
-
- "lookup": Use for questions asking for specific data, text, or summaries.
|
102 |
-
- "calculation": Use for questions that require math, sorting, or filtering across the dataset.
|
103 |
-
|
104 |
-
User Query: "{query}"
|
105 |
-
Classification:
|
106 |
-
"""
|
107 |
response = self.agent_llm.invoke(prompt)
|
108 |
classification = response.content.strip().lower()
|
109 |
return "calculation" if "calculation" in classification else "lookup"
|
@@ -128,15 +126,7 @@ class HybridExcelQuerySystem:
|
|
128 |
def _execute_agent_query(self, query: str, sheet_name: str) -> Dict[str, Any]:
|
129 |
try:
|
130 |
df = self.dataframes[sheet_name]
|
131 |
-
agent = create_pandas_dataframe_agent(
|
132 |
-
self.agent_llm,
|
133 |
-
df,
|
134 |
-
agent_type=AgentType.ZERO_SHOT_REACT_DESCRIPTION,
|
135 |
-
verbose=True,
|
136 |
-
allow_dangerous_code=True,
|
137 |
-
max_iterations=15,
|
138 |
-
handle_parsing_errors=True
|
139 |
-
)
|
140 |
response = agent.invoke(query)
|
141 |
return {"answer": response['output'], "tool_used": "Calculation (Pandas Agent)"}
|
142 |
except Exception as e:
|
@@ -151,11 +141,7 @@ def process_excel(api_key: str, file_obj: gr.File):
|
|
151 |
logs = system.load_excel_file(file_obj.name)
|
152 |
sheet_names = system.sheet_names
|
153 |
|
154 |
-
return (
|
155 |
-
logs, system,
|
156 |
-
gr.update(choices=sheet_names, value=sheet_names[0] if sheet_names else None, visible=True),
|
157 |
-
gr.update(visible=True)
|
158 |
-
)
|
159 |
|
160 |
def user_interaction(question: str, history: List, system_state: HybridExcelQuerySystem, selected_sheet: str):
|
161 |
if not system_state: raise gr.Error("Please upload and process a file first.")
|
@@ -180,16 +166,8 @@ with gr.Blocks(theme=gr.themes.Soft(), title="Hybrid Excel Analyzer") as demo:
|
|
180 |
gr.Markdown("### 2. Ask a Question")
|
181 |
with gr.Group(visible=False) as query_ui:
|
182 |
sheet_selector = gr.Dropdown(label="Select a Sheet")
|
183 |
-
chat_interface = gr.ChatInterface(
|
184 |
-
|
185 |
-
additional_inputs=[system_state, sheet_selector],
|
186 |
-
title="Chat with your Excel Data"
|
187 |
-
)
|
188 |
-
process_button.click(
|
189 |
-
fn=process_excel,
|
190 |
-
inputs=[openai_api_key, excel_upload],
|
191 |
-
outputs=[status_text, system_state, sheet_selector, query_ui]
|
192 |
-
)
|
193 |
|
194 |
if __name__ == "__main__":
|
195 |
demo.launch()
|
|
|
24 |
self.sheet_names = []
|
25 |
|
26 |
def _pivot_numerical_data(self, df: pd.DataFrame) -> pd.DataFrame:
|
27 |
+
"""
|
28 |
+
--- NEW, MORE ROBUST VERSION ---
|
29 |
+
Intelligently reconstructs the 'Numerical Data' sheet into a clean, tidy format.
|
30 |
+
"""
|
31 |
+
# Find the row with month names (it's the first non-empty row)
|
32 |
+
header_row_index = df.dropna(how='all').index[0]
|
33 |
+
month_headers = df.iloc[header_row_index].dropna().tolist()
|
34 |
|
35 |
+
# Find the first column with financial metrics
|
36 |
+
metric_col_name = df.columns[0]
|
37 |
+
df_metrics = df[[metric_col_name]].dropna()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
38 |
|
39 |
+
# Find the starting row and column for the actual data
|
40 |
+
start_row = df_metrics.index[0]
|
41 |
+
start_col = df.columns.get_loc(month_headers[0])
|
42 |
|
43 |
+
# Extract the core data, metrics, and headers
|
44 |
+
data = df.iloc[start_row:, start_col:start_col+len(month_headers)]
|
45 |
+
metrics = df.iloc[start_row:, 0]
|
46 |
|
47 |
+
# Create a new, clean DataFrame
|
48 |
+
clean_df = pd.DataFrame(data.values, index=metrics, columns=month_headers)
|
49 |
+
|
50 |
+
# Transpose so months are rows
|
51 |
+
clean_df = clean_df.T
|
52 |
+
clean_df = clean_df.reset_index().rename(columns={'index': 'Month'})
|
53 |
+
|
54 |
+
# Clean column names
|
55 |
+
clean_df.columns = clean_df.columns.str.strip().str.replace(r'[^a-zA-Z0-9_%]', '_', regex=True).str.replace('__', '_')
|
56 |
+
|
57 |
+
# Convert all columns except 'Month' to numeric
|
58 |
+
for col in clean_df.columns:
|
59 |
if col != 'Month':
|
60 |
+
if clean_df[col].dtype == 'object':
|
61 |
+
clean_df[col] = clean_df[col].astype(str).str.replace('%', '', regex=False)
|
62 |
+
clean_df[col] = pd.to_numeric(clean_df[col], errors='coerce')
|
63 |
|
64 |
+
return clean_df
|
|
|
|
|
|
|
|
|
65 |
|
66 |
def load_excel_file(self, file_path: str) -> str:
|
67 |
self.logs.clear()
|
|
|
71 |
self.logs.append(f"✅ Found {len(self.sheet_names)} sheets: {', '.join(self.sheet_names)}")
|
72 |
|
73 |
for sheet_name in self.sheet_names:
|
74 |
+
df = pd.read_excel(file_path, sheet_name=sheet_name, header=None) # Read without a header
|
75 |
|
76 |
if sheet_name == "Numerical Data":
|
77 |
agent_df = self._pivot_numerical_data(df.copy())
|
|
|
92 |
raise Exception(f"Error loading Excel file: {e}")
|
93 |
|
94 |
def _clean_dataframe_for_agent(self, df: pd.DataFrame) -> pd.DataFrame:
|
95 |
+
df.columns = [f"Col_{i}" for i in range(len(df.columns))]
|
96 |
return df
|
97 |
|
98 |
def _clean_dataframe_for_rag(self, df: pd.DataFrame) -> pd.DataFrame:
|
|
|
101 |
return df
|
102 |
|
103 |
def _classify_query(self, query: str) -> str:
|
104 |
+
prompt = f"""Classify the user's query about an Excel sheet as either "lookup" or "calculation". "lookup": for questions asking for specific data or summaries. "calculation": for questions requiring math, sorting, or filtering. Query: "{query}" Classification:"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
105 |
response = self.agent_llm.invoke(prompt)
|
106 |
classification = response.content.strip().lower()
|
107 |
return "calculation" if "calculation" in classification else "lookup"
|
|
|
126 |
def _execute_agent_query(self, query: str, sheet_name: str) -> Dict[str, Any]:
|
127 |
try:
|
128 |
df = self.dataframes[sheet_name]
|
129 |
+
agent = create_pandas_dataframe_agent(self.agent_llm, df, agent_type=AgentType.ZERO_SHOT_REACT_DESCRIPTION, verbose=True, allow_dangerous_code=True, max_iterations=15, handle_parsing_errors=True)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
130 |
response = agent.invoke(query)
|
131 |
return {"answer": response['output'], "tool_used": "Calculation (Pandas Agent)"}
|
132 |
except Exception as e:
|
|
|
141 |
logs = system.load_excel_file(file_obj.name)
|
142 |
sheet_names = system.sheet_names
|
143 |
|
144 |
+
return (logs, system, gr.update(choices=sheet_names, value=sheet_names[0] if sheet_names else None, visible=True), gr.update(visible=True))
|
|
|
|
|
|
|
|
|
145 |
|
146 |
def user_interaction(question: str, history: List, system_state: HybridExcelQuerySystem, selected_sheet: str):
|
147 |
if not system_state: raise gr.Error("Please upload and process a file first.")
|
|
|
166 |
gr.Markdown("### 2. Ask a Question")
|
167 |
with gr.Group(visible=False) as query_ui:
|
168 |
sheet_selector = gr.Dropdown(label="Select a Sheet")
|
169 |
+
chat_interface = gr.ChatInterface(fn=user_interaction, additional_inputs=[system_state, sheet_selector], title="Chat with your Excel Data")
|
170 |
+
process_button.click(fn=process_excel, inputs=[openai_api_key, excel_upload], outputs=[status_text, system_state, sheet_selector, query_ui])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
171 |
|
172 |
if __name__ == "__main__":
|
173 |
demo.launch()
|