Adityabhaskar committed on
Commit
c4c4e2a
·
verified ·
1 Parent(s): 17b5798

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +39 -61
app.py CHANGED
@@ -24,39 +24,44 @@ class HybridExcelQuerySystem:
24
  self.sheet_names = []
25
 
26
  def _pivot_numerical_data(self, df: pd.DataFrame) -> pd.DataFrame:
27
- """Pivots the special 'Numerical Data' sheet into a clean, usable format."""
28
- header_row_index = 0
29
- df = df.T
 
 
 
 
30
 
31
- # Get potential new headers and de-duplicate them
32
- new_headers = df.iloc[header_row_index].fillna('Unnamed')
33
- seen = {}
34
- final_headers = []
35
- for col in new_headers:
36
- if col in seen:
37
- seen[col] += 1
38
- final_headers.append(f"{col}_{seen[col]}")
39
- else:
40
- seen[col] = 0
41
- final_headers.append(col)
42
- df.columns = final_headers
43
 
44
- df = df.iloc[1:]
 
 
45
 
46
- df = df.reset_index().rename(columns={'index': 'Month'})
47
- df.columns = df.columns.str.strip().str.replace(r'[^a-zA-Z0-9_%]', '_', regex=True).str.replace('__', '_')
 
48
 
49
- # Identify and convert numeric columns
50
- for col in df.columns:
 
 
 
 
 
 
 
 
 
 
51
  if col != 'Month':
52
- df[col] = df[col].astype(str).str.replace('%', '', regex=False)
53
- df[col] = pd.to_numeric(df[col], errors='coerce')
 
54
 
55
- df = df.dropna(axis=1, how='all')
56
- df = df.rename(columns={'Period': 'Month'})
57
- months = ['Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec', 'Jan', 'Feb', 'Mar']
58
- df = df[df['Month'].isin(months)]
59
- return df
60
 
61
  def load_excel_file(self, file_path: str) -> str:
62
  self.logs.clear()
@@ -66,7 +71,7 @@ class HybridExcelQuerySystem:
66
  self.logs.append(f"✅ Found {len(self.sheet_names)} sheets: {', '.join(self.sheet_names)}")
67
 
68
  for sheet_name in self.sheet_names:
69
- df = pd.read_excel(file_path, sheet_name=sheet_name)
70
 
71
  if sheet_name == "Numerical Data":
72
  agent_df = self._pivot_numerical_data(df.copy())
@@ -87,7 +92,7 @@ class HybridExcelQuerySystem:
87
  raise Exception(f"Error loading Excel file: {e}")
88
 
89
  def _clean_dataframe_for_agent(self, df: pd.DataFrame) -> pd.DataFrame:
90
- df.columns = df.columns.str.strip().str.replace(r'[^a-zA-Z0-9_]', '_', regex=True)
91
  return df
92
 
93
  def _clean_dataframe_for_rag(self, df: pd.DataFrame) -> pd.DataFrame:
@@ -96,14 +101,7 @@ class HybridExcelQuerySystem:
96
  return df
97
 
98
  def _classify_query(self, query: str) -> str:
99
- prompt = f"""
100
- Classify the user's query about an Excel sheet as either "lookup" or "calculation".
101
- - "lookup": Use for questions asking for specific data, text, or summaries.
102
- - "calculation": Use for questions that require math, sorting, or filtering across the dataset.
103
-
104
- User Query: "{query}"
105
- Classification:
106
- """
107
  response = self.agent_llm.invoke(prompt)
108
  classification = response.content.strip().lower()
109
  return "calculation" if "calculation" in classification else "lookup"
@@ -128,15 +126,7 @@ class HybridExcelQuerySystem:
128
  def _execute_agent_query(self, query: str, sheet_name: str) -> Dict[str, Any]:
129
  try:
130
  df = self.dataframes[sheet_name]
131
- agent = create_pandas_dataframe_agent(
132
- self.agent_llm,
133
- df,
134
- agent_type=AgentType.ZERO_SHOT_REACT_DESCRIPTION,
135
- verbose=True,
136
- allow_dangerous_code=True,
137
- max_iterations=15,
138
- handle_parsing_errors=True
139
- )
140
  response = agent.invoke(query)
141
  return {"answer": response['output'], "tool_used": "Calculation (Pandas Agent)"}
142
  except Exception as e:
@@ -151,11 +141,7 @@ def process_excel(api_key: str, file_obj: gr.File):
151
  logs = system.load_excel_file(file_obj.name)
152
  sheet_names = system.sheet_names
153
 
154
- return (
155
- logs, system,
156
- gr.update(choices=sheet_names, value=sheet_names[0] if sheet_names else None, visible=True),
157
- gr.update(visible=True)
158
- )
159
 
160
  def user_interaction(question: str, history: List, system_state: HybridExcelQuerySystem, selected_sheet: str):
161
  if not system_state: raise gr.Error("Please upload and process a file first.")
@@ -180,16 +166,8 @@ with gr.Blocks(theme=gr.themes.Soft(), title="Hybrid Excel Analyzer") as demo:
180
  gr.Markdown("### 2. Ask a Question")
181
  with gr.Group(visible=False) as query_ui:
182
  sheet_selector = gr.Dropdown(label="Select a Sheet")
183
- chat_interface = gr.ChatInterface(
184
- fn=user_interaction,
185
- additional_inputs=[system_state, sheet_selector],
186
- title="Chat with your Excel Data"
187
- )
188
- process_button.click(
189
- fn=process_excel,
190
- inputs=[openai_api_key, excel_upload],
191
- outputs=[status_text, system_state, sheet_selector, query_ui]
192
- )
193
 
194
  if __name__ == "__main__":
195
  demo.launch()
 
24
  self.sheet_names = []
25
 
26
  def _pivot_numerical_data(self, df: pd.DataFrame) -> pd.DataFrame:
27
+ """
28
+ --- NEW, MORE ROBUST VERSION ---
29
+ Intelligently reconstructs the 'Numerical Data' sheet into a clean, tidy format.
30
+ """
31
+ # Find the row with month names (it's the first non-empty row)
32
+ header_row_index = df.dropna(how='all').index[0]
33
+ month_headers = df.iloc[header_row_index].dropna().tolist()
34
 
35
+ # Find the first column with financial metrics
36
+ metric_col_name = df.columns[0]
37
+ df_metrics = df[[metric_col_name]].dropna()
 
 
 
 
 
 
 
 
 
38
 
39
+ # Find the starting row and column for the actual data
40
+ start_row = df_metrics.index[0]
41
+ start_col = df.columns.get_loc(month_headers[0])
42
 
43
+ # Extract the core data, metrics, and headers
44
+ data = df.iloc[start_row:, start_col:start_col+len(month_headers)]
45
+ metrics = df.iloc[start_row:, 0]
46
 
47
+ # Create a new, clean DataFrame
48
+ clean_df = pd.DataFrame(data.values, index=metrics, columns=month_headers)
49
+
50
+ # Transpose so months are rows
51
+ clean_df = clean_df.T
52
+ clean_df = clean_df.reset_index().rename(columns={'index': 'Month'})
53
+
54
+ # Clean column names
55
+ clean_df.columns = clean_df.columns.str.strip().str.replace(r'[^a-zA-Z0-9_%]', '_', regex=True).str.replace('__', '_')
56
+
57
+ # Convert all columns except 'Month' to numeric
58
+ for col in clean_df.columns:
59
  if col != 'Month':
60
+ if clean_df[col].dtype == 'object':
61
+ clean_df[col] = clean_df[col].astype(str).str.replace('%', '', regex=False)
62
+ clean_df[col] = pd.to_numeric(clean_df[col], errors='coerce')
63
 
64
+ return clean_df
 
 
 
 
65
 
66
  def load_excel_file(self, file_path: str) -> str:
67
  self.logs.clear()
 
71
  self.logs.append(f"✅ Found {len(self.sheet_names)} sheets: {', '.join(self.sheet_names)}")
72
 
73
  for sheet_name in self.sheet_names:
74
+ df = pd.read_excel(file_path, sheet_name=sheet_name, header=None) # Read without a header
75
 
76
  if sheet_name == "Numerical Data":
77
  agent_df = self._pivot_numerical_data(df.copy())
 
92
  raise Exception(f"Error loading Excel file: {e}")
93
 
94
  def _clean_dataframe_for_agent(self, df: pd.DataFrame) -> pd.DataFrame:
95
+ df.columns = [f"Col_{i}" for i in range(len(df.columns))]
96
  return df
97
 
98
  def _clean_dataframe_for_rag(self, df: pd.DataFrame) -> pd.DataFrame:
 
101
  return df
102
 
103
  def _classify_query(self, query: str) -> str:
104
+ prompt = f"""Classify the user's query about an Excel sheet as either "lookup" or "calculation". "lookup": for questions asking for specific data or summaries. "calculation": for questions requiring math, sorting, or filtering. Query: "{query}" Classification:"""
 
 
 
 
 
 
 
105
  response = self.agent_llm.invoke(prompt)
106
  classification = response.content.strip().lower()
107
  return "calculation" if "calculation" in classification else "lookup"
 
126
  def _execute_agent_query(self, query: str, sheet_name: str) -> Dict[str, Any]:
127
  try:
128
  df = self.dataframes[sheet_name]
129
+ agent = create_pandas_dataframe_agent(self.agent_llm, df, agent_type=AgentType.ZERO_SHOT_REACT_DESCRIPTION, verbose=True, allow_dangerous_code=True, max_iterations=15, handle_parsing_errors=True)
 
 
 
 
 
 
 
 
130
  response = agent.invoke(query)
131
  return {"answer": response['output'], "tool_used": "Calculation (Pandas Agent)"}
132
  except Exception as e:
 
141
  logs = system.load_excel_file(file_obj.name)
142
  sheet_names = system.sheet_names
143
 
144
+ return (logs, system, gr.update(choices=sheet_names, value=sheet_names[0] if sheet_names else None, visible=True), gr.update(visible=True))
 
 
 
 
145
 
146
  def user_interaction(question: str, history: List, system_state: HybridExcelQuerySystem, selected_sheet: str):
147
  if not system_state: raise gr.Error("Please upload and process a file first.")
 
166
  gr.Markdown("### 2. Ask a Question")
167
  with gr.Group(visible=False) as query_ui:
168
  sheet_selector = gr.Dropdown(label="Select a Sheet")
169
+ chat_interface = gr.ChatInterface(fn=user_interaction, additional_inputs=[system_state, sheet_selector], title="Chat with your Excel Data")
170
+ process_button.click(fn=process_excel, inputs=[openai_api_key, excel_upload], outputs=[status_text, system_state, sheet_selector, query_ui])
 
 
 
 
 
 
 
 
171
 
172
  if __name__ == "__main__":
173
  demo.launch()