Spaces:

Adityabhaskar
/

stealth

Paused

App Files Files Community

Adityabhaskar committed on Sep 3

Commit

c4c4e2a

verified ·

1 Parent(s): 17b5798

Update app.py

Browse files

Files changed (1) hide show

app.py +39 -61

app.py CHANGED Viewed

@@ -24,39 +24,44 @@ class HybridExcelQuerySystem:
         self.sheet_names = []
     def _pivot_numerical_data(self, df: pd.DataFrame) -> pd.DataFrame:
-        """Pivots the special 'Numerical Data' sheet into a clean, usable format."""
-        header_row_index = 0
-        df = df.T
-        # Get potential new headers and de-duplicate them
-        new_headers = df.iloc[header_row_index].fillna('Unnamed')
-        seen = {}
-        final_headers = []
-        for col in new_headers:
-            if col in seen:
-                seen[col] += 1
-                final_headers.append(f"{col}_{seen[col]}")
-            else:
-                seen[col] = 0
-                final_headers.append(col)
-        df.columns = final_headers
-        df = df.iloc[1:]
-        df = df.reset_index().rename(columns={'index': 'Month'})
-        df.columns = df.columns.str.strip().str.replace(r'[^a-zA-Z0-9_%]', '_', regex=True).str.replace('__', '_')
-        # Identify and convert numeric columns
-        for col in df.columns:
             if col != 'Month':
-                df[col] = df[col].astype(str).str.replace('%', '', regex=False)
-                df[col] = pd.to_numeric(df[col], errors='coerce')
-        df = df.dropna(axis=1, how='all')
-        df = df.rename(columns={'Period': 'Month'})
-        months = ['Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec', 'Jan', 'Feb', 'Mar']
-        df = df[df['Month'].isin(months)]
-        return df
     def load_excel_file(self, file_path: str) -> str:
         self.logs.clear()
@@ -66,7 +71,7 @@ class HybridExcelQuerySystem:
             self.logs.append(f"✅ Found {len(self.sheet_names)} sheets: {', '.join(self.sheet_names)}")
             for sheet_name in self.sheet_names:
-                df = pd.read_excel(file_path, sheet_name=sheet_name)
                 if sheet_name == "Numerical Data":
                     agent_df = self._pivot_numerical_data(df.copy())
@@ -87,7 +92,7 @@ class HybridExcelQuerySystem:
             raise Exception(f"Error loading Excel file: {e}")
     def _clean_dataframe_for_agent(self, df: pd.DataFrame) -> pd.DataFrame:
-        df.columns = df.columns.str.strip().str.replace(r'[^a-zA-Z0-9_]', '_', regex=True)
         return df
     def _clean_dataframe_for_rag(self, df: pd.DataFrame) -> pd.DataFrame:
@@ -96,14 +101,7 @@ class HybridExcelQuerySystem:
         return df
     def _classify_query(self, query: str) -> str:
-        prompt = f"""
-        Classify the user's query about an Excel sheet as either "lookup" or "calculation".
-        - "lookup": Use for questions asking for specific data, text, or summaries.
-        - "calculation": Use for questions that require math, sorting, or filtering across the dataset.
-        User Query: "{query}"
-        Classification:
-        """
         response = self.agent_llm.invoke(prompt)
         classification = response.content.strip().lower()
         return "calculation" if "calculation" in classification else "lookup"
@@ -128,15 +126,7 @@ class HybridExcelQuerySystem:
     def _execute_agent_query(self, query: str, sheet_name: str) -> Dict[str, Any]:
         try:
             df = self.dataframes[sheet_name]
-            agent = create_pandas_dataframe_agent(
-                self.agent_llm,
-                df,
-                agent_type=AgentType.ZERO_SHOT_REACT_DESCRIPTION,
-                verbose=True,
-                allow_dangerous_code=True,
-                max_iterations=15,
-                handle_parsing_errors=True
-            )
             response = agent.invoke(query)
             return {"answer": response['output'], "tool_used": "Calculation (Pandas Agent)"}
         except Exception as e:
@@ -151,11 +141,7 @@ def process_excel(api_key: str, file_obj: gr.File):
     logs = system.load_excel_file(file_obj.name)
     sheet_names = system.sheet_names
-    return (
-        logs, system,
-        gr.update(choices=sheet_names, value=sheet_names[0] if sheet_names else None, visible=True),
-        gr.update(visible=True)
-    )
 def user_interaction(question: str, history: List, system_state: HybridExcelQuerySystem, selected_sheet: str):
     if not system_state: raise gr.Error("Please upload and process a file first.")
@@ -180,16 +166,8 @@ with gr.Blocks(theme=gr.themes.Soft(), title="Hybrid Excel Analyzer") as demo:
             gr.Markdown("### 2. Ask a Question")
             with gr.Group(visible=False) as query_ui:
                 sheet_selector = gr.Dropdown(label="Select a Sheet")
-                chat_interface = gr.ChatInterface(
-                    fn=user_interaction,
-                    additional_inputs=[system_state, sheet_selector],
-                    title="Chat with your Excel Data"
-                )
-    process_button.click(
-        fn=process_excel,
-        inputs=[openai_api_key, excel_upload],
-        outputs=[status_text, system_state, sheet_selector, query_ui]
-    )
 if __name__ == "__main__":
     demo.launch()

         self.sheet_names = []
     def _pivot_numerical_data(self, df: pd.DataFrame) -> pd.DataFrame:
+        """
+        --- NEW, MORE ROBUST VERSION ---
+        Intelligently reconstructs the 'Numerical Data' sheet into a clean, tidy format.
+        """
+        # Find the row with month names (it's the first non-empty row)
+        header_row_index = df.dropna(how='all').index[0]
+        month_headers = df.iloc[header_row_index].dropna().tolist()
+        # Find the first column with financial metrics
+        metric_col_name = df.columns[0]
+        df_metrics = df[[metric_col_name]].dropna()
+        # Find the starting row and column for the actual data
+        start_row = df_metrics.index[0]
+        start_col = df.columns.get_loc(month_headers[0])
+        # Extract the core data, metrics, and headers
+        data = df.iloc[start_row:, start_col:start_col+len(month_headers)]
+        metrics = df.iloc[start_row:, 0]
+        # Create a new, clean DataFrame
+        clean_df = pd.DataFrame(data.values, index=metrics, columns=month_headers)
+        # Transpose so months are rows
+        clean_df = clean_df.T
+        clean_df = clean_df.reset_index().rename(columns={'index': 'Month'})
+        # Clean column names
+        clean_df.columns = clean_df.columns.str.strip().str.replace(r'[^a-zA-Z0-9_%]', '_', regex=True).str.replace('__', '_')
+        # Convert all columns except 'Month' to numeric
+        for col in clean_df.columns:
             if col != 'Month':
+                if clean_df[col].dtype == 'object':
+                    clean_df[col] = clean_df[col].astype(str).str.replace('%', '', regex=False)
+                clean_df[col] = pd.to_numeric(clean_df[col], errors='coerce')
+        return clean_df
     def load_excel_file(self, file_path: str) -> str:
         self.logs.clear()
             self.logs.append(f"✅ Found {len(self.sheet_names)} sheets: {', '.join(self.sheet_names)}")
             for sheet_name in self.sheet_names:
+                df = pd.read_excel(file_path, sheet_name=sheet_name, header=None) # Read without a header
                 if sheet_name == "Numerical Data":
                     agent_df = self._pivot_numerical_data(df.copy())
             raise Exception(f"Error loading Excel file: {e}")
     def _clean_dataframe_for_agent(self, df: pd.DataFrame) -> pd.DataFrame:
+        df.columns = [f"Col_{i}" for i in range(len(df.columns))]
         return df
     def _clean_dataframe_for_rag(self, df: pd.DataFrame) -> pd.DataFrame:
         return df
     def _classify_query(self, query: str) -> str:
+        prompt = f"""Classify the user's query about an Excel sheet as either "lookup" or "calculation". "lookup": for questions asking for specific data or summaries. "calculation": for questions requiring math, sorting, or filtering. Query: "{query}" Classification:"""
         response = self.agent_llm.invoke(prompt)
         classification = response.content.strip().lower()
         return "calculation" if "calculation" in classification else "lookup"
     def _execute_agent_query(self, query: str, sheet_name: str) -> Dict[str, Any]:
         try:
             df = self.dataframes[sheet_name]
+            agent = create_pandas_dataframe_agent(self.agent_llm, df, agent_type=AgentType.ZERO_SHOT_REACT_DESCRIPTION, verbose=True, allow_dangerous_code=True, max_iterations=15, handle_parsing_errors=True)
             response = agent.invoke(query)
             return {"answer": response['output'], "tool_used": "Calculation (Pandas Agent)"}
         except Exception as e:
     logs = system.load_excel_file(file_obj.name)
     sheet_names = system.sheet_names
+    return (logs, system, gr.update(choices=sheet_names, value=sheet_names[0] if sheet_names else None, visible=True), gr.update(visible=True))
 def user_interaction(question: str, history: List, system_state: HybridExcelQuerySystem, selected_sheet: str):
     if not system_state: raise gr.Error("Please upload and process a file first.")
             gr.Markdown("### 2. Ask a Question")
             with gr.Group(visible=False) as query_ui:
                 sheet_selector = gr.Dropdown(label="Select a Sheet")
+                chat_interface = gr.ChatInterface(fn=user_interaction, additional_inputs=[system_state, sheet_selector], title="Chat with your Excel Data")
+    process_button.click(fn=process_excel, inputs=[openai_api_key, excel_upload], outputs=[status_text, system_state, sheet_selector, query_ui])
 if __name__ == "__main__":
     demo.launch()