Adityabhaskar committed on
Commit 0c623ad · verified · 1 Parent(s): ec25353

Update app.py

Files changed (1):
  1. app.py +137 -138
app.py CHANGED
@@ -1,178 +1,177 @@
-import pandas as pd
-from langchain_openai import OpenAIEmbeddings, ChatOpenAI
-from langchain_core.documents import Document
-from langchain_community.vectorstores import FAISS
-from langchain_text_splitters import RecursiveCharacterTextSplitter
-from langchain.chains.qa_with_sources import load_qa_with_sources_chain
 import os
-from typing import Dict, Any
-import warnings
 import gradio as gr
-from dotenv import load_dotenv
+import pandas as pd
+from typing import List, Dict, Any
 
-warnings.filterwarnings('ignore')
-load_dotenv()
+# --- LlamaIndex & LangChain Imports ---
+from llama_index.core import VectorStoreIndex, Document, Settings
+from llama_index.llms.openai import OpenAI as LlamaOpenAI
+from llama_index.embeddings.openai import OpenAIEmbedding
+from langchain_experimental.agents.agent_toolkits import create_pandas_dataframe_agent
+from langchain_openai import ChatOpenAI
+from langchain.agents.agent_types import AgentType
+from langchain.chains.qa_with_sources import load_qa_with_sources_chain
 
-class ExcelAIQuerySystem:
+class HybridExcelQuerySystem:
     """
-    A system to query Excel files using a reliable "Chunk and Search" (RAG) method.
-    This method is good for lookups but not for mathematical aggregations.
+    Implements a hybrid system that uses a RAG tool for lookups and a Pandas Agent for calculations.
     """
     def __init__(self, openai_api_key: str):
         os.environ["OPENAI_API_KEY"] = openai_api_key
-        self.llm = ChatOpenAI(temperature=0, model_name="gpt-3.5-turbo")
-        self.embeddings = OpenAIEmbeddings()
-        self.sheet_data_stores: Dict[str, FAISS] = {} # Store a vector store for each sheet
+        # For LlamaIndex (RAG)
+        Settings.llm = LlamaOpenAI(model="gpt-4o")
+        Settings.embed_model = OpenAIEmbedding(model="text-embedding-3-small")
+        # For LangChain Agent (Calculations)
+        self.agent_llm = ChatOpenAI(temperature=0, model="gpt-4o")
+
+        self.dataframes: Dict[str, pd.DataFrame] = {}
+        self.vector_stores: Dict[str, VectorStoreIndex] = {}
        self.logs = []
        self.sheet_names = []
 
     def load_excel_file(self, file_path: str) -> str:
+        """Loads data from an Excel file and prepares it for both RAG and Agent tools."""
         self.logs.clear()
         try:
-            excel_file = pd.ExcelFile(file_path)
-            self.sheet_names = excel_file.sheet_names
+            xls = pd.ExcelFile(file_path)
+            self.sheet_names = xls.sheet_names
             self.logs.append(f"✅ Found {len(self.sheet_names)} sheets: {', '.join(self.sheet_names)}")
 
-            text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
-
             for sheet_name in self.sheet_names:
-                try:
-                    df = pd.read_excel(file_path, sheet_name=sheet_name)
-                    df = self._clean_dataframe(df)
-
-                    # Convert dataframe to a single text document
-                    # Using markdown format for better structure
-                    markdown_text = df.to_markdown(index=False)
-
-                    # Create documents and split them into chunks
-                    doc = Document(page_content=markdown_text, metadata={"source": sheet_name})
-                    chunks = text_splitter.split_documents([doc])
-
-                    # Create a FAISS vector store for the chunks
-                    self.sheet_data_stores[sheet_name] = FAISS.from_documents(chunks, self.embeddings)
-                    self.logs.append(f" - Indexed sheet '{sheet_name}' ({df.shape[0]} rows × {df.shape[1]} columns)")
-                except Exception as e:
-                    self.logs.append(f"⚠️ Error processing sheet '{sheet_name}': {str(e)}")
-                    continue
+                df = pd.read_excel(file_path, sheet_name=sheet_name)
+
+                # --- Prepare for Agent ---
+                self.dataframes[sheet_name] = self._clean_dataframe_for_agent(df.copy())
+
+                # --- Prepare for RAG ---
+                rag_df = self._clean_dataframe_for_rag(df.copy())
+                markdown_text = rag_df.to_markdown(index=False)
+                doc = Document(text=markdown_text, metadata={"source": sheet_name})
+                self.vector_stores[sheet_name] = VectorStoreIndex.from_documents([doc])
+
+                self.logs.append(f" - Prepared sheet '{sheet_name}' for both Lookup and Calculation.")
 
-            self.logs.append("✅ All sheets processed and indexed.")
+            self.logs.append("✅ All sheets are ready.")
             return "\n".join(self.logs)
         except Exception as e:
-            raise Exception(f"Error loading Excel file: {str(e)}")
+            raise Exception(f"Error loading Excel file: {e}")
 
-    def _clean_dataframe(self, df: pd.DataFrame) -> pd.DataFrame:
-        df = df.dropna(how='all').dropna(axis=1, how='all').reset_index(drop=True)
+    def _clean_dataframe_for_agent(self, df: pd.DataFrame) -> pd.DataFrame:
         df.columns = df.columns.str.strip().str.replace(r'[^a-zA-Z0-9_]', '_', regex=True)
-        # Convert all data to string to ensure consistency for text processing
+        return df
+
+    def _clean_dataframe_for_rag(self, df: pd.DataFrame) -> pd.DataFrame:
         for col in df.columns:
             df[col] = df[col].astype(str)
         return df
 
-    def query_data(self, query: str, target_sheet: str) -> Dict[str, Any]:
-        """
-        --- NEW LOGIC ---
-        Searches for relevant data chunks and uses an LLM to answer based on them.
+    def _classify_query(self, query: str) -> str:
+        """Uses an LLM to classify the query as 'lookup' or 'calculation'."""
+        prompt = f"""
+        Classify the user's query about an Excel sheet as either "lookup" or "calculation".
+        - "lookup": Use for questions asking for specific data, text, or summaries that can likely be found in a few rows. Examples: 'What are the details for order X?', 'Summarize the notes for July'.
+        - "calculation": Use for questions that require mathematical operations (sum, average, count), sorting, filtering, or finding trends across the entire dataset. Examples: 'What is the total revenue?', 'Find the top 3 months by profit', 'How many entries are there?'.
+
+        User Query: "{query}"
+        Classification:
         """
-        results = {'query': query, 'summary': ''}
-
-        if not target_sheet or target_sheet not in self.sheet_data_stores:
-            results['summary'] = "Error: Please select a valid sheet to query."
-            return results
+        response = self.agent_llm.invoke(prompt)
+        classification = response.content.strip().lower()
+        # Default to lookup for safety if classification is unclear
+        return "calculation" if "calculation" in classification else "lookup"
+
+    def query(self, query: str, selected_sheet: str) -> Dict[str, Any]:
+        """The main query function that routes to the appropriate tool."""
+        if not selected_sheet:
+            return {"answer": "Error: Please select a sheet first.", "tool_used": "None"}
+
+        classification = self._classify_query(query)
+
+        if classification == "calculation":
+            return self._execute_agent_query(query, selected_sheet)
+        else: # Default to RAG for lookups
+            return self._execute_rag_query(query, selected_sheet)
 
+    def _execute_rag_query(self, query: str, sheet_name: str) -> Dict[str, Any]:
+        """Handles lookup queries using the RAG tool."""
         try:
-            vector_store = self.sheet_data_stores[target_sheet]
-
-            # Find the most relevant data chunks for the query
-            relevant_docs = vector_store.similarity_search(query, k=5)
-
-            # Create a Question-Answering chain
-            qa_chain = load_qa_with_sources_chain(self.llm, chain_type="stuff")
-
-            # Run the chain with the relevant docs
-            response = qa_chain.invoke(
-                {"input_documents": relevant_docs, "question": query},
-                return_only_outputs=True
+            query_engine = self.vector_stores[sheet_name].as_query_engine()
+            response = query_engine.query(query)
+            return {"answer": str(response), "tool_used": "Lookup (RAG Search)"}
+        except Exception as e:
+            return {"answer": f"Error during lookup: {e}", "tool_used": "Lookup (RAG Search)"}
+
+    def _execute_agent_query(self, query: str, sheet_name: str) -> Dict[str, Any]:
+        """Handles calculation queries using the Pandas Agent."""
+        try:
+            df = self.dataframes[sheet_name]
+            agent = create_pandas_dataframe_agent(
+                self.agent_llm,
+                df,
+                agent_type=AgentType.ZERO_SHOT_REACT_DESCRIPTION,
+                verbose=True,
+                allow_dangerous_code=True,
+                max_iterations=15
             )
-
-            results['summary'] = response.get('output_text', "Could not find an answer in the data.")
-            return results
+            response = agent.invoke(query)
+            return {"answer": response['output'], "tool_used": "Calculation (Pandas Agent)"}
         except Exception as e:
-            results['summary'] = f"An error occurred while querying the data: {str(e)}"
-            return results
-
-# --- Gradio Interface ---
-# Simplified to work with the new RAG logic
-
-def process_file(api_key, file_obj):
-    if not api_key: raise gr.Error("OpenAI API Key is required.")
-    if file_obj is None: raise gr.Error("Please upload an Excel file.")
-    try:
-        excel_system = ExcelAIQuerySystem(api_key)
-        loading_logs = excel_system.load_excel_file(file_obj.name)
-
-        # Now a sheet must be selected, so we don't include "Auto-Select"
-        sheet_names = excel_system.sheet_names
-
-        return (
-            loading_logs,
-            excel_system,
-            gr.update(choices=sheet_names, value=sheet_names[0] if sheet_names else None, visible=True),
-            gr.update(visible=True),
-            gr.update(visible=True),
-            gr.update(visible=True)
-        )
-    except Exception as e:
-        raise gr.Error(f"Failed to process file: {e}")
-
-def generate_response(query, selected_sheet, system_state):
-    if not query: raise gr.Error("Please enter a query.")
-    if system_state is None: raise gr.Error("File not loaded. Please upload and load a file first.")
-    if not selected_sheet: raise gr.Error("Please select a sheet to query.")
+            return {"answer": f"Error during calculation: {e}", "tool_used": "Calculation (Pandas Agent)"}
+
+# --- Gradio UI ---
+def process_excel(api_key: str, file_obj: gr.File):
+    if not api_key: raise gr.Error("Please provide your OpenAI API key.")
+    if not file_obj: raise gr.Error("Please upload an Excel file.")
+
+    system = HybridExcelQuerySystem(openai_api_key=api_key)
+    logs = system.load_excel_file(file_obj.name)
+    sheet_names = system.sheet_names
+
+    return (
+        logs, system,
+        gr.update(choices=sheet_names, value=sheet_names[0] if sheet_names else None, visible=True),
+        gr.update(visible=True)
+    )
+
+def user_interaction(question: str, history: List, system_state: HybridExcelQuerySystem, selected_sheet: str):
+    if not system_state: raise gr.Error("Please upload and process a file first.")
+
+    result = system_state.query(question, selected_sheet)
+    answer = result.get("answer", "No response.")
+    tool_used = result.get("tool_used", "Unknown")
 
-    try:
-        result = system_state.query_data(query, target_sheet=selected_sheet)
-        summary = result.get('summary', 'No summary available.')
-        details = f"**🔍 Searched in Sheet:**\n{selected_sheet}"
-        return summary, details
-    except Exception as e:
-        raise gr.Error(f"Error during query: {e}")
-
-with gr.Blocks(theme=gr.themes.Soft(), title="Excel AI Query System") as demo:
-    system_state = gr.State(None)
-    gr.Markdown("# 📊 Excel AI Query System (Chunk & Search Edition)")
-    gr.Markdown("This version finds specific information in your Excel file. It is not designed for math or whole-dataset calculations.")
+    # Append the tool used to the answer for transparency
+    full_response = f"{answer}\n\n*Tool Used: {tool_used}*"
+    return full_response
+
+with gr.Blocks(theme=gr.themes.Soft(), title="Hybrid Excel Analyzer") as demo:
+    system_state = gr.State()
+
+    gr.Markdown("# 🤖 Hybrid Excel Analyzer")
+    gr.Markdown("This app automatically chooses the best AI tool—a search tool for lookups or a code-writing agent for calculations—to answer your questions about an Excel file.")
+
     with gr.Row():
         with gr.Column(scale=1):
             gr.Markdown("### 1. Setup")
-            api_key_input = gr.Textbox(label="OpenAI API Key", type="password", placeholder="Enter your OpenAI API key...", value=os.getenv("OPENAI_API_KEY", ""))
-            file_input = gr.File(label="Upload Excel File", file_types=[".xlsx", ".xls"])
-            load_button = gr.Button("Load File", variant="primary")
-            status_output = gr.Textbox(label="Indexing Status", interactive=False, lines=10)
+            openai_api_key = gr.Textbox(label="OpenAI API Key", type="password", value=os.getenv("OPENAI_API_KEY", ""))
+            excel_upload = gr.File(label="Upload Excel File", file_types=[".xlsx", ".xls"])
+            process_button = gr.Button("Process File", variant="primary")
+            status_text = gr.Textbox(label="Processing Status", interactive=False, lines=8)
+
         with gr.Column(scale=2):
             gr.Markdown("### 2. Ask a Question")
-            sheet_selector = gr.Dropdown(
-                label="Select a sheet to query",
-                info="You must select a sheet.",
-                visible=False,
-                interactive=True
-            )
-            query_input = gr.Textbox(label="Your Question", placeholder="e.g., 'What are the details for order #12345?'", visible=False)
-            ask_button = gr.Button("Get Answer", variant="primary", visible=False)
-            with gr.Accordion("Results", open=False, visible=False) as results_accordion:
-                summary_output = gr.Markdown(label="Answer")
-                details_output = gr.Markdown(label="Details")
-    load_button.click(
-        fn=process_file,
-        inputs=[api_key_input, file_input],
-        outputs=[status_output, system_state, sheet_selector, query_input, ask_button, results_accordion]
-    )
-    ask_button.click(
-        fn=generate_response,
-        inputs=[query_input, sheet_selector, system_state],
-        outputs=[summary_output, details_output]
-    ).then(
-        lambda: gr.update(open=True),
-        outputs=results_accordion
+            with gr.Group(visible=False) as query_ui:
+                sheet_selector = gr.Dropdown(label="Select a Sheet")
+                chat_interface = gr.ChatInterface(
+                    fn=user_interaction,
+                    additional_inputs=[system_state, sheet_selector],
+                    title="Chat with your Excel Data"
+                )
+
+    process_button.click(
+        fn=process_excel,
+        inputs=[openai_api_key, excel_upload],
+        outputs=[status_text, system_state, sheet_selector, query_ui]
     )
 
 if __name__ == "__main__":
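
For reference, here is a minimal sketch (illustrative only, not part of the commit) of how the new HybridExcelQuerySystem could be exercised directly, without the Gradio UI. It assumes the updated app.py is importable as a module named app, that OPENAI_API_KEY is set in the environment, and that a workbook example.xlsx with a sheet named Sales exists; those names are placeholders.

# Standalone usage sketch under the assumptions stated above.
import os

from app import HybridExcelQuerySystem  # hypothetical import path for the updated app.py

system = HybridExcelQuerySystem(os.environ["OPENAI_API_KEY"])

# Index every sheet for both tools: raw dataframes for the pandas agent,
# one LlamaIndex vector store per sheet for RAG lookups.
print(system.load_excel_file("example.xlsx"))  # placeholder workbook name

# _classify_query() should label this "lookup", so the sheet's query engine
# (RAG search over the markdown-rendered rows) answers it.
lookup = system.query("What are the details for order #12345?", selected_sheet="Sales")
print(lookup["tool_used"], "->", lookup["answer"])

# This one should be labelled "calculation", so the pandas dataframe agent
# writes and runs pandas code against the cleaned dataframe.
calc = system.query("What is the total revenue across all rows?", selected_sheet="Sales")
print(calc["tool_used"], "->", calc["answer"])

Because _classify_query() falls back to "lookup" whenever the classification is unclear, ambiguous questions are routed to the cheaper RAG path rather than to the code-executing agent.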