Spaces:
Runtime error
Runtime error
Upload 6 files
Browse files- src/question-answer/agents.py +221 -0
- src/question-answer/graph.py +169 -0
- src/question-answer/prompts.py +150 -0
- src/question-answer/states.py +13 -0
- src/question-answer/tools.py +264 -0
- src/question-answer/utils.py +177 -0
src/question-answer/agents.py
ADDED
@@ -0,0 +1,221 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from prompts import coder_prompt, fixer_prompt, analysis_prompt
|
2 |
+
from states import State
|
3 |
+
from langchain_openai import ChatOpenAI
|
4 |
+
from langchain_core.messages import SystemMessage, AIMessage
|
5 |
+
from tools import run_code
|
6 |
+
from utils import create_markdown_report, save_markdown_report
|
7 |
+
|
8 |
+
def coder_agent(state: State) -> State:
    """
    Creates the cleaning code in Python to be executed in a sandbox.

    Builds an LLM context from the coder system prompt plus the recent
    human/analysis turns of the conversation, asks the model for analysis
    code, and appends the generated code to the message history.
    """
    banner = "-----------------------------------------------------------------------------------"
    print(banner)
    print("Creating the analysis code...")
    print(banner)

    dataset_path = state['dataset_path']
    messages = state['messages']

    # Most recent human question, falling back to a generic request.
    question = next(
        (m.content for m in reversed(messages)
         if hasattr(m, 'additional_kwargs') and m.additional_kwargs.get('agent') == 'human'),
        None,
    ) or "Analyze the dataset"

    system_prompt = coder_prompt(dataset_path, question)

    # System prompt first, then the last 10 messages filtered down to the
    # human/analysis turns so the context stays manageable.
    history = [
        m for m in messages[-10:]
        if hasattr(m, 'additional_kwargs') and m.additional_kwargs.get('agent') in ['human', 'analysis_agent']
    ]
    conversation_messages = [
        SystemMessage(content=system_prompt, additional_kwargs={"agent": "system", "node_type": "generation"})
    ] + history

    # Invoke the LLM with the assembled context and record the result.
    llm = ChatOpenAI(model="gpt-4.1", temperature=0)
    code = llm.invoke(conversation_messages).content
    messages.append(AIMessage(content=code, additional_kwargs={"agent": "coder_agent", "node_type": "generation"}))

    return {
        "messages": messages,
        "dataset_path": dataset_path,
    }
|
58 |
+
|
59 |
+
def runner_agent(state: State) -> State:
    """
    Runs the analysis code in a sandbox.

    Persists the most recently generated code to ``output/analysis_code.py``,
    executes it via ``run_code``, and records the execution result plus any
    produced charts in the state.
    """
    import os

    banner = "-----------------------------------------------------------------------------------"
    print(banner)
    print("Running the analysis code...")
    print(banner)

    messages = state['messages']
    code = messages[-1].content
    dataset_path = state['dataset_path']

    # Keep a copy of the executed script for inspection/debugging.
    os.makedirs("output", exist_ok=True)
    with open("output/analysis_code.py", "w") as code_file:
        code_file.write(code)

    result = run_code(code, dataset_path)

    # Record the execution outcome in the shared history.
    outcome = AIMessage(
        content=f"Code execution result: {result['execution']}",
        additional_kwargs={"agent": "runner_agent", "node_type": "generation"},
    )
    messages.append(outcome)

    # Wrap every produced chart path in its own message object.
    chart_messages = [
        AIMessage(content=chart, additional_kwargs={"agent": "runner_agent", "node_type": "chart"})
        for chart in result.get('charts', [])
    ]

    return {
        "messages": messages,
        "codes": state.get('codes', []) + [code],
        "charts": chart_messages,
    }
|
92 |
+
|
93 |
+
def fixer_agent(state: State) -> State:
    """
    Fixes the analysis code.

    Looks up the latest question, generated code, and execution error in the
    message history, asks the LLM for a corrected version of the code, and
    appends the fix to the history.
    """
    print("-----------------------------------------------------------------------------------")
    print("Fixing the analysis code...")
    print("-----------------------------------------------------------------------------------")

    messages = state['messages']

    def _last_content(agent: str):
        """Content of the most recent message tagged with *agent*, or None."""
        for message in reversed(messages):
            if hasattr(message, 'additional_kwargs') and message.additional_kwargs.get('agent') == agent:
                return message.content
        return None

    # Extract the last human question, the last generated code, and the
    # last runner output (which carries the error text).
    question = _last_content('human')
    code = _last_content('coder_agent')
    error = _last_content('runner_agent')

    # Get the dataset path
    dataset_path = state['dataset_path']

    # Create the system prompt
    system_prompt = fixer_prompt(code, error, question, dataset_path)

    # BUG FIX: previously the SystemMessage was appended to the persistent
    # history while the LLM was invoked with the raw prompt *string* — so
    # the appended message was never used for the call and only polluted the
    # shared history.  Invoke with a proper message list instead, and keep
    # this one-shot prompt out of the history.
    llm = ChatOpenAI(model="gpt-4.1", temperature=0)
    fixed_code = llm.invoke(
        [SystemMessage(content=system_prompt, additional_kwargs={"agent": "system", "node_type": "fixing"})]
    ).content
    messages.append(AIMessage(content=fixed_code, additional_kwargs={"agent": "fixer_agent", "node_type": "fixing"}))

    return {
        "messages": messages,
        "codes": state.get('codes', []) + [fixed_code]
    }
|
143 |
+
|
144 |
+
def analysis_agent(state: State) -> State:
    """
    Analyzes the question, the result of the execution, and the charts to answer the question.

    Collects the latest human question and runner output from the history,
    de-duplicates chart paths, asks the LLM for a written analysis, and
    saves a markdown report.  Returns the updated messages plus the analysis
    text and the report filename.
    """
    print("-----------------------------------------------------------------------------------")
    print("Analyzing the question, the result of the execution, and the charts to answer the question...")
    print("-----------------------------------------------------------------------------------")

    # Get the messages
    messages = state['messages']

    # last human message
    question = None
    for message in reversed(messages):
        if hasattr(message, 'additional_kwargs') and message.additional_kwargs.get('agent') == 'human':
            question = message.content
            break

    # Get the dataset path
    dataset_path = state['dataset_path']

    # Get the execution result (the most recent runner_agent message;
    # this includes the "Code execution result: ..." prefix).
    execution_result = None
    for message in reversed(messages):
        if hasattr(message, 'additional_kwargs') and message.additional_kwargs.get('agent') == 'runner_agent':
            execution_result = message.content
            break

    # Get the charts from state and ensure they are strings
    charts = state.get('charts', [])

    # Convert any message objects to strings and filter out duplicates.
    # Charts may be AIMessage objects (from runner_agent) or raw strings.
    chart_paths = []
    seen_charts = set()
    for chart in charts:
        if hasattr(chart, 'content'):
            chart_path = chart.content
        else:
            chart_path = str(chart)

        # Only add unique chart paths (preserve first-seen order)
        if chart_path not in seen_charts:
            chart_paths.append(chart_path)
            seen_charts.add(chart_path)

    # Create the system prompt
    system_prompt = analysis_prompt(question, dataset_path, execution_result, chart_paths)

    # Build conversation context for the LLM
    conversation_messages = []

    # Add system prompt
    conversation_messages.append(SystemMessage(content=system_prompt, additional_kwargs={"agent": "system", "node_type": "analysis"}))

    # Add recent conversation history (last 10 messages to keep context manageable)
    recent_messages = messages[-10:] if len(messages) > 10 else messages
    for msg in recent_messages:
        if hasattr(msg, 'additional_kwargs') and msg.additional_kwargs.get('agent') in ['human', 'analysis_agent']:
            conversation_messages.append(msg)

    # Create the LLM and invoke it to analyze the question, the result of the execution, and the charts to answer the question.
    llm = ChatOpenAI(model="gpt-4.1", temperature=0)
    analysis = llm.invoke(conversation_messages).content
    messages.append(AIMessage(content=analysis, additional_kwargs={"agent": "analysis_agent", "node_type": "analysis"}))

    # Create a markdown report
    report_content = create_markdown_report(question, analysis, chart_paths, execution_result)

    # Save the report to a file
    report_filename = save_markdown_report(report_content)

    # Report filename is returned in the state, no need to add to messages

    return {
        "messages": messages,
        "analysis": analysis,
        "report": report_filename
    }
|
src/question-answer/graph.py
ADDED
@@ -0,0 +1,169 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
|
2 |
+
|
3 |
+
from __future__ import annotations
|
4 |
+
import os
|
5 |
+
from typing import TypedDict
|
6 |
+
from langgraph.graph import START, END
|
7 |
+
from langgraph.graph import StateGraph
|
8 |
+
from langchain_core.messages import HumanMessage
|
9 |
+
|
10 |
+
from states import State
|
11 |
+
from agents import coder_agent, runner_agent, fixer_agent, analysis_agent
|
12 |
+
|
13 |
+
class Context(TypedDict):
    """Context parameters for the agent.

    Set these when creating assistants OR when invoking the graph.
    See: https://langchain-ai.github.io/langgraph/cloud/how-tos/configuration_cloud/
    """

    # NOTE(review): not referenced by build_graph()/chat_interface() in this
    # file — confirm whether this is consumed by the LangGraph runtime or is
    # template leftover.
    my_configurable_param: str
|
21 |
+
|
22 |
+
|
23 |
+
if __name__ == "__main__":
    # NOTE(review): this `state` is never used — a second __main__ guard at
    # the bottom of the file runs chat_interface() instead, which builds its
    # own state.  This looks like leftover scaffolding; consider removing.
    # The dataset path is also machine-specific.
    state = {
        "messages": [],
        "dataset_path": "/Users/beyzaerdogan/Desktop/ai-analyst/cereal.csv"
    }
|
28 |
+
|
29 |
+
def build_graph():
    """Build and compile the coder → runner → (fixer loop) → analysis graph.

    Returns:
        A compiled LangGraph runnable.
    """
    graph_builder = StateGraph(State)
    graph_builder.add_node("coder_agent", coder_agent)
    graph_builder.add_node("runner_agent", runner_agent)
    graph_builder.add_node("fixer_agent", fixer_agent)
    graph_builder.add_node("analysis_agent", analysis_agent)

    graph_builder.add_edge(START, "coder_agent")
    graph_builder.add_edge("coder_agent", "runner_agent")

    def should_fix_code(state: State) -> str:
        """Route to the fixer on execution failure, else to analysis."""
        messages = state.get("messages", [])
        if not messages:
            return "success"

        # Count fix attempts to prevent infinite loops
        fix_attempts = 0
        for message in messages:
            if hasattr(message, 'additional_kwargs') and message.additional_kwargs.get('agent') == 'fixer_agent':
                fix_attempts += 1

        # Limit to 3 fix attempts to prevent quota exceeded errors
        if fix_attempts >= 3:
            return "success"

        # Get the last runner_agent message to check for errors
        for message in reversed(messages):
            if hasattr(message, 'additional_kwargs') and message.additional_kwargs.get('agent') == 'runner_agent':
                content = message.content
                # BUG FIX: the old heuristic routed to the fixer whenever the
                # substrings "error" or "failed" appeared *anywhere* in the
                # output, so a successful run whose stdout merely mentioned
                # those words entered the fix loop.  Match the explicit
                # failure marker emitted by run_code ("Execution failed ...")
                # instead, plus a Python traceback as a fallback for raw
                # exception strings.
                if "Execution failed" in content or "Traceback" in content:
                    return "error"
                break
        return "success"

    graph_builder.add_conditional_edges(
        "runner_agent",
        should_fix_code,
        {
            "error": "fixer_agent",
            "success": "analysis_agent"
        }
    )

    graph_builder.add_edge("fixer_agent", "runner_agent")
    graph_builder.add_edge("analysis_agent", END)

    return graph_builder.compile()
|
78 |
+
|
79 |
+
def chat_interface():
    """Interactive chat interface for data analysis.

    Runs a REPL loop: each user question is appended to the shared state and
    the whole graph is re-invoked, so conversation context accumulates across
    turns.  Exits on 'quit'/'exit'/'bye'/'goodbye' or Ctrl-C.
    """
    graph = build_graph()

    # Initialize state
    state = {
        "messages": [],
        # NOTE(review): machine-specific path — should come from config/CLI.
        "dataset_path": "/Users/beyzaerdogan/Desktop/ai-analyst/cereal.csv",
        "charts": [],
        "report": "",
        "codes": []
    }

    print("🤖 AI Data Analyst Chat")
    print("=" * 50)
    print("Ask me anything about your dataset! Type 'quit' or 'exit' to end the conversation.")
    print("=" * 50)

    while True:
        try:
            # Get user input
            user_input = input("\n👤 You: ").strip()

            # Check for exit commands
            if user_input.lower() in ['quit', 'exit', 'bye', 'goodbye']:
                print("\n🤖 AI: Goodbye! Thanks for chatting with me.")
                break

            if not user_input:
                print("🤖 AI: Please enter a question or message.")
                continue

            # Add user message to state
            user_message = HumanMessage(
                content=user_input,
                additional_kwargs={"agent": "human", "node_type": "question"}
            )
            state["messages"].append(user_message)

            print("\n🤖 AI: Let me analyze that for you...")

            # Run the graph with current state
            state = graph.invoke(state)

            # Extract and display the analysis response (most recent
            # analysis_agent message of node_type 'analysis').
            analysis_response = None
            for message in reversed(state["messages"]):
                if (hasattr(message, 'additional_kwargs') and
                    message.additional_kwargs.get('agent') == 'analysis_agent' and
                    message.additional_kwargs.get('node_type') == 'analysis'):
                    analysis_response = message.content
                    break

            if analysis_response:
                print(f"\n🤖 AI: {analysis_response}")
            else:
                print("\n🤖 AI: I've processed your request. Check the output folder for results.")

            # Show any generated reports
            # NOTE(review): analysis_agent returns the report filename in
            # state["report"] and never emits a node_type == 'report'
            # message, so this loop appears to never print anything —
            # confirm and either drop it or read state["report"] instead.
            for message in state["messages"]:
                if (hasattr(message, 'additional_kwargs') and
                    message.additional_kwargs.get('agent') == 'analysis_agent' and
                    message.additional_kwargs.get('node_type') == 'report'):
                    print(f"📊 Report: {message.content}")

        except KeyboardInterrupt:
            print("\n\n🤖 AI: Goodbye! Thanks for chatting with me.")
            break
        except Exception as e:
            # Broad catch keeps the REPL alive on any per-turn failure.
            print(f"\n🤖 AI: Sorry, I encountered an error: {str(e)}")
            print("Please try again with a different question.")
|
150 |
+
|
151 |
+
def main():
    """Main function for single question mode (backward compatibility)."""
    graph = build_graph()

    # Seed the graph with a single hard-coded question.
    initial_question = HumanMessage(
        content="What are the top 5 cereals by their protein amount?",
        additional_kwargs={"agent": "human", "node_type": "question"},
    )
    state = {
        "messages": [initial_question],
        "dataset_path": "/Users/beyzaerdogan/Desktop/ai-analyst/cereal.csv",
        "charts": [],
        "report": "",
    }

    state = graph.invoke(state)
    print(state)
|
166 |
+
|
167 |
+
if __name__ == "__main__":
    # Run chat interface by default
    # NOTE(review): a second, earlier __main__ guard in this file builds an
    # unused `state` dict — only this guard has an effect.
    chat_interface()
|
src/question-answer/prompts.py
ADDED
@@ -0,0 +1,150 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import json
|
2 |
+
try:
|
3 |
+
from .utils import get_dataset_info
|
4 |
+
except ImportError:
|
5 |
+
from utils import get_dataset_info
|
6 |
+
from dotenv import load_dotenv
|
7 |
+
import json
|
8 |
+
import os
|
9 |
+
import glob
|
10 |
+
|
11 |
+
load_dotenv()
|
12 |
+
|
13 |
+
def coder_prompt(dataset_path: str, question: str) -> str:

    """
    System prompt for the data analyst.

    Args:
        dataset_path: Local path to the CSV dataset.  Inside the sandbox the
            file is uploaded to ``tmp/<basename>`` (see tools.run_code), so
            only the basename matters for the generated code.
        question: The user question the generated code must answer.

    Returns:
        The full system prompt string for the coder LLM.
    """
    dataset_info = get_dataset_info(dataset_path)
    # BUG FIX: this prompt used to hard-code 'tmp/dataset.csv', but
    # tools.run_code() uploads the dataset to tmp/<original basename>, so the
    # generated code could only find the file if it happened to be named
    # dataset.csv.  Build the real sandbox path instead.
    sandbox_path = f"tmp/{os.path.basename(dataset_path)}"
    return f"""
    You are a senior data analyst.
    You have access to a pandas dataframe `df` that will be available in the sandbox environment.

    Here is the dataset information:
    {dataset_info}

    USER QUESTION: {question}

    Write Python code to answer this specific question both visually and statistically.
    The code will be executed in a secure sandbox environment where the dataset is available as a CSV file.

    IMPORTANT GUIDELINES:
    1. First load the dataset: df = pd.read_csv('{sandbox_path}')
    2. Only use built-in Python libraries, pandas, matplotlib, and seaborn
    3. Write clear, well-commented code
    4. Handle potential errors gracefully
    5. Return meaningful results that directly answer the question
    6. ALWAYS create visualizations when they would help answer the question
    7. Use proper statistical tests and analysis to answer the question

    VISUALIZATION REQUIREMENTS:
    - ALWAYS use matplotlib.pyplot for plotting
    - ALWAYS save plots as files using plt.savefig() before plt.show()
    - Set proper figure size: plt.figure(figsize=(10, 6))
    - Add titles and labels: plt.title(), plt.xlabel(), plt.ylabel()
    - Use plt.tight_layout() for better spacing
    - Use pastel colors for the plots
    - Save each plot with a unique filename: plt.savefig('chart_1.png', dpi=300, bbox_inches='tight')
    - Call plt.show() after saving
    - Example: plt.savefig('chart_1.png', dpi=300, bbox_inches='tight'); plt.show()

    RETURN FORMAT:
    - Return ONLY the Python code without any markdown formatting
    - Do NOT include ```python or ``` markers
    - Do NOT include any explanatory text or comments
    - Start directly with import statements
    """
|
57 |
+
|
58 |
+
def fixer_prompt(code: str, error: str, question: str, dataset_path: str) -> str:
    """
    System prompt for the analysis code fixing agent.

    Args:
        code: The Python code that failed in the sandbox.
        error: The error/output text from the failed execution.
        question: The user question the code was meant to answer.
        dataset_path: Path to the CSV dataset, used to embed dataset info.

    Returns:
        The full system prompt string for the fixer LLM.
    """

    dataset_info = get_dataset_info(dataset_path)

    return f"""
    You are a senior data analyst.
    You have access to a pandas dataframe `df` that will be available in the sandbox environment.

    Here is the dataset information:
    {dataset_info}

    USER QUESTION: {question}

    Here is the code that failed:
    {code}

    Here is the error message:
    {error}

    Your task is to fix the code to resolve the error.

    VISUALIZATION REQUIREMENTS (if the code includes plots):
    - ALWAYS save plots as files using plt.savefig() before plt.show()
    - Save each plot with a unique filename: plt.savefig('chart_1.png', dpi=300, bbox_inches='tight')
    - Call plt.show() after saving
    - Example: plt.savefig('chart_1.png', dpi=300, bbox_inches='tight'); plt.show()

    RETURN FORMAT:
    - Return ONLY the fixed Python code without any markdown formatting
    - Do NOT include ```python or ``` markers
    - Do NOT include any explanatory text or comments
    - Start directly with import statements
    """
|
94 |
+
|
95 |
+
def analysis_prompt(question: str, dataset_path: str, execution_result: str, charts: list = None) -> str:
    """
    System prompt for the analysis agent.

    Args:
        question: The user question that was asked.
        dataset_path: Path to the CSV dataset, used to embed dataset info.
        execution_result: Output of the executed analysis code.
        charts: Chart file paths; when None, falls back to scanning the
            module-local ``output`` directory for image files.

    Returns:
        The full system prompt string for the analysis LLM.
    """
    dataset_info = get_dataset_info(dataset_path)

    # Use provided charts or find them in output directory
    if charts is None:
        charts = []
        # Get the absolute path to the output directory
        # NOTE(review): runner_agent saves charts to "output/" relative to
        # the CWD, while this fallback scans the directory of this module —
        # confirm the two coincide at runtime.
        current_dir = os.path.dirname(os.path.abspath(__file__))
        output_dir = os.path.join(current_dir, "output")

        if os.path.exists(output_dir):
            # Find all image files in the output directory
            chart_patterns = ['*.png', '*.jpg', '*.jpeg', '*.svg', '*.pdf']
            for pattern in chart_patterns:
                chart_files = glob.glob(os.path.join(output_dir, pattern))
                charts.extend(chart_files)

            # Sort charts by creation time (newest first)
            charts.sort(key=os.path.getctime, reverse=True)

            print(f"Found {len(charts)} charts: {charts}")

    # Create chart information for the prompt
    chart_info = ""
    if charts:
        chart_info = f"\n\nCHARTS GENERATED:\n"
        for i, chart in enumerate(charts, 1):
            chart_info += f"Chart {i}: {chart}\n"
        chart_info += "\nThese charts contain visual representations of the data analysis. Please refer to them when providing your analysis."
    else:
        chart_info = "\n\nNo charts were generated for this analysis."

    return f"""
    You are a senior data analyst.

    Here is the dataset information:
    {dataset_info}

    Here is the question that was asked:
    {question}

    Here is the result of the execution:
    {execution_result}
    {chart_info}

    Your task is to analyze the question, the result of the execution, and the charts to answer the question comprehensively.

    RETURN FORMAT:
    - Return ONLY the analysis in a single string
    - Do NOT include any other text or comments
    - Start directly with the analysis
    - If charts were generated, reference them in your analysis
    """
|
src/question-answer/states.py
ADDED
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""Type definitions for the LangGraph project."""
|
2 |
+
|
3 |
+
from typing import TypedDict, Annotated
|
4 |
+
from langchain_core.messages import AnyMessage
|
5 |
+
from langgraph.graph.message import add_messages
|
6 |
+
|
7 |
+
class State(TypedDict):
|
8 |
+
|
9 |
+
messages: Annotated[list[AnyMessage], add_messages]
|
10 |
+
dataset_path: str
|
11 |
+
codes: Annotated[list[str], add_messages]
|
12 |
+
charts: Annotated[list[str], add_messages]
|
13 |
+
report: str
|
src/question-answer/tools.py
ADDED
@@ -0,0 +1,264 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from dotenv import load_dotenv
|
2 |
+
import base64
|
3 |
+
import os
|
4 |
+
from daytona import Daytona
|
5 |
+
load_dotenv()
|
6 |
+
|
7 |
+
def clean_code(code: str) -> str:
    """Clean the code - remove any file paths or non-Python content"""

    def _looks_like_python(stripped: str) -> bool:
        # First line that is plausibly Python rather than a path/preamble.
        if stripped.startswith(('import ', 'from ', '# ', '"""', "'''")):
            return True
        return bool(stripped) and '/' not in stripped and not stripped.endswith('"')

    lines = code.split('\n')
    start = -1
    for index, raw_line in enumerate(lines):
        stripped = raw_line.strip()
        # Markdown fences are skipped here and stripped wholesale below.
        if stripped.startswith('```python') or stripped.startswith('```'):
            continue
        if _looks_like_python(stripped):
            start = index
            break

    if start > 0:
        cleaned = '\n'.join(lines[start:])
        print(f"Cleaned code by removing {start} lines from the beginning")
    else:
        cleaned = code

    # Drop any leftover markdown fence markers.
    return cleaned.replace('```python', '').replace('```', '')
|
38 |
+
|
39 |
+
def print_logs(result) -> None:
    """Print the logs of the execution"""
    # Emit each stream only when present and non-empty.
    for label, attr in (("STDOUT:", 'stdout'), ("STDERR:", 'stderr')):
        stream = getattr(result, attr, None)
        if stream:
            print(label)
            print(stream)
|
47 |
+
|
48 |
+
def cleanup_sandboxes():
    """Clean up all sandboxes to free resources.

    Best-effort: all failures are logged, never raised.
    """
    try:
        daytona = Daytona()
        # Try to get existing sandbox and delete it
        try:
            existing_sandbox = daytona.get_current_sandbox()
            existing_sandbox.delete()
            print(f"Cleaned up existing sandbox: {existing_sandbox.id}")
        except Exception:
            # BUG FIX: was a bare `except:`, which also swallowed
            # SystemExit and KeyboardInterrupt.
            print("No existing sandbox to clean up")
    except Exception as e:
        print(f"Warning: Could not clean up sandboxes: {e}")
|
61 |
+
|
62 |
+
def run_code(code: str, dataset_path: str) -> dict:
|
63 |
+
"""Run code in a Daytona sandbox"""
|
64 |
+
|
65 |
+
cleaned_code = clean_code(code)
|
66 |
+
|
67 |
+
# initialize daytona client
|
68 |
+
daytona = Daytona()
|
69 |
+
|
70 |
+
# try to get existing sandbox
|
71 |
+
try:
|
72 |
+
sandbox = daytona.get_current_sandbox()
|
73 |
+
print("Using existing sandbox")
|
74 |
+
except:
|
75 |
+
try:
|
76 |
+
sandbox = daytona.create()
|
77 |
+
print("Created new sandbox")
|
78 |
+
except Exception as e:
|
79 |
+
if "CPU quota exceeded" in str(e) or "disk quota exceeded" in str(e).lower():
|
80 |
+
print("Quota exceeded, cleaning up and trying again...")
|
81 |
+
cleanup_sandboxes()
|
82 |
+
try:
|
83 |
+
sandbox = daytona.create()
|
84 |
+
print("Created new sandbox after cleanup")
|
85 |
+
except Exception as e2:
|
86 |
+
print(f"Still failed after cleanup: {e2}")
|
87 |
+
raise e2
|
88 |
+
else:
|
89 |
+
raise e
|
90 |
+
|
91 |
+
# Upload the dataset to the sandbox using file system operations
|
92 |
+
try:
|
93 |
+
# Upload the original dataset to the sandbox
|
94 |
+
sandbox_datapath = f"tmp/{os.path.basename(dataset_path)}"
|
95 |
+
sandbox.fs.upload_file(dataset_path, sandbox_datapath)
|
96 |
+
print(f"Uploaded {dataset_path} to {sandbox_datapath}")
|
97 |
+
|
98 |
+
# Replace the original file path in the code with the sandbox path
|
99 |
+
cleaned_code = cleaned_code.replace(dataset_path, sandbox_datapath)
|
100 |
+
print(f"Updated code to use sandbox path: {sandbox_datapath}")
|
101 |
+
|
102 |
+
except Exception as e:
|
103 |
+
print(f"WARNING: Could not upload {dataset_path} to sandbox: {e}")
|
104 |
+
# If the file doesn't exist locally, continue without uploading
|
105 |
+
if "does not exist" in str(e) or "not found" in str(e).lower():
|
106 |
+
print(f"File {dataset_path} does not exist locally, continuing without upload")
|
107 |
+
sandbox_datapath = dataset_path # Use original path as fallback
|
108 |
+
else:
|
109 |
+
raise e
|
110 |
+
|
111 |
+
#########################################################
|
112 |
+
################# Running the code ######################
|
113 |
+
#########################################################
|
114 |
+
|
115 |
+
try:
|
116 |
+
# Install only essential dependencies to speed up execution
|
117 |
+
print("Installing essential dependencies...")
|
118 |
+
install_deps_code = """
|
119 |
+
import subprocess
|
120 |
+
import sys
|
121 |
+
|
122 |
+
# Install only essential packages
|
123 |
+
packages = ['matplotlib', 'pandas', 'numpy']
|
124 |
+
for package in packages:
|
125 |
+
try:
|
126 |
+
subprocess.check_call([sys.executable, '-m', 'pip', 'install', package, '--quiet'])
|
127 |
+
print(f"Installed {package}")
|
128 |
+
except Exception as e:
|
129 |
+
print(f"Failed to install {package}: {e}")
|
130 |
+
"""
|
131 |
+
|
132 |
+
try:
|
133 |
+
deps_result = sandbox.process.code_run(install_deps_code)
|
134 |
+
print("Dependencies installation completed")
|
135 |
+
except Exception as e:
|
136 |
+
print(f"Warning: Could not install dependencies: {e}")
|
137 |
+
|
138 |
+
# Run the code in the sandbox
|
139 |
+
print("-----------------------------------------------------------------------------------")
|
140 |
+
print('Running the analysis code in the sandbox....')
|
141 |
+
|
142 |
+
result = sandbox.process.code_run(cleaned_code)
|
143 |
+
print('Code execution finished!')
|
144 |
+
|
145 |
+
# Check for execution errors
|
146 |
+
if result.exit_code != 0:
|
147 |
+
print(f"EXECUTION ERROR: {result.result}")
|
148 |
+
return {
|
149 |
+
"success": False,
|
150 |
+
"execution": f"Execution failed with error: {result.result}"
|
151 |
+
}
|
152 |
+
|
153 |
+
except Exception as e:
|
154 |
+
print(f"Error running code: {e}")
|
155 |
+
return {
|
156 |
+
"success": False,
|
157 |
+
"execution": str(e)
|
158 |
+
}
|
159 |
+
|
160 |
+
print("-----------------------------------------------------------------------------------")
|
161 |
+
print("Checking for files in the sandbox...")
|
162 |
+
print("-----------------------------------------------------------------------------------")
|
163 |
+
|
164 |
+
#########################################################
|
165 |
+
############# Post-execution file checking ##############
|
166 |
+
#########################################################
|
167 |
+
|
168 |
+
# Check what files were created after code execution
|
169 |
+
try:
|
170 |
+
post_debug_code = """
|
171 |
+
import os
|
172 |
+
import glob
|
173 |
+
print("\\n=== FILES AFTER CODE EXECUTION ===")
|
174 |
+
print(f"Current working directory: {os.getcwd()}")
|
175 |
+
print("Files in current directory:")
|
176 |
+
for f in os.listdir('.'):
|
177 |
+
print(f" {f}")
|
178 |
+
"""
|
179 |
+
|
180 |
+
post_debug_result = sandbox.process.code_run(post_debug_code)
|
181 |
+
print("Post-execution debug info:", post_debug_result.result)
|
182 |
+
except Exception as e:
|
183 |
+
print(f"Could not check post-execution files: {e}")
|
184 |
+
|
185 |
+
#########################################################
|
186 |
+
############# Checking for charts in the sandbox ########
|
187 |
+
#########################################################
|
188 |
+
|
189 |
+
# Ensure output directory exists
|
190 |
+
os.makedirs("output", exist_ok=True)
|
191 |
+
|
192 |
+
# Check for plots - look for saved plot files in the sandbox
|
193 |
+
charts_count = 0
|
194 |
+
|
195 |
+
# Look for common plot file patterns in both current directory and /tmp/
|
196 |
+
search_directories = ['.', '/tmp']
|
197 |
+
plot_patterns = ['*.png', '*.jpg', '*.jpeg', '*.svg', '*.pdf']
|
198 |
+
|
199 |
+
for search_dir in search_directories:
|
200 |
+
print(f"Searching for charts in {search_dir}...")
|
201 |
+
for pattern in plot_patterns:
|
202 |
+
try:
|
203 |
+
# List files matching the pattern in the specific directory
|
204 |
+
list_cmd = f"import glob; import os; files = glob.glob('{search_dir}/{pattern}'); print('\\n'.join(files))"
|
205 |
+
plot_files_result = sandbox.process.code_run(list_cmd)
|
206 |
+
|
207 |
+
if plot_files_result.result.strip():
|
208 |
+
plot_files = plot_files_result.result.strip().split('\n')
|
209 |
+
for i, plot_file in enumerate(plot_files):
|
210 |
+
try:
|
211 |
+
# Create filename for the chart
|
212 |
+
chart_filename = f"chart_{charts_count + 1}.{plot_file.strip().split('.')[-1]}"
|
213 |
+
|
214 |
+
# Download the plot file from sandbox
|
215 |
+
sandbox.fs.download_file(plot_file.strip(), f"output/{chart_filename}")
|
216 |
+
|
217 |
+
print(f"Downloaded chart: output/{chart_filename}")
|
218 |
+
charts_count += 1
|
219 |
+
|
220 |
+
except Exception as e:
|
221 |
+
print(f"Error processing chart {plot_file}: {e}")
|
222 |
+
except Exception as e:
|
223 |
+
print(f"Error searching for {pattern} files in {search_dir}: {e}")
|
224 |
+
|
225 |
+
if charts_count == 0:
|
226 |
+
print("WARNING: No charts were downloaded.")
|
227 |
+
# Let's also check what files actually exist in the sandbox
|
228 |
+
try:
|
229 |
+
debug_cmd = """
|
230 |
+
import os
|
231 |
+
import glob
|
232 |
+
print("\\n=== DEBUGGING CHART DETECTION ===")
|
233 |
+
print("Current working directory:", os.getcwd())
|
234 |
+
print("Files in current directory:")
|
235 |
+
for f in os.listdir('.'):
|
236 |
+
print(f" {f}")
|
237 |
+
print("\\nAll image files found:")
|
238 |
+
for pattern in ['*.png', '*.jpg', '*.jpeg', '*.svg', '*.pdf']:
|
239 |
+
files = glob.glob(pattern)
|
240 |
+
if files:
|
241 |
+
print(f" {pattern}: {files}")
|
242 |
+
else:
|
243 |
+
print(f" {pattern}: No files found")
|
244 |
+
"""
|
245 |
+
debug_result = sandbox.process.code_run(debug_cmd)
|
246 |
+
print("Chart detection debug:", debug_result.result)
|
247 |
+
except Exception as e:
|
248 |
+
print(f"Could not run chart detection debug: {e}")
|
249 |
+
else:
|
250 |
+
print(f"Successfully downloaded {charts_count} charts")
|
251 |
+
|
252 |
+
# Clean up the sandbox to free disk space
|
253 |
+
try:
|
254 |
+
sandbox.delete()
|
255 |
+
print("Sandbox cleaned up to free disk space")
|
256 |
+
except Exception as e:
|
257 |
+
print(f"Warning: Could not clean up sandbox: {e}")
|
258 |
+
|
259 |
+
return {
|
260 |
+
"success": True,
|
261 |
+
"execution": result.result,
|
262 |
+
"charts": [f"output/chart_{i+1}.png" for i in range(charts_count)]
|
263 |
+
}
|
264 |
+
|
src/question-answer/utils.py
ADDED
@@ -0,0 +1,177 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import pandas as pd
|
2 |
+
import numpy as np
|
3 |
+
from pathlib import Path
|
4 |
+
import json
|
5 |
+
import os
|
6 |
+
from datetime import datetime
|
7 |
+
import re
|
8 |
+
|
9 |
+
def format_execution_results(execution_result: str) -> str:
    """
    Format raw sandbox execution output into a concise markdown code block.

    Strips known warning blocks (FutureWarning, UserWarning, etc.) together
    with two trailing context lines each, drops blank lines before the first
    real content, removes the "Code execution result: " prefix if present,
    and truncates output longer than 800 characters for readability.

    Args:
        execution_result: Raw stdout/stderr text captured from code execution.

    Returns:
        A fenced markdown code block with the cleaned output, or a fallback
        message when the input is empty.
    """
    if not execution_result:
        return "No execution results available."

    # Markers that identify noisy warning/technical lines to suppress.
    warning_markers = (
        'futurewarning', 'userwarning', 'deprecationwarning',
        'target_code:', 'warning:', 'error:', 'passing `palette`',
    )

    lines = execution_result.split('\n')
    cleaned_lines = []
    skip_next_lines = 0  # countdown of continuation lines belonging to a warning

    for line in lines:
        # Still consuming the tail of a warning block.
        if skip_next_lines > 0:
            skip_next_lines -= 1
            continue

        lowered = line.lower()
        if any(marker in lowered for marker in warning_markers):
            # Skip the warning line and the next two lines of its context.
            skip_next_lines = 2
            continue

        # Drop blank lines that appear before any real content.
        if not cleaned_lines and not line.strip():
            continue

        # Skip bare plotting-call fragments that leak into the output.
        # (Original listed 'sns.barplot(' twice; one entry is sufficient.)
        if line.strip() == 'sns.barplot(':
            continue

        cleaned_lines.append(line)

    result = '\n'.join(cleaned_lines).strip()

    # Strip the runner's prefix; use the prefix length, not a magic number.
    prefix = "Code execution result: "
    if result.startswith(prefix):
        result = result[len(prefix):]

    # Keep the report readable by capping very long output.
    if len(result) > 800:
        result = result[:800] + "\n... (truncated for readability)"

    return f"```\n{result}\n```"
|
58 |
+
|
59 |
+
def get_dataset_info(dataset_path: str) -> str:
    """
    Build a JSON summary profiling a tabular dataset on disk.

    Supports CSV, Excel (.xlsx/.xls), JSON, and Parquet files. The summary
    covers shape, columns and dtypes, missing-value counts/percentages,
    duplicate rows, memory usage, per-type column lists, numeric statistics,
    a sample of categorical values, and a list of potential quality issues.

    Args:
        dataset_path: Path to the dataset file.

    Returns:
        A JSON string with the dataset profile, or an "Error: ..." message
        when the file is missing, unsupported, or unreadable.
    """
    try:
        file_path = Path(dataset_path)
        if not file_path.exists():
            return f"Error: Dataset file '{dataset_path}' not found."

        # Pick the pandas reader that matches the file extension.
        suffix = file_path.suffix.lower()
        if suffix == '.csv':
            df = pd.read_csv(dataset_path)
        elif suffix in ['.xlsx', '.xls']:
            df = pd.read_excel(dataset_path)
        elif suffix == '.json':
            df = pd.read_json(dataset_path)
        elif suffix == '.parquet':
            df = pd.read_parquet(dataset_path)
        else:
            return f"Error: Unsupported file format '{file_path.suffix}'"

        # Core profile of the dataset.
        info = {
            'file_name': file_path.name,
            'file_size': f"{file_path.stat().st_size / (1024*1024):.2f} MB",
            'shape': f"{df.shape[0]} rows x {df.shape[1]} columns",
            'columns': list(df.columns),
            'data_types': df.dtypes.to_dict(),
            'missing_values': df.isnull().sum().to_dict(),
            'missing_percentage': (df.isnull().sum() / len(df) * 100).round(2).to_dict(),
            'duplicate_rows': df.duplicated().sum(),
            'memory_usage': f"{df.memory_usage(deep=True).sum() / (1024*1024):.2f} MB",
            'numeric_columns': list(df.select_dtypes(include=[np.number]).columns),
            'categorical_columns': list(df.select_dtypes(include=['object', 'category']).columns),
            'datetime_columns': list(df.select_dtypes(include=['datetime64']).columns)
        }

        # Statistical summary for numeric columns.
        if info['numeric_columns']:
            info['numeric_statistics'] = df[info['numeric_columns']].describe().to_dict()

        # Unique-value samples for at most the first 5 categorical columns.
        categorical_info = {}
        for col in info['categorical_columns'][:5]:
            categorical_info[col] = {
                'unique_values': df[col].nunique(),
                'sample_values': list(df[col].dropna().unique()[:10])  # first 10 unique values
            }
        info['categorical_info'] = categorical_info

        info['potential_issues'] = _detect_quality_issues(df, info)

        # default=str handles numpy scalars and dtypes json cannot encode.
        return json.dumps(info, indent=2, default=str)

    except Exception as e:
        return f"Error analyzing dataset: {str(e)}"


def _detect_quality_issues(df: pd.DataFrame, info: dict) -> list:
    """Return human-readable descriptions of likely data-quality problems."""
    issues = []

    # Columns missing more than half their values.
    high_missing = [col for col, pct in info['missing_percentage'].items() if pct > 50]
    if high_missing:
        issues.append(f"High missing values (>50%): {high_missing}")

    # IQR-based outlier detection for each numeric column.
    for col in info['numeric_columns']:
        q1 = df[col].quantile(0.25)
        q3 = df[col].quantile(0.75)
        iqr = q3 - q1
        outliers = df[(df[col] < (q1 - 1.5 * iqr)) | (df[col] > (q3 + 1.5 * iqr))][col].count()
        if outliers > 0:
            issues.append(f"Potential outliers in '{col}': {outliers} values")

    # Object columns whose sampled values mix numeric strings with text.
    for col in info['categorical_columns']:
        if df[col].dtype == 'object':
            sample_values = df[col].dropna().astype(str).head(100)
            numeric_count = sum(1 for val in sample_values if val.replace('.', '').replace('-', '').isdigit())
            if 0 < numeric_count < len(sample_values):
                issues.append(f"Mixed data types in '{col}': contains both numeric and text values")

    return issues
|
144 |
+
|
145 |
+
def create_markdown_report(question: str, analysis: str, charts: list, execution_result: str) -> str:
    """
    Assemble a simple markdown report from the analysis text and the
    formatted execution output.

    Note: `question` and `charts` are accepted for interface compatibility
    but are not currently included in the report body.
    """
    sections = [
        "## Analysis",
        analysis,
        "",
        "## Key Findings",
        format_execution_results(execution_result),
        "",
    ]
    return "\n".join(sections)
|
157 |
+
|
158 |
+
def save_markdown_report(report_content: str) -> str:
    """
    Write the markdown report into the output/ directory and return its path.

    The filename embeds a timestamp so successive reports do not overwrite
    each other.
    """
    # Make sure the destination directory exists.
    out_dir = Path("output")
    out_dir.mkdir(exist_ok=True)

    # Timestamped filename keeps each report unique.
    stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    destination = out_dir / f"analysis_report_{stamp}.md"

    destination.write_text(report_content, encoding="utf-8")

    print(f"Report saved: {destination}")
    return str(destination)
|
177 |
+
|