Spaces:

Arxived
/

pandasai

Sleeping

File size: 6,762 Bytes

3dc0491

import streamlit as st
import pandas as pd
import plotly.express as px
from datasets import load_dataset
from pandasai import Agent
from langchain_community.embeddings.openai import OpenAIEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_openai import ChatOpenAI
from langchain.chains import RetrievalQA
from langchain.schema import Document
import os
import logging

# Logging setup: root logger at DEBUG, plus a logger named after this module.
logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger(__name__)

# Read both API keys from the process environment.
api_key = os.getenv("OPENAI_API_KEY")
pandasai_api_key = os.getenv("PANDASAI_API_KEY")

# Gather the names of every key that is unset or empty, in the order they are
# checked, so that a single error can report all of them at once.
missing_keys = [
    env_name
    for env_name, env_value in (
        ("OPENAI_API_KEY", api_key),
        ("PANDASAI_API_KEY", pandasai_api_key),
    )
    if not env_value
]

if missing_keys:
    missing_keys_str = ", ".join(missing_keys)
    raise EnvironmentError(
        f"The following API key(s) are missing: {missing_keys_str}. Please set them in the environment."
    )

# Page heading shown at the top of the app
st.title("Data Analyzer")

def _store_and_preview(df, success_message):
    """Save ``df`` to ``st.session_state.df`` and show a banner plus a 10-row preview."""
    st.session_state.df = df
    st.success(success_message)
    st.dataframe(st.session_state.df.head(10))


# Function to load datasets into session
def load_dataset_into_session():
    """Render the dataset-source selector and load the chosen dataset.

    A radio button offers three sources: the CSV at ``./source/test.csv``,
    the ``train`` split of a Hugging Face dataset named by the user, or an
    uploaded CSV file. On success the resulting DataFrame is stored in
    ``st.session_state.df`` and its first 10 rows are displayed.

    Load failures are caught inside each branch: they are reported in the UI
    with ``st.error`` and logged (with traceback) through the module logger.
    The function returns ``None``.
    """
    input_option = st.radio(
        "Select Dataset Input:",
        ["Use Repo Directory Dataset", "Use Hugging Face Dataset", "Upload CSV File"],
    )

    # Option 1: Load dataset from the repo directory
    if input_option == "Use Repo Directory Dataset":
        file_path = "./source/test.csv"
        if st.button("Load Repo Dataset"):
            try:
                _store_and_preview(
                    pd.read_csv(file_path),
                    f"File loaded successfully from '{file_path}'!",
                )
            except Exception as e:
                st.error(f"Error loading dataset from the repo directory: {e}")
                # logger.exception keeps the traceback, which logger.error dropped.
                logger.exception("Error loading dataset from repo directory: %s", e)

    # Option 2: Load dataset from Hugging Face
    elif input_option == "Use Hugging Face Dataset":
        dataset_name = st.text_input(
            "Enter Hugging Face Dataset Name:", value="HUPD/hupd"
        )
        if st.button("Load Hugging Face Dataset"):
            try:
                # SECURITY NOTE(review): trust_remote_code=True allows the named
                # dataset repository to run its own loading script on this host,
                # and the name comes straight from user input. Consider
                # restricting the accepted names or disabling remote code.
                dataset = load_dataset(dataset_name, split="train", trust_remote_code=True)
                # Convert Hugging Face dataset to Pandas DataFrame
                if hasattr(dataset, "to_pandas"):
                    frame = dataset.to_pandas()
                else:
                    frame = pd.DataFrame(dataset)
                _store_and_preview(
                    frame,
                    f"Hugging Face Dataset '{dataset_name}' loaded successfully!",
                )
            except Exception as e:
                st.error(f"Error loading Hugging Face dataset: {e}")
                logger.exception("Error loading Hugging Face dataset: %s", e)

    # Option 3: Upload CSV File
    elif input_option == "Upload CSV File":
        uploaded_file = st.file_uploader("Upload a CSV File:", type=["csv"])
        if uploaded_file is not None:
            try:
                # The script re-executes on every interaction; rewind so the
                # parser always starts at the first byte (no-op if already there).
                uploaded_file.seek(0)
                _store_and_preview(
                    pd.read_csv(uploaded_file),
                    "File uploaded successfully!",
                )
            except Exception as e:
                st.error(f"Error reading uploaded file: {e}")
                logger.exception("Error reading uploaded file: %s", e)

def _row_to_document(index, row):
    """Render one DataFrame row as a ``Document`` of ``column: value`` pairs.

    Null values are skipped; the row's index label is kept in the metadata.
    """
    text = ", ".join(
        f"{col}: {value}" for col, value in row.items() if pd.notnull(value)
    )
    return Document(page_content=text, metadata={"index": index})


def _build_qa_chain(df):
    """Embed every row of ``df`` into a FAISS index and wrap it in a RetrievalQA chain."""
    documents = [_row_to_document(index, row) for index, row in df.iterrows()]
    vectorstore = FAISS.from_documents(documents, OpenAIEmbeddings())
    return RetrievalQA.from_chain_type(
        llm=ChatOpenAI(),
        chain_type="stuff",
        retriever=vectorstore.as_retriever(),
    )


def _get_qa_chain(df):
    """Return the RAG chain for ``df``, building it only when ``df`` has changed.

    The script re-runs from the top on every widget interaction, so the chain
    is cached in ``st.session_state`` next to the DataFrame object it was
    built from. It is rebuilt only when a different DataFrame object is
    loaded, which avoids re-embedding every row on each rerun.
    """
    cached = st.session_state.get("_rag_cache")
    if cached is None or cached["df"] is not df:
        cached = {"df": df, "chain": _build_qa_chain(df)}
        st.session_state["_rag_cache"] = cached
    return cached["chain"]


def _extract_python_code(answer):
    """Return the first ```python fenced snippet found in ``answer``, else ``None``.

    ``answer`` is coerced to ``str`` first, so a non-string value yields
    ``None`` (no fence found) instead of a ``TypeError`` from ``re.search``.
    """
    import re  # kept local, mirroring the original inline import

    match = re.search(r"```python\n(.*?)\n```", str(answer), re.DOTALL)
    return match.group(1) if match else None


# Ensure session state for the DataFrame
if "df" not in st.session_state:
    st.session_state.df = None

# Load dataset into session
load_dataset_into_session()

# Check if a dataset is loaded
if st.session_state.df is not None:
    df = st.session_state.df
    try:
        # Initialize PandasAI Agent
        agent = Agent(df)

        # Create tabs
        tab1, tab2, tab3 = st.tabs(
            ["PandasAI Analysis", "RAG Q&A", "Data Visualization"]
        )

        # Tab 1: PandasAI Analysis
        with tab1:
            st.header("PandasAI Analysis")
            pandas_question = st.text_input("Ask a question about the data (PandasAI):")
            if pandas_question:
                try:
                    result = agent.chat(pandas_question)
                    st.write("PandasAI Answer:", result)
                except Exception as e:
                    st.error(f"Error during PandasAI Analysis: {e}")
                    logger.exception("Error during PandasAI Analysis: %s", e)

        # Tab 2: RAG Q&A
        with tab2:
            st.header("RAG Q&A")
            rag_question = st.text_input("Ask a question about the data (RAG):")
            if rag_question:
                try:
                    # Built lazily: the embedding/index work only happens once
                    # a RAG question is actually asked, and is then reused.
                    qa_chain = _get_qa_chain(df)
                    result = qa_chain.run(rag_question)
                    st.write("RAG Answer:", result)
                except Exception as e:
                    st.error(f"Error during RAG Q&A: {e}")
                    logger.exception("Error during RAG Q&A: %s", e)

        # Tab 3: Data Visualization
        with tab3:
            st.header("Data Visualization")
            viz_question = st.text_input(
                "What kind of graph would you like to create? (e.g., 'Show a scatter plot of salary vs experience')"
            )
            if viz_question:
                try:
                    result = agent.chat(viz_question)

                    # Extract Python code for visualization
                    viz_code = _extract_python_code(result)

                    if viz_code is None:
                        st.warning("Could not generate a graph. Try a different query.")
                    else:
                        # Replace matplotlib (plt) code with Plotly (px)
                        # NOTE(review): this is a purely textual swap; matplotlib
                        # and plotly.express calls take different arguments, so
                        # the rewritten snippet may still fail when executed.
                        viz_code = viz_code.replace("plt.", "px.")

                        # SECURITY NOTE(review): this executes code text returned
                        # by agent.chat(). It runs in a *copy* of the module
                        # namespace: the snippet sees the same names (df, px,
                        # pd, st, ...) but cannot rebind the script's variables.
                        namespace = dict(globals())
                        namespace.pop("fig", None)  # never show a leftover figure
                        exec(viz_code, namespace)  # Execute the visualization code

                        fig = namespace.get("fig")
                        if fig is None:
                            # The snippet ran but did not define ``fig``.
                            st.warning("Could not generate a graph. Try a different query.")
                        else:
                            st.plotly_chart(fig)
                except Exception as e:
                    st.error(f"Error during Data Visualization: {e}")
                    logger.exception("Error during Data Visualization: %s", e)
    except Exception as e:
        st.error(f"An error occurred during processing: {e}")
        logger.exception("An error occurred during processing: %s", e)
else:
    st.info("Please load a dataset to start analysis.")