"""Streamlit app: profile an uploaded CSV file and chat about its contents.

The page has two tabs: a ydata-profiling report for the uploaded data, and a
question/answer view backed by a LangChain pandas agent with simple keyword
fallbacks when the agent fails.
"""

import json
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import streamlit as st
from langchain.agents import AgentExecutor, create_react_agent
from langchain.agents.agent_types import AgentType
from langchain.chains import LLMChain
from langchain.llms import HuggingFaceHub
from langchain.prompts import PromptTemplate
from langchain.tools.python.tool import PythonAstREPLTool
from langchain_core.output_parsers import StrOutputParser
from langchain_experimental.agents.agent_toolkits.pandas.base import create_pandas_dataframe_agent
from ydata_profiling import ProfileReport

# Set page configuration (must be the first Streamlit command of the script)
st.set_page_config(page_title="Interactive Data Profiler & Chat", layout="wide", page_icon="📊")

# Create session states if they don't exist
if 'df' not in st.session_state:
    st.session_state.df = None
if 'chat_history' not in st.session_state:
    st.session_state.chat_history = []
if 'suggestions' not in st.session_state:
    st.session_state.suggestions = []
# Identity of the upload that `df`, `suggestions` and `report_html` belong to
if 'upload_id' not in st.session_state:
    st.session_state.upload_id = None
# Cached HTML of the profiling report for the current upload
if 'report_html' not in st.session_state:
    st.session_state.report_html = None
# Question queued by a widget callback and not yet answered
if 'pending_question' not in st.session_state:
    st.session_state.pending_question = ""


# Initialize Hugging Face API
def get_llm():
    """Build the Hugging Face Hub LLM used by the pandas agent.

    The API token is read from the ``HUGGINGFACE_API_TOKEN`` environment
    variable; an empty string is passed when it is not set.

    Returns:
        A ``HuggingFaceHub`` instance for ``google/flan-t5-large``.
    """
    # Using a small but capable open-source model
    llm = HuggingFaceHub(
        repo_id="google/flan-t5-large",
        model_kwargs={"temperature": 0.1, "max_length": 512},
        huggingfacehub_api_token=os.environ.get("HUGGINGFACE_API_TOKEN", ""),
    )
    return llm


# Function to generate report
def generate_profile_report(df):
    """Create a ``ProfileReport`` for *df* while showing a spinner.

    Args:
        df: The DataFrame to profile.

    Returns:
        The ``ProfileReport`` object (titled "Profiling Report").
    """
    with st.spinner("Generating profile report..."):
        # Minimal for faster processing
        profile = ProfileReport(df, title="Profiling Report", explorative=True, minimal=True)
    return profile


# Function to generate query suggestions
def generate_suggestions(df):
    """Build a list of example questions tailored to the columns of *df*.

    Args:
        df: The DataFrame the suggestions are about.

    Returns:
        A list of question strings: three generic ones, an "average of"
        question for the first numeric column (if any), and up to two
        dtype-specific questions for each of the first three columns.
    """
    column_names = df.columns.tolist()
    # Lower-cased so pandas nullable dtypes ("Int64", "Float64") also match.
    data_types = [dtype.lower() for dtype in df.dtypes.astype(str).tolist()]

    def _is_numeric(dtype):
        return 'int' in dtype or 'float' in dtype

    suggestions = [
        "How many rows are in this dataset?",
        "What are all the column names?",
        "Show me the first 5 rows",
    ]

    # Only suggest an average for a column where it is meaningful.
    first_numeric = next(
        (col for col, dtype in zip(column_names, data_types) if _is_numeric(dtype)),
        None,
    )
    if first_numeric is not None:
        suggestions.append(f"What is the average of {first_numeric}")

    # Add column-specific suggestions for the first three columns
    for col, dtype in zip(column_names[:3], data_types[:3]):
        if _is_numeric(dtype):
            suggestions.append(f"What is the mean value of {col}?")
            suggestions.append(f"What is the maximum value of {col}?")
        elif 'object' in dtype or 'str' in dtype:
            suggestions.append(f"What are the unique values in {col}?")
            suggestions.append(f"How many missing values in {col}?")

    return suggestions


def _fallback_answer(df, query, error):
    """Answer *query* with basic keyword matching after the agent failed."""
    text = query.lower()
    if "rows" in text and "how many" in text:
        return f"The dataset has {df.shape[0]} rows."
    if "columns" in text and "how many" in text:
        return f"The dataset has {df.shape[1]} columns."
    if "column names" in text:
        # Column labels are not guaranteed to be strings (e.g. integer headers).
        return f"The column names are: {', '.join(map(str, df.columns))}"
    if "first" in text and "rows" in text:
        # Use the first standalone number in the question, defaulting to 5.
        num = next((int(word) for word in query.split() if word.isdigit()), 5)
        return df.head(num).to_string()
    if "describe" in text:
        return df.describe().to_string()
    return f"I couldn't process that query. Error: {str(error)}"


# Function to execute pandas operations safely
def execute_pandas_query(df, query):
    """Answer a natural-language *query* about *df*.

    A LangChain pandas DataFrame agent is tried first. If creating or running
    the agent raises, a small set of keyword-based answers is used instead,
    and an apology containing the error text is returned when none matches.

    Args:
        df: The DataFrame to query.
        query: The user's question.

    Returns:
        The answer as returned by the agent, or a fallback string.
    """
    try:
        # Create pandas agent
        agent = create_pandas_dataframe_agent(
            llm=get_llm(),
            df=df,
            agent_type=AgentType.ZERO_SHOT_REACT_DESCRIPTION,
            verbose=True,
        )
        # Execute query
        return agent.run(query)
    except Exception as e:
        # Deliberate best effort: any agent failure falls back to basic operations.
        return _fallback_answer(df, query, e)


def _upload_signature(uploaded_file):
    """Return a value identifying the uploaded file across script reruns."""
    # Newer Streamlit versions expose a per-upload id; otherwise use name + size.
    file_id = getattr(uploaded_file, "file_id", None)
    if file_id is not None:
        return file_id
    return (uploaded_file.name, uploaded_file.size)


def _load_dataframe(uploaded_file):
    """Parse the upload once per file and refresh the state derived from it."""
    signature = _upload_signature(uploaded_file)
    if st.session_state.df is None or st.session_state.upload_id != signature:
        # Rewind in case an earlier (failed) read consumed the buffer.
        uploaded_file.seek(0)
        df = pd.read_csv(uploaded_file)
        st.session_state.df = df
        st.session_state.upload_id = signature
        # Suggestions and the cached report belong to the previous file.
        st.session_state.suggestions = generate_suggestions(df)
        st.session_state.report_html = None
    return st.session_state.df


def _queue_question(question=None):
    """Widget callback: queue a question to be answered on the next run.

    With *question* given (suggestion button) the text box is filled with it;
    otherwise (text box change) the text box's current value is queued.
    """
    # Writing a widget's state is only allowed here, before the widget is
    # instantiated in the upcoming script run.
    if question is not None:
        st.session_state.question_input = question
    st.session_state.pending_question = st.session_state.get("question_input", "").strip()


def _clear_chat():
    """Widget callback: drop the conversation and reset the question box."""
    st.session_state.chat_history = []
    st.session_state.question_input = ""
    st.session_state.pending_question = ""


def _render_profiling_tab(df):
    """Render summary metrics, a data preview and the profiling report."""
    st.header("Data Profiling")

    # Basic info
    col1, col2, col3 = st.columns(3)
    with col1:
        st.metric("Rows", df.shape[0])
    with col2:
        st.metric("Columns", df.shape[1])
    with col3:
        # Cast the numpy integer to a plain int for st.metric.
        st.metric("Missing Values", int(df.isna().sum().sum()))

    # Show raw data sample
    with st.expander("Preview Data"):
        st.dataframe(df.head(10))

    # Build the report only once per upload; chat interactions rerun the
    # script and must not trigger a new profiling pass.
    if st.session_state.report_html is None:
        profile = generate_profile_report(df)
        st.session_state.report_html = profile.to_html()
    report_html = st.session_state.report_html
    st.components.v1.html(report_html, height=1000, scrolling=True)

    # Provide download button
    st.write("### Download the Profiling Report")
    report_bytes = report_html.encode('utf-8')
    st.download_button(
        label="Download Report (HTML)",
        data=report_bytes,
        file_name="profiling_report.html",
        mime="text/html",
    )


def _render_chat_tab(df):
    """Render the question box, suggestion buttons and conversation history."""
    st.header("Chat with Your Data")
    st.info("Ask questions about your data and get instant answers!")

    # Chat input and suggested questions
    st.text_input("Your question:", key="question_input", on_change=_queue_question)

    # Show suggestion chips
    st.write("Suggested questions:")
    cols = st.columns(2)
    for i, suggestion in enumerate(st.session_state.suggestions):
        with cols[i % 2]:
            st.button(
                suggestion,
                key=f"suggestion_{i}",
                on_click=_queue_question,
                args=(suggestion,),
            )

    # Process the queued question exactly once, then clear the queue so that
    # unrelated reruns do not ask it again.
    user_question = st.session_state.pending_question
    if user_question:
        st.session_state.pending_question = ""
        st.session_state.chat_history.append({"role": "user", "content": user_question})

        # Get answer
        with st.spinner("Thinking..."):
            answer = execute_pandas_query(df, user_question)

        # Add answer to chat history
        st.session_state.chat_history.append({"role": "assistant", "content": answer})

    # Display chat history
    st.write("### Conversation History")
    for message in st.session_state.chat_history:
        if message["role"] == "user":
            st.markdown(f"**You:** {message['content']}")
        else:
            st.markdown(f"**Assistant:** {message['content']}")
            st.markdown("---")

    # Clear chat button
    st.button("Clear Chat History", on_click=_clear_chat)


def _render_landing():
    """Render the placeholder content shown before a file is uploaded."""
    st.info("👆 Please upload a CSV file to begin.")

    # Placeholder visuals
    st.markdown("### What you can do with this app:")
    col1, col2 = st.columns(2)
    with col1:
        st.markdown("**📊 Data Profiling**")
        st.markdown("- Automatic data quality assessment")
        st.markdown("- Column statistics and distributions")
        st.markdown("- Correlation analysis")
        st.markdown("- Missing values analysis")
    with col2:
        st.markdown("**💬 Interactive Data Chat**")
        st.markdown("- Ask natural language questions")
        st.markdown("- Get instant insights")
        st.markdown("- Suggested questions for quick exploration")
        st.markdown("- No coding required!")


# Main app header
st.title("🔍 Interactive Data Profiler & Chat")
st.markdown("""
Upload your CSV file to get detailed profiling and ask questions about your data!
This app combines interactive data profiling with a chat interface for data exploration.
""")

# File uploader
uploaded_file = st.file_uploader("Upload a CSV file", type="csv")

# Process uploaded file
if uploaded_file is not None:
    try:
        df = _load_dataframe(uploaded_file)
        st.success(f"✅ File uploaded successfully! Found {df.shape[0]} rows and {df.shape[1]} columns.")

        # Create tabs for different functionalities
        tab1, tab2 = st.tabs(["📊 Data Profiling", "💬 Data Chat"])

        # Tab 1: Data Profiling
        with tab1:
            _render_profiling_tab(df)

        # Tab 2: Interactive Chat
        with tab2:
            _render_chat_tab(df)
    except Exception as e:
        # Top-level UI boundary: surface the failure in the page.
        st.error(f"An error occurred: {str(e)}")
else:
    _render_landing()