# louiecerv's picture
# Sync with remote
# 815ea8d
import streamlit as st
import os
import google.generativeai as genai
from huggingface_hub import hf_hub_download
import base64
# Identifier of the Gemini model used for every request. Keep the model ID as is.
MODEL_ID = "gemini-2.0-flash-exp"

# Configure the Gemini client with the key read from the environment.
try:
    api_key = os.getenv("GEMINI_API_KEY")
    model_id = MODEL_ID  # alias retained from the original script; not read below
    genai.configure(api_key=api_key)
except Exception as e:
    st.error(f"Error: {e}")
    # st.stop must be *called*: a bare `st.stop` is just an attribute lookup,
    # so the script would fall through and keep running after the error.
    st.stop()

# Model handle and chat session that multimodal_prompt() sends messages through.
model = genai.GenerativeModel(MODEL_ID)
chat = model.start_chat()
def download_pdf():
    """Fetch the source PDF from the Hugging Face Hub dataset repository.

    The access token is read from the ``HF_TOKEN`` environment variable.

    Returns:
        The local file path reported by ``hf_hub_download``.

    On any exception the error is shown with ``st.error`` and ``st.stop()``
    is called.
    """
    try:
        return hf_hub_download(
            repo_id="louiecerv/vqa_machine_learning_dataset",
            filename="Unsupervised_Learning_Algorithms.pdf",
            token=os.getenv("HF_TOKEN"),
            repo_type="dataset",
        )
    except Exception as err:
        st.error(f"Failed to download PDF from Hugging Face Hub: {err}")
        st.stop()  # Stop if the download fails
# Seed the per-session state on first use: the conversation transcript starts
# empty and no uploaded file *part* has been stored yet.
for _state_key, _initial_value in (
    ("conversation_history", []),
    ("uploaded_file_part", None),
):
    if _state_key not in st.session_state:
        st.session_state[_state_key] = _initial_value

# Download the PDF only when no path has been stored for this session.
if "uploaded_pdf_path" not in st.session_state:
    st.session_state["uploaded_pdf_path"] = download_pdf()
def multimodal_prompt(pdf_path, text_prompt):
    """Ask Gemini a question about the PDF through the module-level chat.

    The PDF is uploaded with ``genai.upload_file`` only when
    ``st.session_state.uploaded_file_part`` is still ``None``; the stored
    result is then attached to this and every later message.

    Args:
        pdf_path: The path to the PDF file.
        text_prompt: The text prompt for the model.

    Returns:
        The model's response text, or a string starting with
        "An error occurred:" if any exception is raised.
    """
    state = st.session_state
    try:
        # Upload once and remember the returned file object for later turns.
        if state.uploaded_file_part is None:
            state.uploaded_file_part = genai.upload_file(
                pdf_path, mime_type="application/pdf"
            )

        reply = chat.send_message([text_prompt, state.uploaded_file_part])

        # Record both sides of the exchange in the session transcript.
        state.conversation_history.append(
            {"role": "user", "content": text_prompt, "has_pdf": True}
        )
        answer = reply.text
        state.conversation_history.append({"role": "assistant", "content": answer})
        return answer
    except Exception as err:
        return f"An error occurred: {err}"
def display_download_button(file_path, file_name):
    """Render an HTML link that downloads the file from an inline data URI.

    The file is read in binary mode, base64-encoded, and embedded in an
    ``<a>`` tag rendered via ``st.markdown`` with ``unsafe_allow_html=True``.

    Args:
        file_path: Path of the file to read from disk.
        file_name: Value placed in the link's ``download`` attribute.

    A missing file, or any other exception, is reported with ``st.error``.
    """
    try:
        with open(file_path, "rb") as source:
            encoded = base64.b64encode(source.read()).decode()
        link_html = (
            f'<a href="data:application/pdf;base64,{encoded}" download="{file_name}">'
            "Download the source document (PDF)</a>"
        )
        st.markdown(link_html, unsafe_allow_html=True)
    except FileNotFoundError:
        st.error("File not found for download.")
    except Exception as err:
        st.error(f"Error during download: {err}")
# Topics offered in the sidebar selector.
models = [
    "K-Means Clustering",
    "Hierarchical Clustering",
    "DBSCAN",
    "Gaussian Mixture Models",
    "Principal Component Analysis (PCA)",
    "t-Distributed Stochastic Neighbor Embedding",
    "Autoencoders",
    "Self-Organizing Maps (SOM)",
    "Association Rule Learning",
]

# --- Sidebar ---
st.sidebar.title("🤖 Visual Q and A")
selected_model = st.sidebar.selectbox("Select the ML Model", models)

# --- Main Page ---
st.title("📚 VQA on the Unsupervised Machine Learning Algorithms")

# Usage notes shown inside a collapsible expander.
about = """
**How to use this App**
This app leverages Gemini 2.0 to provide insights on the provided document.
Select a question from the dropdown menu or enter your own question to get
Gemini's generated response based on the provided document.
"""
st.expander("How to use this App").markdown(about)

# --- Q and A Tab ---
st.header("Questions and Answers")
# Suggested questions for each selectable topic (ten per topic), keyed by the
# exact label shown in the sidebar selector.
QUESTIONS_BY_MODEL = {
    "K-Means Clustering": [
        "What is the fundamental objective of the K-Means clustering algorithm, and how does it achieve this objective?",
        "Explain the concept of 'inertia' in the context of K-Means clustering and its role in the algorithm's operation.",
        "Describe the four key steps involved in the K-Means clustering process, providing details about each step.",
        "What are the main advantages and disadvantages of using the K-Means clustering algorithm?",
        "How does the selection of the 'k' value (number of clusters) influence the results of K-Means clustering? What are some common methods for determining the optimal 'k'?",
        "Discuss the issue of sensitivity to initialization in K-Means clustering. How can this sensitivity affect the clustering results, and what strategies can be employed to mitigate this issue?",
        "Explain why K-Means clustering might struggle with datasets containing clusters of varying shapes and densities. Are there any modifications or alternative algorithms that can address this limitation?",
        "How can outliers impact the performance of K-Means clustering? Discuss techniques for identifying and handling outliers in the context of this algorithm.",
        "Describe several real-world applications where K-Means clustering can be effectively utilized, providing specific examples.",
        "Compare and contrast K-Means clustering with other unsupervised learning algorithms, such as hierarchical clustering or DBSCAN, highlighting their relative strengths and weaknesses.",
    ],
    "Hierarchical Clustering": [
        "What is the primary objective of hierarchical clustering, and how does it differ from other clustering techniques like k-means?",
        "Explain the difference between the agglomerative and divisive approaches to hierarchical clustering, and provide a real-world example where each approach might be preferred.",
        "Describe the concept of 'linkage criteria' in hierarchical clustering. Discuss the three common types of linkage (single, complete, and average) and how they influence cluster formation.",
        "How can a dendrogram be used to interpret the results of hierarchical clustering? What information can you glean from its structure and branch lengths?",
        "Discuss the advantages and disadvantages of hierarchical clustering compared to other unsupervised learning methods. When might you choose hierarchical clustering over k-means or DBSCAN?",
        "How does the choice of distance metric affect the results of hierarchical clustering? Explain the impact of using different distance metrics like Euclidean, Manhattan, and cosine distance.",
        "Hierarchical clustering can be sensitive to noise and outliers. How can you identify and address these issues when applying this technique?",
        "Explain how hierarchical clustering can be used for exploratory data analysis. Provide an example of how you might use it to gain insights into a new dataset.",
        "Discuss the computational complexity of hierarchical clustering. How does it scale with the number of data points, and what are some strategies for handling large datasets?",
        "Can hierarchical clustering be used with categorical data? If so, how would you adapt the distance metric and linkage criteria to handle such data?",
    ],
    "DBSCAN": [
        "What are the core differences between DBSCAN and traditional clustering algorithms like K-Means, and how do these differences impact the types of data structures they can effectively cluster?",
        "Explain the concept of density-based clustering and how DBSCAN utilizes this concept to identify clusters.",
        "How does DBSCAN handle outliers, and why is this approach beneficial in certain datasets compared to other clustering techniques?",
        "What are the two key parameters in DBSCAN, and how do they influence the clustering outcome?",
        "Describe the process of identifying core points, border points, and noise points in DBSCAN.",
        "Discuss the advantages and disadvantages of using DBSCAN, particularly its ability to handle arbitrarily shaped clusters and its sensitivity to parameter settings.",
        "In what scenarios would DBSCAN be a more suitable choice than K-Means or hierarchical clustering?",
        "How does DBSCAN's ability to identify noise contribute to its effectiveness in anomaly detection tasks?",
        "What are some real-world applications of DBSCAN, and how does its density-based approach address the specific challenges of these applications?",
        "How does DBSCAN compare to other density-based clustering algorithms, and what factors might lead you to choose DBSCAN over alternative methods?",
    ],
    "Gaussian Mixture Models": [
        "Explain the underlying assumption of Gaussian Mixture Models (GMMs) and how it differs from the assumptions made by K-Means clustering.",
        "Describe the role of Gaussian distributions in GMMs and how they contribute to the model's flexibility in capturing cluster shapes.",
        "How does the Expectation-Maximization (EM) algorithm facilitate the estimation of parameters in GMMs?",
        "What are the advantages of using GMMs over K-Means for clustering data with varying shapes and densities?",
        "Explain the concept of 'soft clustering' in GMMs and how it provides a more nuanced understanding of cluster assignments compared to 'hard clustering' methods.",
        "How can GMMs be used for density estimation, and what are the benefits of this probabilistic approach?",
        "Discuss the challenges associated with initializing GMMs and the potential impact on the final clustering results.",
        "In what situations might GMMs be a preferred choice over other clustering algorithms, considering their strengths and weaknesses?",
        "How does the concept of 'responsibility' in the E-step of the EM algorithm help in assigning data points to Gaussian components?",
        "Provide examples of real-world applications where GMMs have been successfully employed for clustering or density estimation tasks.",
    ],
    "Principal Component Analysis (PCA)": [
        "How does PCA achieve dimensionality reduction, and what are the key mathematical concepts involved in this process?",
        "Explain the role of eigenvectors and eigenvalues in PCA, and how they contribute to identifying principal components.",
        "What are the benefits of using PCA for dimensionality reduction, particularly in the context of large datasets?",
        "How does PCA help in addressing the curse of dimensionality, and why is this important in machine learning?",
        "Describe the steps involved in performing PCA, including data standardization and the selection of principal components.",
        "Discuss the limitations of PCA, such as its linearity assumption and potential issues with interpretability.",
        "In what situations might PCA not be suitable for dimensionality reduction, and what alternative techniques could be considered?",
        "How can PCA be used to improve the performance of other machine learning algorithms, and what types of algorithms benefit most from this preprocessing step?",
        "What are some real-world applications of PCA, and how does its ability to reduce dimensionality contribute to solving these problems?",
        "How does PCA compare to other dimensionality reduction techniques, and what factors would influence your choice between PCA and alternative methods?",
    ],
    "Self-Organizing Maps (SOM)": [
        "Explain the concept of a Self-Organizing Map (SOM) and its role in unsupervised learning.",
        "Describe the structure of a SOM, including its layers and the connections between neurons.",
        "How does the competitive learning process work in a SOM, and how is the Best Matching Unit (BMU) determined?",
        "Explain the process of weight adaptation in a SOM and how it leads to the formation of a topological map.",
        "What are the key parameters involved in training a SOM, and how do they affect the resulting map?",
        "Discuss the advantages and disadvantages of using SOMs for dimensionality reduction and visualization.",
        "How does a SOM preserve the topological properties of the input data, and why is this important?",
        "What are some common applications of SOMs in fields like data analysis, image processing, and pattern recognition?",
        "Compare and contrast SOMs with other unsupervised learning techniques such as K-Means clustering and Principal Component Analysis (PCA).",
        "How can SOMs be used for clustering and classification tasks, and what are the limitations of this approach?",
    ],
    "t-Distributed Stochastic Neighbor Embedding": [
        "What is the primary objective of t-SNE, and how does it differ from the goals of principal component analysis (PCA)?",
        "Explain the concept of 'perplexity' in t-SNE and its role in balancing local and global structure preservation.",
        "How does t-SNE use probability distributions to represent relationships between data points in high-dimensional and low-dimensional spaces?",
        "Describe the optimization process in t-SNE and the challenges associated with minimizing the Kullback-Leibler divergence.",
        "What are the advantages of t-SNE over linear dimensionality reduction techniques like PCA, particularly for visualizing complex datasets?",
        "Discuss the limitations of t-SNE, including its computational cost and sensitivity to parameter settings.",
        "How does the 'crowding problem' affect t-SNE visualizations, and what strategies can be used to mitigate this issue?",
        "In what situations would t-SNE be the preferred choice for dimensionality reduction and visualization compared to other techniques?",
        "Provide examples of real-world applications where t-SNE has been successfully used to gain insights from high-dimensional data.",
        "How can t-SNE be combined with other machine learning techniques, such as clustering or classification, to improve data analysis and visualization?",
    ],
    "Autoencoders": [
        "What is the fundamental purpose of an autoencoder, and how does it differ from other unsupervised learning techniques like clustering or dimensionality reduction?",
        "Describe the two main components of an autoencoder and their respective roles in the learning process.",
        "Explain the concept of a latent space representation in the context of autoencoders. How does this representation contribute to dimensionality reduction and feature extraction?",
        "How does the training process of an autoencoder work, and what is the significance of minimizing reconstruction error?",
        "What are the advantages of using autoencoders for non-linear dimensionality reduction compared to linear techniques like PCA?",
        "Discuss how autoencoders can be applied to tasks such as denoising and anomaly detection.",
        "What are some potential challenges or drawbacks of using autoencoders, such as overfitting or the need for large datasets?",
        "How can techniques like regularization help to mitigate the risk of overfitting in autoencoders?",
        "Explain how the flexibility of autoencoders allows them to be adapted to various architectures and applications.",
        "Can you provide examples of real-world applications where autoencoders have been successfully used for dimensionality reduction, feature extraction, or other unsupervised learning tasks?",
    ],
    "Association Rule Learning": [
        "What is the primary goal of Association Rule Learning, and how does it differ from other unsupervised learning techniques like clustering or dimensionality reduction?",
        "Explain the concept of 'support' and 'confidence' in Association Rule Learning, and how these metrics are used to evaluate the strength of an association rule.",
        "Describe the Apriori algorithm, focusing on its key steps for generating frequent itemsets and association rules.",
        "How does the Apriori algorithm address the challenge of computational complexity when dealing with a large number of possible itemsets?",
        "What are the advantages and disadvantages of using Association Rule Learning, particularly in terms of interpretability and computational cost?",
        "In what real-world scenarios is Association Rule Learning most applicable, and what types of insights can be gained from its application?",
        "How does the choice of support and confidence thresholds impact the number and quality of discovered rules, and what factors should be considered when setting these thresholds?",
        "What are some potential challenges or limitations of Association Rule Learning, such as dealing with rare items or handling continuous variables?",
        "How can Association Rule Learning be used in conjunction with other data mining or machine learning techniques to enhance its effectiveness?",
        "Discuss the ethical considerations surrounding the application of Association Rule Learning, particularly in areas like customer privacy and targeted advertising.",
    ],
}

# Look up the list for the chosen topic. The empty-list default keeps
# `questions` bound even if a topic label has no entry in the table.
questions = QUESTIONS_BY_MODEL.get(selected_model, [])
# Offer the suggested questions for the chosen topic.
selected_question = st.selectbox("Choose a question", questions)

# When the checkbox is ticked, a free-text question replaces the dropdown choice.
if st.checkbox('Check this box to ask a question not listed above'):
    selected_question = st.text_input('Enter a question')

if st.button("Ask AI"):
    # An untouched text box yields an empty string; do not send a blank
    # question to the model.
    if not selected_question or not selected_question.strip():
        st.warning("Please enter a question before asking the AI.")
    else:
        with st.spinner("AI is thinking..."):
            # Retry the download if no PDF path is stored for this session.
            if st.session_state.uploaded_pdf_path is None:
                st.session_state.uploaded_pdf_path = download_pdf()
            filepath = st.session_state.uploaded_pdf_path
            # Prompt typos fixed ("rhe" -> "the", "sourcss" -> "sources").
            text_prompt = (
                f"Use the provided document focus on the topic: {selected_model} "
                f"to answer the following question: {selected_question}. "
                "Use your own knowledge as well as sources from the web and the provided document. "
                "Always cite your sources."
            )
            response = multimodal_prompt(filepath, text_prompt)  # Use the downloaded filepath
        st.markdown(f"**Question:** {selected_question}")
        st.markdown(f"**Response:** {response}")

# Offer the source document for download whenever a local copy is available.
if st.session_state.uploaded_pdf_path:
    display_download_button(st.session_state.uploaded_pdf_path, "Unsupervised_Learning_Algorithms.pdf")

# Footer
st.markdown("[Visit our Hugging Face Space!](https://huggingface.co/wvsuaidev)")
st.markdown("© 2025 WVSU AI Dev Team 🤖 ✨")