|
import streamlit as st |
|
import os |
|
import google.generativeai as genai |
|
from huggingface_hub import hf_hub_download |
|
import base64 |
|
|
|
# Identifier of the Gemini model used for every request made by this app.
MODEL_ID = "gemini-2.0-flash-exp"

# Configure the Gemini client with the key read from the GEMINI_API_KEY
# environment variable.
try:
    api_key = os.getenv("GEMINI_API_KEY")
    model_id = MODEL_ID  # module-level alias of MODEL_ID, kept for compatibility
    genai.configure(api_key=api_key)
except Exception as e:
    st.error(f"Error: {e}")
    # st.stop has to be *called*: the bare attribute reference used before was
    # a no-op, so the script kept running after a configuration failure.
    st.stop()

# Model handle and the chat session that multimodal_prompt() sends messages to.
model = genai.GenerativeModel(MODEL_ID)
chat = model.start_chat()
|
|
|
def download_pdf():
    """Fetch the source PDF from the Hugging Face Hub and return its local path.

    The file ``Unsupervised_Learning_Algorithms.pdf`` is requested from the
    dataset repository ``louiecerv/vqa_machine_learning_dataset`` using the
    token found in the ``HF_TOKEN`` environment variable.

    Returns:
        The path returned by ``hf_hub_download``.

    If the download raises, the error is shown with ``st.error`` and
    ``st.stop()`` is called.
    """
    try:
        return hf_hub_download(
            repo_id="louiecerv/vqa_machine_learning_dataset",
            filename="Unsupervised_Learning_Algorithms.pdf",
            token=os.getenv("HF_TOKEN"),
            repo_type="dataset",
        )
    except Exception as exc:
        st.error(f"Failed to download PDF from Hugging Face Hub: {exc}")
        st.stop()
|
|
|
|
|
# Seed the per-session values the rest of the script reads. Keys that already
# exist in st.session_state are left untouched.
for _key, _initial in (("conversation_history", []), ("uploaded_file_part", None)):
    if _key not in st.session_state:
        st.session_state[_key] = _initial

# The PDF is only downloaded when no path has been stored for this session yet.
if "uploaded_pdf_path" not in st.session_state:
    st.session_state["uploaded_pdf_path"] = download_pdf()
|
|
|
def multimodal_prompt(pdf_path, text_prompt):
    """Ask Gemini a question about the PDF and return the answer text.

    The PDF is uploaded with ``genai.upload_file`` only when
    ``st.session_state.uploaded_file_part`` is still ``None``; the uploaded
    handle is then stored there and reused on later calls. The prompt and the
    file handle are sent through the module-level ``chat`` session, and both
    the user prompt and the model's reply are appended to
    ``st.session_state.conversation_history``.

    Args:
        pdf_path: The path to the PDF file.
        text_prompt: The text prompt for the model.

    Returns:
        The model's response text, or a string of the form
        ``"An error occurred: ..."`` if any step raised an exception.
    """
    try:
        state = st.session_state

        # Upload once per session; afterwards reuse the stored file handle.
        if state.uploaded_file_part is None:
            state.uploaded_file_part = genai.upload_file(pdf_path, mime_type="application/pdf")

        response = chat.send_message([text_prompt, state.uploaded_file_part])

        # The user turn is recorded before the reply text is read, so it is
        # kept in the history even if reading the reply fails.
        history = state.conversation_history
        history.append({"role": "user", "content": text_prompt, "has_pdf": True})
        answer = response.text
        history.append({"role": "assistant", "content": answer})
        return answer
    except Exception as exc:
        return f"An error occurred: {exc}"
|
|
|
def display_download_button(file_path, file_name):
    """Render an HTML link that downloads the file at *file_path*.

    The file is read as bytes, base64-encoded and embedded in a
    ``data:application/pdf`` URI inside an ``<a>`` tag, which is emitted with
    ``st.markdown(..., unsafe_allow_html=True)``.

    Args:
        file_path: Path of the file to offer for download.
        file_name: Value placed in the link's ``download`` attribute.

    A missing file or any other exception is reported with ``st.error``
    instead of being raised.
    """
    try:
        with open(file_path, "rb") as pdf_file:
            encoded = base64.b64encode(pdf_file.read()).decode()
        link = f'<a href="data:application/pdf;base64,{encoded}" download="{file_name}">Download the source document (PDF)</a>'
        st.markdown(link, unsafe_allow_html=True)
    except FileNotFoundError:
        st.error("File not found for download.")
    except Exception as exc:
        st.error(f"Error during download: {exc}")
|
|
|
|
|
# Topics offered in the sidebar selector; each one has a matching list of
# preset questions further down in the script.
models = [
    "K-Means Clustering",
    "Hierarchical Clustering",
    "DBSCAN",
    "Gaussian Mixture Models",
    "Principal Component Analysis (PCA)",
    "t-Distributed Stochastic Neighbor Embedding",
    "Autoencoders",
    "Self-Organizing Maps (SOM)",
    "Association Rule Learning",
]

# Sidebar: app title and the topic picker.
sidebar = st.sidebar
sidebar.title("🤖 Visual Q and A")
selected_model = sidebar.selectbox("Select the ML Model", models)
|
|
|
|
|
# Main page heading.
st.title("📚 VQA on the Unsupervised Machine Learning Algorithms")

# Help text shown inside the collapsible "How to use this App" section.
about = """

**How to use this App**
This app leverages Gemini 2.0 to provide insights on the provided document.
Select a question from the dropdown menu or enter your own question to get
Gemini's generated response based on the provided document.
"""

st.expander("How to use this App").markdown(about)

st.header("Questions and Answers")
|
|
|
|
|
# Preset questions for each topic, keyed by the label used in the sidebar
# selector.
_QUESTION_BANK = {
    "K-Means Clustering": [
        "What is the fundamental objective of the K-Means clustering algorithm, and how does it achieve this objective?",
        "Explain the concept of 'inertia' in the context of K-Means clustering and its role in the algorithm's operation.",
        "Describe the four key steps involved in the K-Means clustering process, providing details about each step.",
        "What are the main advantages and disadvantages of using the K-Means clustering algorithm?",
        "How does the selection of the 'k' value (number of clusters) influence the results of K-Means clustering? What are some common methods for determining the optimal 'k'?",
        "Discuss the issue of sensitivity to initialization in K-Means clustering. How can this sensitivity affect the clustering results, and what strategies can be employed to mitigate this issue?",
        "Explain why K-Means clustering might struggle with datasets containing clusters of varying shapes and densities. Are there any modifications or alternative algorithms that can address this limitation?",
        "How can outliers impact the performance of K-Means clustering? Discuss techniques for identifying and handling outliers in the context of this algorithm.",
        "Describe several real-world applications where K-Means clustering can be effectively utilized, providing specific examples.",
        "Compare and contrast K-Means clustering with other unsupervised learning algorithms, such as hierarchical clustering or DBSCAN, highlighting their relative strengths and weaknesses.",
    ],
    "Hierarchical Clustering": [
        "What is the primary objective of hierarchical clustering, and how does it differ from other clustering techniques like k-means?",
        "Explain the difference between the agglomerative and divisive approaches to hierarchical clustering, and provide a real-world example where each approach might be preferred.",
        "Describe the concept of 'linkage criteria' in hierarchical clustering. Discuss the three common types of linkage (single, complete, and average) and how they influence cluster formation.",
        "How can a dendrogram be used to interpret the results of hierarchical clustering? What information can you glean from its structure and branch lengths?",
        "Discuss the advantages and disadvantages of hierarchical clustering compared to other unsupervised learning methods. When might you choose hierarchical clustering over k-means or DBSCAN?",
        "How does the choice of distance metric affect the results of hierarchical clustering? Explain the impact of using different distance metrics like Euclidean, Manhattan, and cosine distance.",
        "Hierarchical clustering can be sensitive to noise and outliers. How can you identify and address these issues when applying this technique?",
        "Explain how hierarchical clustering can be used for exploratory data analysis. Provide an example of how you might use it to gain insights into a new dataset.",
        "Discuss the computational complexity of hierarchical clustering. How does it scale with the number of data points, and what are some strategies for handling large datasets?",
        "Can hierarchical clustering be used with categorical data? If so, how would you adapt the distance metric and linkage criteria to handle such data?",
    ],
    "DBSCAN": [
        "What are the core differences between DBSCAN and traditional clustering algorithms like K-Means, and how do these differences impact the types of data structures they can effectively cluster?",
        "Explain the concept of density-based clustering and how DBSCAN utilizes this concept to identify clusters.",
        "How does DBSCAN handle outliers, and why is this approach beneficial in certain datasets compared to other clustering techniques?",
        "What are the two key parameters in DBSCAN, and how do they influence the clustering outcome?",
        "Describe the process of identifying core points, border points, and noise points in DBSCAN.",
        "Discuss the advantages and disadvantages of using DBSCAN, particularly its ability to handle arbitrarily shaped clusters and its sensitivity to parameter settings.",
        "In what scenarios would DBSCAN be a more suitable choice than K-Means or hierarchical clustering?",
        "How does DBSCAN's ability to identify noise contribute to its effectiveness in anomaly detection tasks?",
        "What are some real-world applications of DBSCAN, and how does its density-based approach address the specific challenges of these applications?",
        "How does DBSCAN compare to other density-based clustering algorithms, and what factors might lead you to choose DBSCAN over alternative methods?",
    ],
    "Gaussian Mixture Models": [
        "Explain the underlying assumption of Gaussian Mixture Models (GMMs) and how it differs from the assumptions made by K-Means clustering.",
        "Describe the role of Gaussian distributions in GMMs and how they contribute to the model's flexibility in capturing cluster shapes.",
        "How does the Expectation-Maximization (EM) algorithm facilitate the estimation of parameters in GMMs?",
        "What are the advantages of using GMMs over K-Means for clustering data with varying shapes and densities?",
        "Explain the concept of 'soft clustering' in GMMs and how it provides a more nuanced understanding of cluster assignments compared to 'hard clustering' methods.",
        "How can GMMs be used for density estimation, and what are the benefits of this probabilistic approach?",
        "Discuss the challenges associated with initializing GMMs and the potential impact on the final clustering results.",
        "In what situations might GMMs be a preferred choice over other clustering algorithms, considering their strengths and weaknesses?",
        "How does the concept of 'responsibility' in the E-step of the EM algorithm help in assigning data points to Gaussian components?",
        "Provide examples of real-world applications where GMMs have been successfully employed for clustering or density estimation tasks.",
    ],
    "Principal Component Analysis (PCA)": [
        "How does PCA achieve dimensionality reduction, and what are the key mathematical concepts involved in this process?",
        "Explain the role of eigenvectors and eigenvalues in PCA, and how they contribute to identifying principal components.",
        "What are the benefits of using PCA for dimensionality reduction, particularly in the context of large datasets?",
        "How does PCA help in addressing the curse of dimensionality, and why is this important in machine learning?",
        "Describe the steps involved in performing PCA, including data standardization and the selection of principal components.",
        "Discuss the limitations of PCA, such as its linearity assumption and potential issues with interpretability.",
        "In what situations might PCA not be suitable for dimensionality reduction, and what alternative techniques could be considered?",
        "How can PCA be used to improve the performance of other machine learning algorithms, and what types of algorithms benefit most from this preprocessing step?",
        "What are some real-world applications of PCA, and how does its ability to reduce dimensionality contribute to solving these problems?",
        "How does PCA compare to other dimensionality reduction techniques, and what factors would influence your choice between PCA and alternative methods?",
    ],
    "Self-Organizing Maps (SOM)": [
        "Explain the concept of a Self-Organizing Map (SOM) and its role in unsupervised learning.",
        "Describe the structure of a SOM, including its layers and the connections between neurons.",
        "How does the competitive learning process work in a SOM, and how is the Best Matching Unit (BMU) determined?",
        "Explain the process of weight adaptation in a SOM and how it leads to the formation of a topological map.",
        "What are the key parameters involved in training a SOM, and how do they affect the resulting map?",
        "Discuss the advantages and disadvantages of using SOMs for dimensionality reduction and visualization.",
        "How does a SOM preserve the topological properties of the input data, and why is this important?",
        "What are some common applications of SOMs in fields like data analysis, image processing, and pattern recognition?",
        "Compare and contrast SOMs with other unsupervised learning techniques such as K-Means clustering and Principal Component Analysis (PCA).",
        "How can SOMs be used for clustering and classification tasks, and what are the limitations of this approach?",
    ],
    "t-Distributed Stochastic Neighbor Embedding": [
        "What is the primary objective of t-SNE, and how does it differ from the goals of principal component analysis (PCA)?",
        "Explain the concept of 'perplexity' in t-SNE and its role in balancing local and global structure preservation.",
        "How does t-SNE use probability distributions to represent relationships between data points in high-dimensional and low-dimensional spaces?",
        "Describe the optimization process in t-SNE and the challenges associated with minimizing the Kullback-Leibler divergence.",
        "What are the advantages of t-SNE over linear dimensionality reduction techniques like PCA, particularly for visualizing complex datasets?",
        "Discuss the limitations of t-SNE, including its computational cost and sensitivity to parameter settings.",
        "How does the 'crowding problem' affect t-SNE visualizations, and what strategies can be used to mitigate this issue?",
        "In what situations would t-SNE be the preferred choice for dimensionality reduction and visualization compared to other techniques?",
        "Provide examples of real-world applications where t-SNE has been successfully used to gain insights from high-dimensional data.",
        "How can t-SNE be combined with other machine learning techniques, such as clustering or classification, to improve data analysis and visualization?",
    ],
    "Autoencoders": [
        "What is the fundamental purpose of an autoencoder, and how does it differ from other unsupervised learning techniques like clustering or dimensionality reduction?",
        "Describe the two main components of an autoencoder and their respective roles in the learning process.",
        "Explain the concept of a latent space representation in the context of autoencoders. How does this representation contribute to dimensionality reduction and feature extraction?",
        "How does the training process of an autoencoder work, and what is the significance of minimizing reconstruction error?",
        "What are the advantages of using autoencoders for non-linear dimensionality reduction compared to linear techniques like PCA?",
        "Discuss how autoencoders can be applied to tasks such as denoising and anomaly detection.",
        "What are some potential challenges or drawbacks of using autoencoders, such as overfitting or the need for large datasets?",
        "How can techniques like regularization help to mitigate the risk of overfitting in autoencoders?",
        "Explain how the flexibility of autoencoders allows them to be adapted to various architectures and applications.",
        "Can you provide examples of real-world applications where autoencoders have been successfully used for dimensionality reduction, feature extraction, or other unsupervised learning tasks?",
    ],
    "Association Rule Learning": [
        "What is the primary goal of Association Rule Learning, and how does it differ from other unsupervised learning techniques like clustering or dimensionality reduction?",
        "Explain the concept of 'support' and 'confidence' in Association Rule Learning, and how these metrics are used to evaluate the strength of an association rule.",
        "Describe the Apriori algorithm, focusing on its key steps for generating frequent itemsets and association rules.",
        "How does the Apriori algorithm address the challenge of computational complexity when dealing with a large number of possible itemsets?",
        "What are the advantages and disadvantages of using Association Rule Learning, particularly in terms of interpretability and computational cost?",
        "In what real-world scenarios is Association Rule Learning most applicable, and what types of insights can be gained from its application?",
        "How does the choice of support and confidence thresholds impact the number and quality of discovered rules, and what factors should be considered when setting these thresholds?",
        "What are some potential challenges or limitations of Association Rule Learning, such as dealing with rare items or handling continuous variables?",
        "How can Association Rule Learning be used in conjunction with other data mining or machine learning techniques to enhance its effectiveness?",
        "Discuss the ethical considerations surrounding the application of Association Rule Learning, particularly in areas like customer privacy and targeted advertising.",
    ],
}

# Pick the question list for the chosen topic. As with the chain of separate
# `if` statements this replaces, `questions` is only assigned when the
# selected topic has an entry.
if selected_model in _QUESTION_BANK:
    questions = _QUESTION_BANK[selected_model]
|
|
|
|
|
# Preset question picker for the currently selected topic.
selected_question = st.selectbox("Choose a question", questions)

# Optionally replace the preset question with free text typed by the user.
if st.checkbox('Check this box to ask a question not listed above'):
    selected_question = st.text_input('Enter a question')

if st.button("Ask AI"):
    # The free-text box may still be empty (or whitespace only); do not send
    # a prompt without an actual question to the model.
    question = (selected_question or "").strip()
    if not question:
        st.warning("Please enter a question before asking the AI.")
    else:
        with st.spinner("AI is thinking..."):
            # Fetch the PDF again if no path is stored for this session.
            if st.session_state.uploaded_pdf_path is None:
                st.session_state.uploaded_pdf_path = download_pdf()

            filepath = st.session_state.uploaded_pdf_path
            # Prompt wording fixed: the previous literal contained the typos
            # "rhe" and "sourcss" and a run-on first clause.
            text_prompt = (
                f"Use the provided document, focusing on the topic: {selected_model}, "
                f"to answer the following question: {question}. "
                "Use your own knowledge as well as sources from the web and the "
                "provided document. Always cite your sources."
            )
            response = multimodal_prompt(filepath, text_prompt)
            st.markdown(f"**Question:** {question}")
            st.markdown(f"**Response:** {response}")

# Offer the source PDF for download whenever a local path is available.
if st.session_state.uploaded_pdf_path:
    display_download_button(st.session_state.uploaded_pdf_path, "Unsupervised_Learning_Algorithms.pdf")

# Page footer.
st.markdown("[Visit our Hugging Face Space!](https://huggingface.co/wvsuaidev)")
st.markdown("© 2025 WVSU AI Dev Team 🤖 ✨")