# sanaa-11's picture
# Upload app.py
# f51fb04 verified
from urllib.parse import quote

import streamlit as st
import wikipedia
import wikipediaapi
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import normalize
# Module-level wikipediaapi client (language "en"); fetch_related_articles
# uses it to look up each page by title.
wiki_wiki = wikipediaapi.Wikipedia(user_agent="LSI1/1.0 ([email protected])", language="en")
# Function to fetch related Wikipedia articles
def fetch_related_articles(query, max_articles=20):
    """Search Wikipedia for *query* and collect the text of each existing hit.

    Args:
        query: Free-text search string passed to ``wikipedia.search``.
        max_articles: Maximum number of search results to request.

    Returns:
        A dict mapping each result title to that page's ``text`` attribute,
        in search-result order. Titles whose page does not exist are left
        out. An empty dict is returned, without contacting Wikipedia, when
        *query* is empty or whitespace-only or *max_articles* is not positive.
    """
    # Nothing meaningful to search for: avoid sending an empty search to the
    # API and let the caller treat it as "no articles".
    if not query or not query.strip() or max_articles <= 0:
        return {}

    articles = {}
    for title in wikipedia.search(query, results=max_articles):
        page = wiki_wiki.page(title)
        if page.exists():
            articles[title] = page.text
    return articles
# Function to rank articles using SVD
def rank_articles(query, vectorizer, svd, lsi_matrix, titles):
    """Score every article against *query* in the reduced (LSI) space.

    The query is run through the already-fitted ``vectorizer`` and ``svd``,
    normalized, and compared with each row of ``lsi_matrix`` by cosine
    similarity.

    Args:
        query: The search string.
        vectorizer: Fitted object whose ``transform`` accepts a list of strings.
        svd: Fitted object whose ``transform`` accepts the vectorizer output.
        lsi_matrix: One row per article, in the same order as ``titles``.
        titles: Article titles, indexable by row position.

    Returns:
        A list of ``(title, similarity)`` tuples ordered from highest to
        lowest similarity.
    """
    query_vector = normalize(svd.transform(vectorizer.transform([query])))
    scores = cosine_similarity(query_vector, lsi_matrix).ravel()

    ranking = []
    # argsort is ascending; walk it backwards for best-match-first order.
    for position in scores.argsort()[::-1]:
        ranking.append((titles[position], scores[position]))
    return ranking
# Streamlit UI
PAGE_SIZE = 10  # Results shown initially and added per "Load More" click
MAX_LSI_COMPONENTS = 100  # Upper bound on latent dimensions kept by the SVD


def _build_ranking(query):
    """Fetch articles for *query* and rank them against it in LSI space.

    Returns the list of (title, similarity) tuples produced by
    ``rank_articles``, or an empty list when no articles were fetched.
    """
    articles = fetch_related_articles(query)
    if not articles:
        return []

    # Prepare data for TF-IDF
    titles = list(articles)
    contents = list(articles.values())

    # TF-IDF vectorization
    vectorizer = TfidfVectorizer(max_features=5000, stop_words="english")
    tfidf_matrix = vectorizer.fit_transform(contents)

    # SVD dimensionality reduction. The matrix is (documents x terms); asking
    # for more components than either dimension is meaningless and raises
    # when the vocabulary is smaller than the requested size, so cap it.
    n_components = min(MAX_LSI_COMPONENTS, *tfidf_matrix.shape)
    svd = TruncatedSVD(n_components=n_components)
    lsi_matrix = normalize(svd.fit_transform(tfidf_matrix))  # for cosine similarity

    return rank_articles(query, vectorizer, svd, lsi_matrix, titles)


def _show_more_results():
    """Button callback: reveal one more page of results.

    Runs before the script is re-executed, so the enlarged window is already
    in effect when the results are rendered.
    """
    st.session_state.end_index += PAGE_SIZE


st.title("Wikipedia Search with SVD")
st.write("Enter a search query to fetch and rank Wikipedia articles.")

# Input for query
search_query = st.text_input("Search Wikipedia:")

# Make sure every session key read below exists
if "previous_query" not in st.session_state:
    st.session_state.previous_query = None
if "ranked_results" not in st.session_state:
    st.session_state.ranked_results = None
if "end_index" not in st.session_state:
    st.session_state.end_index = PAGE_SIZE

# Detect query change and clear the cached ranking and pagination window
if search_query and search_query != st.session_state.previous_query:
    st.session_state.previous_query = search_query
    st.session_state.ranked_results = None
    st.session_state.end_index = PAGE_SIZE

if search_query:
    # Fetch and rank only when nothing is cached for this query; reruns
    # triggered by "Load More" reuse the stored ranking instead of hitting
    # Wikipedia and refitting the model again.
    if st.session_state.ranked_results is None:
        with st.spinner("Fetching articles..."):
            st.session_state.ranked_results = _build_ranking(search_query)

    all_results = st.session_state.ranked_results
    if not all_results:
        st.warning("No articles found! Try a different query.")
    else:
        # Display ranked results
        end_index = st.session_state.end_index
        st.subheader("Search Results:")
        for title, similarity in all_results[:end_index]:
            # Percent-encode the title so characters such as '?', '#', '%'
            # or parentheses cannot break the URL or the markdown link.
            url = "https://en.wikipedia.org/wiki/" + quote(title.replace(" ", "_"))
            st.markdown(f"### [{title}]({url})")
            st.write(f"**Similarity Score:** {similarity:.2f}")
            st.write("---")

        # Pagination controls
        if end_index < len(all_results):
            st.button("Load More", on_click=_show_more_results)
        else:
            st.info("No more articles to load.")