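"""Streamlit app that searches Wikipedia and ranks the results with
Latent Semantic Indexing: TF-IDF vectorization, TruncatedSVD dimensionality
reduction, and cosine similarity in the reduced space."""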
import streamlit as st
import wikipedia
import wikipediaapi
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import normalize
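
# `wikipedia` is used for keyword search; `wikipediaapi` fetches the full article text.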
# Initialize Wikipedia API
wiki_wiki = wikipediaapi.Wikipedia(
    language='en',
    user_agent="LSI1/1.0 ([email protected])"
)

# Function to fetch related Wikipedia articles.
# Cached so Streamlit reruns (e.g. "Load More" clicks) do not re-download the same pages.
@st.cache_data(show_spinner=False)
def fetch_related_articles(query, max_articles=20):
    search_results = wikipedia.search(query, results=max_articles)
    articles = {}
    for title in search_results:
        page = wiki_wiki.page(title)
        if page.exists():
            articles[title] = page.text
    return articles
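
# Ranking happens in the LSI space: the query is TF-IDF encoded, projected with the
# fitted SVD, normalized, and compared against every article vector by cosine similarity.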
# Function to rank articles using SVD
def rank_articles(query, vectorizer, svd, lsi_matrix, titles):
    query_tfidf = vectorizer.transform([query])
    query_lsi = svd.transform(query_tfidf)
    query_lsi = normalize(query_lsi)
    similarities = cosine_similarity(query_lsi, lsi_matrix).flatten()
    ranked_indices = similarities.argsort()[::-1]  # Sort by similarity (descending)
    return [(titles[idx], similarities[idx]) for idx in ranked_indices]

# Streamlit UI
st.title("Wikipedia Search with SVD")
st.write("Enter a search query to fetch and rank Wikipedia articles.")

# Input for query
search_query = st.text_input("Search Wikipedia:")

# Detect query change and clear session state if needed
if "previous_query" not in st.session_state:
    st.session_state.previous_query = None

if search_query and search_query != st.session_state.previous_query:
    # Reset session state variables
    st.session_state.previous_query = search_query
    st.session_state.ranked_results = None
    st.session_state.end_index = 10

if search_query:
    # Fetch articles dynamically
    with st.spinner("Fetching articles..."):
        articles = fetch_related_articles(search_query)

    if not articles:
        st.warning("No articles found! Try a different query.")
    else:
        # Prepare data for TF-IDF
        titles = list(articles.keys())
        contents = list(articles.values())

        # TF-IDF Vectorization
        vectorizer = TfidfVectorizer(max_features=5000, stop_words="english")
        tfidf_matrix = vectorizer.fit_transform(contents)

        # SVD Dimensionality Reduction
        # Cap the number of LSI dimensions by the number of fetched documents;
        # with at most ~20 articles, asking TruncatedSVD for 100 components is ill-posed.
        n_components = max(1, min(100, tfidf_matrix.shape[0] - 1))
        svd = TruncatedSVD(n_components=n_components)
        lsi_matrix = svd.fit_transform(tfidf_matrix)
        lsi_matrix = normalize(lsi_matrix)  # Normalize for cosine similarity

        # Handle session state for pagination
        if st.session_state.ranked_results is None:
            st.session_state.ranked_results = rank_articles(search_query, vectorizer, svd, lsi_matrix, titles)

        # Display ranked results
        end_index = st.session_state.end_index
        ranked_results = st.session_state.ranked_results[:end_index]

        st.subheader("Search Results:")
        for title, similarity in ranked_results:
            st.markdown(f"### [{title}](https://en.wikipedia.org/wiki/{title.replace(' ', '_')})")
            st.write(f"**Similarity Score:** {similarity:.2f}")
            st.write("---")

        # Pagination controls
        if end_index < len(st.session_state.ranked_results):
            if st.button("Load More"):
                st.session_state.end_index += 10
                st.rerun()  # Rerun immediately so the newly revealed results render on this click
        else:
            st.info("No more articles to load.")
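
# To run locally (assuming this file is saved as app.py):
#   streamlit run app.py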