Spaces:
Sleeping
Sleeping
Upload app.py
Browse files
app.py
ADDED
@@ -0,0 +1,92 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from urllib.parse import quote

import streamlit as st
import wikipedia
import wikipediaapi
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import normalize
|
8 |
+
|
9 |
+
# Initialize Wikipedia API
|
10 |
+
# Module-level wikipediaapi client for English-language page lookups,
# used by fetch_related_articles; the user agent string is sent along to
# identify this app.
# NOTE(review): the contact address inside the user agent looks like a
# redacted placeholder -- confirm the real value before deploying.
wiki_wiki = wikipediaapi.Wikipedia(user_agent="LSI1/1.0 ([email protected])", language="en")
|
14 |
+
|
15 |
+
# Function to fetch related Wikipedia articles
|
16 |
+
def fetch_related_articles(query, max_articles=20):
    """Fetch the text of Wikipedia articles matching a search query.

    The query is passed to ``wikipedia.search`` and every returned title is
    looked up through the module-level ``wiki_wiki`` client.

    Args:
        query: Free-text search string.
        max_articles: Maximum number of search results to request.

    Returns:
        A dict mapping article title to article text. Titles whose page
        does not exist, or whose page has no text, are omitted. A blank
        query yields an empty dict.
    """
    # A blank query cannot match anything useful; return early instead of
    # spending a network round-trip on it.
    if not query or not query.strip():
        return {}

    articles = {}
    for title in wikipedia.search(query, results=max_articles):
        page = wiki_wiki.page(title)
        if not page.exists():
            continue
        text = page.text
        # An empty document adds nothing to the ranking and only feeds
        # blank input to the downstream vectorizer, so leave it out.
        if text and text.strip():
            articles[title] = text
    return articles
|
24 |
+
|
25 |
+
# Function to rank articles using SVD
|
26 |
+
def rank_articles(query, vectorizer, svd, lsi_matrix, titles):
    """Order article titles by cosine similarity to the query.

    The query is run through ``vectorizer.transform`` and ``svd.transform``,
    normalized, and compared against every row of ``lsi_matrix``.

    Args:
        query: Search string; transformed as a single-document corpus.
        vectorizer: Fitted object exposing ``transform`` for raw text.
        svd: Fitted object exposing ``transform`` for the vectorizer output.
        lsi_matrix: Document vectors to compare against; row ``i`` is
            reported under ``titles[i]``.
        titles: Sequence of article titles indexed like ``lsi_matrix`` rows.

    Returns:
        A list of ``(title, similarity)`` tuples, highest similarity first.
    """
    query_vector = normalize(svd.transform(vectorizer.transform([query])))
    scores = cosine_similarity(query_vector, lsi_matrix).flatten()

    # argsort is ascending, so reverse it to get best matches first.
    best_first = scores.argsort()[::-1]

    ranking = []
    for position in best_first:
        ranking.append((titles[position], scores[position]))
    return ranking
|
33 |
+
|
34 |
+
# Streamlit UI
|
35 |
+
# Upper bound on the number of latent dimensions kept by the SVD step.
MAX_LSI_COMPONENTS = 100
# Number of results revealed initially and per "Load More" click.
PAGE_SIZE = 10


def _article_url(title):
    """Return the English Wikipedia URL for *title*, percent-encoded."""
    # Quoting keeps characters such as "?" and "#" inside the path instead
    # of letting them start a query string or fragment.
    return "https://en.wikipedia.org/wiki/" + quote(title.replace(" ", "_"))


def _build_ranking(query):
    """Fetch articles for *query* and rank them; return None if none are found."""
    articles = fetch_related_articles(query)
    if not articles:
        return None

    titles = list(articles.keys())
    contents = list(articles.values())

    # TF-IDF vectorization of the article texts.
    vectorizer = TfidfVectorizer(max_features=5000, stop_words="english")
    tfidf_matrix = vectorizer.fit_transform(contents)

    # TruncatedSVD rejects n_components above the number of features (older
    # releases require strictly fewer), which a tiny vocabulary can trigger,
    # so clamp the requested dimensionality to what the matrix supports.
    n_features = tfidf_matrix.shape[1]
    n_components = max(1, min(MAX_LSI_COMPONENTS, n_features - 1))
    svd = TruncatedSVD(n_components=n_components)
    lsi_matrix = normalize(svd.fit_transform(tfidf_matrix))

    return rank_articles(query, vectorizer, svd, lsi_matrix, titles)


def _show_more():
    """Button callback: reveal the next page of results."""
    st.session_state.end_index += PAGE_SIZE


st.title("Wikipedia Search with SVD")
st.write("Enter a search query to fetch and rank Wikipedia articles.")

# Input for query
search_query = st.text_input("Search Wikipedia:")

# Make sure every session key exists before it is read.
for _key, _default in (
    ("previous_query", None),
    ("ranked_results", None),
    ("end_index", PAGE_SIZE),
):
    if _key not in st.session_state:
        st.session_state[_key] = _default

# A new query invalidates the cached ranking and resets pagination.
if search_query and search_query != st.session_state.previous_query:
    st.session_state.previous_query = search_query
    st.session_state.ranked_results = None
    st.session_state.end_index = PAGE_SIZE

if search_query:
    # Streamlit reruns this script on every interaction (including a
    # "Load More" click), so only hit the network and refit the model
    # when no ranking is cached for the current query.
    if st.session_state.ranked_results is None:
        with st.spinner("Fetching articles..."):
            st.session_state.ranked_results = _build_ranking(search_query)

    all_results = st.session_state.ranked_results
    if not all_results:
        st.warning("No articles found! Try a different query.")
    else:
        # Display ranked results up to the current pagination cut-off.
        end_index = st.session_state.end_index

        st.subheader("Search Results:")
        for title, similarity in all_results[:end_index]:
            st.markdown(f"### [{title}]({_article_url(title)})")
            st.write(f"**Similarity Score:** {similarity:.2f}")
            st.write("---")

        # Pagination controls. The increment lives in an on_click callback
        # because callbacks run before the rerun; bumping the counter after
        # the results were already drawn would only show up one click late.
        if end_index < len(all_results):
            st.button("Load More", on_click=_show_more)
        else:
            st.info("No more articles to load.")
|