sanaa-11 committed on
Commit
f51fb04
·
verified ·
1 Parent(s): 7bad413

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +92 -0
app.py ADDED
@@ -0,0 +1,92 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import wikipedia
3
+ import wikipediaapi
4
+ from sklearn.feature_extraction.text import TfidfVectorizer
5
+ from sklearn.decomposition import TruncatedSVD
6
+ from sklearn.metrics.pairwise import cosine_similarity
7
+ from sklearn.preprocessing import normalize
8
+
9
+ # Initialize Wikipedia API
10
# Shared wikipediaapi client; fetch_related_articles() uses it to look up
# pages by title and read their text.
# NOTE(review): the contact part of the user agent looks like a redacted
# placeholder -- confirm a real contact address is configured before deploying.
wiki_wiki = wikipediaapi.Wikipedia(
    user_agent="LSI1/1.0 ([email protected])",
    language="en",
)
14
+
15
+ # Function to fetch related Wikipedia articles
16
def fetch_related_articles(query, max_articles=20):
    """Search Wikipedia for *query* and download the matching articles.

    Args:
        query: Free-text search string passed to ``wikipedia.search``.
        max_articles: Upper bound on the number of search hits requested.

    Returns:
        A dict mapping each hit's title to the page text, in search-result
        order. Titles whose page does not exist are left out.
    """
    found = {}
    for hit in wikipedia.search(query, results=max_articles):
        candidate = wiki_wiki.page(hit)
        # Search can return titles that do not resolve to a real page.
        if not candidate.exists():
            continue
        found[hit] = candidate.text
    return found
24
+
25
+ # Function to rank articles using SVD
26
def rank_articles(query, vectorizer, svd, lsi_matrix, titles):
    """Rank article titles by cosine similarity to *query* in the SVD space.

    Args:
        query: The raw search string.
        vectorizer: Fitted vectorizer; its ``transform`` turns the query into
            a TF-IDF row.
        svd: Fitted reducer; its ``transform`` projects that row into the
            reduced space.
        lsi_matrix: Reduced document matrix, one row per entry in *titles*.
        titles: Article titles, aligned with the rows of *lsi_matrix*.

    Returns:
        A list of ``(title, similarity)`` tuples ordered from most to least
        similar.
    """
    # Query -> TF-IDF -> reduced space -> unit length, same pipeline as the docs.
    query_vec = normalize(svd.transform(vectorizer.transform([query])))
    scores = cosine_similarity(query_vec, lsi_matrix).flatten()
    # argsort is ascending; reverse it to get best matches first.
    best_first = scores.argsort()[::-1]
    return [(titles[pos], scores[pos]) for pos in best_first]
33
+
34
# Streamlit UI
#
# Flow: read the query -> (once per new query) fetch articles, build the
# TF-IDF + truncated-SVD space and rank -> render a paginated result list.

# Imported here so this section is self-contained; used to percent-encode
# article titles when building links.
from urllib.parse import quote

RESULTS_PER_PAGE = 10      # how many results each "Load More" click reveals
MAX_LSI_COMPONENTS = 100   # upper bound on the SVD dimensionality


def _show_more_results():
    """Reveal the next page of results.

    Used as the button's ``on_click`` callback so the counter is increased
    before the script reruns. Incrementing it after ``st.button`` returns
    would only take effect on the following interaction, because the results
    have already been rendered by then.
    """
    st.session_state.end_index += RESULTS_PER_PAGE


st.title("Wikipedia Search with SVD")
st.write("Enter a search query to fetch and rank Wikipedia articles.")

# Input for query; stripped so a whitespace-only entry counts as "no query".
search_query = st.text_input("Search Wikipedia:").strip()

# Make sure every session key read below exists, whatever path the first
# run takes.
for _key, _default in (
    ("previous_query", None),
    ("ranked_results", None),
    ("end_index", RESULTS_PER_PAGE),
):
    if _key not in st.session_state:
        st.session_state[_key] = _default

# A new query invalidates the cached ranking and resets pagination.
if search_query and search_query != st.session_state.previous_query:
    st.session_state.previous_query = search_query
    st.session_state.ranked_results = None
    st.session_state.end_index = RESULTS_PER_PAGE

if search_query:
    # Only hit the network and refit the model when there is no cached
    # ranking for this query; pagination reruns reuse the cached list.
    if st.session_state.ranked_results is None:
        with st.spinner("Fetching articles..."):
            articles = fetch_related_articles(search_query)

        if articles:
            # Prepare data for TF-IDF
            titles = list(articles)
            contents = list(articles.values())

            # TF-IDF Vectorization
            vectorizer = TfidfVectorizer(max_features=5000, stop_words="english")
            tfidf_matrix = vectorizer.fit_transform(contents)

            # SVD Dimensionality Reduction. The useful rank is bounded by
            # both the number of documents and the vocabulary size, so never
            # request more components than the matrix can provide.
            n_components = min(MAX_LSI_COMPONENTS, *tfidf_matrix.shape)
            # Fixed seed so the same query ranks the same way every time.
            svd = TruncatedSVD(n_components=n_components, random_state=0)
            # Normalize rows for cosine similarity.
            lsi_matrix = normalize(svd.fit_transform(tfidf_matrix))

            st.session_state.ranked_results = rank_articles(
                search_query, vectorizer, svd, lsi_matrix, titles
            )
        else:
            # Cache the empty outcome too, so reruns do not refetch.
            st.session_state.ranked_results = []

    all_results = st.session_state.ranked_results

    if not all_results:
        st.warning("No articles found! Try a different query.")
    else:
        # Display ranked results
        end_index = st.session_state.end_index

        st.subheader("Search Results:")
        for title, similarity in all_results[:end_index]:
            # Percent-encode the title so characters such as "?" or "#"
            # do not truncate or redirect the link.
            url = f"https://en.wikipedia.org/wiki/{quote(title.replace(' ', '_'))}"
            st.markdown(f"### [{title}]({url})")
            st.write(f"**Similarity Score:** {similarity:.2f}")
            st.write("---")

        # Pagination controls
        if end_index < len(all_results):
            st.button("Load More", on_click=_show_more_results)
        else:
            st.info("No more articles to load.")