Spaces:
Sleeping
Sleeping
import os | |
from sentence_transformers import SentenceTransformer, util | |
from datasets import load_dataset | |
from transformers import pipeline | |
import streamlit as st | |
# Cache dataset loading
@st.cache_resource
def load_data(dataset_id="sentence-transformers/natural-questions", split="train"):
    """Load one split of a Hugging Face dataset, reusing it across reruns.

    Streamlit re-executes the whole script on every widget interaction, so
    the loaded dataset is memoized per ``(dataset_id, split)`` instead of
    being loaded again each time.

    Args:
        dataset_id: Hub identifier passed to ``load_dataset``.
        split: Name of the split to load.

    Returns:
        Whatever ``load_dataset`` returns for the requested split. The same
        object is handed back on every cache hit, so treat it as read-only.
    """
    return load_dataset(dataset_id, split=split)
# Cache model loading
@st.cache_resource
def load_model():
    """Return the ``allenai-specter`` sentence-embedding model.

    The model is created once and shared across Streamlit reruns; without
    the cache it would be re-instantiated on every widget interaction.
    """
    return SentenceTransformer('allenai-specter')
# Cache corpus embedding generation
@st.cache_resource
def generate_embeddings(_model, _dataset_file, sample_size=32):
    """Embed the first ``sample_size`` records of the dataset.

    Each record is turned into ``query + '[SEP]' + answer`` and the whole
    list is encoded in a single ``_model.encode`` call.

    The leading underscores on ``_model`` and ``_dataset_file`` tell
    Streamlit not to hash those arguments, so the cache key is
    ``sample_size`` alone: passing a different model or dataset with the
    same ``sample_size`` returns the previously cached result.

    Args:
        _model: Object exposing ``encode(texts, convert_to_tensor=...,
            show_progress_bar=...)``.
        _dataset_file: Dataset supporting ``len()`` and ``select(indices)``
            whose records have ``'query'`` and ``'answer'`` string fields.
        sample_size: Maximum number of leading records to embed.

    Returns:
        A ``(paper_texts, embeddings)`` tuple: the combined strings and the
        value returned by ``_model.encode`` for them, in the same order.
    """
    # Clamp to the dataset size so an oversized request embeds everything
    # available instead of failing on out-of-range indices in `select`.
    row_count = min(sample_size, len(_dataset_file))

    # Prepare paper texts by combining query and answer fields
    paper_texts = [
        record['query'] + '[SEP]' + record['answer']
        for record in _dataset_file.select(range(row_count))
    ]

    # Compute embeddings for all paper texts
    embeddings = _model.encode(
        paper_texts, convert_to_tensor=True, show_progress_bar=True
    )
    return paper_texts, embeddings
# Cache summarization pipeline
@st.cache_resource
def load_summarizer():
    """Return a Transformers ``"summarization"`` pipeline.

    No model name is given, so the library's default summarization model is
    used. The pipeline is built once and shared across Streamlit reruns
    rather than being reconstructed on every widget interaction.
    """
    return pipeline("summarization")
# Streamlit app
st.title("Semantic Search with Summarization")

# Load resources
# These module-level names are read by search_papers_and_summarize below.
dataset_file = load_data()
model = load_model()
summarizer = load_summarizer()
# The corpus embeddings need both the model and the dataset, so build them last.
paper_texts, corpus_embeddings = generate_embeddings(model, dataset_file)
# Function to search and summarize
def search_papers_and_summarize(query, max_summary_length=45):
    """Find the answers most similar to ``query`` and summarize them.

    Uses the module-level ``model``, ``corpus_embeddings``, ``dataset_file``
    and ``summarizer`` objects.

    Args:
        query: Free-text search query.
        max_summary_length: Forwarded to the summarizer as ``max_length``.

    Returns:
        The summary text produced from the answers of up to five top hits,
        or an empty string when there is nothing to summarize.
    """
    # Encode the query
    query_embedding = model.encode(query, convert_to_tensor=True)

    # Perform semantic search; the result holds one hit list per query and
    # only a single query was supplied.
    search_hits = util.semantic_search(query_embedding, corpus_embeddings)[0]

    # Collect answers from the top 5 hits. `corpus_id` is a position in
    # `corpus_embeddings`, which were built from the leading rows of
    # `dataset_file` in order, so it indexes the dataset directly.
    answers = [
        dataset_file[int(hit['corpus_id'])]['answer']
        for hit in search_hits[:5]
    ]

    # Combine answers into a single text for summarization
    combined_text = " ".join(answers)
    if not combined_text.strip():
        # Nothing to feed the summarizer (no hits or only blank answers).
        return ""

    # Summarize the combined text. Five concatenated answers can exceed the
    # model's maximum input length; truncate rather than fail on long input.
    summary = summarizer(
        combined_text,
        max_length=max_summary_length,
        clean_up_tokenization_spaces=True,
        truncation=True,
    )
    return summary[0]['summary_text']
# Streamlit input
# Strip surrounding whitespace so a blank-looking entry (e.g. only spaces)
# does not trigger a full search-and-summarize run.
query = st.text_input("Enter your query:", "").strip()
if query:
    st.write("Searching for relevant answers...")
    # Compute the summary before rendering the heading, so a failure does not
    # leave an empty "Summary" section on the page.
    summary = search_papers_and_summarize(query)
    st.subheader("Summary")
    st.write(summary)