import gradio as gr
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import re
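# ---------------------
# Dependencies
# ---------------------
# Assumption (not part of the original file): when deployed as a Hugging Face Space,
# a requirements.txt covering the non-standard-library imports used below is needed,
# roughly:
#   gradio
#   torch
#   transformers
#   PyPDF2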
# ---------------------
# Disclaimer
# ---------------------
DISCLAIMER = """
**Disclaimer:**
This application is provided for **research and educational purposes only**.
All summaries are generated using an automated language model and may contain inaccuracies or omissions.
It is not intended to replace professional judgment, peer-reviewed references, or expert consultation.
The authors and developers assume no legal liability for any misuse, misinterpretation, or unintended consequences
arising from the use of this tool. Please use responsibly and cross-check results with credible sources.
"""
# ---------------------
# Model Setup
# ---------------------
MODEL_NAME = "allenai/scibert_scivocab_cased"  # Example model for tokenization/embedding (not used in this demo)
SUMMARIZATION_MODEL = "allenai/led-base-16384"  # Example summarization model with a large (16,384-token) context window

# Load the summarization model and tokenizer
summarizer_tokenizer = AutoTokenizer.from_pretrained(SUMMARIZATION_MODEL)
summarizer_model = AutoModelForSeq2SeqLM.from_pretrained(SUMMARIZATION_MODEL)
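# Optional device placement (a sketch, not in the original setup): move the model to
# a GPU when one is available; summarize_text below sends its tensors to
# summarizer_model.device, so this also works on CPU-only Spaces.
device = "cuda" if torch.cuda.is_available() else "cpu"
summarizer_model = summarizer_model.to(device)
summarizer_model.eval()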
# ---------------------
# Utility Functions
# ---------------------
def extract_text_from_pdf(pdf_file):
    # Extract text from a PDF file. Requires PyPDF2 (or a similar library), which
    # generally works on Hugging Face Spaces.
    try:
        import PyPDF2
        reader = PyPDF2.PdfReader(pdf_file)
        text = ""
        for page in reader.pages:
            # extract_text() can return None for pages without a text layer
            text += (page.extract_text() or "") + "\n"
        return text
    except Exception as e:
        return f"Error reading PDF: {e}"
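# Alternative sketch (assumption, not part of the original app): PyPDF2 is in
# maintenance mode and its maintained successor is the pypdf package, which keeps
# the same PdfReader interface, so a drop-in variant could look like this:
def extract_text_with_pypdf(pdf_file):
    from pypdf import PdfReader  # requires "pypdf" in requirements.txt
    reader = PdfReader(pdf_file)
    return "\n".join((page.extract_text() or "") for page in reader.pages)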
def clean_text(text):
    # Basic cleaning: collapse whitespace and trim
    text = re.sub(r'\s+', ' ', text).strip()
    return text
def summarize_text(text):
    # Summarize the given text with the LED model; inputs longer than the model's
    # 16,384-token window are truncated (see summarize_long_text below for a chunked sketch).
    inputs = summarizer_tokenizer(text, return_tensors="pt", truncation=True, max_length=16384)
    input_ids = inputs["input_ids"].to(summarizer_model.device)
    attention_mask = inputs["attention_mask"].to(summarizer_model.device)
    # Per the LED documentation, place global attention on the first token for summarization
    global_attention_mask = torch.zeros_like(input_ids)
    global_attention_mask[:, 0] = 1
    with torch.no_grad():
        summary_ids = summarizer_model.generate(
            input_ids,
            attention_mask=attention_mask,
            global_attention_mask=global_attention_mask,
            num_beams=4,
            length_penalty=2.0,
            max_length=512,
            early_stopping=True,
        )
    summary = summarizer_tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return summary
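# Optional sketch (assumption): LED's 16,384-token window covers most single papers,
# but for longer inputs the text could be summarized chunk by chunk and the partial
# summaries joined. Not wired into the app; chunk_tokens is an illustrative value.
def summarize_long_text(text, chunk_tokens=16000):
    token_ids = summarizer_tokenizer(text, truncation=False)["input_ids"]
    chunks = [token_ids[i:i + chunk_tokens] for i in range(0, len(token_ids), chunk_tokens)]
    partial_summaries = [
        summarize_text(summarizer_tokenizer.decode(chunk, skip_special_tokens=True))
        for chunk in chunks
    ]
    return " ".join(partial_summaries)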
def analyze_text(text):
    # In a more elaborate system, you might:
    # 1. Extract main findings using IE or NER.
    # 2. Identify methods mentioned.
    # 3. Extract references (regex patterns for citations).
    # Here we just do a simple summarization (see the illustrative sketch below).
    text_clean = clean_text(text)
    if len(text_clean) < 50:
        # Return a 3-tuple so the caller can always unpack (summary, methods, references)
        return "Please provide a longer text snippet or PDF.", "", ""
    summary = summarize_text(text_clean)
    # Placeholder values for key methods and references (a real app would use NLP-based extraction)
    methods = "Key methods extraction is not yet implemented."
    references = "Reference extraction is not yet implemented."
    return summary, methods, references
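# Illustrative sketch only (assumptions, not wired into the app): the comments in
# analyze_text mention NER/IE-based findings extraction, method identification, and
# regex-based citation extraction. A crude keyword/pattern starting point could be:
METHOD_KEYWORDS = ["fMRI", "EEG", "MEG", "optogenetics", "calcium imaging", "ANOVA", "regression"]

def extract_candidate_methods(text):
    # Return the known method keywords that occur in the text (case-insensitive).
    return sorted({kw for kw in METHOD_KEYWORDS if re.search(re.escape(kw), text, re.IGNORECASE)})

def extract_candidate_references(text):
    # Match simple "(Author, 2020)" / "(Author et al., 2020)" style citations;
    # many real-world citation formats will be missed.
    return re.findall(r"\([A-Z][A-Za-z-]+(?: et al\.)?,\s*\d{4}[a-z]?\)", text)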
def process_input(pdf_file, text_snippet):
    # If a PDF is provided, extract its text
    input_text = ""
    if pdf_file is not None:
        input_text = extract_text_from_pdf(pdf_file)
    # If a text snippet is provided, append it
    if text_snippet is not None and text_snippet.strip():
        input_text = input_text + " " + text_snippet.strip()
    if not input_text.strip():
        return "No input provided.", "", ""
    summary, methods, references = analyze_text(input_text)
    return summary, methods, references
# ---------------------
# Gradio Interface
# ---------------------
with gr.Blocks() as demo:
    gr.Markdown("# NeuroLit Explorer")
    gr.Markdown(DISCLAIMER)
    gr.Markdown("**Instructions:** Upload a PDF or paste a text snippet from a neuroscience article, then click 'Summarize'.")
    with gr.Row():
        pdf_input = gr.File(label="Upload PDF")
        text_input = gr.Textbox(label="Or Paste Article Text")
    summarize_button = gr.Button("Summarize")
    summary_output = gr.Textbox(label="Summary")
    methods_output = gr.Textbox(label="Key Methods")
    references_output = gr.Textbox(label="Relevant References")
    summarize_button.click(
        fn=process_input,
        inputs=[pdf_input, text_input],
        outputs=[summary_output, methods_output, references_output],
    )

demo.launch()