import gradio as gr
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import re
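# Note: besides the standard library, this app assumes the following packages are
# installed (e.g. via a requirements.txt for the Space): gradio, torch, transformers,
# and PyPDF2 (or its maintained successor, pypdf) for the optional PDF upload path.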
# ---------------------
# Disclaimer
# ---------------------
DISCLAIMER = """
**Disclaimer:**
This application is provided for **research and educational purposes only**.
All summaries are generated using an automated language model and may contain inaccuracies or omissions.
It is not intended to replace professional judgment, peer-reviewed references, or expert consultation.
The authors and developers assume no legal liability for any misuse, misinterpretation, or unintended consequences
arising from the use of this tool. Please use responsibly and cross-check results with credible sources.
"""
# ---------------------
# Model Setup
# ---------------------
MODEL_NAME = "allenai/scibert_scivocab_cased"  # Example tokenizer/embedding model (declared for reference; not used below)
SUMMARIZATION_MODEL = "allenai/led-base-16384" # Example summarization model with a large context window
# Load summarization model and tokenizer
summarizer_tokenizer = AutoTokenizer.from_pretrained(SUMMARIZATION_MODEL)
summarizer_model = AutoModelForSeq2SeqLM.from_pretrained(SUMMARIZATION_MODEL)
# ---------------------
# Utility Functions
# ---------------------
def extract_text_from_pdf(pdf_file):
    # Extract raw text from an uploaded PDF. Requires PyPDF2 (or its successor pypdf);
    # PyPDF2 generally works on Hugging Face Spaces.
    try:
        import PyPDF2
        reader = PyPDF2.PdfReader(pdf_file)
        text = ""
        for page in reader.pages:
            # extract_text() can return None for image-only pages, so guard against it.
            text += (page.extract_text() or "") + "\n"
        return text
    except Exception as e:
        return f"Error reading PDF: {e}"
def clean_text(text):
    # Collapse whitespace runs into single spaces and trim the ends.
    text = re.sub(r'\s+', ' ', text).strip()
    return text
def summarize_text(text):
    # Summarize the given text with the LED model. Inputs longer than the model's
    # 16,384-token window are truncated.
    inputs = summarizer_tokenizer(text, return_tensors="pt", truncation=True, max_length=16384)
    # LED expects global attention on at least the first token for summarization.
    global_attention_mask = torch.zeros_like(inputs["input_ids"])
    global_attention_mask[:, 0] = 1
    with torch.no_grad():
        summary_ids = summarizer_model.generate(
            inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            global_attention_mask=global_attention_mask,
            num_beams=4,
            length_penalty=2.0,
            max_length=512,
            early_stopping=True
        )
    summary = summarizer_tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return summary
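
# Illustrative sketch (not used by the app above): for documents longer than LED's
# 16,384-token window, one simple strategy is to summarize overlapping chunks and
# then summarize the concatenated chunk summaries. The chunk size and overlap below
# are arbitrary assumptions, not tuned values.
def summarize_long_text(text, chunk_tokens=8192, overlap=256):
    token_ids = summarizer_tokenizer(text, truncation=False)["input_ids"]
    if len(token_ids) <= chunk_tokens:
        return summarize_text(text)
    chunk_summaries = []
    start = 0
    while start < len(token_ids):
        chunk_ids = token_ids[start:start + chunk_tokens]
        chunk_text = summarizer_tokenizer.decode(chunk_ids, skip_special_tokens=True)
        chunk_summaries.append(summarize_text(chunk_text))
        start += chunk_tokens - overlap
    # Second pass: condense the per-chunk summaries into one final summary.
    return summarize_text(" ".join(chunk_summaries))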
def analyze_text(text):
    # In a more elaborate system, you might:
    # 1. Extract main findings using IE or NER.
    # 2. Identify methods mentioned.
    # 3. Extract references (e.g. regex patterns for citations; see the sketch below).
    # Here we only run the summarizer.
    text_clean = clean_text(text)
    if len(text_clean) < 50:
        # Keep the return shape consistent (summary, methods, references) so the
        # caller can always unpack three values.
        return "Please provide a longer text snippet or PDF.", "", ""
    summary = summarize_text(text_clean)
    # Placeholder output for key methods and references (a real app would use NLP-based extraction).
    methods = "Key methods extraction is not yet implemented."
    references = "Reference extraction is not yet implemented."
    return summary, methods, references
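
# Illustrative sketch (not wired into the app): a naive regex-based reference
# extractor of the kind mentioned in analyze_text. It only looks for parenthetical
# author-year citations such as "(Smith et al., 2020)" or "(Doe & Lee, 1998)";
# real citation parsing is far messier, so treat this as a starting point only.
CITATION_PATTERN = re.compile(r'\(([A-Z][A-Za-z\-]+(?:\s+et\s+al\.|\s*&\s*[A-Z][A-Za-z\-]+)?,\s*(?:19|20)\d{2})\)')

def extract_candidate_references(text):
    # Return a de-duplicated, sorted list of author-year citation strings found in the text.
    # Example: extract_candidate_references("as shown before (Smith et al., 2020)")
    # -> ["Smith et al., 2020"]
    matches = [m.group(1) for m in CITATION_PATTERN.finditer(text)]
    return sorted(set(matches))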
def process_input(pdf_file, text_snippet):
    # If a PDF is provided, extract its text first.
    input_text = ""
    if pdf_file is not None:
        input_text = extract_text_from_pdf(pdf_file)
    # If a text snippet is provided, append it.
    if text_snippet is not None and text_snippet.strip():
        input_text = input_text + " " + text_snippet.strip()
    if not input_text.strip():
        return "No input provided.", "", ""
    summary, methods, references = analyze_text(input_text)
    return summary, methods, references
# ---------------------
# Gradio Interface
# ---------------------
with gr.Blocks() as demo:
    gr.Markdown("# NeuroLit Explorer")
    gr.Markdown(DISCLAIMER)
    gr.Markdown("**Instructions:** Upload a PDF or paste a text snippet from a neuroscience article, then click 'Summarize'.")
    with gr.Row():
        pdf_input = gr.File(label="Upload PDF")
        text_input = gr.Textbox(label="Or Paste Article Text")
    summarize_button = gr.Button("Summarize")
    summary_output = gr.Textbox(label="Summary")
    methods_output = gr.Textbox(label="Key Methods")
    references_output = gr.Textbox(label="Relevant References")
    summarize_button.click(
        fn=process_input,
        inputs=[pdf_input, text_input],
        outputs=[summary_output, methods_output, references_output],
    )

demo.launch()