import re

import gradio as gr
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# ---------------------
# Disclaimer
# ---------------------
DISCLAIMER = """
**Disclaimer:** This application is provided for **research and educational purposes only**.
All summaries are generated using an automated language model and may contain inaccuracies
or omissions. It is not intended to replace professional judgment, peer-reviewed references,
or expert consultation. The authors and developers assume no legal liability for any misuse,
misinterpretation, or unintended consequences arising from the use of this tool. Please use
responsibly and cross-check results with credible sources.
"""

# ---------------------
# Model Setup
# ---------------------
MODEL_NAME = "allenai/scibert_scivocab_cased"  # Example model for tokenization/embedding (currently unused)
SUMMARIZATION_MODEL = "allenai/led-base-16384"  # Example summarization model with a large context window

# Load summarization model and tokenizer
summarizer_tokenizer = AutoTokenizer.from_pretrained(SUMMARIZATION_MODEL)
summarizer_model = AutoModelForSeq2SeqLM.from_pretrained(SUMMARIZATION_MODEL)

# ---------------------
# Utility Functions
# ---------------------
def extract_text_from_pdf(pdf_file):
    # Extract text from a PDF file. Requires PyPDF2 or a similar library;
    # PyPDF2 often works on Hugging Face Spaces.
    try:
        import PyPDF2
        # gr.File may hand back a path string or a tempfile-like object with a .name attribute
        path = getattr(pdf_file, "name", pdf_file)
        reader = PyPDF2.PdfReader(path)
        text = ""
        for page in reader.pages:
            # extract_text() can return None for pages without extractable text
            text += (page.extract_text() or "") + "\n"
        return text
    except Exception as e:
        return f"Error reading PDF: {e}"

def clean_text(text):
    # Basic cleaning: collapse whitespace runs and trim the ends
    text = re.sub(r'\s+', ' ', text).strip()
    return text

def summarize_text(text):
    # Summarize the given text
    inputs = summarizer_tokenizer(text, return_tensors="pt", truncation=True, max_length=16384)
    # LED expects global attention on at least the first token for summarization
    global_attention_mask = torch.zeros_like(inputs["input_ids"])
    global_attention_mask[:, 0] = 1
    with torch.no_grad():
        summary_ids = summarizer_model.generate(
            inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            global_attention_mask=global_attention_mask,
            num_beams=4,
            length_penalty=2.0,
            max_length=512,
            early_stopping=True
        )
    summary = summarizer_tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return summary

def analyze_text(text):
    # In a more elaborate system, you might:
    # 1. Extract main findings using IE or NER.
    # 2. Identify methods mentioned.
    # 3. Extract references (regex patterns for citations) -- see the sketch at the end of this file.
    # Here we just do a simple summarization.
    text_clean = clean_text(text)
    if len(text_clean) < 50:
        # Return a 3-tuple so callers can always unpack the same shape
        return "Please provide a longer text snippet or PDF.", "", ""
    summary = summarize_text(text_clean)
    # Dummy logic for key methods and references (in a real app, use NLP-based extraction)
    methods = "Key methods extraction is not yet implemented."
    references = "Reference extraction is not yet implemented."
    return summary, methods, references

def process_input(pdf_file, text_snippet):
    # If a PDF is provided, extract its text
    input_text = ""
    if pdf_file is not None:
        input_text = extract_text_from_pdf(pdf_file)
    # If a text snippet is provided, append it.
    if text_snippet is not None and text_snippet.strip():
        input_text = input_text + " " + text_snippet.strip()
    if not input_text.strip():
        return "No input provided.", "", ""
    summary, methods, references = analyze_text(input_text)
    return summary, methods, references

# ---------------------
# Gradio Interface
# ---------------------
with gr.Blocks() as demo:
    gr.Markdown("# NeuroLit Explorer")
    gr.Markdown(DISCLAIMER)
    gr.Markdown("**Instructions:** Upload a PDF or paste a text snippet from a neuroscience article, then click 'Summarize'.")

    with gr.Row():
        pdf_input = gr.File(label="Upload PDF")
        text_input = gr.Textbox(label="Or Paste Article Text")

    summarize_button = gr.Button("Summarize")
    summary_output = gr.Textbox(label="Summary")
    methods_output = gr.Textbox(label="Key Methods")
    references_output = gr.Textbox(label="Relevant References")

    summarize_button.click(
        fn=process_input,
        inputs=[pdf_input, text_input],
        outputs=[summary_output, methods_output, references_output]
    )

demo.launch()
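
# ---------------------
# Sketch: reference extraction (not wired into the app)
# ---------------------
# analyze_text's comments mention extracting references with "regex patterns for
# citations" but leave it as a placeholder. Below is a minimal, hedged sketch of
# that idea. The helper name extract_references and both patterns are illustrative
# assumptions, not part of the app; they catch only numeric ([1], [2, 5]) and
# simple author-year ((Smith et al., 2020)) styles and will miss many others.
# If adopted, define it before the Gradio interface and call it from analyze_text
# in place of the placeholder string.

def extract_references(text):
    # Numeric citations such as [1] or [2, 5]
    numeric = re.findall(r'\[\d+(?:\s*,\s*\d+)*\]', text)
    # Parenthetical author-year citations such as (Smith et al., 2020)
    author_year = re.findall(r'\([A-Z][A-Za-z\-]+(?: et al\.)?,\s*(?:19|20)\d{2}\)', text)
    # De-duplicate while preserving first-seen order
    seen, refs = set(), []
    for ref in numeric + author_year:
        if ref not in seen:
            seen.add(ref)
            refs.append(ref)
    return refs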