import gradio as gr
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import re

# ---------------------
# Disclaimer
# ---------------------
DISCLAIMER = """
**Disclaimer:**  
This application is provided for **research and educational purposes only**. 
All summaries are generated using an automated language model and may contain inaccuracies or omissions. 
It is not intended to replace professional judgment, peer-reviewed references, or expert consultation. 
The authors and developers assume no legal liability for any misuse, misinterpretation, or unintended consequences 
arising from the use of this tool. Please use responsibly and cross-check results with credible sources.
"""

# ---------------------
# Model Setup
# ---------------------
MODEL_NAME = "allenai/scibert_scivocab_cased"  # Example model for tokenization/embedding
SUMMARIZATION_MODEL = "allenai/led-base-16384"  # Example summarization model with a large context window

# Load summarization model and tokenizer
summarizer_tokenizer = AutoTokenizer.from_pretrained(SUMMARIZATION_MODEL)
summarizer_model = AutoModelForSeq2SeqLM.from_pretrained(SUMMARIZATION_MODEL)

# ---------------------
# Utility Functions
# ---------------------
def extract_text_from_pdf(pdf_file):
    # Extract text from a PDF file. Requires PyPDF2 (usually available on Hugging Face Spaces).
    try:
        import PyPDF2
        reader = PyPDF2.PdfReader(pdf_file)
        text = ""
        for page in reader.pages:
            # extract_text() can return None for image-only pages, so guard against it
            text += (page.extract_text() or "") + "\n"
        return text
    except Exception as e:
        return f"Error reading PDF: {e}"

def clean_text(text):
    # Basic cleaning function
    text = re.sub(r'\s+', ' ', text).strip()
    return text

def summarize_text(text):
    # Summarize the given text with the LED model
    inputs = summarizer_tokenizer(text, return_tensors="pt", truncation=True, max_length=16384)
    # LED is meant to be run with global attention on at least the first token for summarization
    global_attention_mask = torch.zeros_like(inputs["input_ids"])
    global_attention_mask[:, 0] = 1
    with torch.no_grad():
        summary_ids = summarizer_model.generate(
            inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            global_attention_mask=global_attention_mask,
            num_beams=4,
            length_penalty=2.0,
            max_length=512,
            early_stopping=True
        )
    summary = summarizer_tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return summary

def analyze_text(text):
    # In a more elaborate system, you might:
    # 1. Extract main findings using IE or NER.
    # 2. Identify methods mentioned.
    # 3. Extract references (regex patterns for citations).
    # Here we only summarize; see the illustrative helpers sketched below.
    text_clean = clean_text(text)
    if len(text_clean) < 50:
        # Return a 3-tuple so the caller can always unpack (summary, methods, references)
        return "Please provide a longer text snippet or PDF.", "", ""

    summary = summarize_text(text_clean)

    # Placeholder output for key methods and references (in a real app, use NLP-based extraction)
    methods = "Key methods extraction is not yet implemented."
    references = "Reference extraction is not yet implemented."
    return summary, methods, references
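
# ---------------------
# Illustrative extraction helpers (sketch)
# ---------------------
# A minimal sketch of the regex/keyword-based extraction mentioned in analyze_text.
# The helpers below (extract_references_sketch, extract_methods_sketch) are hypothetical
# and are not wired into the Gradio interface; a production version would likely use a
# proper citation parser or an NER model instead of these simple patterns.

def extract_references_sketch(text):
    # Match simple inline citation patterns such as "(Smith et al., 2020)" or "[12]"
    author_year = re.findall(r'\([A-Z][A-Za-z\-]+(?: et al\.)?,? \d{4}\)', text)
    numeric = re.findall(r'\[\d{1,3}\]', text)
    refs = sorted(set(author_year + numeric))
    return "; ".join(refs) if refs else "No citation-like patterns found."

def extract_methods_sketch(text):
    # Flag a few common neuroscience method keywords if they appear in the text
    method_keywords = ["fMRI", "EEG", "MEG", "PET", "two-photon imaging",
                       "optogenetics", "patch clamp", "electrophysiology"]
    found = [kw for kw in method_keywords if kw.lower() in text.lower()]
    return ", ".join(found) if found else "No known method keywords found."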

def process_input(pdf_file, text_snippet):
    # If PDF is provided, extract text from PDF
    input_text = ""
    if pdf_file is not None:
        input_text = extract_text_from_pdf(pdf_file)
    
    # If a text snippet is provided, append it.
    if text_snippet is not None and text_snippet.strip():
        input_text = input_text + " " + text_snippet.strip()
    
    if not input_text.strip():
        return "No input provided.", "", ""
    
    summary, methods, references = analyze_text(input_text)
    return summary, methods, references

# ---------------------
# Gradio Interface
# ---------------------
with gr.Blocks() as demo:
    gr.Markdown("# NeuroLit Explorer")
    gr.Markdown(DISCLAIMER)
    gr.Markdown("**Instructions:** Upload a PDF or paste a text snippet from a neuroscience article, then click 'Summarize'.")
    with gr.Row():
        pdf_input = gr.File(label="Upload PDF", file_types=[".pdf"])
        text_input = gr.Textbox(label="Or Paste Article Text")
    summarize_button = gr.Button("Summarize")
    
    summary_output = gr.Textbox(label="Summary")
    methods_output = gr.Textbox(label="Key Methods")
    references_output = gr.Textbox(label="Relevant References")
    
    summarize_button.click(fn=process_input, inputs=[pdf_input, text_input], outputs=[summary_output, methods_output, references_output])

demo.launch()