import re
from pdfminer.high_level import extract_pages
from pdfminer.layout import LTTextBoxHorizontal
from transformers import pipeline, TFBertForQuestionAnswering, AutoTokenizer

import tensorflow as tf
import streamlit as st

def preprocess_text(element):
    """Preprocesses text elements from the PDF."""
    if isinstance(element, LTTextBoxHorizontal):
        text = element.get_text().strip()
        # Remove non-textual elements
        text = re.sub(r'[^\w\s]', '', text)
        # Convert to lowercase
        text = text.lower()
        return text
    else:
        return ""

def answer_question(text, question, max_length=512):
    """Answers a question using the provided text and a pre-trained model."""
    qa_model_name = "bert-large-uncased-whole-word-masking-finetuned-squad"
    qa_model = TFBertForQuestionAnswering.from_pretrained(qa_model_name)
    tokenizer = AutoTokenizer.from_pretrained(qa_model_name)
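    # Note: reloading the model and tokenizer on every call is slow; in a longer-lived
    # app they could be cached across reruns (e.g. with Streamlit's st.cache_resource).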

    # Add special tokens and tokenize
    inputs = tokenizer(question, text, return_tensors="tf", padding=True, truncation=True, max_length=max_length)

    # Model prediction
    outputs = qa_model(inputs)
    start_logits = outputs.start_logits
    end_logits = outputs.end_logits

    # Find the indices of the start and end positions
    answer_start = tf.argmax(start_logits, axis=1).numpy()[0]
    answer_end = (tf.argmax(end_logits, axis=1) + 1).numpy()[0]  

    # Decode the predicted span back into text, dropping special tokens like [CLS]/[SEP]
    answer = tokenizer.decode(inputs["input_ids"][0][answer_start:answer_end], skip_special_tokens=True).strip()
    return answer if answer else "No answer found."

# Streamlit app
st.set_page_config(page_title="PDF Summarizer and Q&A")
st.header("PDF Summarizer and Q&A")

# User options
st.subheader("Settings")
min_summary_length = st.slider("Minimum Summary Length", min_value=50, max_value=500, value=100)
summarization_model = "facebook/bart-large-cnn"
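# Note: facebook/bart-large-cnn accepts roughly 1,024 input tokens, so very long
# documents are truncated before summarization (see truncation=True below).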

# File upload and processing
uploaded_file = st.file_uploader("Choose a PDF file")
if uploaded_file is not None:
    with st.spinner("Processing..."):
        text = ""
        for page_layout in extract_pages(uploaded_file):
            for element in page_layout:
                text += preprocess_text(element) + "\n"
        if text:
            question = st.text_input("Ask a question about the PDF:")
            summarize_button = st.button("Generate Summary")
            if summarize_button:
                with st.spinner("Summarizing..."):
                    summarizer = pipeline("summarization", model=summarization_model)
                    # max_length must stay above the user-chosen minimum; truncation guards against over-long inputs
                    summary_response = summarizer(text, min_length=min_summary_length, max_length=min_summary_length + 150, truncation=True)
                    st.subheader("Summary")
                    st.write(summary_response[0]["summary_text"])
            if question:
                with st.spinner("Answering..."):
                    answer = answer_question(text, question)
                    st.subheader("Answer")
                    st.write(answer)
        else:
            st.error("No text found in the PDF.")
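
# To run the app locally (the filename below is an assumption, not taken from the source):
#   pip install streamlit tensorflow transformers pdfminer.six
#   streamlit run app.py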