import os
import re
from pdfminer.high_level import extract_pages
from pdfminer.layout import LTTextBoxHorizontal
from transformers import pipeline, TFBertForQuestionAnswering, AutoTokenizer

import tensorflow as tf
import streamlit as st
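# Requires the pdfminer.six, transformers, tensorflow, streamlit, and nltk packages.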


def preprocess_text(element):
    """Preprocesses text elements from the PDF.

    Args:
        element: A PDFminer text element.

    Returns:
        The preprocessed text.
    """
    if isinstance(element, LTTextBoxHorizontal):
        text = element.get_text().strip()
        # Strip punctuation and other non-word characters
        text = re.sub(r'[^\w\s]', '', text)  # Replace with your preferred regular expression

        # Remove stop words (optional)
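        # The stop word list requires a one-time nltk.download('stopwords')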
        from nltk.corpus import stopwords
        stop_words = set(stopwords.words('english'))
        text = " ".join([word for word in text.split() if word not in stop_words])

        # Convert to lowercase (optional)
        text = text.lower()
        return text
    else:
        return ""


def answer_question(text, question, max_length=512):
    """Answers a question using the provided text and a pre-trained model.

    Args:
        text: The preprocessed text from the PDF.
        question: The user's question.
        max_length: Maximum number of tokens fed to the model (default 512).

    Returns:
        The answer extracted from the text using the model.
    """
    
    # Note: "bert-base-uncased" is not fine-tuned for question answering; a SQuAD
    # fine-tuned checkpoint such as "bert-large-uncased-whole-word-masking-finetuned-squad"
    # will give far more meaningful answers.
    qa_model_name = "bert-base-uncased"  # Replace with your model

    # Loading the model on every call is slow; consider caching it (e.g. with st.cache_resource).
    qa_model = TFBertForQuestionAnswering.from_pretrained(qa_model_name)
    tokenizer = AutoTokenizer.from_pretrained(qa_model_name)

    # Add special tokens and tokenize; the tokenizer truncates the context to
    # max_length tokens, so no manual character-level truncation is needed.
    inputs = tokenizer(
        question, text, return_tensors="tf", padding="max_length",
        truncation=True, max_length=max_length,
    )

    outputs = qa_model(inputs)

    start_logits = outputs.start_logits
    end_logits = outputs.end_logits

    # Find the token indices of the most likely start and end of the answer span
    answer_start = int(tf.argmax(start_logits, axis=1)[0])
    answer_end = int(tf.argmax(end_logits, axis=1)[0]) + 1  # +1 for an exclusive end index

    # The logits index tokens, not characters, so decode the answer from the
    # input token ids rather than slicing the original string.
    answer_ids = inputs["input_ids"][0][answer_start:answer_end]
    answer = tokenizer.decode(answer_ids, skip_special_tokens=True).strip()

    return answer if answer else "No answer found."


## Streamlit app

st.set_page_config(page_title="PDF Summarizer and Q&A")
st.header("PDF Summarizer and Q&A")

# User options
st.subheader("Settings")
min_summary_length = st.slider("Minimum Summary Length", min_value=50, max_value=500, value=100)
summarization_model = "facebook/bart-large-cnn"
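# Note: facebook/bart-large-cnn accepts at most 1024 input tokens.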

# File upload and processing
uploaded_file = st.file_uploader("Choose a PDF file", type="pdf")
if uploaded_file is not None:
    with st.spinner("Processing..."):
        text = ""
        for page_layout in extract_pages(uploaded_file):
            for element in page_layout:
                text += preprocess_text(element) + "\n"
        if text:
            question = st.text_input("Ask a question about the PDF:")
            summarize_button = st.button("Generate Summary")
            if summarize_button:
                with st.spinner("Summarizing..."):
                    max_input_length = 1024  # Rough character cutoff; adjust for your model's input limit
                    truncated_text = text[:max_input_length]
                    summarizer = pipeline("summarization", model=summarization_model)
                    # Give max_length headroom above min_length so the slider cannot exceed the generation limit
                    summary_response = summarizer(
                        truncated_text,
                        min_length=min_summary_length,
                        max_length=min_summary_length + 100,
                    )
                    st.subheader("Summary")
                    st.write(summary_response[0]["summary_text"])
            if question:
                with st.spinner("Answering..."):
                    answer = answer_question(text, question)
                    st.subheader("Answer")
                    st.write(answer)
        else:
            st.error("No text found in the PDF.")