import os
import re
import pdfminer
from pdfminer.high_level import extract_pages
from transformers import pipeline, TFBertForQuestionAnswering, AutoTokenizer

import tensorflow as tf
import streamlit as st


def preprocess_text(element):
    """Normalise the text carried by a single PDF layout element.

    Args:
        element: A layout object obtained while iterating a pdfminer page.

    Returns:
        The element's text with surrounding whitespace stripped, every
        character that is neither a word character nor whitespace removed,
        and the result lower-cased. Elements that are not
        ``LTTextBoxHorizontal`` instances produce an empty string.
    """
    # Guard clause: anything other than a horizontal text box is ignored.
    if not isinstance(element, pdfminer.layout.LTTextBoxHorizontal):
        return ""

    raw = element.get_text().strip()
    # Drop punctuation and symbols; word characters and whitespace survive.
    cleaned = re.sub(r'[^\w\s]', '', raw)
    return cleaned.lower()


def answer_question(text, question, max_length=512):
    """Answer a question about ``text`` with an extractive BERT QA model.

    The question and the text are encoded together as one sequence pair, the
    model scores every token as a possible answer start and answer end, and
    the highest-scoring start/end tokens delimit the answer span.

    Args:
        text: The preprocessed text from the PDF, used as the context.
        question: The user's question.
        max_length: Upper bound on the number of tokens in the encoded
            question/context pair. It is capped at the tokenizer's own
            ``model_max_length``; longer inputs are truncated.

    Returns:
        The answer decoded from the predicted token span, or
        "No answer found." when the inputs are blank, the predicted end
        precedes the predicted start, or the span decodes to nothing.
    """
    if not text.strip() or not question.strip():
        return "No answer found."

    # The QA head must be fine-tuned: a plain "bert-base-uncased" checkpoint
    # ships without trained question-answering weights, so its start/end
    # scores carry no signal. This checkpoint is BERT fine-tuned on SQuAD and
    # is uncased, which matches the lower-cased text produced upstream.
    qa_model_name = "bert-large-uncased-whole-word-masking-finetuned-squad"

    # NOTE: model and tokenizer are loaded on every call.
    qa_model = TFBertForQuestionAnswering.from_pretrained(qa_model_name)
    tokenizer = AutoTokenizer.from_pretrained(qa_model_name)

    # Let the tokenizer enforce the length limit in tokens. Slicing the raw
    # string by characters would throw away context the model could still
    # read. No padding is requested: with a single example it would only add
    # positions that can never hold an answer.
    token_budget = min(max_length, tokenizer.model_max_length)
    inputs = tokenizer(
        question,
        text,
        return_tensors="tf",
        truncation=True,
        max_length=token_budget,
    )

    outputs = qa_model(inputs)

    # Most likely start token and (exclusive) end token of the answer.
    answer_start = int(tf.argmax(outputs.start_logits, axis=1).numpy()[0])
    answer_end = int(tf.argmax(outputs.end_logits, axis=1).numpy()[0]) + 1

    if answer_end <= answer_start:
        return "No answer found."

    # The predicted positions index the encoded token sequence (question and
    # context together), not the characters of ``text``, so the answer is
    # recovered by decoding the selected token ids.
    answer_ids = inputs["input_ids"][0][answer_start:answer_end].numpy().tolist()
    answer = tokenizer.decode(answer_ids, skip_special_tokens=True).strip()

    return answer if answer else "No answer found."


## Streamlit app

# Extra tokens allowed above the requested minimum summary length, so the
# minimum and maximum generation lengths can never contradict each other.
SUMMARY_LENGTH_HEADROOM = 100


def _extract_pdf_text(pdf_file):
    """Return the preprocessed text of every text element in ``pdf_file``.

    Elements for which ``preprocess_text`` yields nothing but whitespace are
    skipped, so a PDF without usable text produces an empty string.
    """
    pieces = []
    for page_layout in extract_pages(pdf_file):
        for element in page_layout:
            piece = preprocess_text(element)
            if piece.strip():
                pieces.append(piece)
    return "\n".join(pieces)


st.set_page_config(page_title="PDF Summarizer and Q&A")
st.header("PDF Summarizer and Q&A")

# User options
st.subheader("Settings")
min_summary_length = st.slider("Minimum Summary Length", min_value=50, max_value=500, value=100)
summarization_model = "facebook/bart-large-cnn"

# File upload and processing
uploaded_file = st.file_uploader("Choose a PDF file")
if uploaded_file is not None:
    # Only the extraction runs under this spinner; the widgets and the model
    # calls below get their own spinners.
    with st.spinner("Processing..."):
        text = _extract_pdf_text(uploaded_file)

    if text:
        question = st.text_input("Ask a question about the PDF:")
        summarize_button = st.button("Generate Summary")

        if summarize_button:
            with st.spinner("Summarizing..."):
                max_input_length = 1024  # Character budget for the summarizer input
                truncated_text = text[:max_input_length]
                summarizer = pipeline("summarization", model=summarization_model)
                summary_response = summarizer(
                    truncated_text,
                    min_length=min_summary_length,
                    # Always keep the maximum above the slider's minimum.
                    max_length=min_summary_length + SUMMARY_LENGTH_HEADROOM,
                )
                st.subheader("Summary")
                st.write(summary_response[0]["summary_text"])

        if question:
            with st.spinner("Answering..."):
                answer = answer_question(text, question)
                st.subheader("Answer")
                st.write(answer)
    else:
        st.error("No text found in the PDF.")