import pandas as pd
import PyPDF2
import streamlit as st
from openai import OpenAI

# source: eagle0504/document-search-q-series
def read_and_textify_advanced(files, chunk_size):
    """
    Extracts overlapping sentence windows from every page of the given PDF files.

    The text of each page is split into sentences on the literal separator ". ".
    For every sentence j on a page, one segment is produced that spans sentences
    j - chunk_size through j + chunk_size (clamped to the page boundaries), so
    neighbouring segments overlap and a page with n sentences yields n segments.
    A page from which no text could be extracted contributes one empty string.

    Args:
    files (List[st.uploaded_file_manager.UploadedFile]): A list of uploaded PDF files.
    chunk_size (int): The number of neighbouring sentences included on each side
        of the centre sentence of a segment.
    Returns: A list of strings, one per segment, in file order and then page order.
    """

    text_list = []  # Collected text segments across all files and pages

    for file in files:
        pdf_reader = PyPDF2.PdfReader(file)
        for page in pdf_reader.pages:
            text = page.extract_text()
            if text:
                sentences = text.split(". ")
                for centre in range(len(sentences)):
                    # Window of sentences around `centre`, clamped to the page
                    start = max(0, centre - chunk_size)
                    end = min(len(sentences), centre + chunk_size + 1)
                    chunk = ". ".join(sentences[start:end])
                    # The split consumed the period of every sentence except the
                    # page's last one; only restore it when the window does not
                    # already end in terminal punctuation (avoids "..", "?.").
                    if not chunk.rstrip().endswith((".", "!", "?")):
                        chunk += "."
                    text_list.append(chunk)
            else:
                # If no text extracted, still add a placeholder
                text_list.append("")
            # Drop the page's contents once processed (kept from the original,
            # which describes it as optional memory management)
            page.clear()

    return text_list

def get_questions(context, instructions) -> str:
    """
    Given a text context, generates questions using the OpenAI chat completions API.

    The request is sent through the global `client` to the "gpt-4o-mini" model as a
    single user message made of the instructions followed by the text.

    Args:
    - context: A string representing the context for which questions should be generated.
    - instructions: A string telling the model how the questions should be written;
      it is placed at the top of the prompt.

    Returns:
    - A string containing the content of the first completion choice, or an empty
      string if the request raised an exception (the error is printed).
    """

    try:
        response = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
              {"role": "system", "content": "You are a helpful assistant."},
              {"role": "user", "content": f"{instructions}\n\nText: {context}\n\nQuestions:\n"}
            ]
        )
        # Extract question text from the response
        return response.choices[0].message.content
    except Exception as e:
        # Catch Exception rather than everything so Ctrl-C / SystemExit still
        # propagate; print the error (as get_answers does) instead of hiding it.
        print(e)
        return ""

def get_answers(row, instructions) -> str:
    """
    Answers the questions stored on a row by calling the OpenAI chat completions API.

    Args:
    - row: An object exposing `context` and `questions` attributes, such as a pandas dataframe row.
    - instructions: A string placed at the top of the prompt, ahead of the text and questions.

    Returns:
    - A string with the content of the first completion choice, or an empty string
      when any exception is raised (the exception is printed).
    """

    try:
        # Look up the endpoint before building the prompt so the order of
        # evaluation stays: client first, then the row attributes.
        create_completion = client.chat.completions.create
        system_message = {"role": "system", "content": "You are a helpful assistant."}
        user_message = {
            "role": "user",
            "content": f"{instructions}\n\nText: {row.context}\n\nQuestions:\n{row.questions}\n\nAnswers:\n",
        }
        completion = create_completion(
            model="gpt-4o-mini",
            messages=[system_message, user_message],
        )
        first_choice = completion.choices[0]
        return first_choice.message.content
    except Exception as err:
        # Report the failure and fall back to an empty answer
        print(err)
        return ""

# ---------------------------------------------------------------------------
# Streamlit page: collect PDFs, prompt instructions and an API key, then build
# a CSV of question/answer pairs for download.
# ---------------------------------------------------------------------------
st.set_page_config(page_title="ChatbotGuide", layout="wide")
st.title("Chatbot Guide")

uploaded_files = st.file_uploader("Upload PDFs Here", type="pdf", accept_multiple_files=True)
# Only report a count once something has been uploaded; this also avoids
# calling len() on a missing value.
if uploaded_files:
    st.success(f"{len(uploaded_files)} document(s) loaded...")

question_protocol = st.text_input("Provide instructions for how questions should be generated", "Write questions based on the text")
answer_protocol = st.text_input("Provide instructions for how answers should be generated", "Write answers based on the text")

sentence_chunks = st.number_input("Number sentences per Q/A pair", value=2, step=1, min_value=1, max_value=3)

openai_api_key = st.text_input("Enter your OpenAI API key", type="password")

submit = st.button("Submit")

if submit:
    if not openai_api_key:
        # Previously a missing key was ignored without any feedback
        st.error("Please enter your OpenAI API key")
    elif not uploaded_files:
        st.error("Please upload at least 1 PDF")
    else:
        # get_questions / get_answers read this module-level name
        client = OpenAI(api_key=openai_api_key)

        textify_output = read_and_textify_advanced(uploaded_files, sentence_chunks)
        if not textify_output:
            st.error("No text could be extracted from the uploaded PDF(s)")
        else:
            # Build the frame from a dict so the column name is set explicitly
            df = pd.DataFrame({'context': textify_output})

            # Fall back to the default instructions when the field was cleared
            question_protocol = question_protocol or "Write questions based on the text"
            df['questions'] = df['context'].apply(lambda context: get_questions(context, question_protocol))

            answer_protocol = answer_protocol or "Write answers based on the text"
            # Row-wise: get_answers needs both the context and the questions
            df['answers'] = df.apply(lambda row: get_answers(row, answer_protocol), axis=1)

            # drop is a method; the export keeps only questions and answers
            df = df.drop(columns=['context'])

            csv = df.to_csv(index=False).encode('utf-8')
            st.download_button(
                label="Download Q/A pairs as CSV",
                data=csv,
                file_name='questions_answers.csv',
                mime='text/csv',
            )