import pandas as pd
import PyPDF2
import streamlit as st
from openai import OpenAI

# source: eagle0504/document-search-q-series

# Chat model used for both question and answer generation.
MODEL_NAME = "gpt-4o-mini"


def read_and_textify_advanced(files, chunk_size):
    """
    Read PDF files and return overlapping sentence windows from every page.

    Each page's text is split on ". " into sentences. For every sentence j a
    chunk is built from the sentences j - chunk_size .. j + chunk_size
    (clamped to the page), so a chunk holds at most 2 * chunk_size + 1
    sentences and neighbouring chunks overlap.

    Args:
        files: An iterable of PDF sources accepted by ``PyPDF2.PdfReader``
            (the script passes the files returned by ``st.file_uploader``).
        chunk_size (int): Number of sentences taken on each side of the
            centre sentence.

    Returns:
        A list of strings, one per sentence of every page. A page with no
        extractable text contributes a single empty string.
    """
    text_list = []  # Extracted text segments, in file/page/sentence order

    for file in files:
        reader = PyPDF2.PdfReader(file)

        for page in reader.pages:
            text = page.extract_text()

            if text:
                sentences = text.split(". ")
                for j in range(len(sentences)):
                    # Slicing clamps the upper bound to the end of the page,
                    # so only the lower bound needs an explicit guard.
                    start = max(0, j - chunk_size)
                    window = sentences[start:j + chunk_size + 1]
                    # NOTE: a period is always appended, so a window that
                    # already ends in "." finishes with "..".
                    text_list.append(". ".join(window) + ".")
            else:
                # If no text was extracted, still add a placeholder
                text_list.append("")

            page.clear()  # Drop the page's contents (memory management)

    return text_list


def _chat_completion(prompt: str) -> str:
    """
    Send a single user prompt to the chat model and return the reply text.

    Relies on the module-level ``client``, which the script creates once the
    user submits an API key. Any failure (bad key, network error, missing
    client) is printed and turned into an empty string so that one failing
    row does not abort the whole run.
    """
    try:
        response = client.chat.completions.create(
            model=MODEL_NAME,
            messages=[
                {"role": "system", "content": "You are a helpful assistant."},
                {"role": "user", "content": prompt},
            ],
        )
        return response.choices[0].message.content
    except Exception as e:
        # Best effort: report the problem and fall back to an empty result
        print(e)
        return ""


def get_questions(context, instructions) -> str:
    """
    Generate questions for a text context using the OpenAI chat API.

    Args:
        - context: A string with the text the questions should be about.
        - instructions: A string telling the model how to write the questions.

    Returns:
        - A string containing the generated questions, or an empty string if
          the API call failed.
    """
    return _chat_completion(f"{instructions}\n\nText: {context}\n\nQuestions:\n")


def get_answers(row, instructions) -> str:
    """
    Generate answers for one context/questions pair using the OpenAI chat API.

    Args:
        - row: An object exposing ``context`` and ``questions`` attributes,
          such as a pandas dataframe row.
        - instructions: A string telling the model how to write the answers.

    Returns:
        - A string containing the generated answers, or an empty string if
          the API call failed.
    """
    return _chat_completion(
        f"{instructions}\n\nText: {row.context}\n\nQuestions:\n{row.questions}\n\nAnswers:\n"
    )


st.set_page_config(page_title="ChatbotGuide", layout="wide")
st.title("Chatbot Guide")

# TODO: write instructions for input PDF
# TODO: write instructions for these inputs
uploaded_files = st.file_uploader("Upload PDFs Here", type="pdf", accept_multiple_files=True)
# `or []` guards against a None result when nothing has been uploaded
st.success(f"{len(uploaded_files or [])} document(s) loaded...")

question_protocol = st.text_input(
    "Provide instructions for how questions should be generated",
    "Write questions based on the text",
)
answer_protocol = st.text_input(
    "Provide instructions for how answers should be generated",
    "Write answers based on the text",
)
sentence_chunks = st.number_input(
    "Number sentences per Q/A pair", value=2, step=1, min_value=1, max_value=3
)
openai_api_key = st.text_input("Enter your OpenAI API key", type="password")

submit = st.button("Submit")

if submit:
    if not openai_api_key:
        st.error("Please enter your OpenAI API key")
    elif not uploaded_files:
        st.error("Please upload at least 1 PDF")
    else:
        # NOTE: API failures (including an invalid key) are caught inside
        # _chat_completion, printed, and show up as empty cells in the CSV.
        client = OpenAI(api_key=openai_api_key)

        textify_output = read_and_textify_advanced(uploaded_files, sentence_chunks)
        # Building from a dict keeps the 'context' column even with no chunks
        df = pd.DataFrame({'context': textify_output})

        # Fall back to the default prompts when the user cleared the fields
        if not question_protocol:
            question_protocol = "Write questions based on the text"
        df['questions'] = [
            get_questions(context, question_protocol) for context in df['context']
        ]

        if not answer_protocol:
            answer_protocol = "Write answers based on the text"
        df['answers'] = [
            get_answers(row, answer_protocol) for row in df.itertuples(index=False)
        ]

        # Only the generated Q/A pairs are exported, not the source text
        df = df.drop(columns=['context'])

        csv = df.to_csv(index=False).encode('utf-8')
        st.download_button(
            label="Download Q/A pairs as CSV",
            data=csv,
            file_name='questions_answers.csv',
            mime='text/csv',
        )