import streamlit as st
import pandas as pd
from openai import OpenAI
from PyPDF2 import PdfReader

# source: eagle0504/document-search-q-series
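# Streamlit app for building a chatbot dataset step by step: step 1 turns uploaded
# PDFs into Q/A pairs via OpenAI's chat completions API, step 2 merges the resulting
# CSVs, and steps 3-4 cover uploading datasets and creating the chatbot itself.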
def read_and_textify_advanced(files, chunk_size):
    """
    Reads PDF files and extracts text from each page, breaking the text into smaller segments.

    This function iterates over a list of uploaded PDF files, extracts text from each page,
    and compiles a list of text segments of approximately 'chunk_size' sentences each.

    Args:
        files (List[UploadedFile]): A list of PDF files from st.file_uploader.
        chunk_size (int): The number of sentences per text segment.

    Returns:
        A list of strings, where each string is a segment of text extracted from a PDF page.
    """
    text_list = []  # List to store extracted text segments

    # Iterate over each file
    for file in files:
        pdfReader = PdfReader(file)  # Create a PDF reader object

        # Iterate over each page in the PDF
        for i in range(len(pdfReader.pages)):
            pageObj = pdfReader.pages[i]  # Get the page object
            text = pageObj.extract_text()  # Extract text from the page

            if text:
                # Split the page text on periods and regroup into chunks of
                # approximately 'chunk_size' sentences each
                sentences = text.split(".")
                for j in range(0, len(sentences), chunk_size):
                    chunk = ".".join(sentences[j:j + chunk_size]) + "."
                    chunk = chunk.strip()
                    text_list.append(chunk)
            else:
                # If no text was extracted, still add a placeholder
                text_list.append("")

            pageObj.clear()  # Clear the page object (optional, for memory management)

    return text_list
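# Illustrative usage (hypothetical values): with chunk_size=2, a page whose extracted
# text is "A. B. C." is split on periods and regrouped into chunks of roughly two
# sentences each (e.g. "A. B."); pages with no extractable text contribute a single "".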
def get_questions(context, instructions) -> str:
    """
    Given a text context, generates questions using OpenAI's chat completions API.

    Args:
        context: A string with the context for which questions should be generated.
        instructions: A string describing how the questions should be generated.

    Returns:
        A string containing the questions generated by the API.
    """
    try:
        # Relies on the module-level 'client' created after the API key is submitted
        response = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {"role": "system", "content": "You are a helpful assistant."},
                {"role": "user", "content": f"{instructions}\n\nText: {context}\n\nQuestions:\n"}
            ]
        )
        # Extract the question text from the response
        question_text = response.choices[0].message.content
        return question_text
    except Exception:
        # Return an empty string if there was an error
        return ""
def get_answers(row, instructions) -> str:
    """
    Given a dataframe row containing context and questions, generates an answer using
    OpenAI's chat completions API.

    Args:
        row: A pandas dataframe row with 'context' and 'questions' columns.
        instructions: A string describing how the answers should be generated.

    Returns:
        A string containing the answer generated by the API.
    """
    try:
        response = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {"role": "system", "content": "You are a helpful assistant."},
                {"role": "user", "content": f"{instructions}\n\nText: {row.context}\n\nQuestions:\n{row.questions}\n\nAnswers:\n"}
            ]
        )
        # Extract the answer text from the response
        answer_text = response.choices[0].message.content
        return answer_text
    except Exception as e:
        # Print the error message and return an empty string if there was an error
        print(e)
        return ""
st.set_page_config(page_title="ChatbotGuide", layout="wide")
st.title("Chatbot Guide")

# Define the options in the dropdown menu
app_options = [
    "1) Create CSVs",
    "2) Merge CSVs",
    "3) Upload Datasets",
    "4) Create Chatbot"
]

# Sidebar dropdown for selecting the application
selected_app = st.sidebar.selectbox("Select Step (1-4)", app_options)

# Clear session state when switching apps
if 'last_selected_app' in st.session_state:
    if st.session_state.last_selected_app != selected_app:
        st.session_state.clear()
st.session_state.last_selected_app = selected_app

if 'submit' not in st.session_state:
    st.session_state.submit = False
if 'error' not in st.session_state:
    st.session_state.error = ""
if 'success' not in st.session_state:
    st.session_state.success = None

if st.session_state.error != "":
    st.error(st.session_state.error)

if st.session_state.success is not None:
    st.success("Success! Download the Q/A pairs below / Click reset to upload more PDFs")
    st.download_button(
        label="Download CSV",
        data=st.session_state.success,
        file_name='questions_answers.csv',
        mime='text/csv',
    )
    if st.button('Reset'):
        st.session_state.clear()
        st.rerun()
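# Each step below follows the same session-state flow: the Submit button sets
# st.session_state.submit, the step stores either CSV bytes in st.session_state.success
# or a message in st.session_state.error, and st.rerun() re-renders the download/error
# UI above.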
if selected_app == "1) Create CSVs": | |
if st.session_state.success == None: | |
uploaded_files = st.file_uploader("Upload PDFs Here", type="pdf", accept_multiple_files=True) | |
question_protocol = st.text_input("Provide instructions for how questions should be generated", "Write a question based on the text") | |
answer_protocol = st.text_input("Provide instructions for how answers should be generated", "Write an answer based on the text") | |
sentence_chunks = st.number_input("Number sentences per Q/A pair", value=2, step=1, min_value=1, max_value=3) | |
openai_api_key = st.text_input("Enter your OpenAI API key", type="password") | |
submit = st.button("Submit") | |
if submit: | |
st.session_state.submit = True | |
if st.session_state.submit: | |
if uploaded_files: | |
client = OpenAI(api_key=openai_api_key) | |
# test api key | |
try: | |
response = client.chat.completions.create( | |
model="gpt-4o-mini", | |
messages=[ | |
{"role": "system", "content": "You are a helpful assistant."}, | |
{"role": "user", "content": "Say this is a test"} | |
] | |
) | |
except: | |
st.session_state.error = "OpenAI API key is invalid" | |
st.session_state.success = None | |
st.session_state.submit = False | |
st.rerun() | |
with st.spinner("Loading, please be patient with us ... π"): | |
textify_output = read_and_textify_advanced(uploaded_files, sentence_chunks) | |
df = pd.DataFrame(textify_output) | |
df.columns = ['context'] | |
if question_protocol == "": | |
question_protocol = "Write questions based on the text" | |
df['questions'] = df.apply(lambda row: get_questions(row['context'], question_protocol), axis=1) | |
if answer_protocol == "": | |
answer_protocol = "Write answers based on the text" | |
df['answers'] = df.apply(lambda row: get_answers(row, answer_protocol), axis=1) | |
df = df.drop('context', axis=1) | |
csv = df.to_csv(index=False).encode('utf-8') | |
st.session_state.error = "" | |
st.session_state.success = csv | |
st.session_state.submit = False | |
st.rerun() | |
else: | |
st.session_state.error = "Please upload at least 1 PDF" | |
st.session_state.success = None | |
st.session_state.submit = False | |
st.rerun() | |
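# The CSV produced by step 1 has two columns, 'questions' and 'answers', one row per
# text chunk (the intermediate 'context' column is dropped before export).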
if selected_app == "2) Merge CSVs": | |
if st.session_state.success == None: | |
uploaded_files = st.file_uploader("Upload CSV files to merge", accept_multiple_files=True, type="csv") | |
submit = st.button("Submit") | |
if submit: | |
st.session_state.submit = True | |
if st.session_state.submit: | |
if len(uploaded_files) > 1: | |
a = 1 | |
else: | |
st.session_state.error = "Please upload at least 2 CSVs to merge" | |
st.session_state.success = None | |
st.session_state.submit = False | |
st.rerun() | |
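# The merge logic itself is not shown above ('a = 1' is a placeholder). A minimal
# sketch (an assumption, not the original implementation) would mirror step 1's flow:
#   merged = pd.concat([pd.read_csv(f) for f in uploaded_files], ignore_index=True)
#   st.session_state.success = merged.to_csv(index=False).encode('utf-8')
#   st.session_state.error = ""
#   st.session_state.submit = False
#   st.rerun()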