# ChatbotGuide / create.py
import streamlit as st
import pandas as pd
from openai import OpenAI
from PyPDF2 import PdfReader
# source: eagle0504/document-search-q-series
def read_and_textify_advanced(files, chunk_size):
"""
Reads PDF files and extracts text from each page, breaking the text into specified segments.
This function iterates over a list of uploaded PDF files, extracts text from each page,
and compiles a list of texts and corresponding source information, segmented into smaller parts
of approximately 'chunk_size' words each.
Args:
files (List[st.uploaded_file_manager.UploadedFile]): A list of uploaded PDF files.
chunk_size (int): The number of words per text segment. Default is 50.
Returns: A list of strings, where each string is a segment of text extracted from a PDF page.
"""
text_list = [] # List to store extracted text segments
# Iterate over each file
for file in files:
pdfReader = PdfReader(file) # Create a PDF reader object
# Iterate over each page in the PDF
for i in range(len(pdfReader.pages)):
pageObj = pdfReader.pages[i] # Get the page object
text = pageObj.extract_text() # Extract text from the page
if text:
                # Split the page text into sentences and regroup them into
                # chunks of approximately 'chunk_size' sentences each
                sentences = text.split(".")
                for j in range(0, len(sentences), chunk_size):
                    chunk = ".".join(sentences[j:j+chunk_size]) + '.'
                    chunk = chunk.strip()
                    text_list.append(chunk)
else:
# If no text extracted, still add a placeholder
text_list.append("")
pageObj.clear() # Clear the page object (optional, for memory management)
return text_list
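# Illustrative usage sketch (not executed by the app; the file name below is hypothetical):
# with chunk_size=2, each page's text is split on "." and regrouped into segments of
# roughly two sentences each.
#
#     with open("example.pdf", "rb") as f:
#         segments = read_and_textify_advanced([f], chunk_size=2)
#     print(len(segments), segments[:3])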
def get_questions(client, context, instructions) -> str:
    """
    Given a text context, generates questions using OpenAI's chat completions API.

    Args:
    - client: An OpenAI client instance used to call the API.
    - context: A string representing the context for which questions should be generated.
    - instructions: A string describing how the questions should be written.

    Returns:
    - A string containing the questions generated by the API.
    """
try:
response = client.chat.completions.create(
model="gpt-4o-mini",
messages=[
{"role": "system", "content": "You are a helpful assistant."},
{"role": "user", "content": f"{instructions}\n\nText: {context}\n\nQuestions:\n"}
]
)
# Extract question text from the response
question_text = response.choices[0].message.content
return question_text
    except Exception as e:
        # Print the error message and return an empty string if there was an error
        print(e)
        return ""
def get_answers(client, row, instructions) -> str:
    """
    Given a dataframe row containing context and questions, generates an answer using OpenAI's chat completions API.

    Args:
    - client: An OpenAI client instance used to call the API.
    - row: A pandas dataframe row containing 'context' and 'questions' columns.
    - instructions: A string describing how the answers should be written.

    Returns:
    - A string containing the answer generated by the API.
    """
try:
response = client.chat.completions.create(
model="gpt-4o-mini",
messages=[
{"role": "system", "content": "You are a helpful assistant."},
{"role": "user", "content": f"{instructions}\n\nText: {row.context}\n\nQuestions:\n{row.questions}\n\nAnswers:\n"}
]
)
# Extract answer text from the response
answer_text = response.choices[0].message.content
return answer_text
except Exception as e:
# Print the error message and return an empty string if there was an error
        print(e)
return ""
def run():
st.set_page_config(page_title="ChatbotGuide", layout="wide")
st.title("Chatbot Guide")
if 'submit' not in st.session_state:
st.session_state.submit = False
if 'error' not in st.session_state:
st.session_state.error = ""
if 'success' not in st.session_state:
st.session_state.success = None
if st.session_state.error != "":
st.error(st.session_state.error)
    if st.session_state.success is not None:
st.success("Success! Download the Q/A pairs below / Click reset to upload more PDFs")
st.download_button(
label="Download CSV",
data=st.session_state.success,
file_name='questions_answers.csv',
mime='text/csv',
)
if st.button('Reset'):
st.session_state.success = None
st.rerun()
else:
uploaded_files = st.file_uploader("Upload PDFs Here", type="pdf", accept_multiple_files=True)
question_protocol = st.text_input("Provide instructions for how questions should be generated", "Write a question based on the text")
answer_protocol = st.text_input("Provide instructions for how answers should be generated", "Write an answer based on the text")
        sentence_chunks = st.number_input("Number of sentences per Q/A pair", value=2, step=1, min_value=1, max_value=3)
openai_api_key = st.text_input("Enter your OpenAI API key", type="password")
submit = st.button("Submit")
if submit:
st.session_state.submit = True
if st.session_state.submit:
if uploaded_files:
client = OpenAI(api_key=openai_api_key)
                # Verify that the provided API key works with a minimal test request
try:
response = client.chat.completions.create(
model="gpt-4o-mini",
messages=[
{"role": "system", "content": "You are a helpful assistant."},
{"role": "user", "content": "Say this is a test"}
]
)
                except Exception:
                    st.session_state.error = "OpenAI API key is invalid"
                    st.session_state.success = None
                    st.session_state.submit = False
                    st.rerun()
textify_output = read_and_textify_advanced(uploaded_files, sentence_chunks)
df = pd.DataFrame(textify_output)
df.columns = ['context']
if question_protocol == "":
question_protocol = "Write questions based on the text"
                df['questions'] = df.apply(lambda row: get_questions(client, row['context'], question_protocol), axis=1)
if answer_protocol == "":
answer_protocol = "Write answers based on the text"
                df['answers'] = df.apply(lambda row: get_answers(client, row, answer_protocol), axis=1)
df = df.drop('context', axis=1)
csv = df.to_csv(index=False).encode('utf-8')
st.session_state.error = ""
st.session_state.success = csv
st.session_state.submit = False
                st.rerun()
else:
st.session_state.error = "Please upload at least 1 PDF"
st.session_state.success = None
st.session_state.submit = False
                st.rerun()
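# Entry-point guard (an assumption: create.py may also be imported as a page of a
# larger app, so run() is only called when this file is executed directly,
# e.g. via `streamlit run create.py`).
if __name__ == "__main__":
    run()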