File size: 7,138 Bytes
32a98d6
f6f33c3
b1a0926
21c729c
f6f33c3
 
a289e51
f6f33c3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
740fc36
f6f33c3
 
 
 
 
 
f932e1e
eda36d5
f6f33c3
f932e1e
 
 
 
f6f33c3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
32a98d6
cee6e3a
c9d1679
85e21cb
 
 
1ed2504
 
ae7d0a8
 
85e21cb
1ed2504
 
c9d1679
ae7d0a8
66a64b7
c9d1679
66a64b7
c9d1679
 
 
28c2bb6
c9d1679
 
 
 
fce100b
 
 
 
 
 
 
 
 
1ed2504
fce100b
cc7ef1f
e7dc910
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4b8532f
 
e7dc910
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
import streamlit as st
import pandas as pd
from openai import OpenAI
from PyPDF2 import PdfReader

# source: eagle0504/document-search-q-series
def read_and_textify_advanced(files, chunk_size):
    """
    Reads PDF files and splits the text of each page into sentence-based segments.
    The text of every page is split on "." and the resulting sentences are regrouped
    into segments of at most 'chunk_size' sentences each. Segments never span two pages.
    Args:
    files: An iterable of uploaded PDF files (anything PdfReader accepts).
    chunk_size (int): The maximum number of sentences per text segment. Must be at least 1.
    Returns: A list of strings, one per segment, each terminated by ".". A page that
    yields no sentences contributes a single empty-string placeholder.
    """

    text_list = []  # Extracted text segments, in file/page order

    for file in files:
        pdf_reader = PdfReader(file)
        for page in pdf_reader.pages:
            text = page.extract_text()
            # Splitting on "." leaves blank fragments (e.g. after the final period of
            # the page); drop them so no segment consists of a lone ".".
            sentences = [s for s in text.split(".") if s.strip()] if text else []
            if sentences:
                for start in range(0, len(sentences), chunk_size):
                    chunk = ".".join(sentences[start:start + chunk_size]) + "."
                    text_list.append(chunk.strip())
            else:
                # Nothing usable on this page: still add a placeholder
                text_list.append("")
            # Empty the page object once its text has been read (kept from the
            # original code, where it is described as optional memory management)
            page.clear()

    return text_list

def get_questions(context, instructions) -> str:
    """
    Given a text context, generates questions about it using the OpenAI chat completions API.

    The request is sent through the module-level `client`, which must have been
    created before this function is called.

    Args:
    - context: A string representing the context for which questions should be generated.
    - instructions: A string telling the model how the questions should be written;
      it is placed before the context in the prompt.

    Returns:
    - A string containing the text generated by the API, or an empty string if the
      request failed or the reply carried no text.
    """

    try:
        response = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
              {"role": "system", "content": "You are a helpful assistant."},
              {"role": "user", "content": f"{instructions}\n\nText: {context}\n\nQuestions:\n"}
            ]
        )
        # Extract question text from the response; the content may be None,
        # so fall back to "" to honour the declared return type
        return response.choices[0].message.content or ""
    except Exception as e:
        # Best effort: report the problem (as get_answers does) and return an
        # empty string so one failed chunk does not abort the whole run
        print(e)
        return ""

def get_answers(row, instructions) -> str:
    """
    Produces an answer for one context/questions pair via the OpenAI chat completions API.

    The request is sent through the module-level `client`.

    Args:
    - row: An object exposing 'context' and 'questions' attributes (e.g. a pandas dataframe row).
    - instructions: A string placed at the start of the prompt, before the context and questions.

    Returns:
    - The text generated by the API, or an empty string if anything went wrong.
    """

    try:
        create_completion = client.chat.completions.create
        prompt = f"{instructions}\n\nText: {row.context}\n\nQuestions:\n{row.questions}\n\nAnswers:\n"
        conversation = [
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": prompt},
        ]
        completion = create_completion(model="gpt-4o-mini", messages=conversation)
        # The generated answer is the content of the first choice
        return completion.choices[0].message.content
    except Exception as err:
        # Report the failure and fall back to an empty answer
        print(err)
        return ""

st.set_page_config(page_title="ChatbotGuide", layout="wide")
st.title("Chatbot Guide")

# Session-state values that survive Streamlit reruns:
#   submit  - True while a submitted job still has to be processed
#   error   - message shown at the top of the page ("" means no error)
#   success - encoded CSV of the generated Q/A pairs (None until a run succeeds)
if 'submit' not in st.session_state:
    st.session_state.submit = False
if 'error' not in st.session_state:
    st.session_state.error = ""
if 'success' not in st.session_state:
    st.session_state.success = None

if st.session_state.error:
    st.error(st.session_state.error)

if st.session_state.success is not None:
    # A previous run finished: offer the result and a way back to the upload form
    st.success("Success! Download the Q/A pairs below / Click reset to upload more PDFs")
    st.download_button(
        label="Download CSV",
        data=st.session_state.success,
        file_name='questions_answers.csv',
        mime='text/csv',
    )
    if st.button('Reset'):
        st.session_state.success = None
        st.rerun()

else:
    uploaded_files = st.file_uploader("Upload PDFs Here", type="pdf", accept_multiple_files=True)

    question_protocol = st.text_input("Provide instructions for how questions should be generated", "Write a question based on the text")
    answer_protocol = st.text_input("Provide instructions for how answers should be generated", "Write an answer based on the text")

    sentence_chunks = st.number_input("Number sentences per Q/A pair", value=2, step=1, min_value=1, max_value=3)

    openai_api_key = st.text_input("Enter your OpenAI API key", type="password")

    submit = st.button("Submit")

    if submit:
        st.session_state.submit = True

    if st.session_state.submit:
        if uploaded_files:
            # Probe the API with a trivial request before doing any real work.
            # `client` is assigned at module level because get_questions and
            # get_answers read it as a global.
            try:
                client = OpenAI(api_key=openai_api_key)
                client.chat.completions.create(
                    model="gpt-4o-mini",
                    messages=[
                      {"role": "system", "content": "You are a helpful assistant."},
                      {"role": "user", "content": "Say this is a test"}
                    ]
                )
            except Exception:
                # Any failure of the probe request is reported as a key problem
                st.session_state.error = "OpenAI API key is invalid"
                st.session_state.success = None
                st.session_state.submit = False
                st.rerun()

            textify_output = read_and_textify_advanced(uploaded_files, sentence_chunks)

            if not textify_output:
                # Nothing was extracted (e.g. PDFs without pages): report it
                # instead of failing while building the dataframe
                st.session_state.error = "No text could be extracted from the uploaded PDFs"
                st.session_state.success = None
                st.session_state.submit = False
                st.rerun()

            # Name the column at construction so the frame is well-formed
            # regardless of how many segments were extracted
            df = pd.DataFrame({'context': textify_output})

            if question_protocol == "":
                question_protocol = "Write questions based on the text"
            df['questions'] = df.apply(lambda row: get_questions(row['context'], question_protocol), axis=1)

            if answer_protocol == "":
                answer_protocol = "Write answers based on the text"
            df['answers'] = df.apply(lambda row: get_answers(row, answer_protocol), axis=1)

            # Only the generated questions and answers go into the download
            df = df.drop('context', axis=1)

            csv = df.to_csv(index=False).encode('utf-8')

            st.session_state.error = ""
            st.session_state.success = csv
            st.session_state.submit = False
            st.rerun()
        else:
            st.session_state.error = "Please upload at least 1 PDF"
            st.session_state.success = None
            st.session_state.submit = False
            st.rerun()