import streamlit as st
import pandas as pd
from openai import OpenAI
from PyPDF2 import PdfReader

# source: eagle0504/document-search-q-series
def read_and_textify_advanced(files, chunk_size):
    """
    Reads PDF files and extracts text from each page, breaking the text into smaller segments.
    This function iterates over a list of uploaded PDF files, extracts text from each page,
    and compiles a list of text segments of approximately 'chunk_size' sentences each.
    Args:
    files (List[st.uploaded_file_manager.UploadedFile]): A list of uploaded PDF files.
    chunk_size (int): The number of sentences per text segment.
    Returns: A list of strings, where each string is a segment of text extracted from a PDF page.
    """

    text_list = []  # List to store extracted text segments

    # Iterate over each file
    for file in files:
        pdfReader = PdfReader(file)  # Create a PDF reader object
        # Iterate over each page in the PDF
        for i in range(len(pdfReader.pages)):
            pageObj = pdfReader.pages[i]  # Get the page object
            text = pageObj.extract_text()  # Extract text from the page
            if text:
                # Split the page text into chunks of approximately 'chunk_size' sentences
                sentences = text.split(".")
                for j in range(0, len(sentences), chunk_size):
                    # Join sentences j through j + chunk_size back into a single chunk
                    chunk = ".".join(sentences[j:j + chunk_size]) + '.'
                    chunk = chunk.strip()
                    text_list.append(chunk)
            else:
                # If no text extracted, still add a placeholder
                text_list.append("")
            pageObj.clear()  # Clear the page object (optional, for memory management)

    return text_list
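
# Illustrative usage sketch (hypothetical variable names), assuming 'uploaded' holds the
# list returned by st.file_uploader(..., accept_multiple_files=True):
#   chunks = read_and_textify_advanced(uploaded, chunk_size=2)
#   # each element of 'chunks' is a string of roughly two sentences taken from one PDF page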

def get_questions(context, instructions) -> str:
    """
    Given a text context, generates questions using OpenAI's chat completions API (gpt-4o-mini).
    Relies on the global OpenAI 'client' created after the user submits an API key.

    Args:
    - context: A string representing the context for which questions should be generated.
    - instructions: A string describing how the questions should be generated.

    Returns:
    - A string containing the questions generated by the API, or an empty string on error.
    """

    try:
        response = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
              {"role": "system", "content": "You are a helpful assistant."},
              {"role": "user", "content": f"{instructions}\n\nText: {context}\n\nQuestions:\n"}
            ]
        )
        # Extract question text from the response
        question_text = response.choices[0].message.content
        return question_text
    except Exception:
        # Return an empty string if there was an error
        return ""

def get_answers(row, instructions) -> str:
    """
    Given a dataframe row containing context and questions, generates answers using OpenAI's chat completions API (gpt-4o-mini).
    Relies on the global OpenAI 'client' created after the user submits an API key.

    Args:
    - row: A pandas dataframe row containing 'context' and 'questions' columns.
    - instructions: A string describing how the answers should be generated.

    Returns:
    - A string containing the answers generated by the API, or an empty string on error.
    """

    try:
        response = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
              {"role": "system", "content": "You are a helpful assistant."},
              {"role": "user", "content": f"{instructions}\n\nText: {row.context}\n\nQuestions:\n{row.questions}\n\nAnswers:\n"}
            ]
        )
        # Extract answer text from the response
        answer_text = response.choices[0].message.content
        return answer_text
    except Exception as e:
        # Print the error message and return an empty string if there was an error
        print(e)
        return ""

st.set_page_config(page_title="ChatbotGuide", layout="wide")
st.title("Chatbot Guide")

# Define the options in the dropdown menu
app_options = [
    "1) Create CSVs",
    "2) Merge CSVs",
    "3) Upload Datasets",
    "4) Create Chatbot"
]

# Sidebar dropdown for selecting the application
selected_app = st.sidebar.selectbox("Select Step (1-4)", app_options)

# Clear session state when switching apps
if 'last_selected_app' in st.session_state:
    if st.session_state.last_selected_app != selected_app:
        st.session_state.clear()

st.session_state.last_selected_app = selected_app

if selected_app == "1) Create CSVs":
    if 'submit' not in st.session_state:
        st.session_state.submit = False
    if 'error' not in st.session_state:
        st.session_state.error = ""
    if 'success' not in st.session_state:
        st.session_state.success = None
    
    if st.session_state.error != "":
        st.error(st.session_state.error)
        
    if st.session_state.success is not None:
        st.success("Success! Download the Q/A pairs below / Click reset to upload more PDFs")
        st.download_button(
            label="Download CSV",
            data=st.session_state.success,
            file_name='questions_answers.csv',
            mime='text/csv',
        )
        if st.button('Reset'):
            st.session_state.clear()
            st.rerun()
            
    else:
        uploaded_files = st.file_uploader("Upload PDFs Here", type="pdf", accept_multiple_files=True)
        
        question_protocol = st.text_input("Provide instructions for how questions should be generated", "Write a question based on the text")
        answer_protocol = st.text_input("Provide instructions for how answers should be generated", "Write an answer based on the text")
        
        sentence_chunks = st.number_input("Number of sentences per Q/A pair", value=2, step=1, min_value=1, max_value=3)
        
        openai_api_key = st.text_input("Enter your OpenAI API key", type="password")
    
        submit = st.button("Submit")
    
        if submit:
            st.session_state.submit = True
        
        if st.session_state.submit:
            if uploaded_files:
                client = OpenAI(api_key=openai_api_key)
        
                # Verify the API key with a minimal test request
                try:
                    response = client.chat.completions.create(
                        model="gpt-4o-mini",
                        messages=[
                          {"role": "system", "content": "You are a helpful assistant."},
                          {"role": "user", "content": "Say this is a test"}
                        ]
                    )
                except Exception:
                    st.session_state.error = "OpenAI API key is invalid"
                    st.session_state.success = None
                    st.session_state.submit = False
                    st.rerun()
                    
                with st.spinner("Loading, please be patient with us ... πŸ™"):
                    textify_output = read_and_textify_advanced(uploaded_files, sentence_chunks)
                    
                    df = pd.DataFrame(textify_output)
                    df.columns = ['context']
            
                    if question_protocol == "":
                        question_protocol = "Write questions based on the text"
                    df['questions'] = df.apply(lambda row: get_questions(row['context'], question_protocol), axis=1)
            
                    if answer_protocol == "":
                        answer_protocol = "Write answers based on the text"
                    df['answers'] = df.apply(lambda row: get_answers(row, answer_protocol), axis=1)
            
                    df = df.drop('context', axis=1)
        
                    csv = df.to_csv(index=False).encode('utf-8')
                
                    st.session_state.error = ""
                    st.session_state.success = csv
                    st.session_state.submit = False
                    st.rerun()
            else:
                st.session_state.error = "Please upload at least 1 PDF"
                st.session_state.success = None
                st.session_state.submit = False
                st.rerun()

if selected_app == "2) Merge CSVs":
    uploaded_files = st.file_uploader("Upload CSV files to merge", accept_multiple_files=True, type="csv")

    submit = st.button("Submit")
    if submit:
        st.session_state.submit = True
    
    if st.session_state.submit:
        if len(uploaded_files) > 1:
            a = 1
        else:
            st.session_state.error = "Please upload at least 2 CSVs to merge"
            st.session_state.success = None
            st.session_state.submit = False
            st.rerun()