import streamlit as st
import pandas as pd
from openai import OpenAI
from PyPDF2 import PdfReader

# source: eagle0504/document-search-q-series
def read_and_textify_advanced(files, chunk_size):
    """
    Reads PDF files and extracts text from each page, breaking the text into specified segments.
    This function iterates over a list of uploaded PDF files, extracts text from each page,
    and compiles a list of texts and corresponding source information, segmented into smaller parts
    of approximately 'chunk_size' words each.
    Args:
    files (List[st.uploaded_file_manager.UploadedFile]): A list of uploaded PDF files.
    chunk_size (int): The number of words per text segment. Default is 50.
    Returns: A list of strings, where each string is a segment of text extracted from a PDF page.
    """

    text_list = []  # List to store extracted text segments

    # Iterate over each file
    for file in files:
        pdfReader = PdfReader(file)  # Create a PDF reader object
        # Iterate over each page in the PDF
        for i in range(len(pdfReader.pages)):
            pageObj = pdfReader.pages[i]  # Get the page object
            text = pageObj.extract_text()  # Extract text from the page
            if text:
                # Split the page text into sentences (on '.') and group them
                # into chunks of 'chunk_size' sentences each
                sentences = text.split(".")
                for j in range(0, len(sentences), chunk_size):
                    chunk = ".".join(sentences[j:j + chunk_size]) + '.'
                    chunk = chunk.strip()
                    text_list.append(chunk)
            else:
                # If no text extracted, still add a placeholder
                text_list.append("")
            pageObj.clear()  # Clear the page object (optional, for memory management)

    return text_list
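
# Illustrative example of the chunking above (not executed): with chunk_size=2,
# a page whose extracted text is "A. B. C" is split into ["A", " B", " C"]
# and yields the chunks "A. B." and "C.".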

def get_questions(context, instructions) -> str:
    """
    Given a text context, generates a list of questions using OpenAI's GPT-3 API.

    Args:
    - context: A string representing the context for which questions should be generated.

    Returns:
    - A string containing the question generated by the API.
    """

    try:
        response = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
              {"role": "system", "content": "You are a helpful assistant."},
              {"role": "user", "content": f"{instructions}\n\nText: {context}\n\nQuestions:\n"}
            ]
        )
        # Extract question text from the response
        question_text = response.choices[0].message.content
        return question_text
    except Exception:
        # Return an empty string if there was an error
        return ""

def get_answers(row, instructions) -> str:
    """
    Given a dataframe row containing context and questions, generates an answer using OpenAI's GPT-3 API.

    Args:
    - row: A pandas dataframe row containing 'context' and 'questions' columns.

    Returns:
    - A string containing the answer generated by the API.
    """

    try:
        response = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
              {"role": "system", "content": "You are a helpful assistant."},
              {"role": "user", "content": f"{instructions}\n\nText: {row.context}\n\nQuestions:\n{row.questions}\n\nAnswers:\n"}
            ]
        )
        # Extract answer text from the response
        answer_text = response.choices[0].message.content
        return answer_text
    except Exception as e:
        # Print the error message and return an empty string if there was an error
        print(e)
        return ""

st.set_page_config(page_title="ChatbotGuide", layout="wide")

if 'submit' not in st.session_state:
    st.session_state.submit = False
if 'error' not in st.session_state:
    st.session_state.error = ""
if 'success' not in st.session_state:
    st.session_state.success = None

st.title("Chatbot Guide")

if st.session_state.error != "":
    st.error(st.session_state.error)
if st.session_state.success is not None:
    st.success(st.session_state.success)

uploaded_files = st.file_uploader("Upload PDFs Here", type="pdf", accept_multiple_files=True)
if uploaded_files:
    st.success(f"{len(uploaded_files)} document(s) loaded...")

question_protocol = st.text_input("Provide instructions for how questions should be generated", "Write a question based on the text")
answer_protocol = st.text_input("Provide instructions for how answers should be generated", "Write an answer based on the text")

sentence_chunks = st.number_input("Number of sentences per Q/A pair", value=2, step=1, min_value=1, max_value=3)

openai_api_key = st.text_input("Enter your OpenAI API key", type="password")

submit = st.button("Submit")
if submit:
    st.session_state.submit = True

if st.session_state.submit:
    if uploaded_files:
        client = OpenAI(api_key=openai_api_key)
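        # 'client' is assigned at module level here, so get_questions and
        # get_answers (defined above) can reference it when called below.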

        # test api key
        try:
            response = client.chat.completions.create(
                model="gpt-4o-mini",
                messages=[
                  {"role": "system", "content": "You are a helpful assistant."},
                  {"role": "user", "content": "Say this is a test"}
                ]
            )
        except Exception:
            st.session_state.error = "OpenAI API key is invalid"
            st.session_state.success = None
            st.session_state.submit = False
            st.rerun()
        
        textify_output = read_and_textify_advanced(uploaded_files, sentence_chunks)
        
        df = pd.DataFrame(textify_output)
        df.columns = ['context']

        if question_protocol == "":
            question_protocol = "Write questions based on the text"
        df['questions'] = df.apply(lambda row: get_questions(row['context'], question_protocol), axis=1)

        if answer_protocol == "":
            answer_protocol = "Write answers based on the text"
        df['answers'] = df.apply(lambda row: get_answers(row, answer_protocol), axis=1)

        df = df.drop('context', axis=1)
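        # At this point df has two columns, 'questions' and 'answers',
        # one row per text chunk.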

        csv = df.to_csv(index=False).encode('utf-8')
        st.download_button(
            label="Download Q/A pairs as CSV",
            data=csv,
            file_name='questions_answers.csv',
            mime='text/csv',
        )

        st.session_state.error = ""
        st.session_state.success = "Q/A pairs generated successfully."
        st.session_state.submit = False
        # No st.rerun() here: an immediate rerun would remove the download button
        # before it could be clicked; the success message is shown on the next rerun.
    else:
        st.session_state.error = "Please upload at least 1 PDF"
        st.session_state.success = None
        st.session_state.submit = False
        st.rerun()