File size: 7,474 Bytes
32a98d6
f6f33c3
b1a0926
21c729c
f6f33c3
 
a289e51
f6f33c3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
740fc36
f6f33c3
 
 
 
 
 
f932e1e
eda36d5
f6f33c3
f932e1e
 
 
 
f6f33c3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
32a98d6
9a894ab
 
 
c9d1679
9a894ab
 
 
 
 
c9d1679
fce100b
9a894ab
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e7dc910
9a894ab
e7dc910
9a894ab
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e7dc910
 
9a894ab
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
import streamlit as st
import pandas as pd
from openai import OpenAI
from PyPDF2 import PdfReader

# source: eagle0504/document-search-q-series
def read_and_textify_advanced(files, chunk_size):
    """
    Reads PDF files and splits the text of every page into sentence-based segments.
    The text extracted from each page is split on '.'; fragments that are empty or
    whitespace-only (such as the one that follows a trailing period) are discarded,
    and the remaining fragments are regrouped into segments of up to 'chunk_size'
    sentences, each terminated with a single '.'.
    Args:
    files: An iterable of PDF sources accepted by PdfReader (e.g. Streamlit uploaded files).
    chunk_size (int): The maximum number of sentences per text segment. There is no default.
    Returns: A list of strings, where each string is a segment of text extracted from a PDF page.
    A page that yields no usable text contributes a single empty-string placeholder.
    """

    text_list = []  # List to store extracted text segments

    # Iterate over each file
    for file in files:
        pdf_reader = PdfReader(file)  # Create a PDF reader object
        # Iterate over each page in the PDF
        for page in pdf_reader.pages:
            text = page.extract_text()  # Extract text from the page
            # Keep only fragments with real content; without this filter a page ending
            # in '.' produced a segment consisting of nothing but '.'
            sentences = [part for part in text.split(".") if part.strip()] if text else []
            if sentences:
                for start in range(0, len(sentences), chunk_size):
                    chunk = ".".join(sentences[start:start + chunk_size]) + "."
                    text_list.append(chunk.strip())
            else:
                # If no usable text was extracted, still add a placeholder
                text_list.append("")
            page.clear()  # Clear the page object (optional, for memory management)

    return text_list

def get_questions(context, instructions, api_client=None) -> str:
    """
    Given a text context, generates questions about it with an OpenAI chat-completion request.

    Args:
    - context: A string representing the context for which questions should be generated.
    - instructions: A string placed before the text in the prompt, telling the model how
      the questions should be written.
    - api_client: Optional object exposing 'chat.completions.create' (e.g. an OpenAI client).
      When omitted, the module-level name 'client' is used.

    Returns:
    - A string containing the text generated by the API, or an empty string if the request
      fails for any reason (including no client being available). The error is printed.
    """

    try:
        # Resolved inside the try block so that a missing module-level 'client'
        # is reported like any other failure instead of escaping as a NameError
        api = api_client if api_client is not None else client
        response = api.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
              {"role": "system", "content": "You are a helpful assistant."},
              {"role": "user", "content": f"{instructions}\n\nText: {context}\n\nQuestions:\n"}
            ]
        )
        # Extract question text from the response
        return response.choices[0].message.content
    except Exception as e:
        # Print the error message and return an empty string if there was an error
        print(e)
        return ""

def get_answers(row, instructions, api_client=None) -> str:
    """
    Given a row containing context and questions, generates answers with an OpenAI
    chat-completion request.

    Args:
    - row: An object with 'context' and 'questions' attributes, such as a pandas dataframe
      row containing 'context' and 'questions' columns.
    - instructions: A string placed before the text in the prompt, telling the model how
      the answers should be written.
    - api_client: Optional object exposing 'chat.completions.create' (e.g. an OpenAI client).
      When omitted, the module-level name 'client' is used.

    Returns:
    - A string containing the answer generated by the API, or an empty string if the request
      fails for any reason (including no client being available). The error is printed.
    """

    try:
        # Resolved inside the try block so that a missing module-level 'client'
        # is reported like any other failure instead of escaping as a NameError
        api = api_client if api_client is not None else client
        response = api.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
              {"role": "system", "content": "You are a helpful assistant."},
              {"role": "user", "content": f"{instructions}\n\nText: {row.context}\n\nQuestions:\n{row.questions}\n\nAnswers:\n"}
            ]
        )
        # Extract answer text from the response
        return response.choices[0].message.content
    except Exception as e:
        # Print the error message and return an empty string if there was an error
        print(e)
        return ""

def _finish_submission(error, success):
    """Record the outcome of a submission in session state and restart the script."""
    st.session_state.error = error
    st.session_state.success = success
    st.session_state.submit = False
    # Restart the script so the page is rebuilt from the stored state; calling
    # run() recursively would repeat set_page_config and every widget in one run
    st.rerun()


def run():
    """
    Renders the Chatbot Guide page: collects PDFs, prompt instructions and an OpenAI API
    key, generates question/answer pairs for every text segment and offers them as a CSV.

    The outcome of a submission is kept in st.session_state ('error', 'success', 'submit')
    and the script is restarted so that the next pass shows either the error message or
    the download button.
    """
    # get_questions/get_answers look up a module-level 'client'; as a local variable of
    # this function it was invisible to them and every request failed.
    # NOTE(review): this is module-level state - if the module is shared between
    # sessions, concurrent submissions could overwrite each other's client; confirm.
    global client

    st.set_page_config(page_title="ChatbotGuide", layout="wide")
    st.title("Chatbot Guide")

    if 'submit' not in st.session_state:
        st.session_state.submit = False
    if 'error' not in st.session_state:
        st.session_state.error = ""
    if 'success' not in st.session_state:
        st.session_state.success = None

    if st.session_state.error != "":
        st.error(st.session_state.error)

    # A finished run: only offer the download and the reset button
    if st.session_state.success is not None:
        st.success("Success! Download the Q/A pairs below / Click reset to upload more PDFs")
        st.download_button(
            label="Download CSV",
            data=st.session_state.success,
            file_name='questions_answers.csv',
            mime='text/csv',
        )
        if st.button('Reset'):
            st.session_state.success = None
            st.rerun()
        return

    uploaded_files = st.file_uploader("Upload PDFs Here", type="pdf", accept_multiple_files=True)

    question_protocol = st.text_input("Provide instructions for how questions should be generated", "Write a question based on the text")
    answer_protocol = st.text_input("Provide instructions for how answers should be generated", "Write an answer based on the text")

    sentence_chunks = st.number_input("Number sentences per Q/A pair", value=2, step=1, min_value=1, max_value=3)

    openai_api_key = st.text_input("Enter your OpenAI API key", type="password")

    if st.button("Submit"):
        st.session_state.submit = True

    if not st.session_state.submit:
        return

    if not uploaded_files:
        _finish_submission("Please upload at least 1 PDF", None)
        return  # defensive: st.rerun() normally halts the script itself

    # test api key
    try:
        client = OpenAI(api_key=openai_api_key)
        client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
              {"role": "system", "content": "You are a helpful assistant."},
              {"role": "user", "content": "Say this is a test"}
            ]
        )
    except Exception:
        # Any failure of the probe request is reported as an invalid key; stop here
        # instead of continuing to process the PDFs with a client that does not work
        _finish_submission("OpenAI API key is invalid", None)
        return

    textify_output = read_and_textify_advanced(uploaded_files, sentence_chunks)
    if not textify_output:
        # Nothing to build a table from (e.g. PDFs without pages)
        _finish_submission("No text could be extracted from the uploaded PDFs", None)
        return

    df = pd.DataFrame({'context': textify_output})

    if question_protocol == "":
        question_protocol = "Write questions based on the text"
    df['questions'] = df.apply(lambda row: get_questions(row['context'], question_protocol), axis=1)

    if answer_protocol == "":
        answer_protocol = "Write answers based on the text"
    df['answers'] = df.apply(lambda row: get_answers(row, answer_protocol), axis=1)

    # Only the generated questions and answers are exported
    df = df.drop('context', axis=1)

    csv = df.to_csv(index=False).encode('utf-8')

    _finish_submission("", csv)